diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1caef802b7340c7308f1b6655711481b91f0d889..ea69114d6b21e1306f07cdf0684ac1a025bfbaac 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,6 +12,10 @@ Python 2.7 POCL:
- pocl
except:
- tags
+ artifacts:
+ reports:
+ junit: test/pytest.xml
+
Python 2.7 with legacy PyOpenCL:
script:
@@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL:
except:
- tags
retry: 2
+ artifacts:
+ reports:
+ junit: test/pytest.xml
+
Python 3.6 POCL:
script:
@@ -43,6 +51,10 @@ Python 3.6 POCL:
- pocl
except:
- tags
+ artifacts:
+ reports:
+ junit: test/pytest.xml
+
Python 3.6 POCL Twice With Cache:
script:
@@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache:
- pocl
except:
- tags
+ artifacts:
+ reports:
+ junit: test/pytest.xml
+
# PyPy POCL:
# script:
@@ -77,7 +93,7 @@ Python 3.6 POCL Examples:
script:
- export PY_EXE=python3.6
- export PYOPENCL_TEST=portable
- - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert"
+ - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
- ". ./build-py-project-and-run-examples.sh"
tags:
- python3.6
@@ -87,6 +103,7 @@ Python 3.6 POCL Examples:
except:
- tags
+
CentOS binary:
script:
- (cd build-helpers; ./make-linux-build-docker.sh --nodate)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..601df74bd9a655b3b29decdbdf499d55b25b6385
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Andreas Klöckner and contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh
index 1e35a1e1b9949b37f95b05ebeef223c8a5955ff8..035634b16072e0188270abd8736dab99ce31dada 100755
--- a/build-helpers/make-linux-build-docker-inner-part-2.sh
+++ b/build-helpers/make-linux-build-docker-inner-part-2.sh
@@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy
cd loopy
grep -v pyopencl requirements.txt > myreq.txt
+
+# needed for pyinstaller package to be usable
+echo packaging >> myreq.txt
+
pip install -r myreq.txt
python setup.py install
diff --git a/doc/index.rst b/doc/index.rst
index d862a8acd0cb258bfd1e9623bd5cef895871f6b1..b77bbb16f413defe5010c75d28464051553b4486 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -25,18 +25,18 @@ Want to try out loopy?
There's no need to go through :ref:`installation` if you'd just like to get a
feel for what loopy is. Instead, you may
-`download a self-contained Linux binary `_.
+`download a self-contained Linux binary `_.
This is purposefully built on an ancient Linux distribution, so it should work
on most versions of Linux that are currently out there.
Once you have the binary, do the following::
chmod +x ./loopy-centos6
- ./loopy-centos6 --target=opencl hello-loopy-lp.py
- ./loopy-centos6 --target=cuda hello-loopy-lp.py
- ./loopy-centos6 --target=ispc hello-loopy-lp.py
+ ./loopy-centos6 --target=opencl hello-loopy.loopy
+ ./loopy-centos6 --target=cuda hello-loopy.loopy
+ ./loopy-centos6 --target=ispc hello-loopy.loopy
-Grab the example here: :download:`examples/python/hello-loopy.py <../examples/python/hello-loopy-lp.py>`.
+Grab the example here: :download:`examples/python/hello-loopy.loopy <../examples/python/hello-loopy.loopy>`.
You may also donwload the most recent version by going to the `list of builds
`_, clicking on the newest one
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index c9ce206260c04fc883a0f980df0b18a9a826bbd9..896388d2911a6d3c0e7783d7b1b3833b87c770d0 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -406,7 +406,7 @@ Arguments
:members:
:undoc-members:
-.. autoclass:: GlobalArg
+.. autoclass:: ArrayArg
:members:
:undoc-members:
@@ -593,7 +593,7 @@ Do not create :class:`LoopKernel` objects directly. Instead, refer to
Implementation Detail: The Base Array
-------------------------------------
-All array-like data in :mod:`loopy` (such as :class:`GlobalArg` and
+All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and
:class:`TemporaryVariable`) derive from single, shared base array type,
described next.
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 1272d2a59119725a903fa7cd1a08b7de8629c6f6..397f34a987ed336795d00e2770c2fbeadf089ae7 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1361,7 +1361,7 @@ code-generation however additional implementation may be required for custom
functions. The full lists of available functions may be found in a the
:class:`TargetBase` implementation (e.g. :class:`CudaTarget`)
-Custom user functions may be represented using the method described in :ref:`_functions`
+Custom user functions may be represented using the method described in :ref:`functions`
Data-dependent control flow
@@ -1641,15 +1641,15 @@ we'll continue using the kernel from the previous example:
>>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
>>> print(lp.stringify_stats_mapping(mem_map))
- MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ...
+ MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ...
Each line of output will look roughly like::
- MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
- MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
- MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+ MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
+ MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+ MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
:func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
:class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
@@ -1684,13 +1684,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
.. doctest::
- >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', CG.SUBGROUP)
+ >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP)
... ].eval_with_dict(param_dict)
- >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', CG.SUBGROUP)
+ >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP)
... ].eval_with_dict(param_dict)
- >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', CG.SUBGROUP)
+ >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP)
... ].eval_with_dict(param_dict)
- >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', CG.SUBGROUP)
+ >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP)
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1708,13 +1708,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
>>> bytes_map = mem_map.to_bytes()
>>> print(lp.stringify_stats_mapping(bytes_map))
- MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ...
+ MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ...
>>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
... ).group_by('direction')
>>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
- MemAccess(None, None, None, None, load, None, None) : ...
- MemAccess(None, None, None, None, store, None, None) : ...
+ MemAccess(None, None, None, None, load, None, None, None) : ...
+ MemAccess(None, None, None, None, store, None, None, None) : ...
>>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
... ].eval_with_dict(param_dict)
@@ -1726,12 +1726,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
The lines of output above might look like::
- MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
- MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
- MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
- MemAccess(global, np:dtype('float64'), {}, {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
- MemAccess(global, np:dtype('float64'), {}, {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
- MemAccess(global, np:dtype('float64'), {}, {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+ MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
+ MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+ MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+ MemAccess(global, np:dtype('float64'), {}, {}, load, g, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+ MemAccess(global, np:dtype('float64'), {}, {}, load, h, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+ MemAccess(global, np:dtype('float64'), {}, {}, store, e, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
One can see how these functions might be useful in computing, for example,
achieved memory bandwidth in byte/sec or performance in FLOP/sec.
@@ -1751,12 +1751,12 @@ this time.
... outer_tag="l.1", inner_tag="l.0")
>>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32)
>>> print(lp.stringify_stats_mapping(mem_map))
- MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, workitem) : ...
- MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, workitem) : ...
- MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, workitem) : ...
- MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, workitem) : ...
- MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, workitem) : ...
- MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, workitem) : ...
+ MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ...
+ MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ...
+ MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ...
+ MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ...
+ MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ...
+ MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ...
With this parallelization, consecutive work-items will access consecutive array
@@ -1766,13 +1766,13 @@ array accesses has not changed:
.. doctest::
- >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', CG.WORKITEM)
+ >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM)
... ].eval_with_dict(param_dict)
- >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', CG.WORKITEM)
+ >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM)
... ].eval_with_dict(param_dict)
- >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', CG.WORKITEM)
+ >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM)
... ].eval_with_dict(param_dict)
- >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', CG.WORKITEM)
+ >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM)
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1792,12 +1792,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel:
... outer_tag="l.0", inner_tag="l.1")
>>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32)
>>> print(lp.stringify_stats_mapping(mem_map))
- MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, workitem) : ...
- MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, workitem) : ...
- MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, workitem) : ...
- MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, workitem) : ...
- MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, workitem) : ...
- MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, workitem) : ...
+ MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ...
+ MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ...
+ MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ...
+ MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ...
+ MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ...
+ MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ...
With this parallelization, consecutive work-items will access *nonconsecutive*
@@ -1806,13 +1806,13 @@ changed:
.. doctest::
- >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', CG.WORKITEM)
+ >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM)
... ].eval_with_dict(param_dict)
- >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', CG.WORKITEM)
+ >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM)
... ].eval_with_dict(param_dict)
- >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', CG.WORKITEM)
+ >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM)
... ].eval_with_dict(param_dict)
- >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', CG.WORKITEM)
+ >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM)
... ].eval_with_dict(param_dict)
>>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
... (f32ld_a, f32st_c, f64ld_g, f64st_e))
diff --git a/examples/python/hello-loopy.loopy b/examples/python/hello-loopy.loopy
index 0ba44d6eccb18236ac13e17ca747318af3962634..7f79730985119096daf3bbdd31ff17a2c0e7ab2c 100644
--- a/examples/python/hello-loopy.loopy
+++ b/examples/python/hello-loopy.loopy
@@ -1,7 +1,7 @@
# This is a version of hello-loopy.py that can be run through
# a loopy binary using
#
-# ./loopy --lang=loopy hello-loopy-lp.py -
+# ./loopy --lang=loopy hello-loopy.loopy -
knl = lp.make_kernel(
"{ [i]: 0<=i[_0-9a-zA-Z]+)"
- "(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$")
+ r"(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$")
def parse_dimension_specs(self, node, dim_decls):
def parse_bounds(bounds_str):
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 6b0033808c616829e60615b92849fa6353751a82..e3342d0f9d0cac2ef3c4e3d56423ce4b4ba0ac8a 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -142,7 +142,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
.. note::
This data structure and its attributes should be considered immutable,
- even if it contains mutable data types. See :method:`copy` for an easy
+ even if it contains mutable data types. See :meth:`copy` for an easy
way of producing a modified copy.
.. attribute:: domains
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 6bf733a84f9df48fbb8433015e1e137f6dc0392c..bae9d7d1fbc873076a84b933e5c78f5c9b19dbb5 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -549,15 +549,15 @@ class ArrayBase(ImmutableRecord):
.. attribute :: name
.. attribute :: dtype
- the :class:`loopy.loopytype` of the array.
- if this is *none*, :mod:`loopy` will try to continue without
- knowing the type of this array, where the idea is that precise
- knowledge of the type will become available at invocation time.
- :class:`loopy.compiledkernel` (and thereby
- :meth:`loopy.loopkernel.__call__`) automatically add this type
- information based on invocation arguments.
-
- note that some transformations, such as :func:`loopy.add_padding`
+
+ The :class:`loopy.types.LoopyType` of the array. If this is *None*,
+ :mod:`loopy` will try to continue without knowing the type of this
+ array, where the idea is that precise knowledge of the type will become
+ available at invocation time. Calling the kernel
+ (via :meth:`loopy.LoopKernel.__call__`)
+ automatically adds this type information based on invocation arguments.
+
+ Note that some transformations, such as :func:`loopy.add_padding`
cannot be performed without knowledge of the exact *dtype*.
.. attribute :: shape
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 3e776bd0609f8c4c6f63aadae811d97a0f97b579..7877f8b939444bf3dc095037ffeaa1bb548c39d6 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -49,7 +49,7 @@ from warnings import warn
class auto(object): # noqa
"""A generic placeholder object for something that should be automatically
determined. See, for example, the *shape* or *strides* argument of
- :class:`GlobalArg`.
+ :class:`ArrayArg`.
"""
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 95c3c336c4aeb78efb3a8dccfbdbc554103fdef1..b8be6191d933899dbac2f37eb5b85267defa7690 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1241,7 +1241,7 @@ def draw_dependencies_as_unicode_arrows(
for dep in insn.depends_on:
reverse_deps.setdefault(dep, set()).add(insn.id)
- # mapping of (from_id, to_id) tuples to column_index
+    # mapping of to_id to column_index
dep_to_column = {}
# {{{ find column assignments
@@ -1318,7 +1318,7 @@ def draw_dependencies_as_unicode_arrows(
elif insn.id in starts:
starts.remove(insn.id)
- if starts:
+ if starts or pointed_at_insn_id not in processed_ids:
# will continue downward
row[col] = do_flag_downward(u"├", pointed_at_insn_id)
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 652f8b8933ee79935f8bf08e7de2356972922ccc..58b68486be3e08a818cc87388eea31f0788bb959 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -794,9 +794,13 @@ def generate_loop_schedules_internal(
if not is_ready:
if debug_mode:
- print("instruction '%s' is missing insn depedencies '%s'" % (
- format_insn(kernel, insn.id), ",".join(
- insn.depends_on - sched_state.scheduled_insn_ids)))
+ # These are not that interesting when understanding scheduler
+ # failures.
+
+            # print("instruction '%s' is missing insn dependencies '%s'" % (
+ # format_insn(kernel, insn.id), ",".join(
+ # insn.depends_on - sched_state.scheduled_insn_ids)))
+ pass
continue
want = kernel.insn_inames(insn) - sched_state.parallel_inames
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 3fecfb778c81ff9db101abca543ae6992e0b3575..9ce2bb081eca67cc6f41864c7ce5965e018ce853 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -581,6 +581,11 @@ class MemAccess(Record):
A :class:`str` that specifies the variable name of the data
accessed.
+ .. attribute:: variable_tag
+
+ A :class:`str` that specifies the variable tag of a
+ :class:`pymbolic.primitives.TaggedVariable`.
+
.. attribute:: count_granularity
A :class:`str` that specifies whether this operation should be counted
@@ -597,7 +602,8 @@ class MemAccess(Record):
"""
def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None,
- direction=None, variable=None, count_granularity=None):
+ direction=None, variable=None, variable_tag=None,
+ count_granularity=None):
if count_granularity not in CountGranularity.ALL+[None]:
raise ValueError("Op.__init__: count_granularity '%s' is "
@@ -607,12 +613,14 @@ class MemAccess(Record):
if dtype is None:
Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides,
gid_strides=gid_strides, direction=direction,
- variable=variable, count_granularity=count_granularity)
+ variable=variable, variable_tag=variable_tag,
+ count_granularity=count_granularity)
else:
from loopy.types import to_loopy_type
Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype),
lid_strides=lid_strides, gid_strides=gid_strides,
direction=direction, variable=variable,
+ variable_tag=variable_tag,
count_granularity=count_granularity)
def __hash__(self):
@@ -622,7 +630,7 @@ class MemAccess(Record):
def __repr__(self):
# Record.__repr__ overridden for consistent ordering and conciseness
- return "MemAccess(%s, %s, %s, %s, %s, %s, %s)" % (
+ return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s)" % (
self.mtype,
self.dtype,
None if self.lid_strides is None else dict(
@@ -631,6 +639,7 @@ class MemAccess(Record):
sorted(six.iteritems(self.gid_strides))),
self.direction,
self.variable,
+ self.variable_tag,
self.count_granularity)
# }}}
@@ -697,8 +706,9 @@ class CounterBase(CombineMapper):
# {{{ ExpressionOpCounter
class ExpressionOpCounter(CounterBase):
- def __init__(self, knl):
+ def __init__(self, knl, count_within_subscripts=True):
self.knl = knl
+ self.count_within_subscripts = count_within_subscripts
from loopy.type_inference import TypeInferenceMapper
self.type_inf = TypeInferenceMapper(knl)
@@ -719,7 +729,10 @@ class ExpressionOpCounter(CounterBase):
) + self.rec(expr.parameters)
def map_subscript(self, expr):
- return self.rec(expr.index)
+ if self.count_within_subscripts:
+ return self.rec(expr.index)
+ else:
+ return ToCountMap()
def map_sum(self, expr):
assert expr.children
@@ -981,6 +994,10 @@ class GlobalMemAccessCounter(MemAccessCounter):
def map_subscript(self, expr):
name = expr.aggregate.name
+ try:
+ var_tag = expr.aggregate.tag
+ except AttributeError:
+ var_tag = None
if name in self.knl.arg_dict:
array = self.knl.arg_dict[name]
@@ -1009,6 +1026,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
lid_strides=dict(sorted(six.iteritems(lid_strides))),
gid_strides=dict(sorted(six.iteritems(gid_strides))),
variable=name,
+ variable_tag=var_tag,
count_granularity=count_granularity
): 1}
) + self.rec(expr.index_tuple)
@@ -1314,7 +1332,7 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work,
# {{{ get_op_map
def get_op_map(knl, numpy_types=True, count_redundant_work=False,
- subgroup_size=None):
+ count_within_subscripts=True, subgroup_size=None):
"""Count the number of operations in a loopy kernel.
@@ -1330,6 +1348,9 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
(Likely desirable for performance modeling, but undesirable for code
optimization.)
+ :arg count_within_subscripts: A :class:`bool` specifying whether to
+ count operations inside array indices.
+
:arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
``'guess'``, or *None* that specifies the sub-group size. An OpenCL
sub-group is an implementation-dependent grouping of work-items within
@@ -1382,7 +1403,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
knl = preprocess_kernel(knl)
op_map = ToCountMap()
- op_counter = ExpressionOpCounter(knl)
+ op_counter = ExpressionOpCounter(knl, count_within_subscripts)
from loopy.kernel.instruction import (
CallInstruction, CInstruction, Assignment,
@@ -1627,6 +1648,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
gid_strides=mem_access.gid_strides,
direction=mem_access.direction,
variable=mem_access.variable,
+ variable_tag=mem_access.variable_tag,
count_granularity=mem_access.count_granularity),
ct)
for mem_access, ct in six.iteritems(access_map.count_map)),
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 8927cd6fba97d847e425fe9b2eaa8960297cdb76..f4d46854b8dd15c8c1e9a716017ce2724b4db2fc 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1629,7 +1629,7 @@ def get_access_range(domain, subscript, assumptions, shape=None,
if shape is not None:
try:
shape_aff = guarded_aff_from_expr(access_map.space, shape[idim])
- except ExpressionToAffineConversionError as sub_err:
+ except ExpressionToAffineConversionError:
pass
if shape_aff is None:
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 673d3b284c79c99e88f2dc08ba1b6de41d2ee9a4..d6f55091a3ba994781e87f180faa49629a049772 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -290,7 +290,7 @@ class CUDACASTBuilder(CASTBuilder):
_VEC_AXES = "xyzw"
def add_vector_access(self, access_expr, index):
- return access_expr.a(self._VEC_AXES[index])
+ return access_expr.attr(self._VEC_AXES[index])
def emit_barrier(self, synchronization_kind, mem_kind, comment):
"""
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 73e8e00929ddc04386200d7d450b91d66439eab6..34faf0a03d60b5be391c7f49e9baf247093e965a 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -125,7 +125,8 @@ def adjust_local_temp_var_storage(kernel, device):
new_storage_shape = storage_shape
- new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape)
+ new_temp_vars[temp_var.name] = temp_var.copy(
+ storage_shape=tuple(new_storage_shape))
return kernel.copy(temporary_variables=new_temp_vars)
diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py
index cfbbd56e906c5e622debcd82bd5368aa3b1fb34c..a20a798cfa35c64c0cbd7097b41824dda2a35a84 100644
--- a/loopy/transform/add_barrier.py
+++ b/loopy/transform/add_barrier.py
@@ -44,15 +44,15 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None,
be any inputs that are understood by :func:`loopy.match.parse_match`.
:arg insn_before: String expression that specifies the instruction(s)
- before the barrier which is to be added
+ before the barrier which is to be added
:arg insn_after: String expression that specifies the instruction(s) after
- the barrier which is to be added
+ the barrier which is to be added
:arg id: String on which the id of the barrier would be based on.
:arg tags: The tag of the group to which the barrier must be added
:arg synchronization_kind: Kind of barrier to be added. May be "global" or
- "local"
+ "local"
:arg kind: Type of memory to be synchronied. May be "global" or "local". Ignored
- for "global" bariers. If not supplied, defaults to :arg:`synchronization_kind`
+        for "global" barriers. If not supplied, defaults to *synchronization_kind*
"""
if mem_kind is None:
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index f0b9814c43698a64af23f1555a27e910ef89762e..f6568918d30f33d4c7103e40d02bdc40c38dfa1b 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -106,6 +106,7 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
sequential=False):
"""Takes in a kernel that carries out an operation and returns a kernel
that carries out a batch of these operations.
+
.. note::
For temporaries in a kernel that are private or read only
globals and if `sequential=True`, loopy does not does not batch these
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 801da4c13057edb089d9e9cba098ba41e9919ed6..63d3a40fb6c6967cac5e6149d5cf51bb7c2efbb9 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -160,7 +160,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
matching contexts. See :func:`loopy.match.parse_stack_match`
for syntax.
:arg temporary_scope: If given, override the choice of
- :class:`AddressSpace` for the created temporary.
+ :class:`AddressSpace` for the created temporary.
:arg default_tag: The default :ref:`iname-tags` to be assigned to the
inames used for fetching and storing
:arg fetch_bounding_box: If the access footprint is non-convex
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index ad1da3e7e67d9d609f51bfed4db7141d14e508dd..83598dcc26646261703ce9b24fecebdd8a975774 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -66,8 +66,6 @@ __doc__ = """
.. autofunction:: affine_map_inames
-.. autofunction:: realize_ilp
-
.. autofunction:: find_unused_axis_tag
.. autofunction:: make_reduction_inames_unique
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 3dd7009ea4215d94cc55c318d1263dfc7c9f2fe7..4d7f7007726ec93ee0608557b193c25a5377a3c4 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -27,6 +27,7 @@ import islpy as isl
from loopy.diagnostic import LoopyError
from loopy.symbolic import CombineMapper
+from pymbolic.mapper import Collector
# {{{ find_instructions
@@ -362,9 +363,10 @@ def uniquify_instruction_ids(kernel):
# {{{ remove_work
class _MemAccessGatherer(CombineMapper):
- def __init__(self, kernel, address_space):
+ def __init__(self, kernel, address_space, exclude_vars=[]):
self.kernel = kernel
self.address_space = address_space
+ self.exclude_vars = exclude_vars
def combine(self, values):
from pytools import flatten
@@ -381,7 +383,9 @@ class _MemAccessGatherer(CombineMapper):
return set()
descr = self.kernel.get_var_descriptor(name)
- if descr.address_space == self.address_space:
+ if descr.address_space == self.address_space and \
+ name not in self.exclude_vars:
+ # TODO what about tags?
result = set([expr])
else:
result = set()
@@ -397,6 +401,34 @@ class _MemAccessGatherer(CombineMapper):
return self._map_access(expr, expr.aggregate.name, expr.index)
+class _VariableGatherer(Collector):
+ # TODO add tests for this
+ def __init__(self, search_variables):
+ self.search_variables = search_variables
+
+ #def combine(self, values):
+ # from pytools import flatten
+ # return set(flatten(values))
+
+ def map_variable(self, expr):
+ if expr.name in self.search_variables:
+ return set([expr.name])
+ else:
+ return set()
+
+ map_tagged_variable = map_variable
+
+ # TODO do I need this?
+ def map_reduction(self, expr):
+ return self.rec(expr.expr)
+
+ # TODO do I need this?
+ map_linear_subscript = CombineMapper.map_subscript
+
+#def map_subscript(self, expr):
+# return self.rec(expr.index)
+
+
def _make_grid_size_domain(kernel, var_name_gen=None):
if var_name_gen is None:
var_name_gen = kernel.get_var_name_generator()
@@ -420,7 +452,7 @@ def _make_grid_size_domain(kernel, var_name_gen=None):
return ggrid_var_names, lgrid_var_names, grid_range_dom
-def remove_work(kernel):
+def remove_work(kernel, remove_vars=[], new_knl_name=None, use_unused_inames=False):
"""This transform removes operations in a kernel, leaving only
accesses to global memory.
@@ -434,7 +466,8 @@ def remove_work(kernel):
kernel = lp.preprocess_kernel(kernel)
- gatherer = _MemAccessGatherer(kernel, lp.AddressSpace.GLOBAL)
+ gatherer = _MemAccessGatherer(kernel, lp.AddressSpace.GLOBAL,
+ exclude_vars=remove_vars)
from loopy.kernel.instruction import MultiAssignmentBase, make_assignment
@@ -442,14 +475,38 @@ def remove_work(kernel):
old_to_new_ids = {}
insn_id_gen = kernel.get_instruction_id_generator()
+ if new_knl_name:
+ new_name = new_knl_name
+ else:
+ new_name = kernel.name
var_name_gen = kernel.get_var_name_generator()
read_tgt_var_name = var_name_gen("read_tgt")
new_temporary_variables = kernel.temporary_variables.copy()
+ new_args = [arg.copy() for arg in kernel.args if arg.name not in remove_vars]
new_temporary_variables[read_tgt_var_name] = lp.TemporaryVariable(
read_tgt_var_name, address_space=lp.AddressSpace.PRIVATE)
new_instructions = []
+ # TODO figure out which of these tags I really need to deal with
+ from loopy.kernel.data import GroupIndexTag, LocalIndexTag, UnrollTag
+ parallel_inames = set([iname for iname, tag in kernel.iname_to_tag.items() if
+ (isinstance(tag, LocalIndexTag) or
+ isinstance(tag, GroupIndexTag))])
+ unrolled_inames = set([iname for iname, tag in kernel.iname_to_tag.items() if
+ isinstance(tag, UnrollTag)])
+
+ iname_gatherer = _VariableGatherer(kernel.all_inames() -
+ (parallel_inames | unrolled_inames))
+
+ def get_unused_inames(insn):
+ inames_required = insn.within_inames - (parallel_inames | unrolled_inames)
+ inames_found = set.union(
+ *[iname_gatherer(assignee) for assignee in insn.assignees],
+ iname_gatherer(insn.expression),
+ )
+ return inames_required - inames_found
+
# {{{ create init insn for read target
ggrid_var_names, lgrid_var_names, grid_range_dom = _make_grid_size_domain(kernel)
@@ -468,8 +525,28 @@ def remove_work(kernel):
# {{{ rewrite instructions
+ def add_unrepresented_grid_vars_to_inames(inames):
+ local_axes_needed = set(range(len(lgrid_var_names)))
+ group_axes_needed = set(range(len(ggrid_var_names)))
+ #TODO deal with key errors
+ for iname in inames:
+ try:
+ tag = kernel.iname_to_tag[iname]
+ except KeyError:
+ continue
+ if isinstance(tag, LocalIndexTag):
+ local_axes_needed.discard(tag.axis)
+ elif isinstance(tag, GroupIndexTag):
+ group_axes_needed.discard(tag.axis)
+ return inames | set([lgrid_var_names[axis] for axis in local_axes_needed] +
+ [ggrid_var_names[axis] for axis in group_axes_needed])
+
read_insn_ids = []
+ read_tgt_var_written_to_global = False
+ type_inf = lp.type_inference.TypeInferenceMapper(kernel)
+ read_tgt_var_dtype = None
+ read_tgt_var_read_expr_acc = None
for insn in kernel.instructions:
if not isinstance(insn, MultiAssignmentBase):
new_instructions.append(insn)
@@ -482,28 +559,50 @@ def remove_work(kernel):
reader_accesses = gatherer(insn.expression)
new_insn_ids = set()
+ new_within_inames = frozenset(
+ add_unrepresented_grid_vars_to_inames(insn.within_inames))
+
+ if use_unused_inames:
+ inserted_inames = sorted(list(get_unused_inames(insn)))
+ else:
+ inserted_inames = []
+
+ from pytools import product
for read_expr in reader_accesses:
new_id = insn_id_gen(insn.id)
read_insn_ids.append(insn.id)
+ add_expr = read_expr*product([p.Variable(iname) for iname in inserted_inames])
new_instructions.append(
make_assignment(
(p.Variable(read_tgt_var_name),),
- p.Variable(read_tgt_var_name) + read_expr,
+ p.Variable(read_tgt_var_name) + add_expr,
id=new_id,
- within_inames=insn.within_inames,
+ within_inames=new_within_inames,
depends_on=insn.depends_on | frozenset([read_tgt_init_id])))
new_insn_ids.add(new_id)
+ # determine type of add_expr
+ # TODO loopy already has a way of figuring this out,
+ # can we use that instead? (need this to set type of output arg)
+ if read_tgt_var_dtype is None:
+ read_tgt_var_dtype = type_inf(add_expr)
+ read_tgt_var_read_expr_acc = add_expr
+ elif type_inf(add_expr) != read_tgt_var_dtype:
+ read_tgt_var_read_expr_acc += add_expr
+ read_tgt_var_dtype = type_inf(read_tgt_var_read_expr_acc)
+
for write_expr in writer_accesses:
new_id = insn_id_gen(insn.id)
new_instructions.append(
make_assignment(
(write_expr,),
- 17,
+ p.Variable(read_tgt_var_name)*product(
+ [p.Variable(iname) for iname in inserted_inames]),
id=new_id,
- within_inames=insn.within_inames,
+ within_inames=new_within_inames,
depends_on=insn.depends_on))
new_insn_ids.add(new_id)
+ read_tgt_var_written_to_global = True # TODO part of hack above
old_to_new_ids[insn.id] = frozenset(new_insn_ids)
@@ -511,6 +610,8 @@ def remove_work(kernel):
# {{{ create write-out insn for read target
+ # TODO writing to temp doesn't guarantee execution, need to write to global mem
+ """
_, lgrid = kernel.get_grid_size_upper_bounds_as_exprs()
read_tgt_local_dest_name = var_name_gen("read_tgt_dest")
new_temporary_variables[read_tgt_local_dest_name] = lp.TemporaryVariable(
@@ -528,6 +629,63 @@ def remove_work(kernel):
id=write_read_tgt_id,
depends_on=frozenset(read_insn_ids),
within_inames=grid_inames))
+ """
+
+ if not read_tgt_var_written_to_global:
+ # TODO must write read_tgt_var to global or instructions may not execute,
+ # TODO if write variable has been removed, need a new write variable
+ # TODO if write variable has not been removed, just write to that variable?
+
+ # define order for indexing/shape/strides
+ def index_order(local_list, global_list):
+ # produce this order: [g.n, l.n, ..., g.1, l.1, g.0, l.0]
+ # accept both dicts of {dim: val} and ordered lists [val0, val1, ...]
+ result = []
+ for i in reversed(range(len(local_list))):
+ result.append(global_list[i])
+ result.append(local_list[i])
+ return result
+
+ # define local/global strides
+ # TODO decide what this mem access pattern should be
+ ggrid, lgrid = kernel.get_grid_size_upper_bounds_as_exprs()
+ lstrides = {0: 1}
+ assert len(lgrid) == len(ggrid) # TODO is this necessary?
+ for dim in range(1, len(lgrid)):
+ lstrides[dim] = lstrides[dim-1]*lgrid[dim-1]*ggrid[dim-1]
+ gstrides = {}
+ for dim in range(0, len(ggrid)):
+ gstrides[dim] = lstrides[dim]*lgrid[dim]
+
+ # use consistent index ordering for strides, shape, and index
+ strides = index_order(lstrides, gstrides)
+ shape = tuple(index_order(lgrid, ggrid))
+ index = tuple(p.Variable(i) for i in
+ index_order(lgrid_var_names, ggrid_var_names))
+
+ # create new global arg to write results
+ read_tgt_global_dest_name = var_name_gen("read_tgt_dest")
+ new_args.append(lp.GlobalArg(
+ name=read_tgt_global_dest_name,
+ shape=shape,
+ dtype=read_tgt_var_dtype,
+ strides=",".join(str(s) for s in strides),
+ ))
+ # TODO WEIRD BEHAVIOR: when kernel has not been cached, this works fine,
+ # but when kernel is used again, this new arg ends up const and writing
+ # to it causes an error... ???
+
+ # create instruction writing read_tgt to new global arg
+ write_read_tgt_id = insn_id_gen("write_read_tgt")
+ old_to_new_ids[write_read_tgt_id] = frozenset([write_read_tgt_id])
+ new_instructions.append(
+ make_assignment(
+ (p.Variable(read_tgt_global_dest_name)[index],),
+ p.Variable(read_tgt_var_name),
+ id=write_read_tgt_id,
+ depends_on=frozenset(read_insn_ids),
+ within_inames=grid_inames,
+ ))
# }}}
@@ -549,9 +707,12 @@ def remove_work(kernel):
domains=kernel.domains + [grid_range_dom],
state=lp.KernelState.INITIAL,
instructions=new_instructions_2,
- temporary_variables=new_temporary_variables)
+ args=new_args,
+ temporary_variables=new_temporary_variables,
+ name=new_name,
+ _cached_written_variables=None,
+ )
- from loopy.kernel.data import GroupIndexTag, LocalIndexTag
kernel = lp.tag_inames(kernel, dict(
(ggrid_var_names[i], GroupIndexTag(i))
for i in range(len(ggrid_var_names))))
@@ -559,6 +720,14 @@ def remove_work(kernel):
(lgrid_var_names[i], LocalIndexTag(i))
for i in range(len(lgrid_var_names))))
+ if not kernel.loop_priority:
+ from loopy.diagnostic import warn_with_kernel
+ warn_with_kernel(kernel, "remove_work_loop_priority",
+ "Kernel loop_priority unspecified. "
+ "remove_work() may yield loop priority differing "
+ "from that of original kernel. To ensure desired "
+ "loop priority, use lp.prioritize_loops().")
+
return kernel
# }}}
diff --git a/requirements.txt b/requirements.txt
index a3e88cfea99e7413211c35d11464932f98e23758..97c2024764715d0a715520800e2e1dd467183479 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git
git+https://github.com/inducer/f2py
# Optional, needed for using the C preprocessor on Fortran
-ply>=3.6
-
-# This is needed for the pyinstaller executable to be usable.
-packaging
+ply>=3.6
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index b939ce0cf8b680bb1eb3501ed6d7f563e9c1c7b6..eec3dfd1f52ed97c58f5281716eac8fc18980094 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814
+ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504
max-line-length=85
exclude=
loopy/target/c/compyte/ndarray,
diff --git a/test/test_loopy.py b/test/test_loopy.py
index accf9c1dff5a1f660871dd63d6af3337aced6490..38d1cd6b0e5f2e9ccd64c6ddb41b161040e515e4 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2908,6 +2908,25 @@ def test_dep_cycle_printing_and_error():
print(lp.generate_code(knl)[0])
+def test_backwards_dep_printing_and_error():
+ knl = lp.make_kernel(
+ "{[i]: 0<=i 1:
exec(sys.argv[1])
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 15d5ea7c98b6bde2aab89441a908b71324faae16..54b608a183840cc5d33f1e738f36fc605d16d94a 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -47,8 +47,8 @@ __all__ = [
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa
-@pytest.mark.parametrize("Nq", [7])
@pytest.mark.parametrize("ilp_multiple", [1, 2])
+@pytest.mark.parametrize("Nq", [7])
@pytest.mark.parametrize("opt_level", [11])
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa
ctx = ctx_factory()
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 78eca4d0c141526d9bac652d5baa3890933ac0c1..ef229d5cd08554d6656d23d83bc0c6b66ee77b9f 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -225,22 +225,28 @@ def test_global_parallel_reduction(ctx_factory, size):
"{[i]: 0 <= i < n }",
"""
# Using z[0] instead of z works around a bug in ancient PyOpenCL.
- z[0] = sum(i, i/13)
+ z[0] = sum(i, a[i])
""")
+ knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
ref_knl = knl
gsize = 128
knl = lp.split_iname(knl, "i", gsize * 20)
- knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
- knl = lp.split_reduction_inward(knl, "i_inner_inner")
+ knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
+ knl = lp.split_reduction_outward(knl, "i_outer")
knl = lp.split_reduction_inward(knl, "i_inner_outer")
from loopy.transform.data import reduction_arg_to_subst_rule
knl = reduction_arg_to_subst_rule(knl, "i_outer")
+
knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
temporary_scope=lp.temp_var_scope.GLOBAL,
default_tag="l.auto")
knl = lp.realize_reduction(knl)
+ knl = lp.tag_inames(knl, "i_outer_0:g.0")
+
+ # Keep the i_outer accumulator on the correct (lower) side of the barrier,
+ # otherwise there will be useless save/reload code generated.
knl = lp.add_dependency(
knl, "writes:acc_i_outer",
"id:red_i_outer_arg_barrier")
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 3f2366521673597f0cd7e96a22780ffd2c89bdc1..b29edf1ed05f7728b2cbe5b5ad8a74c26944ed8c 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -57,7 +57,8 @@ def test_op_counter_basic():
knl = lp.add_and_infer_dtypes(knl,
dict(a=np.float32, b=np.float32,
g=np.float64, h=np.float64))
- op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+ op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+ count_within_subscripts=True)
n_workgroups = 1
group_size = 1
subgroups_per_group = div_ceil(group_size, SGS)
@@ -161,7 +162,8 @@ def test_op_counter_specialops():
knl = lp.add_and_infer_dtypes(knl,
dict(a=np.float32, b=np.float32,
g=np.float64, h=np.float64))
- op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+ op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+ count_within_subscripts=True)
n_workgroups = 1
group_size = 1
subgroups_per_group = div_ceil(group_size, SGS)
@@ -206,7 +208,8 @@ def test_op_counter_bitwise():
a=np.int32, b=np.int32,
g=np.int64, h=np.int64))
- op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+ op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+ count_within_subscripts=False)
n_workgroups = 1
group_size = 1
subgroups_per_group = div_ceil(group_size, SGS)
@@ -226,7 +229,7 @@ def test_op_counter_bitwise():
i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP)
].eval_with_dict(params)
# (count-per-sub-group)*n_subgroups
- assert i32add == n*m+n*m*ell*n_subgroups
+ assert i32add == n*m*ell*n_subgroups
assert i32bw == 2*n*m*ell*n_subgroups
assert i64bw == 2*n*m*n_subgroups
assert i64add == i64mul == n*m*n_subgroups
@@ -1057,6 +1060,65 @@ def test_all_counters_parallel_matmul():
assert local_mem_s == m*2/bsize*n_subgroups
+def test_mem_access_tagged_variables():
+ bsize = 16
+ knl = lp.make_kernel(
+ "{[i,k,j]: 0<=i 1:
exec(sys.argv[1])