diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5ea075d194a9da75a1c18d180c65239be83eb85e..f96b43d67fcc1ca53a736fb4893990b8bd363a1a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -69,6 +69,7 @@ Python 2.7 with legacy PyOpenCL: - pocl except: - tags + retry: 2 Python 3.6 POCL: script: diff --git a/doc/index.rst b/doc/index.rst index a0bad2898be4aab74dead90aae825e4e0a460c87..d862a8acd0cb258bfd1e9623bd5cef895871f6b1 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,12 +18,14 @@ When you run this script, the following kernel is generated, compiled, and execu (See the full example for how to print the generated code.) +.. _static-binary: + Want to try out loopy? ---------------------- There's no need to go through :ref:`installation` if you'd just like to get a feel for what loopy is. Instead, you may -`download a self-contained Linux binary `_. +`download a self-contained Linux binary `_. This is purposefully built on an ancient Linux distribution, so it should work on most versions of Linux that are currently out there. diff --git a/doc/misc.rst b/doc/misc.rst index 347b5d098c8dc0e37bb72659c0b0de5a8b4e3704..cd6fe102cb9c97a619d8b6512f103c9dcabe65b5 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -3,6 +3,18 @@ Installation ============ +Option 0: Static Binary +----------------------- + +If you would just like to experiment with :mod:`loopy`'s code transformation +abilities, the easiest way to get loopy is to download a statically-linked +Linux binary. + +See :ref:`static-binary` for details. 
+ +Option 1: From Source, no PyOpenCL integration + ----------------------------------------------- + This command should install :mod:`loopy`:: pip install loo.py @@ -26,10 +38,59 @@ You may also clone its git repository:: git clone --recursive git://github.com/inducer/loopy git clone --recursive http://git.tiker.net/trees/loopy.git +Option 2: From Conda Forge, with PyOpenCL integration +----------------------------------------------------- + +This set of instructions is intended for 64-bit Linux and +macOS computers: + +#. Make sure your system has the basics to build software. + + On Debian derivatives (Ubuntu and many more), + installing ``build-essential`` should do the trick. + + Everywhere else, just making sure you have the ``g++`` package should be + enough. + +#. Install `miniconda `_. + (Both Python 2 and 3 should work. In the absence of other constraints, prefer Python 3.) + +#. ``export CONDA=/WHERE/YOU/INSTALLED/miniconda3`` + + If you accepted the default location, this should work: + + ``export CONDA=$HOME/miniconda3`` + +#. ``$CONDA/bin/conda create -n dev`` + +#. ``source $CONDA/bin/activate dev`` + +#. ``conda config --add channels conda-forge`` + +#. ``conda install git pip pocl islpy pyopencl`` (Linux) + + or + + ``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X) + +#. Type the following command:: + + pip install git+https://github.com/inducer/loopy + +Next time you want to use :mod:`loopy`, just run the following command:: + + source /WHERE/YOU/INSTALLED/miniconda3/bin/activate dev + +You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it. + +See the `PyOpenCL installation instructions +`_ for options +regarding OpenCL drivers. + User-visible Changes ==================== -Version 2016.2 +Version 2017.2 -------------- .. note:: @@ -57,7 +118,7 @@ Licensing Loopy is licensed to you under the MIT/X Consortium license: -Copyright (c) 2009-13 Andreas Klöckner and Contributors. 
+Copyright (c) 2009-17 Andreas Klöckner and Contributors. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation @@ -342,6 +403,11 @@ Here's a Bibtex entry for your convenience:: doi = "{10.1145/2627373.2627387}", } +Getting help +============ + +Email the friendly folks on the `loopy mailing list `_. + Acknowledgments =============== diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 9138d9a41d7b33db956fd8aba55c0b3b788db064..07b7836d82596892f1d94e336dfa81e1b5a7a881 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -130,6 +130,7 @@ Iname Implementation Tags Tag Meaning =============================== ==================================================== ``None`` | ``"for"`` Sequential loop +``"ord"`` Forced-order sequential loop ``"l.N"`` Local (intra-group) axis N ("local") ``"g.N"`` Group-number axis N ("group") ``"unr"`` Unroll @@ -326,15 +327,25 @@ Expressions Loopy's expressions are a slight superset of the expressions supported by :mod:`pymbolic`. -* ``if`` -* ``elif`` (following an ``if``) -* ``else`` (following an ``if`` / ``elif``) +* ``if(cond, then, else_)`` + +* ``a[[ 8*i + j ]]``: Linear subscripts. + See :class:`loopy.symbolic.LinearSubscript`. + * ``reductions`` - * duplication of reduction inames + See :class:`loopy.symbolic.Reduction`. + * ``reduce`` vs ``simul_reduce`` + * complex-valued arithmetic + * tagging of array access and substitution rule use ("$") + See :class:`loopy.symbolic.TaggedVariable`. + * ``indexof``, ``indexof_vec`` +* ``cast(type, value)``: No parse syntax currently. + See :class:`loopy.symbolic.TypeCast`. + TODO: Functions TODO: Reductions @@ -579,4 +590,15 @@ Do not create :class:`LoopKernel` objects directly. 
Instead, refer to :members: :undoc-members: +Implementation Detail: The Base Array +------------------------------------- + +All array-like data in :mod:`loopy` (such as :class:`GlobalArg` and +:class:`TemporaryVariable`) derive from a single, shared base array type, +described next. + +.. currentmodule:: loopy.kernel.array + +.. autoclass:: ArrayBase + .. vim: tw=75:spell:fdm=marker diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index d293e3ebe998a632bd547f94a67e675ff0592bfb..8bdd17b6295e9328bbbb4acbadd2be7e14ae625b 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -74,6 +74,8 @@ Manipulating Instructions .. autofunction:: add_nosync +.. autofunction:: add_barrier + Registering Library Routines ---------------------------- diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 92ec799f7045cf63dc75d1386d8a51fd7d42954c..69f89548618e86b408a31af240bee84678c859c1 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -122,7 +122,9 @@ always see loopy's view of a kernel by printing it. i: None --------------------------------------------------------------------------- INSTRUCTIONS: - [i] out[i] <- 2*a[i] # insn + for i + out[i] = 2*a[i] {id=insn} + end i --------------------------------------------------------------------------- You'll likely have noticed that there's quite a bit more information here @@ -1105,11 +1107,12 @@ work item: :mod:`loopy` supports two kinds of barriers: -* *Local barriers* ensure consistency of local memory accesses to items within +* *Local barriers* ensure consistency of memory accesses to items within *the same* work group. This synchronizes with all instructions in the work - group. + group. The type of memory (local or global) may be specified by the + :attr:`loopy.instruction.BarrierInstruction.mem_kind`. -* *Global barriers* ensure consistency of global memory accesses +* *Global barriers* ensure consistency of memory accesses across *all* work groups, i.e. 
it synchronizes with every work item executing the kernel. Note that there is no exact equivalent for this kind of barrier in OpenCL. [#global-barrier-note]_ @@ -1118,14 +1121,17 @@ Once a work item has reached a barrier, it waits for everyone that it synchronizes with to reach the barrier before continuing. This means that unless all work items reach the same barrier, the kernel will hang during execution. +Barrier insertion +~~~~~~~~~~~~~~~~~ + By default, :mod:`loopy` inserts local barriers between two instructions when it detects that a dependency involving local memory may occur across work items. To see this in action, take a look at the section on :ref:`local_temporaries`. -In contrast, :mod:`loopy` will *not* insert global barriers automatically. -Global barriers require manual intervention along with some special -post-processing which we describe below. Consider the following kernel, which -attempts to rotate its input to the right by 1 in parallel: +In contrast, :mod:`loopy` will *not* insert global barriers automatically and +instead will report an error if it detects the need for a global barrier. As an +example, consider the following kernel, which attempts to rotate its input to +the right by 1 in parallel: .. doctest:: @@ -1153,8 +1159,22 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: ... MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) -The syntax for a global barrier instruction is ``... gbarrier``. This needs to -be added between the pair of offending instructions. +The syntax for inserting a global barrier instruction is +``... gbarrier``. :mod:`loopy` also supports manually inserting local +barriers. The syntax for a local barrier instruction is ``... lbarrier``. 
+ +Saving temporaries across global barriers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For some platforms (currently only PyOpenCL), :mod:`loopy` implements global +barriers by splitting the kernel into a host side kernel and multiple +device-side kernels. On such platforms, it will be necessary to save non-global +temporaries that are live across kernel calls. This section presents an example +of how to use :func:`loopy.save_and_reload_temporaries` which is helpful for +that purpose. + +Let us start with an example. Consider the kernel from above with a +``... gbarrier`` instruction that has already been inserted. .. doctest:: @@ -1175,17 +1195,16 @@ be added between the pair of offending instructions. ... assumptions="n mod 16 = 0") >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") -When we try to generate code for this, it will still not work. +Here is what happens when we try to generate code for the kernel: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) -To understand what is going on, you need to know that :mod:`loopy` implements -global barriers by splitting the kernel into multiple device-side kernels. The -splitting happens when the instruction schedule is generated. To see the -schedule, we must first call :func:`loopy.get_one_scheduled_kernel`: +This happens due to the kernel splitting done by :mod:`loopy`. The splitting +happens when the instruction schedule is generated. 
To see the schedule, we +should call :func:`loopy.get_one_scheduled_kernel`: >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) >>> print(knl) @@ -1196,11 +1215,11 @@ schedule, we must first call :func:`loopy.get_one_scheduled_kernel`: --------------------------------------------------------------------------- SCHEDULE: 0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[]) - 1: [maketmp] tmp <- arr[i_inner + i_outer*16] + 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} 2: RETURN FROM KERNEL rotate_v2 - 3: ---BARRIER:global--- + 3: ... gbarrier 4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[]) - 5: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp + 5: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate} 6: RETURN FROM KERNEL rotate_v2_0 --------------------------------------------------------------------------- @@ -1234,13 +1253,13 @@ put those instructions into the schedule. --------------------------------------------------------------------------- SCHEDULE: 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) - 1: [maketmp] tmp <- arr[i_inner + i_outer*16] - 2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp + 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} + 2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save} 3: RETURN FROM KERNEL rotate_v2 - 4: ---BARRIER:global--- + 4: ... 
gbarrier 5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[]) - 6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] - 7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp + 6: tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] {id=tmp.reload} + 7: arr[((1 + i_inner + i_outer*16) % n)] = tmp {id=rotate} 8: RETURN FROM KERNEL rotate_v2_0 --------------------------------------------------------------------------- @@ -1280,7 +1299,7 @@ The kernel translates into two OpenCL kernels. arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; } -Executing the kernel does what we expect. +Now we can execute the kernel. >>> arr = cl.array.arange(queue, 16, dtype=np.int32) >>> print(arr) diff --git a/loopy/__init__.py b/loopy/__init__.py index aa1d43172a4bd6472f5974c292c4256946fcf542..7a853d11570226a7a3fe35539f590e7f78ea3f44 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -27,7 +27,7 @@ import six from six.moves import range, zip from loopy.symbolic import ( - TaggedVariable, Reduction, LinearSubscript, ) + TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning @@ -112,7 +112,7 @@ from loopy.transform.ilp import realize_ilp from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries - +from loopy.transform.add_barrier import add_barrier # }}} from loopy.type_inference import infer_unknown_types @@ -145,7 +145,7 @@ from loopy.target.numba import NumbaTarget, NumbaCudaTarget __all__ = [ - "TaggedVariable", "Reduction", "LinearSubscript", + "TaggedVariable", "Reduction", "LinearSubscript", "TypeCast", "auto", @@ -215,6 +215,8 @@ __all__ = [ "save_and_reload_temporaries", + "add_barrier", + # }}} "get_dot_dependency_graph", diff --git a/loopy/check.py b/loopy/check.py index 
741195ae6ac87d01de3a4ac620ce510fd62ff470..7e661b566b15c47ec99e03ffdeb035057602da76 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -60,6 +60,12 @@ def check_identifiers_in_subst_rules(knl): # {{{ sanity checks run pre-scheduling + +# FIXME: Replace with an enum. See +# https://gitlab.tiker.net/inducer/loopy/issues/85 +VALID_NOSYNC_SCOPES = frozenset(["local", "global", "any"]) + + def check_insn_attributes(kernel): all_insn_ids = set(insn.id for insn in kernel.instructions) @@ -76,6 +82,30 @@ def check_insn_attributes(kernel): % (insn.id, ", ".join( insn.depends_on - all_insn_ids))) + no_sync_with_insn_ids = set(id for id, scope in insn.no_sync_with) + if not no_sync_with_insn_ids <= all_insn_ids: + raise LoopyError("insn '%s' has nosync directive with unknown " + "instruction ids: %s" + % (insn.id, + ", ".join(no_sync_with_insn_ids - all_insn_ids))) + + no_sync_with_scopes = set(scope for id, scope in insn.no_sync_with) + if not no_sync_with_scopes <= VALID_NOSYNC_SCOPES: + raise LoopyError("insn '%s' has invalid nosync scopes: %s" + % (insn.id, + ", ".join(no_sync_with_scopes - VALID_NOSYNC_SCOPES))) + + +def check_for_duplicate_insn_ids(knl): + insn_ids = set() + + for insn in knl.instructions: + if not isinstance(insn.id, str): + raise LoopyError("instruction id %r is not a string" % insn.id) + if insn.id in insn_ids: + raise LoopyError("duplicate instruction id: '%s'" % insn.id) + insn_ids.add(insn.id) + def check_loop_priority_inames_known(kernel): for prio in kernel.loop_priority: @@ -114,20 +144,20 @@ def check_for_inactive_iname_access(kernel): def _is_racing_iname_tag(tv, tag): from loopy.kernel.data import (temp_var_scope, - LocalIndexTagBase, GroupIndexTag, ParallelTag, auto) + LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) if tv.scope == temp_var_scope.PRIVATE: return ( - isinstance(tag, ParallelTag) + isinstance(tag, ConcurrentTag) and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag))) elif tv.scope == temp_var_scope.LOCAL: return ( 
- isinstance(tag, ParallelTag) + isinstance(tag, ConcurrentTag) and not isinstance(tag, GroupIndexTag)) elif tv.scope == temp_var_scope.GLOBAL: - return isinstance(tag, ParallelTag) + return isinstance(tag, ConcurrentTag) elif tv.scope == auto: raise LoopyError("scope of temp var '%s' has not yet been" @@ -139,7 +169,7 @@ def _is_racing_iname_tag(tv, tag): def check_for_write_races(kernel): - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ConcurrentTag iname_to_tag = kernel.iname_to_tag.get for insn in kernel.instructions: @@ -160,7 +190,7 @@ def check_for_write_races(kernel): raceable_parallel_insn_inames = set( iname for iname in kernel.insn_inames(insn) - if isinstance(iname_to_tag(iname), ParallelTag)) + if isinstance(iname_to_tag(iname), ConcurrentTag)) elif assignee_name in kernel.temporary_variables: temp_var = kernel.temporary_variables[assignee_name] @@ -200,13 +230,13 @@ def check_for_orphaned_user_hardware_axes(kernel): def check_for_data_dependent_parallel_bounds(kernel): - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ConcurrentTag for i, dom in enumerate(kernel.domains): dom_inames = set(dom.get_var_names(dim_type.set)) par_inames = set(iname for iname in dom_inames - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag)) if not par_inames: continue @@ -356,6 +386,7 @@ def pre_schedule_checks(kernel): try: logger.debug("%s: pre-schedule check: start" % kernel.name) + check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel) check_insn_attributes(kernel) @@ -370,7 +401,7 @@ def pre_schedule_checks(kernel): logger.debug("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: raise - except: + except Exception: print(75*"=") print("failing kernel during pre-schedule check:") print(75*"=") @@ -628,7 +659,7 @@ def pre_codegen_checks(kernel): 
check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) - except: + except Exception: print(75*"=") print("failing kernel during pre-schedule check:") print(75*"=") @@ -677,6 +708,16 @@ def check_implemented_domains(kernel, implemented_domains, code=None): (insn_impl_domain & assumptions) .project_out_except(insn_inames, [dim_type.set])) + from loopy.kernel.instruction import BarrierInstruction + from loopy.kernel.data import LocalIndexTag + if isinstance(insn, BarrierInstruction): + # project out local-id-mapped inames, solves #94 on gitlab + non_lid_inames = frozenset( + [iname for iname in insn_inames if not isinstance( + kernel.iname_to_tag.get(iname), LocalIndexTag)]) + insn_impl_domain = insn_impl_domain.project_out_except( + non_lid_inames, [dim_type.set]) + insn_domain = kernel.get_inames_domain(insn_inames) insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param)) assumptions, insn_domain = align_two(assumption_non_param, insn_domain) @@ -684,6 +725,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None): .project_out_except(insn_inames, [dim_type.set]) .project_out_except(insn_parameters, [dim_type.param])) + if isinstance(insn, BarrierInstruction): + # project out local-id-mapped inames, solves #94 on gitlab + desired_domain = desired_domain.project_out_except( + non_lid_inames, [dim_type.set]) + insn_impl_domain = (insn_impl_domain .project_out_except(insn_parameters, [dim_type.param])) insn_impl_domain, desired_domain = align_two( diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 009dadc1a0d6236f092029dbc03ad0c035c7b8f8..e83515d31f1c61e52569d8d0754ce79e7a7f602f 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -28,7 +28,7 @@ from loopy.diagnostic import LoopyError, warn from pytools import ImmutableRecord import islpy as isl -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import 
WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -357,8 +357,9 @@ class CodeGenerationState(object): # }}} -code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION, - key_builder=LoopyKeyBuilder()) +code_gen_cache = WriteOncePersistentDict( + "loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) class PreambleInfo(ImmutableRecord): @@ -367,6 +368,7 @@ class PreambleInfo(ImmutableRecord): .. attribute:: seen_dtypes .. attribute:: seen_functions .. attribute:: seen_atomic_dtypes + .. attribute:: codegen_state """ @@ -495,7 +497,9 @@ def generate_code_v2(kernel): seen_dtypes=seen_dtypes, seen_functions=seen_functions, # a set of LoopyTypes (!) - seen_atomic_dtypes=seen_atomic_dtypes) + seen_atomic_dtypes=seen_atomic_dtypes, + codegen_state=codegen_state + ) preamble_generators = (kernel.preamble_generators + kernel.target.get_device_ast_builder().preamble_generators()) @@ -507,15 +511,15 @@ def generate_code_v2(kernel): # }}} # For faster unpickling in the common case when implemented_domains isn't needed. 
- from loopy.tools import LazilyUnpicklingDictionary + from loopy.tools import LazilyUnpicklingDict codegen_result = codegen_result.copy( - implemented_domains=LazilyUnpicklingDictionary( + implemented_domains=LazilyUnpicklingDict( codegen_result.implemented_domains)) logger.info("%s: generate code: done" % kernel.name) if CACHING_ENABLED: - code_gen_cache[input_kernel] = codegen_result + code_gen_cache.store_if_not_present(input_kernel, codegen_result) return codegen_result diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 61f4b3a9b8c38dfc25ebc81243812aa963423f8a..f398a063dc41f3f82267f6d4850158e4c45f4733 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -58,7 +58,7 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai def get_usable_inames_for_conditional(kernel, sched_index): from loopy.schedule import ( find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) - from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag + from loopy.kernel.data import ConcurrentTag, LocalIndexTagBase, IlpBaseTag result = find_active_inames_at(kernel, sched_index) crosses_barrier = has_barrier_within(kernel, sched_index) @@ -97,7 +97,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): # at the innermost level of nesting. 
if ( - isinstance(tag, ParallelTag) + isinstance(tag, ConcurrentTag) and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier) and not isinstance(tag, IlpBaseTag) ): diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 789c00d33b7bb41816e6901e24046d4b0eefb27d..e3e209726879741c31d686f2a6530e1b7ec67b97 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -40,7 +40,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index): kernel = codegen_state.kernel - from loopy.kernel.data import LocalIndexTag, HardwareParallelTag + from loopy.kernel.data import LocalIndexTag, HardwareConcurrentTag from loopy.schedule import find_active_inames_at, has_barrier_within result = find_active_inames_at(kernel, sched_index) @@ -48,7 +48,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index): has_barrier = has_barrier_within(kernel, sched_index) for iname, tag in six.iteritems(kernel.iname_to_tag): - if (isinstance(tag, HardwareParallelTag) + if (isinstance(tag, HardwareConcurrentTag) and codegen_state.is_generating_device_code): if not has_barrier or not isinstance(tag, LocalIndexTag): result.add(iname) @@ -135,12 +135,13 @@ def generate_code_for_sched_index(codegen_state, sched_index): generate_sequential_loop_dim_code) from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag, - LoopedIlpTag, VectorizeTag) + LoopedIlpTag, VectorizeTag, InOrderSequentialSequentialTag) if isinstance(tag, (UnrollTag, UnrolledIlpTag)): func = generate_unroll_loop elif isinstance(tag, VectorizeTag): func = generate_vectorize_loop - elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)): + elif tag is None or isinstance(tag, ( + LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)): func = generate_sequential_loop_dim_code else: raise RuntimeError("encountered (invalid) EnterLoop " @@ -155,7 +156,8 @@ def generate_code_for_sched_index(codegen_state, sched_index): if 
codegen_state.is_generating_device_code: barrier_ast = codegen_state.ast_builder.emit_barrier( - sched_item.kind, sched_item.comment) + sched_item.synchronization_kind, sched_item.mem_kind, + sched_item.comment) if sched_item.originating_insn_id: return CodeGenerationResult.new( codegen_state, @@ -166,7 +168,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): return barrier_ast else: # host code - if sched_item.kind in ["global", "local"]: + if sched_item.synchronization_kind in ["global", "local"]: # host code is assumed globally and locally synchronous return CodeGenerationResult( host_program=None, @@ -175,8 +177,9 @@ def generate_code_for_sched_index(codegen_state, sched_index): implemented_data_info=codegen_state.implemented_data_info) else: - raise LoopyError("do not know how to emit code for barrier kind '%s'" - "in host code" % sched_item.kind) + raise LoopyError("do not know how to emit code for barrier " + "synchronization kind '%s'" "in host code" + % sched_item.synchronization_kind) # }}} @@ -240,6 +243,15 @@ def build_loop_nest(codegen_state, schedule_index): kernel = codegen_state.kernel + # If the AST builder does not implement conditionals, we can save us + # some work about hoisting conditionals and directly go into recursion. + if not codegen_state.ast_builder.can_implement_conditionals: + result = [] + inner = generate_code_for_sched_index(codegen_state, schedule_index) + if inner is not None: + result.append(inner) + return merge_codegen_results(codegen_state, result) + # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices # i.e. go up to the next LeaveLoop, and skip over inner loops. 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 0110a06095fa0bd690045f050136027d7bed3a28..1db7b0445efd2a2e27e761164fa919647df37a07 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -90,7 +90,7 @@ def get_slab_decomposition(kernel, iname): iname_rel_aff(space, iname, "<=", upper_bound_aff-upper_incr))) else: - lower_slab = None + upper_slab = None slabs = [] @@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, kernel = codegen_state.kernel from loopy.kernel.data import ( - UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag) + UniqueTag, HardwareConcurrentTag, LocalIndexTag, GroupIndexTag) from loopy.schedule import get_insn_ids_for_block_at insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) @@ -243,7 +243,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, hw_inames_left = [iname for iname in all_inames_by_insns - if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)] + if isinstance(kernel.iname_to_tag.get(iname), HardwareConcurrentTag)] if not hw_inames_left: return next_func(codegen_state) @@ -446,7 +446,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): from loopy.symbolic import pw_aff_to_expr - if ubound.is_equal(lbound): + if impl_ubound.is_equal(impl_lbound): # single-trip, generate just a variable assignment, not a loop inner = merge_codegen_results(codegen_state, [ astb.emit_initializer( diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 5f0884fd44ed5064f3f195d103b164f2163d1d19..5a747d070a47ff89336c22c8237ff03e567d0a8a 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -203,7 +203,7 @@ def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context) if len(pieces) == 1: (_, result), = pieces if constants_only and not result.is_cst(): - raise ValueError("a numeric %s was not found for PwAff '%s'" + raise StaticValueFindingError("a numeric %s was not 
found for PwAff '%s'" % (what, pw_aff)) return result @@ -329,7 +329,7 @@ def is_nonnegative(expr, over_set): from loopy.symbolic import aff_from_expr try: aff = aff_from_expr(space, -expr-1) - except: + except Exception: return None expr_neg_set = isl.BasicSet.universe(space).add_constraint( isl.Constraint.inequality_from_aff(aff)) @@ -616,10 +616,12 @@ def get_simple_strides(bset, key_by="name"): # recognizes constraints of the form # -i0 + 2*floor((i0)/2) == 0 - if aff.dim(dim_type.div) != 1: + divs_with_coeffs = _get_indices_and_coeffs(aff, [dim_type.div]) + if len(divs_with_coeffs) != 1: continue - idiv = 0 + (_, idiv, div_coeff), = divs_with_coeffs + div = aff.get_div(idiv) # check for sub-divs @@ -630,7 +632,7 @@ def get_simple_strides(bset, key_by="name"): denom = div.get_denominator_val().to_python() # if the coefficient in front of the div is not the same as the denominator - if not aff.get_coefficient_val(dim_type.div, idiv).div(denom).is_one(): + if not div_coeff.div(denom).is_one(): # not supported continue diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 622f5e49be1e40b4156113d92907fe8b1d9fb859..88a5717642af6d9ebc1bd7770936ae44e8cbf44b 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -40,6 +40,8 @@ from loopy.library.function import ( single_arg_function_mangler) from loopy.diagnostic import CannotBranchDomainTree, LoopyError +from loopy.tools import natsorted +from loopy.diagnostic import StaticValueFindingError # {{{ unique var names @@ -212,45 +214,17 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=kernel_state.INITIAL, target=None, - # When kernels get intersected in slab decomposition, - # their grid sizes shouldn't change. This provides - # a way to forward sub-kernel grid size requests. overridden_get_grid_sizes_for_insn_ids=None): + """ + :arg overridden_get_grid_sizes_for_insn_ids: A callable. When kernels get + intersected in slab decomposition, their grid sizes shouldn't + change. 
This provides a way to forward sub-kernel grid size requests. + """ if cache_manager is None: from loopy.kernel.tools import SetOperationCacheManager cache_manager = SetOperationCacheManager() - # {{{ make instruction ids unique - - from loopy.kernel.creation import UniqueName - - insn_ids = set() - for insn in instructions: - if insn.id is not None and not isinstance(insn.id, UniqueName): - if insn.id in insn_ids: - raise RuntimeError("duplicate instruction id: %s" % insn.id) - insn_ids.add(insn.id) - - insn_id_gen = UniqueNameGenerator(insn_ids) - - new_instructions = [] - - for insn in instructions: - if insn.id is None: - new_instructions.append( - insn.copy(id=insn_id_gen("insn"))) - elif isinstance(insn.id, UniqueName): - new_instructions.append( - insn.copy(id=insn_id_gen(insn.id.name))) - else: - new_instructions.append(insn) - - instructions = new_instructions - del new_instructions - - # }}} - # {{{ process assumptions if assumptions is None: @@ -729,12 +703,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): tag_key_uses = {} - from loopy.kernel.data import HardwareParallelTag + from loopy.kernel.data import HardwareConcurrentTag for iname in cond_inames: tag = self.iname_to_tag.get(iname) - if isinstance(tag, HardwareParallelTag): + if isinstance(tag, HardwareConcurrentTag): tag_key_uses.setdefault(tag.key, []).append(iname) multi_use_keys = set( @@ -744,7 +718,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): multi_use_inames = set() for iname in cond_inames: tag = self.iname_to_tag.get(iname) - if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys: + if isinstance(tag, HardwareConcurrentTag) and tag.key in multi_use_keys: multi_use_inames.add(iname) return frozenset(cond_inames - multi_use_inames) @@ -986,8 +960,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): try: # insist block size is constant size = static_max_of_pw_aff(size, - constants_only=isinstance(tag, LocalIndexTag)) - except ValueError: + 
constants_only=isinstance(tag, LocalIndexTag), + context=self.assumptions) + except StaticValueFindingError: pass tgt_dict[tag.axis] = size @@ -1156,20 +1131,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): else: sep = [] - def natorder(key): - # Return natural ordering for strings, as opposed to dictionary order. - # E.g. will result in - # 'abc1' < 'abc9' < 'abc10' - # rather than - # 'abc1' < 'abc10' < 'abc9' - # Based on - # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 - import re - return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] - - def natsorted(seq, key=lambda x: x): - return sorted(seq, key=lambda y: natorder(key(y))) - if "name" in what: lines.extend(sep) lines.append("KERNEL: " + kernel.name) @@ -1207,7 +1168,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if "rules" in what and kernel.substitutions: lines.extend(sep) if show_labels: - lines.append("SUBSTIUTION RULES:") + lines.append("SUBSTITUTION RULES:") for rule_name in natsorted(six.iterkeys(kernel.substitutions)): lines.append(str(kernel.substitutions[rule_name])) @@ -1215,113 +1176,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): lines.extend(sep) if show_labels: lines.append("INSTRUCTIONS:") - loop_list_width = 35 - - # {{{ topological sort - printed_insn_ids = set() - printed_insn_order = [] - - def insert_insn_into_order(insn): - if insn.id in printed_insn_ids: - return - printed_insn_ids.add(insn.id) - - for dep_id in natsorted(insn.depends_on): - insert_insn_into_order(kernel.id_to_insn[dep_id]) - - printed_insn_order.append(insn) - - for insn in kernel.instructions: - insert_insn_into_order(insn) - - # }}} - - import loopy as lp - - Fore = self.options._fore # noqa - Style = self.options._style # noqa - - from loopy.kernel.tools import draw_dependencies_as_unicode_arrows - for insn, (arrows, extender) in zip( - printed_insn_order, - draw_dependencies_as_unicode_arrows( - printed_insn_order, fore=Fore, style=Style)): - - if 
isinstance(insn, lp.MultiAssignmentBase): - lhs = ", ".join(str(a) for a in insn.assignees) - rhs = str(insn.expression) - trailing = [] - elif isinstance(insn, lp.CInstruction): - lhs = ", ".join(str(a) for a in insn.assignees) - rhs = "CODE(%s|%s)" % ( - ", ".join(str(x) for x in insn.read_variables), - ", ".join("%s=%s" % (name, expr) - for name, expr in insn.iname_exprs)) - - trailing = [" "+l for l in insn.code.split("\n")] - elif isinstance(insn, lp.BarrierInstruction): - lhs = "" - rhs = "... %sbarrier" % insn.kind[0] - trailing = [] - - elif isinstance(insn, lp.NoOpInstruction): - lhs = "" - rhs = "... nop" - trailing = [] - - else: - raise LoopyError("unexpected instruction type: %s" - % type(insn).__name__) - - order = self._get_iname_order_for_printing() - loop_list = ",".join( - sorted(kernel.insn_inames(insn), key=lambda iname: order[iname])) - - options = [Fore.GREEN+insn.id+Style.RESET_ALL] - if insn.priority: - options.append("priority=%d" % insn.priority) - if insn.tags: - options.append("tags=%s" % ":".join(insn.tags)) - if isinstance(insn, lp.Assignment) and insn.atomicity: - options.append("atomic=%s" % ":".join( - str(a) for a in insn.atomicity)) - if insn.groups: - options.append("groups=%s" % ":".join(insn.groups)) - if insn.conflicts_with_groups: - options.append( - "conflicts=%s" % ":".join(insn.conflicts_with_groups)) - if insn.no_sync_with: - options.append("no_sync_with=%s" % ":".join( - "%s@%s" % entry for entry in sorted(insn.no_sync_with))) - - if lhs: - core = "%s <- %s" % ( - Fore.CYAN+lhs+Style.RESET_ALL, - Fore.MAGENTA+rhs+Style.RESET_ALL, - ) - else: - core = Fore.MAGENTA+rhs+Style.RESET_ALL - - if len(loop_list) > loop_list_width: - lines.append("%s [%s]" % (arrows, loop_list)) - lines.append("%s %s%s # %s" % ( - extender, - (loop_list_width+2)*" ", - core, - ", ".join(options))) - else: - lines.append("%s [%s]%s%s # %s" % ( - arrows, - loop_list, " "*(loop_list_width-len(loop_list)), - core, - ",".join(options))) - - 
lines.extend(trailing) - - if insn.predicates: - lines.append(10*" " + "if (%s)" % " && ".join( - [str(x) for x in insn.predicates])) + from loopy.kernel.tools import stringify_instruction_list + lines.extend(stringify_instruction_list(kernel)) dep_lines = [] for insn in kernel.instructions: @@ -1502,6 +1359,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return hash(key_hash.digest()) def __eq__(self, other): + if self is other: + return True + if not isinstance(other, LoopKernel): return False @@ -1515,7 +1375,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return False elif field_name == "assumptions": - if not self.assumptions.plain_is_equal(other.assumptions): + if not ( + self.assumptions.plain_is_equal(other.assumptions) + or self.assumptions.is_equal(other.assumptions)): return False elif getattr(self, field_name) != getattr(other, field_name): diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 531cc822e1bc76573ef6e0812970d16bd6df0b17..5d4240b9ab3e1ce2ad356a93b5e21b3bbf4d499e 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -567,6 +567,14 @@ class ArrayBase(ImmutableRecord): informational/documentational purpose. On occasion, they are used to generate more informative names than could be achieved by axis numbers. + + .. automethod:: __init__ + .. automethod:: __eq__ + .. automethod:: num_user_axes + .. automethod:: num_target_axes + .. automethod:: vector_size + + (supports persistent hashing) """ # Note that order may also wind up in attributes, if the @@ -579,7 +587,8 @@ class ArrayBase(ImmutableRecord): target=None, **kwargs): """ - All of the following are optional. Specify either strides or shape. + All of the following (except *name*) are optional. + Specify either strides or shape. 
:arg name: May contain multiple names separated by commas, in which case multiple arguments, @@ -643,8 +652,9 @@ class ArrayBase(ImmutableRecord): :arg offset: Offset from the beginning of the buffer to the point from which the strides are counted. May be one of - * 0 + * 0 or None * a string (that is interpreted as an argument name). + * a pymbolic expression * :class:`loopy.auto`, in which case an offset argument is added automatically, immediately following this argument. :class:`loopy.CompiledKernel` is even smarter in its treatment of @@ -877,6 +887,7 @@ class ArrayBase(ImmutableRecord): :class:`pytools.persistent_dict.PersistentDict`. """ + key_builder.rec(key_hash, type(self).__name__.encode("utf-8")) key_builder.rec(key_hash, self.name) key_builder.rec(key_hash, self.dtype) self.update_persistent_hash_for_shape(key_hash, key_builder, self.shape) @@ -1039,7 +1050,9 @@ class ArrayBase(ImmutableRecord): is_written=is_written) - if self.offset: + import loopy as lp + + if self.offset is lp.auto: offset_name = full_name+"_offset" yield ImplementedDataInfo( target=target, @@ -1205,12 +1218,16 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info): return result def apply_offset(sub): - if ary.offset: - offset_name = ary.offset - if offset_name is lp.auto: - offset_name = array_name+"_offset" + import loopy as lp - return var(offset_name) + sub + if ary.offset: + if ary.offset is lp.auto: + return var(array_name+"_offset") + sub + elif isinstance(ary.offset, str): + return var(ary.offset) + sub + else: + # assume it's an expression + return ary.offset + sub else: return sub diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 89cb5f26a4940656cca1ab09841311148e113275..fb935476d54b3f9eb0a3bf858c883fe4c75eaa5a 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -51,9 +51,14 @@ logger = logging.getLogger(__name__) _IDENTIFIER_RE = re.compile(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\b") +# source: check_keywords() in 
isl_stream.c, ISL version 0.17 +_ISL_KEYWORDS = frozenset(""" + exists and or implies not infty infinity NaN min max rat true false ceild + floord mod ceil floor""".split()) + def _gather_isl_identifiers(s): - return set(_IDENTIFIER_RE.findall(s)) - set(["and", "or", "exists"]) + return set(_IDENTIFIER_RE.findall(s)) - _ISL_KEYWORDS class UniqueName: @@ -352,6 +357,14 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): % v) del assignee_name + elif opt_key == "mem_kind": + opt_value = opt_value.lower().strip() + if opt_value not in ['local', 'global']: + raise LoopyError("Unknown memory synchronization type %s specified" + " expected, 'local' or 'global'." + % opt_value) + result["mem_kind"] = opt_value + else: raise ValueError( "unrecognized instruction option '%s' " @@ -420,6 +433,17 @@ SUBST_RE = re.compile( r"^\s*(?P.+?)\s*:=\s*(?P.+)\s*$") +def check_illegal_options(insn_options, insn_type): + illegal_options = [] + if insn_type not in ['gbarrier', 'lbarrier']: + illegal_options.append('mem_kind') + + bad_options = [x for x in illegal_options if x in insn_options] + if bad_options: + raise LoopyError("Cannot supply option(s) '%s' to instruction type '%s'" % + ', '.join(bad_options), insn_type) + + def parse_insn(groups, insn_options): """ :return: a tuple ``(insn, inames_to_dup)``, where insn is a @@ -434,7 +458,7 @@ def parse_insn(groups, insn_options): if "lhs" in groups: try: lhs = parse(groups["lhs"]) - except: + except Exception: print("While parsing left hand side '%s', " "the following error occurred:" % groups["lhs"]) raise @@ -443,7 +467,7 @@ def parse_insn(groups, insn_options): try: rhs = parse(groups["rhs"]) - except: + except Exception: print("While parsing right hand side '%s', " "the following error occurred:" % groups["rhs"]) raise @@ -493,6 +517,9 @@ def parse_insn(groups, insn_options): groups["options"], assignee_names=assignee_names) + # check for bad options + check_illegal_options(insn_options, 'assignment') + insn_id = 
insn_options.pop("insn_id", None) inames_to_dup = insn_options.pop("inames_to_dup", []) @@ -517,14 +544,14 @@ def parse_subst_rule(groups): from loopy.symbolic import parse try: lhs = parse(groups["lhs"]) - except: + except Exception: print("While parsing left hand side '%s', " "the following error occurred:" % groups["lhs"]) raise try: rhs = parse(groups["rhs"]) - except: + except Exception: print("While parsing right hand side '%s', " "the following error occurred:" % groups["rhs"]) raise @@ -578,13 +605,15 @@ def parse_special_insn(groups, insn_options): from loopy.kernel.instruction import NoOpInstruction, BarrierInstruction special_insn_kind = groups["kind"] + # check for bad options + check_illegal_options(insn_options, special_insn_kind) if special_insn_kind == "gbarrier": cls = BarrierInstruction - kwargs["kind"] = "global" + kwargs["synchronization_kind"] = "global" elif special_insn_kind == "lbarrier": cls = BarrierInstruction - kwargs["kind"] = "local" + kwargs["synchronization_kind"] = "local" elif special_insn_kind == "nop": cls = NoOpInstruction else: @@ -792,6 +821,8 @@ def parse_instructions(instructions, defines): parse_insn_options( insn_options_stack[-1], with_options_match.group("options"))) + # check for bad options + check_illegal_options(insn_options_stack[-1], 'with-block') continue for_match = FOR_RE.match(insn) @@ -896,7 +927,8 @@ def parse_instructions(instructions, defines): obj = insn_options_stack.pop() #if this object is the end of an if statement if obj['predicates'] == if_predicates_stack[-1]["insn_predicates"] and\ - if_predicates_stack[-1]["insn_predicates"]: + if_predicates_stack[-1]["insn_predicates"] and\ + obj['within_inames'] == if_predicates_stack[-1]['within_inames']: if_predicates_stack.pop() continue @@ -991,7 +1023,7 @@ def parse_domains(domains, defines): try: dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom) - except: + except Exception: print("failed to parse domain '%s'" % dom) raise else: @@ -1859,6 
+1891,13 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): :arg seq_dependencies: If *True*, dependencies that sequentially connect the given *instructions* will be added. Defaults to *False*. + :arg fixed_parameters: A dictionary of *name*/*value* pairs, where *name* + will be fixed to *value*. *name* may refer to :ref:`domain-parameters` + or :ref:`arguments`. See also :func:`loopy.fix_parameters`. + + .. versionchanged:: 2017.2 + + *fixed_parameters* added. .. versionchanged:: 2016.3 @@ -1876,6 +1915,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): flags = kwargs.pop("flags", None) target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) + fixed_parameters = kwargs.pop("fixed_parameters", {}) if defines: from warnings import warn @@ -1976,6 +2016,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target=target, **kwargs) + from loopy.transform.instruction import uniquify_instruction_ids + knl = uniquify_instruction_ids(knl) + from loopy.check import check_for_duplicate_insn_ids + check_for_duplicate_insn_ids(knl) + if seq_dependencies: knl = add_sequential_dependencies(knl) @@ -1996,11 +2041,14 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # ------------------------------------------------------------------------- # Must create temporaries before inferring inames (because those temporaries # mediate dependencies that are then used for iname propagation.) + # Must create temporaries before fixing parameters. # ------------------------------------------------------------------------- knl = add_used_inames(knl) # NOTE: add_inferred_inames will be phased out and throws warnings if it # does something. 
knl = add_inferred_inames(knl) + from loopy.transform.parameter import fix_parameters + knl = fix_parameters(knl, **fixed_parameters) # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 94b31df12dae516d3539438b7e4ed66ed765e697..96933f57a003aaca58ed00d2d73c3301b0c448c7 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -77,14 +77,19 @@ class IndexTag(ImmutableRecord): return type(self).__name__ -class ParallelTag(IndexTag): +class ConcurrentTag(IndexTag): pass -class HardwareParallelTag(ParallelTag): +class HardwareConcurrentTag(ConcurrentTag): pass +# deprecated aliases +ParallelTag = ConcurrentTag +HardwareParallelTag = HardwareConcurrentTag + + class UniqueTag(IndexTag): pass @@ -105,11 +110,11 @@ class AxisTag(UniqueTag): self.print_name, self.axis) -class GroupIndexTag(HardwareParallelTag, AxisTag): +class GroupIndexTag(HardwareConcurrentTag, AxisTag): print_name = "g" -class LocalIndexTagBase(HardwareParallelTag): +class LocalIndexTagBase(HardwareConcurrentTag): pass @@ -130,7 +135,7 @@ class AutoFitLocalIndexTag(AutoLocalIndexTagBase): # {{{ ilp-like -class IlpBaseTag(ParallelTag): +class IlpBaseTag(ConcurrentTag): pass @@ -161,6 +166,11 @@ class ForceSequentialTag(IndexTag): return "forceseq" +class InOrderSequentialSequentialTag(IndexTag): + def __str__(self): + return "ord" + + def parse_tag(tag): if tag is None: return tag @@ -173,6 +183,8 @@ def parse_tag(tag): if tag == "for": return None + elif tag == "ord": + return InOrderSequentialSequentialTag() elif tag in ["unr"]: return UnrollTag() elif tag in ["vec"]: @@ -346,6 +358,14 @@ class TemporaryVariable(ArrayBase): A :class:`bool` indicating whether the variable may be written during its lifetime. If *True*, *initializer* must be given. + + .. 
attribute:: _base_storage_access_may_be_aliasing + + Whether the temporary is used to alias the underlying base storage. + Defaults to *False*. If *False*, C-based code generators will declare + the temporary as a ``restrict`` const pointer to the base storage + memory location. If *True*, the restrict part is omitted on this + declaration. """ min_target_axes = 0 @@ -358,12 +378,14 @@ class TemporaryVariable(ArrayBase): "base_storage", "initializer", "read_only", + "_base_storage_access_may_be_aliasing", ] def __init__(self, name, dtype=None, shape=(), scope=auto, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, base_indices=None, storage_shape=None, - base_storage=None, initializer=None, read_only=False, **kwargs): + base_storage=None, initializer=None, read_only=False, + _base_storage_access_may_be_aliasing=False, **kwargs): """ :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype` :arg shape: :class:`loopy.auto` or a shape tuple @@ -419,6 +441,13 @@ class TemporaryVariable(ArrayBase): "mutually exclusive" % name) + if base_storage is None and _base_storage_access_may_be_aliasing: + raise LoopyError( + "temporary variable '%s': " + "_base_storage_access_may_be_aliasing option, but no " + "base_storage given!" 
+ % name) + ArrayBase.__init__(self, name=intern(name), dtype=dtype, shape=shape, dim_tags=dim_tags, offset=offset, dim_names=dim_names, @@ -428,6 +457,8 @@ class TemporaryVariable(ArrayBase): base_storage=base_storage, initializer=initializer, read_only=read_only, + _base_storage_access_may_be_aliasing=( + _base_storage_access_may_be_aliasing), **kwargs) @property @@ -489,7 +520,10 @@ class TemporaryVariable(ArrayBase): and ( (self.initializer is None and other.initializer is None) or np.array_equal(self.initializer, other.initializer)) - and self.read_only == other.read_only) + and self.read_only == other.read_only + and (self._base_storage_access_may_be_aliasing + == other._base_storage_access_may_be_aliasing) + ) def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with @@ -500,6 +534,8 @@ class TemporaryVariable(ArrayBase): self.update_persistent_hash_for_shape(key_hash, key_builder, self.storage_shape) key_builder.rec(key_hash, self.base_indices) + key_builder.rec(key_hash, self.scope) + key_builder.rec(key_hash, self.base_storage) initializer = self.initializer if initializer is not None: @@ -507,10 +543,22 @@ class TemporaryVariable(ArrayBase): key_builder.rec(key_hash, initializer) key_builder.rec(key_hash, self.read_only) + key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing) # }}} +def iname_tag_to_temp_var_scope(iname_tag): + iname_tag = parse_tag(iname_tag) + + if isinstance(iname_tag, GroupIndexTag): + return temp_var_scope.GLOBAL + elif isinstance(iname_tag, LocalIndexTag): + return temp_var_scope.LOCAL + else: + return temp_var_scope.PRIVATE + + # {{{ substitution rule class SubstitutionRule(ImmutableRecord): diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index d5c388af60a39987c09092fc93325f067a8f4cf7..dbd99e85016b00b3df4827ad7999e7b57e58af24 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -152,6 +152,12 @@ class 
InstructionBase(ImmutableRecord): "within_inames_is_final within_inames " "priority boostable boostable_into".split()) + # Names of fields that are pymbolic expressions. Needed for key building + pymbolic_fields = set("") + + # Names of fields that are sets of pymbolic expressions. Needed for key building + pymbolic_set_fields = set(["predicates"]) + def __init__(self, id, depends_on, depends_on_is_final, groups, conflicts_with_groups, no_sync_with, @@ -407,7 +413,27 @@ class InstructionBase(ImmutableRecord): return result - # {{{ comparison, hashing + # {{{ hashing and key building + + @property + @memoize_method + def _key_builder(self): + from loopy.tools import LoopyEqKeyBuilder + key_builder = LoopyEqKeyBuilder() + key_builder.update_for_class(self.__class__) + + for field_name in self.fields: + field_value = getattr(self, field_name) + if field_name in self.pymbolic_fields: + key_builder.update_for_pymbolic_field(field_name, field_value) + elif field_name in self.pymbolic_set_fields: + # First sort the fields, as a canonical form + items = tuple(sorted(field_value, key=str)) + key_builder.update_for_pymbolic_field(field_name, items) + else: + key_builder.update_for_field(field_name, field_value) + + return key_builder def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with @@ -416,9 +442,7 @@ class InstructionBase(ImmutableRecord): Only works in conjunction with :class:`loopy.tools.KeyBuilder`. 
""" - # Order matters for hash forming--sort the field names - for field_name in sorted(self.fields): - key_builder.rec(key_hash, getattr(self, field_name)) + key_builder.rec(key_hash, self._key_builder.hash_key()) # }}} @@ -648,6 +672,7 @@ class MultiAssignmentBase(InstructionBase): """An assignment instruction with an expression as a right-hand side.""" fields = InstructionBase.fields | set(["expression"]) + pymbolic_fields = InstructionBase.pymbolic_fields | set(["expression"]) @memoize_method def read_dependency_names(self): @@ -734,6 +759,7 @@ class Assignment(MultiAssignmentBase): fields = MultiAssignmentBase.fields | \ set("assignee temp_var_type atomicity".split()) + pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignee"]) def __init__(self, assignee, expression, @@ -818,26 +844,6 @@ class Assignment(MultiAssignmentBase): result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) return result - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - - # Order matters for hash forming--sort the fields. 
- for field_name in sorted(self.fields): - if field_name in ["assignee", "expression"]: - key_builder.update_for_pymbolic_expression( - key_hash, getattr(self, field_name)) - elif field_name == "predicates": - preds = sorted(self.predicates, key=str) - for pred in preds: - key_builder.update_for_pymbolic_expression( - key_hash, pred) - else: - key_builder.rec(key_hash, getattr(self, field_name)) - # {{{ for interface uniformity with CallInstruction @property @@ -886,6 +892,7 @@ class CallInstruction(MultiAssignmentBase): fields = MultiAssignmentBase.fields | \ set("assignees temp_var_types".split()) + pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignees"]) def __init__(self, assignees, expression, @@ -987,26 +994,6 @@ class CallInstruction(MultiAssignmentBase): result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) return result - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - - # Order matters for hash forming--sort the fields. 
- for field_name in sorted(self.fields): - if field_name in ["assignees", "expression"]: - key_builder.update_for_pymbolic_expression( - key_hash, getattr(self, field_name)) - elif field_name == "predicates": - preds = sorted(self.predicates, key=str) - for pred in preds: - key_builder.update_for_pymbolic_expression( - key_hash, pred) - else: - key_builder.rec(key_hash, getattr(self, field_name)) - @property def atomicity(self): # Function calls can impossibly be atomic, and even the result assignment @@ -1086,6 +1073,10 @@ class CInstruction(InstructionBase): fields = InstructionBase.fields | \ set("iname_exprs code read_variables assignees".split()) + pymbolic_fields = InstructionBase.pymbolic_fields | \ + set("iname_exprs assignees".split()) + pymbolic_set_fields = InstructionBase.pymbolic_set_fields | \ + set(["read_variables"]) def __init__(self, iname_exprs, code, @@ -1210,25 +1201,6 @@ class CInstruction(InstructionBase): return first_line + "\n " + "\n ".join( self.code.split("\n")) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - - # Order matters for hash forming--sort the fields. - for field_name in sorted(self.fields): - if field_name == "assignees": - for a in self.assignees: - key_builder.update_for_pymbolic_expression(key_hash, a) - elif field_name == "iname_exprs": - for name, val in self.iname_exprs: - key_builder.rec(key_hash, name) - key_builder.update_for_pymbolic_expression(key_hash, val) - else: - key_builder.rec(key_hash, getattr(self, field_name)) - # }}} @@ -1308,19 +1280,29 @@ class NoOpInstruction(_DataObliviousInstruction): class BarrierInstruction(_DataObliviousInstruction): """An instruction that requires synchronization with all - concurrent work items of :attr:`kind`. + concurrent work items of :attr:`synchronization_kind`. - .. 
attribute:: kind + .. attribute:: synchronization_kind A string, ``"global"`` or ``"local"``. + .. attribute:: mem_kind + + A string, ``"global"`` or ``"local"``. Chooses which memory type to + sychronize, for targets that require this (e.g. OpenCL) + The textual syntax in a :mod:`loopy` kernel is:: ... gbarrier ... lbarrier + + Note that the memory type :attr:`mem_kind` can be specified for local barriers:: + + ... lbarrier {mem_kind=global} """ - fields = _DataObliviousInstruction.fields | set(["kind"]) + fields = _DataObliviousInstruction.fields | set(["synchronization_kind", + "mem_kind"]) def __init__(self, id, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, @@ -1328,7 +1310,8 @@ class BarrierInstruction(_DataObliviousInstruction): within_inames_is_final=None, within_inames=None, priority=None, boostable=None, boostable_into=None, - predicates=None, tags=None, kind="global"): + predicates=None, tags=None, synchronization_kind="global", + mem_kind="local"): if predicates: raise LoopyError("conditional barriers are not supported") @@ -1346,20 +1329,32 @@ class BarrierInstruction(_DataObliviousInstruction): boostable=boostable, boostable_into=boostable_into, predicates=predicates, - tags=tags, + tags=tags ) - self.kind = kind + self.synchronization_kind = synchronization_kind + self.mem_kind = mem_kind def __str__(self): - first_line = "%s: ... %sbarrier" % (self.id, self.kind[0]) + first_line = "%s: ... 
%sbarrier" % (self.id, self.synchronization_kind[0]) options = self.get_str_options() + if self.synchronization_kind == "local": + # add the memory kind + options += ['mem_kind={}'.format(self.mem_kind)] if options: first_line += " {%s}" % (": ".join(options)) return first_line + @property + def kind(self): + from warnings import warn + warn("BarrierInstruction.kind is deprecated, use synchronization_kind " + "instead", DeprecationWarning, stacklevel=2) + return self.synchronization_kind + # }}} + # vim: foldmethod=marker diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 8bdc72d54a91c6e8b4f9ec0ca3053831627d3eae..a65e7fb4ceefd28a909dcb6cee24ea437f15a60e 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -35,7 +35,7 @@ import islpy as isl from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg - +from loopy.tools import natsorted import logging logger = logging.getLogger(__name__) @@ -620,11 +620,11 @@ class DomainParameterFinder(object): if dep.name in param_names: from pymbolic.algorithm import solve_affine_equations_for try: - # friggin' overkill :) + # overkill :) param_expr = solve_affine_equations_for( [dep.name], [(shape_i, var("shape_i"))] )[dep.name] - except: + except Exception: # went wrong? oh well pass else: @@ -1070,7 +1070,7 @@ def guess_var_shape(kernel, var_name): if n_axes == 1: # Leave shape undetermined--we can live with that for 1D. 
- shape = (None,) + shape = None else: raise LoopyError("cannot determine access range for '%s': " "undetermined index in subscript(s) '%s'" @@ -1092,7 +1092,7 @@ def guess_var_shape(kernel, var_name): kernel.cache_manager.dim_max( armap.access_range, i) + 1, constants_only=False))) - except: + except Exception: print("While trying to find shape axis %d of " "variable '%s', the following " "exception occurred:" % (i, var_name), @@ -1371,7 +1371,170 @@ def draw_dependencies_as_unicode_arrows( conform_to_uniform_length(extender)) for row, extender in rows] - return rows + return uniform_length, rows + +# }}} + + +# {{{ stringify_instruction_list + +def stringify_instruction_list(kernel): + # {{{ topological sort + + printed_insn_ids = set() + printed_insn_order = [] + + def insert_insn_into_order(insn): + if insn.id in printed_insn_ids: + return + printed_insn_ids.add(insn.id) + + for dep_id in natsorted(insn.depends_on): + insert_insn_into_order(kernel.id_to_insn[dep_id]) + + printed_insn_order.append(insn) + + for insn in kernel.instructions: + insert_insn_into_order(insn) + + # }}} + + import loopy as lp + + Fore = kernel.options._fore # noqa + Style = kernel.options._style # noqa + + uniform_arrow_length, arrows_and_extenders = \ + draw_dependencies_as_unicode_arrows( + printed_insn_order, fore=Fore, style=Style) + + leader = " " * uniform_arrow_length + lines = [] + current_inames = [set()] + + if uniform_arrow_length: + indent_level = [1] + else: + indent_level = [0] + + indent_increment = 2 + + iname_order = kernel._get_iname_order_for_printing() + + def add_pre_line(s): + lines.append(leader + " " * indent_level[0] + s) + + def add_main_line(s): + lines.append(arrows + " " * indent_level[0] + s) + + def add_post_line(s): + lines.append(extender + " " * indent_level[0] + s) + + def adapt_to_new_inames_list(new_inames): + added = [] + removed = [] + + # FIXME: Doesn't respect strict nesting + for iname in iname_order: + is_in_current = iname in 
current_inames[0] + is_in_new = iname in new_inames + + if is_in_new == is_in_current: + pass + elif is_in_new and not is_in_current: + added.append(iname) + elif not is_in_new and is_in_current: + removed.append(iname) + else: + assert False + + if removed: + indent_level[0] -= indent_increment * len(removed) + add_pre_line("end " + ", ".join(removed)) + if added: + add_pre_line("for " + ", ".join(added)) + indent_level[0] += indent_increment * len(added) + + current_inames[0] = new_inames + + for insn, (arrows, extender) in zip(printed_insn_order, arrows_and_extenders): + if isinstance(insn, lp.MultiAssignmentBase): + lhs = ", ".join(str(a) for a in insn.assignees) + rhs = str(insn.expression) + trailing = [] + elif isinstance(insn, lp.CInstruction): + lhs = ", ".join(str(a) for a in insn.assignees) + rhs = "CODE(%s|%s)" % ( + ", ".join(str(x) for x in insn.read_variables), + ", ".join("%s=%s" % (name, expr) + for name, expr in insn.iname_exprs)) + + trailing = [l for l in insn.code.split("\n")] + elif isinstance(insn, lp.BarrierInstruction): + lhs = "" + rhs = "... %sbarrier" % insn.synchronization_kind[0] + trailing = [] + + elif isinstance(insn, lp.NoOpInstruction): + lhs = "" + rhs = "... 
nop" + trailing = [] + + else: + raise LoopyError("unexpected instruction type: %s" + % type(insn).__name__) + + adapt_to_new_inames_list(kernel.insn_inames(insn)) + + options = ["id="+Fore.GREEN+insn.id+Style.RESET_ALL] + if insn.priority: + options.append("priority=%d" % insn.priority) + if insn.tags: + options.append("tags=%s" % ":".join(insn.tags)) + if isinstance(insn, lp.Assignment) and insn.atomicity: + options.append("atomic=%s" % ":".join( + str(a) for a in insn.atomicity)) + if insn.groups: + options.append("groups=%s" % ":".join(insn.groups)) + if insn.conflicts_with_groups: + options.append( + "conflicts=%s" % ":".join(insn.conflicts_with_groups)) + if insn.no_sync_with: + options.append("no_sync_with=%s" % ":".join( + "%s@%s" % entry for entry in sorted(insn.no_sync_with))) + if isinstance(insn, lp.BarrierInstruction) and \ + insn.synchronization_kind == 'local': + options.append('mem_kind=%s' % insn.mem_kind) + + if lhs: + core = "%s = %s" % ( + Fore.CYAN+lhs+Style.RESET_ALL, + Fore.MAGENTA+rhs+Style.RESET_ALL, + ) + else: + core = Fore.MAGENTA+rhs+Style.RESET_ALL + + options_str = " {%s}" % ", ".join(options) + + if insn.predicates: + # FIXME: precedence + add_pre_line("if %s" % " and ".join([str(x) for x in insn.predicates])) + indent_level[0] += indent_increment + + add_main_line(core + options_str) + + for t in trailing: + add_post_line(t) + + if insn.predicates: + indent_level[0] -= indent_increment + add_post_line("end") + + leader = extender + + adapt_to_new_inames_list([]) + + return lines # }}} @@ -1394,7 +1557,8 @@ def get_global_barrier_order(kernel): def is_barrier(my_insn_id): insn = kernel.id_to_insn[my_insn_id] from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and insn.kind == "global" + return isinstance(insn, BarrierInstruction) and \ + insn.synchronization_kind == "global" while unvisited: stack = [unvisited.pop()] @@ -1487,7 +1651,8 @@ def find_most_recent_global_barrier(kernel, 
insn_id): def is_barrier(my_insn_id): insn = kernel.id_to_insn[my_insn_id] from loopy.kernel.instruction import BarrierInstruction - return isinstance(insn, BarrierInstruction) and insn.kind == "global" + return isinstance(insn, BarrierInstruction) and \ + insn.synchronization_kind == "global" global_barrier_to_ordinal = dict( (b, i) for i, b in enumerate(global_barrier_order)) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ced1aaaa13ed8275c1e3a376d1c24895287b3239..ac7ac19887388649670154fcd36eba79ba3b4315 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -30,7 +30,7 @@ from loopy.diagnostic import ( import islpy as isl -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -292,7 +292,7 @@ def _classify_reduction_inames(kernel, inames): from loopy.kernel.data import ( LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, - ParallelTag) + ConcurrentTag) for iname in inames: iname_tag = kernel.iname_to_tag.get(iname) @@ -305,7 +305,7 @@ def _classify_reduction_inames(kernel, inames): elif isinstance(iname_tag, LocalIndexTagBase): local_par.append(iname) - elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): + elif isinstance(iname_tag, (ConcurrentTag, VectorizeTag)): nonlocal_par.append(iname) else: @@ -610,7 +610,7 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): if len(coeffs) == 0: try: scan_iname_aff.get_constant_val() - except: + except Exception: raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff) # If this point is reached we're assuming the domain is of the form @@ -956,7 +956,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nresults=nresults, depends_on=insn.depends_on, within_inames=insn.within_inames | expr.inames, - within_inames_is_final=insn.within_inames_is_final) + 
within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) newly_generated_insn_id_set.add(get_args_insn_id) @@ -970,7 +972,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return updated_inner_exprs def expand_inner_reduction(id, expr, nresults, depends_on, within_inames, - within_inames_is_final): + within_inames_is_final, predicates): # FIXME: use make_temporaries from pymbolic.primitives import Call from loopy.symbolic import Reduction @@ -997,7 +999,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=expr, depends_on=depends_on, within_inames=within_inames, - within_inames_is_final=within_inames_is_final) + within_inames_is_final=within_inames_is_final, + predicates=predicates) generated_insns.append(call_insn) @@ -1038,7 +1041,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes)) + expression=expr.operation.neutral_element(*arg_dtypes), + predicates=insn.predicates,) generated_insns.append(init_insn) @@ -1064,7 +1068,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nresults=nresults, depends_on=insn.depends_on, within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) reduction_insn_depends_on.add(get_args_insn_id) else: @@ -1079,7 +1085,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_expr), depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, within_inames=update_insn_iname_deps, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates,) 
generated_insns.append(reduction_insn) @@ -1186,7 +1193,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset()) + depends_on=frozenset(), + predicates=insn.predicates, + ) generated_insns.append(init_insn) init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) @@ -1196,7 +1205,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset()) + depends_on=frozenset(), + predicates=insn.predicates, + ) generated_insns.append(init_neutral_insn) transfer_depends_on = set([init_neutral_id, init_id]) @@ -1216,7 +1227,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) transfer_depends_on.add(get_args_insn_id) else: @@ -1239,7 +1252,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, | frozenset([red_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on, - no_sync_with=frozenset([(init_id, "any")])) + no_sync_with=frozenset([(init_id, "any")]), + predicates=insn.predicates, + ) generated_insns.append(transfer_insn) cur_size = 1 @@ -1280,6 +1295,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([prev_id]), + predicates=insn.predicates, ) generated_insns.append(stage_insn) @@ -1398,7 +1414,9 @@ def 
realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes)) + expression=expr.operation.neutral_element(*arg_dtypes), + predicates=insn.predicates, + ) generated_insns.append(init_insn) @@ -1425,7 +1443,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, depends_on=frozenset(update_insn_depends_on), within_inames=update_insn_iname_deps, no_sync_with=insn.no_sync_with, - within_inames_is_final=insn.within_inames_is_final) + within_inames_is_final=insn.within_inames_is_final, + predicates=insn.predicates, + ) generated_insns.append(scan_insn) @@ -1531,7 +1551,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - depends_on=init_insn_depends_on) + depends_on=init_insn_depends_on, + predicates=insn.predicates, + ) generated_insns.append(init_insn) transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on @@ -1561,7 +1583,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset(transfer_insn_depends_on), - no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with) + no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with, + predicates=insn.predicates, + ) generated_insns.append(transfer_insn) @@ -1590,7 +1614,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id])) + depends_on=frozenset([prev_id]), + predicates=insn.predicates, + ) if cur_size == 1: # Performance 
hack: don't add a barrier here with transfer_insn. @@ -1623,6 +1649,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([prev_id]), + predicates=insn.predicates, ) generated_insns.append(write_stage_insn) @@ -1928,7 +1955,7 @@ def find_idempotence(kernel): for insn in kernel.instructions) from collections import defaultdict - dep_graph = defaultdict(lambda: set()) + dep_graph = defaultdict(set) for insn in kernel.instructions: dep_graph[insn.id] = set(writer_id @@ -2020,7 +2047,8 @@ def limit_boostability(kernel): # }}} -preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, +preprocess_cache = WriteOncePersistentDict( + "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2126,7 +2154,7 @@ def preprocess_kernel(kernel, device=None): # }}} if CACHING_ENABLED: - preprocess_cache[input_kernel] = kernel + preprocess_cache.store_if_not_present(input_kernel, kernel) return kernel diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 4281e50bd006a3cddf5a3cae0ffffe3d78abcfac..850f0a61fcdc2878d43895bc0e024032532aa680 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -29,7 +29,7 @@ import sys import islpy as isl from loopy.diagnostic import warn_with_kernel, LoopyError # noqa -from pytools.persistent_dict import PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION @@ -84,14 +84,18 @@ class Barrier(ScheduleItem): A plain-text comment explaining why the barrier was inserted. - .. attribute:: kind + .. attribute:: synchronization_kind + + ``"local"`` or ``"global"`` + + .. attribute:: mem_kind ``"local"`` or ``"global"`` .. 
attribute:: originating_insn_id """ - hash_fields = ["comment", "kind"] + hash_fields = ["comment", "synchronization_kind", "mem_kind"] __slots__ = hash_fields + ["originating_insn_id"] # }}} @@ -206,13 +210,13 @@ def find_loop_nest_with_map(kernel): """ result = {} - from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag all_nonpar_inames = set([ iname for iname in kernel.all_inames() if not isinstance(kernel.iname_to_tag.get(iname), - (ParallelTag, IlpBaseTag, VectorizeTag))]) + (ConcurrentTag, IlpBaseTag, VectorizeTag))]) iname_to_insns = kernel.iname_to_insns() @@ -274,10 +278,10 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): result = {} - from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag): + if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag): continue iname_dep = result.setdefault(iname, set()) @@ -308,7 +312,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): continue tag = kernel.iname_to_tag.get(dep_insn_iname) - if isinstance(tag, (ParallelTag, IlpBaseTag, VectorizeTag)): + if isinstance(tag, (ConcurrentTag, IlpBaseTag, VectorizeTag)): # Parallel tags don't really nest, so we'll disregard # them here. 
continue @@ -431,14 +435,19 @@ def format_insn(kernel, insn_id): from loopy.kernel.instruction import ( MultiAssignmentBase, NoOpInstruction, BarrierInstruction) if isinstance(insn, MultiAssignmentBase): - return "[%s] %s%s%s <- %s%s%s" % ( - format_insn_id(kernel, insn_id), + return "%s%s%s = %s%s%s {id=%s}" % ( Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL, - Fore.MAGENTA, str(insn.expression), Style.RESET_ALL) + Fore.MAGENTA, str(insn.expression), Style.RESET_ALL, + format_insn_id(kernel, insn_id)) elif isinstance(insn, BarrierInstruction): - return "[%s] %s... %sbarrier%s" % ( + mem_kind = '' + if insn.synchronization_kind == 'local': + mem_kind = '{mem_kind=%s}' % insn.mem_kind + + return "[%s] %s... %sbarrier%s%s" % ( format_insn_id(kernel, insn_id), - Fore.MAGENTA, insn.kind[0], Style.RESET_ALL) + Fore.MAGENTA, insn.synchronization_kind[0], mem_kind, + Style.RESET_ALL) elif isinstance(insn, NoOpInstruction): return "[%s] %s... nop%s" % ( format_insn_id(kernel, insn_id), @@ -456,11 +465,11 @@ def dump_schedule(kernel, schedule): from loopy.kernel.data import MultiAssignmentBase for sched_item in schedule: if isinstance(sched_item, EnterLoop): - lines.append(indent + "FOR %s" % sched_item.iname) + lines.append(indent + "for %s" % sched_item.iname) indent += " " elif isinstance(sched_item, LeaveLoop): indent = indent[:-4] - lines.append(indent + "END %s" % sched_item.iname) + lines.append(indent + "end %s" % sched_item.iname) elif isinstance(sched_item, CallKernel): lines.append(indent + "CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % ( @@ -479,7 +488,8 @@ def dump_schedule(kernel, schedule): insn_str = sched_item.insn_id lines.append(indent + insn_str) elif isinstance(sched_item, Barrier): - lines.append(indent + "---BARRIER:%s---" % sched_item.kind) + lines.append(indent + "... 
%sbarrier" % + sched_item.synchronization_kind[0]) else: assert False @@ -833,7 +843,8 @@ def generate_loop_schedules_internal( # {{{ check if scheduler state allows insn scheduling from loopy.kernel.instruction import BarrierInstruction - if isinstance(insn, BarrierInstruction) and insn.kind == "global": + if isinstance(insn, BarrierInstruction) and \ + insn.synchronization_kind == "global": if not sched_state.may_schedule_global_barriers: if debug_mode: print("can't schedule '%s' because global barriers are " @@ -1318,7 +1329,8 @@ def convert_barrier_instructions_to_barriers(kernel, schedule): insn = kernel.id_to_insn[sched_item.insn_id] if isinstance(insn, BarrierInstruction): result.append(Barrier( - kind=insn.kind, + synchronization_kind=insn.synchronization_kind, + mem_kind=insn.mem_kind, originating_insn_id=insn.id, comment="Barrier inserted due to %s" % insn.id)) continue @@ -1415,8 +1427,8 @@ class DependencyTracker(object): raise ValueError("unknown 'var_kind': %s" % var_kind) from collections import defaultdict - self.writer_map = defaultdict(lambda: set()) - self.reader_map = defaultdict(lambda: set()) + self.writer_map = defaultdict(set) + self.reader_map = defaultdict(set) self.temp_to_base_storage = kernel.get_temporary_to_base_storage_map() def map_to_base_storage(self, var_names): @@ -1577,7 +1589,8 @@ def _insn_ids_reaching_end(schedule, kind, reverse): # end # barrier() # end - if barrier_kind_more_or_equally_global(sched_item.kind, kind): + if barrier_kind_more_or_equally_global( + sched_item.synchronization_kind, kind): insn_ids_alive_at_scope[-1].clear() else: insn_ids_alive_at_scope[-1] |= set( @@ -1607,15 +1620,17 @@ def append_barrier_or_raise_error(schedule, dep, verify_only): tgt=dep.target.id, src=dep.source.id)) schedule.append(Barrier( comment=comment, - kind=dep.var_kind, + synchronization_kind=dep.var_kind, + mem_kind=dep.var_kind, originating_insn_id=None)) -def insert_barriers(kernel, schedule, kind, verify_only, level=0): +def 
insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0): """ - :arg kind: "local" or "global". The :attr:`Barrier.kind` to be inserted. - Generally, this function will be called once for each kind of barrier - at the top level, where more global barriers should be inserted first. + :arg synchronization_kind: "local" or "global". + The :attr:`Barrier.synchronization_kind` to be inserted. Generally, this + function will be called once for each kind of barrier at the top level, where + more global barriers should be inserted first. :arg verify_only: do not insert barriers, only complain if they are missing. :arg level: the current level of loop nesting, 0 for outermost. @@ -1624,14 +1639,15 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): # {{{ insert barriers at outermost scheduling level def insert_barriers_at_outer_level(schedule, reverse=False): - dep_tracker = DependencyTracker(kernel, var_kind=kind, reverse=reverse) + dep_tracker = DependencyTracker(kernel, var_kind=synchronization_kind, + reverse=reverse) if reverse: # Populate the dependency tracker with sources from the tail end of # the schedule block. for insn_id in ( insn_ids_reaching_end_without_intervening_barrier( - schedule, kind)): + schedule, synchronization_kind)): dep_tracker.add_source(insn_id) result = [] @@ -1645,11 +1661,11 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): loop_head = ( insn_ids_reachable_from_start_without_intervening_barrier( - subloop, kind)) + subloop, synchronization_kind)) loop_tail = ( insn_ids_reaching_end_without_intervening_barrier( - subloop, kind)) + subloop, synchronization_kind)) # Checks if a barrier is needed before the loop. 
This handles # dependencies with targets that can be reached without an @@ -1688,7 +1704,8 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): elif isinstance(sched_item, Barrier): result.append(sched_item) - if barrier_kind_more_or_equally_global(sched_item.kind, kind): + if barrier_kind_more_or_equally_global( + sched_item.synchronization_kind, synchronization_kind): dep_tracker.discard_all_sources() i += 1 @@ -1724,7 +1741,8 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): if isinstance(sched_item, EnterLoop): subloop, new_i = gather_schedule_block(schedule, i) new_subloop = insert_barriers( - kernel, subloop[1:-1], kind, verify_only, level + 1) + kernel, subloop[1:-1], synchronization_kind, verify_only, + level + 1) result.append(subloop[0]) result.extend(new_subloop) result.append(subloop[-1]) @@ -1756,7 +1774,8 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0): def generate_loop_schedules(kernel, debug_args={}): from pytools import MinRecursionLimit - with MinRecursionLimit(len(kernel.instructions) * 2): + with MinRecursionLimit(max(len(kernel.instructions) * 2, + len(kernel.all_inames()) * 4)): for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args): yield sched @@ -1786,7 +1805,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): for item in preschedule for insn_id in sched_item_to_insn_id(item)) - from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag + from loopy.kernel.data import IlpBaseTag, ConcurrentTag, VectorizeTag ilp_inames = set( iname for iname in kernel.all_inames() @@ -1797,7 +1816,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag)) parallel_inames = set( iname for iname in kernel.all_inames() - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag)) loop_nest_with_map = find_loop_nest_with_map(kernel) 
loop_nest_around_map = find_loop_nest_around_map(kernel) @@ -1889,11 +1908,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}): if not kernel.options.disable_global_barriers: logger.debug("%s: barrier insertion: global" % kernel.name) gen_sched = insert_barriers(kernel, gen_sched, - kind="global", verify_only=True) + synchronization_kind="global", verify_only=True) logger.debug("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, kind="local", - verify_only=False) + gen_sched = insert_barriers(kernel, gen_sched, + synchronization_kind="local", verify_only=False) logger.debug("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( @@ -1939,7 +1958,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): # }}} -schedule_cache = PersistentDict("loopy-schedule-cache-v4-"+DATA_MODEL_VERSION, +schedule_cache = WriteOncePersistentDict( + "loopy-schedule-cache-v4-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -1970,7 +1990,7 @@ def get_one_scheduled_kernel(kernel): kernel.name, time()-start_time)) if CACHING_ENABLED and not from_cache: - schedule_cache[sched_cache_key] = result + schedule_cache.store_if_not_present(sched_cache_key, result) return result diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 1a0789c2f61e21e4a0371e2a73195c9771245527..5c41f03997e5193333f5be213f2f87d38147b6df 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -106,7 +106,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): [end_item]) elif isinstance(sched_item, Barrier): - if sched_item.kind == "global": + if sched_item.synchronization_kind == "global": # Wrap the current chunk into a kernel call. 
schedule_required_splitting = True if current_chunk: diff --git a/loopy/statistics.py b/loopy/statistics.py index 9b15ec471fb681698b85c1dd2f92376fbc731f00..72d0c6c7d7a634cd96379d17b7a91f6a638e0ab9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -38,7 +38,6 @@ __doc__ = """ .. currentmodule:: loopy -.. autoclass:: GuardedPwQPolynomial .. autoclass:: ToCountMap .. autoclass:: Op .. autoclass:: MemAccess @@ -50,6 +49,11 @@ __doc__ = """ .. autofunction:: gather_access_footprints .. autofunction:: gather_access_footprint_bytes +.. currentmodule:: loopy.statistics + +.. autoclass:: GuardedPwQPolynomial + +.. currentmodule:: loopy """ @@ -996,6 +1000,9 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): try: + if space is not None: + set = set.align_params(space) + return add_assumptions_guard(kernel, set.card()) except AttributeError: pass @@ -1410,7 +1417,8 @@ def get_synchronization_map(knl): iname_list.pop() elif isinstance(sched_item, Barrier): - result = result + ToCountMap({"barrier_%s" % sched_item.kind: + result = result + ToCountMap({"barrier_%s" % + sched_item.synchronization_kind: get_count_poly(iname_list)}) elif isinstance(sched_item, CallKernel): diff --git a/loopy/symbolic.py b/loopy/symbolic.py index f1a494f30d469511817d204c0476ff79abe00e3b..2d31c63ef13774599de27ae871be64bc5acb7514 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -104,7 +104,9 @@ class IdentityMapperMixin(object): return expr def map_type_annotation(self, expr, *args): - return TypeAnnotation(expr.type, self.rec(expr.child)) + return type(expr)(expr.type, self.rec(expr.child)) + + map_type_cast = map_type_annotation map_linear_subscript = IdentityMapperBase.map_subscript @@ -147,6 +149,11 @@ class WalkMapper(WalkMapperBase): self.rec(expr.expr, *args) + def map_type_cast(self, expr, *args): + if not self.visit(expr): + return + self.rec(expr.child, *args) + map_tagged_variable = WalkMapperBase.map_variable def 
map_loopy_function_identifier(self, expr, *args): @@ -219,6 +226,10 @@ class StringifyMapper(StringifyMapperBase): def map_rule_argument(self, expr, enclosing_prec): return "" % expr.index + def map_type_cast(self, expr, enclosing_prec): + from pymbolic.mapper.stringifier import PREC_NONE + return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): @@ -273,6 +284,9 @@ class DependencyMapper(DependencyMapperBase): map_linear_subscript = DependencyMapperBase.map_subscript + def map_type_cast(self, expr): + return self.rec(expr.child) + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -398,6 +412,10 @@ class TypedCSE(p.CommonSubexpression): class TypeAnnotation(p.Expression): + """Undocumented for now. Currently only used internally around LHSs of + assignments that create temporaries. + """ + def __init__(self, type, child): super(TypeAnnotation, self).__init__() self.type = type @@ -406,9 +424,55 @@ class TypeAnnotation(p.Expression): def __getinitargs__(self): return (self.type, self.child) + def stringifier(self): + return StringifyMapper + mapper_method = intern("map_type_annotation") +class TypeCast(p.Expression): + """Only defined for numerical types with semantics matching + :meth:`numpy.ndarray.astype`. + + .. attribute:: child + + The expression to be cast. + """ + + def __init__(self, type, child): + super(TypeCast, self).__init__() + + from loopy.types import to_loopy_type, NumpyType + type = to_loopy_type(type) + + if (not isinstance(type, NumpyType) + or not issubclass(type.dtype.type, np.number)): + from loopy.diagnostic import LoopyError + raise LoopyError("TypeCast only supports numerical numpy types, " + "not '%s'" % type) + + # We're storing the type as a name for now to avoid + # numpy pickling bug madness. 
(see loopy.types) + self._type_name = type.dtype.name + self.child = child + + @property + def type(self): + from loopy.types import NumpyType + return NumpyType(np.dtype(self._type_name)) + + # init_arg_names is a misnomer--they're attribute names used for pickling. + init_arg_names = ("_type_name", "child") + + def __getinitargs__(self): + return (self._type_name, self.child) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_type_cast") + + class TaggedVariable(p.Variable): """This is an identifier with a tag, such as 'matrix$one', where 'one' identifies this specific use of the identifier. This mechanism @@ -1232,6 +1296,9 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): super(PwAffEvaluationMapper, self).__init__(context) def map_constant(self, expr): + if isinstance(expr, np.integer): + expr = int(expr) + return self.pw_zero + expr def map_min(self, expr): @@ -1559,6 +1626,9 @@ class BatchedAccessRangeMapper(WalkMapper): def map_reduction(self, expr, inames): return WalkMapper.map_reduction(self, expr, inames | set(expr.inames)) + def map_type_cast(self, expr, inames): + return self.rec(expr.child, inames) + class AccessRangeMapper(object): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5d5743bae322fc59c989cafd85122c8ca619c422..aac528087cf812a91553d416f166be898a1cd132 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -55,6 +55,7 @@ class TargetBase(object): comparison_fields = () def update_persistent_hash(self, key_hash, key_builder): + key_hash.update(type(self).__name__.encode()) for field_name in self.hash_fields: key_builder.rec(key_hash, getattr(self, field_name)) @@ -188,9 +189,10 @@ class ASTBuilderBase(object): def add_vector_access(self, access_expr, index): raise NotImplementedError() - def emit_barrier(self, kind, comment): + def emit_barrier(self, synchronization_kind, mem_kind, comment): """ - :arg kind: ``"local"`` or ``"global"`` + :arg 
synchronization_kind: ``"local"`` or ``"global"`` + :arg mem_kind: ``"local"`` or ``"global"`` """ raise NotImplementedError() @@ -210,6 +212,10 @@ class ASTBuilderBase(object): static_lbound, static_ubound, inner): raise NotImplementedError() + @property + def can_implement_conditionals(self): + return False + def emit_if(self, condition_str, ast): raise NotImplementedError() @@ -274,28 +280,6 @@ class DummyHostASTBuilder(ASTBuilderBase): def ast_block_scope_class(self): return _DummyASTBlock - def emit_assignment(self, codegen_state, insn): - return None - - def emit_multiple_assignment(self, codegen_state, insn): - return None - - def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): - return None - - def emit_if(self, condition_str, ast): - return None - - def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): - return None - - def emit_blank_line(self): - return None - - def emit_comment(self, s): - return None - # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 0e1f0ff86ca5eaa1932f766b3f8b79f5167ce6f4..423311cdb259c77e77070f5fc27a542dd2c89fc9 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,12 +27,14 @@ THE SOFTWARE. 
import six import numpy as np # noqa +from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError from cgen import Pointer, NestedDeclarator, Block from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper +from loopy.types import NumpyType import pymbolic.primitives as p from pytools import memoize_method @@ -315,9 +317,75 @@ class _ConstRestrictPointer(Pointer): return sub_tp, ("*const __restrict__ %s" % sub_decl) +class _ConstPointer(Pointer): + def get_decl_pait(self): + sub_tp, sub_decl = self.subdecl.get_decl_pair() + return sub_tp, ("*const %s" % sub_decl) + + +# {{{ symbol mangler + +def c_symbol_mangler(kernel, name): + # float NAN as defined in C99 standard + if name == "NAN": + return NumpyType(np.dtype(np.float32)), name + return None + +# }}} + + +# {{{ function mangler + +def c_function_mangler(target, name, arg_dtypes): + # convert abs(), min(), max() to fabs(), fmin(), fmax() to comply with + # C99 standard + if not isinstance(name, str): + return None + + if (name == "abs" + and len(arg_dtypes) == 1 + and arg_dtypes[0].numpy_dtype.kind == "f"): + return CallMangleInfo( + target_name="fabs", + result_dtypes=arg_dtypes, + arg_dtypes=arg_dtypes) + + if name in ["max", "min"] and len(arg_dtypes) == 2: + dtype = np.find_common_type( + [], [dtype.numpy_dtype for dtype in arg_dtypes]) + + if dtype.kind == "c": + raise RuntimeError("min/max do not support complex numbers") + + if dtype.kind == "f": + name = "f" + name + + result_dtype = NumpyType(dtype) + return CallMangleInfo( + target_name=name, + result_dtypes=(result_dtype,), + arg_dtypes=2*(result_dtype,)) + + return None + +# }}} + + class CASTBuilder(ASTBuilderBase): # {{{ library + def function_manglers(self): + return ( + super(CASTBuilder, self).function_manglers() + [ + c_function_mangler + ]) + + def 
symbol_manglers(self): + return ( + super(CASTBuilder, self).symbol_manglers() + [ + c_symbol_mangler + ]) + def preamble_generators(self): return ( super(CASTBuilder, self).preamble_generators() + [ @@ -344,7 +412,16 @@ class CASTBuilder(ASTBuilderBase): result = [] from loopy.kernel.data import temp_var_scope - + from loopy.schedule import CallKernel + # We only need to write declarations for global variables with + # the first device program. `is_first_dev_prog` determines + # whether this is the first device program in the schedule. + is_first_dev_prog = True + for i in range(schedule_index): + if isinstance(kernel.schedule[i], CallKernel): + is_first_dev_prog = False + break + if is_first_dev_prog: for tv in sorted( six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): @@ -421,6 +498,15 @@ class CASTBuilder(ASTBuilderBase): base_storage_to_align_bytes = {} from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line + # Getting the temporary variables that are needed for the current + # sub-kernel. 
+ from loopy.schedule.tools import ( + temporaries_read_in_subkernel, + temporaries_written_in_subkernel) + subkernel = kernel.schedule[schedule_index].kernel_name + sub_knl_temps = ( + temporaries_read_in_subkernel(kernel, subkernel) | + temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( six.itervalues(kernel.temporary_variables), @@ -430,7 +516,8 @@ class CASTBuilder(ASTBuilderBase): if not tv.base_storage: for idi in decl_info: # global temp vars are mapped to arguments or global declarations - if tv.scope != temp_var_scope.GLOBAL: + if tv.scope != temp_var_scope.GLOBAL and ( + tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( codegen_state, schedule_index, tv, idi), @@ -470,13 +557,17 @@ class CASTBuilder(ASTBuilderBase): temp_var_decl = self.wrap_temporary_decl( temp_var_decl, tv.scope) + if tv._base_storage_access_may_be_aliasing: + ptrtype = _ConstPointer + else: # The 'restrict' part of this is a complete lie--of course # all these temporaries are aliased. But we're promising to # not use them to shovel data from one representation to the # other. That counts, right? 
+ ptrtype = _ConstRestrictPointer - cast_decl = _ConstRestrictPointer(cast_decl) - temp_var_decl = _ConstRestrictPointer(temp_var_decl) + cast_decl = ptrtype(cast_decl) + temp_var_decl = ptrtype(temp_var_decl) cast_tp, cast_d = cast_decl.get_decl_pair() temp_var_decl = Initializer( @@ -797,6 +888,10 @@ class CASTBuilder(ASTBuilderBase): from cgen import Comment return Comment(s) + @property + def can_implement_conditionals(self): + return True + def emit_if(self, condition_str, ast): from cgen import If return If(condition_str, ast) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 8f924d3aee3b9f2982006fdb7b558cccac6785e3..caee73eb1c3320f03ceac66e55e8f5c0bfadbbc2 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -29,8 +29,10 @@ import numpy as np from pymbolic.mapper import RecursiveMapper, IdentityMapper from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT, - PREC_POWER, - PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND) + PREC_POWER, PREC_SHIFT, + PREC_UNARY, PREC_LOGICAL_OR, PREC_LOGICAL_AND, + PREC_BITWISE_AND, PREC_BITWISE_OR) + import islpy as isl import pymbolic.primitives as p from pymbolic import var @@ -338,6 +340,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): expr.operator, self.rec(expr.right, inner_type_context)) + def map_type_cast(self, expr, type_context): + registry = self.codegen_state.ast_builder.target.get_dtype_registry() + cast = var("(%s)" % registry.dtype_to_ctype(expr.type)) + return cast(self.rec(expr.child, type_context)) + def map_constant(self, expr, type_context): if isinstance(expr, (complex, np.complexfloating)): try: @@ -782,6 +789,16 @@ class CExpressionToCodeMapper(RecursiveMapper): def map_literal(self, expr, enclosing_prec): return expr.s + def map_left_shift(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + self.join_rec(" << ", (expr.shiftee, expr.shift), PREC_SHIFT), + enclosing_prec, 
PREC_SHIFT) + + def map_right_shift(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + self.join_rec(" >> ", (expr.shiftee, expr.shift), PREC_SHIFT), + enclosing_prec, PREC_SHIFT) + def map_logical_not(self, expr, enclosing_prec): return self.parenthesize_if_needed( "!" + self.rec(expr.child, PREC_UNARY), @@ -807,6 +824,21 @@ class CExpressionToCodeMapper(RecursiveMapper): result = "(%s)" % result return result + def map_bitwise_not(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + "~" + self.rec(expr.child, PREC_UNARY), + enclosing_prec, PREC_UNARY) + + def map_bitwise_and(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + self.join_rec(" & ", expr.children, PREC_BITWISE_AND), + enclosing_prec, PREC_BITWISE_AND) + + def map_bitwise_or(self, expr, enclosing_prec): + return self.parenthesize_if_needed( + self.join_rec(" | ", expr.children, PREC_BITWISE_OR), + enclosing_prec, PREC_BITWISE_OR) + def map_sum(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_SUM diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 2bdffb5aa69bdc0f72fe12a58faa6d0e78920e0f..027f27838bf68511905bd34cf75d0b361c749629 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -291,18 +291,19 @@ class CUDACASTBuilder(CASTBuilder): def add_vector_access(self, access_expr, index): return access_expr.a(self._VEC_AXES[index]) - def emit_barrier(self, kind, comment): + def emit_barrier(self, synchronization_kind, mem_kind, comment): """ :arg kind: ``"local"`` or ``"global"`` + :arg memkind: unused :return: a :class:`loopy.codegen.GeneratedInstruction`. 
""" - if kind == "local": + if synchronization_kind == "local": if comment: comment = " /* %s */" % comment from cgen import Statement return Statement("__syncthreads()%s" % comment) - elif kind == "global": + elif synchronization_kind == "global": raise LoopyError("CUDA does not have global barriers") else: raise LoopyError("unknown barrier kind") diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 61788df2dd9d32978a550990fb7c84501f76e856..2909f16f56315b136f4f2677348bfe0c3e5553b4 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -30,6 +30,13 @@ from loopy.diagnostic import LoopyError from pytools.py_codegen import ( Indentation, PythonFunctionGenerator) +import logging +logger = logging.getLogger(__name__) + +from pytools.persistent_dict import WriteOncePersistentDict +from loopy.tools import LoopyKeyBuilder +from loopy.version import DATA_MODEL_VERSION + # {{{ object array argument packing @@ -419,7 +426,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ allocate written arrays, if needed if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ - and arg.shape is not None: + and arg.shape is not None \ + and all(si is not None for si in arg.shape): if not isinstance(arg.dtype, NumpyType): raise LoopyError("do not know how to pass arg of type '%s'" @@ -653,6 +661,11 @@ class _Kernels(object): pass +typed_and_scheduled_cache = WriteOncePersistentDict( + "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) + + # {{{ kernel executor class KernelExecutorBase(object): @@ -716,6 +729,31 @@ class KernelExecutorBase(object): return kernel + @memoize_method + def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + from loopy import CACHING_ENABLED + + from loopy.preprocess import prepare_for_caching + # prepare_for_caching() gets run by preprocess, but the kernel at this + # stage is not guaranteed to be preprocessed. 
+ cacheable_kernel = prepare_for_caching(self.kernel) + cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + + if CACHING_ENABLED: + try: + return typed_and_scheduled_cache[cache_key] + except KeyError: + pass + + logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + + kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + + if CACHING_ENABLED: + typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) + + return kernel + def arg_to_dtype_set(self, kwargs): if not self.has_runtime_typed_args: return None diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 35dade90494906b61aad9eb66e7271f2c5d1e180..45a59847ba9f175df5ca1be46aa78566b2aab03b 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -288,15 +288,15 @@ class ISPCASTBuilder(CASTBuilder): def add_vector_access(self, access_expr, index): return access_expr[index] - def emit_barrier(self, kind, comment): + def emit_barrier(self, synchronization_kind, mem_kind, comment): from cgen import Comment, Statement assert comment - if kind == "local": + if synchronization_kind == "local": return Comment("local barrier: %s" % comment) - elif kind == "global": + elif synchronization_kind == "global": return Statement("sync; /* %s */" % comment) else: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index a5f7562c41c3ec8eca673904550e078d2a992241..2763caace891570a1b7f8b13f225001a03d3aa65 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -167,30 +167,6 @@ def opencl_function_mangler(kernel, name, arg_dtypes): if not isinstance(name, str): return None - if (name == "abs" - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): - return CallMangleInfo( - target_name="fabs", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise 
RuntimeError("min/max do not support complex numbers") - - if dtype.kind == "f": - name = "f" + name - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - if name == "dot": scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] return CallMangleInfo( @@ -450,18 +426,20 @@ class OpenCLCASTBuilder(CASTBuilder): # The 'int' avoids an 'L' suffix for long ints. return access_expr.attr("s%s" % hex(int(index))[2:]) - def emit_barrier(self, kind, comment): + def emit_barrier(self, synchronization_kind, mem_kind, comment): """ :arg kind: ``"local"`` or ``"global"`` :return: a :class:`loopy.codegen.GeneratedInstruction`. """ - if kind == "local": + if synchronization_kind == "local": if comment: comment = " /* %s */" % comment + mem_kind = mem_kind.upper() + from cgen import Statement - return Statement("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment) - elif kind == "global": + return Statement("barrier(CLK_%s_MEM_FENCE)%s" % (mem_kind, comment)) + elif synchronization_kind == "global": raise LoopyError("OpenCL does not have global barriers") else: raise LoopyError("unknown barrier kind") diff --git a/loopy/target/python.py b/loopy/target/python.py index 11951abcf17e94c0fdba51042e3060735215b423..ce04986d3d2a39dcf7126339055d32fa16ffcc25 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -283,6 +283,10 @@ class PythonASTBuilderBase(ASTBuilderBase): from genpy import Comment return Comment(s) + @property + def can_implement_conditionals(self): + return True + def emit_if(self, condition_str, ast): from genpy import If return If(condition_str, ast) diff --git a/loopy/tools.py b/loopy/tools.py index 56b673b597fc3bf43a6b03f87607ea8d3db0866a..d6952d54782f113685299641c828907fb7f32a46 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import collections import numpy as np from pytools.persistent_dict import KeyBuilder as KeyBuilderBase from loopy.symbolic import WalkMapper as LoopyWalkMapper @@ -50,7 +51,12 @@ class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): See also :meth:`LoopyKeyBuilder.update_for_pymbolic_expression`. """ - # + def map_reduction(self, expr, *args): + if not self.visit(expr): + return + + self.key_hash.update(type(expr.operation).__name__.encode("utf-8")) + self.rec(expr.expr, *args) class LoopyKeyBuilder(KeyBuilderBase): @@ -113,6 +119,53 @@ class PymbolicExpressionHashWrapper(object): # }}} +# {{{ eq key builder + +class LoopyEqKeyBuilder(object): + """Unlike :class:`loopy.tools.LoopyKeyBuilder`, this builds keys for use in + equality comparison, such that `key(a) == key(b)` if and only if `a == b`. + The types of objects being compared should satisfy structural equality. + + The output is suitable for use with :class:`loopy.tools.LoopyKeyBuilder` + provided all fields are persistent hashable. + + As an optimization, top-level pymbolic expression fields are stringified for + faster comparisons / hash calculations. + + Usage:: + + kb = LoopyEqKeyBuilder() + kb.update_for_class(insn.__class__) + kb.update_for_field("field", insn.field) + ... + key = kb.key() + + """ + + def __init__(self): + self.field_dict = {} + + def update_for_class(self, class_): + self.class_ = class_ + + def update_for_field(self, field_name, value): + self.field_dict[field_name] = value + + def update_for_pymbolic_field(self, field_name, value): + self.field_dict[field_name] = str(value).encode("utf-8") + + def key(self): + return (self.class_.__name__.encode("utf-8"), self.field_dict) + + def hash_key(self): + """Similar to key(), but excludes field names for faster hashing. 
+ """ + return (self.class_.__name__.encode("utf-8"),) + tuple( + self.field_dict[k] for k in sorted(self.field_dict.keys())) + +# }}} + + # {{{ remove common indentation def remove_common_indentation(code, require_leading_newline=True, @@ -340,23 +393,19 @@ def compute_sccs(graph): # }}} -# {{{ lazily unpickling dictionary - +# {{{ pickled container value -class _PickledObjectWrapper(object): - """ - A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDictionary`). +class _PickledObject(object): + """A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDict` and + :class:`LazilyUnpicklingList`). """ - @classmethod - def from_object(cls, obj): - if isinstance(obj, cls): - return obj - from pickle import dumps - return cls(dumps(obj)) - - def __init__(self, objstring): - self.objstring = objstring + def __init__(self, obj): + if isinstance(obj, _PickledObject): + self.objstring = obj.objstring + else: + from pickle import dumps + self.objstring = dumps(obj) def unpickle(self): from pickle import loads @@ -366,12 +415,35 @@ class _PickledObjectWrapper(object): return {"objstring": self.objstring} -import collections +class _PickledObjectWithEqAndPersistentHashKeys(_PickledObject): + """Like :class:`_PickledObject`, with two additional attributes: + * `eq_key` + * `persistent_hash_key` -class LazilyUnpicklingDictionary(collections.MutableMapping): + This allows for comparison and for persistent hashing without unpickling. """ - A dictionary-like object which lazily unpickles its values. 
+ + def __init__(self, obj, eq_key, persistent_hash_key): + _PickledObject.__init__(self, obj) + self.eq_key = eq_key + self.persistent_hash_key = persistent_hash_key + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, self.persistent_hash_key) + + def __getstate__(self): + return {"objstring": self.objstring, + "eq_key": self.eq_key, + "persistent_hash_key": self.persistent_hash_key} + +# }}} + + +# {{{ lazily unpickling dictionary + +class LazilyUnpicklingDict(collections.MutableMapping): + """A dictionary-like object which lazily unpickles its values. """ def __init__(self, *args, **kwargs): @@ -379,7 +451,7 @@ class LazilyUnpicklingDictionary(collections.MutableMapping): def __getitem__(self, key): value = self._map[key] - if isinstance(value, _PickledObjectWrapper): + if isinstance(value, _PickledObject): value = self._map[key] = value.unpickle() return value @@ -397,12 +469,105 @@ class LazilyUnpicklingDictionary(collections.MutableMapping): def __getstate__(self): return {"_map": dict( - (key, _PickledObjectWrapper.from_object(val)) + (key, _PickledObject(val)) for key, val in six.iteritems(self._map))} # }}} +# {{{ lazily unpickling list + +class LazilyUnpicklingList(collections.MutableSequence): + """A list which lazily unpickles its values.""" + + def __init__(self, *args, **kwargs): + self._list = list(*args, **kwargs) + + def __getitem__(self, key): + item = self._list[key] + if isinstance(item, _PickledObject): + item = self._list[key] = item.unpickle() + return item + + def __setitem__(self, key, value): + self._list[key] = value + + def __delitem__(self, key): + del self._list[key] + + def __len__(self): + return len(self._list) + + def insert(self, key, value): + self._list.insert(key, value) + + def __getstate__(self): + return {"_list": [_PickledObject(val) for val in self._list]} + + +class LazilyUnpicklingListWithEqAndPersistentHashing(LazilyUnpicklingList): + """A list which lazily unpickles its values, and 
supports equality comparison + and persistent hashing without unpickling. + + Persistent hashing only works in conjunction with :class:`LoopyKeyBuilder`. + + Equality comparison and persistent hashing are implemented by supplying + functions `eq_key_getter` and `persistent_hash_key_getter` to the + constructor. These functions should return keys that can be used in place of + the original object for the respective purposes of equality comparison and + persistent hashing. + """ + + def __init__(self, *args, **kwargs): + self.eq_key_getter = kwargs.pop("eq_key_getter") + self.persistent_hash_key_getter = kwargs.pop("persistent_hash_key_getter") + LazilyUnpicklingList.__init__(self, *args, **kwargs) + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.update_for_list(key_hash, self._list) + + def _get_eq_key(self, obj): + if isinstance(obj, _PickledObjectWithEqAndPersistentHashKeys): + return obj.eq_key + return self.eq_key_getter(obj) + + def _get_persistent_hash_key(self, obj): + if isinstance(obj, _PickledObjectWithEqAndPersistentHashKeys): + return obj.persistent_hash_key + return self.persistent_hash_key_getter(obj) + + def __eq__(self, other): + if not isinstance(other, (list, LazilyUnpicklingList)): + return NotImplemented + + if isinstance(other, LazilyUnpicklingList): + other = other._list + + if len(self) != len(other): + return False + + for a, b in zip(self._list, other): + if self._get_eq_key(a) != self._get_eq_key(b): + return False + + return True + + def __ne__(self, other): + return not self.__eq__(other) + + def __getstate__(self): + return {"_list": [ + _PickledObjectWithEqAndPersistentHashKeys( + val, + self._get_eq_key(val), + self._get_persistent_hash_key(val)) + for val in self._list], + "eq_key_getter": self.eq_key_getter, + "persistent_hash_key_getter": self.persistent_hash_key_getter} + +# }}} + + def is_interned(s): return s is None or intern(s) is s @@ -411,4 +576,19 @@ def intern_frozenset_of_ids(fs): return 
frozenset(intern(s) for s in fs) +def natorder(key): + # Return natural ordering for strings, as opposed to dictionary order. + # E.g. will result in + # 'abc1' < 'abc9' < 'abc10' + # rather than + # 'abc1' < 'abc10' < 'abc9' + # Based on + # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7 + import re + return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)] + + +def natsorted(seq, key=lambda x: x): + return sorted(seq, key=lambda y: natorder(key(y))) + # vim: foldmethod=marker diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbbd56e906c5e622debcd82bd5368aa3b1fb34c --- /dev/null +++ b/loopy/transform/add_barrier.py @@ -0,0 +1,87 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2017 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +from loopy.kernel.instruction import BarrierInstruction +from loopy.match import parse_match +from loopy.transform.instruction import add_dependency + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: add_barrier +""" + + +# {{{ add_barrier + +def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, + tags=None, synchronization_kind="global", mem_kind=None): + """Takes in a kernel that needs to be added a barrier and returns a kernel + which has a barrier inserted into it. It takes input of 2 instructions and + then adds a barrier in between those 2 instructions. The expressions can + be any inputs that are understood by :func:`loopy.match.parse_match`. + + :arg insn_before: String expression that specifies the instruction(s) + before the barrier which is to be added + :arg insn_after: String expression that specifies the instruction(s) after + the barrier which is to be added + :arg id: String on which the id of the barrier would be based on. + :arg tags: The tag of the group to which the barrier must be added + :arg synchronization_kind: Kind of barrier to be added. May be "global" or + "local" + :arg kind: Type of memory to be synchronied. May be "global" or "local". Ignored + for "global" bariers. 
If not supplied, defaults to :arg:`synchronization_kind` + """ + + if mem_kind is None: + mem_kind = synchronization_kind + + if id_based_on is None: + id = knl.make_unique_instruction_id( + based_on=synchronization_kind[0]+"_barrier") + else: + id = knl.make_unique_instruction_id(based_on=id_based_on) + + match = parse_match(insn_before) + insn_before_list = [insn.id for insn in knl.instructions if match(knl, + insn)] + + barrier_to_add = BarrierInstruction(depends_on=frozenset(insn_before_list), + depends_on_is_final=True, + id=id, + tags=tags, + synchronization_kind=synchronization_kind, + mem_kind=mem_kind) + + new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add]) + new_knl = add_dependency(kernel=new_knl, + insn_match=insn_after, + depends_on="id:"+id) + + return new_knl + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index f4e6526a7b083f0b38dda1209b607aa38a62b68e..618e36f20da8b3f9089ecf5ce88d6b3177528570 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -239,14 +239,14 @@ class ArrayToBufferMap(object): non1_storage_axis_flags = [] non1_storage_shape = [] - for saxis, bi, l in zip( + for saxis, bi, saxis_len in zip( storage_axis_names, storage_base_indices, storage_shape): - has_length_non1 = l != 1 + has_length_non1 = saxis_len != 1 non1_storage_axis_flags.append(has_length_non1) if has_length_non1: - non1_storage_shape.append(l) + non1_storage_shape.append(saxis_len) # }}} diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 92cff7a507d672a3acc51a8abed572a04cb7e86a..1b059b6a73d3064596b8679fbc87f94287b2d9fe 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -29,7 +29,7 @@ from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, SubstitutionRuleMappingContext, SubstitutionMapper) from pymbolic.mapper.substitutor import make_subst_func -from pytools.persistent_dict import 
PersistentDict +from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError @@ -124,7 +124,8 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper): # }}} -buffer_array_cache = PersistentDict("loopy-buffer-array-cache-"+DATA_MODEL_VERSION, +buffer_array_cache = WriteOncePersistentDict( + "loopy-buffer-array-cache-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -531,7 +532,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching - buffer_array_cache[cache_key] = prepare_for_caching(kernel) + buffer_array_cache.store_if_not_present( + cache_key, prepare_for_caching(kernel)) return kernel diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py index 77840753258fa545aa01ef3e8c58cbc36e66ed72..0ac71d603ebe8b5150fb854dd3978676dd9d98c3 100644 --- a/loopy/transform/ilp.py +++ b/loopy/transform/ilp.py @@ -38,6 +38,7 @@ from loopy.symbolic import IdentityMapper class ExtraInameIndexInserter(IdentityMapper): def __init__(self, var_to_new_inames): self.var_to_new_inames = var_to_new_inames + self.seen_ilp_inames = set() def map_subscript(self, expr): try: @@ -50,6 +51,7 @@ class ExtraInameIndexInserter(IdentityMapper): index = (index,) index = tuple(self.rec(i) for i in index) + self.seen_ilp_inames.update(v.name for v in new_idx) return expr.aggregate.index(index + new_idx) def map_variable(self, expr): @@ -58,6 +60,7 @@ class ExtraInameIndexInserter(IdentityMapper): except KeyError: return expr else: + self.seen_ilp_inames.update(v.name for v in new_idx) return expr.index(new_idx) @@ -160,13 +163,30 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None): # }}} from pymbolic import var - eiii = ExtraInameIndexInserter( - dict((var_name, tuple(var(iname) for iname in inames)) - for var_name, inames in 
six.iteritems(var_to_new_ilp_inames))) - - new_insns = [ - insn.with_transformed_expressions(eiii) - for insn in kernel.instructions] + var_to_extra_iname = dict( + (var_name, tuple(var(iname) for iname in inames)) + for var_name, inames in six.iteritems(var_to_new_ilp_inames)) + + new_insns = [] + + for insn in kernel.instructions: + eiii = ExtraInameIndexInserter(var_to_extra_iname) + new_insn = insn.with_transformed_expressions(eiii) + if not eiii.seen_ilp_inames <= insn.within_inames: + + from loopy.diagnostic import warn_with_kernel + warn_with_kernel( + kernel, + "implicit_ilp_iname", + "Instruction '%s': touched variable that (for ILP) " + "required iname(s) '%s', but that the instruction was not " + "previously within the iname(s). Previously, this would " + "implicitly promote the instruction, but that behavior is " + "deprecated and will stop working in 2018.1." + % (insn.id, ", ".join( + eiii.seen_ilp_inames - insn.within_inames))) + + new_insns.append(new_insn) return kernel.copy( temporary_variables=new_temp_vars, diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index ea90abfe27c8de69daf39021b3d0ea5463a2e4c8..22fd7b3bb2c643bc3c1309f4e3fdb89438ae7d2b 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -641,7 +641,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): iname_to_tag = [(iname, parse_tag(tag)) for iname, tag in iname_to_tag] - from loopy.kernel.data import (ParallelTag, AutoLocalIndexTagBase, + from loopy.kernel.data import (ConcurrentTag, AutoLocalIndexTagBase, ForceSequentialTag) # {{{ globbing @@ -686,13 +686,13 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): if iname not in kernel.all_inames(): raise ValueError("cannot tag '%s'--not known" % iname) - if isinstance(new_tag, ParallelTag) \ + if isinstance(new_tag, ConcurrentTag) \ and isinstance(old_tag, ForceSequentialTag): raise ValueError("cannot tag '%s' as parallel--" "iname requires 
sequential execution" % iname) if isinstance(new_tag, ForceSequentialTag) \ - and isinstance(old_tag, ParallelTag): + and isinstance(old_tag, ConcurrentTag): raise ValueError("'%s' is already tagged as parallel, " "but is now prohibited from being parallel " "(likely because of participation in a precompute or " @@ -972,9 +972,9 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # Get the duplication options as a tuple of iname and a set for iname, insns in _get_iname_duplication_options(insn_deps): # Check whether this iname has a parallel tag and discard it if so - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ConcurrentTag if (iname in knl.iname_to_tag - and isinstance(knl.iname_to_tag[iname], ParallelTag)): + and isinstance(knl.iname_to_tag[iname], ConcurrentTag)): continue # If we find a duplication option and fo not use boostable_into diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 2be78f8e5c25a3b48c195f52715f9d6453100e3b..37c5d85a1ade5c8f7fadb2c6a785cf7cea3dde40 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -301,4 +301,39 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): # }}} +# {{{ uniquify_instruction_ids + +def uniquify_instruction_ids(kernel): + """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique + strings. + + This function does *not* deduplicate existing instruction ids. 
+ """ + + from loopy.kernel.creation import UniqueName + + insn_ids = set( + insn.id for insn in kernel.instructions + if insn.id is not None and not isinstance(insn.id, UniqueName)) + + from pytools import UniqueNameGenerator + insn_id_gen = UniqueNameGenerator(insn_ids) + + new_instructions = [] + + for insn in kernel.instructions: + if insn.id is None: + new_instructions.append( + insn.copy(id=insn_id_gen("insn"))) + elif isinstance(insn.id, UniqueName): + new_instructions.append( + insn.copy(id=insn_id_gen(insn.id.name))) + else: + new_instructions.append(insn) + + return kernel.copy(instructions=new_instructions) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 6077332c4fc4322ac7ffb02ade4a0e24c7066245..4755ca1774a15480a2c6b255380dd724e47f9042 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -811,7 +811,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, barrier_insn = BarrierInstruction( id=barrier_insn_id, depends_on=frozenset([compute_insn_id]), - kind="global") + synchronization_kind="global", + mem_kind="global") compute_dep_id = barrier_insn_id added_compute_insns.append(barrier_insn) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 3d4f5c2d4765aa7cbf1e56c76d127bf8f4d61a06..b53488b486c6750742b269f47cfd4f08b8f8fab9 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -351,7 +351,8 @@ class TemporarySaver(object): self.subkernel_to_slice_indices[subkernel]) def is_global_barrier(item): - return isinstance(item, Barrier) and item.kind == "global" + return isinstance(item, Barrier) and \ + item.synchronization_kind == "global" try: pre_barrier = next(item for item in @@ -402,13 +403,13 @@ class TemporarySaver(object): continue from loopy.kernel.data import ( - GroupIndexTag, LocalIndexTag, ParallelTag) + GroupIndexTag, LocalIndexTag, ConcurrentTag) if isinstance(tag, GroupIndexTag): 
my_group_tags.append(tag) elif isinstance(tag, LocalIndexTag): my_local_tags.append(tag) - elif isinstance(tag, ParallelTag): + elif isinstance(tag, ConcurrentTag): raise LoopyError( "iname '%s' is tagged with '%s' - only " "group and local tags are supported for " diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 79ceff9fdf1e2c4b3b544e8ae85f8194b36ec444..a681afe06520483c83530c241e39229412e88f03 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -1,6 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -24,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six from loopy.symbolic import ( get_dependencies, SubstitutionMapper, @@ -141,6 +140,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): dfmapper = CallbackMapper(gather_exprs, WalkMapper()) for insn in kernel.instructions: + dfmapper(insn.assignees) dfmapper(insn.expression) for sr in six.itervalues(kernel.substitutions): @@ -178,8 +178,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): new_insns = [] for insn in kernel.instructions: - new_expr = cbmapper(insn.expression) - new_insns.append(insn.copy(expression=new_expr)) + new_insns.append(insn.with_transformed_expressions(cbmapper)) from loopy.kernel.data import SubstitutionRule new_substs = { diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 409cbbc5ebd5feb13b04eeba1671f639663bfcf1..6ffc1dff5220ab48c6c87ec29fec6e44d57ba133 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -237,6 +237,12 @@ class TypeInferenceMapper(CombineMapper): else: raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr) + def map_type_cast(self, expr): + subtype, = self.rec(expr.child) + if not issubclass(subtype.dtype.type, np.number): + raise 
LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type)) + return [expr.type] + def map_subscript(self, expr): return self.rec(expr.aggregate) diff --git a/loopy/version.py b/loopy/version.py index 02244f55d0dbf207a4641c3ebf6cc33b536f0421..e142162729d5a374082fa853dcc763665f7dfe33 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -21,7 +21,7 @@ THE SOFTWARE. """ -VERSION = (2016, 2) +VERSION = (2017, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v64-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v69-islpy%s" % _islpy_version diff --git a/requirements.txt b/requirements.txt index 3ff69a123d10cc7bc6799ebfb8913bfd0eed839e..1a23022821116aea068b76eab72f9a5596694eea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ git+https://github.com/inducer/pytools.git git+https://github.com/inducer/islpy.git git+https://github.com/inducer/cgen.git -git+https://github.com/pyopencl/pyopencl.git +git+https://github.com/inducer/pyopencl.git git+https://github.com/inducer/pymbolic.git git+https://github.com/inducer/genpy.git git+https://github.com/inducer/codepy.git diff --git a/setup.py b/setup.py index b8bc17d888aae8409000c936b487afb94a5250d0..b8f36d12559f05a47ef57dd06efd4761e3b3ad9a 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setup(name="loo.py", ], install_requires=[ - "pytools>=2017.3", + "pytools>=2017.6", "pymbolic>=2016.2", "genpy>=2016.1.2", "cgen>=2016.1", diff --git a/test/test_fortran.py b/test/test_fortran.py index 6e05aa6adba66ce0a1896527249d321de104c512..842a0127e3118ec8e7a0ea89ed17decc091e8566 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -278,14 +278,14 @@ def test_matmul(ctx_factory, buffer_inames): logging.basicConfig(level=logging.INFO) fortran_src = """ - subroutine dgemm(m,n,l,a,b,c) + subroutine dgemm(m,n,ell,a,b,c) implicit none - real*8 
a(m,l),b(l,n),c(m,n) - integer m,n,k,i,j,l + real*8 a(m,ell),b(ell,n),c(m,n) + integer m,n,k,i,j,ell do j = 1,n do i = 1,m - do k = 1,l + do k = 1,ell c(i,j) = c(i,j) + b(k,j)*a(i,k) end do end do @@ -306,7 +306,7 @@ def test_matmul(ctx_factory, buffer_inames): knl = lp.split_iname(knl, "k", 32) knl = lp.assume(knl, "n mod 32 = 0") knl = lp.assume(knl, "m mod 32 = 0") - knl = lp.assume(knl, "l mod 16 = 0") + knl = lp.assume(knl, "ell mod 16 = 0") knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") @@ -317,7 +317,7 @@ def test_matmul(ctx_factory, buffer_inames): init_expression="0", store_expression="base+buffer") ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) @pytest.mark.xfail @@ -457,14 +457,14 @@ def test_parse_and_fuse_two_kernels(): def test_precompute_some_exist(ctx_factory): fortran_src = """ - subroutine dgemm(m,n,l,a,b,c) + subroutine dgemm(m,n,ell,a,b,c) implicit none - real*8 a(m,l),b(l,n),c(m,n) - integer m,n,k,i,j,l + real*8 a(m,ell),b(ell,n),c(m,n) + integer m,n,k,i,j,ell do j = 1,n do i = 1,m - do k = 1,l + do k = 1,ell c(i,j) = c(i,j) + b(k,j)*a(i,k) end do end do @@ -483,7 +483,7 @@ def test_precompute_some_exist(ctx_factory): knl = lp.split_iname(knl, "k", 8) knl = lp.assume(knl, "n mod 8 = 0") knl = lp.assume(knl, "m mod 8 = 0") - knl = lp.assume(knl, "l mod 8 = 0") + knl = lp.assume(knl, "ell mod 8 = 0") knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") @@ -495,7 +495,7 @@ def test_precompute_some_exist(ctx_factory): ref_knl = knl ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128)) + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) if __name__ == "__main__": diff 
--git a/test/test_linalg.py b/test/test_linalg.py index 772d536d1e00fedc0b7abcd2f8c05350fe3b633e..3d422f1d8b5a847d4445468978ee529db95c481f 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -230,14 +230,14 @@ def test_funny_shape_matrix_mul(ctx_factory): n = get_suitable_size(ctx) m = n+12 - l = m+12 + ell = m+12 knl = lp.make_kernel( - "{[i,k,j]: 0<=i gid = i/256 + start = gid*256 + for j + a[start + j] = a[start + j] + j + end + end + """, + seq_dependencies=True, + name="uniform_l", + target=PyOpenCLTarget(), + assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) + cl_kernel_info = CompiledKernel(ctx, knl).cl_kernel_info(frozenset()) # noqa + # }}} @@ -1064,6 +1115,28 @@ def test_literal_local_barrier(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) +def test_local_barrier_mem_kind(): + def __test_type(mtype, expected): + insn = '... lbarrier' + if mtype: + insn += '{mem_kind=%s}' % mtype + knl = lp.make_kernel( + "{ [i]: 0<=i {[i]: 0 <= i < n}", + """ + <>tmp[i] = i + tmp[0] = 0 + """, + fixed_parameters=dict(n=1)) + + knl(queue) + + +def test_parameter_inference(): + knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") + assert knl.all_params() == set(["n"]) + + def test_execution_backend_can_cache_dtypes(ctx_factory): # When the kernel is invoked, the execution backend uses it as a cache key # for the type inference and scheduling cache. 
This tests to make sure that @@ -2431,6 +2593,167 @@ test_execution_backend_can_cache_dtypes(ctx_factory): knl(queue) +def test_preamble_with_separate_temporaries(ctx_factory): + from loopy.kernel.data import temp_var_scope as scopes + # create a function mangler + + func_name = 'indirect' + func_arg_dtypes = (np.int32, np.int32, np.int32) + func_result_dtypes = (np.int32,) + + def __indirectmangler(kernel, name, arg_dtypes): + """ + A function that will return a :class:`loopy.kernel.data.CallMangleInfo` + to interface with the calling :class:`loopy.LoopKernel` + """ + if name != func_name: + return None + + from loopy.types import to_loopy_type + from loopy.kernel.data import CallMangleInfo + + def __compare(d1, d2): + # compare dtypes ignoring atomic + return to_loopy_type(d1, for_atomic=True) == \ + to_loopy_type(d2, for_atomic=True) + + # check types + if len(func_arg_dtypes) != len(arg_dtypes): + raise Exception('Unexpected number of arguments provided to mangler ' + '{}, expected {}, got {}'.format( + func_name, len(func_arg_dtypes), len(arg_dtypes))) + + for i, (d1, d2) in enumerate(zip(func_arg_dtypes, arg_dtypes)): + if not __compare(d1, d2): + raise Exception('Argument at index {} for mangler {} does not ' + 'match expected dtype. Expected {}, got {}'.
+ format(i, func_name, str(d1), str(d2))) + + # get target for creation + target = arg_dtypes[0].target + return CallMangleInfo( + target_name=func_name, + result_dtypes=tuple(to_loopy_type(x, target=target) for x in + func_result_dtypes), + arg_dtypes=arg_dtypes) + + # create the preamble generator + def create_preamble(arr): + def __indirectpreamble(preamble_info): + # find a function matching our name + func_match = next( + (x for x in preamble_info.seen_functions + if x.name == func_name), None) + desc = 'custom_funcs_indirect' + if func_match is not None: + from loopy.types import to_loopy_type + # check types + if tuple(to_loopy_type(x) for x in func_arg_dtypes) == \ + func_match.arg_dtypes: + # if match, create our temporary + var = lp.TemporaryVariable( + 'lookup', initializer=arr, dtype=arr.dtype, shape=arr.shape, + scope=scopes.GLOBAL, read_only=True) + # and code + code = """ + int {name}(int start, int end, int match) + {{ + int result = start; + for (int i = start + 1; i < end; ++i) + {{ + if (lookup[i] == match) + result = i; + }} + return result; + }} + """.format(name=func_name) + + # generate temporary variable code + from cgen import Initializer + from loopy.target.c import generate_array_literal + codegen_state = preamble_info.codegen_state.copy( + is_generating_device_code=True) + kernel = preamble_info.kernel + ast_builder = codegen_state.ast_builder + target = kernel.target + decl_info, = var.decl_info(target, index_dtype=kernel.index_dtype) + decl = ast_builder.wrap_global_constant( + ast_builder.get_temporary_decl( + codegen_state, None, var, + decl_info)) + if var.initializer is not None: + decl = Initializer(decl, generate_array_literal( + codegen_state, var, var.initializer)) + # return generated code + yield (desc, '\n'.join([str(decl), code])) + return __indirectpreamble + + # and finally create a test + n = 10 + # for each entry come up with a random number of data points + num_data = np.asarray(np.random.randint(2, 10, size=n), 
dtype=np.int32) + # turn into offsets + offsets = np.asarray(np.hstack(([0], np.cumsum(num_data))), dtype=np.int32) + # create lookup data + lookup = np.empty(0) + for i in num_data: + lookup = np.hstack((lookup, np.arange(i))) + lookup = np.asarray(lookup, dtype=np.int32) + # and create data array + data = np.random.rand(np.product(num_data)) + + # make kernel + kernel = lp.make_kernel('{[i]: 0 <= i < n}', + """ + for i + <>ind = indirect(offsets[i], offsets[i + 1], 1) + out[i] = data[ind] + end + """, + [lp.GlobalArg('out', shape=('n',)), + lp.TemporaryVariable( + 'offsets', shape=(offsets.size,), initializer=offsets, scope=scopes.GLOBAL, + read_only=True), + lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)], + ) + # fixt params, and add manglers / preamble + kernel = lp.fix_parameters(kernel, **{'n': n}) + kernel = lp.register_preamble_generators(kernel, [create_preamble(lookup)]) + kernel = lp.register_function_manglers(kernel, [__indirectmangler]) + + print(lp.generate_code(kernel)[0]) + # and call (functionality unimportant, more that it compiles) + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + # check that it actually performs the lookup correctly + assert np.allclose(kernel( + queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1]) + + +def test_add_prefetch_works_in_lhs_index(): + knl = lp.make_kernel( + "{ [n,k,l,k1,l1,k2,l2]: " + "start<=n a1_tmp[k,l] = a1[a1_map[n, k],l] + a1_tmp[k1,l1] = a1_tmp[k1,l1] + 1 + a1_out[a1_map[n,k2], l2] = a1_tmp[k2,l2] + end + """, + [ + lp.GlobalArg("a1,a1_out", None, "ndofs,2"), + lp.GlobalArg("a1_map", None, "nelements,3"), + "..." 
+ ]) + + knl = lp.add_prefetch(knl, "a1_map", "k") + + from loopy.symbolic import get_dependencies + for insn in knl.instructions: + assert "a1_map" not in get_dependencies(insn.assignees) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_misc.py b/test/test_misc.py index a22e424630255df4225586eeb9f0d62a03d5318f..0273948b38b28b85e42a600bffb65fbf86dcc554 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -92,26 +92,36 @@ def test_SetTrie(): s.add_or_update(set([1, 4])) -class PicklableItem(object): +class PickleDetector(object): + """Contains a class attribute which flags if any instance was unpickled. + """ - flags = {"unpickled": False} + @classmethod + def reset(cls): + cls.instance_unpickled = False def __getstate__(self): - return True + return {"state": self.state} def __setstate__(self, state): - PicklableItem.flags["unpickled"] = True + self.__class__.instance_unpickled = True + self.state = state["state"] -def test_LazilyUnpicklingDictionary(): - def is_unpickled(): - return PicklableItem.flags["unpickled"] +class PickleDetectorForLazilyUnpicklingDict(PickleDetector): + instance_unpickled = False - from loopy.tools import LazilyUnpicklingDictionary + def __init__(self): + self.state = None - mapping = LazilyUnpicklingDictionary({0: PicklableItem()}) - assert not is_unpickled() +def test_LazilyUnpicklingDict(): + from loopy.tools import LazilyUnpicklingDict + + cls = PickleDetectorForLazilyUnpicklingDict + mapping = LazilyUnpicklingDict({0: cls()}) + + assert not cls.instance_unpickled from pickle import loads, dumps @@ -120,30 +130,160 @@ def test_LazilyUnpicklingDictionary(): # {{{ test lazy loading mapping = loads(pickled_mapping) - assert not is_unpickled() + assert not cls.instance_unpickled list(mapping.keys()) - assert not is_unpickled() - assert isinstance(mapping[0], PicklableItem) - assert is_unpickled() + assert not cls.instance_unpickled + assert isinstance(mapping[0], cls) + assert 
cls.instance_unpickled + + # }}} + + # {{{ conversion + + cls.reset() + mapping = loads(pickled_mapping) + dict(mapping) + assert cls.instance_unpickled # }}} # {{{ test multi round trip mapping = loads(dumps(loads(pickled_mapping))) - assert isinstance(mapping[0], PicklableItem) + assert isinstance(mapping[0], cls) # }}} # {{{ test empty map - mapping = LazilyUnpicklingDictionary({}) + mapping = LazilyUnpicklingDict({}) mapping = loads(dumps(mapping)) assert len(mapping) == 0 # }}} +class PickleDetectorForLazilyUnpicklingList(PickleDetector): + instance_unpickled = False + + def __init__(self): + self.state = None + + +def test_LazilyUnpicklingList(): + from loopy.tools import LazilyUnpicklingList + + cls = PickleDetectorForLazilyUnpicklingList + lst = LazilyUnpicklingList([cls()]) + assert not cls.instance_unpickled + + from pickle import loads, dumps + pickled_lst = dumps(lst) + + # {{{ test lazy loading + + lst = loads(pickled_lst) + assert not cls.instance_unpickled + assert isinstance(lst[0], cls) + assert cls.instance_unpickled + + # }}} + + # {{{ conversion + + cls.reset() + lst = loads(pickled_lst) + list(lst) + assert cls.instance_unpickled + + # }}} + + # {{{ test multi round trip + + lst = loads(dumps(loads(dumps(lst)))) + assert isinstance(lst[0], cls) + + # }}} + + # {{{ test empty list + + lst = LazilyUnpicklingList([]) + lst = loads(dumps(lst)) + assert len(lst) == 0 + + # }}} + + +class PickleDetectorForLazilyUnpicklingListWithEqAndPersistentHashing( + PickleDetector): + instance_unpickled = False + + def __init__(self, comparison_key): + self.state = comparison_key + + def __repr__(self): + return repr(self.state) + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, repr(self)) + + +def test_LazilyUnpicklingListWithEqAndPersistentHashing(): + from loopy.tools import LazilyUnpicklingListWithEqAndPersistentHashing + + cls = PickleDetectorForLazilyUnpicklingListWithEqAndPersistentHashing + from pickle import loads, 
dumps + + # {{{ test comparison of a pair of lazy lists + + lst0 = LazilyUnpicklingListWithEqAndPersistentHashing( + [cls(0), cls(1)], + eq_key_getter=repr, + persistent_hash_key_getter=repr) + lst1 = LazilyUnpicklingListWithEqAndPersistentHashing( + [cls(0), cls(1)], + eq_key_getter=repr, + persistent_hash_key_getter=repr) + + assert not cls.instance_unpickled + + assert lst0 == lst1 + assert not cls.instance_unpickled + + lst0 = loads(dumps(lst0)) + lst1 = loads(dumps(lst1)) + + assert lst0 == lst1 + assert not cls.instance_unpickled + + lst0.append(cls(3)) + lst1.append(cls(2)) + + assert lst0 != lst1 + + # }}} + + # {{{ comparison with plain lists + + lst = [cls(0), cls(1), cls(3)] + + assert lst == lst0 + assert lst0 == lst + assert not cls.instance_unpickled + + # }}} + + # {{{ persistent hashing + + from loopy.tools import LoopyKeyBuilder + kb = LoopyKeyBuilder() + + assert kb(lst0) == kb(lst) + assert not cls.instance_unpickled + + # }}} + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 0de08f5f616937604bc2c93581c5a8a1770164f4..eff3dbd0e07439bbec399479183a7e9ddb69b9ff 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -28,6 +28,7 @@ import pytest import loopy as lp import pyopencl as cl import sys +import os pytestmark = pytest.mark.importorskip("fparser") @@ -49,7 +50,7 @@ __all__ = [ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa ctx = ctx_factory() - filename = "strongVolumeKernels.f90" + filename = os.path.join(os.path.dirname(__file__), "strongVolumeKernels.f90") with open(filename, "r") as sourcef: source = sourcef.read() diff --git a/test/test_reduction.py b/test/test_reduction.py index be11d7c8cada94596dceb1a8e0e678f8adb582e9..0c37d2228ee41f3e8af7ef6f6fcd68afa7a66960 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -97,22 +97,22 @@ def test_nested_dependent_reduction(ctx_factory): "{[j]: 0<=j 
sumlen = l[i]", + "<> sumlen = ell[i]", "a[i] = sum(j, j)", ], [ lp.ValueArg("n", np.int32), lp.GlobalArg("a", dtype, ("n",)), - lp.GlobalArg("l", np.int32, ("n",)), + lp.GlobalArg("ell", np.int32, ("n",)), ]) cknl = lp.CompiledKernel(ctx, knl) n = 330 - l = np.arange(n, dtype=np.int32) - evt, (a,) = cknl(queue, l=l, n=n, out_host=True) + ell = np.arange(n, dtype=np.int32) + evt, (a,) = cknl(queue, ell=ell, n=n, out_host=True) - tgt_result = (2*l-1)*2*l/2 + tgt_result = (2*ell-1)*2*ell/2 assert (a == tgt_result).all() @@ -413,6 +413,27 @@ def test_parallel_multi_output_reduction(ctx_factory): assert max_index == np.argmax(np.abs(a)) +def test_reduction_with_conditional(): + # Test whether realization of a reduction inherits predicates + # of the original instruction. Tested with the CTarget, because + # the PyOpenCL target will hoist the conditional into the host + # code in this minimal example. + knl = lp.make_kernel( + "{ [i] : 0<=i<42 }", + """ + if n > 0 + <>b = sum(i, a[i]) + end + """, + [lp.GlobalArg("a", dtype=np.float32, shape=(42,)), + lp.GlobalArg("n", dtype=np.float32, shape=())], + target=lp.CTarget()) + code = lp.generate_body(knl) + + # Check that the if appears before the loop that realizes the reduction. 
+ assert code.index("if") < code.index("for") + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_statistics.py b/test/test_statistics.py index a72b62af90050008f837e144f1f28d4a4de1c730..eeb4a5a288afdd5b9295b0b681abb61b5f021d97 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -37,14 +37,14 @@ from pymbolic.primitives import Variable def test_op_counter_basic(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2) + e[i,k] = if( + not(k6 or k/2==ell, + g[i,k]*2, + g[i,k]+h[i,k]/2) """ ], - name="logic", assumptions="n,m,l >= 1") + name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params) f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params) f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params) @@ -118,14 +121,14 @@ def test_op_counter_logic(): def test_op_counter_specialops(): knl = lp.make_kernel( - "{[i,k,j]: 0<=i> k)) """ ], - name="bitwise", assumptions="n,m,l >= 1") + name="bitwise", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes( knl, dict( @@ -169,16 +172,16 @@ def test_op_counter_bitwise(): op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params) i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params) i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params) i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params) i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params) i64shift = 
op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params) - assert i32add == n*m+n*m*l - assert i32bw == 2*n*m*l + assert i32add == n*m+n*m*ell + assert i32bw == 2*n*m*ell assert i64bw == 2*n*m assert i64add == i64mul == n*m assert i64shift == 2*n*m @@ -218,22 +221,22 @@ def test_op_counter_triangular_domain(): def test_mem_access_counter_basic(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2) + e[i,k] = if(not(k6 or k/2==ell, + g[i,k]*2, + g[i,k]+h[i,k]/2) """ ], - name="logic", assumptions="n,m,l >= 1") + name="logic", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} reduced_map = mem_map.group_by('mtype', 'dtype', 'direction') @@ -332,22 +337,22 @@ def test_mem_access_counter_logic(): def test_mem_access_counter_specialops(): knl = lp.make_kernel( - "{[i,k,j]: 0<=i> k)) """ ], - name="bitwise", assumptions="n,m,l >= 1") + name="bitwise", assumptions="n,m,ell >= 1") knl = lp.add_and_infer_dtypes( knl, dict( @@ -398,8 +403,8 @@ def test_mem_access_counter_bitwise(): mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='load', variable='a') ].eval_with_dict(params) @@ -412,7 +417,7 @@ def test_mem_access_counter_bitwise(): i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), stride=0, direction='load', variable='h') ].eval_with_dict(params) - assert i32 == 4*n*m+2*n*m*l + assert i32 == 4*n*m+2*n*m*ell i32 = mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='c') @@ -420,20 +425,20 @@ def test_mem_access_counter_bitwise(): i32 += 
mem_map[lp.MemAccess('global', np.int32, stride=0, direction='store', variable='e') ].eval_with_dict(params) - assert i32 == n*m+n*m*l + assert i32 == n*m+n*m*ell def test_mem_access_counter_mixed(): knl = lp.make_kernel( - "[n,m,l] -> {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", + "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", [ """ c[i,j,k] = 2*a[i,j,k] {id=first} @@ -620,8 +625,8 @@ def test_barrier_counter_barriers(): print(sync_map) n = 512 m = 256 - l = 128 - params = {'n': n, 'm': m, 'l': l} + ell = 128 + params = {'n': n, 'm': m, 'ell': ell} barrier_count = sync_map["barrier_local"].eval_with_dict(params) assert barrier_count == 50*10*2 @@ -630,11 +635,11 @@ def test_all_counters_parallel_matmul(): bsize = 16 knl = lp.make_kernel( - "{[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i,k,j]: 0<=i {[i]: 0<=i 1: exec(sys.argv[1]) diff --git a/test/test_target.py b/test/test_target.py index ad0cb7439bfdd6200e020c0becadcd73072ceef4..aa6f004634f207a7b9733da4a3d7e06d13d7db7c 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -140,6 +140,32 @@ def test_generate_c_snippet(): print(lp.generate_body(knl)) +def test_c_min_max(): + # Test fmin() fmax() is generated for C backend instead of max() and min() + from loopy.target.c import CTarget + import pymbolic.primitives as p + i = p.Variable("i") + xi = p.Subscript(p.Variable("x"), i) + yi = p.Subscript(p.Variable("y"), i) + zi = p.Subscript(p.Variable("z"), i) + + n = 100 + domain = "{[i]: 0<=i<%d}" % n + data = [lp.GlobalArg("x", np.float64, shape=(n,)), + lp.GlobalArg("y", np.float64, shape=(n,)), + lp.GlobalArg("z", np.float64, shape=(n,))] + + inst = [lp.Assignment(xi, p.Variable("min")(yi, zi))] + knl = lp.make_kernel(domain, inst, data, target=CTarget()) + code = lp.generate_code_v2(knl).device_code() + assert "fmin" in code + + inst = [lp.Assignment(xi, p.Variable("max")(yi, zi))] + knl = 
lp.make_kernel(domain, inst, data, target=CTarget()) + code = lp.generate_code_v2(knl).device_code() + assert "fmax" in code + + @pytest.mark.parametrize("tp", ["f32", "f64"]) def test_random123(ctx_factory, tp): ctx = ctx_factory() @@ -240,6 +266,44 @@ def test_numba_cuda_target(): print(lp.generate_code_v2(knl).all_code()) +def test_sized_integer_c_codegen(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from pymbolic import var + knl = lp.make_kernel( + "{[i]: 0<=i ctr = make_uint2(0, 0)", + lp.Assignment("a[i]", lp.TypeCast(np.int64, var("ctr")) << var("i"))] + ) + + with pytest.raises(lp.LoopyError): + knl = lp.preprocess_kernel(knl) + + +def test_target_invalid_type_cast(): + dtype = np.dtype([('', ' 1: exec(sys.argv[1]) diff --git a/test/test_transform.py b/test/test_transform.py index b5fcdf04c4781c5f370c911ceb7efcb4042f6b4e..e50605b46672f8e9c1817431f1577742b1f6fb4c 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -105,6 +105,27 @@ def test_to_batched(ctx_factory): bknl(queue, a=a, x=x) +def test_add_barrier(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + knl = lp.make_kernel( + "{[i, j, ii, jj]: 0<=i,j, ii, jj 1: exec(sys.argv[1])