diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5ea075d194a9da75a1c18d180c65239be83eb85e..f96b43d67fcc1ca53a736fb4893990b8bd363a1a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -69,6 +69,7 @@ Python 2.7 with legacy PyOpenCL:
   - pocl
   except:
   - tags
+  retry: 2
 
 Python 3.6 POCL:
   script:
diff --git a/doc/index.rst b/doc/index.rst
index a0bad2898be4aab74dead90aae825e4e0a460c87..d862a8acd0cb258bfd1e9623bd5cef895871f6b1 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -18,12 +18,14 @@ When you run this script, the following kernel is generated, compiled, and execu
 
 (See the full example for how to print the generated code.)
 
+.. _static-binary:
+
 Want to try out loopy?
 ----------------------
 
 There's no need to go through :ref:`installation` if you'd just like to get a
 feel for what loopy is.  Instead, you may
-`download a self-contained Linux binary <https://gitlab.tiker.net/inducer/loopy/builds/1989/artifacts/browse/build-helpers/>`_.
+`download a self-contained Linux binary <https://gitlab.tiker.net/inducer/loopy/builds/36708/artifacts/browse/build-helpers/>`_.
 This is purposefully built on an ancient Linux distribution, so it should work
 on most versions of Linux that are currently out there.
 
diff --git a/doc/misc.rst b/doc/misc.rst
index 9db3b85a7d96c9ccf56592bcefb2b8639984f4f8..cd6fe102cb9c97a619d8b6512f103c9dcabe65b5 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -3,6 +3,18 @@
 Installation
 ============
 
+Option 0: Static Binary
+-----------------------
+
+If you would just like to experiment with :mod:`loopy`'s code transformation
+abilities, the easiest way to get loopy is to download a statically-linked
+Linux binary.
+
+See :ref:`static-binary` for details.
+
+Option 1: From Source, no PyOpenCL integration
+-----------------------------------------------
+
 This command should install :mod:`loopy`::
 
     pip install loo.py
@@ -26,10 +38,59 @@ You may also clone its git repository::
     git clone --recursive git://github.com/inducer/loopy
     git clone --recursive http://git.tiker.net/trees/loopy.git
 
+Option 2: From Conda Forge, with PyOpenCL integration
+-----------------------------------------------------
+
+This set of instructions is intended for 64-bit Linux and
+macOS computers:
+
+#.  Make sure your system has the basics to build software.
+
+    On Debian derivatives (Ubuntu and many more),
+    installing ``build-essential`` should do the trick.
+
+    Everywhere else, just making sure you have the ``g++`` package should be
+    enough.
+
+#.  Install `miniconda <https://conda.io/miniconda.html>`_.
+    (Both Python 2 and 3 should work. In the absence of other constraints, prefer Python 3.)
+
+#.  ``export CONDA=/WHERE/YOU/INSTALLED/miniconda3``
+
+    If you accepted the default location, this should work:
+
+    ``export CONDA=$HOME/miniconda3``
+
+#.  ``$CONDA/bin/conda create -n dev``
+
+#.  ``source $CONDA/bin/activate dev``
+
+#.  ``conda config --add channels conda-forge``
+
+#.  ``conda install git pip pocl islpy pyopencl`` (Linux)
+
+    or
+
+    ``conda install osx-pocl-opencl git pip pocl islpy pyopencl`` (OS X)
+
+#.  Type the following command::
+
+        pip install git+https://github.com/inducer/loopy
+
+Next time you want to use :mod:`loopy`, just run the following command::
+
+    source /WHERE/YOU/INSTALLED/miniconda3/bin/activate dev
+
+You may also like to add this to a startup file (like :file:`$HOME/.bashrc`) or create an alias for it.
+
+See the `PyOpenCL installation instructions
+<https://documen.tician.de/pyopencl/misc.html#installation>`_ for options
+regarding OpenCL drivers.
+
 User-visible Changes
 ====================
 
-Version 2016.2
+Version 2017.2
 --------------
 .. note::
 
@@ -57,7 +118,7 @@ Licensing
 
 Loopy is licensed to you under the MIT/X Consortium license:
 
-Copyright (c) 2009-13 Andreas Klöckner and Contributors.
+Copyright (c) 2009-17 Andreas Klöckner and Contributors.
 
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 85716fd93ff4768e8ec075c8afa7f0a9b0363999..3f01b0764f71e9ce2de86a66cc71f56473a7dc9f 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -130,6 +130,7 @@ Iname Implementation Tags
 Tag                             Meaning
 =============================== ====================================================
 ``None`` | ``"for"``            Sequential loop
+``"ord"``                       Forced-order sequential loop
 ``"l.N"``                       Local (intra-group) axis N ("local")
 ``"g.N"``                       Group-number axis N ("group")
 ``"unr"``                       Unroll
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 12c058fb741279db55521118f6711f197735dbd0..8b85387259228777f028fb70b1c0cf2efcc2d2ef 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -122,7 +122,9 @@ always see loopy's view of a kernel by printing it.
     i: None
     ---------------------------------------------------------------------------
     INSTRUCTIONS:
-     [i]                                  out[i] <- 2*a[i]   # insn
+    for i
+      out[i] = 2*a[i]  {id=insn}
+    end i
     ---------------------------------------------------------------------------
 
 You'll likely have noticed that there's quite a bit more information here
@@ -1212,11 +1214,11 @@ should call :func:`loopy.get_one_scheduled_kernel`:
    ---------------------------------------------------------------------------
    SCHEDULE:
       0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[])
-      1:     [maketmp] tmp <- arr[i_inner + i_outer*16]
+      1:     tmp = arr[i_inner + i_outer*16]  {id=maketmp}
       2: RETURN FROM KERNEL rotate_v2
-      3: ---BARRIER:global---
+      3: ... gbarrier
       4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[])
-      5:     [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
+      5:     arr[((1 + i_inner + i_outer*16) % n)] = tmp  {id=rotate}
       6: RETURN FROM KERNEL rotate_v2_0
    ---------------------------------------------------------------------------
 
@@ -1250,13 +1252,13 @@ put those instructions into the schedule.
    ---------------------------------------------------------------------------
    SCHEDULE:
       0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[])
-      1:     [maketmp] tmp <- arr[i_inner + i_outer*16]
-      2:     [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp
+      1:     tmp = arr[i_inner + i_outer*16]  {id=maketmp}
+      2:     tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp  {id=tmp.save}
       3: RETURN FROM KERNEL rotate_v2
-      4: ---BARRIER:global---
+      4: ... gbarrier
       5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[])
-      6:     [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0]
-      7:     [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp
+      6:     tmp = tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0]  {id=tmp.reload}
+      7:     arr[((1 + i_inner + i_outer*16) % n)] = tmp  {id=rotate}
       8: RETURN FROM KERNEL rotate_v2_0
    ---------------------------------------------------------------------------
 
diff --git a/loopy/check.py b/loopy/check.py
index a8ec1ad35e42410454b36fa38ef5f0a2fbefc0d6..6bac368381c708b72b2b7f235792df97d0bcd15e 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -144,20 +144,20 @@ def check_for_inactive_iname_access(kernel):
 
 def _is_racing_iname_tag(tv, tag):
     from loopy.kernel.data import (temp_var_scope,
-            LocalIndexTagBase, GroupIndexTag, ParallelTag, auto)
+            LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto)
 
     if tv.scope == temp_var_scope.PRIVATE:
         return (
-                isinstance(tag, ParallelTag)
+                isinstance(tag, ConcurrentTag)
                 and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag)))
 
     elif tv.scope == temp_var_scope.LOCAL:
         return (
-                isinstance(tag, ParallelTag)
+                isinstance(tag, ConcurrentTag)
                 and not isinstance(tag, GroupIndexTag))
 
     elif tv.scope == temp_var_scope.GLOBAL:
-        return isinstance(tag, ParallelTag)
+        return isinstance(tag, ConcurrentTag)
 
     elif tv.scope == auto:
         raise LoopyError("scope of temp var '%s' has not yet been"
@@ -169,7 +169,7 @@ def _is_racing_iname_tag(tv, tag):
 
 
 def check_for_write_races(kernel):
-    from loopy.kernel.data import ParallelTag
+    from loopy.kernel.data import ConcurrentTag
 
     iname_to_tag = kernel.iname_to_tag.get
     for insn in kernel.instructions:
@@ -190,7 +190,7 @@ def check_for_write_races(kernel):
                 raceable_parallel_insn_inames = set(
                         iname
                         for iname in kernel.insn_inames(insn)
-                        if isinstance(iname_to_tag(iname), ParallelTag))
+                        if isinstance(iname_to_tag(iname), ConcurrentTag))
 
             elif assignee_name in kernel.temporary_variables:
                 temp_var = kernel.temporary_variables[assignee_name]
@@ -230,13 +230,13 @@ def check_for_orphaned_user_hardware_axes(kernel):
 
 
 def check_for_data_dependent_parallel_bounds(kernel):
-    from loopy.kernel.data import ParallelTag
+    from loopy.kernel.data import ConcurrentTag
 
     for i, dom in enumerate(kernel.domains):
         dom_inames = set(dom.get_var_names(dim_type.set))
         par_inames = set(iname
                 for iname in dom_inames
-                if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
+                if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag))
 
         if not par_inames:
             continue
@@ -401,7 +401,7 @@ def pre_schedule_checks(kernel):
         logger.debug("%s: pre-schedule check: done" % kernel.name)
     except KeyboardInterrupt:
         raise
-    except:
+    except Exception:
         print(75*"=")
         print("failing kernel during pre-schedule check:")
         print(75*"=")
@@ -659,7 +659,7 @@ def pre_codegen_checks(kernel):
         check_that_shapes_and_strides_are_arguments(kernel)
 
         logger.debug("pre-codegen check %s: done" % kernel.name)
-    except:
+    except Exception:
         print(75*"=")
         print("failing kernel during pre-schedule check:")
         print(75*"=")
@@ -708,6 +708,16 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                 (insn_impl_domain & assumptions)
                 .project_out_except(insn_inames, [dim_type.set]))
 
+        from loopy.kernel.instruction import BarrierInstruction
+        from loopy.kernel.data import LocalIndexTag
+        if isinstance(insn, BarrierInstruction):
+            # project out local-id-mapped inames, solves #94 on gitlab
+            non_lid_inames = frozenset(
+                [iname for iname in insn_inames if not isinstance(
+                    kernel.iname_to_tag.get(iname), LocalIndexTag)])
+            insn_impl_domain = insn_impl_domain.project_out_except(
+                non_lid_inames, [dim_type.set])
+
         insn_domain = kernel.get_inames_domain(insn_inames)
         insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param))
         assumptions, insn_domain = align_two(assumption_non_param, insn_domain)
@@ -715,6 +725,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
             .project_out_except(insn_inames, [dim_type.set])
             .project_out_except(insn_parameters, [dim_type.param]))
 
+        if isinstance(insn, BarrierInstruction):
+            # project out local-id-mapped inames, solves #94 on gitlab
+            desired_domain = desired_domain.project_out_except(
+                non_lid_inames, [dim_type.set])
+
         insn_impl_domain = (insn_impl_domain
                 .project_out_except(insn_parameters, [dim_type.param]))
         insn_impl_domain, desired_domain = align_two(
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 07bcdc7c6c4a0c23d374a14bc21e4e161b73be03..e83515d31f1c61e52569d8d0754ce79e7a7f602f 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -28,7 +28,7 @@ from loopy.diagnostic import LoopyError, warn
 from pytools import ImmutableRecord
 import islpy as isl
 
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
 
@@ -357,8 +357,9 @@ class CodeGenerationState(object):
 # }}}
 
 
-code_gen_cache = PersistentDict("loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION,
-        key_builder=LoopyKeyBuilder())
+code_gen_cache = WriteOncePersistentDict(
+         "loopy-code-gen-cache-v3-"+DATA_MODEL_VERSION,
+         key_builder=LoopyKeyBuilder())
 
 
 class PreambleInfo(ImmutableRecord):
@@ -367,6 +368,7 @@ class PreambleInfo(ImmutableRecord):
     .. attribute:: seen_dtypes
     .. attribute:: seen_functions
     .. attribute:: seen_atomic_dtypes
+    .. attribute:: codegen_state
     """
 
 
@@ -495,7 +497,9 @@ def generate_code_v2(kernel):
             seen_dtypes=seen_dtypes,
             seen_functions=seen_functions,
             # a set of LoopyTypes (!)
-            seen_atomic_dtypes=seen_atomic_dtypes)
+            seen_atomic_dtypes=seen_atomic_dtypes,
+            codegen_state=codegen_state
+            )
 
     preamble_generators = (kernel.preamble_generators
             + kernel.target.get_device_ast_builder().preamble_generators())
@@ -515,7 +519,7 @@ def generate_code_v2(kernel):
     logger.info("%s: generate code: done" % kernel.name)
 
     if CACHING_ENABLED:
-        code_gen_cache[input_kernel] = codegen_result
+        code_gen_cache.store_if_not_present(input_kernel, codegen_result)
 
     return codegen_result
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 61f4b3a9b8c38dfc25ebc81243812aa963423f8a..f398a063dc41f3f82267f6d4850158e4c45f4733 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -58,7 +58,7 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai
 def get_usable_inames_for_conditional(kernel, sched_index):
     from loopy.schedule import (
         find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
-    from loopy.kernel.data import ParallelTag, LocalIndexTagBase, IlpBaseTag
+    from loopy.kernel.data import ConcurrentTag, LocalIndexTagBase, IlpBaseTag
 
     result = find_active_inames_at(kernel, sched_index)
     crosses_barrier = has_barrier_within(kernel, sched_index)
@@ -97,7 +97,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
         #   at the innermost level of nesting.
 
         if (
-                isinstance(tag, ParallelTag)
+                isinstance(tag, ConcurrentTag)
                 and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier)
                 and not isinstance(tag, IlpBaseTag)
                 ):
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 789c00d33b7bb41816e6901e24046d4b0eefb27d..5240042337163f0aefcbc7fdb8f3151ac280053f 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -40,7 +40,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index):
 
     kernel = codegen_state.kernel
 
-    from loopy.kernel.data import LocalIndexTag, HardwareParallelTag
+    from loopy.kernel.data import LocalIndexTag, HardwareConcurrentTag
 
     from loopy.schedule import find_active_inames_at, has_barrier_within
     result = find_active_inames_at(kernel, sched_index)
@@ -48,7 +48,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index):
     has_barrier = has_barrier_within(kernel, sched_index)
 
     for iname, tag in six.iteritems(kernel.iname_to_tag):
-        if (isinstance(tag, HardwareParallelTag)
+        if (isinstance(tag, HardwareConcurrentTag)
                 and codegen_state.is_generating_device_code):
             if not has_barrier or not isinstance(tag, LocalIndexTag):
                 result.add(iname)
@@ -135,12 +135,13 @@ def generate_code_for_sched_index(codegen_state, sched_index):
                 generate_sequential_loop_dim_code)
 
         from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag,
-                LoopedIlpTag, VectorizeTag)
+                LoopedIlpTag, VectorizeTag, InOrderSequentialSequentialTag)
         if isinstance(tag, (UnrollTag, UnrolledIlpTag)):
             func = generate_unroll_loop
         elif isinstance(tag, VectorizeTag):
             func = generate_vectorize_loop
-        elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)):
+        elif tag is None or isinstance(tag, (
+                LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)):
             func = generate_sequential_loop_dim_code
         else:
             raise RuntimeError("encountered (invalid) EnterLoop "
@@ -240,6 +241,15 @@ def build_loop_nest(codegen_state, schedule_index):
 
     kernel = codegen_state.kernel
 
+    # If the AST builder does not implement conditionals, we can skip the
+    # work of hoisting conditionals and recurse directly.
+    if not codegen_state.ast_builder.can_implement_conditionals:
+        result = []
+        inner = generate_code_for_sched_index(codegen_state, schedule_index)
+        if inner is not None:
+            result.append(inner)
+        return merge_codegen_results(codegen_state, result)
+
     # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices
 
     # i.e. go up to the next LeaveLoop, and skip over inner loops.
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 1a132049731cd094ba5665857f1afa4f9b04684a..1db7b0445efd2a2e27e761164fa919647df37a07 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     kernel = codegen_state.kernel
 
     from loopy.kernel.data import (
-            UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)
+            UniqueTag, HardwareConcurrentTag, LocalIndexTag, GroupIndexTag)
 
     from loopy.schedule import get_insn_ids_for_block_at
     insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
@@ -243,7 +243,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
 
         hw_inames_left = [iname
                 for iname in all_inames_by_insns
-                if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]
+                if isinstance(kernel.iname_to_tag.get(iname), HardwareConcurrentTag)]
 
     if not hw_inames_left:
         return next_func(codegen_state)
diff --git a/loopy/execution.py b/loopy/execution.py
index 07e28f06d33e5884ac57c9505593c9ee916c3171..a1228f8f3bb3493e83936ee0b3998bbd5b8cdcc2 100644
--- a/loopy/execution.py
+++ b/loopy/execution.py
@@ -31,7 +31,7 @@ from loopy.diagnostic import LoopyError
 import logging
 logger = logging.getLogger(__name__)
 
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
 
@@ -120,7 +120,7 @@ class SeparateArrayPackingController(object):
 
 # {{{ KernelExecutorBase
 
-typed_and_scheduled_cache = PersistentDict(
+typed_and_scheduled_cache = WriteOncePersistentDict(
         "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
@@ -204,7 +204,7 @@ class KernelExecutorBase(object):
         kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set)
 
         if CACHING_ENABLED:
-            typed_and_scheduled_cache[cache_key] = kernel
+            typed_and_scheduled_cache.store_if_not_present(cache_key, kernel)
 
         return kernel
 
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index f7ce5d9fc983c2ab946b5d959f283ef9328b7f29..49ab3fd68303e18a6bec371fc54db4e63f57346d 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -329,7 +329,7 @@ def is_nonnegative(expr, over_set):
     from loopy.symbolic import aff_from_expr
     try:
         aff = aff_from_expr(space, -expr-1)
-    except:
+    except Exception:
         return None
     expr_neg_set = isl.BasicSet.universe(space).add_constraint(
             isl.Constraint.inequality_from_aff(aff))
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 084c37b45cc4af25689ae3e121f170382c4e8d16..cad11fc78075342a1c270f68486900ead65a95fd 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -40,6 +40,7 @@ from loopy.library.function import (
         single_arg_function_mangler)
 
 from loopy.diagnostic import CannotBranchDomainTree, LoopyError
+from loopy.tools import natsorted
 
 
 # {{{ unique var names
@@ -701,12 +702,12 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         tag_key_uses = {}
 
-        from loopy.kernel.data import HardwareParallelTag
+        from loopy.kernel.data import HardwareConcurrentTag
 
         for iname in cond_inames:
             tag = self.iname_to_tag.get(iname)
 
-            if isinstance(tag, HardwareParallelTag):
+            if isinstance(tag, HardwareConcurrentTag):
                 tag_key_uses.setdefault(tag.key, []).append(iname)
 
         multi_use_keys = set(
@@ -716,7 +717,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         multi_use_inames = set()
         for iname in cond_inames:
             tag = self.iname_to_tag.get(iname)
-            if isinstance(tag, HardwareParallelTag) and tag.key in multi_use_keys:
+            if isinstance(tag, HardwareConcurrentTag) and tag.key in multi_use_keys:
                 multi_use_inames.add(iname)
 
         return frozenset(cond_inames - multi_use_inames)
@@ -958,7 +959,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             try:
                 # insist block size is constant
                 size = static_max_of_pw_aff(size,
-                        constants_only=isinstance(tag, LocalIndexTag))
+                        constants_only=isinstance(tag, LocalIndexTag),
+                        context=self.assumptions)
             except ValueError:
                 pass
 
@@ -1128,20 +1130,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         else:
             sep = []
 
-        def natorder(key):
-            # Return natural ordering for strings, as opposed to dictionary order.
-            # E.g. will result in
-            #  'abc1' < 'abc9' < 'abc10'
-            # rather than
-            #  'abc1' < 'abc10' < 'abc9'
-            # Based on
-            # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7
-            import re
-            return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)]
-
-        def natsorted(seq, key=lambda x: x):
-            return sorted(seq, key=lambda y: natorder(key(y)))
-
         if "name" in what:
             lines.extend(sep)
             lines.append("KERNEL: " + kernel.name)
@@ -1187,113 +1175,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             lines.extend(sep)
             if show_labels:
                 lines.append("INSTRUCTIONS:")
-            loop_list_width = 35
-
-            # {{{ topological sort
-
-            printed_insn_ids = set()
-            printed_insn_order = []
-
-            def insert_insn_into_order(insn):
-                if insn.id in printed_insn_ids:
-                    return
-                printed_insn_ids.add(insn.id)
-
-                for dep_id in natsorted(insn.depends_on):
-                    insert_insn_into_order(kernel.id_to_insn[dep_id])
-
-                printed_insn_order.append(insn)
-
-            for insn in kernel.instructions:
-                insert_insn_into_order(insn)
-
-            # }}}
-
-            import loopy as lp
-
-            Fore = self.options._fore  # noqa
-            Style = self.options._style  # noqa
-
-            from loopy.kernel.tools import draw_dependencies_as_unicode_arrows
-            for insn, (arrows, extender) in zip(
-                    printed_insn_order,
-                    draw_dependencies_as_unicode_arrows(
-                        printed_insn_order, fore=Fore, style=Style)):
-
-                if isinstance(insn, lp.MultiAssignmentBase):
-                    lhs = ", ".join(str(a) for a in insn.assignees)
-                    rhs = str(insn.expression)
-                    trailing = []
-                elif isinstance(insn, lp.CInstruction):
-                    lhs = ", ".join(str(a) for a in insn.assignees)
-                    rhs = "CODE(%s|%s)" % (
-                            ", ".join(str(x) for x in insn.read_variables),
-                            ", ".join("%s=%s" % (name, expr)
-                                for name, expr in insn.iname_exprs))
-
-                    trailing = ["    "+l for l in insn.code.split("\n")]
-                elif isinstance(insn, lp.BarrierInstruction):
-                    lhs = ""
-                    rhs = "... %sbarrier" % insn.kind[0]
-                    trailing = []
-
-                elif isinstance(insn, lp.NoOpInstruction):
-                    lhs = ""
-                    rhs = "... nop"
-                    trailing = []
 
-                else:
-                    raise LoopyError("unexpected instruction type: %s"
-                            % type(insn).__name__)
-
-                order = self._get_iname_order_for_printing()
-                loop_list = ",".join(
-                    sorted(kernel.insn_inames(insn), key=lambda iname: order[iname]))
-
-                options = [Fore.GREEN+insn.id+Style.RESET_ALL]
-                if insn.priority:
-                    options.append("priority=%d" % insn.priority)
-                if insn.tags:
-                    options.append("tags=%s" % ":".join(insn.tags))
-                if isinstance(insn, lp.Assignment) and insn.atomicity:
-                    options.append("atomic=%s" % ":".join(
-                        str(a) for a in insn.atomicity))
-                if insn.groups:
-                    options.append("groups=%s" % ":".join(insn.groups))
-                if insn.conflicts_with_groups:
-                    options.append(
-                            "conflicts=%s" % ":".join(insn.conflicts_with_groups))
-                if insn.no_sync_with:
-                    options.append("no_sync_with=%s" % ":".join(
-                        "%s@%s" % entry for entry in sorted(insn.no_sync_with)))
-
-                if lhs:
-                    core = "%s <- %s" % (
-                        Fore.CYAN+lhs+Style.RESET_ALL,
-                        Fore.MAGENTA+rhs+Style.RESET_ALL,
-                        )
-                else:
-                    core = Fore.MAGENTA+rhs+Style.RESET_ALL
-
-                if len(loop_list) > loop_list_width:
-                    lines.append("%s [%s]" % (arrows, loop_list))
-                    lines.append("%s %s%s   # %s" % (
-                        extender,
-                        (loop_list_width+2)*" ",
-                        core,
-                        ", ".join(options)))
-                else:
-                    lines.append("%s [%s]%s%s   # %s" % (
-                        arrows,
-                        loop_list, " "*(loop_list_width-len(loop_list)),
-                        core,
-                        ",".join(options)))
-
-                lines.extend(trailing)
-
-                if insn.predicates:
-                    lines.append(10*" " + "if (%s)" % " && ".join(
-                        [str(x) for x in insn.predicates]))
+            from loopy.kernel.tools import stringify_instruction_list
+            lines.extend(stringify_instruction_list(kernel))
 
         dep_lines = []
         for insn in kernel.instructions:
@@ -1474,6 +1358,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         return hash(key_hash.digest())
 
     def __eq__(self, other):
+        if self is other:
+            return True
+
         if not isinstance(other, LoopKernel):
             return False
 
@@ -1487,7 +1374,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                         return False
 
             elif field_name == "assumptions":
-                if not self.assumptions.plain_is_equal(other.assumptions):
+                if not (
+                        self.assumptions.plain_is_equal(other.assumptions)
+                        or self.assumptions.is_equal(other.assumptions)):
                     return False
 
             elif getattr(self, field_name) != getattr(other, field_name):
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index e4cb17657632c53120e56aacf29b20bc0778d73f..dcac16479e368908f50f5dff1ef0f4c0edcc3e7b 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -439,7 +439,7 @@ def parse_insn(groups, insn_options):
     if "lhs" in groups:
         try:
             lhs = parse(groups["lhs"])
-        except:
+        except Exception:
             print("While parsing left hand side '%s', "
                     "the following error occurred:" % groups["lhs"])
             raise
@@ -448,7 +448,7 @@ def parse_insn(groups, insn_options):
 
     try:
         rhs = parse(groups["rhs"])
-    except:
+    except Exception:
         print("While parsing right hand side '%s', "
                 "the following error occurred:" % groups["rhs"])
         raise
@@ -522,14 +522,14 @@ def parse_subst_rule(groups):
     from loopy.symbolic import parse
     try:
         lhs = parse(groups["lhs"])
-    except:
+    except Exception:
         print("While parsing left hand side '%s', "
                 "the following error occurred:" % groups["lhs"])
         raise
 
     try:
         rhs = parse(groups["rhs"])
-    except:
+    except Exception:
         print("While parsing right hand side '%s', "
                 "the following error occurred:" % groups["rhs"])
         raise
@@ -901,7 +901,8 @@ def parse_instructions(instructions, defines):
             obj = insn_options_stack.pop()
             #if this object is the end of an if statement
             if obj['predicates'] == if_predicates_stack[-1]["insn_predicates"] and\
-                    if_predicates_stack[-1]["insn_predicates"]:
+                    if_predicates_stack[-1]["insn_predicates"] and\
+                    obj['within_inames'] == if_predicates_stack[-1]['within_inames']:
                 if_predicates_stack.pop()
             continue
 
@@ -996,7 +997,7 @@ def parse_domains(domains, defines):
 
             try:
                 dom = isl.BasicSet.read_from_str(isl.DEFAULT_CONTEXT, dom)
-            except:
+            except Exception:
                 print("failed to parse domain '%s'" % dom)
                 raise
         else:
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 94b31df12dae516d3539438b7e4ed66ed765e697..96933f57a003aaca58ed00d2d73c3301b0c448c7 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -77,14 +77,19 @@ class IndexTag(ImmutableRecord):
         return type(self).__name__
 
 
-class ParallelTag(IndexTag):
+class ConcurrentTag(IndexTag):
     pass
 
 
-class HardwareParallelTag(ParallelTag):
+class HardwareConcurrentTag(ConcurrentTag):
     pass
 
 
+# deprecated aliases
+ParallelTag = ConcurrentTag
+HardwareParallelTag = HardwareConcurrentTag
+
+
 class UniqueTag(IndexTag):
     pass
 
@@ -105,11 +110,11 @@ class AxisTag(UniqueTag):
                 self.print_name, self.axis)
 
 
-class GroupIndexTag(HardwareParallelTag, AxisTag):
+class GroupIndexTag(HardwareConcurrentTag, AxisTag):
     print_name = "g"
 
 
-class LocalIndexTagBase(HardwareParallelTag):
+class LocalIndexTagBase(HardwareConcurrentTag):
     pass
 
 
@@ -130,7 +135,7 @@ class AutoFitLocalIndexTag(AutoLocalIndexTagBase):
 
 # {{{ ilp-like
 
-class IlpBaseTag(ParallelTag):
+class IlpBaseTag(ConcurrentTag):
     pass
 
 
@@ -161,6 +166,11 @@ class ForceSequentialTag(IndexTag):
         return "forceseq"
 
 
+class InOrderSequentialSequentialTag(IndexTag):  # spelled "ord" in tag strings
+    def __str__(self):
+        return "ord"
+
+
 def parse_tag(tag):
     if tag is None:
         return tag
@@ -173,6 +183,8 @@ def parse_tag(tag):
 
     if tag == "for":
         return None
+    elif tag == "ord":
+        return InOrderSequentialSequentialTag()
     elif tag in ["unr"]:
         return UnrollTag()
     elif tag in ["vec"]:
@@ -346,6 +358,14 @@ class TemporaryVariable(ArrayBase):
 
         A :class:`bool` indicating whether the variable may be written during
         its lifetime. If *True*, *initializer* must be given.
+
+    .. attribute:: _base_storage_access_may_be_aliasing
+
+        Whether the temporary is used to alias the underlying base storage.
+        Defaults to *False*. If *False*, C-based code generators will declare
+        the temporary as a ``restrict`` const pointer to the base storage
+        memory location. If *True*, the restrict part is omitted on this
+        declaration.
     """
 
     min_target_axes = 0
@@ -358,12 +378,14 @@ class TemporaryVariable(ArrayBase):
             "base_storage",
             "initializer",
             "read_only",
+            "_base_storage_access_may_be_aliasing",
             ]
 
     def __init__(self, name, dtype=None, shape=(), scope=auto,
             dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
             base_indices=None, storage_shape=None,
-            base_storage=None, initializer=None, read_only=False, **kwargs):
+            base_storage=None, initializer=None, read_only=False,
+            _base_storage_access_may_be_aliasing=False, **kwargs):
         """
         :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
         :arg shape: :class:`loopy.auto` or a shape tuple
@@ -419,6 +441,13 @@ class TemporaryVariable(ArrayBase):
                     "mutually exclusive"
                     % name)
 
+        if base_storage is None and _base_storage_access_may_be_aliasing:
+            raise LoopyError(
+                    "temporary variable '%s': "
+                    "_base_storage_access_may_be_aliasing option, but no "
+                    "base_storage given!"
+                    % name)
+
         ArrayBase.__init__(self, name=intern(name),
                 dtype=dtype, shape=shape,
                 dim_tags=dim_tags, offset=offset, dim_names=dim_names,
@@ -428,6 +457,8 @@ class TemporaryVariable(ArrayBase):
                 base_storage=base_storage,
                 initializer=initializer,
                 read_only=read_only,
+                _base_storage_access_may_be_aliasing=(
+                    _base_storage_access_may_be_aliasing),
                 **kwargs)
 
     @property
@@ -489,7 +520,10 @@ class TemporaryVariable(ArrayBase):
                 and (
                     (self.initializer is None and other.initializer is None)
                     or np.array_equal(self.initializer, other.initializer))
-                and self.read_only == other.read_only)
+                and self.read_only == other.read_only
+                and (self._base_storage_access_may_be_aliasing
+                    == other._base_storage_access_may_be_aliasing)
+                )
 
     def update_persistent_hash(self, key_hash, key_builder):
         """Custom hash computation function for use with
@@ -500,6 +534,8 @@ class TemporaryVariable(ArrayBase):
         self.update_persistent_hash_for_shape(key_hash, key_builder,
                 self.storage_shape)
         key_builder.rec(key_hash, self.base_indices)
+        key_builder.rec(key_hash, self.scope)
+        key_builder.rec(key_hash, self.base_storage)
 
         initializer = self.initializer
         if initializer is not None:
@@ -507,10 +543,22 @@ class TemporaryVariable(ArrayBase):
         key_builder.rec(key_hash, initializer)
 
         key_builder.rec(key_hash, self.read_only)
+        key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing)
 
 # }}}
 
 
+def iname_tag_to_temp_var_scope(iname_tag):
+    """Map *iname_tag* (passed through :func:`parse_tag`) to a temp var scope."""
+    tag = parse_tag(iname_tag)
+    if isinstance(tag, GroupIndexTag):
+        return temp_var_scope.GLOBAL
+    if isinstance(tag, LocalIndexTag):
+        return temp_var_scope.LOCAL
+
+    return temp_var_scope.PRIVATE
+
+
 # {{{ substitution rule
 
 class SubstitutionRule(ImmutableRecord):
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 8bdc72d54a91c6e8b4f9ec0ca3053831627d3eae..02df0f2b4fd27dcb0f8b847411aa3dea7f3f9169 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -35,7 +35,7 @@ import islpy as isl
 from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
 from pytools import memoize_on_first_arg
-
+from loopy.tools import natsorted
 
 import logging
 logger = logging.getLogger(__name__)
@@ -620,11 +620,11 @@ class DomainParameterFinder(object):
                         if dep.name in param_names:
                             from pymbolic.algorithm import solve_affine_equations_for
                             try:
-                                # friggin' overkill :)
+                                # overkill :)
                                 param_expr = solve_affine_equations_for(
                                         [dep.name], [(shape_i, var("shape_i"))]
                                         )[dep.name]
-                            except:
+                            except Exception:
                                 # went wrong? oh well
                                 pass
                             else:
@@ -1070,7 +1070,7 @@ def guess_var_shape(kernel, var_name):
 
             if n_axes == 1:
                 # Leave shape undetermined--we can live with that for 1D.
-                shape = (None,)
+                shape = None
             else:
                 raise LoopyError("cannot determine access range for '%s': "
                         "undetermined index in subscript(s) '%s'"
@@ -1092,7 +1092,7 @@ def guess_var_shape(kernel, var_name):
                             kernel.cache_manager.dim_max(
                                 armap.access_range, i) + 1,
                             constants_only=False)))
-            except:
+            except Exception:
                 print("While trying to find shape axis %d of "
                         "variable '%s', the following "
                         "exception occurred:" % (i, var_name),
@@ -1371,7 +1371,167 @@ def draw_dependencies_as_unicode_arrows(
                 conform_to_uniform_length(extender))
             for row, extender in rows]
 
-    return rows
+    return uniform_length, rows
+
+# }}}
+
+
+# {{{ stringify_instruction_list
+
+def stringify_instruction_list(kernel):
+    """Return the instructions of *kernel* as a list of formatted lines."""
+    # {{{ topological sort
+    printed_insn_ids = set()
+    printed_insn_order = []
+
+    def insert_insn_into_order(insn):
+        if insn.id in printed_insn_ids:
+            return
+        printed_insn_ids.add(insn.id)
+        # Dependencies are appended first, so each insn follows what it needs.
+        for dep_id in natsorted(insn.depends_on):
+            insert_insn_into_order(kernel.id_to_insn[dep_id])
+
+        printed_insn_order.append(insn)
+
+    for insn in kernel.instructions:
+        insert_insn_into_order(insn)
+
+    # }}}
+
+    import loopy as lp
+
+    Fore = kernel.options._fore  # noqa
+    Style = kernel.options._style  # noqa
+
+    uniform_arrow_length, arrows_and_extenders = \
+            draw_dependencies_as_unicode_arrows(
+                    printed_insn_order, fore=Fore, style=Style)
+    # Initial gutter for non-instruction lines: blanks as wide as the arrows.
+    leader = " " * uniform_arrow_length
+    lines = []
+    current_inames = [set()]
+    # (one-element lists, so that the nested helpers below can update them)
+    if uniform_arrow_length:
+        indent_level = [1]
+    else:
+        indent_level = [0]
+
+    indent_increment = 2
+
+    iname_order = kernel._get_iname_order_for_printing()
+
+    def add_pre_line(s):
+        lines.append(leader + " " * indent_level[0] + s)
+
+    def add_main_line(s):
+        lines.append(arrows + " " * indent_level[0] + s)
+
+    def add_post_line(s):
+        lines.append(extender + " " * indent_level[0] + s)
+
+    def adapt_to_new_inames_list(new_inames):
+        added = []
+        removed = []
+
+        # FIXME: Doesn't respect strict nesting
+        for iname in iname_order:
+            is_in_current = iname in current_inames[0]
+            is_in_new = iname in new_inames
+
+            if is_in_new == is_in_current:
+                pass
+            elif is_in_new and not is_in_current:
+                added.append(iname)
+            elif not is_in_new and is_in_current:
+                removed.append(iname)
+            else:
+                assert False
+
+        if removed:
+            indent_level[0] -= indent_increment * len(removed)
+            add_pre_line("end " + ", ".join(removed))
+        if added:
+            add_pre_line("for " + ", ".join(added))
+            indent_level[0] += indent_increment * len(added)
+
+        current_inames[0] = new_inames
+
+    for insn, (arrows, extender) in zip(printed_insn_order, arrows_and_extenders):
+        if isinstance(insn, lp.MultiAssignmentBase):
+            lhs = ", ".join(str(a) for a in insn.assignees)
+            rhs = str(insn.expression)
+            trailing = []
+        elif isinstance(insn, lp.CInstruction):
+            lhs = ", ".join(str(a) for a in insn.assignees)
+            rhs = "CODE(%s|%s)" % (
+                    ", ".join(str(x) for x in insn.read_variables),
+                    ", ".join("%s=%s" % (name, expr)
+                        for name, expr in insn.iname_exprs))
+
+            trailing = [l for l in insn.code.split("\n")]
+        elif isinstance(insn, lp.BarrierInstruction):
+            lhs = ""
+            rhs = "... %sbarrier" % insn.kind[0]
+            trailing = []
+
+        elif isinstance(insn, lp.NoOpInstruction):
+            lhs = ""
+            rhs = "... nop"
+            trailing = []
+
+        else:
+            raise LoopyError("unexpected instruction type: %s"
+                    % type(insn).__name__)
+        # Emit "end"/"for" lines so the open loops match this insn's inames.
+        adapt_to_new_inames_list(kernel.insn_inames(insn))
+
+        options = ["id="+Fore.GREEN+insn.id+Style.RESET_ALL]
+        if insn.priority:
+            options.append("priority=%d" % insn.priority)
+        if insn.tags:
+            options.append("tags=%s" % ":".join(insn.tags))
+        if isinstance(insn, lp.Assignment) and insn.atomicity:
+            options.append("atomic=%s" % ":".join(
+                str(a) for a in insn.atomicity))
+        if insn.groups:
+            options.append("groups=%s" % ":".join(insn.groups))
+        if insn.conflicts_with_groups:
+            options.append(
+                    "conflicts=%s" % ":".join(insn.conflicts_with_groups))
+        if insn.no_sync_with:
+            options.append("no_sync_with=%s" % ":".join(
+                "%s@%s" % entry for entry in sorted(insn.no_sync_with)))
+
+        if lhs:
+            core = "%s = %s" % (
+                Fore.CYAN+lhs+Style.RESET_ALL,
+                Fore.MAGENTA+rhs+Style.RESET_ALL,
+                )
+        else:
+            core = Fore.MAGENTA+rhs+Style.RESET_ALL
+
+        options_str = "  {%s}" % ", ".join(options)
+
+        if insn.predicates:
+            # FIXME: precedence
+            add_pre_line("if %s" % " and ".join([str(x) for x in insn.predicates]))
+            indent_level[0] += indent_increment
+
+        add_main_line(core + options_str)
+
+        for t in trailing:
+            add_post_line(t)
+
+        if insn.predicates:
+            indent_level[0] -= indent_increment
+            add_post_line("end")
+        # Pre-lines of the next insn are prefixed with this insn's extender.
+        leader = extender
+    # Close any loops that are still open after the last instruction.
+    adapt_to_new_inames_list([])
+
+    return lines
 
 # }}}
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 541b44f58c5b02e9beba15211cb861fd09f14096..ac7ac19887388649670154fcd36eba79ba3b4315 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -30,7 +30,7 @@ from loopy.diagnostic import (
 
 import islpy as isl
 
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
 
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
@@ -292,7 +292,7 @@ def _classify_reduction_inames(kernel, inames):
 
     from loopy.kernel.data import (
             LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
-            ParallelTag)
+            ConcurrentTag)
 
     for iname in inames:
         iname_tag = kernel.iname_to_tag.get(iname)
@@ -305,7 +305,7 @@ def _classify_reduction_inames(kernel, inames):
         elif isinstance(iname_tag, LocalIndexTagBase):
             local_par.append(iname)
 
-        elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
+        elif isinstance(iname_tag, (ConcurrentTag, VectorizeTag)):
             nonlocal_par.append(iname)
 
         else:
@@ -610,7 +610,7 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound):
     if len(coeffs) == 0:
         try:
             scan_iname_aff.get_constant_val()
-        except:
+        except Exception:
             raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff)
 
         # If this point is reached we're assuming the domain is of the form
@@ -956,7 +956,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                         nresults=nresults,
                         depends_on=insn.depends_on,
                         within_inames=insn.within_inames | expr.inames,
-                        within_inames_is_final=insn.within_inames_is_final)
+                        within_inames_is_final=insn.within_inames_is_final,
+                        predicates=insn.predicates,
+                        )
 
                 newly_generated_insn_id_set.add(get_args_insn_id)
 
@@ -970,7 +972,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
         return updated_inner_exprs
 
     def expand_inner_reduction(id, expr, nresults, depends_on, within_inames,
-            within_inames_is_final):
+            within_inames_is_final, predicates):
         # FIXME: use make_temporaries
         from pymbolic.primitives import Call
         from loopy.symbolic import Reduction
@@ -997,7 +999,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 expression=expr,
                 depends_on=depends_on,
                 within_inames=within_inames,
-                within_inames_is_final=within_inames_is_final)
+                within_inames_is_final=within_inames_is_final,
+                predicates=predicates)
 
         generated_insns.append(call_insn)
 
@@ -1038,7 +1041,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 within_inames=outer_insn_inames - frozenset(expr.inames),
                 within_inames_is_final=insn.within_inames_is_final,
                 depends_on=init_insn_depends_on,
-                expression=expr.operation.neutral_element(*arg_dtypes))
+                expression=expr.operation.neutral_element(*arg_dtypes),
+                predicates=insn.predicates,)
 
         generated_insns.append(init_insn)
 
@@ -1064,7 +1068,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     nresults=nresults,
                     depends_on=insn.depends_on,
                     within_inames=update_insn_iname_deps,
-                    within_inames_is_final=insn.within_inames_is_final)
+                    within_inames_is_final=insn.within_inames_is_final,
+                    predicates=insn.predicates,
+                    )
 
             reduction_insn_depends_on.add(get_args_insn_id)
         else:
@@ -1079,7 +1085,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     reduction_expr),
                 depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on,
                 within_inames=update_insn_iname_deps,
-                within_inames_is_final=insn.within_inames_is_final)
+                within_inames_is_final=insn.within_inames_is_final,
+                predicates=insn.predicates,)
 
         generated_insns.append(reduction_insn)
 
@@ -1186,7 +1193,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 expression=neutral,
                 within_inames=base_iname_deps | frozenset([base_exec_iname]),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset())
+                depends_on=frozenset(),
+                predicates=insn.predicates,
+                )
         generated_insns.append(init_insn)
 
         init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname))
@@ -1196,7 +1205,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 expression=neutral,
                 within_inames=base_iname_deps | frozenset([base_exec_iname]),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset())
+                depends_on=frozenset(),
+                predicates=insn.predicates,
+                )
         generated_insns.append(init_neutral_insn)
 
         transfer_depends_on = set([init_neutral_id, init_id])
@@ -1216,7 +1227,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     within_inames=(
                         (outer_insn_inames - frozenset(expr.inames))
                         | frozenset([red_iname])),
-                    within_inames_is_final=insn.within_inames_is_final)
+                    within_inames_is_final=insn.within_inames_is_final,
+                    predicates=insn.predicates,
+                    )
 
             transfer_depends_on.add(get_args_insn_id)
         else:
@@ -1239,7 +1252,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     | frozenset([red_iname])),
                 within_inames_is_final=insn.within_inames_is_final,
                 depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
-                no_sync_with=frozenset([(init_id, "any")]))
+                no_sync_with=frozenset([(init_id, "any")]),
+                predicates=insn.predicates,
+                )
         generated_insns.append(transfer_insn)
 
         cur_size = 1
@@ -1280,6 +1295,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                         base_iname_deps | frozenset([stage_exec_iname])),
                     within_inames_is_final=insn.within_inames_is_final,
                     depends_on=frozenset([prev_id]),
+                    predicates=insn.predicates,
                     )
 
             generated_insns.append(stage_insn)
@@ -1398,7 +1414,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     (sweep_iname,) + expr.inames),
                 within_inames_is_final=insn.within_inames_is_final,
                 depends_on=init_insn_depends_on,
-                expression=expr.operation.neutral_element(*arg_dtypes))
+                expression=expr.operation.neutral_element(*arg_dtypes),
+                predicates=insn.predicates,
+                )
 
         generated_insns.append(init_insn)
 
@@ -1425,7 +1443,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 depends_on=frozenset(update_insn_depends_on),
                 within_inames=update_insn_iname_deps,
                 no_sync_with=insn.no_sync_with,
-                within_inames_is_final=insn.within_inames_is_final)
+                within_inames_is_final=insn.within_inames_is_final,
+                predicates=insn.predicates,
+                )
 
         generated_insns.append(scan_insn)
 
@@ -1531,7 +1551,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 expression=neutral,
                 within_inames=base_iname_deps | frozenset([base_exec_iname]),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=init_insn_depends_on)
+                depends_on=init_insn_depends_on,
+                predicates=insn.predicates,
+                )
         generated_insns.append(init_insn)
 
         transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on
@@ -1561,7 +1583,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 within_inames=outer_insn_inames - frozenset(expr.inames),
                 within_inames_is_final=insn.within_inames_is_final,
                 depends_on=frozenset(transfer_insn_depends_on),
-                no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with)
+                no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with,
+                predicates=insn.predicates,
+                )
 
         generated_insns.append(transfer_insn)
 
@@ -1590,7 +1614,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                         within_inames=(
                             base_iname_deps | frozenset([stage_exec_iname])),
                         within_inames_is_final=insn.within_inames_is_final,
-                        depends_on=frozenset([prev_id]))
+                        depends_on=frozenset([prev_id]),
+                        predicates=insn.predicates,
+                        )
 
                 if cur_size == 1:
                     # Performance hack: don't add a barrier here with transfer_insn.
@@ -1623,6 +1649,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                         base_iname_deps | frozenset([stage_exec_iname])),
                     within_inames_is_final=insn.within_inames_is_final,
                     depends_on=frozenset([prev_id]),
+                    predicates=insn.predicates,
                     )
 
             generated_insns.append(write_stage_insn)
@@ -2020,7 +2047,8 @@ def limit_boostability(kernel):
 # }}}
 
 
-preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
+preprocess_cache = WriteOncePersistentDict(
+        "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
 
@@ -2126,7 +2154,7 @@ def preprocess_kernel(kernel, device=None):
     # }}}
 
     if CACHING_ENABLED:
-        preprocess_cache[input_kernel] = kernel
+        preprocess_cache.store_if_not_present(input_kernel, kernel)
 
     return kernel
 
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index d28e7b1b3def2b988b4624aed9caf8f65c70b2c5..abf4d799fbdb14f86fa29dde26e6654130fc66de 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -29,7 +29,7 @@ import sys
 import islpy as isl
 from loopy.diagnostic import warn_with_kernel, LoopyError  # noqa
 
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
 
@@ -206,13 +206,13 @@ def find_loop_nest_with_map(kernel):
     """
     result = {}
 
-    from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
 
     all_nonpar_inames = set([
             iname
             for iname in kernel.all_inames()
             if not isinstance(kernel.iname_to_tag.get(iname),
-                (ParallelTag, IlpBaseTag, VectorizeTag))])
+                (ConcurrentTag, IlpBaseTag, VectorizeTag))])
 
     iname_to_insns = kernel.iname_to_insns()
 
@@ -274,10 +274,10 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
 
     result = {}
 
-    from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
     for insn in kernel.instructions:
         for iname in kernel.insn_inames(insn):
-            if isinstance(kernel.iname_to_tag.get(iname), ParallelTag):
+            if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag):
                 continue
 
             iname_dep = result.setdefault(iname, set())
@@ -308,7 +308,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
                         continue
 
                     tag = kernel.iname_to_tag.get(dep_insn_iname)
-                    if isinstance(tag, (ParallelTag, IlpBaseTag, VectorizeTag)):
+                    if isinstance(tag, (ConcurrentTag, IlpBaseTag, VectorizeTag)):
                         # Parallel tags don't really nest, so we'll disregard
                         # them here.
                         continue
@@ -431,10 +431,10 @@ def format_insn(kernel, insn_id):
     from loopy.kernel.instruction import (
             MultiAssignmentBase, NoOpInstruction, BarrierInstruction)
     if isinstance(insn, MultiAssignmentBase):
-        return "[%s] %s%s%s <- %s%s%s" % (
-            format_insn_id(kernel, insn_id),
+        return "%s%s%s = %s%s%s  {id=%s}" % (
             Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL,
-            Fore.MAGENTA, str(insn.expression), Style.RESET_ALL)
+            Fore.MAGENTA, str(insn.expression), Style.RESET_ALL,
+            format_insn_id(kernel, insn_id))
     elif isinstance(insn, BarrierInstruction):
         return "[%s] %s... %sbarrier%s" % (
                 format_insn_id(kernel, insn_id),
@@ -456,11 +456,11 @@ def dump_schedule(kernel, schedule):
     from loopy.kernel.data import MultiAssignmentBase
     for sched_item in schedule:
         if isinstance(sched_item, EnterLoop):
-            lines.append(indent + "FOR %s" % sched_item.iname)
+            lines.append(indent + "for %s" % sched_item.iname)
             indent += "    "
         elif isinstance(sched_item, LeaveLoop):
             indent = indent[:-4]
-            lines.append(indent + "END %s" % sched_item.iname)
+            lines.append(indent + "end %s" % sched_item.iname)
         elif isinstance(sched_item, CallKernel):
             lines.append(indent +
                          "CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % (
@@ -479,7 +479,7 @@ def dump_schedule(kernel, schedule):
                 insn_str = sched_item.insn_id
             lines.append(indent + insn_str)
         elif isinstance(sched_item, Barrier):
-            lines.append(indent + "---BARRIER:%s---" % sched_item.kind)
+            lines.append(indent + "... %sbarrier" % sched_item.kind[0])
         else:
             assert False
 
@@ -1787,7 +1787,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
         for item in preschedule
         for insn_id in sched_item_to_insn_id(item))
 
-    from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag
+    from loopy.kernel.data import IlpBaseTag, ConcurrentTag, VectorizeTag
     ilp_inames = set(
             iname
             for iname in kernel.all_inames()
@@ -1798,7 +1798,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
             if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag))
     parallel_inames = set(
             iname for iname in kernel.all_inames()
-            if isinstance(kernel.iname_to_tag.get(iname), ParallelTag))
+            if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag))
 
     loop_nest_with_map = find_loop_nest_with_map(kernel)
     loop_nest_around_map = find_loop_nest_around_map(kernel)
@@ -1940,7 +1940,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
 # }}}
 
 
-schedule_cache = PersistentDict("loopy-schedule-cache-v4-"+DATA_MODEL_VERSION,
+schedule_cache = WriteOncePersistentDict(
+        "loopy-schedule-cache-v4-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
 
@@ -1971,7 +1972,7 @@ def get_one_scheduled_kernel(kernel):
             kernel.name, time()-start_time))
 
     if CACHING_ENABLED and not from_cache:
-        schedule_cache[sched_cache_key] = result
+        schedule_cache.store_if_not_present(sched_cache_key, result)
 
     return result
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 233cfe5e881ef594ebabc536ab8c7b3d18d5cf17..88d7ec328345fd4c97d75b449385316f99c2509d 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1000,6 +1000,9 @@ def add_assumptions_guard(kernel, pwqpolynomial):
 
 def count(kernel, set, space=None):
     try:
+        if space is not None:
+            set = set.align_params(space)
+
         return add_assumptions_guard(kernel, set.card())
     except AttributeError:
         pass
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index f1a494f30d469511817d204c0476ff79abe00e3b..543c2743bb98b09b706c2fdbf9188ed0a85d97f2 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1232,6 +1232,9 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin):
         super(PwAffEvaluationMapper, self).__init__(context)
 
     def map_constant(self, expr):
+        if isinstance(expr, np.integer):
+            expr = int(expr)
+
         return self.pw_zero + expr
 
     def map_min(self, expr):
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 7e307ef8bdd4d89e24b26dbacf39733ab3350307..5800a0236e8ae5f81a63942c31a74822bc2fab96 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -211,6 +211,10 @@ class ASTBuilderBase(object):
             static_lbound, static_ubound, inner):
         raise NotImplementedError()
 
+    @property
+    def can_implement_conditionals(self):
+        return False  # base-class default; subclasses may override
+
     def emit_if(self, condition_str, ast):
         raise NotImplementedError()
 
@@ -275,28 +279,6 @@ class DummyHostASTBuilder(ASTBuilderBase):
     def ast_block_scope_class(self):
         return _DummyASTBlock
 
-    def emit_assignment(self, codegen_state, insn):
-        return None
-
-    def emit_multiple_assignment(self, codegen_state, insn):
-        return None
-
-    def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
-            static_lbound, static_ubound, inner):
-        return None
-
-    def emit_if(self, condition_str, ast):
-        return None
-
-    def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
-        return None
-
-    def emit_blank_line(self):
-        return None
-
-    def emit_comment(self, s):
-        return None
-
 # }}}
 
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index a2ad682505bbdb7ed5977a28e201ebc6655c7784..e54ac0f693c4704c13b8c435e4bc7acaac1b1a47 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -307,6 +307,12 @@ class _ConstRestrictPointer(Pointer):
         return sub_tp, ("*const __restrict__ %s" % sub_decl)
 
 
+class _ConstPointer(Pointer):  # _ConstRestrictPointer minus __restrict__
+    def get_decl_pair(self):
+        sub_tp, sub_decl = self.subdecl.get_decl_pair()
+        return sub_tp, ("*const %s" % sub_decl)
+
+
 class CASTBuilder(ASTBuilderBase):
     # {{{ library
 
@@ -462,13 +468,17 @@ class CASTBuilder(ASTBuilderBase):
                     temp_var_decl = self.wrap_temporary_decl(
                             temp_var_decl, tv.scope)
 
-                    # The 'restrict' part of this is a complete lie--of course
-                    # all these temporaries are aliased. But we're promising to
-                    # not use them to shovel data from one representation to the
-                    # other. That counts, right?
+                    if tv._base_storage_access_may_be_aliasing:
+                        ptrtype = _ConstPointer
+                    else:
+                        # The 'restrict' part of this is a complete lie--of course
+                        # all these temporaries are aliased. But we're promising to
+                        # not use them to shovel data from one representation to the
+                        # other. That counts, right?
+                        ptrtype = _ConstRestrictPointer
 
-                    cast_decl = _ConstRestrictPointer(cast_decl)
-                    temp_var_decl = _ConstRestrictPointer(temp_var_decl)
+                    cast_decl = ptrtype(cast_decl)
+                    temp_var_decl = ptrtype(temp_var_decl)
 
                     cast_tp, cast_d = cast_decl.get_decl_pair()
                     temp_var_decl = Initializer(
@@ -789,6 +799,10 @@ class CASTBuilder(ASTBuilderBase):
         from cgen import Comment
         return Comment(s)
 
+    @property
+    def can_implement_conditionals(self):
+        return True  # emit_if below produces a cgen If
+
     def emit_if(self, condition_str, ast):
         from cgen import If
         return If(condition_str, ast)
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 2da25ba39ceef38a4af105913973226bd3773729..975c691a74d0d17bdca39243f515c5d04284893d 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -328,7 +328,8 @@ def generate_arg_setup(gen, kernel, implemented_data_info, options):
         # {{{ allocate written arrays, if needed
 
         if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
-                and arg.shape is not None:
+                and arg.shape is not None \
+                and all(si is not None for si in arg.shape):
 
             if not isinstance(arg.dtype, NumpyType):
                 raise LoopyError("do not know how to pass arg of type '%s'"
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 11951abcf17e94c0fdba51042e3060735215b423..ce04986d3d2a39dcf7126339055d32fa16ffcc25 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -283,6 +283,10 @@ class PythonASTBuilderBase(ASTBuilderBase):
         from genpy import Comment
         return Comment(s)
 
+    @property
+    def can_implement_conditionals(self):
+        return True  # emit_if below produces a genpy If
+
     def emit_if(self, condition_str, ast):
         from genpy import If
         return If(condition_str, ast)
diff --git a/loopy/tools.py b/loopy/tools.py
index 1ebbe5c8a4fd2b68e3bfcf5ed493384599dac2c5..d6952d54782f113685299641c828907fb7f32a46 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -576,4 +576,19 @@ def intern_frozenset_of_ids(fs):
     return frozenset(intern(s) for s in fs)
 
 
+def natorder(key):
+    # Sort key for natural string ordering: 'abc1' < 'abc9' < 'abc10', unlike
+    # dictionary order ('abc1' < 'abc10' < 'abc9'). Digit runs are tagged to
+    # sort before text, so int and str never get compared. Based on
+    # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7
+    import re
+    return [
+        (0, int(n), "") if n else (1, 0, s)
+        for n, s in re.findall(r'(\d+)|(\D+)', key)]
+
+
+def natsorted(seq, key=lambda x: x):
+    # Sort *seq* naturally; *key* must map each item to a string.
+    return sorted(seq, key=lambda y: natorder(key(y)))
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py
index f4e6526a7b083f0b38dda1209b607aa38a62b68e..618e36f20da8b3f9089ecf5ce88d6b3177528570 100644
--- a/loopy/transform/array_buffer_map.py
+++ b/loopy/transform/array_buffer_map.py
@@ -239,14 +239,14 @@ class ArrayToBufferMap(object):
         non1_storage_axis_flags = []
         non1_storage_shape = []
 
-        for saxis, bi, l in zip(
+        for saxis, bi, saxis_len in zip(
                 storage_axis_names, storage_base_indices, storage_shape):
-            has_length_non1 = l != 1
+            has_length_non1 = saxis_len != 1
 
             non1_storage_axis_flags.append(has_length_non1)
 
             if has_length_non1:
-                non1_storage_shape.append(l)
+                non1_storage_shape.append(saxis_len)
 
         # }}}
 
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 92cff7a507d672a3acc51a8abed572a04cb7e86a..1b059b6a73d3064596b8679fbc87f94287b2d9fe 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -29,7 +29,7 @@ from loopy.symbolic import (get_dependencies,
         RuleAwareIdentityMapper, SubstitutionRuleMappingContext,
         SubstitutionMapper)
 from pymbolic.mapper.substitutor import make_subst_func
-from pytools.persistent_dict import PersistentDict
+from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper
 from loopy.version import DATA_MODEL_VERSION
 from loopy.diagnostic import LoopyError
@@ -124,7 +124,8 @@ class ArrayAccessReplacer(RuleAwareIdentityMapper):
 # }}}
 
 
-buffer_array_cache = PersistentDict("loopy-buffer-array-cache-"+DATA_MODEL_VERSION,
+buffer_array_cache = WriteOncePersistentDict(
+        "loopy-buffer-array-cache-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
 
@@ -531,7 +532,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
 
     if CACHING_ENABLED:
         from loopy.preprocess import prepare_for_caching
-        buffer_array_cache[cache_key] = prepare_for_caching(kernel)
+        buffer_array_cache.store_if_not_present(
+                cache_key, prepare_for_caching(kernel))
 
     return kernel
 
diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 77840753258fa545aa01ef3e8c58cbc36e66ed72..0ac71d603ebe8b5150fb854dd3978676dd9d98c3 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -38,6 +38,7 @@ from loopy.symbolic import IdentityMapper
 class ExtraInameIndexInserter(IdentityMapper):
     def __init__(self, var_to_new_inames):
         self.var_to_new_inames = var_to_new_inames
+        self.seen_ilp_inames = set()
 
     def map_subscript(self, expr):
         try:
@@ -50,6 +51,7 @@ class ExtraInameIndexInserter(IdentityMapper):
                 index = (index,)
             index = tuple(self.rec(i) for i in index)
 
+            self.seen_ilp_inames.update(v.name for v in new_idx)
             return expr.aggregate.index(index + new_idx)
 
     def map_variable(self, expr):
@@ -58,6 +60,7 @@ class ExtraInameIndexInserter(IdentityMapper):
         except KeyError:
             return expr
         else:
+            self.seen_ilp_inames.update(v.name for v in new_idx)
             return expr.index(new_idx)
 
 
@@ -160,13 +163,30 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
     # }}}
 
     from pymbolic import var
-    eiii = ExtraInameIndexInserter(
-            dict((var_name, tuple(var(iname) for iname in inames))
-                for var_name, inames in six.iteritems(var_to_new_ilp_inames)))
-
-    new_insns = [
-            insn.with_transformed_expressions(eiii)
-            for insn in kernel.instructions]
+    var_to_extra_iname = dict(
+            (var_name, tuple(var(iname) for iname in inames))
+            for var_name, inames in six.iteritems(var_to_new_ilp_inames))
+
+    new_insns = []
+
+    for insn in kernel.instructions:
+        eiii = ExtraInameIndexInserter(var_to_extra_iname)
+        new_insn = insn.with_transformed_expressions(eiii)
+        if not eiii.seen_ilp_inames <= insn.within_inames:
+
+            from loopy.diagnostic import warn_with_kernel
+            warn_with_kernel(
+                    kernel,
+                    "implicit_ilp_iname",
+                    "Instruction '%s': touched variable that (for ILP) "
+                    "required iname(s) '%s', but that the instruction was not "
+                    "previously within the iname(s). Previously, this would "
+                    "implicitly promote the instruction, but that behavior is "
+                    "deprecated and will stop working in 2018.1."
+                    % (insn.id, ", ".join(
+                        eiii.seen_ilp_inames - insn.within_inames)))
+
+        new_insns.append(new_insn)
 
     return kernel.copy(
         temporary_variables=new_temp_vars,
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index ea90abfe27c8de69daf39021b3d0ea5463a2e4c8..22fd7b3bb2c643bc3c1309f4e3fdb89438ae7d2b 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -641,7 +641,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
 
     iname_to_tag = [(iname, parse_tag(tag)) for iname, tag in iname_to_tag]
 
-    from loopy.kernel.data import (ParallelTag, AutoLocalIndexTagBase,
+    from loopy.kernel.data import (ConcurrentTag, AutoLocalIndexTagBase,
             ForceSequentialTag)
 
     # {{{ globbing
@@ -686,13 +686,13 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
         if iname not in kernel.all_inames():
             raise ValueError("cannot tag '%s'--not known" % iname)
 
-        if isinstance(new_tag, ParallelTag) \
+        if isinstance(new_tag, ConcurrentTag) \
                 and isinstance(old_tag, ForceSequentialTag):
             raise ValueError("cannot tag '%s' as parallel--"
                     "iname requires sequential execution" % iname)
 
         if isinstance(new_tag, ForceSequentialTag) \
-                and isinstance(old_tag, ParallelTag):
+                and isinstance(old_tag, ConcurrentTag):
             raise ValueError("'%s' is already tagged as parallel, "
                     "but is now prohibited from being parallel "
                     "(likely because of participation in a precompute or "
@@ -972,9 +972,9 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
     # Get the duplication options as a tuple of iname and a set
     for iname, insns in _get_iname_duplication_options(insn_deps):
         # Check whether this iname has a parallel tag and discard it if so
-        from loopy.kernel.data import ParallelTag
+        from loopy.kernel.data import ConcurrentTag
         if (iname in knl.iname_to_tag
-                    and isinstance(knl.iname_to_tag[iname], ParallelTag)):
+                    and isinstance(knl.iname_to_tag[iname], ConcurrentTag)):
             continue
 
         # If we find a duplication option and fo not use boostable_into
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 3d4f5c2d4765aa7cbf1e56c76d127bf8f4d61a06..2ba2338b0af541274cc0362c9f71cec9c2887ffc 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -402,13 +402,13 @@ class TemporarySaver(object):
                     continue
 
                 from loopy.kernel.data import (
-                    GroupIndexTag, LocalIndexTag, ParallelTag)
+                    GroupIndexTag, LocalIndexTag, ConcurrentTag)
 
                 if isinstance(tag, GroupIndexTag):
                     my_group_tags.append(tag)
                 elif isinstance(tag, LocalIndexTag):
                     my_local_tags.append(tag)
-                elif isinstance(tag, ParallelTag):
+                elif isinstance(tag, ConcurrentTag):
                     raise LoopyError(
                         "iname '%s' is tagged with '%s' - only "
                         "group and local tags are supported for "
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 79ceff9fdf1e2c4b3b544e8ae85f8194b36ec444..a681afe06520483c83530c241e39229412e88f03 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -1,6 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-import six
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -24,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import six
 
 from loopy.symbolic import (
         get_dependencies, SubstitutionMapper,
@@ -141,6 +140,7 @@ def extract_subst(kernel, subst_name, template, parameters=()):
     dfmapper = CallbackMapper(gather_exprs, WalkMapper())
 
     for insn in kernel.instructions:
+        dfmapper(insn.assignees)
         dfmapper(insn.expression)
 
     for sr in six.itervalues(kernel.substitutions):
@@ -178,8 +178,7 @@ def extract_subst(kernel, subst_name, template, parameters=()):
     new_insns = []
 
     for insn in kernel.instructions:
-        new_expr = cbmapper(insn.expression)
-        new_insns.append(insn.copy(expression=new_expr))
+        new_insns.append(insn.with_transformed_expressions(cbmapper))
 
     from loopy.kernel.data import SubstitutionRule
     new_substs = {
diff --git a/loopy/version.py b/loopy/version.py
index 3a9781748d00a0e453d4a56e374a25aa72ab4733..5e07e979f2d44684be00290328244496176337b3 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v66-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v68-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index 67d943af3be4446834bf7262a91b8596b601ca85..94843bf69e4e25677ccc0713e5f598e9dcfd55e2 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setup(name="loo.py",
           ],
 
       install_requires=[
-          "pytools>=2017.3",
+          "pytools>=2017.6",
           "pymbolic>=2016.2",
           "genpy>=2016.1.2",
           "cgen>=2016.1",
diff --git a/test/test_fortran.py b/test/test_fortran.py
index 6e05aa6adba66ce0a1896527249d321de104c512..842a0127e3118ec8e7a0ea89ed17decc091e8566 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -278,14 +278,14 @@ def test_matmul(ctx_factory, buffer_inames):
     logging.basicConfig(level=logging.INFO)
 
     fortran_src = """
-        subroutine dgemm(m,n,l,a,b,c)
+        subroutine dgemm(m,n,ell,a,b,c)
           implicit none
-          real*8 a(m,l),b(l,n),c(m,n)
-          integer m,n,k,i,j,l
+          real*8 a(m,ell),b(ell,n),c(m,n)
+          integer m,n,k,i,j,ell
 
           do j = 1,n
             do i = 1,m
-              do k = 1,l
+              do k = 1,ell
                 c(i,j) = c(i,j) + b(k,j)*a(i,k)
               end do
             end do
@@ -306,7 +306,7 @@ def test_matmul(ctx_factory, buffer_inames):
     knl = lp.split_iname(knl, "k", 32)
     knl = lp.assume(knl, "n mod 32 = 0")
     knl = lp.assume(knl, "m mod 32 = 0")
-    knl = lp.assume(knl, "l mod 16 = 0")
+    knl = lp.assume(knl, "ell mod 16 = 0")
 
     knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
     knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
@@ -317,7 +317,7 @@ def test_matmul(ctx_factory, buffer_inames):
             init_expression="0", store_expression="base+buffer")
 
     ctx = ctx_factory()
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
 
 
 @pytest.mark.xfail
@@ -457,14 +457,14 @@ def test_parse_and_fuse_two_kernels():
 
 def test_precompute_some_exist(ctx_factory):
     fortran_src = """
-        subroutine dgemm(m,n,l,a,b,c)
+        subroutine dgemm(m,n,ell,a,b,c)
           implicit none
-          real*8 a(m,l),b(l,n),c(m,n)
-          integer m,n,k,i,j,l
+          real*8 a(m,ell),b(ell,n),c(m,n)
+          integer m,n,k,i,j,ell
 
           do j = 1,n
             do i = 1,m
-              do k = 1,l
+              do k = 1,ell
                 c(i,j) = c(i,j) + b(k,j)*a(i,k)
               end do
             end do
@@ -483,7 +483,7 @@ def test_precompute_some_exist(ctx_factory):
     knl = lp.split_iname(knl, "k", 8)
     knl = lp.assume(knl, "n mod 8 = 0")
     knl = lp.assume(knl, "m mod 8 = 0")
-    knl = lp.assume(knl, "l mod 8 = 0")
+    knl = lp.assume(knl, "ell mod 8 = 0")
 
     knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
     knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
@@ -495,7 +495,7 @@ def test_precompute_some_exist(ctx_factory):
     ref_knl = knl
 
     ctx = ctx_factory()
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, l=128))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
 
 
 if __name__ == "__main__":
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 772d536d1e00fedc0b7abcd2f8c05350fe3b633e..3d422f1d8b5a847d4445468978ee529db95c481f 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -230,14 +230,14 @@ def test_funny_shape_matrix_mul(ctx_factory):
 
     n = get_suitable_size(ctx)
     m = n+12
-    l = m+12
+    ell = m+12
 
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
                 ],
-            name="matmul", assumptions="n,m,l >= 1")
+            name="matmul", assumptions="n,m,ell >= 1")
 
     knl = lp.add_dtypes(knl, {
         "a": np.float32,
@@ -261,7 +261,7 @@ def test_funny_shape_matrix_mul(ctx_factory):
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
             op_count=[2*n**3/1e9], op_label=["GFlops"],
-            parameters={"n": n, "m": m, "l": l})
+            parameters={"n": n, "m": m, "ell": ell})
 
 
 def test_rank_one(ctx_factory):
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 3a53202de080ce67e37f03a9ccd13435759f58b7..4a0e312436dd218ee5090dd4f334edc375afd5f7 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -518,6 +518,32 @@ def test_arg_guessing_with_reduction(ctx_factory):
     print(knl)
     print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
 
+
+def test_unknown_arg_shape(ctx_factory):
+    ctx = ctx_factory()
+    from loopy.target.pyopencl import PyOpenCLTarget
+    from loopy.compiled import CompiledKernel
+    bsize = [256, 0]
+
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i<n and 0<=j<m}",
+        """
+        for i
+            <int32> gid = i/256
+            <int32> start = gid*256
+            for j
+                a[start + j] = a[start + j] + j
+            end
+        end
+        """,
+        seq_dependencies=True,
+        name="uniform_l",
+        target=PyOpenCLTarget(),
+        assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0]))
+
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+    cl_kernel_info = CompiledKernel(ctx, knl).cl_kernel_info(frozenset())  # noqa
+
 # }}}
 
 
@@ -2008,6 +2034,37 @@ def test_if_else(ctx_factory):
     out_ref[4::6] = 11
     out_ref[2::6] = 3
 
+    knl = lp.make_kernel(
+            "{ [i,j]: 0<=i,j<50}",
+            """
+            for i
+                if i < 25
+                    for j
+                        if j % 2 == 0
+                            a[i, j] = 1
+                        else
+                            a[i, j] = 0
+                        end
+                    end
+                else
+                    for j
+                        if j % 2 == 0
+                            a[i, j] = 0
+                        else
+                            a[i, j] = 1
+                        end
+                    end
+                end
+            end
+            """
+            )
+
+    evt, (out,) = knl(queue, out_host=True)
+
+    out_ref = np.zeros((50, 50))
+    out_ref[:25, 0::2] = 1
+    out_ref[25:, 1::2] = 1
+
     assert np.array_equal(out_ref, out)
 
 
@@ -2180,11 +2237,12 @@ def test_nosync_option_parsing():
         """,
         options=lp.Options(allow_terminal_colors=False))
     kernel_str = str(knl)
-    assert "# insn1,no_sync_with=insn1@any" in kernel_str
-    assert "# insn2,no_sync_with=insn1@any:insn2@any" in kernel_str
-    assert "# insn3,no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str
-    assert "# insn4,no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str  # noqa
-    assert "# insn5,no_sync_with=insn1@any" in kernel_str
+    print(kernel_str)
+    assert "id=insn1, no_sync_with=insn1@any" in kernel_str
+    assert "id=insn2, no_sync_with=insn1@any:insn2@any" in kernel_str
+    assert "id=insn3, no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str
+    assert "id=insn4, no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str  # noqa
+    assert "id=insn5, no_sync_with=insn1@any" in kernel_str
 
 
 def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
@@ -2265,6 +2323,43 @@ def test_barrier_insertion_near_bottom_of_loop():
     assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
 
 
+def test_barrier_in_overridden_get_grid_size_expanded_kernel():
+    from loopy.kernel.data import temp_var_scope as scopes
+
+    # make simple barrier'd kernel
+    knl = lp.make_kernel('{[i]: 0 <= i < 10}',
+                   """
+              for i
+                    a[i] = i {id=a}
+                    ... lbarrier {id=barrier}
+                    b[i + 1] = a[i] {nosync=a}
+              end
+                   """,
+                   [lp.TemporaryVariable("a", np.float32, shape=(10,), order='C',
+                                         scope=scopes.LOCAL),
+                    lp.GlobalArg("b", np.float32, shape=(11,), order='C')],
+               seq_dependencies=True)
+
+    # split into kernel w/ vecsize larger than iname domain
+    vecsize = 16
+    knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0')
+
+    # artificially expand via overridden_get_grid_sizes_for_insn_ids
+    class GridOverride(object):
+        def __init__(self, clean, vecsize=vecsize):
+            self.clean = clean
+            self.vecsize = vecsize
+
+        def __call__(self, insn_ids, ignore_auto=True):
+            gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto)
+            return gsize, (self.vecsize,)
+
+    knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride(
+        knl.copy(), vecsize))
+    # make sure we can generate the code
+    lp.generate_code_v2(knl)
+
+
 def test_multi_argument_reduction_type_inference():
     from loopy.type_inference import TypeInferenceMapper
     from loopy.library.reduction import SegmentedSumReductionOperation
@@ -2451,6 +2546,167 @@ def test_execution_backend_can_cache_dtypes(ctx_factory):
     knl(queue)
 
 
+def test_preamble_with_separate_temporaries(ctx_factory):
+    from loopy.kernel.data import temp_var_scope as scopes
+    # create a function mangler
+
+    func_name = 'indirect'
+    func_arg_dtypes = (np.int32, np.int32, np.int32)
+    func_result_dtypes = (np.int32,)
+
+    def __indirectmangler(kernel, name, arg_dtypes):
+        """
+        A function that will return a :class:`loopy.kernel.data.CallMangleInfo`
+        to interface with the calling :class:`loopy.LoopKernel`
+        """
+        if name != func_name:
+            return None
+
+        from loopy.types import to_loopy_type
+        from loopy.kernel.data import CallMangleInfo
+
+        def __compare(d1, d2):
+            # compare dtypes ignoring atomic
+            return to_loopy_type(d1, for_atomic=True) == \
+                to_loopy_type(d2, for_atomic=True)
+
+        # check argument count against the expected signature, then types
+        if len(func_arg_dtypes) != len(arg_dtypes):
+            raise Exception('Unexpected number of arguments provided to mangler '
+                            '{}, expected {}, got {}'.format(
+                                func_name, len(func_arg_dtypes), len(arg_dtypes)))
+
+        for i, (d1, d2) in enumerate(zip(func_arg_dtypes, arg_dtypes)):
+            if not __compare(d1, d2):
+                raise Exception('Argument at index {} for mangler {} does not '
+                                'match expected dtype.  Expected {}, got {}'.
+                                format(i, func_name, str(d1), str(d2)))
+
+        # get target for creation
+        target = arg_dtypes[0].target
+        return CallMangleInfo(
+            target_name=func_name,
+            result_dtypes=tuple(to_loopy_type(x, target=target) for x in
+                                func_result_dtypes),
+            arg_dtypes=arg_dtypes)
+
+    # create the preamble generator
+    def create_preamble(arr):
+        def __indirectpreamble(preamble_info):
+            # find a function matching our name
+            func_match = next(
+                (x for x in preamble_info.seen_functions
+                 if x.name == func_name), None)
+            desc = 'custom_funcs_indirect'
+            if func_match is not None:
+                from loopy.types import to_loopy_type
+                # check types
+                if tuple(to_loopy_type(x) for x in func_arg_dtypes) == \
+                        func_match.arg_dtypes:
+                    # if match, create our temporary
+                    var = lp.TemporaryVariable(
+                        'lookup', initializer=arr, dtype=arr.dtype, shape=arr.shape,
+                        scope=scopes.GLOBAL, read_only=True)
+                    # and code
+                    code = """
+            int {name}(int start, int end, int match)
+            {{
+                int result = start;
+                for (int i = start + 1; i < end; ++i)
+                {{
+                    if (lookup[i] == match)
+                        result = i;
+                }}
+                return result;
+            }}
+            """.format(name=func_name)
+
+            # generate temporary variable code
+            from cgen import Initializer
+            from loopy.target.c import generate_array_literal
+            codegen_state = preamble_info.codegen_state.copy(
+                is_generating_device_code=True)
+            kernel = preamble_info.kernel
+            ast_builder = codegen_state.ast_builder
+            target = kernel.target
+            decl_info, = var.decl_info(target, index_dtype=kernel.index_dtype)
+            decl = ast_builder.wrap_global_constant(
+                    ast_builder.get_temporary_decl(
+                        codegen_state, None, var,
+                        decl_info))
+            if var.initializer is not None:
+                decl = Initializer(decl, generate_array_literal(
+                    codegen_state, var, var.initializer))
+            # return generated code
+            yield (desc, '\n'.join([str(decl), code]))
+        return __indirectpreamble
+
+    # and finally create a test
+    n = 10
+    # for each entry come up with a random number of data points
+    num_data = np.asarray(np.random.randint(2, 10, size=n), dtype=np.int32)
+    # turn into offsets
+    offsets = np.asarray(np.hstack(([0], np.cumsum(num_data))), dtype=np.int32)
+    # create lookup data
+    lookup = np.empty(0)
+    for i in num_data:
+        lookup = np.hstack((lookup, np.arange(i)))
+    lookup = np.asarray(lookup, dtype=np.int32)
+    # and create data array
+    data = np.random.rand(np.prod(num_data))
+
+    # make kernel
+    kernel = lp.make_kernel('{[i]: 0 <= i < n}',
+    """
+    for i
+        <>ind = indirect(offsets[i], offsets[i + 1], 1)
+        out[i] = data[ind]
+    end
+    """,
+    [lp.GlobalArg('out', shape=('n',)),
+     lp.TemporaryVariable(
+        'offsets', shape=(offsets.size,), initializer=offsets, scope=scopes.GLOBAL,
+        read_only=True),
+     lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)],
+    )
+    # fix params, and add manglers / preamble
+    kernel = lp.fix_parameters(kernel, **{'n': n})
+    kernel = lp.register_preamble_generators(kernel, [create_preamble(lookup)])
+    kernel = lp.register_function_manglers(kernel, [__indirectmangler])
+
+    print(lp.generate_code(kernel)[0])
+    # and call (functionality unimportant, more that it compiles)
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    # check that it actually performs the lookup correctly
+    assert np.allclose(kernel(
+        queue, data=data.flatten('C'))[1][0], data[offsets[:-1] + 1])
+
+
+def test_add_prefetch_works_in_lhs_index():
+    knl = lp.make_kernel(
+            "{ [n,k,l,k1,l1,k2,l2]: "
+            "start<=n<end and 0<=k,k1,k2<3 and 0<=l,l1,l2<2 }",
+            """
+            for n
+                <> a1_tmp[k,l] = a1[a1_map[n, k],l]
+                a1_tmp[k1,l1] = a1_tmp[k1,l1] + 1
+                a1_out[a1_map[n,k2], l2] = a1_tmp[k2,l2]
+            end
+            """,
+            [
+                lp.GlobalArg("a1,a1_out", None, "ndofs,2"),
+                lp.GlobalArg("a1_map", None, "nelements,3"),
+                "..."
+            ])
+
+    knl = lp.add_prefetch(knl, "a1_map", "k")
+
+    from loopy.symbolic import get_dependencies
+    for insn in knl.instructions:
+        assert "a1_map" not in get_dependencies(insn.assignees)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_reduction.py b/test/test_reduction.py
index be11d7c8cada94596dceb1a8e0e678f8adb582e9..0c37d2228ee41f3e8af7ef6f6fcd68afa7a66960 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -97,22 +97,22 @@ def test_nested_dependent_reduction(ctx_factory):
                 "{[j]: 0<=j<i+sumlen}"
                 ],
             [
-                "<> sumlen = l[i]",
+                "<> sumlen = ell[i]",
                 "a[i] = sum(j, j)",
                 ],
             [
                 lp.ValueArg("n", np.int32),
                 lp.GlobalArg("a", dtype, ("n",)),
-                lp.GlobalArg("l", np.int32, ("n",)),
+                lp.GlobalArg("ell", np.int32, ("n",)),
                 ])
 
     cknl = lp.CompiledKernel(ctx, knl)
 
     n = 330
-    l = np.arange(n, dtype=np.int32)
-    evt, (a,) = cknl(queue, l=l, n=n, out_host=True)
+    ell = np.arange(n, dtype=np.int32)
+    evt, (a,) = cknl(queue, ell=ell, n=n, out_host=True)
 
-    tgt_result = (2*l-1)*2*l/2
+    tgt_result = (2*ell-1)*2*ell/2
     assert (a == tgt_result).all()
 
 
@@ -413,6 +413,27 @@ def test_parallel_multi_output_reduction(ctx_factory):
         assert max_index == np.argmax(np.abs(a))
 
 
+def test_reduction_with_conditional():
+    # Test whether realization of a reduction inherits predicates
+    # of the original instruction. Tested with the CTarget, because
+    # the PyOpenCL target will hoist the conditional into the host
+    # code in this minimal example.
+    knl = lp.make_kernel(
+                "{ [i] : 0<=i<42 }",
+                """
+                if n > 0
+                    <>b = sum(i, a[i])
+                end
+                """,
+                [lp.GlobalArg("a", dtype=np.float32, shape=(42,)),
+                 lp.GlobalArg("n", dtype=np.float32, shape=())],
+                target=lp.CTarget())
+    code = lp.generate_body(knl)
+
+    # Check that the if appears before the loop that realizes the reduction.
+    assert code.index("if") < code.index("for")
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_statistics.py b/test/test_statistics.py
index cf86539efec7be7e85fecfadc3b19d26fac7bb6d..eeb4a5a288afdd5b9295b0b681abb61b5f021d97 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -37,14 +37,14 @@ from pymbolic.primitives import Variable
 def test_op_counter_basic():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
                 c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                 e[i, k+1] = -g[i,k]*h[i,k+1]
                 """
             ],
-            name="basic", assumptions="n,m,l >= 1")
+            name="basic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
@@ -52,14 +52,14 @@ def test_op_counter_basic():
     op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
     f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
     f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params)
     f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul')].eval_with_dict(params)
     i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
-    assert f32add == f32mul == f32div == n*m*l
+    assert f32add == f32mul == f32div == n*m*ell
     assert f64mul == n*m
     assert i32add == n*m*2
 
@@ -67,21 +67,21 @@ def test_op_counter_basic():
 def test_op_counter_reduction():
 
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
             ],
-            name="matmul_serial", assumptions="n,m,l >= 1")
+            name="matmul_serial", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
     op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
     f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul')].eval_with_dict(params)
-    assert f32add == f32mul == n*m*l
+    assert f32add == f32mul == n*m*ell
 
     op_map_dtype = op_map.group_by('dtype')
     f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
@@ -91,20 +91,23 @@ def test_op_counter_reduction():
 def test_op_counter_logic():
 
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
-                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
+                e[i,k] = if(
+                        not(k<ell-2) and k>6 or k/2==ell,
+                        g[i,k]*2,
+                        g[i,k]+h[i,k]/2)
                 """
             ],
-            name="logic", assumptions="n,m,l >= 1")
+            name="logic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
     op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
     f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(params)
     f64div = op_map[lp.Op(np.dtype(np.float64), 'div')].eval_with_dict(params)
@@ -118,14 +121,14 @@ def test_op_counter_logic():
 def test_op_counter_specialops():
 
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
                 c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                 e[i, k] = (1+g[i,k])**(1+h[i,k+1])+rsqrt(g[i,k])*sin(g[i,k])
                 """
             ],
-            name="specialops", assumptions="n,m,l >= 1")
+            name="specialops", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
@@ -133,8 +136,8 @@ def test_op_counter_specialops():
     op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
     f32div = op_map[lp.Op(np.float32, 'div')].eval_with_dict(params)
     f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
@@ -143,8 +146,8 @@ def test_op_counter_specialops():
     i32add = op_map[lp.Op(np.dtype(np.int32), 'add')].eval_with_dict(params)
     f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt')].eval_with_dict(params)
     f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
-    assert f32div == 2*n*m*l
-    assert f32mul == f32add == n*m*l
+    assert f32div == 2*n*m*ell
+    assert f32mul == f32add == n*m*ell
     assert f64add == 3*n*m
     assert f64pow == i32add == f64rsq == f64sin == n*m
 
@@ -152,14 +155,14 @@ def test_op_counter_specialops():
 def test_op_counter_bitwise():
 
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
                 c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                 e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                 """
             ],
-            name="bitwise", assumptions="n,m,l >= 1")
+            name="bitwise", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(
             knl, dict(
@@ -169,16 +172,16 @@ def test_op_counter_bitwise():
     op_map = lp.get_op_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
     i32bw = op_map[lp.Op(np.int32, 'bw')].eval_with_dict(params)
     i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw')].eval_with_dict(params)
     i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul')].eval_with_dict(params)
     i64add = op_map[lp.Op(np.dtype(np.int64), 'add')].eval_with_dict(params)
     i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift')].eval_with_dict(params)
-    assert i32add == n*m+n*m*l
-    assert i32bw == 2*n*m*l
+    assert i32add == n*m+n*m*ell
+    assert i32bw == 2*n*m*ell
     assert i64bw == 2*n*m
     assert i64add == i64mul == n*m
     assert i64shift == 2*n*m
@@ -218,22 +221,22 @@ def test_op_counter_triangular_domain():
 def test_mem_access_counter_basic():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
                 c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                 e[i, k] = g[i,k]*h[i,k+1]
                 """
             ],
-            name="basic", assumptions="n,m,l >= 1")
+            name="basic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
                    ].eval_with_dict(params)
@@ -246,7 +249,7 @@ def test_mem_access_counter_basic():
     f64l += mem_map[lp.MemAccess('global', np.float64,
                           stride=0, direction='load', variable='h')
                     ].eval_with_dict(params)
-    assert f32l == 3*n*m*l
+    assert f32l == 3*n*m*ell
     assert f64l == 2*n*m
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
@@ -255,37 +258,37 @@ def test_mem_access_counter_basic():
     f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                          stride=0, direction='store', variable='e')
                    ].eval_with_dict(params)
-    assert f32s == n*m*l
+    assert f32s == n*m*ell
     assert f64s == n*m
 
 
 def test_mem_access_counter_reduction():
 
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
             ],
-            name="matmul", assumptions="n,m,l >= 1")
+            name="matmul", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     f32l = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
                    ].eval_with_dict(params)
     f32l += mem_map[lp.MemAccess('global', np.float32,
                           stride=0, direction='load', variable='b')
                     ].eval_with_dict(params)
-    assert f32l == 2*n*m*l
+    assert f32l == 2*n*m*ell
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                          stride=0, direction='store', variable='c')
                    ].eval_with_dict(params)
-    assert f32s == n*l
+    assert f32s == n*ell
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                  ).to_bytes().eval_and_sum(params)
@@ -298,20 +301,22 @@ def test_mem_access_counter_reduction():
 def test_mem_access_counter_logic():
 
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
-                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
+                e[i,k] = if(not(k<ell-2) and k>6 or k/2==ell,
+                    g[i,k]*2,
+                    g[i,k]+h[i,k]/2)
                 """
             ],
-            name="logic", assumptions="n,m,l >= 1")
+            name="logic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
 
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
 
@@ -332,22 +337,22 @@ def test_mem_access_counter_logic():
 def test_mem_access_counter_specialops():
 
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
                 c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                 e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                 """
             ],
-            name="specialops", assumptions="n,m,l >= 1")
+            name="specialops", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     f32 = mem_map[lp.MemAccess('global', np.float32,
                          stride=0, direction='load', variable='a')
                   ].eval_with_dict(params)
@@ -360,7 +365,7 @@ def test_mem_access_counter_specialops():
     f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                           stride=0, direction='load', variable='h')
                    ].eval_with_dict(params)
-    assert f32 == 2*n*m*l
+    assert f32 == 2*n*m*ell
     assert f64 == 2*n*m
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
@@ -369,26 +374,26 @@ def test_mem_access_counter_specialops():
     f64 = mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='store', variable='e')
                   ].eval_with_dict(params)
-    assert f32 == n*m*l
+    assert f32 == n*m*ell
     assert f64 == n*m
 
     filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'])
     #tot = lp.eval_and_sum_polys(filtered_map, params)
     tot = filtered_map.eval_and_sum(params)
-    assert tot == n*m*l + n*m
+    assert tot == n*m*ell + n*m
 
 
 def test_mem_access_counter_bitwise():
 
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
                 c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                 e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                 """
             ],
-            name="bitwise", assumptions="n,m,l >= 1")
+            name="bitwise", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(
             knl, dict(
@@ -398,8 +403,8 @@ def test_mem_access_counter_bitwise():
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='load', variable='a')
                   ].eval_with_dict(params)
@@ -412,7 +417,7 @@ def test_mem_access_counter_bitwise():
     i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                           stride=0, direction='load', variable='h')
                    ].eval_with_dict(params)
-    assert i32 == 4*n*m+2*n*m*l
+    assert i32 == 4*n*m+2*n*m*ell
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                          stride=0, direction='store', variable='c')
@@ -420,20 +425,20 @@ def test_mem_access_counter_bitwise():
     i32 += mem_map[lp.MemAccess('global', np.int32,
                           stride=0, direction='store', variable='e')
                    ].eval_with_dict(params)
-    assert i32 == n*m+n*m*l
+    assert i32 == n*m+n*m*ell
 
 
 def test_mem_access_counter_mixed():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
             c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
             e[i, k] = g[i,k]*(2+h[i,k])
             """
             ],
-            name="mixed", assumptions="n,m,l >= 1")
+            name="mixed", assumptions="n,m,ell >= 1")
     knl = lp.add_and_infer_dtypes(knl, dict(
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64,
                 x=np.float32))
@@ -444,8 +449,8 @@ def test_mem_access_counter_mixed():
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='load', variable='g')
                          ].eval_with_dict(params)
@@ -463,9 +468,9 @@ def test_mem_access_counter_mixed():
                                    stride=Variable('m'), direction='load',
                                    variable='b')
                             ].eval_with_dict(params)
-    assert f64uniform == 2*n*m*l/bsize
-    assert f32uniform == n*m*l/bsize
-    assert f32nonconsec == 3*n*m*l
+    assert f64uniform == 2*n*m*ell/bsize
+    assert f32uniform == n*m*ell/bsize
+    assert f32nonconsec == 3*n*m*ell
 
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 stride=0, direction='store', variable='e')
@@ -474,21 +479,21 @@ def test_mem_access_counter_mixed():
                                   stride=Variable('m'), direction='store',
                                   variable='c')
                            ].eval_with_dict(params)
-    assert f64uniform == n*m*l/bsize
-    assert f32nonconsec == n*m*l
+    assert f64uniform == n*m*ell/bsize
+    assert f32nonconsec == n*m*ell
 
 
 def test_mem_access_counter_nonconsec():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
             c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
             e[i, k] = g[i,k]*(2+h[i,k])
             """
             ],
-            name="nonconsec", assumptions="n,m,l >= 1")
+            name="nonconsec", assumptions="n,m,ell >= 1")
     knl = lp.add_and_infer_dtypes(knl, dict(
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.split_iname(knl, "i", 16)
@@ -497,8 +502,8 @@ def test_mem_access_counter_nonconsec():
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                                   stride=Variable('m'), direction='load',
                                   variable='g')
@@ -508,39 +513,39 @@ def test_mem_access_counter_nonconsec():
                                    variable='h')
                             ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                  stride=Variable('m')*Variable('l'),
+                                  stride=Variable('m')*Variable('ell'),
                                   direction='load', variable='a')
                            ].eval_with_dict(params)
     f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
-                                   stride=Variable('m')*Variable('l'),
+                                   stride=Variable('m')*Variable('ell'),
                                    direction='load', variable='b')
                             ].eval_with_dict(params)
     assert f64nonconsec == 2*n*m
-    assert f32nonconsec == 3*n*m*l
+    assert f32nonconsec == 3*n*m*ell
 
     f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                                   stride=Variable('m'), direction='store',
                                   variable='e')
                            ].eval_with_dict(params)
     f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
-                                  stride=Variable('m')*Variable('l'),
+                                  stride=Variable('m')*Variable('ell'),
                                   direction='store', variable='c')
                            ].eval_with_dict(params)
     assert f64nonconsec == n*m
-    assert f32nonconsec == n*m*l
+    assert f32nonconsec == n*m*ell
 
 
 def test_mem_access_counter_consec():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
             c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
             e[i, k] = g[i,k]*(2+h[i,k])
             """
             ],
-            name="consec", assumptions="n,m,l >= 1")
+            name="consec", assumptions="n,m,ell >= 1")
     knl = lp.add_and_infer_dtypes(knl, dict(
                 a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
@@ -548,8 +553,8 @@ def test_mem_access_counter_consec():
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='load', variable='g')
@@ -563,8 +568,8 @@ def test_mem_access_counter_consec():
     f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         stride=1, direction='load', variable='b')
                          ].eval_with_dict(params)
-    assert f64consec == 2*n*m*l
-    assert f32consec == 3*n*m*l
+    assert f64consec == 2*n*m*ell
+    assert f32consec == 3*n*m*ell
 
     f64consec = mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='store', variable='e')
@@ -572,29 +577,29 @@ def test_mem_access_counter_consec():
     f32consec = mem_map[lp.MemAccess('global', np.float32,
                         stride=1, direction='store', variable='c')
                         ].eval_with_dict(params)
-    assert f64consec == n*m*l
-    assert f32consec == n*m*l
+    assert f64consec == n*m*ell
+    assert f32consec == n*m*ell
 
 
 def test_barrier_counter_nobarriers():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
                 c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                 e[i, k] = g[i,k]*h[i,k+1]
                 """
             ],
-            name="basic", assumptions="n,m,l >= 1")
+            name="basic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
     sync_map = lp.get_synchronization_map(knl)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     assert len(sync_map) == 1
     assert sync_map["kernel_launch"].eval_with_dict(params) == 1
 
@@ -602,7 +607,7 @@ def test_barrier_counter_nobarriers():
 def test_barrier_counter_barriers():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
+            "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
             [
                 """
             c[i,j,k] = 2*a[i,j,k] {id=first}
@@ -620,8 +625,8 @@ def test_barrier_counter_barriers():
     print(sync_map)
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
     barrier_count = sync_map["barrier_local"].eval_with_dict(params)
     assert barrier_count == 50*10*2
 
@@ -630,11 +635,11 @@ def test_all_counters_parallel_matmul():
 
     bsize = 16
     knl = lp.make_kernel(
-            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
             ],
-            name="matmul", assumptions="n,m,l >= 1")
+            name="matmul", assumptions="n,m,ell >= 1")
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
     knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
@@ -644,8 +649,8 @@ def test_all_counters_parallel_matmul():
 
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
 
     sync_map = lp.get_synchronization_map(knl)
     assert len(sync_map) == 2
@@ -666,7 +671,7 @@ def test_all_counters_parallel_matmul():
                         lp.Op(np.dtype(np.int32), 'mul')
                         ].eval_with_dict(params)
 
-    assert f32mul+f32add == n*m*l*2
+    assert f32mul+f32add == n*m*ell*2
 
     op_map = lp.get_mem_access_map(knl, count_redundant_work=True)
 
@@ -677,21 +682,21 @@ def test_all_counters_parallel_matmul():
                      stride=1, direction='load', variable='a')
                      ].eval_with_dict(params)
 
-    assert f32s1lb == n*m*l/bsize
-    assert f32s1la == n*m*l/bsize
+    assert f32s1lb == n*m*ell/bsize
+    assert f32s1la == n*m*ell/bsize
 
     f32coal = op_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='store', variable='c')
                      ].eval_with_dict(params)
 
-    assert f32coal == n*l
+    assert f32coal == n*ell
 
     local_mem_map = lp.get_mem_access_map(knl,
                         count_redundant_work=True).filter_by(mtype=['local'])
     local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                              direction='load')
                                 ].eval_with_dict(params)
-    assert local_mem_l == n*m*l*2
+    assert local_mem_l == n*m*ell*2
 
 
 def test_gather_access_footprint():
@@ -729,38 +734,38 @@ def test_gather_access_footprint_2():
 def test_summations_and_filters():
 
     knl = lp.make_kernel(
-            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
+            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
             [
                 """
                 c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                 e[i, k+1] = -g[i,k]*h[i,k+1]
                 """
             ],
-            name="basic", assumptions="n,m,l >= 1")
+            name="basic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
                         dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
     n = 512
     m = 256
-    l = 128
-    params = {'n': n, 'm': m, 'l': l}
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)
 
     loads_a = mem_map.filter_by(direction=['load'], variable=['a']
                                 ).eval_and_sum(params)
-    assert loads_a == 2*n*m*l
+    assert loads_a == 2*n*m*ell
 
     global_stores = mem_map.filter_by(mtype=['global'], direction=['store']
                                       ).eval_and_sum(params)
-    assert global_stores == n*m*l + n*m
+    assert global_stores == n*m*ell + n*m
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                  ).to_bytes().eval_and_sum(params)
     st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                                  ).to_bytes().eval_and_sum(params)
-    assert ld_bytes == 4*n*m*l*3 + 8*n*m*2
-    assert st_bytes == 4*n*m*l + 8*n*m
+    assert ld_bytes == 4*n*m*ell*3 + 8*n*m*2
+    assert st_bytes == 4*n*m*ell + 8*n*m
 
     # ignore stride and variable names in this map
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
@@ -768,7 +773,7 @@ def test_summations_and_filters():
                           ].eval_with_dict(params)
     f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
                           ].eval_with_dict(params)
-    assert f32lall == 3*n*m*l
+    assert f32lall == 3*n*m*ell
     assert f64lall == 2*n*m
 
     op_map = lp.get_op_map(knl, count_redundant_work=True)
@@ -779,14 +784,14 @@ def test_summations_and_filters():
     f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
     f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
     i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
-    assert f32 == n*m*l*3
+    assert f32 == n*m*ell*3
     assert f64 == n*m
     assert i32 == n*m*2
 
     addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
     f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
-    assert addsub_all == n*m*l + n*m*2
-    assert f32ops_all == n*m*l*3
+    assert addsub_all == n*m*ell + n*m*2
+    assert f32ops_all == n*m*ell*3
 
     non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
     assert non_field == 0
@@ -795,7 +800,7 @@ def test_summations_and_filters():
     ops_noname = op_map.group_by('dtype')
     mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
     f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
-    assert mul_all == n*m*l + n*m
+    assert mul_all == n*m*ell + n*m
     assert f64ops_all == n*m
 
     def func_filter(key):