diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index eeb510f2f23a786135c719af081185e3045bade2..5ea075d194a9da75a1c18d180c65239be83eb85e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -11,6 +11,7 @@ Python 2.7 AMD CPU:
   - amd-cl-cpu
   except:
   - tags
+
 Python 2.6 POCL:
   script:
   - export PY_EXE=python2.6
@@ -24,6 +25,7 @@ Python 2.6 POCL:
   - pocl
   except:
   - tags
+
 Python 3.5 AMD CPU:
   script:
   - export PY_EXE=python3.5
@@ -38,6 +40,7 @@ Python 3.5 AMD CPU:
   - amd-cl-cpu
   except:
   - tags
+
 Python 2.7 POCL:
   script:
   - export PY_EXE=python2.7
@@ -51,6 +54,7 @@ Python 2.7 POCL:
   - pocl
   except:
   - tags
+
 Python 2.7 with legacy PyOpenCL:
   script:
   - export PY_EXE=python2.7
@@ -65,6 +69,22 @@ Python 2.7 with legacy PyOpenCL:
   - pocl
   except:
   - tags
+
+Python 3.6 POCL:
+  script:
+  - export PY_EXE=python3.6
+  - export PYOPENCL_TEST=portable
+  - export EXTRA_INSTALL="numpy mako"
+  - export LOOPY_NO_CACHE=1
+  - export NO_DOCTESTS=1
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python3.6
+  - pocl
+  except:
+  - tags
+
 # PyPy AMD CPU:
 #   script:
 #   - export PY_EXE=pypy
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 2d754dec23762b289d3bf30ed6a7740326b11817..8674434084077ba1f791c46123a083346715209e 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -249,16 +249,32 @@ These are usually key-value pairs. The following attributes are recognized:
   dependencies.
 
 * ``nosync=id1:id2`` prescribes that no barrier synchronization is necessary
-  the instructions with identifiers ``id1`` and ``id2`` to the, even if
-  a dependency chain exists and variables are accessed in an apparently
-  racy way.
+  for the instructions with identifiers ``id1`` and ``id2``, even if a
+  dependency chain exists and variables are accessed in an apparently racy
+  way.
 
   Identifiers here are allowed to be wildcards as defined by the Python
   function :func:`fnmatch.fnmatchcase`. This is helpful in conjunction with
   ``id_prefix``.
 
+  Identifiers (including wildcards) accept an optional `@scope` suffix,
+  which prescribes that no synchronization at level `scope` is needed.
+  This does not preclude barriers at levels different from `scope`.
+  Allowable `scope` values are:
+
+  * `local`
+  * `global`
+  * `any`
+
+  As an example, ``nosync=id1@local:id2@global`` prescribes that no local
+  synchronization is needed with instruction ``id1`` and no global
+  synchronization is needed with instruction ``id2``.
+
+  ``nosync=id1@any`` has the same effect as ``nosync=id1``.
+
 * ``nosync_query=...`` provides an alternative way of specifying ``nosync``,
-  just like ``dep_query`` and ``dep``.
+  just like ``dep_query`` and ``dep``. As with ``nosync``, ``nosync_query``
+  accepts an optional `@scope` suffix.
 
 * ``priority=integer`` sets the instructions priority to the value
   ``integer``. Instructions with higher priority will be scheduled sooner,
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index d44e8f250ac7cbc88ad3338e4031064002133a65..942c7d56e01f9d037b0e2b601f88bc8b96dda151 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -54,7 +54,9 @@ And some data on the host:
 .. }}}
 
 We'll also disable console syntax highlighting because it confuses
-doctest::
+doctest:
+
+.. doctest::
 
     >>> # not a documented interface
     >>> import loopy.options
@@ -120,7 +122,7 @@ always see loopy's view of a kernel by printing it.
     i: None
     ---------------------------------------------------------------------------
     INSTRUCTIONS:
-    [i]                                  out[i] <- 2*a[i]   # insn
+     [i]                                  out[i] <- 2*a[i]   # insn
     ---------------------------------------------------------------------------
 
 You'll likely have noticed that there's quite a bit more information here
@@ -454,8 +456,8 @@ control is the nesting of loops. For example, should the *i* loop be nested
 around the *j* loop, or the other way around, in the following simple
 zero-fill kernel?
 
-It turns out that Loopy will typically choose a loop nesting for us, but it
-does not like doing so. Loo.py will react to the following code
+It turns out that Loopy will choose a loop nesting for us, but it might be
+ambiguous. Consider the following code:
 
 .. doctest::
 
@@ -465,13 +467,8 @@ does not like doing so. Loo.py will react to the following code
     ...     a[i,j] = 0
     ...     """)
 
-By saying::
-
-    LoopyWarning: kernel scheduling was ambiguous--more than one schedule found, ignoring
-
-And by picking one of the possible loop orderings at random.
-
-The warning (and the nondeterminism it warns about) is easily resolved:
+Both nestings of the inames `i` and `j` result in a correct kernel.
+This ambiguity can be resolved:
 
 .. doctest::
 
diff --git a/loopy/check.py b/loopy/check.py
index 7562eacd76079b83a20e75e965787676c3d4eb9a..6a1e3dc33a33b826ad54c42a549b35ad275d9fe5 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -693,13 +693,13 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
 
             parameter_inames = set(
                     insn_domain.get_dim_name(dim_type.param, i)
-                    for i in range(insn_domain.dim(dim_type.param)))
+                    for i in range(insn_impl_domain.dim(dim_type.param)))
 
             lines = []
-            for kind, diff_set, gist_domain in [
-                    ("implemented, but not desired", i_minus_d,
+            for bigger, smaller, diff_set, gist_domain in [
+                    ("implemented", "desired", i_minus_d,
                         desired_domain.gist(insn_impl_domain)),
-                    ("desired, but not implemented", d_minus_i,
+                    ("desired", "implemented", d_minus_i,
                         insn_impl_domain.gist(desired_domain))]:
 
                 if diff_set.is_empty():
@@ -721,9 +721,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                         iname, pt.get_coordinate_val(tp, dim).to_python()))
 
                 lines.append(
-                        "sample point in %s: %s" % (kind, ", ".join(point_axes)))
+                        "sample point in %s but not %s: %s" % (
+                            bigger, smaller, ", ".join(point_axes)))
                 lines.append(
-                        "gist of %s: %s" % (kind, gist_domain))
+                        "gist of constraints in %s but not %s: %s" % (
+                            smaller, bigger, gist_domain))
 
             if code is not None:
                 print(79*"-")
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 6f312ec798e13fa4b1d183c27578089857b13e3d..009dadc1a0d6236f092029dbc03ad0c035c7b8f8 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -506,6 +506,12 @@ def generate_code_v2(kernel):
 
     # }}}
 
+    # For faster unpickling in the common case when implemented_domains isn't needed.
+    from loopy.tools import LazilyUnpicklingDictionary
+    codegen_result = codegen_result.copy(
+            implemented_domains=LazilyUnpicklingDictionary(
+                    codegen_result.implemented_domains))
+
     logger.info("%s: generate code: done" % kernel.name)
 
     if CACHING_ENABLED:
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index c490abb6ed1635c135fc77468f27cd833b1d57b2..3ef7c8f6ad6c8af09dd01bf9e1341179d2be0be7 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -126,10 +126,13 @@ def generate_assignment_instruction_code(codegen_state, insn):
 
     # }}}
 
-    from pymbolic.primitives import Variable, Subscript
+    from pymbolic.primitives import Variable, Subscript, Lookup
     from loopy.symbolic import LinearSubscript
 
     lhs = insn.assignee
+    if isinstance(lhs, Lookup):
+        lhs = lhs.aggregate
+
     if isinstance(lhs, Variable):
         assignee_var_name = lhs.name
         assignee_indices = ()
@@ -145,6 +148,8 @@ def generate_assignment_instruction_code(codegen_state, insn):
     else:
         raise RuntimeError("invalid lvalue '%s'" % lhs)
 
+    del lhs
+
     result = codegen_state.ast_builder.emit_assignment(codegen_state, insn)
 
     # {{{ tracing
@@ -221,7 +226,7 @@ def generate_call_code(codegen_state, insn):
 
     if codegen_state.vectorization_info:
         if insn.atomicity:
-            raise Unvectorizable("function call")
+            raise Unvectorizable("atomic operation")
 
     # }}}
 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index ad80475c1d27f67b3df8a885f60dd96ff28efe6a..0110a06095fa0bd690045f050136027d7bed3a28 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -220,7 +220,7 @@ def intersect_kernel_with_slab(kernel, slab, iname):
 
     domch = DomainChanger(kernel, (iname,))
     orig_domain = domch.get_original_domain()
-    orig_domain, slab = isl.align_two(orig_domain, slab)
+    orig_domain, slab = isl.align_two(slab, orig_domain)
     return domch.get_kernel_with(orig_domain & slab)
 
 
@@ -376,10 +376,10 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
 
         # move inames that are usable into parameters
         moved_inames = []
-        for iname in dom_and_slab.get_var_names(dim_type.set):
-            if iname in usable_inames:
-                moved_inames.append(iname)
-                dt, idx = dom_and_slab.get_var_dict()[iname]
+        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
+            if das_iname in usable_inames:
+                moved_inames.append(das_iname)
+                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                 dom_and_slab = dom_and_slab.move_dims(
                         dim_type.param, dom_and_slab.dim(dim_type.param),
                         dt, idx, 1)
@@ -422,8 +422,9 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
                 impl_lbound,
                 impl_ubound)
 
-        for iname in moved_inames:
-            dt, idx = impl_loop.get_var_dict()[iname]
+        for moved_iname in moved_inames:
+            # move moved_iname to 'set' dim_type in impl_loop
+            dt, idx = impl_loop.get_var_dict()[moved_iname]
             impl_loop = impl_loop.move_dims(
                     dim_type.set, impl_loop.dim(dim_type.set),
                     dt, idx, 1)
@@ -432,7 +433,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
                 codegen_state
                 .intersect(impl_loop)
                 .copy(kernel=intersect_kernel_with_slab(
-                    kernel, slab, iname)))
+                    kernel, slab, loop_iname)))
 
         inner = build_loop_nest(new_codegen_state, sched_index+1)
 
@@ -464,12 +465,17 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
 
         else:
             inner_ast = inner.current_ast(codegen_state)
+
+            from loopy.isl_helpers import simplify_pw_aff
+
             result.append(
                 inner.with_new_ast(
                     codegen_state,
                     astb.emit_sequential_loop(
                         codegen_state, loop_iname, kernel.index_dtype,
-                        pw_aff_to_expr(lbound), pw_aff_to_expr(ubound), inner_ast)))
+                        pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)),
+                        pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)),
+                        inner_ast)))
 
     return merge_codegen_results(codegen_state, result)
 
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 602830de38e457c5ff4a55d7685dc346a7b4de35..0ebe90fbca0d31c05eaee64321e2b73709292331 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -142,6 +142,55 @@ def iname_rel_aff(space, iname, rel, aff):
         raise ValueError("unknown value of 'rel': %s" % rel)
 
 
+# {{{ simplify_pw_aff
+
+def simplify_pw_aff(pw_aff, context=None):
+    if context is not None:
+        pw_aff = pw_aff.gist_params(context)
+
+    old_pw_aff = pw_aff
+
+    while True:
+        restart = False
+        did_something = False
+
+        pieces = pw_aff.get_pieces()
+        for i, (dom_i, aff_i) in enumerate(pieces):
+            for j, (dom_j, aff_j) in enumerate(pieces):
+                if i == j:
+                    continue
+
+                if aff_i.gist(dom_j).is_equal(aff_j):
+                    # aff_i is sufficient to cover aff_j, eliminate aff_j
+                    new_pieces = pieces[:]
+                    if i < j:
+                        new_pieces.pop(j)
+                        new_pieces.pop(i)
+                    else:
+                        new_pieces.pop(i)
+                        new_pieces.pop(j)
+
+                    pw_aff = isl.PwAff.alloc(dom_i | dom_j, aff_i)
+                    for dom, aff in new_pieces:
+                        pw_aff = pw_aff.union_max(isl.PwAff.alloc(dom, aff))
+
+                    restart = True
+                    did_something = True
+                    break
+
+            if restart:
+                break
+
+        if not did_something:
+            break
+
+    assert pw_aff.get_aggregate_domain() <= pw_aff.eq_set(old_pw_aff)
+
+    return pw_aff
+
+# }}}
+
+
 # {{{ static_*_of_pw_aff
 
 def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context):
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 5dff5e53c04521bcd2f53cb2fc971ec12227149c..793d31791a3295ef1d7c03132f43489ab828f089 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -199,7 +199,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             # When kernels get intersected in slab decomposition,
             # their grid sizes shouldn't change. This provides
             # a way to forward sub-kernel grid size requests.
-            get_grid_sizes_for_insn_ids=None):
+            overridden_get_grid_sizes_for_insn_ids=None):
 
         if cache_manager is None:
             from loopy.kernel.tools import SetOperationCacheManager
@@ -265,10 +265,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if np.iinfo(index_dtype.numpy_dtype).min >= 0:
             raise TypeError("index_dtype must be signed")
 
-        if get_grid_sizes_for_insn_ids is not None:
-            # overwrites method down below
-            self.get_grid_sizes_for_insn_ids = get_grid_sizes_for_insn_ids
-
         if state not in [
                 kernel_state.INITIAL,
                 kernel_state.PREPROCESSED,
@@ -302,7 +298,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 index_dtype=index_dtype,
                 options=options,
                 state=state,
-                target=target)
+                target=target,
+                overridden_get_grid_sizes_for_insn_ids=(
+                    overridden_get_grid_sizes_for_insn_ids))
 
         self._kernel_executor_cache = {}
 
@@ -467,7 +465,11 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
             discard_level_count = 0
             while discard_level_count < len(iname_set_stack):
-                last_inames = iname_set_stack[-1-discard_level_count]
+                last_inames = (
+                        iname_set_stack[-1-discard_level_count])
+                if discard_level_count + 1 < len(iname_set_stack):
+                    last_inames = (
+                            last_inames - iname_set_stack[-2-discard_level_count])
 
                 if is_domain_dependent_on_inames(self, dom_idx, last_inames):
                     break
@@ -919,6 +921,11 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         *global_size* and *local_size* are :class:`islpy.PwAff` objects.
         """
 
+        if self.overridden_get_grid_sizes_for_insn_ids:
+            return self.overridden_get_grid_sizes_for_insn_ids(
+                    insn_ids,
+                    ignore_auto=ignore_auto)
+
         all_inames_by_insns = set()
         for insn_id in insn_ids:
             all_inames_by_insns |= self.insn_inames(insn_id)
@@ -1071,6 +1078,21 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     # {{{ pretty-printing
 
+    @memoize_method
+    def _get_iname_order_for_printing(self):
+        try:
+            from loopy.kernel.tools import get_visual_iname_order_embedding
+            embedding = get_visual_iname_order_embedding(self)
+        except ValueError:
+            from loopy.diagnostic import warn_with_kernel
+            warn_with_kernel(self,
+                "iname-order",
+                "get_visual_iname_order_embedding() could not determine a "
+                "consistent iname nesting order")
+            embedding = dict((iname, iname) for iname in self.all_inames())
+
+        return embedding
+
     def stringify(self, what=None, with_dependencies=False):
         all_what = set([
             "name",
@@ -1112,6 +1134,20 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         sep = 75*"-"
 
+        def natorder(key):
+            # Return natural ordering for strings, as opposed to dictionary order.
+            # E.g. will result in
+            #  'abc1' < 'abc9' < 'abc10'
+            # rather than
+            #  'abc1' < 'abc10' < 'abc9'
+            # Based on
+            # http://code.activestate.com/recipes/285264-natural-string-sorting/#c7
+            import re
+            return [int(n) if n else s for n, s in re.findall(r'(\d+)|(\D+)', key)]
+
+        def natsorted(seq, key=lambda x: x):
+            return sorted(seq, key=lambda y: natorder(key(y)))
+
         if "name" in what:
             lines.append(sep)
             lines.append("KERNEL: " + kernel.name)
@@ -1119,7 +1155,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if "arguments" in what:
             lines.append(sep)
             lines.append("ARGUMENTS:")
-            for arg_name in sorted(kernel.arg_dict):
+            for arg_name in natsorted(kernel.arg_dict):
                 lines.append(str(kernel.arg_dict[arg_name]))
 
         if "domains" in what:
@@ -1131,21 +1167,21 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if "tags" in what:
             lines.append(sep)
             lines.append("INAME IMPLEMENTATION TAGS:")
-            for iname in sorted(kernel.all_inames()):
+            for iname in natsorted(kernel.all_inames()):
                 line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname))
                 lines.append(line)
 
         if "variables" in what and kernel.temporary_variables:
             lines.append(sep)
             lines.append("TEMPORARIES:")
-            for tv in sorted(six.itervalues(kernel.temporary_variables),
+            for tv in natsorted(six.itervalues(kernel.temporary_variables),
                     key=lambda tv: tv.name):
                 lines.append(str(tv))
 
         if "rules" in what and kernel.substitutions:
             lines.append(sep)
             lines.append("SUBSTIUTION RULES:")
-            for rule_name in sorted(six.iterkeys(kernel.substitutions)):
+            for rule_name in natsorted(six.iterkeys(kernel.substitutions)):
                 lines.append(str(kernel.substitutions[rule_name]))
 
         if "instructions" in what:
@@ -1163,7 +1199,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                     return
                 printed_insn_ids.add(insn.id)
 
-                for dep_id in sorted(insn.depends_on):
+                for dep_id in natsorted(insn.depends_on):
                     insert_insn_into_order(kernel.id_to_insn[dep_id])
 
                 printed_insn_order.append(insn)
@@ -1210,7 +1246,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                     raise LoopyError("unexpected instruction type: %s"
                             % type(insn).__name__)
 
-                loop_list = ",".join(sorted(kernel.insn_inames(insn)))
+                order = self._get_iname_order_for_printing()
+                loop_list = ",".join(
+                    sorted(kernel.insn_inames(insn), key=lambda iname: order[iname]))
 
                 options = [Fore.GREEN+insn.id+Style.RESET_ALL]
                 if insn.priority:
@@ -1226,9 +1264,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                     options.append(
                             "conflicts=%s" % ":".join(insn.conflicts_with_groups))
                 if insn.no_sync_with:
-                    # FIXME: Find a syntax to express scopes.
-                    options.append("no_sync_with=%s" % ":".join(id for id, _ in
-                                                                insn.no_sync_with))
+                    options.append("no_sync_with=%s" % ":".join(
+                        "%s@%s" % entry for entry in sorted(insn.no_sync_with)))
 
                 if lhs:
                     core = "%s <- %s" % (
@@ -1239,14 +1276,14 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                     core = Fore.MAGENTA+rhs+Style.RESET_ALL
 
                 if len(loop_list) > loop_list_width:
-                    lines.append("%s[%s]" % (arrows, loop_list))
-                    lines.append("%s%s%s   # %s" % (
+                    lines.append("%s [%s]" % (arrows, loop_list))
+                    lines.append("%s %s%s   # %s" % (
                         extender,
                         (loop_list_width+2)*" ",
                         core,
                         ", ".join(options)))
                 else:
-                    lines.append("%s[%s]%s%s   # %s" % (
+                    lines.append("%s [%s]%s%s   # %s" % (
                         arrows,
                         loop_list, " "*(loop_list_width-len(loop_list)),
                         core,
@@ -1280,6 +1317,13 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         return "\n".join(lines)
 
     def __str__(self):
+        if six.PY3:
+            return self.stringify()
+        else:
+            # Path of least resistance...
+            return self.stringify().encode("utf-8")
+
+    def __unicode__(self):
         return self.stringify()
 
     # }}}
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 019b899594ee1670d1ee59654f58b93ff8afacb9..14b18150f5b84218f39ba23662eb6106ffb596a0 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -26,6 +26,8 @@ THE SOFTWARE.
 
 
 import numpy as np
+
+from pymbolic.mapper import CSECachingMapperMixin
 from loopy.tools import intern_frozenset_of_ids
 from loopy.symbolic import IdentityMapper, WalkMapper
 from loopy.kernel.data import (
@@ -165,6 +167,11 @@ def get_default_insn_options_dict():
         }
 
 
+from collections import namedtuple
+
+_NosyncParseResult = namedtuple("_NosyncParseResult", "expr, scope")
+
+
 def parse_insn_options(opt_dict, options_str, assignee_names=None):
     if options_str is None:
         return opt_dict
@@ -173,6 +180,20 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
 
     result = opt_dict.copy()
 
+    def parse_nosync_option(opt_value):
+        if "@" in opt_value:
+            expr, scope = opt_value.split("@")
+        else:
+            expr = opt_value
+            scope = "any"
+        allowable_scopes = ("local", "global", "any")
+        if scope not in allowable_scopes:
+            raise ValueError(
+                "unknown scope for nosync option: '%s' "
+                "(allowable scopes are %s)" %
+                (scope, ', '.join("'%s'" % s for s in allowable_scopes)))
+        return _NosyncParseResult(expr, scope)
+
     for option in options_str.split(","):
         option = option.strip()
         if not option:
@@ -240,23 +261,24 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
                 raise LoopyError("'nosync' option may not be specified "
                         "in a 'with' block")
 
-            # TODO: Come up with a syntax that allows the user to express
-            # different synchronization scopes.
             result["no_sync_with"] = result["no_sync_with"].union(frozenset(
-                    (intern(dep.strip()), "any")
-                    for dep in opt_value.split(":") if dep.strip()))
+                    (option.expr.strip(), option.scope)
+                    for option in (
+                            parse_nosync_option(entry)
+                            for entry in opt_value.split(":"))
+                    if option.expr.strip()))
 
         elif opt_key == "nosync_query" and opt_value is not None:
             if is_with_block:
                 raise LoopyError("'nosync' option may not be specified "
                         "in a 'with' block")
 
+            match_expr, scope = parse_nosync_option(opt_value)
+
             from loopy.match import parse_match
-            match = parse_match(opt_value)
-            # TODO: Come up with a syntax that allows the user to express
-            # different synchronization scopes.
+            match = parse_match(match_expr)
             result["no_sync_with"] = result["no_sync_with"].union(
-                    frozenset([(match, "any")]))
+                    frozenset([(match, scope)]))
 
         elif opt_key == "groups" and opt_value is not None:
             result["groups"] = frozenset(
@@ -372,7 +394,7 @@ ELSE_RE = re.compile("^\s*else\s*$")
 INSN_RE = re.compile(
         "^"
         "\s*"
-        "(?P<lhs>.+?)"
+        "(?P<lhs>[^{]+?)"
         "\s*(?<!\:)=\s*"
         "(?P<rhs>.+?)"
         "\s*?"
@@ -426,7 +448,7 @@ def parse_insn(groups, insn_options):
                 "the following error occurred:" % groups["rhs"])
         raise
 
-    from pymbolic.primitives import Variable, Subscript
+    from pymbolic.primitives import Variable, Subscript, Lookup
     from loopy.symbolic import TypeAnnotation
 
     if not isinstance(lhs, tuple):
@@ -447,11 +469,15 @@ def parse_insn(groups, insn_options):
         else:
             temp_var_types.append(None)
 
+        inner_lhs_i = lhs_i
+        if isinstance(inner_lhs_i, Lookup):
+            inner_lhs_i = inner_lhs_i.aggregate
+
         from loopy.symbolic import LinearSubscript
-        if isinstance(lhs_i, Variable):
-            assignee_names.append(lhs_i.name)
-        elif isinstance(lhs_i, (Subscript, LinearSubscript)):
-            assignee_names.append(lhs_i.aggregate.name)
+        if isinstance(inner_lhs_i, Variable):
+            assignee_names.append(inner_lhs_i.name)
+        elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)):
+            assignee_names.append(inner_lhs_i.aggregate.name)
         else:
             raise LoopyError("left hand side of assignment '%s' must "
                     "be variable or subscript" % (lhs_i,))
@@ -995,7 +1021,7 @@ def parse_domains(domains, defines):
 
 # {{{ guess kernel args (if requested)
 
-class IndexRankFinder(WalkMapper):
+class IndexRankFinder(CSECachingMapperMixin, WalkMapper):
     def __init__(self, arg_name):
         self.arg_name = arg_name
         self.index_ranks = []
@@ -1012,6 +1038,13 @@ class IndexRankFinder(WalkMapper):
             else:
                 self.index_ranks.append(len(expr.index))
 
+    def map_common_subexpression_uncached(self, expr):
+        if not self.visit(expr):
+            return
+
+        self.rec(expr.child)
+        self.post_visit(expr)
+
 
 class ArgumentGuesser:
     def __init__(self, domains, instructions, temporary_variables,
@@ -1392,11 +1425,11 @@ def create_temporaries(knl, default_order):
 
 # {{{ determine shapes of temporaries
 
-def find_var_shape(knl, var_name, feed_expression):
-    from loopy.symbolic import AccessRangeMapper, SubstitutionRuleExpander
+def find_shapes_of_vars(knl, var_names, feed_expression):
+    from loopy.symbolic import BatchedAccessRangeMapper, SubstitutionRuleExpander
     submap = SubstitutionRuleExpander(knl.substitutions)
 
-    armap = AccessRangeMapper(knl, var_name)
+    armap = BatchedAccessRangeMapper(knl, var_names)
 
     def run_through_armap(expr, inames):
         armap(submap(expr), inames)
@@ -1404,61 +1437,105 @@ def find_var_shape(knl, var_name, feed_expression):
 
     feed_expression(run_through_armap)
 
-    if armap.access_range is not None:
-        base_indices, shape = list(zip(*[
-                knl.cache_manager.base_index_and_length(
-                    armap.access_range, i)
-                for i in range(armap.access_range.dim(dim_type.set))]))
-    else:
-        if armap.bad_subscripts:
-            raise RuntimeError("cannot determine access range for '%s': "
-                    "undetermined index in subscript(s) '%s'"
-                    % (var_name, ", ".join(
-                            str(i) for i in armap.bad_subscripts)))
+    var_to_base_indices = {}
+    var_to_shape = {}
+    var_to_error = {}
+
+    from loopy.diagnostic import StaticValueFindingError
+
+    for var_name in var_names:
+        access_range = armap.access_ranges[var_name]
+        bad_subscripts = armap.bad_subscripts[var_name]
+
+        if access_range is not None:
+            try:
+                base_indices, shape = list(zip(*[
+                        knl.cache_manager.base_index_and_length(
+                            access_range, i)
+                        for i in range(access_range.dim(dim_type.set))]))
+            except StaticValueFindingError as e:
+                var_to_error[var_name] = str(e)
+                continue
+
+        else:
+            if bad_subscripts:
+                raise RuntimeError("cannot determine access range for '%s': "
+                        "undetermined index in subscript(s) '%s'"
+                        % (var_name, ", ".join(
+                                str(i) for i in bad_subscripts)))
+
+            # no subscripts found, let's call it a scalar
+            base_indices = ()
+            shape = ()
 
-        # no subscripts found, let's call it a scalar
-        base_indices = ()
-        shape = ()
+        var_to_base_indices[var_name] = base_indices
+        var_to_shape[var_name] = shape
 
-    return base_indices, shape
+    return var_to_base_indices, var_to_shape, var_to_error
 
 
 def determine_shapes_of_temporaries(knl):
     new_temp_vars = knl.temporary_variables.copy()
 
     import loopy as lp
-    from loopy.diagnostic import StaticValueFindingError
 
-    new_temp_vars = {}
+    vars_needing_shape_inference = set()
+
     for tv in six.itervalues(knl.temporary_variables):
         if tv.shape is lp.auto or tv.base_indices is lp.auto:
-            def feed_all_expressions(receiver):
-                for insn in knl.instructions:
-                    insn.with_transformed_expressions(
-                            lambda expr: receiver(expr, knl.insn_inames(insn)))
+            vars_needing_shape_inference.add(tv.name)
 
-            def feed_assignee_of_instruction(receiver):
-                for insn in knl.instructions:
-                    for assignee in insn.assignees:
-                        receiver(assignee, knl.insn_inames(insn))
+    def feed_all_expressions(receiver):
+        for insn in knl.instructions:
+            insn.with_transformed_expressions(
+                lambda expr: receiver(expr, knl.insn_inames(insn)))
 
-            try:
-                base_indices, shape = find_var_shape(
-                        knl, tv.name, feed_all_expressions)
-            except StaticValueFindingError as e:
-                warn_with_kernel(knl, "temp_shape_fallback",
-                        "Had to fall back to legacy method of determining "
-                        "shape of temporary '%s' because: %s"
-                        % (tv.name, str(e)))
+    var_to_base_indices, var_to_shape, var_to_error = (
+        find_shapes_of_vars(
+                knl, vars_needing_shape_inference, feed_all_expressions))
+
+    # {{{ fall back to legacy method
+
+    if len(var_to_error) > 0:
+        vars_needing_shape_inference = set(var_to_error.keys())
+
+        from six import iteritems
+        for varname, err in iteritems(var_to_error):
+            warn_with_kernel(knl, "temp_shape_fallback",
+                             "Had to fall back to legacy method of determining "
+                             "shape of temporary '%s' because: %s"
+                             % (varname, err))
+
+        def feed_assignee_of_instruction(receiver):
+            for insn in knl.instructions:
+                for assignee in insn.assignees:
+                    receiver(assignee, knl.insn_inames(insn))
 
-                base_indices, shape = find_var_shape(
-                        knl, tv.name, feed_assignee_of_instruction)
+        var_to_base_indices_fallback, var_to_shape_fallback, var_to_error = (
+            find_shapes_of_vars(
+                    knl, vars_needing_shape_inference, feed_assignee_of_instruction))
 
-            if tv.base_indices is lp.auto:
-                tv = tv.copy(base_indices=base_indices)
-            if tv.shape is lp.auto:
-                tv = tv.copy(shape=shape)
+        if len(var_to_error) > 0:
+            # No way around errors: propagate an exception upward.
+            formatted_errors = (
+                "\n\n".join("'%s': %s" % (varname, var_to_error[varname])
+                for varname in sorted(var_to_error.keys())))
 
+            raise LoopyError("got the following exception(s) trying to find the "
+                    "shape of temporary variables: %s" % formatted_errors)
+
+        var_to_base_indices.update(var_to_base_indices_fallback)
+        var_to_shape.update(var_to_shape_fallback)
+
+    # }}}
+
+    new_temp_vars = {}
+
+    for tv in six.itervalues(knl.temporary_variables):
+        if tv.base_indices is lp.auto:
+            tv = tv.copy(base_indices=var_to_base_indices[tv.name])
+        if tv.shape is lp.auto:
+            tv = tv.copy(shape=var_to_shape[tv.name])
         new_temp_vars[tv.name] = tv
 
     return knl.copy(temporary_variables=new_temp_vars)
@@ -1551,15 +1628,30 @@ def _resolve_dependencies(knl, insn, deps):
     new_deps = []
 
     for dep in deps:
+        found_any = False
+
         if isinstance(dep, MatchExpressionBase):
             for new_dep in find_instructions(knl, dep):
                 if new_dep.id != insn.id:
                     new_deps.append(new_dep.id)
+                    found_any = True
         else:
             from fnmatch import fnmatchcase
             for other_insn in knl.instructions:
                 if fnmatchcase(other_insn.id, dep):
                     new_deps.append(other_insn.id)
+                    found_any = True
+
+        if not found_any and knl.options.check_dep_resolution:
+            raise LoopyError("instruction '%s' declared a dependency on '%s', "
+                    "which did not resolve to any instruction present in the "
+                    "kernel '%s'. Set the kernel option 'check_dep_resolution' "
+                    "to False to disable this check." % (insn.id, dep, knl.name))
+
+    for dep_id in new_deps:
+        if dep_id not in knl.id_to_insn:
+            raise LoopyError("instruction '%s' depends on instruction id '%s', "
+                    "which was not found" % (insn.id, dep_id))
 
     return frozenset(new_deps)
 
@@ -1574,7 +1666,7 @@ def resolve_dependencies(knl):
                         (resolved_insn_id, nosync_scope)
                         for nosync_dep, nosync_scope in insn.no_sync_with
                         for resolved_insn_id in
-                        _resolve_dependencies(knl, insn, nosync_dep)),
+                        _resolve_dependencies(knl, insn, (nosync_dep,))),
                     ))
 
     return knl.copy(instructions=new_insns)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 5195e0a107ad4c7f944f2eb56b5f3b248b5a4997..001dd06326edcad14d8ecd39e29229dd45de8ef2 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -379,10 +379,10 @@ class TemporaryVariable(ArrayBase):
                         "offset must be 0 if initializer specified"
                         % name)
 
+            from loopy.types import NumpyType, to_loopy_type
             if dtype is auto or dtype is None:
-                from loopy.types import NumpyType
                 dtype = NumpyType(initializer.dtype)
-            elif dtype.numpy_dtype != initializer.dtype:
+            elif to_loopy_type(dtype) != to_loopy_type(initializer.dtype):
                 raise LoopyError(
                         "temporary variable '%s': "
                         "dtype of initializer does not match "
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 7c82de1358f9bd11eb552a54a1e834317eb59ce5..0d22dbb88ed99c7c92480d1d39b924cc2198cc3f 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -222,6 +222,10 @@ class InstructionBase(ImmutableRecord):
         if within_inames_is_final is None:
             within_inames_is_final = False
 
+        if isinstance(depends_on, str):
+            depends_on = frozenset(
+                    s.strip() for s in depends_on.split(",") if s.strip())
+
         if depends_on_is_final is None:
             depends_on_is_final = False
 
@@ -388,10 +392,8 @@ class InstructionBase(ImmutableRecord):
         if self.depends_on:
             result.append("dep="+":".join(self.depends_on))
         if self.no_sync_with:
-            # TODO: Come up with a syntax to express different kinds of
-            # synchronization scopes.
             result.append("nosync="+":".join(
-                    insn_id for insn_id, _ in self.no_sync_with))
+                    "%s@%s" % entry for entry in self.no_sync_with))
         if self.groups:
             result.append("groups=%s" % ":".join(self.groups))
         if self.conflicts_with_groups:
@@ -440,7 +442,8 @@ class InstructionBase(ImmutableRecord):
 
         from loopy.tools import intern_frozenset_of_ids
 
-        self.id = intern(self.id)
+        if self.id is not None:
+            self.id = intern(self.id)
         self.depends_on = intern_frozenset_of_ids(self.depends_on)
         self.groups = intern_frozenset_of_ids(self.groups)
         self.conflicts_with_groups = (
@@ -452,9 +455,12 @@ class InstructionBase(ImmutableRecord):
 
 
 def _get_assignee_var_name(expr):
-    from pymbolic.primitives import Variable, Subscript
+    from pymbolic.primitives import Variable, Subscript, Lookup
     from loopy.symbolic import LinearSubscript
 
+    if isinstance(expr, Lookup):
+        expr = expr.aggregate
+
     if isinstance(expr, Variable):
         return expr.name
 
@@ -474,9 +480,12 @@ def _get_assignee_var_name(expr):
 
 
 def _get_assignee_subscript_deps(expr):
-    from pymbolic.primitives import Variable, Subscript
+    from pymbolic.primitives import Variable, Subscript, Lookup
     from loopy.symbolic import LinearSubscript, get_dependencies
 
+    if isinstance(expr, Lookup):
+        expr = expr.aggregate
+
     if isinstance(expr, Variable):
         return frozenset()
     elif isinstance(expr, Subscript):
@@ -767,9 +776,9 @@ class Assignment(MultiAssignmentBase):
         if isinstance(expression, str):
             expression = parse(expression)
 
-        from pymbolic.primitives import Variable, Subscript
+        from pymbolic.primitives import Variable, Subscript, Lookup
         from loopy.symbolic import LinearSubscript
-        if not isinstance(assignee, (Variable, Subscript, LinearSubscript)):
+        if not isinstance(assignee, (Variable, Subscript, LinearSubscript, Lookup)):
             raise LoopyError("invalid lvalue '%s'" % assignee)
 
         self.assignee = assignee
@@ -990,9 +999,21 @@ class CallInstruction(MultiAssignmentBase):
             if field_name in ["assignees", "expression"]:
                 key_builder.update_for_pymbolic_expression(
                         key_hash, getattr(self, field_name))
+            elif field_name == "predicates":
+                preds = sorted(self.predicates, key=str)
+                for pred in preds:
+                    key_builder.update_for_pymbolic_expression(
+                            key_hash, pred)
             else:
                 key_builder.rec(key_hash, getattr(self, field_name))
 
+    @property
+    def atomicity(self):
+        # Function calls cannot be atomic, and even the result assignment is
+        # troublesome, especially in the case of multiple results. Avoid the
+        # issue altogether by disallowing atomicity.
+        return ()
+
 # }}}
 
 
@@ -1287,7 +1308,7 @@ class NoOpInstruction(_DataObliviousInstruction):
 
 class BarrierInstruction(_DataObliviousInstruction):
     """An instruction that requires synchronization with all
-    concurrent work items of :attr:`kind.
+    concurrent work items of :attr:`kind`.
 
     .. attribute:: kind
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 7545070261a9be7a8b5e3e5dde4fdce3e815f44a..2033425236836ecf000d6c341c46dcb8b087a29a 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -439,7 +439,8 @@ class DomainChanger:
                 # Changing the domain might look like it wants to change grid
                 # sizes. Not true.
                 # (Relevant for 'slab decomposition')
-                get_grid_sizes_for_insn_ids=self.kernel.get_grid_sizes_for_insn_ids)
+                overridden_get_grid_sizes_for_insn_ids=(
+                    self.kernel.get_grid_sizes_for_insn_ids))
 
 # }}}
 
@@ -527,7 +528,9 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
             lines.append("%s -> %s" % (insn_2, insn_1))
 
     if iname_cluster:
-        from loopy.schedule import EnterLoop, LeaveLoop, RunInstruction, Barrier
+        from loopy.schedule import (
+                EnterLoop, LeaveLoop, RunInstruction, Barrier,
+                CallKernel, ReturnFromKernel)
 
         for sched_item in kernel.schedule:
             if isinstance(sched_item, EnterLoop):
@@ -537,7 +540,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
                 lines.append("}")
             elif isinstance(sched_item, RunInstruction):
                 lines.append(sched_item.insn_id)
-            elif isinstance(sched_item, Barrier):
+            elif isinstance(sched_item, (CallKernel, ReturnFromKernel, Barrier)):
                 pass
             else:
                 raise LoopyError("schedule item not unterstood: %r" % sched_item)
@@ -1098,6 +1101,120 @@ def guess_var_shape(kernel, var_name):
 # }}}
 
 
+# {{{ loop nest tracker
+
+class SetTrie(object):
+    """
+    Similar to a trie, but uses an unordered sequence as the key.
+    """
+
+    def __init__(self, children=(), all_items=None):
+        self.children = dict(children)
+        # all_items should be shared within a trie.
+        if all_items is None:
+            self.all_items = set()
+        else:
+            self.all_items = all_items
+
+    def descend(self, on_found=lambda prefix: None, prefix=frozenset()):
+        on_found(prefix)
+        from six import iteritems
+        for prefix, child in sorted(
+                iteritems(self.children),
+                key=lambda it: sorted(it[0])):
+            child.descend(on_found, prefix=prefix)
+
+    def check_consistent_insert(self, items_to_insert):
+        if items_to_insert & self.all_items:
+            raise ValueError("inconsistent nesting")
+
+    def add_or_update(self, key):
+        if len(key) == 0:
+            return
+
+        from six import iteritems
+
+        for child_key, child in iteritems(self.children):
+            common = child_key & key
+            if common:
+                break
+        else:
+            # Key not found - insert new child
+            self.check_consistent_insert(key)
+            self.children[frozenset(key)] = SetTrie(all_items=self.all_items)
+            self.all_items.update(key)
+            return
+
+        if child_key <= key:
+            # child is a prefix of key:
+            child.add_or_update(key - common)
+        elif key < child_key:
+            # key is a strict prefix of child:
+            #
+            #  -[new child]
+            #     |
+            #   [child]
+            #
+            del self.children[child_key]
+            self.children[common] = SetTrie(
+                children={frozenset(child_key - common): child},
+                all_items=self.all_items)
+        else:
+            # key and child share a common prefix:
+            #
+            # -[new placeholder]
+            #      /        \
+            #  [new child]   [child]
+            #
+            self.check_consistent_insert(key - common)
+
+            del self.children[child_key]
+            self.children[common] = SetTrie(
+                children={
+                    frozenset(child_key - common): child,
+                    frozenset(key - common): SetTrie(all_items=self.all_items)},
+                all_items=self.all_items)
+            self.all_items.update(key - common)
+
+
+def get_visual_iname_order_embedding(kernel):
+    """
+    Return :class:`dict` `embedding` mapping inames to a totally ordered set of
+    values, such that `embedding[iname1] < embedding[iname2]` when `iname2`
+    is nested inside `iname1`.
+    """
+    from loopy.kernel.data import IlpBaseTag
+    # Ignore ILP tagged inames, since they do not have to form a strict loop
+    # nest.
+    ilp_inames = frozenset(
+        iname for iname in kernel.iname_to_tag
+        if isinstance(kernel.iname_to_tag[iname], IlpBaseTag))
+
+    iname_trie = SetTrie()
+
+    for insn in kernel.instructions:
+        within_inames = set(
+            iname for iname in insn.within_inames
+            if iname not in ilp_inames)
+        iname_trie.add_or_update(within_inames)
+
+    embedding = {}
+
+    def update_embedding(inames):
+        embedding.update(
+            dict((iname, (len(embedding), iname)) for iname in inames))
+
+    iname_trie.descend(update_embedding)
+
+    for iname in ilp_inames:
+        # Nest ilp_inames innermost, so they don't interrupt visual order.
+        embedding[iname] = (len(embedding), iname)
+
+    return embedding
+
+# }}}
+
+
 # {{{ find_recursive_dependencies
 
 def find_recursive_dependencies(kernel, insn_ids):
@@ -1183,7 +1300,7 @@ def draw_dependencies_as_unicode_arrows(
     def make_extender():
         result = n_columns[0] * [" "]
         for col, (_, updown) in six.iteritems(columns_in_use):
-            result[col] = do_flag_downward("│", updown)
+            result[col] = do_flag_downward(u"│", updown)
 
         return result
 
@@ -1198,7 +1315,7 @@ def draw_dependencies_as_unicode_arrows(
             if dep_key not in dep_to_column:
                 col = dep_to_column[dep_key] = find_free_column()
                 columns_in_use[col] = (rdep, "up")
-                row[col] = "↱"
+                row[col] = u"↱"
 
         for dep in insn.depends_on:
             assert dep != insn.id
@@ -1206,15 +1323,15 @@ def draw_dependencies_as_unicode_arrows(
             if dep_key not in dep_to_column:
                 col = dep_to_column[dep_key] = find_free_column()
                 columns_in_use[col] = (dep, "down")
-                row[col] = do_flag_downward("┌", "down")
+                row[col] = do_flag_downward(u"┌", "down")
 
         for col, (end, updown) in list(six.iteritems(columns_in_use)):
             if insn.id == end:
                 del columns_in_use[col]
                 if updown == "up":
-                    row[col] = "â””"
+                    row[col] = u"â””"
                 else:
-                    row[col] = do_flag_downward("↳", updown)
+                    row[col] = do_flag_downward(u"↳", updown)
 
         extender = make_extender()
 
@@ -1224,17 +1341,30 @@ def draw_dependencies_as_unicode_arrows(
 
     uniform_length = min(n_columns[0], max_columns)
 
+    added_ellipsis = [False]
+
     def conform_to_uniform_length(s):
         if len(s) <= uniform_length:
             return s + " "*(uniform_length-len(s))
         else:
-            return s[:uniform_length] + "..."
+            added_ellipsis[0] = True
+            return s[:uniform_length] + u"…"
 
-    return [
+    rows = [
             (conform_to_uniform_length(row),
                 conform_to_uniform_length(extender))
             for row, extender in rows]
 
+    if added_ellipsis[0]:
+        uniform_length += 1
+
+    rows = [
+            (conform_to_uniform_length(row),
+                conform_to_uniform_length(extender))
+            for row, extender in rows]
+
+    return rows
+
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/options.py b/loopy/options.py
index 7c778681dced904b31e0cc39cff529b8c026640d..25bb7014ce07a30c49f7f78d5a6325eaba36291d 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -95,6 +95,11 @@ class Options(ImmutableRecord):
         determining whether an iname duplication is necessary
         for the kernel to be schedulable.
 
+    .. attribute:: check_dep_resolution
+
+        Whether loopy should issue an error if a dependency
+        expression does not match any instructions in the kernel.
+
     .. rubric:: Invocation-related options
 
     .. attribute:: skip_arg_checks
@@ -200,6 +205,7 @@ class Options(ImmutableRecord):
                     allow_terminal_colors_def),
                 disable_global_barriers=kwargs.get("disable_global_barriers",
                     False),
+                check_dep_resolution=kwargs.get("check_dep_resolution", True),
                 )
 
     # {{{ legacy compatibility
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 6b5488a20bc9d714fb5fde908b559ddebf4b9591..0d8e771954cf26cc11747e745946389420fa5e1b 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -417,6 +417,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         # }}}
 
+        neutral_var_names = [
+                var_name_gen("neutral_"+red_iname)
+                for i in range(nresults)]
         acc_var_names = [
                 var_name_gen("acc_"+red_iname)
                 for i in range(nresults)]
@@ -429,6 +432,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     shape=outer_local_iname_sizes + (size,),
                     dtype=dtype,
                     scope=temp_var_scope.LOCAL)
+        for name, dtype in zip(neutral_var_names, reduction_dtypes):
+            new_temporary_variables[name] = TemporaryVariable(
+                    name=name,
+                    shape=(),
+                    dtype=dtype,
+                    scope=temp_var_scope.PRIVATE)
 
         base_iname_deps = outer_insn_inames - frozenset(expr.inames)
 
@@ -446,6 +455,22 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 depends_on=frozenset())
         generated_insns.append(init_insn)
 
+        def _strip_if_scalar(c):
+            if len(acc_vars) == 1:
+                return c[0]
+            else:
+                return c
+
+        init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname))
+        init_neutral_insn = make_assignment(
+                id=init_neutral_id,
+                assignees=tuple(var(nvn) for nvn in neutral_var_names),
+                expression=neutral,
+                within_inames=base_iname_deps | frozenset([base_exec_iname]),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=frozenset())
+        generated_insns.append(init_neutral_insn)
+
         transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname))
         transfer_insn = make_assignment(
                 id=transfer_id,
@@ -453,21 +478,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     acc_var[outer_local_iname_vars + (var(red_iname),)]
                     for acc_var in acc_vars),
                 expression=expr.operation(
-                    arg_dtype, neutral, expr.expr, expr.inames),
+                    arg_dtype,
+                    _strip_if_scalar(tuple(var(nvn) for nvn in neutral_var_names)),
+                    expr.expr, expr.inames),
                 within_inames=(
                     (outer_insn_inames - frozenset(expr.inames))
                     | frozenset([red_iname])),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset([init_id]) | insn.depends_on,
+                depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
                 no_sync_with=frozenset([(init_id, "any")]))
         generated_insns.append(transfer_insn)
 
-        def _strip_if_scalar(c):
-            if len(acc_vars) == 1:
-                return c[0]
-            else:
-                return c
-
         cur_size = 1
         while cur_size < size:
             cur_size *= 2
@@ -493,15 +514,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                         for acc_var in acc_vars),
                     expression=expr.operation(
                         arg_dtype,
-                        _strip_if_scalar([
+                        _strip_if_scalar(tuple(
                             acc_var[
                                 outer_local_iname_vars + (var(stage_exec_iname),)]
-                            for acc_var in acc_vars]),
-                        _strip_if_scalar([
+                            for acc_var in acc_vars)),
+                        _strip_if_scalar(tuple(
                             acc_var[
                                 outer_local_iname_vars + (
                                     var(stage_exec_iname) + new_size,)]
-                            for acc_var in acc_vars]),
+                            for acc_var in acc_vars)),
                         expr.inames),
                     within_inames=(
                         base_iname_deps | frozenset([stage_exec_iname])),
@@ -518,7 +539,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         new_insn_add_depends_on.add(prev_id)
         new_insn_add_no_sync_with.add((prev_id, "any"))
-        new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)
+        new_insn_add_within_inames.add(base_exec_iname or stage_exec_iname)
 
         if nresults == 1:
             assert len(acc_vars) == 1
@@ -738,52 +759,43 @@ def find_idempotence(kernel):
             (insn.id, insn.read_dependency_names() & var_names)
             for insn in kernel.instructions)
 
-    non_idempotently_updated_vars = set()
-
-    # FIXME: This can be made more efficient by simply starting
-    # from all written variables and not even considering
-    # instructions as the start of the first pass.
+    from collections import defaultdict
+    dep_graph = defaultdict(lambda: set())
 
-    new_insns = []
     for insn in kernel.instructions:
-        all_my_var_writers = set()
-        for var in reads_map[insn.id]:
-            var_writers = writer_map.get(var, set())
-            all_my_var_writers |= var_writers
+        dep_graph[insn.id] = set(writer_id
+                for var in reads_map[insn.id]
+                for writer_id in writer_map.get(var, set()))
 
-        # {{{ find dependency loops, flag boostability
+    # Find SCCs of dep_graph. These are used for checking if the instruction is
+    # in a dependency cycle.
+    from loopy.tools import compute_sccs
 
-        while True:
-            last_all_my_var_writers = all_my_var_writers
+    sccs = dict((item, scc)
+            for scc in compute_sccs(dep_graph)
+            for item in scc)
 
-            for writer_insn_id in last_all_my_var_writers:
-                for var in reads_map[writer_insn_id]:
-                    all_my_var_writers = \
-                            all_my_var_writers | writer_map.get(var, set())
-
-            if last_all_my_var_writers == all_my_var_writers:
-                break
-
-        # }}}
+    non_idempotently_updated_vars = set()
 
-        boostable = insn.id not in all_my_var_writers
+    new_insns = []
+    for insn in kernel.instructions:
+        boostable = len(sccs[insn.id]) == 1 and insn.id not in dep_graph[insn.id]
 
         if not boostable:
             non_idempotently_updated_vars.update(
                     insn.assignee_var_names())
 
-        insn = insn.copy(boostable=boostable)
-
-        new_insns.append(insn)
+        new_insns.append(insn.copy(boostable=boostable))
 
     # {{{ remove boostability from isns that access non-idempotently updated vars
 
     new2_insns = []
     for insn in new_insns:
-        accessed_vars = insn.dependency_names()
-        boostable = insn.boostable and not bool(
-                non_idempotently_updated_vars & accessed_vars)
-        new2_insns.append(insn.copy(boostable=boostable))
+        if insn.boostable and bool(
+                non_idempotently_updated_vars & insn.dependency_names()):
+            new2_insns.append(insn.copy(boostable=False))
+        else:
+            new2_insns.append(insn)
 
     # }}}
 
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index e23e5f350864a623d2ff05551fc2d559c361d734..c078da2ec58dabbbf646bfcf593ea0138941cc85 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -1384,172 +1384,251 @@ class DependencyRecord(ImmutableRecord):
                 var_kind=var_kind)
 
 
-def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
-    """If there exists a dependency between target and source and the two access
-    a common variable of *var_kind* in a way that requires a barrier (essentially,
-    at least one write), then the function will return a tuple
-    ``(target, source, var_name)``. Otherwise, it will return *None*.
-
-    This function finds direct or indirect instruction dependencies, but does
-    not attempt to guess dependencies that exist based on common access to
-    variables.
-
-    :arg reverse: a :class:`bool` indicating whether
-        forward or reverse dependencies are sought. (see above)
-    :arg var_kind: "global" or "local", the kind of variable based on which
-        barrier-needing dependencies should be found.
+class DependencyTracker(object):
     """
+    A utility to help track dependencies originating from a set
+    of sources (as defined by :meth:`add_source`). For each target,
+    dependencies can then be obtained using :meth:`gen_dependencies_with_target_at`.
 
-    # If target or source are insn IDs, look up the actual instructions.
-    from loopy.kernel.data import InstructionBase
-    if not isinstance(source, InstructionBase):
-        source = kernel.id_to_insn[source]
-    if not isinstance(target, InstructionBase):
-        target = kernel.id_to_insn[target]
-
-    if reverse:
-        source, target = target, source
-
-    if source.id in kernel.get_nosync_set(target.id, var_kind):
-        return
-
-    # {{{ check that a dependency exists
-
-    dep_descr = None
+    .. automethod:: add_source
+    .. automethod:: gen_dependencies_with_target_at
+    """
 
-    target_deps = kernel.recursive_insn_dep_map()[target.id]
-    if source.id in target_deps:
-        if reverse:
-            dep_descr = "{src} rev-depends on {tgt}"
+    def __init__(self, kernel, var_kind, reverse):
+        """
+        :arg var_kind: "global" or "local", the kind of variable based on which
+            barrier-needing dependencies should be found.
+        :arg reverse:
+            In straight-line code, this only tracks 'b depends on
+            a'-type 'forward' dependencies. But a loop of the type::
+
+                for i in range(10):
+                    A
+                    B
+
+            effectively glues multiple copies of 'A;B' one after the other::
+
+                A
+                B
+                A
+                B
+                ...
+
+            Now, if B depends on (i.e. is required to be textually before) A in a
+            way requiring a barrier, then we will assume that the reverse
+            dependency exists as well, i.e. a barrier between the tail end of
+            execution of B and the next beginning of A is also needed.
+
+            Setting *reverse* to *True* tracks these reverse (instead of forward)
+            dependencies.
+        """
+        self.kernel = kernel
+        self.reverse = reverse
+        self.var_kind = var_kind
+
+        if var_kind == "local":
+            self.relevant_vars = kernel.local_var_names()
+        elif var_kind == "global":
+            self.relevant_vars = kernel.global_var_names()
         else:
-            dep_descr = "{tgt} depends on {src}"
-
-    grps = source.groups & target.conflicts_with_groups
-    if grps:
-        dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps)
-
-    grps = target.groups & source.conflicts_with_groups
-    if grps:
-        dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps)
+            raise ValueError("unknown 'var_kind': %s" % var_kind)
 
-    if not dep_descr:
-        return None
+        from collections import defaultdict
+        self.writer_map = defaultdict(lambda: set())
+        self.reader_map = defaultdict(lambda: set())
+        self.temp_to_base_storage = kernel.get_temporary_to_base_storage_map()
 
-    # }}}
-
-    if var_kind == "local":
-        relevant_vars = kernel.local_var_names()
-    elif var_kind == "global":
-        relevant_vars = kernel.global_var_names()
-    else:
-        raise ValueError("unknown 'var_kind': %s" % var_kind)
-
-    temp_to_base_storage = kernel.get_temporary_to_base_storage_map()
-
-    def map_to_base_storage(var_names):
+    def map_to_base_storage(self, var_names):
         result = set(var_names)
 
         for name in var_names:
-            bs = temp_to_base_storage.get(name)
+            bs = self.temp_to_base_storage.get(name)
             if bs is not None:
                 result.add(bs)
 
         return result
 
-    tgt_write = map_to_base_storage(
-            set(target.assignee_var_names()) & relevant_vars)
-    tgt_read = map_to_base_storage(
-            target.read_dependency_names() & relevant_vars)
+    def discard_all_sources(self):
+        self.writer_map.clear()
+        self.reader_map.clear()
 
-    src_write = map_to_base_storage(
-            set(source.assignee_var_names()) & relevant_vars)
-    src_read = map_to_base_storage(
-            source.read_dependency_names() & relevant_vars)
+    def add_source(self, source):
+        """
+        Specify that an instruction may be used as the source of a dependency edge.
+        """
+        # If source is an insn ID, look up the actual instruction.
+        source = self.kernel.id_to_insn.get(source, source)
 
-    waw = tgt_write & src_write
-    raw = tgt_read & src_write
-    war = tgt_write & src_read
+        for written in self.map_to_base_storage(
+                set(source.assignee_var_names()) & self.relevant_vars):
+            self.writer_map[written].add(source.id)
 
-    for var_name in sorted(raw | war):
-        return DependencyRecord(
-                source=source,
-                target=target,
-                dep_descr=dep_descr,
-                variable=var_name,
-                var_kind=var_kind)
+        for read in self.map_to_base_storage(
+                source.read_dependency_names() & self.relevant_vars):
+            self.reader_map[read].add(source.id)
 
-    if source is target:
-        return None
+    def gen_dependencies_with_target_at(self, target):
+        """
+        Generate :class:`DependencyRecord` instances for dependencies edges
+        whose target is the given instruction.
 
-    for var_name in sorted(waw):
-        return DependencyRecord(
-                source=source,
-                target=target,
-                dep_descr=dep_descr,
-                variable=var_name,
-                var_kind=var_kind)
+        :arg target: The ID of the instruction for which dependencies
+            with conflicting var access should be found.
+        """
+        # If target is an insn ID, look up the actual instruction.
+        target = self.kernel.id_to_insn.get(target, target)
+
+        tgt_write = self.map_to_base_storage(
+            set(target.assignee_var_names()) & self.relevant_vars)
+        tgt_read = self.map_to_base_storage(
+            target.read_dependency_names() & self.relevant_vars)
+
+        for (accessed_vars, accessor_map) in [
+                (tgt_read, self.writer_map),
+                (tgt_write, self.reader_map),
+                (tgt_write, self.writer_map)]:
+
+            for dep in self.get_conflicting_accesses(
+                    accessed_vars, accessor_map, target.id):
+                yield dep
+
+    def get_conflicting_accesses(
+            self, accessed_vars, var_to_accessor_map, target):
+
+        def determine_conflict_nature(source, target):
+            if (not self.reverse and source in
+                    self.kernel.get_nosync_set(target, scope=self.var_kind)):
+                return None
+            if (self.reverse and target in
+                    self.kernel.get_nosync_set(source, scope=self.var_kind)):
+                return None
+            return self.describe_dependency(source, target)
+
+        for var in sorted(accessed_vars):
+            for source in sorted(var_to_accessor_map[var]):
+                dep_descr = determine_conflict_nature(source, target)
+
+                if dep_descr is None:
+                    continue
 
-    return None
+                yield DependencyRecord(
+                        source=self.kernel.id_to_insn[source],
+                        target=self.kernel.id_to_insn[target],
+                        dep_descr=dep_descr,
+                        variable=var,
+                        var_kind=self.var_kind)
 
+    def describe_dependency(self, source, target):
+        dep_descr = None
 
-def barrier_kind_more_or_equally_global(kind1, kind2):
-    return (kind1 == kind2) or (kind1 == "global" and kind2 == "local")
+        source = self.kernel.id_to_insn[source]
+        target = self.kernel.id_to_insn[target]
 
+        if self.reverse:
+            source, target = target, source
 
-def get_tail_starting_at_last_barrier(schedule, kind):
-    result = []
+        target_deps = self.kernel.recursive_insn_dep_map()[target.id]
+        if source.id in target_deps:
+            if self.reverse:
+                dep_descr = "{tgt} rev-depends on {src}"
+            else:
+                dep_descr = "{tgt} depends on {src}"
 
-    for sched_item in reversed(schedule):
-        if isinstance(sched_item, Barrier):
-            if barrier_kind_more_or_equally_global(sched_item.kind, kind):
-                break
+        grps = source.groups & target.conflicts_with_groups
+        if not grps:
+            grps = target.groups & source.conflicts_with_groups
 
-        elif isinstance(sched_item, RunInstruction):
-            result.append(sched_item.insn_id)
+        if grps:
+            dep_descr = "{src} conflicts with {tgt} (via '%s')" % ", ".join(grps)
 
-        elif isinstance(sched_item, (EnterLoop, LeaveLoop)):
-            pass
+        return dep_descr
 
-        elif isinstance(sched_item, (CallKernel, ReturnFromKernel)):
-            pass
 
-        else:
-            raise ValueError("unexpected schedule item type '%s'"
-                    % type(sched_item).__name__)
+def barrier_kind_more_or_equally_global(kind1, kind2):
+    return (kind1 == kind2) or (kind1 == "global" and kind2 == "local")
 
-    return reversed(result)
 
+def insn_ids_reaching_end_without_intervening_barrier(schedule, kind):
+    return _insn_ids_reaching_end(schedule, kind, reverse=False)
 
-def insn_ids_from_schedule(schedule):
-    result = []
-    for sched_item in reversed(schedule):
-        if isinstance(sched_item, RunInstruction):
-            result.append(sched_item.insn_id)
 
-        elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier, CallKernel,
-                                     ReturnFromKernel)):
-            pass
+def insn_ids_reachable_from_start_without_intervening_barrier(schedule, kind):
+    return _insn_ids_reaching_end(schedule, kind, reverse=True)
 
-        else:
-            raise ValueError("unexpected schedule item type '%s'"
-                    % type(sched_item).__name__)
 
-    return result
+def _insn_ids_reaching_end(schedule, kind, reverse):
+    if reverse:
+        schedule = reversed(schedule)
+        enter_scope_item_kind = LeaveLoop
+        leave_scope_item_kind = EnterLoop
+    else:
+        enter_scope_item_kind = EnterLoop
+        leave_scope_item_kind = LeaveLoop
+
+    insn_ids_alive_at_scope = [set()]
+
+    for sched_item in schedule:
+        if isinstance(sched_item, enter_scope_item_kind):
+            insn_ids_alive_at_scope.append(set())
+        elif isinstance(sched_item, leave_scope_item_kind):
+            innermost_scope = insn_ids_alive_at_scope.pop()
+            # Instructions in deeper scopes are alive but could be killed by
+            # barriers at a shallower level, e.g.:
+            #
+            # for i
+            #     insn0
+            # end
+            # barrier()   <= kills insn0
+            #
+            # Hence we merge this scope into the parent scope.
+            insn_ids_alive_at_scope[-1].update(innermost_scope)
+        elif isinstance(sched_item, Barrier):
+            # This barrier kills only the instruction ids that are alive at
+            # the current scope (or deeper). Without further analysis, we
+            # can't assume that instructions at shallower scope can be
+            # killed by deeper barriers, since loops might be empty, e.g.:
+            #
+            # insn0          <= isn't killed by barrier (i loop could be empty)
+            # for i
+            #     insn1      <= is killed by barrier
+            #     for j
+            #         insn2  <= is killed by barrier
+            #     end
+            #     barrier()
+            # end
+            if barrier_kind_more_or_equally_global(sched_item.kind, kind):
+                insn_ids_alive_at_scope[-1].clear()
+        else:
+            insn_ids_alive_at_scope[-1] |= set(
+                    insn_id for insn_id in sched_item_to_insn_id(sched_item))
+
+    assert len(insn_ids_alive_at_scope) == 1
+    return insn_ids_alive_at_scope[-1]
+
+
+def append_barrier_or_raise_error(schedule, dep, verify_only):
+    if verify_only:
+        from loopy.diagnostic import MissingBarrierError
+        raise MissingBarrierError(
+                "Dependency '%s' (for variable '%s') "
+                "requires synchronization "
+                "by a %s barrier (add a 'no_sync_with' "
+                "instruction option to state that no "
+                "synchronization is needed)"
+                % (
+                    dep.dep_descr.format(
+                        tgt=dep.target.id, src=dep.source.id),
+                    dep.variable,
+                    dep.var_kind))
+    else:
+        comment = "for %s (%s)" % (
+                dep.variable, dep.dep_descr.format(
+                    tgt=dep.target.id, src=dep.source.id))
+        schedule.append(Barrier(comment=comment, kind=dep.var_kind))
 
 
-def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
+def insert_barriers(kernel, schedule, kind, verify_only, level=0):
     """
-    :arg reverse: a :class:`bool`. For ``level > 0``, this function should be
-        called twice, first with ``reverse=False`` to insert barriers for
-        forward dependencies, and then again with ``reverse=True`` to insert
-        reverse depedencies. This order is preferable because the forward pass
-        will limit the number of instructions that need to be considered as
-        depedency source candidates by already inserting some number of
-        barriers into *schedule*.
-
-        Calling it with ``reverse==True and level==0` is not necessary,
-        since the root of the schedule is in no loop, therefore not repeated,
-        and therefore reverse dependencies don't need to be added.
     :arg kind: "local" or "global". The :attr:`Barrier.kind` to be inserted.
         Generally, this function will be called once for each kind of barrier
         at the top level, where more global barriers should be inserted first.
@@ -1557,184 +1636,118 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
         missing.
     :arg level: the current level of loop nesting, 0 for outermost.
     """
-    result = []
-
-    # In straight-line code, we have only 'b depends on a'-type 'forward'
-    # dependencies. But a loop of the type
-    #
-    # for i in range(10):
-    #     A
-    #     B
-    #
-    # effectively glues multiple copies of 'A;B' one after the other:
-    #
-    # A
-    # B
-    # A
-    # B
-    # ...
-    #
-    # Now, if B depends on (i.e. is required to be textually before) A in a way
-    # requiring a barrier, then we will assume that the reverse dependency exists
-    # as well, i.e. a barrier between the tail end fo execution of B and the next
-    # beginning of A is also needed.
-
-    if level == 0 and reverse:
-        # The global schedule is in no loop, therefore not repeated, and
-        # therefore reverse dependencies don't need to be added.
-        return schedule
-
-    # a list of instruction IDs that could lead to barrier-needing dependencies.
-    if reverse:
-        candidates = set(get_tail_starting_at_last_barrier(schedule, kind))
-    else:
-        candidates = set()
-
-    past_first_barrier = [False]
-
-    def seen_barrier():
-        past_first_barrier[0] = True
 
-        # We've just gone across a barrier, so anything that needed
-        # one from above just got one.
+    # {{{ insert barriers at outermost scheduling level
 
-        candidates.clear()
+    def insert_barriers_at_outer_level(schedule, reverse=False):
+        dep_tracker = DependencyTracker(kernel, var_kind=kind, reverse=reverse)
 
-    def issue_barrier(dep):
-        seen_barrier()
+        if reverse:
+            # Populate the dependency tracker with sources from the tail end of
+            # the schedule block.
+            for insn_id in (
+                    insn_ids_reaching_end_without_intervening_barrier(
+                        schedule, kind)):
+                dep_tracker.add_source(insn_id)
 
-        comment = None
-        if dep is not None:
-            comment = "for %s (%s)" % (
-                    dep.variable, dep.dep_descr.format(
-                        tgt=dep.target.id, src=dep.source.id))
+        result = []
 
-        result.append(Barrier(comment=comment, kind=dep.var_kind))
+        i = 0
+        while i < len(schedule):
+            sched_item = schedule[i]
 
-    i = 0
-    while i < len(schedule):
-        sched_item = schedule[i]
+            if isinstance(sched_item, EnterLoop):
+                subloop, new_i = gather_schedule_block(schedule, i)
 
-        if isinstance(sched_item, EnterLoop):
-            # {{{ recurse for nested loop
+                loop_head = (
+                    insn_ids_reachable_from_start_without_intervening_barrier(
+                        subloop, kind))
 
-            subloop, new_i = gather_schedule_block(schedule, i)
-            i = new_i
+                loop_tail = (
+                    insn_ids_reaching_end_without_intervening_barrier(
+                        subloop, kind))
 
-            # Run barrier insertion for inner loop
-            subresult = subloop[1:-1]
-            for sub_reverse in [False, True]:
-                subresult = insert_barriers(
-                        kernel, subresult,
-                        reverse=sub_reverse, kind=kind,
-                        verify_only=verify_only,
-                        level=level+1)
+                # Checks if a barrier is needed before the loop. This handles
+                # dependencies with targets that can be reached without an
+                # intervening barrier from the start of the loop:
+                #
+                # a[x] = ...      <= source
+                # for i
+                #     ... = a[y]  <= target
+                #     barrier()
+                #     ...
+                from itertools import chain
+                for dep in chain.from_iterable(
+                        dep_tracker.gen_dependencies_with_target_at(insn)
+                        for insn in loop_head):
+                    append_barrier_or_raise_error(result, dep, verify_only)
+                    # This barrier gets inserted outside the loop, hence it is
+                    # executed unconditionally and so kills all sources before
+                    # the loop.
+                    dep_tracker.discard_all_sources()
+                    break
 
-            # {{{ find barriers in loop body
+                result.extend(subloop)
 
-            first_barrier_index = None
-            last_barrier_index = None
+                # Handle dependencies with sources not killed inside the loop:
+                #
+                # for i
+                #     ...
+                #     barrier()
+                #     b[i] = ...  <= source
+                # end for
+                # ... = f(b)      <= target
+                for item in loop_tail:
+                    dep_tracker.add_source(item)
+
+                i = new_i
+
+            elif isinstance(sched_item, Barrier):
+                result.append(sched_item)
+                if barrier_kind_more_or_equally_global(sched_item.kind, kind):
+                    dep_tracker.discard_all_sources()
+                i += 1
+
+            elif isinstance(sched_item, RunInstruction):
+                for dep in dep_tracker.gen_dependencies_with_target_at(
+                        sched_item.insn_id):
+                    append_barrier_or_raise_error(result, dep, verify_only)
+                    dep_tracker.discard_all_sources()
+                    break
+                result.append(sched_item)
+                dep_tracker.add_source(sched_item.insn_id)
+                i += 1
 
-            for j, sub_sched_item in enumerate(subresult):
-                if (isinstance(sub_sched_item, Barrier) and
-                        barrier_kind_more_or_equally_global(
-                            sub_sched_item.kind, kind)):
+            elif isinstance(sched_item, (CallKernel, ReturnFromKernel)):
+                result.append(sched_item)
+                i += 1
 
-                    seen_barrier()
-                    last_barrier_index = j
-                    if first_barrier_index is None:
-                        first_barrier_index = j
+            else:
+                raise ValueError("unexpected schedule item type '%s'"
+                        % type(sched_item).__name__)
 
-            # }}}
+        return result
 
-            # {{{ check if a barrier is needed before the loop
-
-            # (for leading (before-first-barrier) bit of loop body)
-            for insn_id in insn_ids_from_schedule(subresult[:first_barrier_index]):
-                search_set = sorted(candidates)
-
-                for dep_src_insn_id in search_set:
-                    dep = get_barrier_needing_dependency(
-                            kernel,
-                            target=insn_id,
-                            source=dep_src_insn_id,
-                            reverse=reverse, var_kind=kind)
-                    if dep:
-                        if verify_only:
-                            from loopy.diagnostic import MissingBarrierError
-                            raise MissingBarrierError(
-                                    "Dependency '%s' (for variable '%s') "
-                                    "requires synchronization "
-                                    "by a %s barrier (add a 'no_sync_with' "
-                                    "instruction option to state that no"
-                                    "synchronization is needed)"
-                                    % (
-                                        dep.dep_descr.format(
-                                            tgt=dep.target.id, src=dep.source.id),
-                                        dep.variable,
-                                        kind))
-                        else:
-                            issue_barrier(dep=dep)
-                            break
+    # }}}
 
-            # }}}
+    # {{{ recursively insert barriers in loops
 
-            # add trailing end (past-last-barrier) of loop body to candidates
-            if last_barrier_index is None:
-                candidates.update(insn_ids_from_schedule(subresult))
-            else:
-                candidates.update(
-                        insn_ids_from_schedule(
-                            subresult[last_barrier_index+1:]))
+    result = []
+    i = 0
+    while i < len(schedule):
+        sched_item = schedule[i]
 
+        if isinstance(sched_item, EnterLoop):
+            subloop, new_i = gather_schedule_block(schedule, i)
+            new_subloop = insert_barriers(
+                    kernel, subloop[1:-1], kind, verify_only, level + 1)
             result.append(subloop[0])
-            result.extend(subresult)
+            result.extend(new_subloop)
             result.append(subloop[-1])
+            i = new_i
 
-            # }}}
-
-        elif isinstance(sched_item, Barrier):
-            i += 1
-
-            if barrier_kind_more_or_equally_global(sched_item.kind, kind):
-                seen_barrier()
-
-            result.append(sched_item)
-
-        elif isinstance(sched_item, RunInstruction):
-            i += 1
-
-            search_set = sorted(candidates)
-
-            for dep_src_insn_id in search_set:
-                dep = get_barrier_needing_dependency(
-                        kernel,
-                        target=sched_item.insn_id,
-                        source=dep_src_insn_id,
-                        reverse=reverse, var_kind=kind)
-                if dep:
-                    if verify_only:
-                        from loopy.diagnostic import MissingBarrierError
-                        raise MissingBarrierError(
-                                "Dependency '%s' (for variable '%s') "
-                                "requires synchronization "
-                                "by a %s barrier (add a 'no_sync_with' "
-                                "instruction option to state that no "
-                                "synchronization is needed)"
-                                % (
-                                    dep.dep_descr.format(
-                                        tgt=dep.target.id, src=dep.source.id),
-                                    dep.variable,
-                                    kind))
-
-                    else:
-                        issue_barrier(dep=dep)
-                        break
-
-            result.append(sched_item)
-            candidates.add(sched_item.insn_id)
-
-        elif isinstance(sched_item, (CallKernel, ReturnFromKernel)):
+        elif isinstance(sched_item,
+                (Barrier, RunInstruction, CallKernel, ReturnFromKernel)):
             result.append(sched_item)
             i += 1
 
@@ -1742,13 +1755,13 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0):
             raise ValueError("unexpected schedule item type '%s'"
                     % type(sched_item).__name__)
 
-        if past_first_barrier[0] and reverse:
-            # We can quit here, because we're only trying add
-            # reverse-dep barriers to the beginning of the loop, up to
-            # the first barrier.
+    # }}}
+
+    result = insert_barriers_at_outer_level(result)
 
-            result.extend(schedule[i:])
-            break
+    # At level 0 the schedule is in no loop, hence not repeated, so no reverse deps.
+    if level != 0:
+        result = insert_barriers_at_outer_level(result, reverse=True)
 
     return result
 
@@ -1897,11 +1910,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
                     if not kernel.options.disable_global_barriers:
                         logger.info("%s: barrier insertion: global" % kernel.name)
                         gen_sched = insert_barriers(kernel, gen_sched,
-                                reverse=False, kind="global", verify_only=True)
+                                kind="global", verify_only=True)
 
                     logger.info("%s: barrier insertion: local" % kernel.name)
-                    gen_sched = insert_barriers(kernel, gen_sched,
-                            reverse=False, kind="local", verify_only=False)
+                    gen_sched = insert_barriers(kernel, gen_sched, kind="local",
+                            verify_only=False)
                     logger.info("%s: barrier insertion: done" % kernel.name)
 
                 new_kernel = kernel.copy(
@@ -1959,7 +1972,7 @@ def get_one_scheduled_kernel(kernel):
 
     if CACHING_ENABLED:
         try:
-            result, ambiguous = schedule_cache[sched_cache_key]
+            result = schedule_cache[sched_cache_key]
 
             logger.debug("%s: schedule cache hit" % kernel.name)
             from_cache = True
@@ -1967,38 +1980,18 @@ def get_one_scheduled_kernel(kernel):
             pass
 
     if not from_cache:
-        ambiguous = False
-
-        kernel_count = 0
-
         from time import time
         start_time = time()
 
         logger.info("%s: schedule start" % kernel.name)
 
-        for scheduled_kernel in generate_loop_schedules(kernel):
-            kernel_count += 1
-
-            if kernel_count == 1:
-                # use the first schedule
-                result = scheduled_kernel
-
-            if kernel_count == 2:
-                ambiguous = True
-                break
+        result = next(iter(generate_loop_schedules(kernel)))
 
         logger.info("%s: scheduling done after %.2f s" % (
             kernel.name, time()-start_time))
 
-    if ambiguous:
-        from warnings import warn
-        from loopy.diagnostic import LoopyWarning
-        warn("scheduling for kernel '%s' was ambiguous--more than one "
-                "schedule found, ignoring" % kernel.name, LoopyWarning,
-                stacklevel=2)
-
     if CACHING_ENABLED and not from_cache:
-        schedule_cache[sched_cache_key] = result, ambiguous
+        schedule_cache[sched_cache_key] = result
 
     return result
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 52fd6e57f92e7f9599a3a0fb4256f97347708303..50c891be476810887720c4e13c9659966b431f5d 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -38,6 +38,7 @@ from pymbolic.mapper import (
         IdentityMapper as IdentityMapperBase,
         WalkMapper as WalkMapperBase,
         CallbackMapper as CallbackMapperBase,
+        CSECachingMapperMixin,
         )
 from pymbolic.mapper.evaluator import \
         EvaluationMapper as EvaluationMapperBase
@@ -113,10 +114,14 @@ class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):
     pass
 
 
-class PartialEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin):
+class PartialEvaluationMapper(
+        EvaluationMapperBase, CSECachingMapperMixin, IdentityMapperMixin):
     def map_variable(self, expr):
         return expr
 
+    def map_common_subexpression_uncached(self, expr):
+        return type(expr)(self.rec(expr.child), expr.prefix, expr.scope)
+
 
 class WalkMapper(WalkMapperBase):
     def map_literal(self, expr, *args):
@@ -162,8 +167,10 @@ class CombineMapper(CombineMapperBase):
     map_linear_subscript = CombineMapperBase.map_subscript
 
 
-class SubstitutionMapper(SubstitutionMapperBase, IdentityMapperMixin):
-    pass
+class SubstitutionMapper(
+        CSECachingMapperMixin, SubstitutionMapperBase, IdentityMapperMixin):
+    def map_common_subexpression_uncached(self, expr):
+        return type(expr)(self.rec(expr.child), expr.prefix, expr.scope)
 
 
 class ConstantFoldingMapper(ConstantFoldingMapperBase,
@@ -1471,12 +1478,13 @@ def get_access_range(domain, subscript, assumptions):
 
 # {{{ access range mapper
 
-class AccessRangeMapper(WalkMapper):
-    def __init__(self, kernel, arg_name):
+class BatchedAccessRangeMapper(WalkMapper):
+
+    def __init__(self, kernel, arg_names):
         self.kernel = kernel
-        self.arg_name = arg_name
-        self.access_range = None
-        self.bad_subscripts = []
+        self.arg_names = set(arg_names)
+        self.access_ranges = dict((arg, None) for arg in arg_names)
+        self.bad_subscripts = dict((arg, []) for arg in arg_names)
 
     def map_subscript(self, expr, inames):
         domain = self.kernel.get_inames_domain(inames)
@@ -1484,38 +1492,58 @@ class AccessRangeMapper(WalkMapper):
 
         assert isinstance(expr.aggregate, p.Variable)
 
-        if expr.aggregate.name != self.arg_name:
+        if expr.aggregate.name not in self.arg_names:
             return
 
+        arg_name = expr.aggregate.name
         subscript = expr.index_tuple
 
         if not get_dependencies(subscript) <= set(domain.get_var_dict()):
-            self.bad_subscripts.append(expr)
+            self.bad_subscripts[arg_name].append(expr)
             return
 
         access_range = get_access_range(domain, subscript, self.kernel.assumptions)
 
-        if self.access_range is None:
-            self.access_range = access_range
+        if self.access_ranges[arg_name] is None:
+            self.access_ranges[arg_name] = access_range
         else:
-            if (self.access_range.dim(dim_type.set)
+            if (self.access_ranges[arg_name].dim(dim_type.set)
                     != access_range.dim(dim_type.set)):
                 raise RuntimeError(
                         "error while determining shape of argument '%s': "
                         "varying number of indices encountered"
-                        % self.arg_name)
+                        % arg_name)
 
-            self.access_range = self.access_range | access_range
+            self.access_ranges[arg_name] = (
+                    self.access_ranges[arg_name] | access_range)
 
     def map_linear_subscript(self, expr, inames):
         self.rec(expr.index, inames)
 
-        if expr.aggregate.name == self.arg_name:
-            self.bad_subscripts.append(expr)
+        if expr.aggregate.name in self.arg_names:
+            self.bad_subscripts[expr.aggregate.name].append(expr)
 
     def map_reduction(self, expr, inames):
         return WalkMapper.map_reduction(self, expr, inames | set(expr.inames))
 
+
+class AccessRangeMapper(object):
+
+    def __init__(self, kernel, arg_name):
+        self.arg_name = arg_name
+        self.inner_mapper = BatchedAccessRangeMapper(kernel, [arg_name])
+
+    def __call__(self, expr, inames):
+        return self.inner_mapper(expr, inames)
+
+    @property
+    def access_range(self):
+        return self.inner_mapper.access_ranges[self.arg_name]
+
+    @property
+    def bad_subscripts(self):
+        return self.inner_mapper.bad_subscripts[self.arg_name]
+
 # }}}
 
 
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 61e8e4f396126e17123c1bf775dbfeee2fe21f0d..a8f47adb991e331f8a473c4eb14b1ea634c7a3b1 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -669,8 +669,6 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
 
         import pyopencl as cl
 
-        logger.info("%s: opencl compilation start" % self.kernel.name)
-
         cl_program = (
                 cl.Program(self.context, dev_code)
                 .build(options=kernel.options.cl_build_options))
@@ -679,8 +677,6 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
         for dp in codegen_result.device_programs:
             setattr(cl_kernels, dp.name, getattr(cl_program, dp.name))
 
-        logger.info("%s: opencl compilation done" % self.kernel.name)
-
         return _CLKernelInfo(
                 kernel=kernel,
                 cl_kernels=cl_kernels,
diff --git a/loopy/tools.py b/loopy/tools.py
index ae370d5aaac9ff75f530e1d0951a2f904b686e42..56b673b597fc3bf43a6b03f87607ea8d3db0866a 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -281,6 +281,128 @@ def empty_aligned(shape, dtype, order='C', n=64):
 # }}}
 
 
+# {{{ compute SCCs with Tarjan's algorithm
+
+def compute_sccs(graph):
+    to_search = set(graph.keys())
+    visit_order = {}
+    scc_root = {}
+    sccs = []
+
+    while to_search:
+        top = next(iter(to_search))
+        call_stack = [(top, iter(graph[top]), None)]
+        visit_stack = []
+        visiting = set()
+
+        scc = []
+
+        while call_stack:
+            top, children, last_popped_child = call_stack.pop()
+
+            if top not in visiting:
+                # Unvisited: mark as visited, initialize SCC root.
+                count = len(visit_order)
+                visit_stack.append(top)
+                visit_order[top] = count
+                scc_root[top] = count
+                visiting.add(top)
+                to_search.discard(top)
+
+            # Just returned from recursing into a child: fold its SCC root in.
+            if last_popped_child is not None:
+                scc_root[top] = min(
+                    scc_root[top],
+                    scc_root[last_popped_child])
+
+            for child in children:
+                if child not in visit_order:
+                    # Recurse.
+                    call_stack.append((top, children, child))
+                    call_stack.append((child, iter(graph[child]), None))
+                    break
+                if child in visiting:
+                    scc_root[top] = min(
+                        scc_root[top],
+                        visit_order[child])
+            else:
+                if scc_root[top] == visit_order[top]:
+                    scc = []
+                    while visit_stack[-1] != top:
+                        scc.append(visit_stack.pop())
+                    scc.append(visit_stack.pop())
+                    for item in scc:
+                        visiting.remove(item)
+                    sccs.append(scc)
+
+    return sccs
+
+# }}}
+
+
+# {{{ lazily unpickling dictionary
+
+
+class _PickledObjectWrapper(object):
+    """
+    A class meant to wrap a pickled value (for :class:`LazilyUnpicklingDictionary`).
+    """
+
+    @classmethod
+    def from_object(cls, obj):
+        if isinstance(obj, cls):
+            return obj
+        from pickle import dumps
+        return cls(dumps(obj))
+
+    def __init__(self, objstring):
+        self.objstring = objstring
+
+    def unpickle(self):
+        from pickle import loads
+        return loads(self.objstring)
+
+    def __getstate__(self):
+        return {"objstring": self.objstring}
+
+
+import collections
+
+
+class LazilyUnpicklingDictionary(collections.MutableMapping):
+    """
+    A dictionary-like object which lazily unpickles its values.
+    """
+
+    def __init__(self, *args, **kwargs):
+        self._map = dict(*args, **kwargs)
+
+    def __getitem__(self, key):
+        value = self._map[key]
+        if isinstance(value, _PickledObjectWrapper):
+            value = self._map[key] = value.unpickle()
+        return value
+
+    def __setitem__(self, key, value):
+        self._map[key] = value
+
+    def __delitem__(self, key):
+        del self._map[key]
+
+    def __len__(self):
+        return len(self._map)
+
+    def __iter__(self):
+        return iter(self._map)
+
+    def __getstate__(self):
+        return {"_map": dict(
+            (key, _PickledObjectWrapper.from_object(val))
+            for key, val in six.iteritems(self._map))}
+
+# }}}
+
+
 def is_interned(s):
     return s is None or intern(s) is s
 
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index 88e3898e2cceeeb62edea306283fcb718c3b088d..e7a86300f9d040cba1688e5bb0f3dcbbd926f783 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -63,7 +63,7 @@ class _BatchVariableChanger(RuleAwareIdentityMapper):
         if not self.needs_batch_subscript(expr.aggregate.name):
             return super(_BatchVariableChanger, self).map_subscript(expr, expn_state)
 
-        idx = expr.index
+        idx = self.rec(expr.index, expn_state)
         if not isinstance(idx, tuple):
             idx = (idx,)
 
@@ -73,7 +73,7 @@ class _BatchVariableChanger(RuleAwareIdentityMapper):
         if not self.needs_batch_subscript(expr.name):
             return super(_BatchVariableChanger, self).map_variable(expr, expn_state)
 
-        return expr.aggregate[self.batch_iname_expr]
+        return expr[self.batch_iname_expr]
 
 
 def _add_unique_dim_name(name, dim_names):
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index 6871cdbf46e7b117ea4e9047f0095f303b25667e..575311b11716f5a52e4713aa51922eb348c839d9 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -209,7 +209,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
               oooooooooo
               oooooooooo
 
-        Passing ``fetch_bounding_box=True` gives :mod:`loopy` permission
+        Passing ``fetch_bounding_box=True`` gives :mod:`loopy` permission
         to instead fetch the 'bounding box' of the footprint, i.e.
         this set in the stencil example::
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index c35b5064365293ac78cdd01af537c9d28bd67193..ea90abfe27c8de69daf39021b3d0ea5463a2e4c8 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -986,8 +986,9 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
             # Emit a warning that we needed boostable_into
             from warnings import warn
             from loopy.diagnostic import LoopyWarning
-            warn("Kernel '%s' required the deprecated 'boostable_into"
-                 "field in order to be schedulable!" % knl.name, LoopyWarning)
+            warn("Kernel '%s' required the deprecated 'boostable_into' "
+                 "instruction attribute in order to be schedulable!" % knl.name,
+                 LoopyWarning)
 
             # Return to avoid yielding the duplication
             # options without boostable_into
@@ -1198,7 +1199,8 @@ class _ReductionSplitter(RuleAwareIdentityMapper):
                 return Reduction(expr.operation, tuple(self.inames),
                         Reduction(expr.operation, tuple(leftover_inames),
                             self.rec(expr.expr, expn_state),
-                            expr.allow_simultaneous))
+                            expr.allow_simultaneous),
+                        expr.allow_simultaneous)
             else:
                 assert False
         else:
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index bc8981281a94bde2685be2efa659b8db01b493bb..8afc1695a38a37baf165f6ec6ef6567e2012173b 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -439,7 +439,8 @@ class TemporarySaver(object):
         kernel = self.kernel.copy(
             instructions=new_instructions,
             iname_to_tag=self.updated_iname_to_tag,
-            temporary_variables=self.updated_temporary_variables)
+            temporary_variables=self.updated_temporary_variables,
+            overridden_get_grid_sizes_for_insn_ids=None)
 
         from loopy.kernel.tools import assign_automatic_axes
         return assign_automatic_axes(kernel)
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index a31f011a0ce8e5403b54984eb45db0970a8370b0..4c1e423e93e104fecd0b49a2b1ef2b4a261e38e7 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -38,6 +38,12 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+def _debug(kernel, s, *args):
+    if logger.isEnabledFor(logging.DEBUG):
+        logstr = s % args
+        logger.debug("%s: %s" % (kernel.name, logstr))
+
+
 # {{{ type inference mapper
 
 class TypeInferenceMapper(CombineMapper):
@@ -112,32 +118,28 @@ class TypeInferenceMapper(CombineMapper):
                 0 <= len(dtype_set) <= 1
                 for dtype_set in dtype_sets)
 
-        if not all(
-                isinstance(dtype, NumpyType)
+        from pytools import is_single_valued
+
+        dtypes = [dtype
                 for dtype_set in dtype_sets
-                for dtype in dtype_set):
-            from pytools import is_single_valued, single_valued
-            if not is_single_valued(
-                    dtype
-                    for dtype_set in dtype_sets
-                    for dtype in dtype_set):
+                for dtype in dtype_set]
+
+        if not all(isinstance(dtype, NumpyType) for dtype in dtypes):
+            if not is_single_valued(dtypes):
                 raise TypeInferenceFailure(
                         "Nothing known about operations between '%s'"
-                        % ", ".join(str(dtype)
-                            for dtype_set in dtype_sets
-                            for dtype in dtype_set))
+                        % ", ".join(str(dtype) for dtype in dtypes))
 
-            return single_valued(dtype
-                            for dtype_set in dtype_sets
-                            for dtype in dtype_set)
+            return [dtypes[0]]
 
-        numpy_dtypes = [dtype.dtype
-                for dtype_set in dtype_sets
-                for dtype in dtype_set]
+        numpy_dtypes = [dtype.dtype for dtype in dtypes]
 
         if not numpy_dtypes:
             return []
 
+        if is_single_valued(numpy_dtypes):
+            return [dtypes[0]]
+
         result = numpy_dtypes.pop()
         while numpy_dtypes:
             other = numpy_dtypes.pop()
@@ -179,7 +181,6 @@ class TypeInferenceMapper(CombineMapper):
             else:
                 dtype_sets.append(dtype_set)
 
-        from pytools import all
         if all(dtype.is_integral()
                 for dtype_set in dtype_sets
                 for dtype in dtype_set):
@@ -383,8 +384,8 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
     if var_name in kernel.all_params():
         return [kernel.index_dtype], []
 
-    def debug(s):
-        logger.debug("%s: %s" % (kernel.name, s))
+    from functools import partial
+    debug = partial(_debug, kernel)
 
     dtype_sets = []
 
@@ -399,7 +400,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
 
         expr = subst_expander(writer_insn.expression)
 
-        debug("             via expr %s" % expr)
+        debug("             via expr %s", expr)
         if isinstance(writer_insn, lp.Assignment):
             result = type_inf_mapper(expr, return_dtype_set=True)
         elif isinstance(writer_insn, lp.CallInstruction):
@@ -421,7 +422,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
                 if result_i is not None:
                     result.append(result_i)
 
-        debug("             result: %s" % result)
+        debug("             result: %s", result)
 
         dtype_sets.append(result)
 
@@ -462,8 +463,11 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     logger.debug("%s: infer types" % kernel.name)
 
-    def debug(s):
-        logger.debug("%s: %s" % (kernel.name, s))
+    from functools import partial
+    debug = partial(_debug, kernel)
+
+    import time
+    start_time = time.time()
 
     unexpanded_kernel = kernel
     if kernel.substitutions:
@@ -489,6 +493,27 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     # }}}
 
+    logger.debug("finding types for {count:d} names".format(
+            count=len(names_for_type_inference)))
+
+    writer_map = kernel.writer_map()
+
+    dep_graph = dict(
+            (written_var, set(
+                read_var
+                for insn_id in writer_map.get(written_var, [])
+                for read_var in kernel.id_to_insn[insn_id].read_dependency_names()
+                if read_var in names_for_type_inference))
+            for written_var in names_for_type_inference)
+
+    from loopy.tools import compute_sccs
+
+    # To speed up processing, we sort the variables by computing the SCCs of the
+    # type dependency graph. Each SCC represents a set of variables whose types
+# mutually depend on each other. The SCCs are returned and processed in
+    # topological order.
+    sccs = compute_sccs(dep_graph)
+
     item_lookup = _DictUnionView([
             new_temp_vars,
             new_arg_dict
@@ -502,75 +527,89 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     from loopy.kernel.data import TemporaryVariable, KernelArgument
 
-    changed_during_last_queue_run = False
-    queue = names_for_type_inference[:]
-
-    failed_names = set()
-    while queue or changed_during_last_queue_run:
-        if not queue and changed_during_last_queue_run:
-            changed_during_last_queue_run = False
-            queue = names_for_type_inference[:]
-
-        name = queue.pop(0)
-        item = item_lookup[name]
-
-        debug("inferring type for %s %s" % (type(item).__name__, item.name))
-
-        result, symbols_with_unavailable_types = \
-                _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander)
+    for var_chain in sccs:
+        changed_during_last_queue_run = False
+        queue = var_chain[:]
+        failed_names = set()
+
+        while queue or changed_during_last_queue_run:
+            if not queue and changed_during_last_queue_run:
+                changed_during_last_queue_run = False
+                # Optimization: If there's a single variable in the SCC without
+                # a self-referential dependency, then the type is known after a
+                # single iteration (we don't need to look at the expressions
+                # again).
+                if len(var_chain) == 1:
+                    single_var, = var_chain
+                    if single_var not in dep_graph[single_var]:
+                        break
+                queue = var_chain[:]
+
+            name = queue.pop(0)
+            item = item_lookup[name]
+
+            debug("inferring type for %s %s", type(item).__name__, item.name)
+
+            result, symbols_with_unavailable_types = (
+                    _infer_var_type(
+                            kernel, item.name, type_inf_mapper, subst_expander))
+
+            failed = not result
+            if not failed:
+                new_dtype, = result
+                debug("     success: %s", new_dtype)
+                if new_dtype != item.dtype:
+                    debug("     changed from: %s", item.dtype)
+                    changed_during_last_queue_run = True
+
+                    if isinstance(item, TemporaryVariable):
+                        new_temp_vars[name] = item.copy(dtype=new_dtype)
+                    elif isinstance(item, KernelArgument):
+                        new_arg_dict[name] = item.copy(dtype=new_dtype)
+                    else:
+                        raise LoopyError("unexpected item type in type inference")
+            else:
+                debug("     failure")
+
+            if failed:
+                if item.name in failed_names:
+                    # this item has failed before, give up.
+                    advice = ""
+                    if symbols_with_unavailable_types:
+                        advice += (
+                                " (need type of '%s'--check for missing arguments)"
+                                % ", ".join(symbols_with_unavailable_types))
+
+                    if expect_completion:
+                        raise LoopyError(
+                                "could not determine type of '%s'%s"
+                                % (item.name, advice))
+
+                    else:
+                        # We're done here.
+                        break
 
-        failed = not result
-        if not failed:
-            new_dtype, = result
-            debug("     success: %s" % new_dtype)
-            if new_dtype != item.dtype:
-                debug("     changed from: %s" % item.dtype)
-                changed_during_last_queue_run = True
+                # remember that this item failed
+                failed_names.add(item.name)
 
-                if isinstance(item, TemporaryVariable):
-                    new_temp_vars[name] = item.copy(dtype=new_dtype)
-                elif isinstance(item, KernelArgument):
-                    new_arg_dict[name] = item.copy(dtype=new_dtype)
-                else:
-                    raise LoopyError("unexpected item type in type inference")
-        else:
-            debug("     failure")
-
-        if failed:
-            if item.name in failed_names:
-                # this item has failed before, give up.
-                advice = ""
-                if symbols_with_unavailable_types:
-                    advice += (
-                            " (need type of '%s'--check for missing arguments)"
-                            % ", ".join(symbols_with_unavailable_types))
-
-                if expect_completion:
-                    raise LoopyError(
-                            "could not determine type of '%s'%s"
-                            % (item.name, advice))
-
-                else:
-                    # We're done here.
+                if set(queue) == failed_names:
+                    # We did what we could...
+                    print(queue, failed_names, item.name)
+                    assert not expect_completion
                     break
 
-            # remember that this item failed
-            failed_names.add(item.name)
-
-            if set(queue) == failed_names:
-                # We did what we could...
-                print(queue, failed_names, item.name)
-                assert not expect_completion
-                break
-
-            # can't infer type yet, put back into queue
-            queue.append(name)
-        else:
-            # we've made progress, reset failure markers
-            failed_names = set()
+                # can't infer type yet, put back into queue
+                queue.append(name)
+            else:
+                # we've made progress, reset failure markers
+                failed_names = set()
 
     # }}}
 
+    end_time = time.time()
+    logger.debug("type inference took {dur:.2f} seconds".format(
+            dur=end_time - start_time))
+
     return unexpanded_kernel.copy(
             temporary_variables=new_temp_vars,
             args=[new_arg_dict[arg.name] for arg in kernel.args],
diff --git a/loopy/version.py b/loopy/version.py
index 7de2d687794358f182776b90752ceb81ee2675e4..77d0e21bdd2ef5383c5f874656c25fe1ede21a70 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v50-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v60-islpy%s" % _islpy_version
diff --git a/test/test_apps.py b/test/test_apps.py
index 9eab3fdb1fbc152b65344362d39766793d372d90..c4844d3a3c5d88e0c4eeccf0d67e9b4284fd744f 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -608,6 +608,61 @@ def test_poisson_fem(ctx_factory):
                 parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
 
 
+def test_domain_tree_nesting():
+    # From https://github.com/inducer/loopy/issues/78
+    from loopy.kernel.data import temp_var_scope as scopes
+
+    out_map = np.array([1, 2], dtype=np.int32)
+    if_val = np.array([-1, 0], dtype=np.int32)
+    vals = np.array([2, 3], dtype=np.int32)
+    num_vals = np.array([2, 4], dtype=np.int32)
+    num_vals_offset = np.array(np.cumsum(num_vals) - num_vals, dtype=np.int32)
+
+    TV = lp.TemporaryVariable  # noqa
+
+    knl = lp.make_kernel(['{[i]: 0 <= i < 12}',
+                    '{[j]: 0 <= j < 100}',
+                    '{[a_count]: 0 <= a_count < a_end}',
+                    '{[b_count]: 0 <= b_count < b_end}'],
+    """
+    for j
+        for i
+            <> a_end = abs(if_val[i])
+
+            <>b_end = num_vals[i]
+            <>offset = num_vals_offset[i] {id=offset}
+            <>b_sum = 0 {id=b_init}
+            for b_count
+                <>val = vals[offset + b_count] {dep=offset}
+            end
+            b_sum = exp(b_sum) {id=b_final}
+
+            out[j,i] =  b_sum {dep=b_final}
+        end
+    end
+    """,
+    [
+        TV('out_map', initializer=out_map, read_only=True, scope=scopes.PRIVATE),
+        TV('if_val', initializer=if_val, read_only=True, scope=scopes.PRIVATE),
+        TV('vals', initializer=vals, read_only=True, scope=scopes.PRIVATE),
+        TV('num_vals', initializer=num_vals, read_only=True, scope=scopes.PRIVATE),
+        TV('num_vals_offset', initializer=num_vals_offset, read_only=True,
+            scope=scopes.PRIVATE),
+        lp.GlobalArg('B', shape=(100, 31), dtype=np.float64),
+        lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)])
+
+    parents_per_domain = knl.parents_per_domain()
+
+    def depth(i):
+        if parents_per_domain[i] is None:
+            return 0
+        else:
+            return 1 + depth(parents_per_domain[i])
+
+    for i in range(len(parents_per_domain)):
+        assert depth(i) < 2
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_domain.py b/test/test_domain.py
index e01c3a937fd95984a27a7f2490edf4be12d9b57b..9d0379a50af188cc84de8e01f8278030b6cc04e2 100644
--- a/test/test_domain.py
+++ b/test/test_domain.py
@@ -229,6 +229,41 @@ def test_dependent_loop_bounds_3(ctx_factory):
         list(lp.generate_loop_schedules(knl_bad))
 
 
+def test_dependent_loop_bounds_4():
+    # https://gitlab.tiker.net/inducer/loopy/issues/23
+    import loopy as lp
+
+    loopy_knl = lp.make_kernel(
+        [
+            "{[a]: 0<=a<10}",
+            "{[b]: b_start<=b<b_end}",
+            "{[c,idim]: c_start<=c<c_end and 0<=idim<dim}",
+        ],
+        """
+        for a
+         <> b_start = 1
+         <> b_end = 2
+         for b
+          <> c_start = 1
+          <> c_end = 2
+
+          for c
+           ... nop
+          end
+
+          <>t[idim] = 1
+         end
+        end
+        """,
+        "...",
+        seq_dependencies=True)
+
+    loopy_knl = lp.fix_parameters(loopy_knl, dim=3)
+
+    with lp.CacheMode(False):
+        lp.generate_code_v2(loopy_knl)
+
+
 def test_independent_multi_domain(ctx_factory):
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 96b95cf4b44c603e3ee1948c1d5eceecac277bce..772d536d1e00fedc0b7abcd2f8c05350fe3b633e 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -192,6 +192,10 @@ def test_plain_matrix_mul(ctx_factory):
 def test_variable_size_matrix_mul(ctx_factory):
     ctx = ctx_factory()
 
+    if (not ctx.devices[0].image_support
+            or ctx.devices[0].platform.name == "Portable Computing Language"):
+        pytest.skip("crashes on pocl")
+
     n = get_suitable_size(ctx)
 
     knl = lp.make_kernel(
diff --git a/test/test_loopy.py b/test/test_loopy.py
index e41d55b85e504bcd39db37bd888ddbedbf6122f4..94cdb499c096337db0657267d5afd472eef80a9c 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -102,6 +102,28 @@ def test_type_inference_no_artificial_doubles(ctx_factory):
         assert "double" not in code
 
 
+def test_type_inference_with_type_dependencies():
+    knl = lp.make_kernel(
+            "{[i]: i=0}",
+            """
+            <>a = 99
+            a = a + 1
+            <>b = 0
+            <>c = 1
+            b = b + c + 1.0
+            c = b + c
+            <>d = b + 2 + 1j
+            """,
+            "...")
+    knl = lp.infer_unknown_types(knl)
+
+    from loopy.types import to_loopy_type
+    assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32)
+    assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32)
+    assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32)
+    assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128)
+
+
 def test_sized_and_complex_literals(ctx_factory):
     ctx = ctx_factory()
 
@@ -1473,6 +1495,22 @@ def test_call_with_no_returned_value(ctx_factory):
 # }}}
 
 
+# {{{ call with no return values and options
+
+def test_call_with_options():
+    knl = lp.make_kernel(
+        "{:}",
+        "f() {id=init}"
+        )
+
+    from library_for_test import no_ret_f_mangler
+    knl = lp.register_function_manglers(knl, [no_ret_f_mangler])
+
+    print(lp.generate_code_v2(knl).device_code())
+
+# }}}
+
+
 def test_unschedulable_kernel_detection():
     knl = lp.make_kernel(["{[i,j]:0<=i,j<n}"],
                          """
@@ -1910,8 +1948,8 @@ def test_tight_loop_bounds_codegen():
 
     for_loop = \
         "for (int j = " \
-        "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \
-        "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)"
+        "(gid(0) == 0 && lid(0) == 0 ? 0 : -2 + 2 * lid(0) + 10 * gid(0)); " \
+        "j <= (-1 + gid(0) == 0 && lid(0) == 0 ? 9 : 2 * lid(0)); ++j)"
 
     assert for_loop in cgr.device_code()
 
@@ -1973,6 +2011,138 @@ def test_integer_reduction(ctx_factory):
             assert function(out)
 
 
+def test_nosync_option_parsing():
+    knl = lp.make_kernel(
+        "{[i]: 0 <= i < 10}",
+        """
+        <>t = 1 {id=insn1,nosync=insn1}
+        t = 2   {id=insn2,nosync=insn1:insn2}
+        t = 3   {id=insn3,nosync=insn1@local:insn2@global:insn3@any}
+        t = 4   {id=insn4,nosync_query=id:insn*@local}
+        t = 5   {id=insn5,nosync_query=id:insn1}
+        """,
+        options=lp.Options(allow_terminal_colors=False))
+    kernel_str = str(knl)
+    assert "# insn1,no_sync_with=insn1@any" in kernel_str
+    assert "# insn2,no_sync_with=insn1@any:insn2@any" in kernel_str
+    assert "# insn3,no_sync_with=insn1@local:insn2@global:insn3@any" in kernel_str
+    assert "# insn4,no_sync_with=insn1@local:insn2@local:insn3@local:insn5@local" in kernel_str  # noqa
+    assert "# insn5,no_sync_with=insn1@any" in kernel_str
+
+
+def assert_barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
+    from loopy.schedule import (RunInstruction, Barrier, EnterLoop, LeaveLoop)
+    watch_for_barrier = False
+    seen_barrier = False
+    loop_level = 0
+
+    for sched_item in knl.schedule:
+        if isinstance(sched_item, RunInstruction):
+            if sched_item.insn_id == id1:
+                watch_for_barrier = True
+            elif sched_item.insn_id == id2:
+                assert watch_for_barrier
+                assert seen_barrier
+                return
+        elif isinstance(sched_item, Barrier):
+            if watch_for_barrier and loop_level not in ignore_barriers_in_levels:
+                seen_barrier = True
+        elif isinstance(sched_item, EnterLoop):
+            loop_level += 1
+        elif isinstance(sched_item, LeaveLoop):
+            loop_level -= 1
+
+    raise RuntimeError("id2 was not seen")
+
+
+def test_barrier_insertion_near_top_of_loop():
+    knl = lp.make_kernel(
+        "{[i,j]: 0 <= i,j < 10 }",
+        """
+        for i
+         <>a[i] = i  {id=ainit}
+         for j
+          <>t = a[(i + 1) % 10]  {id=tcomp}
+          <>b[i,j] = a[i] + t   {id=bcomp1}
+          b[i,j] = b[i,j] + 1  {id=bcomp2}
+         end
+        end
+        """,
+        seq_dependencies=True)
+
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.set_temporary_scope(knl, "a", "local")
+    knl = lp.set_temporary_scope(knl, "b", "local")
+    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+
+    print(knl)
+
+    assert_barrier_between(knl, "ainit", "tcomp")
+    assert_barrier_between(knl, "tcomp", "bcomp1")
+    assert_barrier_between(knl, "bcomp1", "bcomp2")
+
+
+def test_barrier_insertion_near_bottom_of_loop():
+    knl = lp.make_kernel(
+        ["{[i]: 0 <= i < 10 }",
+         "[jmax] -> {[j]: 0 <= j < jmax}"],
+        """
+        for i
+         <>a[i] = i  {id=ainit}
+         for j
+          <>b[i,j] = a[i] + t   {id=bcomp1}
+          b[i,j] = b[i,j] + 1  {id=bcomp2}
+         end
+         a[i] = i + 1 {id=aupdate}
+        end
+        """,
+        seq_dependencies=True)
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.set_temporary_scope(knl, "a", "local")
+    knl = lp.set_temporary_scope(knl, "b", "local")
+    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+
+    print(knl)
+
+    assert_barrier_between(knl, "bcomp1", "bcomp2")
+    assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1])
+
+
+def test_struct_assignment(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    bbhit = np.dtype([
+        ("tmin", np.float32),
+        ("tmax", np.float32),
+        ("bi", np.int32),
+        ("hit", np.int32)])
+
+    bbhit, bbhit_c_decl = cl.tools.match_dtype_to_c_struct(
+            ctx.devices[0], "bbhit", bbhit)
+    bbhit = cl.tools.get_or_register_dtype('bbhit', bbhit)
+
+    preamble = bbhit_c_decl
+
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<N }",
+        """
+        for i
+            result[i].hit = i % 2
+            result[i].tmin = i
+            result[i].tmax = i+10
+            result[i].bi = i
+        end
+        """,
+        [
+            lp.GlobalArg("result", shape=("N",), dtype=bbhit),
+            "..."],
+        preambles=[("000", preamble)])
+
+    knl = lp.set_options(knl, write_cl=True)
+    knl(queue, N=200)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_misc.py b/test/test_misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..a22e424630255df4225586eeb9f0d62a03d5318f
--- /dev/null
+++ b/test/test_misc.py
@@ -0,0 +1,154 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2016 Matt Wala"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six  # noqa
+import pytest
+from six.moves import range
+
+import sys
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+def test_compute_sccs():
+    from loopy.tools import compute_sccs
+    import random
+
+    rng = random.Random(0)
+
+    def generate_random_graph(nnodes):
+        graph = dict((i, set()) for i in range(nnodes))
+        for i in range(nnodes):
+            for j in range(nnodes):
+                # Edge probability 2/n: Generates decently interesting inputs.
+                if rng.randint(0, nnodes - 1) <= 1:
+                    graph[i].add(j)
+        return graph
+
+    def verify_sccs(graph, sccs):
+        visited = set()
+
+        def visit(node):
+            if node in visited:
+                return []
+            else:
+                visited.add(node)
+                result = []
+                for child in graph[node]:
+                    result = result + visit(child)
+                return result + [node]
+
+        for scc in sccs:
+            scc = set(scc)
+            assert not scc & visited
+            # Check that starting from each element of the SCC results
+            # in the same set of reachable nodes.
+            for scc_root in scc:
+                visited.difference_update(scc)
+                result = visit(scc_root)
+                assert set(result) == scc, (set(result), scc)
+
+    for nnodes in range(10, 20):
+        for i in range(40):
+            graph = generate_random_graph(nnodes)
+            verify_sccs(graph, compute_sccs(graph))
+
+
+def test_SetTrie():
+    from loopy.kernel.tools import SetTrie
+
+    s = SetTrie()
+    s.add_or_update(set([1, 2, 3]))
+    s.add_or_update(set([4, 2, 1]))
+    s.add_or_update(set([1, 5]))
+
+    result = []
+    s.descend(lambda prefix: result.extend(prefix))
+    assert result == [1, 2, 3, 4, 5]
+
+    with pytest.raises(ValueError):
+        s.add_or_update(set([1, 4]))
+
+
+class PicklableItem(object):
+
+    flags = {"unpickled": False}
+
+    def __getstate__(self):
+        return True
+
+    def __setstate__(self, state):
+        PicklableItem.flags["unpickled"] = True
+
+
+def test_LazilyUnpicklingDictionary():
+    def is_unpickled():
+        return PicklableItem.flags["unpickled"]
+
+    from loopy.tools import LazilyUnpicklingDictionary
+
+    mapping = LazilyUnpicklingDictionary({0: PicklableItem()})
+
+    assert not is_unpickled()
+
+    from pickle import loads, dumps
+
+    pickled_mapping = dumps(mapping)
+
+    # {{{ test lazy loading
+
+    mapping = loads(pickled_mapping)
+    assert not is_unpickled()
+    list(mapping.keys())
+    assert not is_unpickled()
+    assert isinstance(mapping[0], PicklableItem)
+    assert is_unpickled()
+
+    # }}}
+
+    # {{{ test multi round trip
+
+    mapping = loads(dumps(loads(pickled_mapping)))
+    assert isinstance(mapping[0], PicklableItem)
+
+    # }}}
+
+    # {{{ test empty map
+
+    mapping = LazilyUnpicklingDictionary({})
+    mapping = loads(dumps(mapping))
+    assert len(mapping) == 0
+
+    # }}}
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 820c669da494f4d8863d274120cd5c0c7eb4420f..86e72c0c6644b7b9837a6d74da756c58344b1d6f 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -181,7 +181,7 @@ def test_recursive_nested_dependent_reduction(ctx_factory):
     # FIXME: Actually test functionality.
 
 
-@pytest.mark.parametrize("size", [128, 5, 113, 67])
+@pytest.mark.parametrize("size", [128, 5, 113, 67, 1])
 def test_local_parallel_reduction(ctx_factory, size):
     ctx = ctx_factory()
 
@@ -393,6 +393,18 @@ def test_double_sum_made_unique(ctx_factory):
     assert b.get() == ref
 
 
+def test_parallel_multi_output_reduction():
+    knl = lp.make_kernel(
+                "{[i]: 0<=i<128}",
+                """
+                max_val, max_indices = argmax(i, fabs(a[i]))
+                """)
+    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.realize_reduction(knl)
+    print(knl)
+    # TODO: Add functional test
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])