diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index d206faad5bd84e3a1c7e7c061673f3d5d1144c84..789c00d33b7bb41816e6901e24046d4b0eefb27d 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -1,8 +1,6 @@
 """Loop nest build top-level control/hoisting."""
 
-from __future__ import division
-from __future__ import absolute_import
-import six
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -26,12 +24,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-
+import six
 from loopy.codegen.result import merge_codegen_results, wrap_in_if
 import islpy as isl
 from loopy.schedule import (
         EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel,
         gather_schedule_block, generate_sub_sched_items)
+from loopy.diagnostic import LoopyError
 
 
 def get_admissible_conditional_inames_for(codegen_state, sched_index):
@@ -150,15 +149,36 @@ def generate_code_for_sched_index(codegen_state, sched_index):
         return func(codegen_state, sched_index)
 
     elif isinstance(sched_item, Barrier):
+        # {{{ emit barrier code
+
+        from loopy.codegen.result import CodeGenerationResult
+
         if codegen_state.is_generating_device_code:
-            return codegen_state.ast_builder.emit_barrier(
+            barrier_ast = codegen_state.ast_builder.emit_barrier(
                     sched_item.kind, sched_item.comment)
-        from loopy.codegen.result import CodeGenerationResult
-        return CodeGenerationResult(
-                host_program=None,
-                device_programs=[],
-                implemented_domains={},
-                implemented_data_info=codegen_state.implemented_data_info)
+            if sched_item.originating_insn_id:
+                return CodeGenerationResult.new(
+                        codegen_state,
+                        sched_item.originating_insn_id,
+                        barrier_ast,
+                        codegen_state.implemented_domain)
+            else:
+                return barrier_ast
+        else:
+            # host code
+            if sched_item.kind in ["global", "local"]:
+                # host code is assumed globally and locally synchronous
+                return CodeGenerationResult(
+                        host_program=None,
+                        device_programs=[],
+                        implemented_domains={},
+                        implemented_data_info=codegen_state.implemented_data_info)
+
+            else:
+                raise LoopyError("do not know how to emit code for barrier kind '%s'"
+                        " in host code" % sched_item.kind)
+
+        # }}}
 
     elif isinstance(sched_item, RunInstruction):
         insn = kernel.id_to_insn[sched_item.insn_id]
diff --git a/loopy/execution.py b/loopy/execution.py
index dac5b2ff80767ae00c126aba31c2851cfe3769ef..07e28f06d33e5884ac57c9505593c9ee916c3171 100644
--- a/loopy/execution.py
+++ b/loopy/execution.py
@@ -187,7 +187,12 @@ class KernelExecutorBase(object):
     def get_typed_and_scheduled_kernel(self, arg_to_dtype_set):
         from loopy import CACHING_ENABLED
 
-        cache_key = (type(self).__name__, self.kernel, arg_to_dtype_set)
+        from loopy.preprocess import prepare_for_caching
+        # prepare_for_caching() gets run by preprocess, but the kernel at this
+        # stage is not guaranteed to be preprocessed.
+        cacheable_kernel = prepare_for_caching(self.kernel)
+        cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set)
+
         if CACHING_ENABLED:
             try:
                 return typed_and_scheduled_cache[cache_key]
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index 50272e5fece9322db3f63104698e21f68c4f21db..e801d09dcf10750ce09af647e0b14f4641fa1fb2 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -212,6 +212,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
         self.instruction_tags = []
         self.conditions = []
+        self.conditions_data = []
 
         self.filename = filename
 
@@ -312,7 +313,16 @@ class F2LoopyTranslator(FTreeWalkerBase):
 
     def dtype_from_stmt(self, stmt):
         length, kind = stmt.selector
-        assert not kind
+
+        if kind and not length:
+            length = kind
+        elif length and not kind:
+            pass
+        elif not length and not kind:
+            pass
+        else:
+            raise RuntimeError("both length and kind specified")
+
         return np.dtype(self.TYPE_MAP[(type(stmt).__name__.lower(), length)])
 
     def map_type_decl(self, node):
@@ -442,7 +452,7 @@ class F2LoopyTranslator(FTreeWalkerBase):
         # node.expr
         # node.content[0]
 
-    def map_IfThen(self, node):
+    def realize_conditional(self, node, context_cond=None):
         scope = self.scope_stack[-1]
 
         cond_name = intern("loopy_cond%d" % self.condition_id_counter)
@@ -457,22 +467,53 @@ class F2LoopyTranslator(FTreeWalkerBase):
         self.add_expression_instruction(
                 cond_var, self.parse_expr(node, node.expr))
 
-        self.conditions.append(cond_name)
+        cond_expr = cond_var
+        if context_cond is not None:
+            from pymbolic.primitives import LogicalAnd
+            cond_expr = LogicalAnd((cond_var, context_cond))
 
+            self.conditions_data.append((context_cond, cond_var))
+        else:
+            self.conditions_data.append((None, cond_var))
+
+        self.conditions.append(cond_expr)
+
+    def map_IfThen(self, node):
         self.block_nest.append("if")
+        self.realize_conditional(node, None)
+
         for c in node.content:
             self.rec(c)
 
+    def construct_else_condition(self):
+        context_cond, prev_cond = self.conditions_data.pop()
+        if prev_cond is None:
+            raise RuntimeError("else if may not follow else")
+
+        self.conditions.pop()
+
+        from pymbolic.primitives import LogicalNot, LogicalAnd
+        else_expr = LogicalNot(prev_cond)
+        if context_cond is not None:
+            else_expr = LogicalAnd((else_expr, context_cond))
+
+        return else_expr
+
     def map_Else(self, node):
-        cond_name = self.conditions.pop()
-        self.conditions.append("!" + cond_name)
+        else_cond = self.construct_else_condition()
+        self.conditions.append(else_cond)
+        self.conditions_data.append((else_cond, None))
+
+    def map_ElseIf(self, node):
+        self.realize_conditional(node, self.construct_else_condition())
 
     def map_EndIfThen(self, node):
         if not self.block_nest:
-            raise TranslationError("no if block started at end do")
+            raise TranslationError("no if block started at end if")
         if self.block_nest.pop() != "if":
             raise TranslationError("mismatched end if")
 
+        self.conditions_data.pop()
         self.conditions.pop()
 
     def map_Do(self, node):
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index a02fc58d97f370d45f36a465c38fa3caf3da9d41..531cc822e1bc76573ef6e0812970d16bd6df0b17 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -862,6 +862,16 @@ class ArrayBase(ImmutableRecord):
     def __repr__(self):
         return "<%s>" % self.__str__()
 
+    def update_persistent_hash_for_shape(self, key_hash, key_builder, shape):
+        if isinstance(shape, tuple):
+            for shape_i in shape:
+                if shape_i is None:
+                    key_builder.rec(key_hash, shape_i)
+                else:
+                    key_builder.update_for_pymbolic_expression(key_hash, shape_i)
+        else:
+            key_builder.rec(key_hash, shape)
+
     def update_persistent_hash(self, key_hash, key_builder):
         """Custom hash computation function for use with
         :class:`pytools.persistent_dict.PersistentDict`.
@@ -869,14 +879,7 @@ class ArrayBase(ImmutableRecord):
 
         key_builder.rec(key_hash, self.name)
         key_builder.rec(key_hash, self.dtype)
-        if isinstance(self.shape, tuple):
-            for shape_i in self.shape:
-                if shape_i is None:
-                    key_builder.rec(key_hash, shape_i)
-                else:
-                    key_builder.update_for_pymbolic_expression(key_hash, shape_i)
-        else:
-            key_builder.rec(key_hash, self.shape)
+        self.update_persistent_hash_for_shape(key_hash, key_builder, self.shape)
         key_builder.rec(key_hash, self.dim_tags)
         key_builder.rec(key_hash, self.offset)
         key_builder.rec(key_hash, self.dim_names)
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 14b18150f5b84218f39ba23662eb6106ffb596a0..89cb5f26a4940656cca1ab09841311148e113275 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -366,55 +366,55 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None):
 # {{{ parse one instruction
 
 WITH_OPTIONS_RE = re.compile(
-        "^"
-        "\s*with\s*"
-        "\{(?P<options>.+)\}"
-        "\s*$")
+        r"^"
+        r"\s*with\s*"
+        r"\{(?P<options>.+)\}"
+        r"\s*$")
 
 FOR_RE = re.compile(
-        "^"
-        "\s*(for)\s+"
-        "(?P<inames>[ ,\w]*)"
-        "\s*$")
+        r"^"
+        r"\s*(for)\s+"
+        r"(?P<inames>[ ,\w]*)"
+        r"\s*$")
 
 IF_RE = re.compile(
-        "^"
-        "\s*if\s+"
-        "(?P<predicate>.+)"
-        "\s*$")
+        r"^"
+        r"\s*if\s+"
+        r"(?P<predicate>.+)"
+        r"\s*$")
 
 ELIF_RE = re.compile(
-        "^"
-        "\s*elif\s+"
-        "(?P<predicate>.+)"
-        "\s*$")
+        r"^"
+        r"\s*elif\s+"
+        r"(?P<predicate>.+)"
+        r"\s*$")
 
-ELSE_RE = re.compile("^\s*else\s*$")
+ELSE_RE = re.compile(r"^\s*else\s*$")
 
 INSN_RE = re.compile(
-        "^"
-        "\s*"
-        "(?P<lhs>[^{]+?)"
-        "\s*(?<!\:)=\s*"
-        "(?P<rhs>.+?)"
-        "\s*?"
-        "(?:\{(?P<options>.+)\}\s*)?$")
+        r"^"
+        r"\s*"
+        r"(?P<lhs>[^{]+?)"
+        r"\s*(?<!\:)=\s*"
+        r"(?P<rhs>.+?)"
+        r"\s*?"
+        r"(?:\{(?P<options>.+)\}\s*)?$")
 
 EMPTY_LHS_INSN_RE = re.compile(
-        "^"
-        "\s*"
-        "(?P<rhs>.+?)"
-        "\s*?"
-        "(?:\{(?P<options>.+)\}\s*)?$")
+        r"^"
+        r"\s*"
+        r"(?P<rhs>.+?)"
+        r"\s*?"
+        r"(?:\{(?P<options>.+)\}\s*)?$")
 
 SPECIAL_INSN_RE = re.compile(
-        "^"
-        "\s*"
-        "\.\.\."
-        "\s*"
-        "(?P<kind>[a-z]+?)"
-        "\s*?"
-        "(?:\{(?P<options>.+)\}\s*)?$")
+        r"^"
+        r"\s*"
+        r"\.\.\."
+        r"\s*"
+        r"(?P<kind>[a-z]+?)"
+        r"\s*?"
+        r"(?:\{(?P<options>.+)\}\s*)?$")
 
 SUBST_RE = re.compile(
         r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$")
@@ -582,6 +582,9 @@ def parse_special_insn(groups, insn_options):
     if special_insn_kind == "gbarrier":
         cls = BarrierInstruction
         kwargs["kind"] = "global"
+    elif special_insn_kind == "lbarrier":
+        cls = BarrierInstruction
+        kwargs["kind"] = "local"
     elif special_insn_kind == "nop":
         cls = NoOpInstruction
     else:
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 001dd06326edcad14d8ecd39e29229dd45de8ef2..94b31df12dae516d3539438b7e4ed66ed765e697 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -497,7 +497,8 @@ class TemporaryVariable(ArrayBase):
         """
 
         super(TemporaryVariable, self).update_persistent_hash(key_hash, key_builder)
-        key_builder.rec(key_hash, self.storage_shape)
+        self.update_persistent_hash_for_shape(key_hash, key_builder,
+                self.storage_shape)
         key_builder.rec(key_hash, self.base_indices)
 
         initializer = self.initializer
@@ -510,7 +511,7 @@ class TemporaryVariable(ArrayBase):
 # }}}
 
 
-# {{{ subsitution rule
+# {{{ substitution rule
 
 class SubstitutionRule(ImmutableRecord):
     """
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 08268ca9f27623a6d17a195d3c04acb55e5ec68a..d5c388af60a39987c09092fc93325f067a8f4cf7 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -1312,11 +1312,12 @@ class BarrierInstruction(_DataObliviousInstruction):
 
     .. attribute:: kind
 
-        A string, currently only ``"global"``.
+        A string, ``"global"`` or ``"local"``.
 
     The textual syntax in a :mod:`loopy` kernel is::
 
         ... gbarrier
+        ... lbarrier
     """
 
     fields = _DataObliviousInstruction.fields | set(["kind"])
@@ -1328,7 +1329,6 @@ class BarrierInstruction(_DataObliviousInstruction):
             priority=None,
             boostable=None, boostable_into=None,
             predicates=None, tags=None, kind="global"):
-        assert kind == "global"
 
         if predicates:
             raise LoopyError("conditional barriers are not supported")
diff --git a/loopy/library/function.py b/loopy/library/function.py
index efa590371bb632cbc9776078ea6b5c64f626d46a..9d557ac9fe5c4c040608dc181b96daa812405a65 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -26,7 +26,7 @@ THE SOFTWARE.
 def default_function_mangler(kernel, name, arg_dtypes):
     from loopy.library.reduction import reduction_function_mangler
 
-    manglers = [reduction_function_mangler]
+    manglers = [reduction_function_mangler, tuple_function_mangler]
     for mangler in manglers:
         result = mangler(kernel, name, arg_dtypes)
         if result is not None:
@@ -45,4 +45,15 @@ def single_arg_function_mangler(kernel, name, arg_dtypes):
     return None
 
 
+def tuple_function_mangler(kernel, name, arg_dtypes):
+    if name == "make_tuple":
+        from loopy.kernel.data import CallMangleInfo
+        return CallMangleInfo(
+                target_name="loopy_make_tuple",
+                result_dtypes=arg_dtypes,
+                arg_dtypes=arg_dtypes)
+
+    return None
+
+
 # vim: foldmethod=marker
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index f9648bde7dc4d685ca9daf63ecf15b69496c8651..0e5a093b76b8d09d331edead7c69fcc2e3134601 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -123,6 +123,7 @@ class ScalarReductionOperation(ReductionOperation):
 
 class SumReductionOperation(ScalarReductionOperation):
     def neutral_element(self, dtype):
+        # FIXME: Document that we always use an int here.
         return 0
 
     def __call__(self, dtype, operand1, operand2):
@@ -131,6 +132,7 @@ class SumReductionOperation(ScalarReductionOperation):
 
 class ProductReductionOperation(ScalarReductionOperation):
     def neutral_element(self, dtype):
+        # FIXME: Document that we always use an int here.
         return 1
 
     def __call__(self, dtype, operand1, operand2):
@@ -189,8 +191,30 @@ class MinReductionOperation(ScalarReductionOperation):
         return var("min")(operand1, operand2)
 
 
+# {{{ base class for symbolic reduction ops
+
+class ReductionOpFunction(FunctionIdentifier):
+    init_arg_names = ("reduction_op",)
+
+    def __init__(self, reduction_op):
+        self.reduction_op = reduction_op
+
+    def __getinitargs__(self):
+        return (self.reduction_op,)
+
+    @property
+    def name(self):
+        return self.__class__.__name__
+
+# }}}
+
+
 # {{{ segmented reduction
 
+class SegmentedOp(ReductionOpFunction):
+    pass
+
+
 class _SegmentedScalarReductionOperation(ReductionOperation):
     def __init__(self, **kwargs):
         self.inner_reduction = self.base_reduction_class(**kwargs)
@@ -205,7 +229,9 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
                 segment_flag_dtype.numpy_dtype.type.__name__)
 
     def neutral_element(self, scalar_dtype, segment_flag_dtype):
-        return SegmentedFunction(self, (scalar_dtype, segment_flag_dtype), "init")()
+        scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype)
+        return var("make_tuple")(scalar_neutral_element,
+                segment_flag_dtype.numpy_dtype.type(0))
 
     def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype):
         return (self.inner_reduction.result_dtypes(kernel, scalar_dtype)
@@ -221,7 +247,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
         return type(self) == type(other)
 
     def __call__(self, dtypes, operand1, operand2):
-        return SegmentedFunction(self, dtypes, "update")(*(operand1 + operand2))
+        return SegmentedOp(self)(*(operand1 + operand2))
 
 
 class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation):
@@ -236,45 +262,14 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
     which = "product"
 
 
-class SegmentedFunction(FunctionIdentifier):
-    init_arg_names = ("reduction_op", "dtypes", "name")
-
-    def __init__(self, reduction_op, dtypes, name):
-        """
-        :arg dtypes: A :class:`tuple` of `(scalar_dtype, segment_flag_dtype)`
-        """
-        self.reduction_op = reduction_op
-        self.dtypes = dtypes
-        self.name = name
-
-    @property
-    def scalar_dtype(self):
-        return self.dtypes[0]
-
-    @property
-    def segment_flag_dtype(self):
-        return self.dtypes[1]
-
-    def __getinitargs__(self):
-        return (self.reduction_op, self.dtypes, self.name)
-
-
-def get_segmented_function_preamble(kernel, func_id):
+def get_segmented_function_preamble(kernel, func_id, arg_dtypes):
     op = func_id.reduction_op
-    prefix = op.prefix(func_id.scalar_dtype, func_id.segment_flag_dtype)
-
-    from pymbolic.mapper.c_code import CCodeMapper
-
-    c_code_mapper = CCodeMapper()
+    scalar_dtype = arg_dtypes[0]
+    segment_flag_dtype = arg_dtypes[1]
+    prefix = op.prefix(scalar_dtype, segment_flag_dtype)
 
     return (prefix, """
-    inline %(scalar_t)s %(prefix)s_init(%(segment_flag_t)s *segment_flag_out)
-    {
-        *segment_flag_out = 0;
-        return %(neutral)s;
-    }
-
-    inline %(scalar_t)s %(prefix)s_update(
+    inline %(scalar_t)s %(prefix)s_op(
         %(scalar_t)s op1, %(segment_flag_t)s segment_flag1,
         %(scalar_t)s op2, %(segment_flag_t)s segment_flag2,
         %(segment_flag_t)s *segment_flag_out)
@@ -283,32 +278,36 @@ def get_segmented_function_preamble(kernel, func_id):
         return segment_flag2 ? op2 : %(combined)s;
     }
     """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype),
+            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
             prefix=prefix,
-            segment_flag_t=kernel.target.dtype_to_typename(
-                    func_id.segment_flag_dtype),
-            neutral=c_code_mapper(
-                    op.inner_reduction.neutral_element(func_id.scalar_dtype)),
+            segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype),
             combined=op.op % ("op1", "op2"),
             ))
 
-
 # }}}
 
 
 # {{{ argmin/argmax
 
+class ArgExtOp(ReductionOpFunction):
+    pass
+
+
 class _ArgExtremumReductionOperation(ReductionOperation):
     def prefix(self, scalar_dtype, index_dtype):
         return "loopy_arg%s_%s_%s" % (self.which,
-                index_dtype.numpy_dtype.type.__name__,
-                scalar_dtype.numpy_dtype.type.__name__)
+                scalar_dtype.numpy_dtype.type.__name__,
+                index_dtype.numpy_dtype.type.__name__)
 
     def result_dtypes(self, kernel, scalar_dtype, index_dtype):
         return (scalar_dtype, index_dtype)
 
     def neutral_element(self, scalar_dtype, index_dtype):
-        return ArgExtFunction(self, (scalar_dtype, index_dtype), "init")()
+        scalar_neutral_func = (
+                get_ge_neutral if self.neutral_sign < 0 else get_le_neutral)
+        scalar_neutral_element = scalar_neutral_func(scalar_dtype)
+        return var("make_tuple")(scalar_neutral_element,
+                index_dtype.numpy_dtype.type(-1))
 
     def __str__(self):
         return self.which
@@ -324,7 +323,7 @@ class _ArgExtremumReductionOperation(ReductionOperation):
         return 2
 
     def __call__(self, dtypes, operand1, operand2):
-        return ArgExtFunction(self, dtypes, "update")(*(operand1 + operand2))
+        return ArgExtOp(self)(*(operand1 + operand2))
 
 
 class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
@@ -339,44 +338,15 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation):
     neutral_sign = +1
 
 
-class ArgExtFunction(FunctionIdentifier):
-    init_arg_names = ("reduction_op", "dtypes", "name")
-
-    def __init__(self, reduction_op, dtypes, name):
-        self.reduction_op = reduction_op
-        self.dtypes = dtypes
-        self.name = name
-
-    @property
-    def scalar_dtype(self):
-        return self.dtypes[0]
-
-    @property
-    def index_dtype(self):
-        return self.dtypes[1]
-
-    def __getinitargs__(self):
-        return (self.reduction_op, self.dtypes, self.name)
-
-
-def get_argext_preamble(kernel, func_id):
+def get_argext_preamble(kernel, func_id, arg_dtypes):
     op = func_id.reduction_op
-    prefix = op.prefix(func_id.scalar_dtype, func_id.index_dtype)
-
-    from pymbolic.mapper.c_code import CCodeMapper
+    scalar_dtype = arg_dtypes[0]
+    index_dtype = arg_dtypes[1]
 
-    c_code_mapper = CCodeMapper()
-
-    neutral = get_ge_neutral if op.neutral_sign < 0 else get_le_neutral
+    prefix = op.prefix(scalar_dtype, index_dtype)
 
     return (prefix, """
-    inline %(scalar_t)s %(prefix)s_init(%(index_t)s *index_out)
-    {
-        *index_out = INT_MIN;
-        return %(neutral)s;
-    }
-
-    inline %(scalar_t)s %(prefix)s_update(
+    inline %(scalar_t)s %(prefix)s_op(
         %(scalar_t)s op1, %(index_t)s index1,
         %(scalar_t)s op2, %(index_t)s index2,
         %(index_t)s *index_out)
@@ -393,10 +363,9 @@ def get_argext_preamble(kernel, func_id):
         }
     }
     """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype),
+            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
             prefix=prefix,
-            index_t=kernel.target.dtype_to_typename(func_id.index_dtype),
-            neutral=c_code_mapper(neutral(func_id.scalar_dtype)),
+            index_t=kernel.target.dtype_to_typename(index_dtype),
             comp=op.update_comparison,
             ))
 
@@ -454,76 +423,48 @@ def parse_reduction_op(name):
 
 
 def reduction_function_mangler(kernel, func_id, arg_dtypes):
-    if isinstance(func_id, ArgExtFunction) and func_id.name == "init":
+    if isinstance(func_id, ArgExtOp):
         from loopy.target.opencl import CTarget
         if not isinstance(kernel.target, CTarget):
             raise LoopyError("%s: only C-like targets supported for now" % func_id)
 
         op = func_id.reduction_op
+        scalar_dtype = arg_dtypes[0]
+        index_dtype = arg_dtypes[1]
 
         from loopy.kernel.data import CallMangleInfo
         return CallMangleInfo(
-                target_name="%s_init" % op.prefix(
-                    func_id.scalar_dtype, func_id.index_dtype),
+                target_name="%s_op" % op.prefix(
+                    scalar_dtype, index_dtype),
                 result_dtypes=op.result_dtypes(
-                    kernel, func_id.scalar_dtype, func_id.index_dtype),
-                arg_dtypes=(),
-                )
-
-    elif isinstance(func_id, ArgExtFunction) and func_id.name == "update":
-        from loopy.target.opencl import CTarget
-        if not isinstance(kernel.target, CTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_update" % op.prefix(
-                    func_id.scalar_dtype, func_id.index_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, func_id.scalar_dtype, func_id.index_dtype),
+                    kernel, scalar_dtype, index_dtype),
                 arg_dtypes=(
-                    func_id.scalar_dtype,
-                    kernel.index_dtype,
-                    func_id.scalar_dtype,
-                    kernel.index_dtype),
-                )
-
-    elif isinstance(func_id, SegmentedFunction) and func_id.name == "init":
-        from loopy.target.opencl import CTarget
-        if not isinstance(kernel.target, CTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_init" % op.prefix(
-                    func_id.scalar_dtype, func_id.segment_flag_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, func_id.scalar_dtype, func_id.segment_flag_dtype),
-                arg_dtypes=(),
+                    scalar_dtype,
+                    index_dtype,
+                    scalar_dtype,
+                    index_dtype),
                 )
 
-    elif isinstance(func_id, SegmentedFunction) and func_id.name == "update":
+    elif isinstance(func_id, SegmentedOp):
         from loopy.target.opencl import CTarget
         if not isinstance(kernel.target, CTarget):
             raise LoopyError("%s: only C-like targets supported for now" % func_id)
 
         op = func_id.reduction_op
+        scalar_dtype = arg_dtypes[0]
+        segment_flag_dtype = arg_dtypes[1]
 
         from loopy.kernel.data import CallMangleInfo
         return CallMangleInfo(
-                target_name="%s_update" % op.prefix(
-                    func_id.scalar_dtype, func_id.segment_flag_dtype),
+                target_name="%s_op" % op.prefix(
+                    scalar_dtype, segment_flag_dtype),
                 result_dtypes=op.result_dtypes(
-                    kernel, func_id.scalar_dtype, func_id.segment_flag_dtype),
+                    kernel, scalar_dtype, segment_flag_dtype),
                 arg_dtypes=(
-                    func_id.scalar_dtype,
-                    func_id.segment_flag_dtype,
-                    func_id.scalar_dtype,
-                    func_id.segment_flag_dtype),
+                    scalar_dtype,
+                    segment_flag_dtype,
+                    scalar_dtype,
+                    segment_flag_dtype),
                 )
 
     return None
@@ -533,16 +474,18 @@ def reduction_preamble_generator(preamble_info):
     from loopy.target.opencl import OpenCLTarget
 
     for func in preamble_info.seen_functions:
-        if isinstance(func.name, ArgExtFunction):
+        if isinstance(func.name, ArgExtOp):
             if not isinstance(preamble_info.kernel.target, OpenCLTarget):
                 raise LoopyError("only OpenCL supported for now")
 
-            yield get_argext_preamble(preamble_info.kernel, func.name)
+            yield get_argext_preamble(preamble_info.kernel, func.name,
+                    func.arg_dtypes)
 
-        elif isinstance(func.name, SegmentedFunction):
+        elif isinstance(func.name, SegmentedOp):
             if not isinstance(preamble_info.kernel.target, OpenCLTarget):
                 raise LoopyError("only OpenCL supported for now")
 
-            yield get_segmented_function_preamble(preamble_info.kernel, func.name)
+            yield get_segmented_function_preamble(preamble_info.kernel, func.name,
+                    func.arg_dtypes)
 
 # vim: fdm=marker
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 57cf74b808ae1a7107e76a18a3876785ab8baabd..4281e50bd006a3cddf5a3cae0ffffe3d78abcfac 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -363,9 +363,12 @@ def gen_dependencies_except(kernel, insn_id, except_insn_ids):
 def get_priority_tiers(wanted, priorities):
     # Get highest priority tier candidates: These are the first inames
     # of all the given priority constraints
-    candidates = set(next(iter(p for p in prio if p in wanted))
-                     for prio in priorities
-                     )
+    candidates = set()
+    for prio in priorities:
+        for p in prio:
+            if p in wanted:
+                candidates.add(p)
+                break
 
     # Now shrink this set by removing those inames that are prohibited
     # by other constraints
@@ -383,19 +386,19 @@ def get_priority_tiers(wanted, priorities):
     candidates = candidates - set(bad_candidates)
 
     if candidates:
-        # We found a valid priority tier!
+        # We found a valid priority tier
         yield candidates
     else:
-        # If we did not, we stop the generator!
+        # If we did not, stop the generator
         return
 
-    # Now reduce the input data for recursion!
+    # Now reduce the input data for recursion
     priorities = frozenset([tuple(i for i in prio if i not in candidates)
                             for prio in priorities
                             ]) - frozenset([()])
     wanted = wanted - candidates
 
-    # Yield recursively!
+    # Yield recursively
     for tier in get_priority_tiers(wanted, priorities):
         yield tier
 
@@ -596,7 +599,8 @@ class SchedulerState(ImmutableRecord):
     .. attribute:: preschedule
 
         A sequence of schedule items that must be inserted into the
-        schedule, maintaining the same ordering
+        schedule, maintaining the same relative ordering. Newly scheduled
+        items may be interleaved with this sequence.
 
     .. attribute:: prescheduled_insn_ids
 
@@ -728,13 +732,15 @@ def generate_loop_schedules_internal(
 
     # }}}
 
-    # {{{ see if there are pending local barriers in the preschedule
+    # {{{ see if there are pending barriers in the preschedule
 
-    # Local barriers do not have associated instructions, so they need to
-    # be handled separately from instructions.
+    # Barriers that do not have an originating instruction are handled here.
+    # (These are automatically inserted by insert_barriers().) Barriers with
+    # originating instructions are handled as part of normal instruction
+    # scheduling below.
     if (
             isinstance(next_preschedule_item, Barrier)
-            and next_preschedule_item.kind == "local"):
+            and next_preschedule_item.originating_insn_id is None):
         for result in generate_loop_schedules_internal(
                     sched_state.copy(
                         schedule=sched_state.schedule + (next_preschedule_item,),
@@ -810,10 +816,7 @@ def generate_loop_schedules_internal(
         if insn_id in sched_state.prescheduled_insn_ids:
             if isinstance(next_preschedule_item, RunInstruction):
                 next_preschedule_insn_id = next_preschedule_item.insn_id
-            elif (
-                    isinstance(next_preschedule_item, Barrier)
-                    and next_preschedule_item.kind == "global"):
-                assert hasattr(next_preschedule_item, "originating_insn_id")
+            elif isinstance(next_preschedule_item, Barrier):
                 assert next_preschedule_item.originating_insn_id is not None
                 next_preschedule_insn_id = next_preschedule_item.originating_insn_id
             else:
@@ -1073,28 +1076,6 @@ def generate_loop_schedules_internal(
                           % iname)
                 continue
 
-            if (
-                    not sched_state.within_subkernel
-                    and iname not in sched_state.prescheduled_inames):
-                # Avoid messing up some orderings such as picking:
-                #
-                # EnterLoop(temporary.reload)
-                # CallKernel
-                # ...
-                #
-                # instead of
-                #
-                # CallKernel
-                # EnterLoop(temporary.reload)
-                # ...
-                #
-                # This serves a heuristic to catch some bad decisions early, the
-                # scheduler will not allow the first variant regardless.
-                if debug_mode:
-                    print("scheduling '%s' prohibited because we are outside "
-                          "a subkernel" % iname)
-                continue
-
             currently_accessible_inames = (
                     active_inames_set | sched_state.parallel_inames)
             if (
@@ -1624,7 +1605,10 @@ def append_barrier_or_raise_error(schedule, dep, verify_only):
         comment = "for %s (%s)" % (
                 dep.variable, dep.dep_descr.format(
                     tgt=dep.target.id, src=dep.source.id))
-        schedule.append(Barrier(comment=comment, kind=dep.var_kind))
+        schedule.append(Barrier(
+            comment=comment,
+            kind=dep.var_kind,
+            originating_insn_id=None))
 
 
 def insert_barriers(kernel, schedule, kind, verify_only, level=0):
@@ -1771,15 +1755,10 @@ def insert_barriers(kernel, schedule, kind, verify_only, level=0):
 # {{{ main scheduling entrypoint
 
 def generate_loop_schedules(kernel, debug_args={}):
-    import sys
-    rec_limit = sys.getrecursionlimit()
-    new_limit = max(rec_limit, len(kernel.instructions) * 2)
-    sys.setrecursionlimit(new_limit)
-    try:
+    from pytools import MinRecursionLimit
+    with MinRecursionLimit(len(kernel.instructions) * 2):
         for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args):
             yield sched
-    finally:
-        sys.setrecursionlimit(rec_limit)
 
 
 def generate_loop_schedules_inner(kernel, debug_args={}):
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 6f82eadd70443facf729711cd922bd8b754a2065..3b4fed215a35fa13a52e3f3901955dab2621dff0 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -656,6 +656,29 @@ class CASTBuilder(ASTBuilderBase):
             lhs_expr, rhs_expr, lhs_dtype):
         raise NotImplementedError("atomic updates in %s" % type(self).__name__)
 
+    def emit_tuple_assignment(self, codegen_state, insn):
+        ecm = codegen_state.expression_to_code_mapper
+
+        from cgen import Assign, block_if_necessary
+        assignments = []
+
+        for i, (assignee, parameter) in enumerate(
+                zip(insn.assignees, insn.expression.parameters)):
+            lhs_code = ecm(assignee, prec=PREC_NONE, type_context=None)
+            assignee_var_name = insn.assignee_var_names()[i]
+            lhs_var = codegen_state.kernel.get_var_descriptor(assignee_var_name)
+            lhs_dtype = lhs_var.dtype
+
+            from loopy.expression import dtype_to_type_context
+            rhs_type_context = dtype_to_type_context(
+                    codegen_state.kernel.target, lhs_dtype)
+            rhs_code = ecm(parameter, prec=PREC_NONE,
+                    type_context=rhs_type_context, needed_dtype=lhs_dtype)
+
+            assignments.append(Assign(lhs_code, rhs_code))
+
+        return block_if_necessary(assignments)
+
     def emit_multiple_assignment(self, codegen_state, insn):
         ecm = codegen_state.expression_to_code_mapper
 
@@ -682,6 +705,10 @@ class CASTBuilder(ASTBuilderBase):
 
         assert mangle_result.arg_dtypes is not None
 
+        if mangle_result.target_name == "loopy_make_tuple":
+        # This shortcut avoids actually having to emit a 'make_tuple' function.
+            return self.emit_tuple_assignment(codegen_state, insn)
+
         from loopy.expression import dtype_to_type_context
         c_parameters = [
                 ecm(par, PREC_NONE,
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 01e56405e30285705be7cb8eb6d75479c8658ef5..a5f7562c41c3ec8eca673904550e078d2a992241 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -390,10 +390,11 @@ class OpenCLCASTBuilder(CASTBuilder):
 
     def preamble_generators(self):
         from loopy.library.reduction import reduction_preamble_generator
+
         return (
                 super(OpenCLCASTBuilder, self).preamble_generators() + [
                     opencl_preamble_generator,
-                    reduction_preamble_generator
+                    reduction_preamble_generator,
                     ])
 
     # }}}
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 78d817ce73724d90a6cc6f380b24290971f6c1e7..409cbbc5ebd5feb13b04eeba1671f639663bfcf1 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -232,7 +232,8 @@ class TypeInferenceMapper(CombineMapper):
             # Codegen for complex types depends on exactly correct types.
             # Refuse temptation to guess.
             raise TypeInferenceFailure("Complex constant '%s' needs to "
-                    "be sized for type inference " % expr)
+                    "be sized (i.e. as numpy.complex64/128) for type inference "
+                    % expr)
         else:
             raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr)
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 2ac1026c0660573d97cd2f65c0502ec69b63803d..d7b1f37c18f71527bcb31f856d2f6d09fbc9df9a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1046,6 +1046,24 @@ def test_within_inames_and_reduction():
     print(k.stringify(with_dependencies=True))
 
 
+def test_literal_local_barrier(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            """
+            for i
+                ... lbarrier
+            end
+            """, seq_dependencies=True)
+
+    knl = lp.fix_parameters(knl, n=128)
+
+    ref_knl = knl
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+
+
 def test_kernel_splitting(ctx_factory):
     ctx = ctx_factory()
 
@@ -1317,6 +1335,28 @@ def test_save_of_local_array(ctx_factory, debug=False):
     save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
 
 
+def test_save_of_local_array_with_explicit_local_barrier(ctx_factory, debug=False):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+        "{ [i,j]: 0<=i,j<8 }",
+        """
+        for i, j
+            <>t[2*j] = j
+            ... lbarrier
+            t[2*j+1] = t[2*j]
+            ... gbarrier
+            out[i] = t[2*i]
+        end
+        """, seq_dependencies=True)
+
+    knl = lp.set_temporary_scope(knl, "t", "local")
+    knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
+
+    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
+
+
 def test_save_local_multidim_array(ctx_factory, debug=False):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -2087,6 +2127,47 @@ def test_integer_reduction(ctx_factory):
             assert function(out)
 
 
+def test_complicated_argmin_reduction(ctx_factory):
+    cl_ctx = ctx_factory()
+    knl = lp.make_kernel(
+            "{[ictr,itgt,idim]: "
+            "0<=itgt<ntargets "
+            "and 0<=ictr<ncenters "
+            "and 0<=idim<ambient_dim}",
+
+            """
+            for itgt
+                for ictr
+                    <> dist_sq = sum(idim,
+                            (tgt[idim,itgt] - center[idim,ictr])**2)
+                    <> in_disk = dist_sq < (radius[ictr]*1.05)**2
+                    <> matches = (
+                            (in_disk
+                                and qbx_forced_limit == 0)
+                            or (in_disk
+                                    and qbx_forced_limit != 0
+                                    and qbx_forced_limit * center_side[ictr] > 0)
+                            )
+
+                    <> post_dist_sq = if(matches, dist_sq, HUGE)
+                end
+                <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)
+
+                tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
+            end
+            """)
+
+    knl = lp.fix_parameters(knl, ambient_dim=2)
+    knl = lp.add_and_infer_dtypes(knl, {
+            "tgt,center,radius,HUGE": np.float32,
+            "center_side,qbx_forced_limit": np.int32,
+            })
+
+    lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={
+            "HUGE": 1e20, "ncenters": 200, "ntargets": 300,
+            "qbx_forced_limit": 1})
+
+
 def test_nosync_option_parsing():
     knl = lp.make_kernel(
         "{[i]: 0 <= i < 10}",
@@ -2335,6 +2416,21 @@ def test_kernel_var_name_generator():
     assert vng("b") != "b"
 
 
+def test_execution_backend_can_cache_dtypes(ctx_factory):
+    # When the kernel is invoked, the execution backend uses it as a cache key
+    # for the type inference and scheduling cache. This tests to make sure that
+    # dtypes in the kernel can be cached, even though they may not have a
+    # target.
+
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel("{[i]: 0 <= i < 10}", "<>tmp[i] = i")
+    knl = lp.add_dtypes(knl, dict(tmp=int))
+
+    knl(queue)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_target.py b/test/test_target.py
index b656383e7bbe008892f45159faadd2d195d67a3b..ad0cb7439bfdd6200e020c0becadcd73072ceef4 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -176,6 +176,22 @@ def test_random123(ctx_factory, tp):
     assert (0 <= out).all()
 
 
+def test_tuple(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{ [i]: 0 = i }",
+            """
+            a, b = make_tuple(1, 2.)
+            """)
+
+    evt, (a, b) = knl(queue)
+
+    assert a.get() == 1
+    assert b.get() == 2.
+
+
 def test_clamp(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)