diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index d151a2128754373559ea8a45730609782adb33d7..560facd63f183e1113ccb7cb94ff1953aced6e7d 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -217,9 +217,9 @@ These are usually key-value pairs. The following attributes are recognized:
   dependency is that the code generated for this instruction is required to
   appear textually after all of these dependees' generated code.
 
-  Identifiers here are allowed to be wildcards as defined by
-  the Python module :mod:`fnmatchcase`. This is helpful in conjunction
-  with ``id_prefix``.
+  Identifiers here are allowed to be wildcards as defined by the Python
+  function :func:`fnmatch.fnmatchcase`. This is helpful in conjunction with
+  ``id_prefix``.
 
   .. note::
 
@@ -242,6 +242,15 @@ These are usually key-value pairs. The following attributes are recognized:
       heuristic and indicate that the specified list of dependencies is
       exhaustive.
 
+* ``nosync=id1:id2`` prescribes that no barrier synchronization is necessary
+  between the instructions with identifiers ``id1`` and ``id2``, even if
+  a dependency chain exists and variables are accessed in an apparently
+  racy way.
+
+  Identifiers here are allowed to be wildcards as defined by the Python
+  function :func:`fnmatch.fnmatchcase`. This is helpful in conjunction with
+  ``id_prefix``.
+
 * ``priority=integer`` sets the instructions priority to the value
   ``integer``. Instructions with higher priority will be scheduled sooner,
   if possible. Note that the scheduler may still schedule a lower-priority
@@ -284,6 +293,11 @@ Loopy's expressions are a slight superset of the expressions supported by
 TODO: Functions
 TODO: Reductions
 
+Function Call Instructions
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: CallInstruction
+
 C Block Instructions
 ^^^^^^^^^^^^^^^^^^^^
 
@@ -459,6 +473,8 @@ Targets
 
 .. automodule:: loopy.target
 
+.. currentmodule:: loopy
+
 Helper values
 -------------
 
@@ -470,6 +486,27 @@ Helper values
 
 .. }}}
 
+Libraries: Extending and Interfacing with External Functionality
+----------------------------------------------------------------
+
+.. _symbols:
+
+Symbols
+^^^^^^^
+
+.. _functions:
+
+Functions
+^^^^^^^^^
+
+.. autoclass:: CallMangleInfo
+
+.. _reductions:
+
+Reductions
+^^^^^^^^^^
+
+
 The Kernel Object
 -----------------
 
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index f92cfbf67c0d2af723db8f6fa0ee67ebb14c08fe..d085c1215781f52a8f0c79359e8ac7061d94ffb7 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -68,6 +68,8 @@ Manipulating Instructions
 
 .. autofunction:: remove_instructions
 
+.. autofunction:: replace_instruction_ids
+
 .. autofunction:: tag_instructions
 
 Registering Library Routines
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index bc8912527dcefa2ddf43d5adb6a83173900d69d4..cde36163d09a5a7d31a1afc97e0bfcaa636cdda3 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -563,8 +563,9 @@ relation to loop nesting. For example, it's perfectly possible to request
     #define lid(N) ((int) get_local_id(N))
     ...
       for (int i_inner = 0; i_inner <= 15; ++i_inner)
-        for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
-          a[i_inner + i_outer * 16] = 0.0f;
+        if (-1 + -1 * i_inner + n >= 0)
+          for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + ((15 + n + 15 * i_inner) / 16); ++i_outer)
+            a[i_inner + i_outer * 16] = 0.0f;
     ...
 
 Notice how loopy has automatically generated guard conditionals to make
@@ -1105,7 +1106,7 @@ Attempting to create this kernel results in an error:
     ... # While trying to find shape axis 0 of argument 'out', the following exception occurred:
     Traceback (most recent call last):
     ...
-    StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n = 1; [(n)] : n >= 2; [(1)] : n <= 0 }'
+    StaticValueFindingError: a static maximum was not found for PwAff '[n] -> { [(1)] : n <= 1; [(n)] : n >= 2 }'
 
 The problem is that loopy cannot find a simple, universally valid expression
 for the length of *out* in this case. Notice how the kernel accesses both the
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 22022c0f63540c56ad65b9813ea576e491dd7fde..10bd1100126e79ff2451eaf96c45bd4be47adf3a 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -41,9 +41,12 @@ from loopy.kernel.data import (
         KernelArgument,
         ValueArg, GlobalArg, ConstantArg, ImageArg,
         memory_ordering, memory_scope, VarAtomicity, AtomicInit, AtomicUpdate,
-        InstructionBase, Assignment, ExpressionInstruction, CInstruction,
+        InstructionBase,
+        MultiAssignmentBase, Assignment, ExpressionInstruction,
+        CallInstruction, CInstruction,
         temp_var_scope, TemporaryVariable,
-        SubstitutionRule)
+        SubstitutionRule,
+        CallMangleInfo)
 
 from loopy.kernel import LoopKernel, kernel_state
 from loopy.kernel.tools import (
@@ -67,7 +70,9 @@ from loopy.transform.iname import (
 from loopy.transform.instruction import (
         find_instructions, map_instructions,
         set_instruction_priority, add_dependency,
-        remove_instructions, tag_instructions)
+        remove_instructions,
+        replace_instruction_ids,
+        tag_instructions)
 
 from loopy.transform.data import (
         add_prefetch, change_arg_to_image, tag_data_axes,
@@ -103,7 +108,8 @@ from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (get_op_poly, sum_ops_to_dtypes,
         get_gmem_access_poly,
         get_DRAM_access_poly, get_barrier_poly, stringify_stats_mapping,
-        sum_mem_access_to_bytes)
+        sum_mem_access_to_bytes,
+        gather_access_footprints, gather_access_footprint_bytes)
 from loopy.codegen import generate_code, generate_body
 from loopy.compiled import CompiledKernel
 from loopy.options import Options
@@ -131,8 +137,11 @@ __all__ = [
         "ValueArg", "GlobalArg", "ConstantArg", "ImageArg",
         "temp_var_scope", "TemporaryVariable",
         "SubstitutionRule",
+        "CallMangleInfo",
 
-        "InstructionBase", "Assignment", "ExpressionInstruction", "CInstruction",
+        "InstructionBase",
+        "MultiAssignmentBase", "Assignment", "ExpressionInstruction",
+        "CallInstruction", "CInstruction",
 
         "default_function_mangler", "single_arg_function_mangler",
 
@@ -157,7 +166,9 @@ __all__ = [
 
         "find_instructions", "map_instructions",
         "set_instruction_priority", "add_dependency",
-        "remove_instructions", "tag_instructions",
+        "remove_instructions",
+        "replace_instruction_ids",
+        "tag_instructions",
 
         "extract_subst", "expand_subst", "assignment_to_subst",
         "find_rules_matching", "find_one_rule_matching",
@@ -190,6 +201,7 @@ __all__ = [
         "get_op_poly", "sum_ops_to_dtypes", "get_gmem_access_poly",
         "get_DRAM_access_poly",
         "get_barrier_poly", "stringify_stats_mapping", "sum_mem_access_to_bytes",
+        "gather_access_footprints", "gather_access_footprint_bytes",
 
         "CompiledKernel",
 
@@ -278,6 +290,11 @@ def register_symbol_manglers(kernel, manglers):
 
 
 def register_function_manglers(kernel, manglers):
+    """
+    :arg manglers: list of functions of signature ``(target, name, arg_dtypes)``
+        returning a :class:`loopy.CallMangleInfo`.
+    :returns: *kernel* with *manglers* registered
+    """
     new_manglers = kernel.function_manglers[:]
     for m in manglers:
         if m not in new_manglers:
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 2ef5295c5cb157179a3eeaf952cebe4e65f2d5f2..f34a943831c68bd0d2cf6089d8b70ad4bc43022e 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -64,9 +64,9 @@ def fill_rand(ary):
         real_dtype = ary.dtype.type(0).real.dtype
         real_ary = ary.view(real_dtype)
 
-        fill_rand(real_ary, luxury=0)
+        fill_rand(real_ary)
     else:
-        fill_rand(ary, luxury=0)
+        fill_rand(ary)
 
 
 class TestArgInfo(Record):
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index d3a7ae42c078fc39c5844e3cb8047900ae347dc1..7b95f59482c826473e069d9f07a80dd97d7ea0ce 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -61,10 +61,12 @@ def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, st
 
 
 def generate_instruction_code(kernel, insn, codegen_state):
-    from loopy.kernel.data import Assignment, CInstruction
+    from loopy.kernel.data import Assignment, CallInstruction, CInstruction
 
     if isinstance(insn, Assignment):
         result = generate_expr_instruction_code(kernel, insn, codegen_state)
+    elif isinstance(insn, CallInstruction):
+        result = generate_call_code(kernel, insn, codegen_state)
     elif isinstance(insn, CInstruction):
         result = generate_c_instruction_code(kernel, insn, codegen_state)
     else:
@@ -218,12 +220,32 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
     return result
 
 
+def generate_call_code(kernel, insn, codegen_state):
+    # {{{ vectorization handling
+
+    if codegen_state.vectorization_info:
+        if insn.atomicity:
+            raise Unvectorizable("function call")
+
+    # }}}
+
+    result = kernel.target.generate_multiple_assignment(
+            codegen_state, insn)
+
+    # {{{ tracing
+
+    if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
+        raise NotImplementedError("tracing of multi-output function calls")
+
+    # }}}
+
+    return result
+
+
 def generate_c_instruction_code(kernel, insn, codegen_state):
     if codegen_state.vectorization_info is not None:
         raise Unvectorizable("C instructions cannot be vectorized")
 
-    ecm = codegen_state.expression_to_code_mapper
-
     body = []
 
     from loopy.codegen import POD
diff --git a/loopy/expression.py b/loopy/expression.py
index 16c09b82821044f880e0f459af980da13526211b..6fd49661d617f65d2cdbc5773c3a59955da8c19c 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -199,7 +199,7 @@ class TypeInferenceMapper(CombineMapper):
     def map_linear_subscript(self, expr):
         return self.rec(expr.aggregate)
 
-    def map_call(self, expr):
+    def map_call(self, expr, multiple_types_ok=False):
         from pymbolic.primitives import Variable
 
         identifier = expr.function
@@ -212,11 +212,20 @@ class TypeInferenceMapper(CombineMapper):
         arg_dtypes = tuple(self.rec(par) for par in expr.parameters)
 
         mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
-        if mangle_result is not None:
-            return mangle_result[0]
+        if multiple_types_ok:
+            if mangle_result is not None:
+                return mangle_result.result_dtypes
+        else:
+            if mangle_result is not None:
+                if len(mangle_result.result_dtypes) != 1 and not multiple_types_ok:
+                    raise LoopyError("functions with more or fewer than one "
+                            "return value may only be used in direct assignments")
 
-        raise RuntimeError("no type inference information on "
-                "function '%s'" % identifier)
+                return mangle_result.result_dtypes[0]
+
+        raise RuntimeError("unable to resolve "
+                "function '%s' with %d given arguments"
+                % (identifier, len(arg_dtypes)))
 
     def map_variable(self, expr):
         if expr.name in self.kernel.all_inames():
@@ -267,7 +276,8 @@ class TypeInferenceMapper(CombineMapper):
 
     def map_lookup(self, expr):
         agg_result = self.rec(expr.aggregate)
-        dtype, offset = agg_result.numpy_dtype.fields[expr.name]
+        field = agg_result.numpy_dtype.fields[expr.name]
+        dtype = field[0]
         return NumpyType(dtype)
 
     def map_comparison(self, expr):
@@ -285,9 +295,19 @@ class TypeInferenceMapper(CombineMapper):
     def map_local_hw_index(self, expr, *args):
         return self.kernel.index_dtype
 
-    def map_reduction(self, expr):
-        return expr.operation.result_dtype(
-                self.kernel.target, self.rec(expr.expr), expr.inames)
+    def map_reduction(self, expr, multiple_types_ok=False):
+        result = expr.operation.result_dtypes(
+                self.kernel, self.rec(expr.expr), expr.inames)
+
+        if multiple_types_ok:
+            return result
+
+        else:
+            if len(result) != 1 and not multiple_types_ok:
+                raise LoopyError("reductions with more or fewer than one "
+                        "return value may only be used in direct assignments")
+
+            return result[0]
 
 # }}}
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index d7009900b5683d43effb594cf5b38498db8e09c1..7baea324346b79e40d7ca601a99644727dd66e95 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -313,7 +313,35 @@ class LoopKernel(RecordWithoutPickling):
         for mangler in manglers:
             mangle_result = mangler(self, identifier, arg_dtypes)
             if mangle_result is not None:
-                return mangle_result
+                from loopy.kernel.data import CallMangleInfo
+                if isinstance(mangle_result, CallMangleInfo):
+                    assert len(mangle_result.arg_dtypes) == len(arg_dtypes)
+                    return mangle_result
+
+                assert isinstance(mangle_result, tuple)
+
+                from warnings import warn
+                warn("'%s' returned a tuple instead of a CallMangleInfo instance. "
+                        "This is deprecated." % mangler.__name__,
+                        DeprecationWarning)
+
+                if len(mangle_result) == 2:
+                    result_dtype, target_name = mangle_result
+                    return CallMangleInfo(
+                            target_name=target_name,
+                            result_dtypes=(result_dtype,),
+                            arg_dtypes=None)
+
+                elif len(mangle_result) == 3:
+                    result_dtype, target_name, actual_arg_dtypes = mangle_result
+                    return CallMangleInfo(
+                            target_name=target_name,
+                            result_dtypes=(result_dtype,),
+                            arg_dtypes=actual_arg_dtypes)
+
+                else:
+                    raise ValueError("unexpected size of tuple returned by '%s'"
+                            % mangler.__name__)
 
         return None
 
@@ -1027,8 +1055,8 @@ class LoopKernel(RecordWithoutPickling):
             for dep_id in sorted(insn.depends_on):
                 print_insn(kernel.id_to_insn[dep_id])
 
-            if isinstance(insn, lp.Assignment):
-                lhs = str(insn.assignee)
+            if isinstance(insn, lp.MultiAssignmentBase):
+                lhs = ", ".join(str(a) for a in insn.assignees)
                 rhs = str(insn.expression)
                 trailing = []
             elif isinstance(insn, lp.CInstruction):
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index aedc1edc650d570cdeec1ecca8dc08989678df65..9fe0f5b7904c47b26b8d0b1b4c4ea5ce039da713 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -29,7 +29,9 @@ import numpy as np
 from loopy.tools import intern_frozenset_of_ids
 from loopy.symbolic import IdentityMapper, WalkMapper
 from loopy.kernel.data import (
-        InstructionBase, Assignment, SubstitutionRule)
+        InstructionBase,
+        MultiAssignmentBase, Assignment,
+        SubstitutionRule)
 from loopy.diagnostic import LoopyError
 import islpy as isl
 from islpy import dim_type
@@ -147,7 +149,6 @@ def expand_defines_in_expr(expr, defines):
 # {{{ parse instructions
 
 INSN_RE = re.compile(
-        "\s*(?:\<(?P<temp_var_type>.*?)\>)?"
         "\s*(?P<lhs>.+?)\s*(?<!\:)=\s*(?P<rhs>.+?)"
         "\s*?(?:\{(?P<options>.+)\}\s*)?$"
         )
@@ -159,7 +160,8 @@ SUBST_RE = re.compile(
 def parse_insn(insn):
     """
     :return: a tuple ``(insn, inames_to_dup)``, where insn is a
-        :class:`Assignment` or a :class:`SubstitutionRule`
+        :class:`Assignment`, a :class:`CallInstruction`,
+        or a :class:`SubstitutionRule`
         and *inames_to_dup* is None or a list of tuples `(old, new)`.
     """
 
@@ -192,21 +194,74 @@ def parse_insn(insn):
                 "the following error occurred:" % groups["rhs"])
         raise
 
-    from pymbolic.primitives import Variable, Subscript, Call
-    if isinstance(lhs, Variable):
-        assignee_name = lhs.name
-    elif isinstance(lhs, Subscript):
-        assignee_name = lhs.aggregate.name
-    elif isinstance(lhs, Call):
-        assignee_name = None
-        assert subst_match is not None
-    else:
-        raise LoopyError("left hand side of assignment '%s' must "
-                "be variable or subscript" % lhs)
+    from pymbolic.primitives import Variable, Call, Subscript
+    from loopy.symbolic import TypeAnnotation
+
+    # {{{ deal with subst rules
+
+    if subst_match is not None:
+        assert insn_match is None
+        if isinstance(lhs, Variable):
+            subst_name = lhs.name
+            arg_names = []
+        elif isinstance(lhs, Call):
+            if not isinstance(lhs.function, Variable):
+                raise RuntimeError("Invalid substitution rule left-hand side")
+            subst_name = lhs.function.name
+            arg_names = []
+
+            for i, arg in enumerate(lhs.parameters):
+                if not isinstance(arg, Variable):
+                    raise RuntimeError("Invalid substitution rule "
+                                    "left-hand side: %s--arg number %d "
+                                    "is not a variable" % (lhs, i))
+                arg_names.append(arg.name)
+        else:
+            raise RuntimeError("Invalid substitution rule left-hand side")
+
+        return SubstitutionRule(
+                name=subst_name,
+                arguments=tuple(arg_names),
+                expression=rhs), []
+
+    # }}}
+
+    if not isinstance(lhs, tuple):
+        lhs = (lhs,)
+
+    temp_var_types = []
+    new_lhs = []
+    assignee_names = []
+
+    for lhs_i in lhs:
+        if isinstance(lhs_i, TypeAnnotation):
+            if lhs_i.type is None:
+                temp_var_types.append(lp.auto)
+            else:
+                temp_var_types.append(lhs_i.type)
+
+            lhs_i = lhs_i.child
+        else:
+            temp_var_types.append(None)
+
+        if isinstance(lhs_i, Variable):
+            assignee_names.append(lhs_i.name)
+        elif isinstance(lhs_i, Subscript):
+            assignee_names.append(lhs_i.aggregate.name)
+        else:
+            raise LoopyError("left hand side of assignment '%s' must "
+                    "be variable or subscript" % (lhs_i,))
+
+        new_lhs.append(lhs_i)
+
+    lhs = tuple(new_lhs)
+    temp_var_types = tuple(temp_var_types)
+    del new_lhs
 
     if insn_match is not None:
         depends_on = None
         depends_on_is_final = False
+        no_sync_with = None
         insn_groups = None
         conflicts_with_groups = None
         insn_id = None
@@ -256,6 +311,11 @@ def parse_insn(insn):
                             intern(dep.strip()) for dep in opt_value.split(":")
                             if dep.strip())
 
+                elif opt_key == "nosync" and opt_value is not None:
+                    no_sync_with = frozenset(
+                            intern(dep.strip()) for dep in opt_value.split(":")
+                            if dep.strip())
+
                 elif opt_key == "groups" and opt_value is not None:
                     insn_groups = frozenset(
                             intern(grp.strip()) for grp in opt_value.split(":")
@@ -284,6 +344,11 @@ def parse_insn(insn):
                             if tag.strip())
 
                 elif opt_key == "atomic":
+                    if len(assignee_names) != 1:
+                        raise LoopyError("atomic operations with more than one "
+                                "left-hand side not supported")
+                    assignee_name, = assignee_names
+
                     if opt_value is None:
                         atomicity = atomicity + (
                                 lp.AtomicUpdate(assignee_name),)
@@ -296,6 +361,7 @@ def parse_insn(insn):
                                 raise LoopyError("atomicity directive not "
                                         "understood: %s"
                                         % v)
+                    del assignee_name
 
                 else:
                     raise ValueError(
@@ -303,58 +369,27 @@ def parse_insn(insn):
                             "(maybe a missing/extraneous =value?)"
                             % opt_key)
 
-        if groups["temp_var_type"] is not None:
-            if groups["temp_var_type"]:
-                temp_var_type = np.dtype(groups["temp_var_type"])
-            else:
-                import loopy as lp
-                temp_var_type = lp.auto
-        else:
-            temp_var_type = None
-
-        return Assignment(
+        kwargs = dict(
                     id=(
                         intern(insn_id)
                         if isinstance(insn_id, str)
                         else insn_id),
                     depends_on=depends_on,
                     depends_on_is_final=depends_on_is_final,
+                    no_sync_with=no_sync_with,
                     groups=insn_groups,
                     conflicts_with_groups=conflicts_with_groups,
                     forced_iname_deps_is_final=forced_iname_deps_is_final,
                     forced_iname_deps=forced_iname_deps,
-                    assignee=lhs, expression=rhs,
-                    temp_var_type=temp_var_type,
-                    atomicity=atomicity,
                     priority=priority,
                     predicates=predicates,
-                    tags=tags), inames_to_dup
-
-    elif subst_match is not None:
-        from pymbolic.primitives import Variable, Call
-
-        if isinstance(lhs, Variable):
-            subst_name = lhs.name
-            arg_names = []
-        elif isinstance(lhs, Call):
-            if not isinstance(lhs.function, Variable):
-                raise RuntimeError("Invalid substitution rule left-hand side")
-            subst_name = lhs.function.name
-            arg_names = []
-
-            for i, arg in enumerate(lhs.parameters):
-                if not isinstance(arg, Variable):
-                    raise RuntimeError("Invalid substitution rule "
-                                    "left-hand side: %s--arg number %d "
-                                    "is not a variable" % (lhs, i))
-                arg_names.append(arg.name)
-        else:
-            raise RuntimeError("Invalid substitution rule left-hand side")
+                    tags=tags,
+                    atomicity=atomicity)
 
-        return SubstitutionRule(
-                name=subst_name,
-                arguments=tuple(arg_names),
-                expression=rhs), []
+        from loopy.kernel.data import make_assignment
+        return make_assignment(
+                lhs, rhs, temp_var_types, **kwargs
+                ), inames_to_dup
 
 
 def parse_if_necessary(insn, defines):
@@ -515,13 +550,13 @@ class ArgumentGuesser:
         self.all_written_names = set()
         from loopy.symbolic import get_dependencies
         for insn in instructions:
-            if isinstance(insn, Assignment):
-                (assignee_var_name, _), = insn.assignees_and_indices()
-                self.all_written_names.add(assignee_var_name)
-                self.all_names.update(get_dependencies(
-                    self.submap(insn.assignee)))
-                self.all_names.update(get_dependencies(
-                    self.submap(insn.expression)))
+            if isinstance(insn, MultiAssignmentBase):
+                for assignee_var_name, _ in insn.assignees_and_indices():
+                    self.all_written_names.add(assignee_var_name)
+                    self.all_names.update(get_dependencies(
+                        self.submap(insn.assignees)))
+                    self.all_names.update(get_dependencies(
+                        self.submap(insn.expression)))
 
     def find_index_rank(self, name):
         irf = IndexRankFinder(name)
@@ -583,10 +618,12 @@ class ArgumentGuesser:
         temp_var_names = set(six.iterkeys(self.temporary_variables))
 
         for insn in self.instructions:
-            if isinstance(insn, Assignment):
-                if insn.temp_var_type is not None:
-                    (assignee_var_name, _), = insn.assignees_and_indices()
-                    temp_var_names.add(assignee_var_name)
+            if isinstance(insn, MultiAssignmentBase):
+                for (assignee_var_name, _), temp_var_type in zip(
+                        insn.assignees_and_indices(),
+                        insn.temp_var_types):
+                    if temp_var_type is not None:
+                        temp_var_names.add(assignee_var_name)
 
         # }}}
 
@@ -780,7 +817,7 @@ def expand_cses(instructions, cse_prefix="cse_expr"):
     new_temp_vars = []
 
     for insn in instructions:
-        if isinstance(insn, Assignment):
+        if isinstance(insn, MultiAssignmentBase):
             new_insns.append(insn.copy(expression=cseam(insn.expression)))
         else:
             new_insns.append(insn)
@@ -799,29 +836,36 @@ def create_temporaries(knl, default_order):
     import loopy as lp
 
     for insn in knl.instructions:
-        if isinstance(insn, Assignment) \
-                and insn.temp_var_type is not None:
-            (assignee_name, _), = insn.assignees_and_indices()
-
-            if assignee_name in new_temp_vars:
-                raise RuntimeError("cannot create temporary variable '%s'--"
-                        "already exists" % assignee_name)
-            if assignee_name in knl.arg_dict:
-                raise RuntimeError("cannot create temporary variable '%s'--"
-                        "already exists as argument" % assignee_name)
-
-            logger.debug("%s: creating temporary %s"
-                    % (knl.name, assignee_name))
-
-            new_temp_vars[assignee_name] = lp.TemporaryVariable(
-                    name=assignee_name,
-                    dtype=insn.temp_var_type,
-                    is_local=lp.auto,
-                    base_indices=lp.auto,
-                    shape=lp.auto,
-                    order=default_order)
-
-            insn = insn.copy(temp_var_type=None)
+        if isinstance(insn, MultiAssignmentBase):
+            for (assignee_name, _), temp_var_type in zip(
+                    insn.assignees_and_indices(),
+                    insn.temp_var_types):
+
+                if temp_var_type is None:
+                    continue
+
+                if assignee_name in new_temp_vars:
+                    raise RuntimeError("cannot create temporary variable '%s'--"
+                            "already exists" % assignee_name)
+                if assignee_name in knl.arg_dict:
+                    raise RuntimeError("cannot create temporary variable '%s'--"
+                            "already exists as argument" % assignee_name)
+
+                logger.debug("%s: creating temporary %s"
+                        % (knl.name, assignee_name))
+
+                new_temp_vars[assignee_name] = lp.TemporaryVariable(
+                        name=assignee_name,
+                        dtype=temp_var_type,
+                        is_local=lp.auto,
+                        base_indices=lp.auto,
+                        shape=lp.auto,
+                        order=default_order)
+
+                if isinstance(insn, Assignment):
+                    insn = insn.copy(temp_var_type=None)
+                else:
+                    insn = insn.copy(temp_var_types=None)
 
         new_insns.append(insn)
 
@@ -925,8 +969,8 @@ def guess_arg_shape_if_requested(kernel, default_order):
 
             try:
                 for insn in kernel.instructions:
-                    if isinstance(insn, lp.Assignment):
-                        armap(submap(insn.assignee), kernel.insn_inames(insn))
+                    if isinstance(insn, lp.MultiAssignmentBase):
+                        armap(submap(insn.assignees), kernel.insn_inames(insn))
                         armap(submap(insn.expression), kernel.insn_inames(insn))
             except TypeError as e:
                 from traceback import print_exc
@@ -1021,25 +1065,39 @@ def apply_default_order_to_args(kernel, default_order):
 
 # {{{ resolve wildcard insn dependencies
 
+def find_matching_insn_ids(knl, dep):
+    from fnmatch import fnmatchcase
+
+    return [
+        other_insn.id
+        for other_insn in knl.instructions
+        if fnmatchcase(other_insn.id, dep)]
+
+
+def resove_wildcard_insn_ids(knl, deps):
+    new_deps = []
+    for dep in deps:
+        matches = find_matching_insn_ids(knl, dep)
+
+        if matches:
+            new_deps.extend(matches)
+        else:
+            # Uh, best we can do
+            new_deps.append(dep)
+
+    return frozenset(new_deps)
+
+
 def resolve_wildcard_deps(knl):
     new_insns = []
 
-    from fnmatch import fnmatchcase
     for insn in knl.instructions:
         if insn.depends_on is not None:
-            new_deps = set()
-            for dep in insn.depends_on:
-                match_count = 0
-                for other_insn in knl.instructions:
-                    if fnmatchcase(other_insn.id, dep):
-                        new_deps.add(other_insn.id)
-                        match_count += 1
-
-                if match_count == 0:
-                    # Uh, best we can do
-                    new_deps.add(dep)
-
-            insn = insn.copy(depends_on=frozenset(new_deps))
+            insn = insn.copy(
+                    depends_on=resove_wildcard_insn_ids(knl, insn.depends_on),
+                    no_sync_with=resove_wildcard_insn_ids(
+                        knl, insn.no_sync_with),
+                    )
 
         new_insns.append(insn)
 
@@ -1112,10 +1170,9 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     :arg default_offset: 0 or :class:`loopy.auto`. The default value of
         *offset* in :attr:`GlobalArg` for guessed arguments.
         Defaults to 0.
-    :arg function_manglers: list of functions of signature (name, arg_dtypes)
-        returning a tuple (result_dtype, c_name)
-        or a tuple (result_dtype, c_name, arg_dtypes),
-        where c_name is the C-level function to be called.
+    :arg function_manglers: list of functions of signature
+        ``(target, name, arg_dtypes)``
+        returning a :class:`loopy.CallMangleInfo`.
     :arg symbol_manglers: list of functions of signature (name) returning
         a tuple (result_dtype, c_name), where c_name is the C-level symbol to
         be evaluated.
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 0e0638491f9314de786a83cb7ccb1fedf095b9f6..b477b5fee0b047d0fb74099174e2a8c6f08336d1 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -417,9 +417,7 @@ class SubstitutionRule(Record):
 # }}}
 
 
-# {{{ instruction
-
-# {{{ base class
+# {{{ instructions: base class
 
 class InstructionBase(Record):
     """A base class for all types of instruction that can occur in
@@ -430,6 +428,8 @@ class InstructionBase(Record):
         An (otherwise meaningless) identifier that is unique within
         a :class:`loopy.kernel.LoopKernel`.
 
+    .. rubric:: Instruction ordering
+
     .. attribute:: depends_on
 
         a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances
@@ -460,6 +460,21 @@ class InstructionBase(Record):
         (see :class:`InstructionBase.groups`) may not be active when this
         instruction is scheduled.
 
+    .. attribute:: priority
+
+        Scheduling priority, an integer. Higher means 'execute sooner'.
+        Default 0.
+
+    .. rubric :: Synchronization
+
+    .. attribute:: no_sync_with
+
+        a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances
+        with which no barrier synchronization is necessary, even given the existence
+        of a dependency chain and apparently conflicting writes
+
+    .. rubric:: Conditionals
+
     .. attribute:: predicates
 
         a :class:`frozenset` of variable names the conjunction (logical and) of
@@ -467,6 +482,8 @@ class InstructionBase(Record):
         should be run. Each variable name may, optionally, be preceded by
         an exclamation point, indicating negation.
 
+    .. rubric:: Iname dependencies
+
     .. attribute:: forced_iname_deps_is_final
 
         A :class:`bool` determining whether :attr:`forced_iname_deps` constitutes
@@ -478,10 +495,7 @@ class InstructionBase(Record):
         dependencies *or* constitute the entire list of iname dependencies,
         depending on the value of :attr:`forced_iname_deps_is_final`.
 
-    .. attribute:: priority
-
-        Scheduling priority, an integer. Higher means 'execute sooner'.
-        Default 0.
+    .. rubric:: Boostability
 
     .. attribute:: boostable
 
@@ -495,6 +509,8 @@ class InstructionBase(Record):
         may need to be boosted, as a heuristic help for the scheduler.
         Also allowed to be *None*.
 
+    .. rubric:: Tagging
+
     .. attribute:: tags
 
         A tuple of string identifiers that can be used to identify groups
@@ -512,12 +528,14 @@ class InstructionBase(Record):
 
     fields = set("id depends_on depends_on_is_final "
             "groups conflicts_with_groups "
+            "no_sync_with "
             "predicates "
             "forced_iname_deps_is_final forced_iname_deps "
             "priority boostable boostable_into".split())
 
     def __init__(self, id, depends_on, depends_on_is_final,
             groups, conflicts_with_groups,
+            no_sync_with,
             forced_iname_deps_is_final, forced_iname_deps, priority,
             boostable, boostable_into, predicates, tags,
             insn_deps=None, insn_deps_is_final=None):
@@ -541,6 +559,9 @@ class InstructionBase(Record):
         if conflicts_with_groups is None:
             conflicts_with_groups = frozenset()
 
+        if no_sync_with is None:
+            no_sync_with = frozenset()
+
         if forced_iname_deps_is_final is None:
             forced_iname_deps_is_final = False
 
@@ -574,6 +595,7 @@ class InstructionBase(Record):
                 id=id,
                 depends_on=depends_on,
                 depends_on_is_final=depends_on_is_final,
+                no_sync_with=no_sync_with,
                 groups=groups, conflicts_with_groups=conflicts_with_groups,
                 forced_iname_deps_is_final=forced_iname_deps_is_final,
                 forced_iname_deps=forced_iname_deps,
@@ -681,7 +703,7 @@ class InstructionBase(Record):
             result.append("priority=%d" % self.priority)
         if self.tags:
             result.append("tags=%s" % ":".join(self.tags))
-        if self.atomicity:
+        if hasattr(self, "atomicity") and self.atomicity:
             result.append("atomic=%s" % ":".join(str(a) for a in self.atomicity))
 
         return result
@@ -762,6 +784,8 @@ def _get_assignee_and_index(expr):
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
 
+# {{{ atomic ops
+
 class memory_ordering:
     """Ordering of atomic operations, defined as in C11 and OpenCL.
 
@@ -903,10 +927,51 @@ class AtomicUpdate(VarAtomicity):
                 memory_ordering.to_string(self.ordering),
                 memory_scope.to_string(self.scope))
 
+# }}}
+
+
+# {{{ instruction base class: expression rhs
+
+class MultiAssignmentBase(InstructionBase):
+    """An assignment instruction with an expression as a right-hand side."""
+
+    fields = InstructionBase.fields | set(["expression"])
+
+    @memoize_method
+    def read_dependency_names(self):
+        from loopy.symbolic import get_dependencies
+        result = get_dependencies(self.expression)
+        for _, subscript in self.assignees_and_indices():
+            result = result | get_dependencies(subscript)
+
+        processed_predicates = frozenset(
+                pred.lstrip("!") for pred in self.predicates)
+
+        result = result | processed_predicates
+
+        return result
+
+    @memoize_method
+    def reduction_inames(self):
+        def map_reduction(expr, rec):
+            rec(expr.expr)
+            for iname in expr.inames:
+                result.add(iname)
+
+        from loopy.symbolic import ReductionCallbackMapper
+        cb_mapper = ReductionCallbackMapper(map_reduction)
+
+        result = set()
+        cb_mapper(self.expression)
+
+        return result
+
+# }}}
+
 
-# {{{ assignment
+# {{{ instruction: assignment
 
-class Assignment(InstructionBase):
+class Assignment(MultiAssignmentBase):
     """
     .. attribute:: assignee
 
@@ -957,8 +1022,8 @@ class Assignment(InstructionBase):
     .. automethod:: __init__
     """
 
-    fields = InstructionBase.fields | \
-            set("assignee expression temp_var_type atomicity".split())
+    fields = MultiAssignmentBase.fields | \
+            set("assignee temp_var_type atomicity".split())
 
     def __init__(self,
             assignee, expression,
@@ -967,6 +1032,7 @@ class Assignment(InstructionBase):
             depends_on_is_final=None,
             groups=None,
             conflicts_with_groups=None,
+            no_sync_with=None,
             forced_iname_deps_is_final=None,
             forced_iname_deps=frozenset(),
             boostable=None, boostable_into=None, tags=None,
@@ -974,12 +1040,13 @@ class Assignment(InstructionBase):
             priority=0, predicates=frozenset(),
             insn_deps=None, insn_deps_is_final=None):
 
-        InstructionBase.__init__(self,
+        super(Assignment, self).__init__(
                 id=id,
                 depends_on=depends_on,
                 depends_on_is_final=depends_on_is_final,
                 groups=groups,
                 conflicts_with_groups=conflicts_with_groups,
+                no_sync_with=no_sync_with,
                 forced_iname_deps_is_final=forced_iname_deps_is_final,
                 forced_iname_deps=forced_iname_deps,
                 boostable=boostable,
@@ -994,7 +1061,7 @@ class Assignment(InstructionBase):
         if isinstance(assignee, str):
             assignee = parse(assignee)
         if isinstance(expression, str):
-            assignee = parse(expression)
+            expression = parse(expression)
 
         # FIXME: It may be worth it to enable this check eventually.
         # For now, it causes grief with certain 'checky' uses of the
@@ -1011,35 +1078,6 @@ class Assignment(InstructionBase):
 
     # {{{ implement InstructionBase interface
 
-    @memoize_method
-    def read_dependency_names(self):
-        from loopy.symbolic import get_dependencies
-        result = get_dependencies(self.expression)
-        for _, subscript in self.assignees_and_indices():
-            result = result | get_dependencies(subscript)
-
-        processed_predicates = frozenset(
-                pred.lstrip("!") for pred in self.predicates)
-
-        result = result | processed_predicates
-
-        return result
-
-    @memoize_method
-    def reduction_inames(self):
-        def map_reduction(expr, rec):
-            rec(expr.expr)
-            for iname in expr.inames:
-                result.add(iname)
-
-        from loopy.symbolic import ReductionCallbackMapper
-        cb_mapper = ReductionCallbackMapper(map_reduction)
-
-        result = set()
-        cb_mapper(self.expression)
-
-        return result
-
     @memoize_method
     def assignees_and_indices(self):
         return [_get_assignee_and_index(self.assignee)]
@@ -1078,6 +1116,18 @@ class Assignment(InstructionBase):
             else:
                 key_builder.rec(key_hash, getattr(self, field_name))
 
+    # {{{ for interface uniformity with CallInstruction
+
+    @property
+    def temp_var_types(self):
+        return (self.temp_var_type,)
+
+    @property
+    def assignees(self):
+        return (self.assignee,)
+
+    # }}}
+
 
 class ExpressionInstruction(Assignment):
     def __init__(self, *args, **kwargs):
@@ -1090,6 +1140,162 @@ class ExpressionInstruction(Assignment):
 # }}}
 
 
+# {{{ instruction: function call
+
+class CallInstruction(MultiAssignmentBase):
+    """An instruction capturing a function call. Unlike :class:`Assignment`,
+    this instruction supports functions with multiple return values.
+
+    .. attribute:: assignees
+
+    .. attribute:: expression
+
+    The following attributes are only used until
+    :func:`loopy.make_kernel` is finished:
+
+    .. attribute:: temp_var_types
+
+        if not *None*, a tuple of types that will be assigned to the new
+        temporary variables created from the assignees
+
+    .. automethod:: __init__
+    """
+
+    fields = MultiAssignmentBase.fields | \
+            set("assignees temp_var_types".split())
+
+    def __init__(self,
+            assignees, expression,
+            id=None,
+            depends_on=None,
+            depends_on_is_final=None,
+            groups=None,
+            conflicts_with_groups=None,
+            no_sync_with=None,
+            forced_iname_deps_is_final=None,
+            forced_iname_deps=frozenset(),
+            boostable=None, boostable_into=None, tags=None,
+            temp_var_types=None,
+            priority=0, predicates=frozenset(),
+            insn_deps=None, insn_deps_is_final=None):
+
+        super(CallInstruction, self).__init__(
+                id=id,
+                depends_on=depends_on,
+                depends_on_is_final=depends_on_is_final,
+                groups=groups,
+                conflicts_with_groups=conflicts_with_groups,
+                no_sync_with=no_sync_with,
+                forced_iname_deps_is_final=forced_iname_deps_is_final,
+                forced_iname_deps=forced_iname_deps,
+                boostable=boostable,
+                boostable_into=boostable_into,
+                priority=priority,
+                predicates=predicates,
+                tags=tags,
+                insn_deps=insn_deps,
+                insn_deps_is_final=insn_deps_is_final)
+
+        from loopy.symbolic import parse
+        if isinstance(assignees, str):
+            assignees = parse(assignees)
+        if isinstance(expression, str):
+            expression = parse(expression)
+
+        from pymbolic.primitives import Call
+        from loopy.symbolic import Reduction
+        if not isinstance(expression, (Call, Reduction)) and expression is not None:
+            raise LoopyError("'expression' argument to CallInstruction "
+                    "must be a function call")
+
+        # FIXME: It may be worth it to enable this check eventually.
+        # For now, it causes grief with certain 'checky' uses of the
+        # with_transformed_expressions(). (notably the access checker)
+        #
+        # from pymbolic.primitives import Variable, Subscript
+        # if not isinstance(assignee, (Variable, Subscript)):
+        #     raise LoopyError("invalid lvalue '%s'" % assignee)
+
+        self.assignees = assignees
+        self.expression = expression
+        self.temp_var_types = temp_var_types
+
+    # {{{ implement InstructionBase interface
+
+    @memoize_method
+    def assignees_and_indices(self):
+        return [_get_assignee_and_index(a) for a in self.assignees]
+
+    def with_transformed_expressions(self, f, *args):
+        return self.copy(
+                assignees=f(self.assignees, *args),
+                expression=f(self.expression, *args))
+
+    # }}}
+
+    def __str__(self):
+        result = "%s: %s <- %s" % (self.id,
+                ", ".join(str(a) for a in self.assignees),
+                self.expression)
+
+        options = self.get_str_options()
+        if options:
+            result += " (%s)" % (": ".join(options))
+
+        if self.predicates:
+            result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates)
+        return result
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        """Custom hash computation function for use with
+        :class:`pytools.persistent_dict.PersistentDict`.
+
+        Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
+        """
+
+        # Order matters for hash forming--sort the fields.
+        for field_name in sorted(self.fields):
+            if field_name in ["assignees", "expression"]:
+                key_builder.update_for_pymbolic_expression(
+                        key_hash, getattr(self, field_name))
+            else:
+                key_builder.rec(key_hash, getattr(self, field_name))
+
+# }}}
+
+
+def make_assignment(assignees, expression, temp_var_types=None, **kwargs):
+    if len(assignees) < 1:
+        raise LoopyError("every instruction must have a left-hand side")
+    elif len(assignees) > 1:
+        atomicity = kwargs.pop("atomicity", ())
+        if atomicity:
+            raise LoopyError("atomic operations with more than one "
+                    "left-hand side not supported")
+
+        from pymbolic.primitives import Call
+        from loopy.symbolic import Reduction
+        if not isinstance(expression, (Call, Reduction)):
+            raise LoopyError("right-hand side in multiple assignment must be "
+                    "function call or reduction")
+
+        return CallInstruction(
+                assignees=assignees,
+                expression=expression,
+                temp_var_types=temp_var_types,
+                **kwargs)
+
+    else:
+        return Assignment(
+                assignee=assignees[0],
+                expression=expression,
+                temp_var_type=(
+                    temp_var_types[0]
+                    if temp_var_types is not None
+                    else None),
+                **kwargs)
+
+
 # {{{ c instruction
 
 class CInstruction(InstructionBase):
@@ -1121,9 +1327,10 @@ class CInstruction(InstructionBase):
 
     .. attribute:: assignees
 
-        A sequence of variable references (with or without subscript) as
-        :class:`pymbolic.primitives.Expression` instances that :attr:`code`
-        writes to. This is optional and only used for figuring out dependencies.
+        A sequence (typically a :class:`tuple`) of variable references (with or
+        without subscript) as :class:`pymbolic.primitives.Expression` instances
+        that :attr:`code` writes to. This is optional and only used for
+        figuring out dependencies.
     """
 
     fields = InstructionBase.fields | \
@@ -1131,9 +1338,10 @@ class CInstruction(InstructionBase):
 
     def __init__(self,
             iname_exprs, code,
-            read_variables=frozenset(), assignees=frozenset(),
+            read_variables=frozenset(), assignees=tuple(),
             id=None, depends_on=None, depends_on_is_final=None,
             groups=None, conflicts_with_groups=None,
+            no_sync_with=None,
             forced_iname_deps_is_final=None, forced_iname_deps=frozenset(),
             priority=0, boostable=None, boostable_into=None,
             predicates=frozenset(), tags=None,
@@ -1153,6 +1361,7 @@ class CInstruction(InstructionBase):
                 depends_on=depends_on,
                 depends_on_is_final=depends_on_is_final,
                 groups=groups, conflicts_with_groups=conflicts_with_groups,
+                no_sync_with=no_sync_with,
                 forced_iname_deps_is_final=forced_iname_deps_is_final,
                 forced_iname_deps=forced_iname_deps,
                 boostable=boostable,
@@ -1263,6 +1472,35 @@ class CInstruction(InstructionBase):
 
 # }}}
 
+
+# {{{ function call mangling
+
+class CallMangleInfo(Record):
+    """
+    .. attribute:: target_name
+
+        A string. The name of the function to be called in the
+        generated target code.
+
+    .. attribute:: result_dtypes
+
+        A tuple of :class:`LoopyType` instances indicating what
+        types of values the function returns.
+
+    .. attribute:: arg_dtypes
+
+        A tuple of :class:`LoopyType` instances indicating what
+        types of arguments the function actually receives.
+    """
+
+    def __init__(self, target_name, result_dtypes, arg_dtypes):
+        assert isinstance(result_dtypes, tuple)
+
+        super(CallMangleInfo, self).__init__(
+                target_name=target_name,
+                result_dtypes=result_dtypes,
+                arg_dtypes=arg_dtypes)
+
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index ef3cb3c8a4fdb25fa655c74d642f720dad825af2..0a02979ab32ad45cc4b1f6519c2bd327fa150812 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -423,11 +423,11 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
     dep_graph = {}
     lines = []
 
-    from loopy.kernel.data import Assignment, CInstruction
+    from loopy.kernel.data import MultiAssignmentBase, CInstruction
 
     for insn in kernel.instructions:
-        if isinstance(insn, Assignment):
-            op = "%s <- %s" % (insn.assignee, insn.expression)
+        if isinstance(insn, MultiAssignmentBase):
+            op = "%s <- %s" % (insn.assignees, insn.expression)
             if len(op) > 200:
                 op = op[:200] + "..."
 
@@ -657,8 +657,9 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 
     from pymbolic.primitives import Subscript
 
-    if isinstance(insn.assignee, Subscript):
-        ary_acc_exprs.append(insn.assignee)
+    for assignee in insn.assignees:
+        if isinstance(assignee, Subscript):
+            ary_acc_exprs.append(assignee)
 
     # }}}
 
@@ -855,7 +856,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
     import loopy as lp
 
     for insn in kernel.instructions:
-        if not isinstance(insn, lp.Assignment):
+        if not isinstance(insn, lp.MultiAssignmentBase):
             continue
 
         auto_axis_inames = [
diff --git a/loopy/library/function.py b/loopy/library/function.py
index df623a4770f4f14a7952ee2e0edbf59939de1cfd..efa590371bb632cbc9776078ea6b5c64f626d46a 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -38,7 +38,9 @@ def default_function_mangler(kernel, name, arg_dtypes):
 def single_arg_function_mangler(kernel, name, arg_dtypes):
     if len(arg_dtypes) == 1:
         dtype, = arg_dtypes
-        return dtype, name
+
+        from loopy.kernel.data import CallMangleInfo
+        return CallMangleInfo(name, (dtype,), (dtype,))
 
     return None
 
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d04b8c7330f88af9ee1d79fe19fd87b29b70050
--- /dev/null
+++ b/loopy/library/random123.py
@@ -0,0 +1,222 @@
+"""Library integration with Random123."""
+
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+from pytools import Record
+from mako.template import Template
+import numpy as np
+
+
+# {{{ rng metadata
+
+class RNGInfo(Record):
+    @property
+    def full_name(self):
+        return "%s%dx%d" % (self.name, self.width, self.bits)
+
+
+_philox_base_info = RNGInfo(
+            name="philox",
+            pyopencl_header="pyopencl-random123/philox.cl",
+            generic_header="Random123/philox.h",
+            key_width=2)
+
+_threefry_base_info = RNGInfo(
+            name="threefry",
+            pyopencl_header="pyopencl-random123/threefry.cl",
+            generic_header="Random123/threefry.h",
+            key_width=4)
+
+RNG_VARIANTS = [
+        _philox_base_info.copy(width=2, bits=32),
+        _philox_base_info.copy(width=2, bits=64),
+        _philox_base_info.copy(width=4, bits=32),
+        _philox_base_info.copy(width=4, bits=64),
+
+        _threefry_base_info.copy(width=2, bits=32),
+        _threefry_base_info.copy(width=2, bits=64),
+        _threefry_base_info.copy(width=4, bits=32),
+        _threefry_base_info.copy(width=4, bits=64),
+        ]
+
+FUNC_NAMES_TO_RNG = dict(
+        (v.full_name + suffix, v)
+        for v in RNG_VARIANTS
+        for suffix in [
+            "", "_f32", "_f64",
+            ])
+
+# }}}
+
+
+# {{{ preamble
+
+PREAMBLE_TEMPLATE = Template("""
+%if is_pyopencl_target:
+#include <${ rng_variant.pyopencl_header }>
+%else:
+#include <${ rng_variant.generic_header }>
+%endif
+
+<%
+name = rng_variant.full_name
+width = rng_variant.width
+if rng_variant.bits == 32:
+    counter_type = "uint%d" % width
+    key_type = "uint%d" % rng_variant.key_width
+elif rng_variant.bits == 64:
+    counter_type = "ulong%d" % width
+    key_type = "ulong%d" % rng_variant.key_width
+else:
+    assert False
+%>
+
+typedef union {
+    ${ counter_type } v;
+    ${ name }_ctr_t c;
+} ${ name }_ctr_vec_union;
+
+
+${ counter_type } ${ name }_bump(${ counter_type } ctr)
+{
+    if (++ctr.x == 0)
+        if (++ctr.y == 0)
+            ++ctr.z;
+    return ctr;
+}
+
+${ counter_type } ${ name }_gen(
+        ${ counter_type } ctr,
+        ${ key_type } key,
+        ${ counter_type } *new_ctr)
+{
+    ${ name }_ctr_vec_union result;
+    result.c = ${ name }(
+        *(${ name }_ctr_t *) &ctr,
+        *(${ name }_key_t *) &key);
+    *new_ctr = ${ name }_bump(ctr);
+    return result.v;
+}
+
+float${ width } ${ name }_f32(
+        ${ counter_type } ctr,
+        ${ key_type } key,
+        ${ counter_type } *new_ctr)
+{
+    *new_ctr = ctr;
+    return
+        convert_float${ width }(${ name }_gen(*new_ctr, key, new_ctr))
+        * ${ repr(1./2**32) }f;
+}
+
+double${ width } ${ name }_f64(
+        ${ counter_type } ctr,
+        ${ key_type } key,
+        ${ counter_type } *new_ctr)
+{
+    *new_ctr = ctr;
+    %if rng_variant.bits == 32:
+        return
+            convert_double${ width }(${ name }_gen(*new_ctr, key, new_ctr))
+            * ${ repr(1./2**32) }
+            +
+            convert_double${ width }(${ name }_gen(*new_ctr, key, new_ctr))
+            * ${ repr(1./2**64) };
+
+    %elif rng_variant.bits == 64:
+        *new_ctr = ctr;
+        return
+            convert_double${ width }(${ name }_gen(*new_ctr, key, new_ctr))
+            * ${ repr(1./2**64) };
+
+    %else:
+        #error Unrecognized bit width in RNG
+
+    %endif
+}
+
+""", strict_undefined=True)
+
+# }}}
+
+
+def random123_preamble_generator(preamble_info):
+    for f in preamble_info.seen_functions:
+        try:
+            rng_variant = FUNC_NAMES_TO_RNG[f.name]
+        except KeyError:
+            continue
+
+        from loopy.target.pyopencl import PyOpenCLTarget
+        yield ("90-random123-"+rng_variant.full_name,
+                PREAMBLE_TEMPLATE.render(
+                    is_pyopencl_target=isinstance(
+                        preamble_info.kernel.target,
+                        PyOpenCLTarget),
+                    rng_variant=rng_variant,
+                    ))
+
+
+def random123_function_mangler(kernel, name, arg_dtypes):
+    try:
+        rng_variant = FUNC_NAMES_TO_RNG[name]
+    except KeyError:
+        return None
+
+    from loopy.types import NumpyType
+    target = kernel.target
+    base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits]
+    ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width)
+    key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width)
+
+    from loopy.kernel.data import CallMangleInfo
+    fn = rng_variant.full_name
+    if name == fn:
+        return CallMangleInfo(
+                target_name=fn+"_gen",
+                result_dtypes=(ctr_dtype, ctr_dtype),
+                arg_dtypes=(ctr_dtype, key_dtype))
+
+    elif name == fn + "_f32":
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=(
+                    target.vector_dtype(NumpyType(np.float32), rng_variant.width),
+                    ctr_dtype),
+                arg_dtypes=(ctr_dtype, key_dtype))
+
+    elif name == fn + "_f64":
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=(
+                    target.vector_dtype(NumpyType(np.float64), rng_variant.width),
+                    ctr_dtype),
+                arg_dtypes=(ctr_dtype, key_dtype))
+
+    else:
+        return None
+
+# vim: foldmethod=marker
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index b39115a355069cadf698b466b55c2fc3c6b5a4e0..8a38eebd55b003c624b386bcdf296d2b97e2c97c 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -36,7 +36,7 @@ class ReductionOperation(object):
     equality-comparable.
     """
 
-    def result_dtype(self, target, arg_dtype, inames):
+    def result_dtypes(self, target, arg_dtype, inames):
         raise NotImplementedError
 
     def neutral_element(self, dtype, inames):
@@ -82,11 +82,12 @@ class ScalarReductionOperation(ReductionOperation):
         """
         self.forced_result_type = forced_result_type
 
-    def result_dtype(self, target, arg_dtype, inames):
+    def result_dtypes(self, kernel, arg_dtype, inames):
         if self.forced_result_type is not None:
-            return self.parse_result_type(target, self.forced_result_type)
+            return (self.parse_result_type(
+                    kernel.target, self.forced_result_type),)
 
-        return arg_dtype
+        return (arg_dtype,)
 
     def __hash__(self):
         return hash((type(self), self.forced_result_type))
@@ -148,23 +149,12 @@ class MinReductionOperation(ScalarReductionOperation):
 
 # {{{ argmin/argmax
 
-ARGEXT_STRUCT_DTYPES = {}
-
-
 class _ArgExtremumReductionOperation(ReductionOperation):
     def prefix(self, dtype):
         return "loopy_arg%s_%s" % (self.which, dtype.numpy_dtype.type.__name__)
 
-    def result_dtype(self, target, dtype, inames):
-        try:
-            return ARGEXT_STRUCT_DTYPES[dtype]
-        except KeyError:
-            struct_dtype = np.dtype([("value", dtype), ("index", np.int32)])
-            ARGEXT_STRUCT_DTYPES[dtype] = NumpyType(struct_dtype, target)
-
-            target.get_or_register_dtype(self.prefix(dtype)+"_result",
-                    NumpyType(struct_dtype))
-            return ARGEXT_STRUCT_DTYPES[dtype]
+    def result_dtypes(self, kernel, dtype, inames):
+        return (dtype, kernel.index_dtype)
 
     def neutral_element(self, dtype, inames):
         return ArgExtFunction(self, dtype, "init", inames)()
@@ -179,7 +169,7 @@ class _ArgExtremumReductionOperation(ReductionOperation):
         iname, = inames
 
         return ArgExtFunction(self, dtype, "update", inames)(
-                operand1, operand2, var(iname))
+                *(operand1 + (operand2, var(iname))))
 
 
 class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
@@ -207,7 +197,7 @@ class ArgExtFunction(FunctionIdentifier):
         return (self.reduction_op, self.scalar_dtype, self.name, self.inames)
 
 
-def get_argext_preamble(target, func_id):
+def get_argext_preamble(kernel, func_id):
     op = func_id.reduction_op
     prefix = op.prefix(func_id.scalar_dtype)
 
@@ -216,35 +206,32 @@ def get_argext_preamble(target, func_id):
     c_code_mapper = CCodeMapper()
 
     return (prefix, """
-    typedef struct {
-        %(scalar_type)s value;
-        int index;
-    } %(type_name)s;
-
-    inline %(type_name)s %(prefix)s_init()
+    inline %(scalar_t)s %(prefix)s_init(%(index_t)s *index_out)
     {
-        %(type_name)s result;
-        result.value = %(neutral)s;
-        result.index = INT_MIN;
-        return result;
+        *index_out = INT_MIN;
+        return %(neutral)s;
     }
 
-    inline %(type_name)s %(prefix)s_update(
-        %(type_name)s state, %(scalar_type)s op2, int index)
+    inline %(scalar_t)s %(prefix)s_update(
+        %(scalar_t)s op1, %(index_t)s index1,
+        %(scalar_t)s op2, %(index_t)s index2,
+        %(index_t)s *index_out)
     {
-        %(type_name)s result;
-        if (op2 %(comp)s state.value)
+        if (op2 %(comp)s op1)
+        {
+            *index_out = index2;
+            return op2;
+        }
+        else
         {
-            result.value = op2;
-            result.index = index;
-            return result;
+            *index_out = index1;
+            return op1;
         }
-        else return state;
     }
     """ % dict(
-            type_name=prefix+"_result",
-            scalar_type=target.dtype_to_typename(func_id.scalar_dtype),
+            scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype),
             prefix=prefix,
+            index_t=kernel.target.dtype_to_typename(kernel.index_dtype),
             neutral=c_code_mapper(
                 op.neutral_sign*get_le_neutral(func_id.scalar_dtype)),
             comp=op.update_comparison,
@@ -302,14 +289,39 @@ def parse_reduction_op(name):
 
 
 def reduction_function_mangler(kernel, func_id, arg_dtypes):
-    if isinstance(func_id, ArgExtFunction):
+    if isinstance(func_id, ArgExtFunction) and func_id.name == "init":
         from loopy.target.opencl import OpenCLTarget
         if not isinstance(kernel.target, OpenCLTarget):
             raise LoopyError("only OpenCL supported for now")
 
         op = func_id.reduction_op
-        return (op.result_dtype(kernel.target, func_id.scalar_dtype, func_id.inames),
-                "%s_%s" % (op.prefix(func_id.scalar_dtype), func_id.name))
+
+        from loopy.kernel.data import CallMangleInfo
+        return CallMangleInfo(
+                target_name="%s_init" % op.prefix(func_id.scalar_dtype),
+                result_dtypes=op.result_dtypes(
+                    kernel, func_id.scalar_dtype, func_id.inames),
+                arg_dtypes=(),
+                )
+
+    elif isinstance(func_id, ArgExtFunction) and func_id.name == "update":
+        from loopy.target.opencl import OpenCLTarget
+        if not isinstance(kernel.target, OpenCLTarget):
+            raise LoopyError("only OpenCL supported for now")
+
+        op = func_id.reduction_op
+
+        from loopy.kernel.data import CallMangleInfo
+        return CallMangleInfo(
+                target_name="%s_update" % op.prefix(func_id.scalar_dtype),
+                result_dtypes=op.result_dtypes(
+                    kernel, func_id.scalar_dtype, func_id.inames),
+                arg_dtypes=(
+                    func_id.scalar_dtype,
+                    kernel.index_dtype,
+                    func_id.scalar_dtype,
+                    kernel.index_dtype),
+                )
 
     return None
 
@@ -322,6 +334,6 @@ def reduction_preamble_generator(preamble_info):
             if not isinstance(preamble_info.kernel.target, OpenCLTarget):
                 raise LoopyError("only OpenCL supported for now")
 
-            yield get_argext_preamble(preamble_info.kernel.target, func.name)
+            yield get_argext_preamble(preamble_info.kernel, func.name)
 
 # vim: fdm=marker
diff --git a/loopy/maxima.py b/loopy/maxima.py
index 1a2d0770c78af6506664aaed17f99e90e200875c..29f974ff9d1f6f0a6e6d4311eca88201559e6854 100644
--- a/loopy/maxima.py
+++ b/loopy/maxima.py
@@ -82,7 +82,7 @@ def get_loopy_instructions_as_maxima(kernel, prefix):
         if not isinstance(insn, InstructionBase):
             insn = kernel.id_to_insn[insn]
         if not isinstance(insn, Assignment):
-            raise RuntimeError("non-expression instructions not supported "
+            raise RuntimeError("non-single-output assignment not supported "
                     "in maxima export")
 
         for dep in insn.depends_on:
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index e30d3bcb3f441036bab6558bc2d8691031a1c2e4..2fdadb48e790d2cd05cc00f4d04c31957505aaa9 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -31,6 +31,7 @@ from loopy.diagnostic import (
 from pytools.persistent_dict import PersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
+from loopy.kernel.data import make_assignment
 
 import logging
 logger = logging.getLogger(__name__)
@@ -123,14 +124,26 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
     from loopy.diagnostic import DependencyTypeInferenceFailure
     for writer_insn_id in kernel.writer_map().get(var_name, []):
         writer_insn = kernel.id_to_insn[writer_insn_id]
-        if not isinstance(writer_insn, lp.Assignment):
+        if not isinstance(writer_insn, lp.MultiAssignmentBase):
             continue
 
         expr = subst_expander(writer_insn.expression)
 
         try:
             debug("             via expr %s" % expr)
-            result = type_inf_mapper(expr)
+            if isinstance(writer_insn, lp.Assignment):
+                result = type_inf_mapper(expr)
+            elif isinstance(writer_insn, lp.CallInstruction):
+                result_dtypes = type_inf_mapper(expr, multiple_types_ok=True)
+
+                result = None
+                for (assignee, _), comp_dtype in zip(
+                        writer_insn.assignees_and_indices(), result_dtypes):
+                    if assignee == var_name:
+                        result = comp_dtype
+                        break
+
+                assert result is not None
 
             debug("             result: %s" % result)
 
@@ -319,9 +332,10 @@ def mark_local_temporaries(kernel):
                     if isinstance(kernel.iname_to_tag.get(iname), LocalIndexTagBase))
 
             locparallel_assignee_inames = set(iname
-                    for _, assignee_indices in insn.assignees_and_indices()
-                    for iname in get_dependencies(assignee_indices)
+                    for aname, aindices in insn.assignees_and_indices()
+                    for iname in get_dependencies(aindices)
                         & kernel.all_inames()
+                    if aname == temp_var.name
                     if isinstance(kernel.iname_to_tag.get(iname), LocalIndexTagBase))
 
             assert locparallel_assignee_inames <= locparallel_compute_inames
@@ -400,7 +414,9 @@ def add_default_dependencies(kernel):
                             % var)
 
                 if len(var_writers) == 1:
-                    auto_deps.update(var_writers - set([insn.id]))
+                    auto_deps.update(
+                            var_writers
+                            - set([insn.id]))
 
             # }}}
 
@@ -436,35 +452,48 @@ def realize_reduction(kernel, insn_id_filter=None):
 
     new_insns = []
 
+    insn_id_gen = kernel.get_instruction_id_generator()
+
     var_name_gen = kernel.get_var_name_generator()
     new_temporary_variables = kernel.temporary_variables.copy()
 
     from loopy.expression import TypeInferenceMapper
     type_inf_mapper = TypeInferenceMapper(kernel)
 
-    def map_reduction(expr, rec):
+    def map_reduction(expr, rec, multiple_values_ok=False):
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
 
-        from pymbolic import var
-
-        target_var_name = var_name_gen("acc_"+"_".join(expr.inames))
-        target_var = var(target_var_name)
-
         try:
             arg_dtype = type_inf_mapper(expr.expr)
         except DependencyTypeInferenceFailure:
             raise LoopyError("failed to determine type of accumulator for "
                     "reduction '%s'" % expr)
 
-        from loopy.kernel.data import Assignment, TemporaryVariable
+        arg_dtype = arg_dtype.with_target(kernel.target)
+
+        reduction_dtypes = expr.operation.result_dtypes(
+                    kernel, arg_dtype, expr.inames)
+        reduction_dtypes = tuple(
+                dt.with_target(kernel.target) for dt in reduction_dtypes)
+
+        ncomp = len(reduction_dtypes)
+
+        from pymbolic import var
+
+        acc_var_names = [
+                var_name_gen("acc_"+"_".join(expr.inames))
+                for i in range(ncomp)]
+        acc_vars = tuple(var(n) for n in acc_var_names)
+
+        from loopy.kernel.data import TemporaryVariable
 
-        new_temporary_variables[target_var_name] = TemporaryVariable(
-                name=target_var_name,
-                shape=(),
-                dtype=expr.operation.result_dtype(
-                    kernel.target, arg_dtype, expr.inames),
-                is_local=False)
+        for name, dtype in zip(acc_var_names, reduction_dtypes):
+            new_temporary_variables[name] = TemporaryVariable(
+                    name=name,
+                    shape=(),
+                    dtype=dtype,
+                    is_local=False)
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
         bad_inames = frozenset(expr.inames) & outer_insn_inames
@@ -472,41 +501,52 @@ def realize_reduction(kernel, insn_id_filter=None):
             raise LoopyError("reduction used within loop(s) that it was "
                     "supposed to reduce over: " + ", ".join(bad_inames))
 
-        init_id = temp_kernel.make_unique_instruction_id(
-                based_on="%s_%s_init" % (insn.id, "_".join(expr.inames)),
-                extra_used_ids=set(i.id for i in generated_insns))
+        init_id = insn_id_gen(
+                "%s_%s_init" % (insn.id, "_".join(expr.inames)))
 
-        init_insn = Assignment(
+        init_insn = make_assignment(
                 id=init_id,
-                assignee=target_var,
+                assignees=acc_vars,
                 forced_iname_deps=outer_insn_inames - frozenset(expr.inames),
+                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
                 depends_on=frozenset(),
                 expression=expr.operation.neutral_element(arg_dtype, expr.inames))
 
         generated_insns.append(init_insn)
 
-        update_id = temp_kernel.make_unique_instruction_id(
-                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)),
-                extra_used_ids=set(i.id for i in generated_insns))
+        update_id = insn_id_gen(
+                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))
+
+        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
+        if insn.forced_iname_deps_is_final:
+            update_insn_iname_deps = insn.forced_iname_deps | set(expr.inames)
 
-        reduction_insn = Assignment(
+        reduction_insn = make_assignment(
                 id=update_id,
-                assignee=target_var,
+                assignees=acc_vars,
                 expression=expr.operation(
-                    arg_dtype, target_var, expr.expr, expr.inames),
+                    arg_dtype,
+                    acc_vars if len(acc_vars) > 1 else acc_vars[0],
+                    expr.expr, expr.inames),
                 depends_on=frozenset([init_insn.id]) | insn.depends_on,
-                forced_iname_deps=temp_kernel.insn_inames(insn) | set(expr.inames))
+                forced_iname_deps=update_insn_iname_deps,
+                forced_iname_deps_is_final=insn.forced_iname_deps_is_final)
 
         generated_insns.append(reduction_insn)
 
         new_insn_depends_on.add(reduction_insn.id)
 
-        return target_var
+        if multiple_values_ok:
+            return acc_vars
+        else:
+            assert len(acc_vars) == 1
+            return acc_vars[0]
 
     from loopy.symbolic import ReductionCallbackMapper
     cb_mapper = ReductionCallbackMapper(map_reduction)
 
     insn_queue = kernel.instructions[:]
+    insn_id_replacements = {}
 
     temp_kernel = kernel
 
@@ -518,24 +558,47 @@ def realize_reduction(kernel, insn_id_filter=None):
         insn = insn_queue.pop(0)
 
         if insn_id_filter is not None and insn.id != insn_id_filter \
-                or not isinstance(insn, lp.Assignment):
+                or not isinstance(insn, lp.MultiAssignmentBase):
             new_insns.append(insn)
             continue
 
         # Run reduction expansion.
-        new_expression = cb_mapper(insn.expression)
+        from loopy.symbolic import Reduction
+        if isinstance(insn.expression, Reduction):
+            new_expressions = cb_mapper(insn.expression, multiple_values_ok=True)
+        else:
+            new_expressions = (cb_mapper(insn.expression),)
 
         if generated_insns:
             # An expansion happened, so insert the generated stuff plus
             # ourselves back into the queue.
 
-            insn = insn.copy(
-                        expression=new_expression,
-                        depends_on=insn.depends_on
-                        | frozenset(new_insn_depends_on),
-                        forced_iname_deps=temp_kernel.insn_inames(insn))
+            kwargs = insn.get_copy_kwargs(
+                    depends_on=insn.depends_on
+                    | frozenset(new_insn_depends_on),
+                    forced_iname_deps=temp_kernel.insn_inames(insn))
+
+            kwargs.pop("id")
+            kwargs.pop("expression")
+            kwargs.pop("assignee", None)
+            kwargs.pop("assignees", None)
+            kwargs.pop("temp_var_type", None)
+            kwargs.pop("temp_var_types", None)
 
-            insn_queue = generated_insns + [insn] + insn_queue
+            replacement_insns = [
+                    lp.Assignment(
+                        id=insn_id_gen(insn.id),
+                        assignee=assignee,
+                        expression=new_expr,
+                        **kwargs)
+                    for assignee, new_expr in zip(insn.assignees, new_expressions)]
+
+            insn_id_replacements[insn.id] = [
+                    rinsn.id for rinsn in replacement_insns]
+
+            # FIXME: Track dep rename
+
+            insn_queue = generated_insns + replacement_insns + insn_queue
 
             # The reduction expander needs an up-to-date kernel
             # object to find dependencies. Keep temp_kernel up-to-date.
@@ -550,10 +613,12 @@ def realize_reduction(kernel, insn_id_filter=None):
 
             new_insns.append(insn)
 
-    return kernel.copy(
+    kernel = kernel.copy(
             instructions=new_insns,
             temporary_variables=new_temporary_variables)
 
+    return lp.replace_instruction_ids(kernel, insn_id_replacements)
+
 # }}}
 
 
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 056030cf56f7b1a6e00cddd64c81cced6fd37c8f..7ac8778378869f7c9cf6cfbe8d4be145e9fad520 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -355,7 +355,7 @@ def format_insn(kernel, insn_id):
     Style = kernel.options._style
     return "[%s] %s%s%s <- %s%s%s" % (
             format_insn_id(kernel, insn_id),
-            Fore.BLUE, str(insn.assignee), Style.RESET_ALL,
+            Fore.BLUE, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL,
             Fore.MAGENTA, str(insn.expression), Style.RESET_ALL)
 
 
@@ -363,7 +363,7 @@ def dump_schedule(kernel, schedule):
     lines = []
     indent = ""
 
-    from loopy.kernel.data import Assignment
+    from loopy.kernel.data import MultiAssignmentBase
     for sched_item in schedule:
         if isinstance(sched_item, EnterLoop):
             lines.append(indent + "LOOP %s" % sched_item.iname)
@@ -379,13 +379,13 @@ def dump_schedule(kernel, schedule):
             lines.append(indent + "RETURN FROM KERNEL %s" % sched_item.kernel_name)
         elif isinstance(sched_item, RunInstruction):
             insn = kernel.id_to_insn[sched_item.insn_id]
-            if isinstance(insn, Assignment):
+            if isinstance(insn, MultiAssignmentBase):
                 insn_str = format_insn(kernel, sched_item.insn_id)
             else:
                 insn_str = sched_item.insn_id
             lines.append(indent + insn_str)
         elif isinstance(sched_item, Barrier):
-            lines.append(indent + "---BARRIER---")
+            lines.append(indent + "---BARRIER:%s---" % sched_item.kind)
         else:
             assert False
 
@@ -1057,6 +1057,9 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
     if reverse:
         source, target = target, source
 
+    if source.id in target.no_sync_with:
+        return None
+
     # {{{ check that a dependency exists
 
     dep_descr = None
diff --git a/loopy/statistics.py b/loopy/statistics.py
index c5eb3142d73efe0b48960bc8ed7ef1aead20d0de..1a044b8d7f389fbb8e58d3fa597932f8c56ac97f 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -31,8 +31,8 @@ import islpy as isl
 from pytools import memoize_in
 from pymbolic.mapper import CombineMapper
 from functools import reduce
-from loopy.kernel.data import Assignment
-from loopy.diagnostic import warn
+from loopy.kernel.data import MultiAssignmentBase
+from loopy.diagnostic import warn, LoopyError
 
 
 __doc__ = """
@@ -47,6 +47,9 @@ __doc__ = """
 
 .. autofunction:: get_barrier_poly
 
+.. autofunction:: gather_access_footprints
+.. autofunction:: gather_access_footprint_bytes
+
 """
 
 
@@ -415,9 +418,10 @@ class GlobalSubscriptCounter(CombineMapper):
 # {{{ AccessFootprintGatherer
 
 class AccessFootprintGatherer(CombineMapper):
-    def __init__(self, kernel, domain):
+    def __init__(self, kernel, domain, ignore_uncountable=False):
         self.kernel = kernel
         self.domain = domain
+        self.ignore_uncountable = ignore_uncountable
 
     @staticmethod
     def combine(values):
@@ -456,10 +460,17 @@ class AccessFootprintGatherer(CombineMapper):
                     self.kernel.assumptions)
         except isl.Error:
             # Likely: index was non-linear, nothing we can do.
-            return
+            if self.ignore_uncountable:
+                return {}
+            else:
+                raise LoopyError("failed to gather footprint: %s" % expr)
+
         except TypeError:
             # Likely: index was non-linear, nothing we can do.
-            return
+            if self.ignore_uncountable:
+                return {}
+            else:
+                raise LoopyError("failed to gather footprint: %s" % expr)
 
         from pymbolic.primitives import Variable
         assert isinstance(expr.aggregate, Variable)
@@ -838,8 +849,16 @@ def get_barrier_poly(knl):
 
 # {{{ gather_access_footprints
 
-def gather_access_footprints(kernel):
-    # TODO: Docs
+def gather_access_footprints(kernel, ignore_uncountable=False):
+    """Return a dictionary mapping ``(var_name, direction)``
+    to :class:`islpy.Set` instances capturing which indices
+    of the array *var_name* are read/written (where
+    *direction* is either ``read`` or ``write``).
+
+    :arg ignore_uncountable: If *True*, silently ignore accesses on which
+        the footprint cannot be determined (e.g. data-dependent or
+        nonlinear indices); if *False*, raise an error for such accesses.
+    """
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
     kernel = infer_unknown_types(kernel, expect_completion=True)
@@ -849,7 +868,7 @@ def gather_access_footprints(kernel):
     read_footprints = []
 
     for insn in kernel.instructions:
-        if not isinstance(insn, Assignment):
+        if not isinstance(insn, MultiAssignmentBase):
             warn(kernel, "count_non_assignment",
                     "Non-assignment instruction encountered in "
                     "gather_access_footprints, not counted")
@@ -859,9 +878,11 @@ def gather_access_footprints(kernel):
         inames_domain = kernel.get_inames_domain(insn_inames)
         domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
 
-        afg = AccessFootprintGatherer(kernel, domain)
+        afg = AccessFootprintGatherer(kernel, domain,
+                ignore_uncountable=ignore_uncountable)
 
-        write_footprints.append(afg(insn.assignee))
+        for assignee in insn.assignees:
+            write_footprints.append(afg(assignee))
         read_footprints.append(afg(insn.expression))
 
     write_footprints = AccessFootprintGatherer.combine(write_footprints)
@@ -877,6 +898,35 @@ def gather_access_footprints(kernel):
 
     return result
 
+
+def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
+    """Return a dictionary mapping ``(var_name, direction)`` to
+    :class:`islpy.PwQPolynomial` instances capturing the number of bytes
+    read/written (where *direction* is either ``read`` or ``write``) on
+    array *var_name*.
+
+    :arg ignore_uncountable: If *True*, silently ignore accesses on which
+        the footprint cannot be determined (e.g. data-dependent or
+        nonlinear indices); if *False*, raise an error for such accesses.
+    """
+
+    result = {}
+    fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable)
+
+    for key, var_fp in fp.items():
+        vname, direction = key
+
+        var_descr = kernel.get_var_descriptor(vname)
+        bytes_transferred = (
+                int(var_descr.dtype.numpy_dtype.itemsize)
+                * count(kernel, var_fp))
+        if key in result:
+            result[key] += bytes_transferred
+        else:
+            result[key] = bytes_transferred
+
+    return result
+
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index b887c703420d092d7f3c0fc9c729dd1d1f942a76..219d66d49a47882bf76bd8c89f7298c8597dfa0e 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -33,7 +33,7 @@ from pytools import memoize, memoize_method, Record
 import pytools.lex
 
 from pymbolic.primitives import (
-        Leaf, AlgebraicLeaf, Variable,
+        Leaf, AlgebraicLeaf, Expression, Variable,
         CommonSubexpression)
 
 from pymbolic.mapper import (
@@ -98,6 +98,9 @@ class IdentityMapperMixin(object):
         # leaf, doesn't change
         return expr
 
+    def map_type_annotation(self, expr, *args):
+        return TypeAnnotation(expr.type, self.rec(expr.child))
+
     map_linear_subscript = IdentityMapperBase.map_subscript
 
 
@@ -321,6 +324,18 @@ class TypedCSE(CommonSubexpression):
         return dict(dtype=self.dtype)
 
 
+class TypeAnnotation(Expression):
+    def __init__(self, type, child):
+        super(TypeAnnotation, self).__init__()
+        self.type = type
+        self.child = child
+
+    def __getinitargs__(self):
+        return (self.type, self.child)
+
+    mapper_method = intern("map_type_annotation")
+
+
 class TaggedVariable(Variable):
     """This is an identifier with a tag, such as 'matrix$one', where
     'one' identifies this specific use of the identifier. This mechanism
@@ -882,7 +897,6 @@ class FunctionToPrimitiveMapper(IdentityMapper):
 # {{{ customization to pymbolic parser
 
 _open_dbl_bracket = intern("open_dbl_bracket")
-_close_dbl_bracket = intern("close_dbl_bracket")
 
 TRAILING_FLOAT_TAG_RE = re.compile("^(.*?)([a-zA-Z]*)$")
 
@@ -908,6 +922,26 @@ class LoopyParser(ParserBase):
         else:
             return float(val)  # generic float
 
+    def parse_prefix(self, pstate):
+        from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier
+        if pstate.is_next(_less):
+            pstate.advance()
+            if pstate.is_next(_greater):
+                typename = None
+                pstate.advance()
+            else:
+                pstate.expect(_identifier)
+                typename = pstate.next_str()
+                pstate.advance()
+                pstate.expect(_greater)
+                pstate.advance()
+
+            return TypeAnnotation(
+                    typename,
+                    self.parse_expression(pstate, _PREC_UNARY))
+        else:
+            return super(LoopyParser, self).parse_prefix(pstate)
+
     def parse_postfix(self, pstate, min_precedence, left_exp):
         from pymbolic.parser import _PREC_CALL, _closebracket
         if pstate.next_tag() is _open_dbl_bracket and _PREC_CALL > min_precedence:
@@ -1079,10 +1113,10 @@ class ReductionCallbackMapper(IdentityMapper):
     def __init__(self, callback):
         self.callback = callback
 
-    def map_reduction(self, expr):
-        result = self.callback(expr, self.rec)
+    def map_reduction(self, expr, **kwargs):
+        result = self.callback(expr, self.rec, **kwargs)
         if result is None:
-            return IdentityMapper.map_reduction(self, expr)
+            return IdentityMapper.map_reduction(self, expr, **kwargs)
         return result
 
 # }}}
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 42a9e5a3dc4c88cceadb3399b5399d609b877cfb..cad451c2ed041875f9dd9535d73ce20e8ac9e057 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -133,6 +133,9 @@ class TargetBase(object):
     def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
         raise NotImplementedError()
 
+    def generate_multiple_assignment(self, codegen_state, insn):
+        raise NotImplementedError()
+
     def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype):
         raise NotImplementedError("atomic update in target %s" % type(self).__name__)
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 822ee1838d5c0bf56eed98e9c3b36f34583ce033..5e35075623c346f908468ae3855f78883da8d652 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -341,6 +341,71 @@ class CTarget(TargetBase):
                 "++%s" % iname,
                 inner)
 
+    def generate_multiple_assignment(self, codegen_state, insn):
+        ecm = codegen_state.expression_to_code_mapper
+
+        from pymbolic.primitives import Variable
+        from pymbolic.mapper.stringifier import PREC_NONE
+
+        func_id = insn.expression.function
+        parameters = insn.expression.parameters
+
+        if isinstance(func_id, Variable):
+            func_id = func_id.name
+
+        assignee_var_descriptors = [codegen_state.kernel.get_var_descriptor(a)
+                for a, _ in insn.assignees_and_indices()]
+
+        par_dtypes = tuple(ecm.infer_type(par) for par in parameters)
+
+        str_parameters = None
+
+        mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes)
+        if mangle_result is None:
+            raise RuntimeError("function '%s' unknown--"
+                    "maybe you need to register a function mangler?"
+                    % func_id)
+
+        assert mangle_result.arg_dtypes is not None
+
+        from loopy.expression import dtype_to_type_context
+        str_parameters = [
+                ecm(par, PREC_NONE,
+                    dtype_to_type_context(self, tgt_dtype),
+                    tgt_dtype)
+                for par, par_dtype, tgt_dtype in zip(
+                    parameters, par_dtypes, mangle_result.arg_dtypes)]
+
+        from loopy.codegen import SeenFunction
+        codegen_state.seen_functions.add(
+                SeenFunction(func_id,
+                    mangle_result.target_name,
+                    mangle_result.arg_dtypes))
+
+        for i, (a, tgt_dtype) in enumerate(
+                zip(insn.assignees[1:], mangle_result.result_dtypes[1:])):
+            if tgt_dtype != ecm.infer_type(a):
+                raise LoopyError("type mismatch in %d'th (1-based) left-hand "
+                        "side of instruction '%s'" % (i+1, insn.id))
+            str_parameters.append(
+                    "&(%s)" % ecm(a, PREC_NONE,
+                        dtype_to_type_context(self, tgt_dtype),
+                        tgt_dtype))
+
+        result = "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
+
+        result = ecm.wrap_in_typecast(
+                mangle_result.result_dtypes[0],
+                assignee_var_descriptors[0].dtype,
+                result)
+
+        lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)
+
+        from cgen import Assign
+        return Assign(
+                lhs_code,
+                result)
+
     # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 63a053c586362a10fa2eea70b83b2659646f62d0..11686dcdc3b691dbab6dbdc716729637e6dc96b7 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -34,7 +34,7 @@ import islpy as isl
 
 from loopy.expression import dtype_to_type_context, TypeInferenceMapper
 
-from loopy.diagnostic import LoopyError
+from loopy.diagnostic import LoopyError, LoopyWarning
 from loopy.tools import is_integer
 from loopy.types import LoopyType
 
@@ -94,22 +94,23 @@ class LoopyCCodeMapper(RecursiveMapper):
 
         return ary
 
-    def rec(self, expr, prec, type_context=None, needed_dtype=None):
-        if needed_dtype is None:
-            return RecursiveMapper.rec(self, expr, prec, type_context)
-
-        actual_type = self.infer_type(expr)
-
+    def wrap_in_typecast(self, actual_type, needed_dtype, s):
         if (actual_type.is_complex() and needed_dtype.is_complex()
                 and actual_type != needed_dtype):
-            result = RecursiveMapper.rec(self, expr, PREC_NONE, type_context)
-            return "%s_cast(%s)" % (self.complex_type_name(needed_dtype), result)
+            return "%s_cast(%s)" % (self.complex_type_name(needed_dtype), s)
         elif not actual_type.is_complex() and needed_dtype.is_complex():
-            result = RecursiveMapper.rec(self, expr, PREC_NONE, type_context)
-            return "%s_fromreal(%s)" % (self.complex_type_name(needed_dtype), result)
+            return "%s_fromreal(%s)" % (self.complex_type_name(needed_dtype), s)
         else:
+            return s
+
+    def rec(self, expr, prec, type_context=None, needed_dtype=None):
+        if needed_dtype is None:
             return RecursiveMapper.rec(self, expr, prec, type_context)
 
+        return self.wrap_in_typecast(
+                self.infer_type(expr), needed_dtype,
+                RecursiveMapper.rec(self, expr, PREC_NONE, type_context))
+
     __call__ = rec
 
     # }}}
@@ -419,37 +420,32 @@ class LoopyCCodeMapper(RecursiveMapper):
 
         # }}}
 
-        c_name = None
         if isinstance(identifier, Variable):
             identifier = identifier.name
-            c_name = identifier
 
         par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)
 
         str_parameters = None
 
         mangle_result = self.kernel.mangle_function(identifier, par_dtypes)
-        if mangle_result is not None:
-            if len(mangle_result) == 2:
-                result_dtype, c_name = mangle_result
-            elif len(mangle_result) == 3:
-                result_dtype, c_name, arg_tgt_dtypes = mangle_result
-
-                str_parameters = [
-                        self.rec(par, PREC_NONE,
-                            dtype_to_type_context(self.kernel.target, tgt_dtype),
-                            tgt_dtype)
-                        for par, par_dtype, tgt_dtype in zip(
-                            expr.parameters, par_dtypes, arg_tgt_dtypes)]
-            else:
-                raise RuntimeError("result of function mangler "
-                        "for function '%s' not understood"
-                        % identifier)
+        if mangle_result is None:
+            raise RuntimeError("function '%s' unknown--"
+                    "maybe you need to register a function mangler?"
+                    % identifier)
 
-        from loopy.codegen import SeenFunction
-        self.codegen_state.seen_functions.add(
-                SeenFunction(identifier, c_name, par_dtypes))
-        if str_parameters is None:
+        if len(mangle_result.result_dtypes) != 1:
+            raise LoopyError("functions with more or fewer than one return value "
+                    "may not be used in an expression")
+
+        if mangle_result.arg_dtypes is not None:
+            str_parameters = [
+                    self.rec(par, PREC_NONE,
+                        dtype_to_type_context(self.kernel.target, tgt_dtype),
+                        tgt_dtype)
+                    for par, par_dtype, tgt_dtype in zip(
+                        expr.parameters, par_dtypes, mangle_result.arg_dtypes)]
+
+        else:
             # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
             # propagate the type context here. But for many others, it does
             # not. Using the inferred type as a stopgap for now.
@@ -459,11 +455,18 @@ class LoopyCCodeMapper(RecursiveMapper):
                             self.kernel.target, par_dtype))
                     for par, par_dtype in zip(expr.parameters, par_dtypes)]
 
-        if c_name is None:
-            raise RuntimeError("unable to find C name for function identifier '%s'"
-                    % identifier)
+            from warnings import warn
+            warn("Calling function '%s' with unknown C signature--"
+                    "return CallMangleInfo.arg_dtypes"
+                    % identifier, LoopyWarning)
+
+        from loopy.codegen import SeenFunction
+        self.codegen_state.seen_functions.add(
+                SeenFunction(identifier,
+                    mangle_result.target_name,
+                    mangle_result.arg_dtypes or par_dtypes))
 
-        return "%s(%s)" % (c_name, ", ".join(str_parameters))
+        return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
 
     # {{{ deal with complex-valued variables
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index f083a70b0288d88d544285e0063eb2e677dfefcf..e839801fdfebaa40f6431db947200c32f880ee53 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -32,7 +32,7 @@ from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
 from loopy.target.c import DTypeRegistryWrapper
-from loopy.kernel.data import temp_var_scope
+from loopy.kernel.data import temp_var_scope, CallMangleInfo
 
 
 # {{{ dtype registry wrappers
@@ -137,11 +137,34 @@ def _register_vector_types(dtype_registry):
 
 # {{{ function mangler
 
+_CL_SIMPLE_MULTI_ARG_FUNCTIONS = {
+        "clamp": 3,
+        }
+
+
+VECTOR_LITERAL_FUNCS = dict(
+        ("make_%s%d" % (name, count), (name, dtype, count))
+        for name, dtype in [
+            ('char', np.int8),
+            ('uchar', np.uint8),
+            ('short', np.int16),
+            ('ushort', np.uint16),
+            ('int', np.int32),
+            ('uint', np.uint32),
+            ('long', np.int64),
+            ('ulong', np.uint64),
+            ('float', np.float32),
+            ('double', np.float64),
+            ]
+        for count in [2, 3, 4, 8, 16]
+        )
+
+
 def opencl_function_mangler(kernel, name, arg_dtypes):
     if not isinstance(name, str):
         return None
 
-    if name in ["max", "min"] and len(arg_dtypes) == 2:
+    if name in ["max", "min", "atan2"] and len(arg_dtypes) == 2:
         dtype = np.find_common_type(
                 [], [dtype.numpy_dtype for dtype in arg_dtypes])
 
@@ -151,14 +174,49 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
         if dtype.kind == "f":
             name = "f" + name
 
-        return NumpyType(dtype), name
-
-    if name in "atan2" and len(arg_dtypes) == 2:
-        return arg_dtypes[0], name
+        result_dtype = NumpyType(dtype)
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=(result_dtype,),
+                arg_dtypes=2*(result_dtype,))
 
     if name == "dot":
         scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
-        return NumpyType(scalar_dtype), name
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=(NumpyType(scalar_dtype),),
+                arg_dtypes=(arg_dtypes[0],)*2)
+
+    if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
+        num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
+        if len(arg_dtypes) != num_args:
+            raise LoopyError("%s takes %d arguments (%d received)"
+                    % (name, num_args, len(arg_dtypes)))
+
+        dtype = np.find_common_type(
+                [], [dtype.numpy_dtype for dtype in arg_dtypes])
+
+        if dtype.kind == "c":
+            raise LoopyError("%s does not support complex numbers"
+                    % name)
+
+        result_dtype = NumpyType(dtype)
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=(result_dtype,),
+                arg_dtypes=(result_dtype,)*3)
+
+    if name in VECTOR_LITERAL_FUNCS:
+        base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
+
+        if count != len(arg_dtypes):
+            return None
+
+        return CallMangleInfo(
+                target_name="(%s%d) " % (base_tp_name, count),
+                result_dtypes=(kernel.target.vector_dtype(
+                    NumpyType(dtype), count),),
+                arg_dtypes=(NumpyType(dtype),)*count)
 
     return None
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index aea2473972c9f0cc4144e17e1943bf9008cb4f18..44b057f1a713a09bd2f493c8f3be67e790049e6e 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -29,6 +29,7 @@ from six.moves import range
 
 import numpy as np
 
+from loopy.kernel.data import CallMangleInfo
 from loopy.target.opencl import OpenCLTarget
 from loopy.types import NumpyType
 
@@ -194,13 +195,18 @@ def pyopencl_function_mangler(target, name, arg_dtypes):
                     "sin", "cos", "tan",
                     "sinh", "cosh", "tanh",
                     "conj"]:
-                return arg_dtype, "%s_%s" % (tpname, name)
+                return CallMangleInfo(
+                        target_name="%s_%s" % (tpname, name),
+                        result_dtypes=(arg_dtype,),
+                        arg_dtypes=(arg_dtype,))
 
             if name in ["real", "imag", "abs"]:
-                return (
-                        NumpyType(
+                return CallMangleInfo(
+                        target_name="%s_%s" % (tpname, name),
+                        result_dtypes=(NumpyType(
                             np.dtype(arg_dtype.numpy_dtype.type(0).real)),
-                        "%s_%s" % (tpname, name))
+                            ),
+                        arg_dtypes=(arg_dtype,))
 
     return None
 
@@ -270,14 +276,18 @@ class PyOpenCLTarget(OpenCLTarget):
     comparison_fields = ["device"]
 
     def function_manglers(self):
+        from loopy.library.random123 import random123_function_mangler
         return (
                 super(PyOpenCLTarget, self).function_manglers() + [
-                    pyopencl_function_mangler
+                    pyopencl_function_mangler,
+                    random123_function_mangler
                     ])
 
     def preamble_generators(self):
+        from loopy.library.random123 import random123_preamble_generator
         return ([
-            pyopencl_preamble_generator
+            pyopencl_preamble_generator,
+            random123_preamble_generator,
             ] + super(PyOpenCLTarget, self).preamble_generators())
 
     def preprocess(self, kernel):
diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py
index d41222c26056300729b0c4005200ba6ea904010d..2c50b3f11a34ced81d6a93eb467482110f495f2f 100644
--- a/loopy/transform/arithmetic.py
+++ b/loopy/transform/arithmetic.py
@@ -158,7 +158,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
             continue
 
         if not isinstance(insn, Assignment):
-            raise LoopyError("'%s' modified by non-expression instruction"
+            raise LoopyError("'%s' modified by non-single-assignment"
                     % var_name)
 
         lhs = insn.assignee
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 25634c91975e686fed04223472ee9fbe2b416a4d..8b2fa370ad4214cbe16caec61ff45b684ed0a889 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -124,6 +124,27 @@ def remove_instructions(kernel, insn_ids):
     return kernel.copy(
             instructions=new_insns)
 
+
+def replace_instruction_ids(kernel, replacements):
+    new_insns = []
+
+    for insn in kernel.instructions:
+        changed = False
+        new_depends_on = []
+
+        for dep in insn.depends_on:
+            if dep in replacements:
+                new_depends_on.extend(replacements[dep])
+                changed = True
+            else:
+                new_depends_on.append(dep)
+
+        new_insns.append(
+                insn.copy(depends_on=frozenset(new_depends_on))
+                if changed else insn)
+
+    return kernel.copy(instructions=new_insns)
+
 # }}}
 
 
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 5c9a286de2678c1c23bcb3df682110c431bc793c..6ea0c06e631c5ea321dfd3a11b42f00bb0480078 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -426,8 +426,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
         import loopy as lp
         for insn in kernel.instructions:
-            if isinstance(insn, lp.Assignment):
-                invg(insn.assignee, kernel, insn)
+            if isinstance(insn, lp.MultiAssignmentBase):
+                for assignee in insn.assignees:
+                    invg(assignee, kernel, insn)
                 invg(insn.expression, kernel, insn)
 
         access_descriptors = invg.access_descriptors
diff --git a/loopy/version.py b/loopy/version.py
index 9fa881d02879dbdac67043f975bc3c00a6194f11..7716feea32079bb122557223b2a1e970ff630ecc 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v24-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v26-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index 5ed095315234339709309a1c55ec88c7fdab6bfa..30d6dfb636c4d5cea181da4a6b7e6df514adca9d 100644
--- a/setup.py
+++ b/setup.py
@@ -43,6 +43,7 @@ setup(name="loo.py",
           "islpy>=2016.1.2",
           "six>=1.8.0",
           "colorama",
+          "Mako",
           ],
 
       extras_require={
diff --git a/test/test_loopy.py b/test/test_loopy.py
index cebbda4d9b9177bd0a9f63ff9b3126b3c188c76c..a61b6563ff254fdf6eaf6279158a97d20045a663 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -262,7 +262,7 @@ def test_join_inames(ctx_factory):
     knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"])
     knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"])
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl)
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True)
 
 
 def test_divisibility_assumption(ctx_factory):
@@ -439,26 +439,21 @@ def test_argmax(ctx_factory):
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
-    order = "C"
 
     n = 10000
 
     knl = lp.make_kernel(
             "{[i]: 0<=i<%d}" % n,
-            [
-                "<> result = argmax(i, fabs(a[i]))",
-                "max_idx = result.index",
-                "max_val = result.value",
-                ],
-            [
-                lp.GlobalArg("a", dtype, shape=(n,), order=order),
-                lp.GlobalArg("max_idx", np.int32, shape=(), order=order),
-                lp.GlobalArg("max_val", dtype, shape=(), order=order),
-                ])
+            """
+            max_val, max_idx = argmax(i, fabs(a[i]))
+            """)
+
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+    print(lp.preprocess_kernel(knl))
+    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
 
     a = np.random.randn(10000).astype(dtype)
-    cknl = lp.CompiledKernel(ctx, knl)
-    evt, (max_idx, max_val) = cknl(queue, a=a, out_host=True)
+    evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
     assert max_val == np.max(np.abs(a))
     assert max_idx == np.where(np.abs(a) == max_val)[-1]
 
@@ -2473,6 +2468,94 @@ def test_atomic(ctx_factory, dtype):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
 
 
+def test_clamp(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 15 * 10**6
+    x = cl.clrandom.rand(queue, n, dtype=np.float32)
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            "out[i] = clamp(x[i], a, b)")
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.set_options(knl, write_cl=True)
+
+    evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
+
+
+def test_forced_iname_deps_and_reduction():
+    # See https://github.com/inducer/loopy/issues/24
+
+    # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
+    # under which the above bug was found. If assignees were phi[i], then the
+    # iname propagation heuristic would not assume that dependent instructions
+    # need to run inside of 'i', and hence the forced_iname_* bits below would not
+    # be needed.
+
+    i1 = lp.CInstruction("i",
+            "doSomethingToGetPhi();",
+            assignees="phi")
+
+    from pymbolic.primitives import Subscript, Variable
+    i2 = lp.Assignment("a",
+            lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
+            forced_iname_deps=frozenset(),
+            forced_iname_deps_is_final=True)
+
+    k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
+            [i1, i2],
+            [
+                lp.GlobalArg("a", dtype=np.float32, shape=()),
+                lp.ValueArg("n", dtype=np.int32),
+                lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
+                ],
+            target=lp.CTarget(),
+            )
+
+    k = lp.preprocess_kernel(k)
+
+    assert 'i' not in k.insn_inames("insn_0_j_update")
+    print(k.stringify(with_dependencies=True))
+
+
+@pytest.mark.parametrize("tp", ["f32", "f64"])
+def test_random123(ctx_factory, tp):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    import pyopencl.version  # noqa
+    if cl.version.VERSION < (2016, 2):
+        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")
+
+    n = 150000
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            """
+            <> key2 = make_uint2(i, 324830944) {inames=i}
+            <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i}
+            <> ctr = make_uint4(0, 1, 2, 3) {inames=i}
+            <> real, ctr = philox4x32_TYPE(ctr, key2)
+            <> imag, ctr = threefry4x32_TYPE(ctr, key4)
+
+            out[i, 0] = real.s0 + 1j * imag.s0
+            out[i, 1] = real.s1 + 1j * imag.s1
+            out[i, 2] = real.s2 + 1j * imag.s2
+            out[i, 3] = real.s3 + 1j * imag.s3
+            """.replace("TYPE", tp))
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.set_options(knl, write_cl=True)
+
+    evt, (out,) = knl(queue, n=n)
+
+    out = out.get()
+    assert (out < 1).all()
+    assert (0 <= out).all()
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])