From 2728158191a0c960866d5e21bcefa9003edb4383 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Wed, 11 May 2016 21:42:32 -0500
Subject: [PATCH] Support functions with multiple return values

---
 doc/ref_kernel.rst                   |  28 +++
 doc/ref_transform.rst                |   2 +
 loopy/__init__.py                    |  25 ++-
 loopy/codegen/instruction.py         |  28 ++-
 loopy/expression.py                  |  29 ++-
 loopy/kernel/__init__.py             |  34 +++-
 loopy/kernel/creation.py             | 224 +++++++++++++---------
 loopy/kernel/data.py                 | 277 +++++++++++++++++++++++----
 loopy/kernel/tools.py                |  13 +-
 loopy/library/function.py            |   4 +-
 loopy/library/reduction.py           |  82 ++++----
 loopy/maxima.py                      |   2 +-
 loopy/preprocess.py                  | 124 ++++++++----
 loopy/schedule.py                    |   6 +-
 loopy/statistics.py                  |   7 +-
 loopy/symbolic.py                    |  44 ++++-
 loopy/target/__init__.py             |   3 +
 loopy/target/c/__init__.py           |  64 +++++++
 loopy/target/c/codegen/expression.py |  77 ++++----
 loopy/target/opencl.py               |  24 ++-
 loopy/target/pyopencl.py             |  14 +-
 loopy/transform/instruction.py       |  21 ++
 loopy/version.py                     |   2 +-
 setup.py                             |   1 +
 test/test_loopy.py                   |  23 +--
 25 files changed, 852 insertions(+), 306 deletions(-)

diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index a323fff52..560facd63 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -293,6 +293,11 @@ Loopy's expressions are a slight superset of the expressions supported by
 TODO: Functions
 TODO: Reductions
 
+Function Call Instructions
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: CallInstruction
+
 C Block Instructions
 ^^^^^^^^^^^^^^^^^^^^
 
@@ -468,6 +473,8 @@ Targets
 
 .. automodule:: loopy.target
 
+.. currentmodule:: loopy
+
 Helper values
 -------------
 
@@ -479,6 +486,27 @@ Helper values
 
 .. }}}
 
+Libraries: Extending and Interfacing with External Functionality
+----------------------------------------------------------------
+
+.. _symbols:
+
+Symbols
+^^^^^^^
+
+.. _functions:
+
+Functions
+^^^^^^^^^
+
+.. autoclass:: CallMangleInfo
+
+.. _reductions:
+
+Reductions
+^^^^^^^^^^
+
+
 The Kernel Object
 -----------------
 
diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst
index f92cfbf67..d085c1215 100644
--- a/doc/ref_transform.rst
+++ b/doc/ref_transform.rst
@@ -68,6 +68,8 @@ Manipulating Instructions
 
 .. autofunction:: remove_instructions
 
+.. autofunction:: replace_instruction_ids
+
 .. autofunction:: tag_instructions
 
 Registering Library Routines
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 22022c0f6..424aa522f 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -41,9 +41,12 @@ from loopy.kernel.data import (
         KernelArgument,
         ValueArg, GlobalArg, ConstantArg, ImageArg,
         memory_ordering, memory_scope, VarAtomicity, AtomicInit, AtomicUpdate,
-        InstructionBase, Assignment, ExpressionInstruction, CInstruction,
+        InstructionBase,
+        MultiAssignmentBase, Assignment, ExpressionInstruction,
+        CallInstruction, CInstruction,
         temp_var_scope, TemporaryVariable,
-        SubstitutionRule)
+        SubstitutionRule,
+        CallMangleInfo)
 
 from loopy.kernel import LoopKernel, kernel_state
 from loopy.kernel.tools import (
@@ -67,7 +70,9 @@ from loopy.transform.iname import (
 from loopy.transform.instruction import (
         find_instructions, map_instructions,
         set_instruction_priority, add_dependency,
-        remove_instructions, tag_instructions)
+        remove_instructions,
+        replace_instruction_ids,
+        tag_instructions)
 
 from loopy.transform.data import (
         add_prefetch, change_arg_to_image, tag_data_axes,
@@ -131,8 +136,11 @@ __all__ = [
         "ValueArg", "GlobalArg", "ConstantArg", "ImageArg",
         "temp_var_scope", "TemporaryVariable",
         "SubstitutionRule",
+        "CallMangleInfo",
 
-        "InstructionBase", "Assignment", "ExpressionInstruction", "CInstruction",
+        "InstructionBase",
+        "MultiAssignmentBase", "Assignment", "ExpressionInstruction",
+        "CallInstruction", "CInstruction",
 
         "default_function_mangler", "single_arg_function_mangler",
 
@@ -157,7 +165,9 @@ __all__ = [
 
         "find_instructions", "map_instructions",
         "set_instruction_priority", "add_dependency",
-        "remove_instructions", "tag_instructions",
+        "remove_instructions",
+        "replace_instruction_ids",
+        "tag_instructions",
 
         "extract_subst", "expand_subst", "assignment_to_subst",
         "find_rules_matching", "find_one_rule_matching",
@@ -278,6 +288,11 @@ def register_symbol_manglers(kernel, manglers):
 
 
 def register_function_manglers(kernel, manglers):
+    """
+    :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)``
+        returning a :class:`loopy.CallMangleInfo`.
+    :returns: *kernel* with *manglers* registered
+    """
     new_manglers = kernel.function_manglers[:]
     for m in manglers:
         if m not in new_manglers:
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index d3a7ae42c..7b95f5948 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -61,10 +61,12 @@ def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, st
 
 
 def generate_instruction_code(kernel, insn, codegen_state):
-    from loopy.kernel.data import Assignment, CInstruction
+    from loopy.kernel.data import Assignment, CallInstruction, CInstruction
 
     if isinstance(insn, Assignment):
         result = generate_expr_instruction_code(kernel, insn, codegen_state)
+    elif isinstance(insn, CallInstruction):
+        result = generate_call_code(kernel, insn, codegen_state)
     elif isinstance(insn, CInstruction):
         result = generate_c_instruction_code(kernel, insn, codegen_state)
     else:
@@ -218,12 +220,32 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
     return result
 
 
+def generate_call_code(kernel, insn, codegen_state):
+    """Return the target's ``generate_multiple_assignment`` code for *insn*."""
+    # {{{ vectorization handling
+
+    # CallInstruction declares no 'atomicity' field, so look it up defensively.
+    if codegen_state.vectorization_info and getattr(insn, "atomicity", None):
+        raise Unvectorizable("function call")
+
+    # }}}
+
+    result = kernel.target.generate_multiple_assignment(codegen_state, insn)
+
+    # {{{ tracing
+
+    if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
+        raise NotImplementedError("tracing of multi-output function calls")
+
+    # }}}
+
+    return result
+
+
 def generate_c_instruction_code(kernel, insn, codegen_state):
     if codegen_state.vectorization_info is not None:
         raise Unvectorizable("C instructions cannot be vectorized")
 
-    ecm = codegen_state.expression_to_code_mapper
-
     body = []
 
     from loopy.codegen import POD
diff --git a/loopy/expression.py b/loopy/expression.py
index 16c09b828..42c54af71 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -199,7 +199,7 @@ class TypeInferenceMapper(CombineMapper):
     def map_linear_subscript(self, expr):
         return self.rec(expr.aggregate)
 
-    def map_call(self, expr):
+    def map_call(self, expr, multiple_types_ok=False):
         from pymbolic.primitives import Variable
 
         identifier = expr.function
@@ -212,8 +212,15 @@ class TypeInferenceMapper(CombineMapper):
         arg_dtypes = tuple(self.rec(par) for par in expr.parameters)
 
         mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
-        if mangle_result is not None:
-            return mangle_result[0]
+        # A None result (no mangler matched) falls through to the error below.
+        if mangle_result is not None:
+            if multiple_types_ok:
+                return mangle_result.result_dtypes
+            if len(mangle_result.result_dtypes) != 1:
+                raise LoopyError("functions with more or fewer than one "
+                        "return value may only be used in direct assignments")
+
+            return mangle_result.result_dtypes[0]
 
         raise RuntimeError("no type inference information on "
                 "function '%s'" % identifier)
@@ -285,9 +292,19 @@ class TypeInferenceMapper(CombineMapper):
     def map_local_hw_index(self, expr, *args):
         return self.kernel.index_dtype
 
-    def map_reduction(self, expr):
-        return expr.operation.result_dtype(
-                self.kernel.target, self.rec(expr.expr), expr.inames)
+    def map_reduction(self, expr, multiple_types_ok=False):
+        result = expr.operation.result_dtypes(
+                self.kernel, self.rec(expr.expr), expr.inames)
+
+        if multiple_types_ok:
+            return result
+
+        else:
+            if len(result) != 1 and not multiple_types_ok:
+                raise LoopyError("reductions with more or fewer than one "
+                        "return value may only be used in direct assignments")
+
+            return result[0]
 
 # }}}
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index d7009900b..7baea3243 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -313,7 +313,35 @@ class LoopKernel(RecordWithoutPickling):
         for mangler in manglers:
             mangle_result = mangler(self, identifier, arg_dtypes)
             if mangle_result is not None:
-                return mangle_result
+                from loopy.kernel.data import CallMangleInfo
+                if isinstance(mangle_result, CallMangleInfo):
+                    assert len(mangle_result.arg_dtypes) == len(arg_dtypes)
+                    return mangle_result
+
+                assert isinstance(mangle_result, tuple)
+
+                from warnings import warn
+                warn("'%s' returned a tuple instead of a CallMangleInfo instance. "
+                        "This is deprecated." % mangler.__name__,
+                        DeprecationWarning)
+
+                if len(mangle_result) == 2:
+                    result_dtype, target_name = mangle_result
+                    return CallMangleInfo(
+                            target_name=target_name,
+                            result_dtypes=(result_dtype,),
+                            arg_dtypes=None)
+
+                elif len(mangle_result) == 3:
+                    result_dtype, target_name, actual_arg_dtypes = mangle_result
+                    return CallMangleInfo(
+                            target_name=target_name,
+                            result_dtypes=(result_dtype,),
+                            arg_dtypes=actual_arg_dtypes)
+
+                else:
+                    raise ValueError("unexpected size of tuple returned by '%s'"
+                            % mangler.__name__)
 
         return None
 
@@ -1027,8 +1055,8 @@ class LoopKernel(RecordWithoutPickling):
             for dep_id in sorted(insn.depends_on):
                 print_insn(kernel.id_to_insn[dep_id])
 
-            if isinstance(insn, lp.Assignment):
-                lhs = str(insn.assignee)
+            if isinstance(insn, lp.MultiAssignmentBase):
+                lhs = ", ".join(str(a) for a in insn.assignees)
                 rhs = str(insn.expression)
                 trailing = []
             elif isinstance(insn, lp.CInstruction):
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 034c9dd82..9fe0f5b79 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -29,7 +29,9 @@ import numpy as np
 from loopy.tools import intern_frozenset_of_ids
 from loopy.symbolic import IdentityMapper, WalkMapper
 from loopy.kernel.data import (
-        InstructionBase, Assignment, SubstitutionRule)
+        InstructionBase,
+        MultiAssignmentBase, Assignment,
+        SubstitutionRule)
 from loopy.diagnostic import LoopyError
 import islpy as isl
 from islpy import dim_type
@@ -147,7 +149,6 @@ def expand_defines_in_expr(expr, defines):
 # {{{ parse instructions
 
 INSN_RE = re.compile(
-        "\s*(?:\<(?P<temp_var_type>.*?)\>)?"
         "\s*(?P<lhs>.+?)\s*(?<!\:)=\s*(?P<rhs>.+?)"
         "\s*?(?:\{(?P<options>.+)\}\s*)?$"
         )
@@ -159,7 +160,8 @@ SUBST_RE = re.compile(
 def parse_insn(insn):
     """
     :return: a tuple ``(insn, inames_to_dup)``, where insn is a
-        :class:`Assignment` or a :class:`SubstitutionRule`
+        :class:`Assignment`, a :class:`CallInstruction`,
+        or a :class:`SubstitutionRule`
         and *inames_to_dup* is None or a list of tuples `(old, new)`.
     """
 
@@ -192,17 +194,69 @@ def parse_insn(insn):
                 "the following error occurred:" % groups["rhs"])
         raise
 
-    from pymbolic.primitives import Variable, Subscript, Call
-    if isinstance(lhs, Variable):
-        assignee_name = lhs.name
-    elif isinstance(lhs, Subscript):
-        assignee_name = lhs.aggregate.name
-    elif isinstance(lhs, Call):
-        assignee_name = None
-        assert subst_match is not None
-    else:
-        raise LoopyError("left hand side of assignment '%s' must "
-                "be variable or subscript" % lhs)
+    from pymbolic.primitives import Variable, Call, Subscript
+    from loopy.symbolic import TypeAnnotation
+
+    # {{{ deal with subst rules
+
+    if subst_match is not None:
+        assert insn_match is None
+        if isinstance(lhs, Variable):
+            subst_name = lhs.name
+            arg_names = []
+        elif isinstance(lhs, Call):
+            if not isinstance(lhs.function, Variable):
+                raise RuntimeError("Invalid substitution rule left-hand side")
+            subst_name = lhs.function.name
+            arg_names = []
+
+            for i, arg in enumerate(lhs.parameters):
+                if not isinstance(arg, Variable):
+                    raise RuntimeError("Invalid substitution rule "
+                                    "left-hand side: %s--arg number %d "
+                                    "is not a variable" % (lhs, i))
+                arg_names.append(arg.name)
+        else:
+            raise RuntimeError("Invalid substitution rule left-hand side")
+
+        return SubstitutionRule(
+                name=subst_name,
+                arguments=tuple(arg_names),
+                expression=rhs), []
+
+    # }}}
+
+    if not isinstance(lhs, tuple):
+        lhs = (lhs,)
+
+    temp_var_types = []
+    new_lhs = []
+    assignee_names = []
+
+    for lhs_i in lhs:
+        if isinstance(lhs_i, TypeAnnotation):
+            if lhs_i.type is None:
+                temp_var_types.append(lp.auto)
+            else:
+                temp_var_types.append(lhs_i.type)
+
+            lhs_i = lhs_i.child
+        else:
+            temp_var_types.append(None)
+
+        if isinstance(lhs_i, Variable):
+            assignee_names.append(lhs_i.name)
+        elif isinstance(lhs_i, Subscript):
+            assignee_names.append(lhs_i.aggregate.name)
+        else:
+            raise LoopyError("left hand side of assignment '%s' must "
+                    "be variable or subscript" % (lhs_i,))
+
+        new_lhs.append(lhs_i)
+
+    lhs = tuple(new_lhs)
+    temp_var_types = tuple(temp_var_types)
+    del new_lhs
 
     if insn_match is not None:
         depends_on = None
@@ -290,6 +344,11 @@ def parse_insn(insn):
                             if tag.strip())
 
                 elif opt_key == "atomic":
+                    if len(assignee_names) != 1:
+                        raise LoopyError("atomic operations with more than one "
+                                "left-hand side not supported")
+                    assignee_name, = assignee_names
+
                     if opt_value is None:
                         atomicity = atomicity + (
                                 lp.AtomicUpdate(assignee_name),)
@@ -302,6 +361,7 @@ def parse_insn(insn):
                                 raise LoopyError("atomicity directive not "
                                         "understood: %s"
                                         % v)
+                    del assignee_name
 
                 else:
                     raise ValueError(
@@ -309,16 +369,7 @@ def parse_insn(insn):
                             "(maybe a missing/extraneous =value?)"
                             % opt_key)
 
-        if groups["temp_var_type"] is not None:
-            if groups["temp_var_type"]:
-                temp_var_type = np.dtype(groups["temp_var_type"])
-            else:
-                import loopy as lp
-                temp_var_type = lp.auto
-        else:
-            temp_var_type = None
-
-        return Assignment(
+        kwargs = dict(
                     id=(
                         intern(insn_id)
                         if isinstance(insn_id, str)
@@ -330,38 +381,15 @@ def parse_insn(insn):
                     conflicts_with_groups=conflicts_with_groups,
                     forced_iname_deps_is_final=forced_iname_deps_is_final,
                     forced_iname_deps=forced_iname_deps,
-                    assignee=lhs, expression=rhs,
-                    temp_var_type=temp_var_type,
-                    atomicity=atomicity,
                     priority=priority,
                     predicates=predicates,
-                    tags=tags), inames_to_dup
+                    tags=tags,
+                    atomicity=atomicity)
 
-    elif subst_match is not None:
-        from pymbolic.primitives import Variable, Call
-
-        if isinstance(lhs, Variable):
-            subst_name = lhs.name
-            arg_names = []
-        elif isinstance(lhs, Call):
-            if not isinstance(lhs.function, Variable):
-                raise RuntimeError("Invalid substitution rule left-hand side")
-            subst_name = lhs.function.name
-            arg_names = []
-
-            for i, arg in enumerate(lhs.parameters):
-                if not isinstance(arg, Variable):
-                    raise RuntimeError("Invalid substitution rule "
-                                    "left-hand side: %s--arg number %d "
-                                    "is not a variable" % (lhs, i))
-                arg_names.append(arg.name)
-        else:
-            raise RuntimeError("Invalid substitution rule left-hand side")
-
-        return SubstitutionRule(
-                name=subst_name,
-                arguments=tuple(arg_names),
-                expression=rhs), []
+        from loopy.kernel.data import make_assignment
+        return make_assignment(
+                lhs, rhs, temp_var_types, **kwargs
+                ), inames_to_dup
 
 
 def parse_if_necessary(insn, defines):
@@ -522,13 +550,13 @@ class ArgumentGuesser:
         self.all_written_names = set()
         from loopy.symbolic import get_dependencies
         for insn in instructions:
-            if isinstance(insn, Assignment):
-                (assignee_var_name, _), = insn.assignees_and_indices()
-                self.all_written_names.add(assignee_var_name)
-                self.all_names.update(get_dependencies(
-                    self.submap(insn.assignee)))
-                self.all_names.update(get_dependencies(
-                    self.submap(insn.expression)))
+            if isinstance(insn, MultiAssignmentBase):
+                for assignee_var_name, _ in insn.assignees_and_indices():
+                    self.all_written_names.add(assignee_var_name)
+
+                # Per-instruction, not per-assignee: gather dependencies once.
+                for expr in (insn.assignees, insn.expression):
+                    self.all_names.update(get_dependencies(self.submap(expr)))
 
     def find_index_rank(self, name):
         irf = IndexRankFinder(name)
@@ -590,10 +618,12 @@ class ArgumentGuesser:
         temp_var_names = set(six.iterkeys(self.temporary_variables))
 
         for insn in self.instructions:
-            if isinstance(insn, Assignment):
-                if insn.temp_var_type is not None:
-                    (assignee_var_name, _), = insn.assignees_and_indices()
-                    temp_var_names.add(assignee_var_name)
+            if isinstance(insn, MultiAssignmentBase):
+                for (assignee_var_name, _), temp_var_type in zip(
+                        insn.assignees_and_indices(),
+                        insn.temp_var_types):
+                    if temp_var_type is not None:
+                        temp_var_names.add(assignee_var_name)
 
         # }}}
 
@@ -787,7 +817,7 @@ def expand_cses(instructions, cse_prefix="cse_expr"):
     new_temp_vars = []
 
     for insn in instructions:
-        if isinstance(insn, Assignment):
+        if isinstance(insn, MultiAssignmentBase):
             new_insns.append(insn.copy(expression=cseam(insn.expression)))
         else:
             new_insns.append(insn)
@@ -806,29 +836,36 @@ def create_temporaries(knl, default_order):
     import loopy as lp
 
     for insn in knl.instructions:
-        if isinstance(insn, Assignment) \
-                and insn.temp_var_type is not None:
-            (assignee_name, _), = insn.assignees_and_indices()
-
-            if assignee_name in new_temp_vars:
-                raise RuntimeError("cannot create temporary variable '%s'--"
-                        "already exists" % assignee_name)
-            if assignee_name in knl.arg_dict:
-                raise RuntimeError("cannot create temporary variable '%s'--"
-                        "already exists as argument" % assignee_name)
-
-            logger.debug("%s: creating temporary %s"
-                    % (knl.name, assignee_name))
-
-            new_temp_vars[assignee_name] = lp.TemporaryVariable(
-                    name=assignee_name,
-                    dtype=insn.temp_var_type,
-                    is_local=lp.auto,
-                    base_indices=lp.auto,
-                    shape=lp.auto,
-                    order=default_order)
-
-            insn = insn.copy(temp_var_type=None)
+        if isinstance(insn, MultiAssignmentBase):
+            for (assignee_name, _), temp_var_type in zip(
+                    insn.assignees_and_indices(),
+                    insn.temp_var_types):
+
+                if temp_var_type is None:
+                    continue
+
+                if assignee_name in new_temp_vars:
+                    raise RuntimeError("cannot create temporary variable '%s'--"
+                            "already exists" % assignee_name)
+                if assignee_name in knl.arg_dict:
+                    raise RuntimeError("cannot create temporary variable '%s'--"
+                            "already exists as argument" % assignee_name)
+
+                logger.debug("%s: creating temporary %s"
+                        % (knl.name, assignee_name))
+
+                new_temp_vars[assignee_name] = lp.TemporaryVariable(
+                        name=assignee_name,
+                        dtype=temp_var_type,
+                        is_local=lp.auto,
+                        base_indices=lp.auto,
+                        shape=lp.auto,
+                        order=default_order)
+
+                if isinstance(insn, Assignment):
+                    insn = insn.copy(temp_var_type=None)
+                else:
+                    insn = insn.copy(temp_var_types=None)
 
         new_insns.append(insn)
 
@@ -932,8 +969,8 @@ def guess_arg_shape_if_requested(kernel, default_order):
 
             try:
                 for insn in kernel.instructions:
-                    if isinstance(insn, lp.Assignment):
-                        armap(submap(insn.assignee), kernel.insn_inames(insn))
+                    if isinstance(insn, lp.MultiAssignmentBase):
+                        armap(submap(insn.assignees), kernel.insn_inames(insn))
                         armap(submap(insn.expression), kernel.insn_inames(insn))
             except TypeError as e:
                 from traceback import print_exc
@@ -1133,10 +1170,9 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     :arg default_offset: 0 or :class:`loopy.auto`. The default value of
         *offset* in :attr:`GlobalArg` for guessed arguments.
         Defaults to 0.
-    :arg function_manglers: list of functions of signature (name, arg_dtypes)
-        returning a tuple (result_dtype, c_name)
-        or a tuple (result_dtype, c_name, arg_dtypes),
-        where c_name is the C-level function to be called.
+    :arg function_manglers: list of functions of signature
+        ``(kernel, name, arg_dtypes)``
+        returning a :class:`loopy.CallMangleInfo`.
     :arg symbol_manglers: list of functions of signature (name) returning
         a tuple (result_dtype, c_name), where c_name is the C-level symbol to
         be evaluated.
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 30f9f3a10..b477b5fee 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -703,7 +703,7 @@ class InstructionBase(Record):
             result.append("priority=%d" % self.priority)
         if self.tags:
             result.append("tags=%s" % ":".join(self.tags))
-        if self.atomicity:
+        if getattr(self, "atomicity", None):
             result.append("atomic=%s" % ":".join(str(a) for a in self.atomicity))
 
         return result
@@ -930,9 +930,48 @@ class AtomicUpdate(VarAtomicity):
 # }}}
 
 
+# {{{ instruction base class: expression rhs
+
+class MultiAssignmentBase(InstructionBase):
+    """An assignment instruction with an expression as a right-hand side."""
+
+    fields = InstructionBase.fields | set(["expression"])
+
+    @memoize_method
+    def read_dependency_names(self):
+        from loopy.symbolic import get_dependencies
+        result = get_dependencies(self.expression)
+        for _, subscript in self.assignees_and_indices():
+            result = result | get_dependencies(subscript)
+
+        processed_predicates = frozenset(
+                pred.lstrip("!") for pred in self.predicates)
+
+        result = result | processed_predicates
+
+        return result
+
+    @memoize_method
+    def reduction_inames(self):
+        def map_reduction(expr, rec):
+            rec(expr.expr)
+            for iname in expr.inames:
+                result.add(iname)
+
+        from loopy.symbolic import ReductionCallbackMapper
+        cb_mapper = ReductionCallbackMapper(map_reduction)
+
+        result = set()
+        cb_mapper(self.expression)
+
+        return result
+
+# }}}
+
+
 # {{{ instruction: assignment
 
-class Assignment(InstructionBase):
+class Assignment(MultiAssignmentBase):
     """
     .. attribute:: assignee
 
@@ -983,8 +1022,8 @@ class Assignment(InstructionBase):
     .. automethod:: __init__
     """
 
-    fields = InstructionBase.fields | \
-            set("assignee expression temp_var_type atomicity".split())
+    fields = MultiAssignmentBase.fields | \
+            set("assignee temp_var_type atomicity".split())
 
     def __init__(self,
             assignee, expression,
@@ -1001,7 +1040,7 @@ class Assignment(InstructionBase):
             priority=0, predicates=frozenset(),
             insn_deps=None, insn_deps_is_final=None):
 
-        InstructionBase.__init__(self,
+        super(Assignment, self).__init__(
                 id=id,
                 depends_on=depends_on,
                 depends_on_is_final=depends_on_is_final,
@@ -1039,35 +1078,6 @@ class Assignment(InstructionBase):
 
     # {{{ implement InstructionBase interface
 
-    @memoize_method
-    def read_dependency_names(self):
-        from loopy.symbolic import get_dependencies
-        result = get_dependencies(self.expression)
-        for _, subscript in self.assignees_and_indices():
-            result = result | get_dependencies(subscript)
-
-        processed_predicates = frozenset(
-                pred.lstrip("!") for pred in self.predicates)
-
-        result = result | processed_predicates
-
-        return result
-
-    @memoize_method
-    def reduction_inames(self):
-        def map_reduction(expr, rec):
-            rec(expr.expr)
-            for iname in expr.inames:
-                result.add(iname)
-
-        from loopy.symbolic import ReductionCallbackMapper
-        cb_mapper = ReductionCallbackMapper(map_reduction)
-
-        result = set()
-        cb_mapper(self.expression)
-
-        return result
-
     @memoize_method
     def assignees_and_indices(self):
         return [_get_assignee_and_index(self.assignee)]
@@ -1106,6 +1116,18 @@ class Assignment(InstructionBase):
             else:
                 key_builder.rec(key_hash, getattr(self, field_name))
 
+    # {{{ for interface uniformity with CallInstruction
+
+    @property
+    def temp_var_types(self):
+        return (self.temp_var_type,)
+
+    @property
+    def assignees(self):
+        return (self.assignee,)
+
+    # }}}
+
 
 class ExpressionInstruction(Assignment):
     def __init__(self, *args, **kwargs):
@@ -1118,6 +1140,162 @@ class ExpressionInstruction(Assignment):
 # }}}
 
 
+# {{{ instruction: function call
+
+class CallInstruction(MultiAssignmentBase):
+    """An instruction capturing a function call. Unlike :class:`Assignment`,
+    this instruction supports functions with multiple return values.
+
+    .. attribute:: assignees
+
+    .. attribute:: expression
+
+    The following attributes are only used until
+    :func:`loopy.make_kernel` is finished:
+
+    .. attribute:: temp_var_types
+
+        if not *None*, a sequence of types (presumably one per entry of
+        *assignees*) for the temporary variables created from the assignees
+
+    .. automethod:: __init__
+    """
+
+    fields = MultiAssignmentBase.fields | \
+            set("assignees temp_var_types".split())
+
+    def __init__(self,
+            assignees, expression,
+            id=None,
+            depends_on=None,
+            depends_on_is_final=None,
+            groups=None,
+            conflicts_with_groups=None,
+            no_sync_with=None,
+            forced_iname_deps_is_final=None,
+            forced_iname_deps=frozenset(),
+            boostable=None, boostable_into=None, tags=None,
+            temp_var_types=None,
+            priority=0, predicates=frozenset(),
+            insn_deps=None, insn_deps_is_final=None):
+
+        super(CallInstruction, self).__init__(
+                id=id,
+                depends_on=depends_on,
+                depends_on_is_final=depends_on_is_final,
+                groups=groups,
+                conflicts_with_groups=conflicts_with_groups,
+                no_sync_with=no_sync_with,
+                forced_iname_deps_is_final=forced_iname_deps_is_final,
+                forced_iname_deps=forced_iname_deps,
+                boostable=boostable,
+                boostable_into=boostable_into,
+                priority=priority,
+                predicates=predicates,
+                tags=tags,
+                insn_deps=insn_deps,
+                insn_deps_is_final=insn_deps_is_final)
+
+        # Parse string inputs first so the type check below sees expressions.
+        from loopy.symbolic import Reduction, parse
+        if isinstance(assignees, str):
+            assignees = parse(assignees)
+        if isinstance(expression, str):
+            expression = parse(expression)
+
+        from pymbolic.primitives import Call
+        if not isinstance(expression, (Call, Reduction)) and expression is not None:
+            raise LoopyError("'expression' argument to CallInstruction "
+                    "must be a function call")
+
+        # FIXME: It may be worth it to enable this check eventually.
+        # For now, it causes grief with certain 'checky' uses of the
+        # with_transformed_expressions(). (notably the access checker)
+        #
+        # from pymbolic.primitives import Variable, Subscript
+        # if not isinstance(assignee, (Variable, Subscript)):
+        #     raise LoopyError("invalid lvalue '%s'" % assignee)
+
+        self.assignees = assignees
+        self.expression = expression
+        self.temp_var_types = temp_var_types
+
+    # {{{ implement InstructionBase interface
+
+    @memoize_method
+    def assignees_and_indices(self):
+        return [_get_assignee_and_index(a) for a in self.assignees]
+
+    def with_transformed_expressions(self, f, *args):
+        return self.copy(
+                assignees=f(self.assignees, *args),
+                expression=f(self.expression, *args))
+
+    # }}}
+
+    def __str__(self):
+        result = "%s: %s <- %s" % (self.id,
+                ", ".join(str(a) for a in self.assignees),
+                self.expression)
+
+        options = self.get_str_options()
+        if options:
+            result += " (%s)" % (": ".join(options))
+
+        if self.predicates:
+            result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates)
+        return result
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        """Custom hash computation function for use with
+        :class:`pytools.persistent_dict.PersistentDict`.
+
+        Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
+        """
+
+        # Order matters for hash forming--sort the fields.
+        for field_name in sorted(self.fields):
+            if field_name in ["assignees", "expression"]:
+                key_builder.update_for_pymbolic_expression(
+                        key_hash, getattr(self, field_name))
+            else:
+                key_builder.rec(key_hash, getattr(self, field_name))
+
+# }}}
+
+
+def make_assignment(assignees, expression, temp_var_types=None, **kwargs):
+    """Build an :class:`Assignment` for a single left-hand side, or a
+    :class:`CallInstruction` for several. Extra *kwargs* are forwarded.
+    """
+    if len(assignees) < 1:
+        raise LoopyError("every instruction must have a left-hand side")
+
+    if len(assignees) == 1:
+        return Assignment(
+                assignee=assignees[0],
+                expression=expression,
+                temp_var_type=(
+                    None if temp_var_types is None else temp_var_types[0]),
+                **kwargs)
+
+    if kwargs.pop("atomicity", ()):
+        raise LoopyError("atomic operations with more than one "
+                "left-hand side not supported")
+
+    from pymbolic.primitives import Call
+    from loopy.symbolic import Reduction
+    if not isinstance(expression, (Call, Reduction)):
+        raise LoopyError("right-hand side in multiple assignment must be "
+                "function call or reduction")
+
+    return CallInstruction(
+            assignees=assignees,
+            expression=expression,
+            temp_var_types=temp_var_types,
+            **kwargs)
+
+
 # {{{ c instruction
 
 class CInstruction(InstructionBase):
@@ -1294,4 +1472,35 @@ class CInstruction(InstructionBase):
 
 # }}}
 
+
+# {{{ function call mangling
+
+class CallMangleInfo(Record):
+    """
+    .. attribute:: target_name
+
+        A string. The name of the function to be called in the
+        generated target code.
+
+    .. attribute:: result_dtypes
+
+        A tuple of :class:`LoopyType` instances indicating what
+        types of values the function returns.
+
+    .. attribute:: arg_dtypes
+
+        A tuple of :class:`LoopyType` instances indicating what
+        types of arguments the function actually receives.
+    """
+
+    def __init__(self, target_name, result_dtypes, arg_dtypes):
+        if not isinstance(result_dtypes, tuple):
+            raise TypeError("result_dtypes must be a tuple")
+        super(CallMangleInfo, self).__init__(
+                target_name=target_name,
+                result_dtypes=result_dtypes,
+                arg_dtypes=arg_dtypes)
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index ef3cb3c8a..0a02979ab 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -423,11 +423,11 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
     dep_graph = {}
     lines = []
 
-    from loopy.kernel.data import Assignment, CInstruction
+    from loopy.kernel.data import MultiAssignmentBase, CInstruction
 
     for insn in kernel.instructions:
-        if isinstance(insn, Assignment):
-            op = "%s <- %s" % (insn.assignee, insn.expression)
+        if isinstance(insn, MultiAssignmentBase):
+            op = "%s <- %s" % (insn.assignees, insn.expression)
             if len(op) > 200:
                 op = op[:200] + "..."
 
@@ -657,8 +657,9 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 
     from pymbolic.primitives import Subscript
 
-    if isinstance(insn.assignee, Subscript):
-        ary_acc_exprs.append(insn.assignee)
+    for assignee in insn.assignees:
+        if isinstance(assignee, Subscript):
+            ary_acc_exprs.append(assignee)
 
     # }}}
 
@@ -855,7 +856,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
     import loopy as lp
 
     for insn in kernel.instructions:
-        if not isinstance(insn, lp.Assignment):
+        if not isinstance(insn, lp.MultiAssignmentBase):
             continue
 
         auto_axis_inames = [
diff --git a/loopy/library/function.py b/loopy/library/function.py
index df623a477..efa590371 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -38,7 +38,9 @@ def default_function_mangler(kernel, name, arg_dtypes):
 def single_arg_function_mangler(kernel, name, arg_dtypes):
     if len(arg_dtypes) == 1:
         dtype, = arg_dtypes
-        return dtype, name
+
+        from loopy.kernel.data import CallMangleInfo
+        return CallMangleInfo(name, (dtype,), (dtype,))
 
     return None
 
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index b39115a35..1540222b2 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -36,7 +36,7 @@ class ReductionOperation(object):
     equality-comparable.
     """
 
-    def result_dtype(self, target, arg_dtype, inames):
+    def result_dtypes(self, kernel, arg_dtype, inames):
         raise NotImplementedError
 
     def neutral_element(self, dtype, inames):
@@ -82,11 +82,12 @@ class ScalarReductionOperation(ReductionOperation):
         """
         self.forced_result_type = forced_result_type
 
-    def result_dtype(self, target, arg_dtype, inames):
+    def result_dtypes(self, kernel, arg_dtype, inames):
         if self.forced_result_type is not None:
-            return self.parse_result_type(target, self.forced_result_type)
+            return (self.parse_result_type(
+                    kernel.target, self.forced_result_type),)
 
-        return arg_dtype
+        return (arg_dtype,)
 
     def __hash__(self):
         return hash((type(self), self.forced_result_type))
@@ -148,23 +149,12 @@ class MinReductionOperation(ScalarReductionOperation):
 
 # {{{ argmin/argmax
 
-ARGEXT_STRUCT_DTYPES = {}
-
-
 class _ArgExtremumReductionOperation(ReductionOperation):
     def prefix(self, dtype):
         return "loopy_arg%s_%s" % (self.which, dtype.numpy_dtype.type.__name__)
 
-    def result_dtype(self, target, dtype, inames):
-        try:
-            return ARGEXT_STRUCT_DTYPES[dtype]
-        except KeyError:
-            struct_dtype = np.dtype([("value", dtype), ("index", np.int32)])
-            ARGEXT_STRUCT_DTYPES[dtype] = NumpyType(struct_dtype, target)
-
-            target.get_or_register_dtype(self.prefix(dtype)+"_result",
-                    NumpyType(struct_dtype))
-            return ARGEXT_STRUCT_DTYPES[dtype]
+    def result_dtypes(self, kernel, dtype, inames):
+        return (dtype, kernel.index_dtype)
 
     def neutral_element(self, dtype, inames):
         return ArgExtFunction(self, dtype, "init", inames)()
@@ -179,7 +169,7 @@ class _ArgExtremumReductionOperation(ReductionOperation):
         iname, = inames
 
         return ArgExtFunction(self, dtype, "update", inames)(
-                operand1, operand2, var(iname))
+                *(operand1 + (operand2, var(iname))))
 
 
 class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
@@ -207,7 +197,7 @@ class ArgExtFunction(FunctionIdentifier):
         return (self.reduction_op, self.scalar_dtype, self.name, self.inames)
 
 
-def get_argext_preamble(target, func_id):
+def get_argext_preamble(kernel, func_id):
     op = func_id.reduction_op
     prefix = op.prefix(func_id.scalar_dtype)
 
@@ -216,35 +206,32 @@ def get_argext_preamble(target, func_id):
     c_code_mapper = CCodeMapper()
 
     return (prefix, """
-    typedef struct {
-        %(scalar_type)s value;
-        int index;
-    } %(type_name)s;
-
-    inline %(type_name)s %(prefix)s_init()
+    inline %(scalar_t)s %(prefix)s_init(%(index_t)s *index_out)
     {
-        %(type_name)s result;
-        result.value = %(neutral)s;
-        result.index = INT_MIN;
-        return result;
+        *index_out = INT_MIN;
+        return %(neutral)s;
     }
 
-    inline %(type_name)s %(prefix)s_update(
-        %(type_name)s state, %(scalar_type)s op2, int index)
+    inline %(scalar_t)s %(prefix)s_update(
+        %(scalar_t)s op1, %(index_t)s index1,
+        %(scalar_t)s op2, %(index_t)s index2,
+        %(index_t)s *index_out)
     {
-        %(type_name)s result;
-        if (op2 %(comp)s state.value)
+        if (op2 %(comp)s op1)
+        {
+            *index_out = index2;
+            return op2;
+        }
+        else
         {
-            result.value = op2;
-            result.index = index;
-            return result;
+            *index_out = index1;
+            return op1;
         }
-        else return state;
     }
     """ % dict(
-            type_name=prefix+"_result",
-            scalar_type=target.dtype_to_typename(func_id.scalar_dtype),
+            scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype),
             prefix=prefix,
+            index_t=kernel.target.dtype_to_typename(kernel.index_dtype),
             neutral=c_code_mapper(
                 op.neutral_sign*get_le_neutral(func_id.scalar_dtype)),
             comp=op.update_comparison,
@@ -308,8 +295,19 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes):
             raise LoopyError("only OpenCL supported for now")
 
         op = func_id.reduction_op
-        return (op.result_dtype(kernel.target, func_id.scalar_dtype, func_id.inames),
-                "%s_%s" % (op.prefix(func_id.scalar_dtype), func_id.name))
+
+        from loopy.kernel.data import CallMangleInfo
+        return CallMangleInfo(
+                target_name="%s_%s" % (
+                    op.prefix(func_id.scalar_dtype), func_id.name),
+                result_dtypes=op.result_dtypes(
+                    kernel, func_id.scalar_dtype, func_id.inames),
+                arg_dtypes=(
+                    func_id.scalar_dtype,
+                    kernel.index_dtype,
+                    func_id.scalar_dtype,
+                    kernel.index_dtype),
+                )
 
     return None
 
@@ -322,6 +320,6 @@ def reduction_preamble_generator(preamble_info):
             if not isinstance(preamble_info.kernel.target, OpenCLTarget):
                 raise LoopyError("only OpenCL supported for now")
 
-            yield get_argext_preamble(preamble_info.kernel.target, func.name)
+            yield get_argext_preamble(preamble_info.kernel, func.name)
 
 # vim: fdm=marker
diff --git a/loopy/maxima.py b/loopy/maxima.py
index 1a2d0770c..29f974ff9 100644
--- a/loopy/maxima.py
+++ b/loopy/maxima.py
@@ -82,7 +82,7 @@ def get_loopy_instructions_as_maxima(kernel, prefix):
         if not isinstance(insn, InstructionBase):
             insn = kernel.id_to_insn[insn]
         if not isinstance(insn, Assignment):
-            raise RuntimeError("non-expression instructions not supported "
+            raise RuntimeError("non-single-output assignment not supported "
                     "in maxima export")
 
         for dep in insn.depends_on:
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index e25c1516b..51d588ef5 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -31,6 +31,7 @@ from loopy.diagnostic import (
 from pytools.persistent_dict import PersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
+from loopy.kernel.data import make_assignment
 
 import logging
 logger = logging.getLogger(__name__)
@@ -123,14 +124,26 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
     from loopy.diagnostic import DependencyTypeInferenceFailure
     for writer_insn_id in kernel.writer_map().get(var_name, []):
         writer_insn = kernel.id_to_insn[writer_insn_id]
-        if not isinstance(writer_insn, lp.Assignment):
+        if not isinstance(writer_insn, lp.MultiAssignmentBase):
             continue
 
         expr = subst_expander(writer_insn.expression)
 
         try:
             debug("             via expr %s" % expr)
-            result = type_inf_mapper(expr)
+            if isinstance(writer_insn, lp.Assignment):
+                result = type_inf_mapper(expr)
+            elif isinstance(writer_insn, lp.CallInstruction):
+                result_dtypes = type_inf_mapper(expr, multiple_types_ok=True)
+
+                result = None
+                for (assignee, _), comp_dtype in zip(
+                        writer_insn.assignees_and_indices(), result_dtypes):
+                    if assignee == var_name:
+                        result = comp_dtype
+                        break
+
+                assert result is not None
 
             debug("             result: %s" % result)
 
@@ -438,35 +451,44 @@ def realize_reduction(kernel, insn_id_filter=None):
 
     new_insns = []
 
+    insn_id_gen = kernel.get_instruction_id_generator()
+
     var_name_gen = kernel.get_var_name_generator()
     new_temporary_variables = kernel.temporary_variables.copy()
 
     from loopy.expression import TypeInferenceMapper
     type_inf_mapper = TypeInferenceMapper(kernel)
 
-    def map_reduction(expr, rec):
+    def map_reduction(expr, rec, multiple_values_ok=False):
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
 
-        from pymbolic import var
-
-        target_var_name = var_name_gen("acc_"+"_".join(expr.inames))
-        target_var = var(target_var_name)
-
         try:
             arg_dtype = type_inf_mapper(expr.expr)
         except DependencyTypeInferenceFailure:
             raise LoopyError("failed to determine type of accumulator for "
                     "reduction '%s'" % expr)
 
-        from loopy.kernel.data import Assignment, TemporaryVariable
+        reduction_dtypes = expr.operation.result_dtypes(
+                    kernel, arg_dtype, expr.inames)
+
+        ncomp = len(reduction_dtypes)
+
+        from pymbolic import var
+
+        acc_var_names = [
+                var_name_gen("acc_"+"_".join(expr.inames))
+                for i in range(ncomp)]
+        acc_vars = tuple(var(n) for n in acc_var_names)
+
+        from loopy.kernel.data import TemporaryVariable
 
-        new_temporary_variables[target_var_name] = TemporaryVariable(
-                name=target_var_name,
-                shape=(),
-                dtype=expr.operation.result_dtype(
-                    kernel.target, arg_dtype, expr.inames),
-                is_local=False)
+        for name, dtype in zip(acc_var_names, reduction_dtypes):
+            new_temporary_variables[name] = TemporaryVariable(
+                    name=name,
+                    shape=(),
+                    dtype=dtype,
+                    is_local=False)
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
         bad_inames = frozenset(expr.inames) & outer_insn_inames
@@ -474,13 +496,12 @@ def realize_reduction(kernel, insn_id_filter=None):
             raise LoopyError("reduction used within loop(s) that it was "
                     "supposed to reduce over: " + ", ".join(bad_inames))
 
-        init_id = temp_kernel.make_unique_instruction_id(
-                based_on="%s_%s_init" % (insn.id, "_".join(expr.inames)),
-                extra_used_ids=set(i.id for i in generated_insns))
+        init_id = insn_id_gen(
+                "%s_%s_init" % (insn.id, "_".join(expr.inames)))
 
-        init_insn = Assignment(
+        init_insn = make_assignment(
                 id=init_id,
-                assignee=target_var,
+                assignees=acc_vars,
                 forced_iname_deps=outer_insn_inames - frozenset(expr.inames),
                 forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
                 depends_on=frozenset(),
@@ -488,19 +509,20 @@ def realize_reduction(kernel, insn_id_filter=None):
 
         generated_insns.append(init_insn)
 
-        update_id = temp_kernel.make_unique_instruction_id(
-                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)),
-                extra_used_ids=set(i.id for i in generated_insns))
+        update_id = insn_id_gen(
+                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))
 
         update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
         if insn.forced_iname_deps_is_final:
             update_insn_iname_deps = insn.forced_iname_deps | set(expr.inames)
 
-        reduction_insn = Assignment(
+        reduction_insn = make_assignment(
                 id=update_id,
-                assignee=target_var,
+                assignees=acc_vars,
                 expression=expr.operation(
-                    arg_dtype, target_var, expr.expr, expr.inames),
+                    arg_dtype,
+                    acc_vars if len(acc_vars) > 1 else acc_vars[0],
+                    expr.expr, expr.inames),
                 depends_on=frozenset([init_insn.id]) | insn.depends_on,
                 forced_iname_deps=update_insn_iname_deps,
                 forced_iname_deps_is_final=insn.forced_iname_deps_is_final)
@@ -509,12 +531,17 @@ def realize_reduction(kernel, insn_id_filter=None):
 
         new_insn_depends_on.add(reduction_insn.id)
 
-        return target_var
+        if multiple_values_ok:
+            return acc_vars
+        else:
+            assert len(acc_vars) == 1
+            return acc_vars[0]
 
     from loopy.symbolic import ReductionCallbackMapper
     cb_mapper = ReductionCallbackMapper(map_reduction)
 
     insn_queue = kernel.instructions[:]
+    insn_id_replacements = {}
 
     temp_kernel = kernel
 
@@ -526,24 +553,47 @@ def realize_reduction(kernel, insn_id_filter=None):
         insn = insn_queue.pop(0)
 
         if insn_id_filter is not None and insn.id != insn_id_filter \
-                or not isinstance(insn, lp.Assignment):
+                or not isinstance(insn, lp.MultiAssignmentBase):
             new_insns.append(insn)
             continue
 
         # Run reduction expansion.
-        new_expression = cb_mapper(insn.expression)
+        from loopy.symbolic import Reduction
+        if isinstance(insn.expression, Reduction):
+            new_expressions = cb_mapper(insn.expression, multiple_values_ok=True)
+        else:
+            new_expressions = (cb_mapper(insn.expression),)
 
         if generated_insns:
             # An expansion happened, so insert the generated stuff plus
             # ourselves back into the queue.
 
-            insn = insn.copy(
-                        expression=new_expression,
-                        depends_on=insn.depends_on
-                        | frozenset(new_insn_depends_on),
-                        forced_iname_deps=temp_kernel.insn_inames(insn))
+            kwargs = insn.get_copy_kwargs(
+                    depends_on=insn.depends_on
+                    | frozenset(new_insn_depends_on),
+                    forced_iname_deps=temp_kernel.insn_inames(insn))
+
+            kwargs.pop("id")
+            kwargs.pop("expression")
+            kwargs.pop("assignee", None)
+            kwargs.pop("assignees", None)
+            kwargs.pop("temp_var_type", None)
+            kwargs.pop("temp_var_types", None)
 
-            insn_queue = generated_insns + [insn] + insn_queue
+            replacement_insns = [
+                    lp.Assignment(
+                        id=insn_id_gen(insn.id),
+                        assignee=assignee,
+                        expression=new_expr,
+                        **kwargs)
+                    for assignee, new_expr in zip(insn.assignees, new_expressions)]
+
+            insn_id_replacements[insn.id] = [
+                    rinsn.id for rinsn in replacement_insns]
+
+            # FIXME: Track dep rename
+
+            insn_queue = generated_insns + replacement_insns + insn_queue
 
             # The reduction expander needs an up-to-date kernel
             # object to find dependencies. Keep temp_kernel up-to-date.
@@ -558,10 +608,12 @@ def realize_reduction(kernel, insn_id_filter=None):
 
             new_insns.append(insn)
 
-    return kernel.copy(
+    kernel = kernel.copy(
             instructions=new_insns,
             temporary_variables=new_temporary_variables)
 
+    return lp.replace_instruction_ids(kernel, insn_id_replacements)
+
 # }}}
 
 
diff --git a/loopy/schedule.py b/loopy/schedule.py
index b606ba360..8094995d7 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -347,7 +347,7 @@ def format_insn(kernel, insn_id):
     Style = kernel.options._style
     return "[%s] %s%s%s <- %s%s%s" % (
             format_insn_id(kernel, insn_id),
-            Fore.BLUE, str(insn.assignee), Style.RESET_ALL,
+            Fore.BLUE, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL,
             Fore.MAGENTA, str(insn.expression), Style.RESET_ALL)
 
 
@@ -355,7 +355,7 @@ def dump_schedule(kernel, schedule):
     lines = []
     indent = ""
 
-    from loopy.kernel.data import Assignment
+    from loopy.kernel.data import MultiAssignmentBase
     for sched_item in schedule:
         if isinstance(sched_item, EnterLoop):
             lines.append(indent + "LOOP %s" % sched_item.iname)
@@ -365,7 +365,7 @@ def dump_schedule(kernel, schedule):
             lines.append(indent + "ENDLOOP %s" % sched_item.iname)
         elif isinstance(sched_item, RunInstruction):
             insn = kernel.id_to_insn[sched_item.insn_id]
-            if isinstance(insn, Assignment):
+            if isinstance(insn, MultiAssignmentBase):
                 insn_str = format_insn(kernel, sched_item.insn_id)
             else:
                 insn_str = sched_item.insn_id
diff --git a/loopy/statistics.py b/loopy/statistics.py
index c5eb3142d..b0e5ad701 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -31,7 +31,7 @@ import islpy as isl
 from pytools import memoize_in
 from pymbolic.mapper import CombineMapper
 from functools import reduce
-from loopy.kernel.data import Assignment
+from loopy.kernel.data import MultiAssignmentBase
 from loopy.diagnostic import warn
 
 
@@ -849,7 +849,7 @@ def gather_access_footprints(kernel):
     read_footprints = []
 
     for insn in kernel.instructions:
-        if not isinstance(insn, Assignment):
+        if not isinstance(insn, MultiAssignmentBase):
             warn(kernel, "count_non_assignment",
                     "Non-assignment instruction encountered in "
                     "gather_access_footprints, not counted")
@@ -861,7 +861,8 @@ def gather_access_footprints(kernel):
 
         afg = AccessFootprintGatherer(kernel, domain)
 
-        write_footprints.append(afg(insn.assignee))
+        for assignee in insn.assignees:
+            write_footprints.append(afg(assignee))
         read_footprints.append(afg(insn.expression))
 
     write_footprints = AccessFootprintGatherer.combine(write_footprints)
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index b887c7034..219d66d49 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -33,7 +33,7 @@ from pytools import memoize, memoize_method, Record
 import pytools.lex
 
 from pymbolic.primitives import (
-        Leaf, AlgebraicLeaf, Variable,
+        Leaf, AlgebraicLeaf, Expression, Variable,
         CommonSubexpression)
 
 from pymbolic.mapper import (
@@ -98,6 +98,9 @@ class IdentityMapperMixin(object):
         # leaf, doesn't change
         return expr
 
+    def map_type_annotation(self, expr, *args):
+        return TypeAnnotation(expr.type, self.rec(expr.child, *args))
+
     map_linear_subscript = IdentityMapperBase.map_subscript
 
 
@@ -321,6 +324,18 @@ class TypedCSE(CommonSubexpression):
         return dict(dtype=self.dtype)
 
 
+class TypeAnnotation(Expression):
+    """Expression node pairing a *type* with a *child* expression."""
+    mapper_method = intern("map_type_annotation")
+
+    def __init__(self, type, child):
+        super(TypeAnnotation, self).__init__()
+        self.type, self.child = type, child
+
+    def __getinitargs__(self):
+        return (self.type, self.child)
+
+
 class TaggedVariable(Variable):
     """This is an identifier with a tag, such as 'matrix$one', where
     'one' identifies this specific use of the identifier. This mechanism
@@ -882,7 +897,6 @@ class FunctionToPrimitiveMapper(IdentityMapper):
 # {{{ customization to pymbolic parser
 
 _open_dbl_bracket = intern("open_dbl_bracket")
-_close_dbl_bracket = intern("close_dbl_bracket")
 
 TRAILING_FLOAT_TAG_RE = re.compile("^(.*?)([a-zA-Z]*)$")
 
@@ -908,6 +922,26 @@ class LoopyParser(ParserBase):
         else:
             return float(val)  # generic float
 
+    def parse_prefix(self, pstate):
+        """Parse an optional leading ``<type>`` or ``<>`` annotation."""
+        from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier
+        if not pstate.is_next(_less):
+            return super(LoopyParser, self).parse_prefix(pstate)
+
+        pstate.advance()
+        # An empty '<>' leaves the type name as None.
+        typename = None
+        if not pstate.is_next(_greater):
+            pstate.expect(_identifier)
+            typename = pstate.next_str()
+            pstate.advance()
+            pstate.expect(_greater)
+        pstate.advance()
+
+        return TypeAnnotation(
+                typename,
+                self.parse_expression(pstate, _PREC_UNARY))
+
     def parse_postfix(self, pstate, min_precedence, left_exp):
         from pymbolic.parser import _PREC_CALL, _closebracket
         if pstate.next_tag() is _open_dbl_bracket and _PREC_CALL > min_precedence:
@@ -1079,10 +1113,10 @@ class ReductionCallbackMapper(IdentityMapper):
     def __init__(self, callback):
         self.callback = callback
 
-    def map_reduction(self, expr):
-        result = self.callback(expr, self.rec)
+    def map_reduction(self, expr, **kwargs):
+        result = self.callback(expr, self.rec, **kwargs)
         if result is None:
-            return IdentityMapper.map_reduction(self, expr)
+            return IdentityMapper.map_reduction(self, expr, **kwargs)
         return result
 
 # }}}
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 42a9e5a3d..cad451c2e 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -133,6 +133,9 @@ class TargetBase(object):
     def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
         raise NotImplementedError()
 
+    def generate_multiple_assignment(self, codegen_state, insn):
+        raise NotImplementedError()
+
     def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype):
         raise NotImplementedError("atomic update in target %s" % type(self).__name__)
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 822ee1838..c6f956253 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -341,6 +341,91 @@ class CTarget(TargetBase):
                 "++%s" % iname,
                 inner)
 
+    def generate_multiple_assignment(self, codegen_state, insn):
+        """Generate code for *insn*, a call to a function with multiple
+        return values.
+
+        The first assignee receives the value returned by the call. Each
+        further assignee is passed by address as an additional trailing
+        argument of the call.
+
+        :arg codegen_state: supplies the expression-to-code mapper, the
+            kernel, and the set of seen functions.
+        :arg insn: the instruction; its *expression* is the call and its
+            *assignees* are the left-hand sides.
+        :returns: a :class:`cgen.Assign`.
+        :raises RuntimeError: if the kernel cannot mangle the function.
+        :raises LoopyError: if the type of a second or later assignee
+            differs from the corresponding result type of the function.
+        """
+        ecm = codegen_state.expression_to_code_mapper
+
+        from pymbolic.primitives import Variable
+        from pymbolic.mapper.stringifier import PREC_NONE
+
+        func_id = insn.expression.function
+        parameters = insn.expression.parameters
+
+        if isinstance(func_id, Variable):
+            func_id = func_id.name
+
+        assignee_var_descriptors = [codegen_state.kernel.get_var_descriptor(a)
+                for a, _ in insn.assignees_and_indices()]
+
+        par_dtypes = tuple(ecm.infer_type(par) for par in parameters)
+
+        mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes)
+        if mangle_result is None:
+            raise RuntimeError("function '%s' unknown--"
+                    "maybe you need to register a function mangler?"
+                    % func_id)
+
+        assert mangle_result.arg_dtypes is not None
+
+        from loopy.expression import dtype_to_type_context
+        str_parameters = [
+                ecm(par, PREC_NONE,
+                    dtype_to_type_context(self, tgt_dtype),
+                    tgt_dtype)
+                for par, tgt_dtype in zip(
+                    parameters, mangle_result.arg_dtypes)]
+
+        from loopy.codegen import SeenFunction
+        codegen_state.seen_functions.add(
+                SeenFunction(func_id,
+                    mangle_result.target_name,
+                    mangle_result.arg_dtypes))
+
+        from loopy.diagnostic import LoopyError
+
+        # The first result is the call's return value and is handled below.
+        # 'i' is the 1-based position of the assignee, hence start=2.
+        for i, (a, tgt_dtype) in enumerate(
+                zip(insn.assignees[1:], mangle_result.result_dtypes[1:]),
+                start=2):
+            if tgt_dtype != ecm.infer_type(a):
+                raise LoopyError("type mismatch in %d'th (1-based) left-hand "
+                        "side of instruction '%s'" % (i, insn.id))
+            str_parameters.append(
+                    "&(%s)" % ecm(a, PREC_NONE,
+                        dtype_to_type_context(self, tgt_dtype),
+                        tgt_dtype))
+
+        result = "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
+
+        # Insert a cast if the return type and the first assignee's type differ.
+        result = ecm.wrap_in_typecast(
+                mangle_result.result_dtypes[0],
+                assignee_var_descriptors[0].dtype,
+                result)
+
+        lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)
+
+        from cgen import Assign
+        return Assign(
+                lhs_code,
+                result)
+
     # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 63a053c58..11686dcdc 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -34,7 +34,7 @@ import islpy as isl
 
 from loopy.expression import dtype_to_type_context, TypeInferenceMapper
 
-from loopy.diagnostic import LoopyError
+from loopy.diagnostic import LoopyError, LoopyWarning
 from loopy.tools import is_integer
 from loopy.types import LoopyType
 
@@ -94,22 +94,23 @@ class LoopyCCodeMapper(RecursiveMapper):
 
         return ary
 
-    def rec(self, expr, prec, type_context=None, needed_dtype=None):
-        if needed_dtype is None:
-            return RecursiveMapper.rec(self, expr, prec, type_context)
-
-        actual_type = self.infer_type(expr)
-
+    def wrap_in_typecast(self, actual_type, needed_dtype, s):
         if (actual_type.is_complex() and needed_dtype.is_complex()
                 and actual_type != needed_dtype):
-            result = RecursiveMapper.rec(self, expr, PREC_NONE, type_context)
-            return "%s_cast(%s)" % (self.complex_type_name(needed_dtype), result)
+            return "%s_cast(%s)" % (self.complex_type_name(needed_dtype), s)
         elif not actual_type.is_complex() and needed_dtype.is_complex():
-            result = RecursiveMapper.rec(self, expr, PREC_NONE, type_context)
-            return "%s_fromreal(%s)" % (self.complex_type_name(needed_dtype), result)
+            return "%s_fromreal(%s)" % (self.complex_type_name(needed_dtype), s)
         else:
+            return s
+
+    def rec(self, expr, prec, type_context=None, needed_dtype=None):
+        if needed_dtype is None:
             return RecursiveMapper.rec(self, expr, prec, type_context)
 
+        return self.wrap_in_typecast(
+                self.infer_type(expr), needed_dtype,
+                RecursiveMapper.rec(self, expr, PREC_NONE, type_context))
+
     __call__ = rec
 
     # }}}
@@ -419,37 +420,32 @@ class LoopyCCodeMapper(RecursiveMapper):
 
         # }}}
 
-        c_name = None
         if isinstance(identifier, Variable):
             identifier = identifier.name
-            c_name = identifier
 
         par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)
 
         str_parameters = None
 
         mangle_result = self.kernel.mangle_function(identifier, par_dtypes)
-        if mangle_result is not None:
-            if len(mangle_result) == 2:
-                result_dtype, c_name = mangle_result
-            elif len(mangle_result) == 3:
-                result_dtype, c_name, arg_tgt_dtypes = mangle_result
-
-                str_parameters = [
-                        self.rec(par, PREC_NONE,
-                            dtype_to_type_context(self.kernel.target, tgt_dtype),
-                            tgt_dtype)
-                        for par, par_dtype, tgt_dtype in zip(
-                            expr.parameters, par_dtypes, arg_tgt_dtypes)]
-            else:
-                raise RuntimeError("result of function mangler "
-                        "for function '%s' not understood"
-                        % identifier)
+        if mangle_result is None:
+            raise RuntimeError("function '%s' unknown--"
+                    "maybe you need to register a function mangler?"
+                    % identifier)
 
-        from loopy.codegen import SeenFunction
-        self.codegen_state.seen_functions.add(
-                SeenFunction(identifier, c_name, par_dtypes))
-        if str_parameters is None:
+        if len(mangle_result.result_dtypes) != 1:
+            raise LoopyError("functions with more or fewer than one return value "
+                    "may not be used in an expression")
+
+        if mangle_result.arg_dtypes is not None:
+            str_parameters = [
+                    self.rec(par, PREC_NONE,
+                        dtype_to_type_context(self.kernel.target, tgt_dtype),
+                        tgt_dtype)
+                    for par, par_dtype, tgt_dtype in zip(
+                        expr.parameters, par_dtypes, mangle_result.arg_dtypes)]
+
+        else:
             # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
             # propagate the type context here. But for many others, it does
             # not. Using the inferred type as a stopgap for now.
@@ -459,11 +455,18 @@ class LoopyCCodeMapper(RecursiveMapper):
                             self.kernel.target, par_dtype))
                     for par, par_dtype in zip(expr.parameters, par_dtypes)]
 
-        if c_name is None:
-            raise RuntimeError("unable to find C name for function identifier '%s'"
-                    % identifier)
+            from warnings import warn
+            warn("Calling function '%s' with unknown C signature--"
+                    "return CallMangleInfo.arg_dtypes"
+                    % identifier, LoopyWarning)
+
+        from loopy.codegen import SeenFunction
+        self.codegen_state.seen_functions.add(
+                SeenFunction(identifier,
+                    mangle_result.target_name,
+                    mangle_result.arg_dtypes or par_dtypes))
 
-        return "%s(%s)" % (c_name, ", ".join(str_parameters))
+        return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
 
     # {{{ deal with complex-valued variables
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 5cb6fbc19..362cbb79a 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -32,7 +32,7 @@ from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
 from loopy.target.c import DTypeRegistryWrapper
-from loopy.kernel.data import temp_var_scope
+from loopy.kernel.data import temp_var_scope, CallMangleInfo
 
 
 # {{{ dtype registry wrappers
@@ -146,7 +146,7 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
     if not isinstance(name, str):
         return None
 
-    if name in ["max", "min"] and len(arg_dtypes) == 2:
+    if name in ["max", "min", "atan2"] and len(arg_dtypes) == 2:
         dtype = np.find_common_type(
                 [], [dtype.numpy_dtype for dtype in arg_dtypes])
 
@@ -156,14 +156,18 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
-        if dtype.kind == "f":
+        if dtype.kind == "f" and name in ["max", "min"]:  # no "fatan2" exists
             name = "f" + name
 
-        return NumpyType(dtype), name
-
-    if name in "atan2" and len(arg_dtypes) == 2:
-        return arg_dtypes[0], name
+        result_dtype = NumpyType(dtype)
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=(result_dtype,),
+                arg_dtypes=2*(result_dtype,))
 
     if name == "dot":
         scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
-        return NumpyType(scalar_dtype), name
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=(NumpyType(scalar_dtype),),
+                arg_dtypes=(arg_dtypes[0],)*2)
 
     if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
         num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
@@ -178,7 +182,11 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
             raise LoopyError("%s does not support complex numbers"
                     % name)
 
-        return NumpyType(dtype), name
+        result_dtype = NumpyType(dtype)
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=(result_dtype,),
+                arg_dtypes=(result_dtype,)*num_args)  # one dtype per argument
 
     return None
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index aea247397..6097cd7a2 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -29,6 +29,7 @@ from six.moves import range
 
 import numpy as np
 
+from loopy.kernel.data import CallMangleInfo
 from loopy.target.opencl import OpenCLTarget
 from loopy.types import NumpyType
 
@@ -194,13 +195,18 @@ def pyopencl_function_mangler(target, name, arg_dtypes):
                     "sin", "cos", "tan",
                     "sinh", "cosh", "tanh",
                     "conj"]:
-                return arg_dtype, "%s_%s" % (tpname, name)
+                return CallMangleInfo(
+                        target_name="%s_%s" % (tpname, name),
+                        result_dtypes=(arg_dtype,),
+                        arg_dtypes=(arg_dtype,))
 
             if name in ["real", "imag", "abs"]:
-                return (
-                        NumpyType(
+                return CallMangleInfo(
+                        target_name="%s_%s" % (tpname, name),
+                        result_dtypes=(NumpyType(
                             np.dtype(arg_dtype.numpy_dtype.type(0).real)),
-                        "%s_%s" % (tpname, name))
+                            ),
+                        arg_dtypes=(arg_dtype,))
 
     return None
 
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 25634c919..8b2fa370a 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -124,6 +124,38 @@ def remove_instructions(kernel, insn_ids):
     return kernel.copy(
             instructions=new_insns)
 
+
+def replace_instruction_ids(kernel, replacements):
+    """Return a copy of *kernel* in which instruction dependencies on
+    replaced instruction IDs are rewritten.
+
+    :arg replacements: a mapping from an instruction ID to an iterable
+        of instruction IDs. Every occurrence of a key in an instruction's
+        *depends_on* is replaced by all of the corresponding IDs.
+
+    Instructions whose dependencies mention none of the keys are reused
+    unchanged. Only *depends_on* is rewritten; the IDs of the
+    instructions themselves are not changed.
+    """
+    new_insns = []
+
+    for insn in kernel.instructions:
+        changed = False
+        new_depends_on = []
+
+        for dep in insn.depends_on:
+            if dep in replacements:
+                new_depends_on.extend(replacements[dep])
+                changed = True
+            else:
+                new_depends_on.append(dep)
+
+        new_insns.append(
+                insn.copy(depends_on=frozenset(new_depends_on))
+                if changed else insn)
+
+    return kernel.copy(instructions=new_insns)
+
 # }}}
 
 
diff --git a/loopy/version.py b/loopy/version.py
index cd9f45ac3..7716feea3 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v25-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v26-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index 5ed095315..30d6dfb63 100644
--- a/setup.py
+++ b/setup.py
@@ -43,6 +43,7 @@ setup(name="loo.py",
           "islpy>=2016.1.2",
           "six>=1.8.0",
           "colorama",
+          "Mako",
           ],
 
       extras_require={
diff --git a/test/test_loopy.py b/test/test_loopy.py
index f7ef0db33..7474b5128 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -262,7 +262,7 @@ def test_join_inames(ctx_factory):
     knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"])
     knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"])
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl)
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True)
 
 
 def test_divisibility_assumption(ctx_factory):
@@ -439,26 +439,21 @@ def test_argmax(ctx_factory):
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
-    order = "C"
 
     n = 10000
 
     knl = lp.make_kernel(
             "{[i]: 0<=i<%d}" % n,
-            [
-                "<> result = argmax(i, fabs(a[i]))",
-                "max_idx = result.index",
-                "max_val = result.value",
-                ],
-            [
-                lp.GlobalArg("a", dtype, shape=(n,), order=order),
-                lp.GlobalArg("max_idx", np.int32, shape=(), order=order),
-                lp.GlobalArg("max_val", dtype, shape=(), order=order),
-                ])
+            """
+            max_val, max_idx = argmax(i, fabs(a[i]))
+            """)
+
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+    print(lp.preprocess_kernel(knl))
+    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
 
     a = np.random.randn(10000).astype(dtype)
-    cknl = lp.CompiledKernel(ctx, knl)
-    evt, (max_idx, max_val) = cknl(queue, a=a, out_host=True)
+    evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
     assert max_val == np.max(np.abs(a))
     assert max_idx == np.where(np.abs(a) == max_val)[-1]
 
-- 
GitLab