diff --git a/loopy/__init__.py b/loopy/__init__.py
index 189aabd4cd738542da2ff4d6e521c3ed708a783c..9c16200f37f18875f2ccfe6546d6e7cfc2ea0f2a 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -22,8 +22,10 @@ class LoopyAdvisory(UserWarning):
 from loopy.kernel import ScalarArg, GlobalArg, ArrayArg, ConstantArg, ImageArg
 
 from loopy.kernel import (AutoFitLocalIndexTag, get_dot_dependency_graph,
-        LoopKernel, Instruction)
+        LoopKernel, Instruction, default_function_mangler, single_arg_function_mangler,
+        default_preamble_generator)
 from loopy.creation import make_kernel
+from loopy.reduction import register_reduction_parser
 from loopy.subst import extract_subst, expand_subst
 from loopy.cse import precompute
 from loopy.preprocess import preprocess_kernel, realize_reduction
@@ -34,7 +36,11 @@ from loopy.check import check_kernels
 
 __all__ = ["ScalarArg", "GlobalArg", "ArrayArg", "ConstantArg", "ImageArg",
         "LoopKernel",
-        "Instruction", "make_kernel",
+        "Instruction",
+        "default_function_mangler", "single_arg_function_mangler",
+        "default_preamble_generator",
+        "make_kernel",
+        "register_reduction_parser",
         "get_dot_dependency_graph",
         "preprocess_kernel", "realize_reduction",
         "generate_loop_schedules",
@@ -64,7 +70,7 @@ def split_dimension(kernel, split_iname, inner_length,
     if split_iname not in kernel.all_inames():
         raise ValueError("cannot split loop for unknown variable '%s'" % split_iname)
 
-    applied_substitutions = kernel.applied_substitutions[:]
+    applied_iname_rewrites = kernel.applied_iname_rewrites[:]
 
     if outer_iname is None:
         outer_iname = split_iname+"_outer"
@@ -109,7 +115,7 @@ def split_dimension(kernel, split_iname, inner_length,
     new_insns = []
     for insn in kernel.instructions:
         subst_map = {var(split_iname): new_loop_index}
-        applied_substitutions.append(subst_map)
+        applied_iname_rewrites.append(subst_map)
 
         from loopy.symbolic import SubstitutionMapper
         subst_mapper = SubstitutionMapper(subst_map.get)
@@ -139,7 +145,7 @@ def split_dimension(kernel, split_iname, inner_length,
             .copy(domain=new_domain,
                 iname_slab_increments=iname_slab_increments,
                 instructions=new_insns,
-                applied_substitutions=applied_substitutions,
+                applied_iname_rewrites=applied_iname_rewrites,
                 ))
 
     return tag_dimensions(result, {outer_iname: outer_tag, inner_iname: inner_tag})
@@ -232,7 +238,7 @@ def join_dimensions(kernel, inames, new_iname=None, tag=AutoFitLocalIndexTag()):
             .map_expressions(subst_map, exclude_instructions=True)
             .copy(
                 instructions=new_insns, domain=new_domain,
-                applied_substitutions=kernel.applied_substitutions + [subst_map]
+                applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_map]
                 ))
 
     return tag_dimensions(result, {new_iname: tag})
@@ -376,7 +382,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
             if len(si) != arg.dimensions:
                 raise ValueError("sweep index '%s' has the wrong number of dimensions")
 
-            for subst_map in kernel.applied_substitutions:
+            for subst_map in kernel.applied_iname_rewrites:
                 from loopy.symbolic import SubstitutionMapper
                 from pymbolic.mapper.substitutor import make_subst_func
                 si = SubstitutionMapper(make_subst_func(subst_map))(si)
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index e4d1f83a18b0d648eae5f121ca476e6a21860cc4..b969d989682b1cdac32e77625f9f73f7facd4e10 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -201,12 +201,16 @@ def generate_code(kernel, with_annotation=False,
         if var.dtype.kind == "c":
             allow_complex = True
 
+    seen_dtypes = set()
+    seen_functions = set()
+
     from loopy.codegen.expression import LoopyCCodeMapper
-    ccm = (LoopyCCodeMapper(kernel, with_annotation=with_annotation,
+    ccm = (LoopyCCodeMapper(kernel, seen_dtypes, seen_functions,
+        with_annotation=with_annotation,
         allow_complex=allow_complex)
         .copy_and_assign_many(make_initial_assignments(kernel)))
 
-    mod = Module()
+    mod = []
 
     body = Block()
 
@@ -220,7 +224,6 @@ def generate_code(kernel, with_annotation=False,
         else:
             return RestrictPointer(arg)
 
-    has_double = False
     has_image = False
 
     from loopy.kernel import GlobalArg, ConstantArg, ImageArg, ScalarArg
@@ -250,16 +253,8 @@ def generate_code(kernel, with_annotation=False,
         else:
             raise ValueError("argument type not understood: '%s'" % type(arg))
 
-        if arg.dtype in [np.float64, np.complex128]:
-            has_double = True
-
         args.append(arg_decl)
 
-    if has_double:
-        mod.extend([
-            Line("#pragma OPENCL EXTENSION cl_khr_fp64: enable"),
-            Line()])
-
     if has_image:
         body.append(Initializer(Const(Value("sampler_t", "loopy_sampler")),
             "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP "
@@ -267,24 +262,6 @@ def generate_code(kernel, with_annotation=False,
 
     # }}}
 
-    # {{{ handle preambles
-
-    seen_preamble_tags = set()
-    dedup_preambles = []
-
-    for tag, preamble in kernel.preambles:
-        if tag in seen_preamble_tags:
-            continue
-
-        seen_preamble_tags.add(tag)
-        dedup_preambles.append(preamble)
-
-    mod.extend(
-            [LiteralLines(lines) for lines in dedup_preambles]
-            +[Line()])
-
-    # }}}
-
     mod.extend([
         LiteralLines(r"""
         #define lid(N) ((int) get_local_id(N))
@@ -357,7 +334,28 @@ def generate_code(kernel, with_annotation=False,
     from loopy.check import check_implemented_domains
     assert check_implemented_domains(kernel, gen_code.implemented_domains)
 
-    return str(mod)
+    # {{{ handle preambles
+
+    preambles = kernel.preambles[:]
+    for prea_gen in kernel.preamble_generators:
+        preambles.extend(prea_gen(seen_dtypes, seen_functions))
+
+    seen_preamble_tags = set()
+    dedup_preambles = []
+
+    for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]):
+        if tag in seen_preamble_tags:
+            continue
+
+        seen_preamble_tags.add(tag)
+        dedup_preambles.append(preamble)
+
+    mod = ([LiteralLines(lines) for lines in dedup_preambles]
+            +[Line()] + mod)
+
+    # }}}
+
+    return str(Module(mod))
 
 # }}}
 
diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py
index d77cdbd2a752d88631bfdb8bb62cc7f94fc1f59e..760962a3d3f1e482105ab97f8bd26207f6ab7ee5 100644
--- a/loopy/codegen/expression.py
+++ b/loopy/codegen/expression.py
@@ -52,19 +52,21 @@ class TypeInferenceMapper(CombineMapper):
 
     def map_call(self, expr):
         from pymbolic.primitives import Variable
-        if isinstance(expr.function, Variable):
-            name = expr.function.name
-            arg_dtypes = tuple(self.rec(par) for par in expr.parameters)
-            for rdg in self.kernel.function_result_dtype_getters:
-                result = rdg(name, arg_dtypes)
-                if result is not None:
-                    return result
 
-            raise RuntimeError("no type inference information on "
-                    "function '%s'" % name)
+        identifier = expr.function
+        if isinstance(identifier, Variable):
+            identifier = identifier.name
 
-        else:
-            return CombineMapper.map_call(self, expr)
+        arg_dtypes = tuple(self.rec(par) for par in expr.parameters)
+
+        for mangler in self.kernel.function_manglers:
+            mangle_result = mangler(identifier, arg_dtypes)
+            if mangle_result is not None:
+                result_dtype, _ = mangle_result
+                return result_dtype
+
+        raise RuntimeError("no type inference information on "
+                "function '%s'" % identifier)
 
     def map_variable(self, expr):
         try:
@@ -96,34 +98,45 @@ class TypeInferenceMapper(CombineMapper):
         return dtype
 
     def map_reduction(self, expr):
-        return expr.operation.dtype(expr.inames)
+        return expr.operation.result_dtype(self.rec(expr.expr), expr.inames)
 
 # }}}
 
 # {{{ C code mapper
 
 class LoopyCCodeMapper(CCodeMapper):
-    def __init__(self, kernel, cse_name_list=[], var_subst_map={},
+    def __init__(self, kernel, seen_dtypes, seen_functions, var_subst_map={},
             with_annotation=False, allow_complex=False):
+        """
+        :arg seen_dtypes: set of dtypes that were encountered
+        :arg seen_functions: set of tuples (name, c_name, arg_types) indicating
+            functions that were encountered.
+        """
 
-        CCodeMapper.__init__(self, cse_name_list=cse_name_list)
+        CCodeMapper.__init__(self)
         self.kernel = kernel
-        self.infer_type = TypeInferenceMapper(kernel)
+        self.seen_dtypes = seen_dtypes
+        self.seen_functions = seen_functions
+
+        self.type_inf_mapper = TypeInferenceMapper(kernel)
         self.allow_complex = allow_complex
 
         self.with_annotation = with_annotation
         self.var_subst_map = var_subst_map.copy()
 
-    def copy(self, var_subst_map=None, cse_name_list=None):
+    def copy(self, var_subst_map=None):
         if var_subst_map is None:
             var_subst_map = self.var_subst_map
-        if cse_name_list is None:
-            cse_name_list = self.cse_name_list
-        return LoopyCCodeMapper(self.kernel,
-                cse_name_list=cse_name_list, var_subst_map=var_subst_map,
+        return LoopyCCodeMapper(self.kernel, self.seen_dtypes, self.seen_functions,
+                var_subst_map=var_subst_map,
                 with_annotation=self.with_annotation,
                 allow_complex=self.allow_complex)
 
+    def infer_type(self, expr):
+        result = self.type_inf_mapper(expr)
+        self.seen_dtypes.add(result)
+        return result
+
     def copy_and_assign(self, name, value):
         """Make a copy of self with variable *name* fixed to *value*."""
         var_subst_map = self.var_subst_map.copy()
@@ -269,6 +282,33 @@ class LoopyCCodeMapper(CCodeMapper):
         else:
             return CCodeMapper.map_constant(self, expr, enclosing_prec)
 
+    def map_call(self, expr, enclosing_prec):
+        from pymbolic.primitives import Variable
+        from pymbolic.mapper.stringifier import PREC_NONE
+
+        identifier = expr.function
+
+        c_name = None
+        if isinstance(identifier, Variable):
+            identifier = identifier.name
+            c_name = identifier
+
+        arg_dtypes = tuple(self.infer_type(par) for par in expr.parameters)
+
+        for mangler in self.kernel.function_manglers:
+            mangle_result = mangler(identifier, arg_dtypes)
+            if mangle_result is not None:
+                result_dtype, c_name = mangle_result
+
+        self.seen_functions.add((identifier, c_name, arg_dtypes))
+
+        if c_name is None:
+            raise RuntimeError("unable to find C name for function identifier '%s'"
+                    % identifier)
+
+        return self.format("%s(%s)",
+                c_name, self.join_rec(", ", expr.parameters, PREC_NONE))
+
     # {{{ deal with complex-valued variables
 
     def complex_type_name(self, dtype):
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 4bd9750b33e0448ee80f420d9d550ae90144153d..ae40d8dbe4b3c565bfbc0e4c001a4651267dc446 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -6,7 +6,8 @@ import numpy as np
 from pytools import Record, memoize_method
 import islpy as isl
 from islpy import dim_type
-from pymbolic import var
+
+import re
 
 
 
@@ -390,241 +391,96 @@ class Instruction(Record):
 
 # }}}
 
-# {{{ reduction operations
-
-class ReductionOperation(object):
-    def dtype(self, inames):
-        raise NotImplementedError
-
-    def get_function_result_dtype_getter(self):
-        """If the reduction declares any functions, return a getter that
-        makes their return types known to type inference.
-        """
-        return None
-
-    def get_preambles(self, inames):
-        return []
-
-    def neutral_element(self, inames):
-        raise NotImplementedError
-
-    def __call__(self, operand1, operand2):
-        raise NotImplementedError
-
-class ScalarReductionOperation(ReductionOperation):
-    def __init__(self, dtype):
-        self.scalar_dtype = dtype
-
-    def dtype(self, inames):
-        return self.scalar_dtype
-
-    def __str__(self):
-        return (type(self).__name__.replace("ReductionOperation", "").lower()
-                + "_" + str(self.dtype))
-
-class SumReductionOperation(ScalarReductionOperation):
-    def neutral_element(self, inames):
-        return 0
-
-    def __call__(self, operand1, operand2, index):
-        return operand1 + operand2
-
-class ProductReductionOperation(ScalarReductionOperation):
-    def neutral_element(self, inames):
-        return 1
-
-    def __call__(self, operand1, operand2, index):
-        return operand1 * operand2
-
-def get_le_neutral(dtype):
-    """Return a number y that satisfies (x <= y) for all y."""
-
-    if dtype.kind == "f":
-        # OpenCL 1.1, section 6.11.2
-        return var("INFINITY")
-    else:
-        raise NotImplementedError("less")
-
-class MaxReductionOperation(ScalarReductionOperation):
-    def neutral_element(self, inames):
-        return get_le_neutral(self.dtype)
-
-    def __call__(self, operand1, operand2, index):
-        return var("max")(operand1, operand2)
-
-class MinReductionOperation(ScalarReductionOperation):
-    @property
-    def neutral_element(self, inames):
-        return -get_le_neutral(self.dtype)
-
-    def __call__(self, operand1, operand2, index):
-        return var("min")(operand1, operand2)
-
-
-
-
-class _ArgExtremumReductionOperation(ReductionOperation):
-    def __init__(self, dtype):
-        self.scalar_dtype = dtype
-
-        self.struct_dtype = np.dtype(
-                [("value", self.scalar_dtype),
-                ("index", np.int32)])
+# {{{ expand defines
 
-        self.prefix = "loopy_arg%s_%s" % (self.which, self.scalar_dtype.type.__name__)
-        self.type_name = self.prefix + "_result"
-        from pyopencl.tools import register_dtype
-        register_dtype(self.struct_dtype, self.type_name, alias_ok=True)
+MACRO_RE = re.compile(r"\{([a-zA-Z0-9_]+)\}")
 
-    def dtype(self, inames):
-        return self.struct_dtype
+def expand_defines(insn, defines):
+    macros = set(match.group(1) for match in MACRO_RE.finditer(insn))
 
-    def get_function_result_dtype_getter(self):
-        names = [self.prefix+"_init", self.prefix+"_update"]
-
-        def getter(name, arg_dtypes):
-            if name in names:
-                return self.struct_dtype
-
-            return None
-
-        return getter
-
-    def get_preambles(self, inames):
-        """Returns a tuple (preamble_key, preamble), where *preamble* is a string
-        that goes into the kernel preamble, and *preamble_key* is a unique
-        identifier. No two preambles with the same key will be emitted.
-        """
-
-        if len(inames) != 1:
-            raise RuntimeError("arg%s must be used with exactly one iname"
-                    % self.which)
-
-        from pyopencl.tools import dtype_to_ctype
-        from pymbolic.mapper.c_code import CCodeMapper
-
-        c_code_mapper = CCodeMapper()
-
-        return [(self.prefix, """
-        typedef struct {
-            %(scalar_type)s value;
-            int index;
-        } %(type_name)s;
-
-        inline %(type_name)s %(prefix)s_init()
-        {
-            %(type_name)s result;
-            result.value = %(neutral)s;
-            result.index = INT_MIN;
-            return result;
-        }
-
-        inline %(type_name)s %(prefix)s_update(
-            %(type_name)s state, %(scalar_type)s op2, int index)
-        {
-            %(type_name)s result;
-            if (op2 %(comp)s state.value)
-            {
-                result.value = op2;
-                result.index = index;
-                return result;
-            }
-            else return state;
-        }
-        """ % dict(
-                type_name=self.type_name,
-                scalar_type=dtype_to_ctype(self.scalar_dtype),
-                prefix=self.prefix,
-                neutral=c_code_mapper(
-                    self.neutral_sign*get_le_neutral(self.scalar_dtype)),
-                comp=self.update_comparison,
-                ))]
-
-    def neutral_element(self, inames):
-        return var(self.prefix+"_init")()
-
-    def __call__(self, operand1, operand2, inames):
-        iname, = inames
-
-        return var(self.prefix+"_update")(
-                operand1, operand2, var(iname))
+    replacements = [()]
+    for mac in macros:
+        value = defines[mac]
+        if isinstance(value, list):
+            replacements = [
+                    rep+(("{%s}" % mac, subval),)
+                    for rep in replacements
+                    for subval in value
+                    ]
+        else:
+            replacements = [
+                    rep+(("{%s}" % mac, value),)
+                    for rep in replacements]
 
-class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
-    which = "max"
-    update_comparison = ">="
-    neutral_sign = -1
+    for rep in replacements:
+        rep_value = insn
+        for name, val in rep:
+            rep_value = rep_value.replace(name, str(val))
 
-class ArgMinReductionOperation(_ArgExtremumReductionOperation):
-    which = "min"
-    update_comparison = "<="
-    neutral_sign = +1
+        yield rep_value
 
+# }}}
 
+# {{{ function manglers
 
+def default_function_mangler(name, arg_dtypes):
+    from loopy.reduction import reduction_function_mangler
 
+    manglers = [reduction_function_mangler]
+    for mangler in manglers:
+        result = mangler(name, arg_dtypes)
+        if result is not None:
+            return result
 
+    return None
 
-_REDUCTION_OPS = {
-        "sum": SumReductionOperation,
-        "product": ProductReductionOperation,
-        "max": MaxReductionOperation,
-        "min": MinReductionOperation,
-        "argmax": ArgMaxReductionOperation,
-        "argmin": ArgMinReductionOperation,
-        }
+def single_arg_function_mangler(name, arg_dtypes):
+    if len(arg_dtypes) == 1:
+        dtype, = arg_dtypes
+        return dtype, name
 
-_REDUCTION_OP_PARSERS = [
-        ]
+    return None
 
+# }}}
 
-def register_reduction_parser(parser):
-    """Register a new :class:`ReductionOperation`.
+# {{{ preamble generators
 
-    :arg parser: A function that receives a string and returns
-        a subclass of ReductionOperation.
-    """
-    _REDUCTION_OP_PARSERS.append(parser)
+def default_preamble_generator(seen_dtypes, seen_functions):
+    from loopy.reduction import reduction_preamble_generator
 
-def parse_reduction_op(name):
-    import re
-    red_op_match = re.match(r"^([a-z]+)_([a-z0-9_]+)$", name)
-    if red_op_match:
-        op_name = red_op_match.group(1)
-        op_type = red_op_match.group(2)
+    for result in reduction_preamble_generator(seen_dtypes, seen_functions):
+        yield result
 
-        try:
-            op_dtype = np.dtype(op_type)
-        except TypeError:
-            op_dtype = None
+    has_double = False
+    has_complex = False
 
-        if op_dtype is None and op_type.startswith("vec_"):
-            import pyopencl.array as cl_array
-            try:
-                op_dtype = getattr(cl_array.vec, op_type[4:])
-            except AttributeError:
-                op_dtype = None
+    for dtype in seen_dtypes:
+        if dtype in [np.float64, np.complex128]:
+            has_double = True
+        if dtype.kind == "c":
+            has_complex = True
 
-        if op_name in _REDUCTION_OPS and op_dtype is not None:
-            return _REDUCTION_OPS[op_name](op_dtype)
+    if has_double:
+        yield ("00_enable_double", """
+            #pragma OPENCL EXTENSION cl_khr_fp64: enable
+            """)
 
-    for parser in _REDUCTION_OP_PARSERS:
-        result = parser(name)
-        if result is not None:
-            return result
+    if has_complex:
+        if has_double:
+            yield ("10_include_complex_header", """
+                #define PYOPENCL_DEFINE_CDOUBLE
 
-    return None
+                #include <pyopencl-complex.h>
+                """)
+        else:
+            yield ("10_include_complex_header", """
+                #include <pyopencl-complex.h>
+                """)
 
 # }}}
 
 # {{{ loop kernel object
 
-def _default_get_function_result_dtype(name, arg_dtypes):
-    if len(arg_dtypes) == 1:
-        dtype, = arg_dtypes
-        return dtype
-
-    return None
-
 class LoopKernel(Record):
     """
     :ivar device: :class:`pyopencl.Device`
@@ -635,27 +491,39 @@ class LoopKernel(Record):
     :ivar name:
     :ivar preambles: a list of (tag, code) tuples that identify preamble snippets.
         Each tag's snippet is only included once, at its first occurrence.
+        The preambles will be inserted in order of their tags.
+    :ivar preamble_generators: a list of functions of signature
+        (seen_dtypes, seen_functions) where seen_functions is a set of
+        (name, c_name, arg_dtypes), generating extra entries for `preambles`.
     :ivar assumptions: the initial implemented_domain, captures assumptions
         on the parameters. (an isl.Set)
-    :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr,
-        upper_incr) tuples that will be separated out in the execution to generate
-        'bulk' slabs with fewer conditionals.
-    :ivar temporary_variables:
-    :ivar iname_to_tag:
     :ivar local_sizes: A dictionary from integers to integers, mapping
-        workgroup axes to ther sizes, e.g. *{0: 16}* forces axis 0 to be
+        workgroup axes to their sizes, e.g. *{0: 16}* forces axis 0 to be
         length 16.
+    :ivar temporary_variables:
+    :ivar iname_to_tag:
     :ivar substitutions: a mapping from substitution names to :class:`SubstitutionRule`
         objects
-    :ivar lowest_priority_inames:
-    :ivar breakable_inames: these inames' loops may be broken up by the scheduler
-    :ivar applied_substitutions: A list of past substitution dictionaries that
+    :ivar function_manglers: list of functions of signature (name, arg_dtypes)
+        returning a tuple (result_dtype, function_name), where the function_name
+        is the C-level function to be called.
+    :ivar defines: a dictionary of replacements to be made in instructions given
+        as strings before parsing. A macro instance intended to be replaced should 
+        look like "{MACRO}" in the instruction code. The expansion given in this
+        parameter is allowed to be a list. In this case, instructions are generated
+        for *each* combination of macro values.
+
+    The following arguments are not user-facing:
+
+    :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr,
+        upper_incr) tuples that will be separated out in the execution to generate
+        'bulk' slabs with fewer conditionals.
+    :ivar applied_iname_rewrites: A list of past substitution dictionaries that
         were applied to the kernel. These are stored so that they may be repeated
         on expressions the user specifies later.
-    :ivar function_result_dtype_getters: list of functions of signature (name, arg_dtypes)
-        returning the result dtype of that function.
-
     :ivar cache_manager:
+    :ivar lowest_priority_inames:
+    :ivar breakable_inames: these inames' loops may be broken up by the scheduler
 
     The following instance variables are only used until :func:`loopy.make_kernel` is
     finished:
@@ -665,14 +533,22 @@ class LoopKernel(Record):
 
     def __init__(self, device, domain, instructions, args=None, schedule=None,
             name="loopy_kernel",
-            preambles=[], assumptions=None,
-            iname_slab_increments={},
-            temporary_variables={},
+            preambles=[], 
+            preamble_generators=[default_preamble_generator],
+            assumptions=None,
             local_sizes={},
-            iname_to_tag={}, iname_to_tag_requests=None, substitutions={},
-            cache_manager=None, lowest_priority_inames=[], breakable_inames=set(),
-            applied_substitutions=[],
-            function_result_dtype_getters=[_default_get_function_result_dtype]):
+            temporary_variables={},
+            iname_to_tag={},
+            substitutions={},
+            function_manglers=[default_function_mangler, single_arg_function_mangler],
+            defines={},
+
+            # non-user-facing
+            iname_slab_increments={},
+            applied_iname_rewrites=[],
+            cache_manager=None,
+            iname_to_tag_requests=None, 
+            lowest_priority_inames=[], breakable_inames=set()):
         """
         :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl.
             Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}"
@@ -734,20 +610,7 @@ class LoopKernel(Record):
 
         # {{{ instruction parser
 
-        def parse_if_necessary(insn):
-            from loopy.symbolic import parse
-
-            if isinstance(insn, Instruction):
-                if insn.id is None:
-                    insn = insn.copy(id=self.make_unique_instruction_id(insns))
-                insns.append(insn)
-                return
-
-            if not isinstance(insn, str):
-                raise TypeError("Instructions must be either an Instruction "
-                        "instance or a parseable string. got '%s' instead."
-                        % type(insn))
-
+        def parse_insn(insn):
             insn_match = INSN_RE.match(insn)
             subst_match = SUBST_RE.match(insn)
             if insn_match is not None and subst_match is not None:
@@ -760,6 +623,7 @@ class LoopKernel(Record):
             else:
                 raise RuntimeError("insn parse error")
 
+            from loopy.symbolic import parse
             lhs = parse(groups["lhs"])
             rhs = parse(groups["rhs"])
 
@@ -835,6 +699,22 @@ class LoopKernel(Record):
                         arguments=arg_names,
                         expression=rhs)
 
+        def parse_if_necessary(insn):
+            if isinstance(insn, Instruction):
+                if insn.id is None:
+                    insn = insn.copy(id=self.make_unique_instruction_id(insns))
+                insns.append(insn)
+                return
+
+            if not isinstance(insn, str):
+                raise TypeError("Instructions must be either an Instruction "
+                        "instance or a parseable string. got '%s' instead."
+                        % type(insn))
+
+            for insn in expand_defines(insn, defines):
+                parse_insn(insn)
+
+
         # }}}
 
         insns = []
@@ -866,6 +746,7 @@ class LoopKernel(Record):
                 schedule=schedule,
                 name=name,
                 preambles=preambles,
+                preamble_generators=preamble_generators,
                 assumptions=assumptions,
                 iname_slab_increments=iname_slab_increments,
                 temporary_variables=temporary_variables,
@@ -876,8 +757,8 @@ class LoopKernel(Record):
                 cache_manager=cache_manager,
                 lowest_priority_inames=lowest_priority_inames,
                 breakable_inames=breakable_inames,
-                applied_substitutions=applied_substitutions,
-                function_result_dtype_getters=function_result_dtype_getters)
+                applied_iname_rewrites=applied_iname_rewrites,
+                function_manglers=function_manglers)
 
     def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()):
         if insns is None:
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 82a67450efe2ca558d5e7c103dd4fe3139553e9a..c3755f1949c3ac60fe42094ffd61d8e056d8b8d0 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -6,85 +6,103 @@ import pyopencl.characterize as cl_char
 
 
 
-# {{{ gather dtype getters from reduction operations
+# {{{ infer types
 
-def gather_dtype_getters(kernel):
-    dtype_getters = kernel.function_result_dtype_getters
+def infer_temp_var_type(kernel, tv, type_inf_mapper, debug):
+    dtypes = []
 
-    def gather_from_reduction(expr, rec):
-        red_getter = expr.operation.get_function_result_dtype_getter()
-        if red_getter is not None:
-            dtype_getters.append(red_getter)
+    writers = kernel.writer_map()[tv.name]
+    exprs = [kernel.id_to_insn[w].expression for w in writers]
 
-        rec(expr.expr)
+    from loopy.codegen.expression import DependencyTypeInferenceFailure
+    for expr in exprs:
+        try:
+            if debug:
+                print "             via expr", expr
+            result = type_inf_mapper(expr)
 
-    from loopy.symbolic import ReductionCallbackMapper
-    rcm = ReductionCallbackMapper(gather_from_reduction)
-    for insn in kernel.instructions:
-        rcm(insn.expression)
+            if debug:
+                print "             result", result
 
-    return kernel.copy(function_result_dtype_getters=dtype_getters)
+            dtypes.append(result)
 
-# }}}
+        except DependencyTypeInferenceFailure:
+            if debug:
+                print "             failed"
+
+    if not dtypes:
+        return None
 
-# {{{ infer types of temporaries
+    from pytools import is_single_valued
+    if not is_single_valued(dtypes):
+        raise RuntimeError("ambiguous type inference for '%s'"
+                % tv.name)
 
-def infer_types_of_temporaries(kernel):
-    new_temp_vars = {}
+    return dtypes[0]
+
+def infer_types(kernel):
+    """Infer types on temporaries and reductions."""
+
+    new_temp_vars = kernel.temporary_variables.copy()
 
+    # {{{ fill queue
+
+    # queue contains temporary variables and reductions
     queue = []
 
     from loopy import infer_type
     for tv in kernel.temporary_variables.itervalues():
         if tv.dtype is infer_type:
             queue.append(tv)
-            new_temp_vars[tv.name] = tv
-        else:
-            new_temp_vars[tv.name] = tv
 
-    from loopy.codegen.expression import (
-            TypeInferenceMapper, DependencyTypeInferenceFailure)
-    tim = TypeInferenceMapper(kernel, new_temp_vars)
+    # }}}
+
+    from loopy.codegen.expression import TypeInferenceMapper
+    type_inf_mapper = TypeInferenceMapper(kernel, new_temp_vars)
+
+    # {{{ work on type inference queue
+
+    from loopy.kernel import TemporaryVariable
+
+    debug = 0
 
     first_failure = None
     while queue:
-        tv = queue.pop(0)
+        item = queue.pop(0)
 
-        dtypes = []
+        if isinstance(item, TemporaryVariable):
+            if debug:
+                print "inferring type for tempvar", item.name
 
-        writers = kernel.writer_map()[tv.name]
-        exprs = [kernel.id_to_insn[w].expression for w in writers]
+            result = infer_temp_var_type(kernel, item, type_inf_mapper, debug)
 
-        for expr in exprs:
-            try:
-                dtypes.append(tim(expr))
-            except DependencyTypeInferenceFailure:
-                pass
+            failed = result is None
+            if not failed:
+                if debug:
+                    print "     success", result
+                new_temp_vars[item.name] = item.copy(dtype=result)
+            else:
+                if debug:
+                    print "     failure", result
+        else:
+            raise RuntimeError("unexpected item type in type inference")
 
-        if not dtypes:
-            if tv is first_failure:
-                # this has failed before, give up.
-                raise RuntimeError("could not determine type of '%s' from expression(s) '%s'"
-                        % (tv.name,
-                            ", ".join(str(e) for e in exprs)))
+        if failed:
+            if item is first_failure:
+                # this item has failed before, give up.
+                raise RuntimeError("could not determine type of '%s'" % item)
 
             if first_failure is None:
                 # remember the first failure for this round through the queue
-                first_failure = tv
+                first_failure = item
 
             # can't infer type yet, put back into queue
-            queue.append(tv)
-            continue
-
-        from pytools import is_single_valued
-        if not is_single_valued(dtypes):
-            raise RuntimeError("ambiguous type inference for '%s'"
-                    % tv.name)
-
-        new_temp_vars[tv.name] = tv.copy(dtype=dtypes[0])
+            queue.append(item)
+        else:
+            # we've made progress, reset failure marker
+            first_failure = None
 
-        # we've made progress, reset failure flag.
-        first_failure = None
+    # }}}
 
     return kernel.copy(temporary_variables=new_temp_vars)
 
@@ -209,10 +227,12 @@ def realize_reduction(kernel, insn_id_filter=None):
 
     new_insns = []
     new_temporary_variables = kernel.temporary_variables.copy()
-    new_preambles = kernel.preambles
 
     from loopy.kernel import IlpBaseTag
 
+    from loopy.codegen.expression import TypeInferenceMapper
+    type_inf_mapper = TypeInferenceMapper(kernel)
+
     def map_reduction(expr, rec):
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
@@ -239,8 +259,6 @@ def realize_reduction(kernel, insn_id_filter=None):
 
         # }}}
 
-        new_preambles.extend(expr.operation.get_preambles(expr.inames))
-
         from pymbolic import var
 
         target_var_name = kernel.make_unique_var_name("acc_"+"_".join(expr.inames),
@@ -251,12 +269,14 @@ def realize_reduction(kernel, insn_id_filter=None):
             target_var = target_var[
                     tuple(var(ilp_iname) for ilp_iname in ilp_inames)]
 
+        arg_dtype = type_inf_mapper(expr.expr)
+
         from loopy.kernel import Instruction
 
         from loopy.kernel import TemporaryVariable
         new_temporary_variables[target_var_name] = TemporaryVariable(
                 name=target_var_name,
-                dtype=expr.operation.dtype(expr.inames),
+                dtype=expr.operation.result_dtype(arg_dtype, expr.inames),
                 shape=tuple(ilp_iname_lengths),
                 is_local=False)
 
@@ -268,7 +288,7 @@ def realize_reduction(kernel, insn_id_filter=None):
                 id=new_id,
                 assignee=target_var,
                 forced_iname_deps=temp_kernel.insn_inames(insn) - set(expr.inames),
-                expression=expr.operation.neutral_element(expr.inames))
+                expression=expr.operation.neutral_element(arg_dtype, expr.inames))
 
         generated_insns.append(init_insn)
 
@@ -279,7 +299,7 @@ def realize_reduction(kernel, insn_id_filter=None):
         reduction_insn = Instruction(
                 id=new_id,
                 assignee=target_var,
-                expression=expr.operation(target_var, expr.expr, expr.inames),
+                expression=expr.operation(arg_dtype, target_var, expr.expr, expr.inames),
                 insn_deps=set([init_insn.id]) | insn.insn_deps,
                 forced_iname_deps=temp_kernel.insn_inames(insn) | set(expr.inames))
 
@@ -783,18 +803,19 @@ def adjust_local_temp_var_storage(kernel):
 
 
 def preprocess_kernel(kernel):
-    kernel = gather_dtype_getters(kernel)
 
-    # all type inference must happen *after* this point (because only then all
-    # the functions return dtype getters are available.)
 
-    kernel = infer_types_of_temporaries(kernel)
+    kernel = infer_types(kernel)
 
-    from loopy.subst import expand_subst
-    kernel = expand_subst(kernel)
+    # Ordering restriction:
+    # realize_reduction must happen after type inference because it needs
+    # to be able to determine the types of the reduced expressions.
 
     kernel = realize_reduction(kernel)
 
+    from loopy.subst import expand_subst
+    kernel = expand_subst(kernel)
+
     # Ordering restriction:
     # Must realize reductions before realizing ILP, because realize_ilp()
     # gets rid of ILP tags, but realize_reduction() needs them to do
diff --git a/loopy/reduction.py b/loopy/reduction.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b687cfc811b01033db07aa1f73215a10fd2eb63
--- /dev/null
+++ b/loopy/reduction.py
@@ -0,0 +1,249 @@
+from pymbolic import var
+import numpy as np
+
+from loopy.symbolic import FunctionIdentifier
+
+
+
+
+class ReductionOperation(object):
+    def result_dtype(self, arg_dtype, inames):
+        raise NotImplementedError
+
+    def neutral_element(self, dtype, inames):
+        raise NotImplementedError
+
+    def __call__(self, dtype, operand1, operand2, inames):
+        raise NotImplementedError
+
+class ScalarReductionOperation(ReductionOperation):
+    def __init__(self, forced_result_dtype=None):
+        """
+        :arg forced_result_dtype: Force the reduction result to be of this type.
+        """
+        self.forced_result_dtype = forced_result_dtype
+
+    def result_dtype(self, arg_dtype, inames):
+        if self.forced_result_dtype is not None:
+            return self.forced_result_dtype
+
+        return arg_dtype
+
+    def __str__(self):
+        result = type(self).__name__.replace("ReductionOperation", "").lower()
+
+        if self.forced_result_dtype is not None:
+            result = "%s<%s>" % (result, str(self.dtype))
+
+        return result
+
+class SumReductionOperation(ScalarReductionOperation):
+    def neutral_element(self, dtype, inames):
+        return 0
+
+    def __call__(self, dtype, operand1, operand2, inames):
+        return operand1 + operand2
+
+class ProductReductionOperation(ScalarReductionOperation):
+    def neutral_element(self, dtype, inames):
+        return 1
+
+    def __call__(self, dtype, operand1, operand2, inames):
+        return operand1 * operand2
+
+def get_le_neutral(dtype):
+    """Return a number y that satisfies (x <= y) for all y."""
+
+    if dtype.kind == "f":
+        # OpenCL 1.1, section 6.11.2
+        return var("INFINITY")
+    else:
+        raise NotImplementedError("less")
+
+class MaxReductionOperation(ScalarReductionOperation):
+    def neutral_element(self, dtype, inames):
+        return get_le_neutral(dtype)
+
+    def __call__(self, dtype, operand1, operand2, inames):
+        return var("max")(operand1, operand2)
+
+class MinReductionOperation(ScalarReductionOperation):
+    @property
+    def neutral_element(self, dtype, inames):
+        return -get_le_neutral(dtype)
+
+    def __call__(self, dtype, operand1, operand2, inames):
+        return var("min")(operand1, operand2)
+
+
+
+
+# {{{ argmin/argmax
+
+ARGEXT_STRUCT_DTYPES = {}
+
+class _ArgExtremumReductionOperation(ReductionOperation):
+    def prefix(self, dtype):
+        return "loopy_arg%s_%s" % (self.which, dtype.type.__name__)
+
+    def result_dtype(self, dtype, inames):
+        try:
+            return ARGEXT_STRUCT_DTYPES[dtype]
+        except KeyError:
+            struct_dtype = np.dtype([("value", dtype), ("index", np.int32)])
+            ARGEXT_STRUCT_DTYPES[dtype] = struct_dtype
+
+            from pyopencl.tools import register_dtype
+            register_dtype(struct_dtype, self.prefix(dtype)+"_result", alias_ok=True)
+            return struct_dtype
+
+    def neutral_element(self, dtype, inames):
+        return ArgExtFunction(self, dtype, "init", inames)()
+
+    def __call__(self, dtype, operand1, operand2, inames):
+        iname, = inames
+
+        return ArgExtFunction(self, dtype, "update", inames)(
+                operand1, operand2, var(iname))
+
+class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
+    which = "max"
+    update_comparison = ">="
+    neutral_sign = -1
+
+class ArgMinReductionOperation(_ArgExtremumReductionOperation):
+    which = "min"
+    update_comparison = "<="
+    neutral_sign = +1
+
+class ArgExtFunction(FunctionIdentifier):
+    def __init__(self, reduction_op, scalar_dtype, name, inames):
+        self.reduction_op = reduction_op
+        self.scalar_dtype = scalar_dtype
+        self.name = name
+        self.inames = inames
+
+def get_argext_preamble(func_id):
+    op = func_id.reduction_op
+    prefix = op.prefix(func_id.scalar_dtype)
+
+    from pyopencl.tools import dtype_to_ctype
+    from pymbolic.mapper.c_code import CCodeMapper
+
+    c_code_mapper = CCodeMapper()
+
+    return (prefix, """
+    typedef struct {
+        %(scalar_type)s value;
+        int index;
+    } %(type_name)s;
+
+    inline %(type_name)s %(prefix)s_init()
+    {
+        %(type_name)s result;
+        result.value = %(neutral)s;
+        result.index = INT_MIN;
+        return result;
+    }
+
+    inline %(type_name)s %(prefix)s_update(
+        %(type_name)s state, %(scalar_type)s op2, int index)
+    {
+        %(type_name)s result;
+        if (op2 %(comp)s state.value)
+        {
+            result.value = op2;
+            result.index = index;
+            return result;
+        }
+        else return state;
+    }
+    """ % dict(
+            type_name=prefix+"_result",
+            scalar_type=dtype_to_ctype(func_id.scalar_dtype),
+            prefix=prefix,
+            neutral=c_code_mapper(
+                op.neutral_sign*get_le_neutral(func_id.scalar_dtype)),
+            comp=op.update_comparison,
+            ))
+
+# }}
+
+
+
+
+# {{{ reduction op registry
+
+_REDUCTION_OPS = {
+        "sum": SumReductionOperation,
+        "product": ProductReductionOperation,
+        "max": MaxReductionOperation,
+        "min": MinReductionOperation,
+        "argmax": ArgMaxReductionOperation,
+        "argmin": ArgMinReductionOperation,
+        }
+
+_REDUCTION_OP_PARSERS = [
+        ]
+
+
+def register_reduction_parser(parser):
+    """Register a new :class:`ReductionOperation`.
+
+    :arg parser: A function that receives a string and returns
+        a subclass of ReductionOperation.
+    """
+    _REDUCTION_OP_PARSERS.append(parser)
+
+def parse_reduction_op(name):
+    import re
+
+    red_op_match = re.match(r"^([a-z]+)_([a-z0-9_]+)$", name)
+    if red_op_match:
+        op_name = red_op_match.group(1)
+        op_type = red_op_match.group(2)
+
+        try:
+            op_dtype = np.dtype(op_type)
+        except TypeError:
+            op_dtype = None
+
+        if op_dtype is None and op_type.startswith("vec_"):
+            import pyopencl.array as cl_array
+            try:
+                op_dtype = getattr(cl_array.vec, op_type[4:])
+            except AttributeError:
+                op_dtype = None
+
+        if op_name in _REDUCTION_OPS and op_dtype is not None:
+            return _REDUCTION_OPS[op_name](op_dtype)
+
+    if name in _REDUCTION_OPS:
+        return _REDUCTION_OPS[name]()
+
+    for parser in _REDUCTION_OP_PARSERS:
+        result = parser(name)
+        if result is not None:
+            return result
+
+    return None
+
+# }}}
+
+
+
+
+def reduction_function_mangler(func_id, arg_dtypes):
+    if isinstance(func_id, ArgExtFunction):
+        op = func_id.reduction_op
+        return (op.result_dtype(func_id.scalar_dtype, func_id.inames),
+                "%s_%s" % (op.prefix(func_id.scalar_dtype), func_id.name))
+
+    return None
+
+def reduction_preamble_generator(seen_dtypes, seen_functions):
+    for func_id, c_name, arg_dtypes in seen_functions:
+        if isinstance(func_id, ArgExtFunction):
+            yield get_argext_preamble(func_id)
+
+# vim: fdm=marker
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index d2029c4628a697ab899cec314e2679acf4511f91..d91b94c001863ff0ffa21b58979ef52257e6cb41 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -5,7 +5,7 @@ from __future__ import division
 from pytools import memoize, memoize_method
 
 from pymbolic.primitives import (
-        AlgebraicLeaf, Variable as VariableBase,
+        Leaf, AlgebraicLeaf, Variable as VariableBase,
         CommonSubexpression)
 
 from pymbolic.mapper import (
@@ -32,6 +32,12 @@ from islpy import dim_type
 
 # {{{ loopy-specific primitives
 
+class FunctionIdentifier(Leaf):
+    def __getinitargs__(self):
+        return ()
+
+    mapper_method = intern("map_loopy_function_identifier")
+
 class TypedCSE(CommonSubexpression):
     def __init__(self, child, prefix=None, dtype=None):
         CommonSubexpression.__init__(self, child, prefix)
@@ -45,6 +51,11 @@ class TypedCSE(CommonSubexpression):
 
 
 class TaggedVariable(VariableBase):
+    """This is an identifier with a tag, such as 'matrix$one', where
+    'one' identifies this specific use of the identifier. This mechanism
+    may then be used to address these uses--such as by prefetching only 
+    accesses tagged a certain way.
+    """
     def __init__(self, name, tag):
         VariableBase.__init__(self, name)
         self.tag = tag
@@ -62,7 +73,7 @@ class Reduction(AlgebraicLeaf):
         assert isinstance(inames, tuple)
 
         if isinstance(operation, str):
-            from loopy.kernel import parse_reduction_op
+            from loopy.reduction import parse_reduction_op
             operation = parse_reduction_op(operation)
 
         self.operation = operation
@@ -109,6 +120,9 @@ class IdentityMapperMixin(object):
         # leaf, doesn't change
         return expr
 
+    def map_loopy_function_identifier(self, expr):
+        return expr
+
 class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):
     pass
 
@@ -143,6 +157,9 @@ class DependencyMapper(DependencyMapperBase):
     def map_tagged_variable(self, expr):
         return set([expr])
 
+    def map_loopy_function_identifier(self, expr):
+        return set()
+
 class UnidirectionalUnifier(UnidirectionalUnifierBase):
     def map_reduction(self, expr, other, unis):
         if not isinstance(other, type(expr)):
@@ -224,7 +241,7 @@ class FunctionToPrimitiveMapper(IdentityMapper):
         else:
             # see if 'name' is an existing reduction op
 
-            from loopy.kernel import parse_reduction_op
+            from loopy.reduction import parse_reduction_op
             if parse_reduction_op(name):
                 if len(expr.parameters) != 2:
                     raise RuntimeError("invalid invocation of "
diff --git a/test/test_loopy.py b/test/test_loopy.py
index b799e88036a4e10e9af89a330777b11448f8737e..a40a157f48f85f5c10a7fee456df1be8335a2549 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -161,7 +161,7 @@ def test_argmax(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "{[i]: 0<=i<%d}" % n,
             [
-                "<> result = argmax_float32(i, fabs(a[i]))",
+                "<> result = argmax(i, fabs(a[i]))",
                 "max_idx = result.index",
                 "max_val = result.value",
                 ],