diff --git a/MEMO b/MEMO
index c08463c3f0e116f39360b8c3e947b5a513e95943..22fa41ca87c7ff4d86d143aec45d6299cc1d087d 100644
--- a/MEMO
+++ b/MEMO
@@ -53,10 +53,11 @@ Things to consider
 
 - Parallel dimension splitting/merging via tags
 
-- Generalize reduction to be over multiplie variables
-
 - Implement get_problems()
 
+- CSE iname duplication might be unnecessary?
+  (don't think so: It might be desired to do a full fetch before a mxm k loop
+  even if that requires going iterative.)
 
 Dealt with
 ^^^^^^^^^^
@@ -65,6 +66,8 @@ Dealt with
 
 - Types of reduction variables?
 
+- Generalize reduction to be over multiple variables
+
 How to represent the schedule
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 7e0474c4d5d5c541aa60b4f9e22354f121598494..d9cedd753b8f4a741a58a43be92c0b4590efed89 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -78,7 +78,7 @@ def split_dimension(kernel, name, inner_length, padded_length=None,
                 .intersect(inner_constraint_set)
                 .project_out(name_dim_type, name_idx, 1))
 
-    new_domain = process_set(kernel.domain) 
+    new_domain = process_set(kernel.domain)
     new_assumptions = process_set(kernel.assumptions)
 
     from pymbolic import var
@@ -272,21 +272,21 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
         # {{{ build the new instruction's iname_to tag
 
         from loopy.symbolic import IndexVariableFinder
-        insn_iname_to_tag = {}
+        new_iname_to_tag = {}
         for old_iname, new_iname in zip(duplicate_inames, new_inames):
-            insn_iname_to_tag[new_iname] = dup_iname_to_tag[old_iname]
+            new_iname_to_tag[new_iname] = dup_iname_to_tag[old_iname]
 
         index_deps = (
-                IndexVariableFinder()(new_outer_expr) 
+                IndexVariableFinder()(new_inner_expr)
                 | set(new_inames))
 
         for iname in index_deps:
-            if iname not in insn_iname_to_tag:
+            if iname not in new_iname_to_tag:
                 # assume generating instruction's view on
                 # inames on which we don't have an opinion.
 
                 if iname in insn.iname_to_tag:
-                    insn_iname_to_tag[iname] = insn.iname_to_tag[iname]
+                    new_iname_to_tag[iname] = insn.iname_to_tag[iname]
 
         # }}}
 
@@ -295,7 +295,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
                 id=kernel.make_unique_instruction_id(based_on=cse_tag),
                 assignee=assignee,
                 expression=new_inner_expr,
-                iname_to_tag=insn_iname_to_tag,
+                iname_to_tag=new_iname_to_tag,
                 )
 
         cse_result_insns.append(new_insn)
@@ -401,9 +401,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
 
 
 
-def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None):
-    dtype = np.dtype(dtype)
-
+def realize_reduction(kernel, loop_iname, reduction_tag=None):
     new_insns = []
     new_temporary_variables = kernel.temporary_variables[:]
 
@@ -415,9 +413,6 @@ def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None):
 
         from pymbolic import var
 
-        from loopy.kernel import REGISTERED_REDUCTION_OPS
-        red_op = REGISTERED_REDUCTION_OPS[expr.operation]
-
         target_var_name = kernel.make_unique_var_name("red_"+loop_iname,
                 extra_used_vars=set(tv.name for tv in new_temporary_variables))
         target_var = var(target_var_name)
@@ -428,35 +423,38 @@ def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None):
         new_temporary_variables.append(
                 TemporaryVariable(
                     name=target_var_name,
-                    dtype=dtype,
+                    dtype=expr.operation.dtype,
                     shape=(),
                     is_local=False))
 
         init_iname_to_tag = insn.iname_to_tag.copy()
+        for iname in expr.inames:
+            del init_iname_to_tag[iname]
 
         init_insn = Instruction(
                 id=kernel.make_unique_instruction_id(
                     extra_used_ids=set(ni.id for ni in new_insns)),
                 assignee=target_var,
-                forced_iname_deps=list(insn.all_inames() - set(expr.iname)),
-                expression=red_op.neutral_element)
+                forced_iname_deps=list(insn.all_inames() - set(expr.inames)),
+                expression=expr.operation.neutral_element,
+                iname_to_tag=init_iname_to_tag)
 
         new_insns.append(init_insn)
 
-        from loopy.kernel import SequentialTag
         reduction_insn = Instruction(
                 id=kernel.make_unique_instruction_id(
                     extra_used_ids=set(ni.id for ni in new_insns)),
                 assignee=target_var,
-                expression=red_op(target_var, sub_expr),
+                expression=expr.operation(target_var, sub_expr),
                 forced_insn_deps=[init_insn.id],
                 use_auto_dependencies=False,
                 forced_iname_deps=list(insn.all_inames()),
-                iname_to_tag={expr.iname: SequentialTag()})
+                iname_to_tag=insn.iname_to_tag)
 
         new_insns.append(reduction_insn)
 
         new_insn_forced_insn_deps.append(reduction_insn.id)
+        new_insn_removed_inames.extend(expr.inames)
 
         return target_var
 
@@ -465,12 +463,20 @@ def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None):
 
     for insn in kernel.instructions:
         new_insn_forced_insn_deps = []
+        new_insn_removed_inames = []
+
+        new_expression = cb_mapper(insn.expression)
+
+        new_iname_to_tag = insn.iname_to_tag.copy()
+        for iname in new_insn_removed_inames:
+            del new_iname_to_tag[iname]
 
         new_insns.append(
                 insn.copy(
-                    expression=cb_mapper(insn.expression),
+                    expression=new_expression,
                     forced_insn_deps=insn.forced_insn_deps
                         + new_insn_forced_insn_deps,
+                    iname_to_tag=new_iname_to_tag,
                     ))
 
     return kernel.copy(
diff --git a/loopy/kernel.py b/loopy/kernel.py
index da04a654505dd13f47bff745d1b13c62b5ee2675..5df71831ab9c9e3cabaa8f8b838a1c96f32fcbef 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -24,7 +24,8 @@ class IndexTag(Record):
 
 
 class SequentialTag(IndexTag):
-    pass
+    def __str__(self):
+        return "seq"
 
 class ParallelTag(IndexTag):
     pass
@@ -232,7 +233,7 @@ class Instruction(Record):
 
         def map_reduction(expr, rec):
             rec(expr.expr)
-            reduction_inames.add(expr.iname)
+            reduction_inames.update(expr.inames)
 
         ReductionCallbackMapper(map_reduction)(expression)
 
@@ -266,18 +267,10 @@ class Instruction(Record):
 
     @memoize_method
     def all_inames(self):
-        from loopy.symbolic import VariableIndexExpressionCollector
-        idx_exprs = (
-                VariableIndexExpressionCollector()(self.expression)
-                | VariableIndexExpressionCollector()(self.assignee))
-        from pymbolic.mapper.dependency import DependencyMapper
-        index_vars = set()
-
-        from pymbolic.primitives import Variable
-        for idx_expr in idx_exprs:
-            for i in DependencyMapper()(idx_expr):
-                if isinstance(i, Variable):
-                    index_vars.add(i.name)
+        from loopy.symbolic import IndexVariableFinder
+        index_vars = (
+                IndexVariableFinder()(self.expression)
+                | IndexVariableFinder()(self.assignee))
 
         return index_vars | set(self.forced_iname_deps)
 
@@ -308,7 +301,7 @@ class Instruction(Record):
 
     def __str__(self):
         loop_descrs = []
-        for iname in self.all_inames():
+        for iname in sorted(self.all_inames()):
             tag = self.iname_to_tag.get(iname)
 
             if tag is None:
@@ -325,33 +318,42 @@ class Instruction(Record):
 
 # {{{ reduction operations
 
-class ReductionOperation:
+class ReductionOperation(object):
     """
     :ivar neutral_element:
+    :ivar dtype:
     """
 
     def __call__(self, operand1, operand2):
         raise NotImplementedError
 
-class SumReductionOperation:
+class TypedReductionOperation(ReductionOperation):
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+    def __str__(self):
+        return (type(self).__name__.replace("ReductionOperation", "").lower()
+                + "_" + str(self.dtype))
+
+class SumReductionOperation(TypedReductionOperation):
     neutral_element = 0
 
     def __call__(self, operand1, operand2):
         return operand1 + operand2
 
-class ProductReductionOperation:
+class ProductReductionOperation(TypedReductionOperation):
     neutral_element = 1
 
     def __call__(self, operand1, operand2):
         return operand1 * operand2
 
-class FloatingPointMaxOperation:
+class FloatingPointMaxOperation(TypedReductionOperation):
     neutral_element = -var("INFINITY")
 
     def __call__(self, operand1, operand2):
         return var("max")(operand1, operand2)
 
-class FloatingPointMaxOperation:
+class FloatingPointMaxOperation(TypedReductionOperation):
     # OpenCL 1.1, section 6.11.2
     neutral_element = -var("INFINITY")
 
@@ -359,7 +361,7 @@ class FloatingPointMaxOperation:
         from pymbolic.primitives import FunctionSymbol
         return FunctionSymbol("max")(operand1, operand2)
 
-class FloatingPointMinOperation:
+class FloatingPointMinOperation(TypedReductionOperation):
     # OpenCL 1.1, section 6.11.2
     neutral_element = var("INFINITY")
 
@@ -370,29 +372,45 @@ class FloatingPointMinOperation:
 
 
 
-REGISTERED_REDUCTION_OPS = {
-        "sum": SumReductionOperation(),
-        "product": ProductReductionOperation(),
-        "fp_max": FloatingPointMaxOperation(),
-        "fp_min": FloatingPointMinOperation(),
+_REDUCTION_OPS = {
+        "sum": SumReductionOperation,
+        "product": ProductReductionOperation,
+        "fpmax": FloatingPointMaxOperation,
+        "fpmin": FloatingPointMinOperation,
         }
 
+_REDUCTION_OP_PARSERS = [
+        ]
 
 
-def register_reduction(prefix, op):
+def register_reduction_parser(parser):
     """Register a new :class:`ReductionOperation`.
 
-    :arg prefix: the desired name of the operation
-    :return: the name actually assigned to the operation,
-        will start with *prefix*.
+    :arg parser: A function that receives a string and returns
+        a subclass of ReductionOperation.
     """
+    _REDUCTION_OP_PARSERS.append(parser)
+
+def parse_reduction_op(name):
+    import re
+    red_op_match = re.match(r"^([a-z]+)_([a-z0-9]+)$", name)
+    if red_op_match:
+        op_name = red_op_match.group(1)
+        op_type = red_op_match.group(2)
+        try:
+            op_dtype = np.dtype(op_type)
+        except TypeError:
+            op_dtype = None
+
+        if op_name in _REDUCTION_OPS and op_dtype is not None:
+            return _REDUCTION_OPS[op_name](op_dtype)
 
-    from loopy.tools import generate_unique_possibilities
+    for parser in _REDUCTION_OP_PARSERS:
+        result = parser(name)
+        if result is not None:
+            return result
 
-    for name in generate_unique_possibilities(prefix):
-        if name not in REGISTERED_REDUCTION_OPS:
-            REGISTERED_REDUCTION_OPS[name] = op
-            return name
+    raise RuntimeError("could not parse reudction operation '%s'" % name)
 
 # }}}
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 69637268c68a4de070d924da134f9e0a8bf3f386..816943f3a3d73dd8c94951b9d33bcccd1a6c1055 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -22,23 +22,29 @@ from islpy import dim_type
 # {{{ loopy-specific primitives
 
 class Reduction(AlgebraicLeaf):
-    def __init__(self, operation, iname, expr, tag=None):
+    def __init__(self, operation, inames, expr, tag=None):
+        assert isinstance(inames, tuple)
+
+        if isinstance(operation, str):
+            from loopy.kernel import parse_reduction_op
+            operation = parse_reduction_op(operation)
+
         self.operation = operation
-        self.iname = iname
+        self.inames = inames
         self.expr = expr
         self.tag = tag
 
     def __getinitargs__(self):
-        return (self.operation, self.iname, self.expr, self.tag)
+        return (self.operation, self.inames, self.expr, self.tag)
 
     def get_hash(self):
-        return hash((self.__class__, self.operation, self.iname,
+        return hash((self.__class__, self.operation, self.inames,
             self.expr, self.tag))
 
     def is_equal(self, other):
         return (other.__class__ == self.__class__
                 and other.operation == self.operation
-                and other.iname == self.iname
+                and other.inames == self.inames
                 and other.expr == self.expr
                 and other.tag == self.tag)
 
@@ -53,7 +59,7 @@ class Reduction(AlgebraicLeaf):
 
 class IdentityMapperMixin(object):
     def map_reduction(self, expr):
-        return Reduction(expr.operation, expr.iname,
+        return Reduction(expr.operation, expr.inames,
                 self.rec(expr.expr), expr.tag)
 
 class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):
@@ -68,8 +74,8 @@ class SubstitutionMapper(SubstitutionMapperBase, IdentityMapperMixin):
 
 class StringifyMapper(StringifyMapperBase):
     def map_reduction(self, expr, prec):
-        return "reduce(%s, %s, %s, tag=%s)" % (
-                expr.operation, expr.iname, expr.expr, expr.tag)
+        return "reduce(%s, [%s], %s, tag=%s)" % (
+                expr.operation, ", ".join(expr.inames), expr.expr, expr.tag)
 
 # }}}
 
@@ -94,10 +100,10 @@ class FunctionToPrimitiveMapper(IdentityMapper):
 
         elif isinstance(expr.function, Variable) and expr.function.name == "reduce":
             if len(expr.parameters) == 3:
-                operation, iname, red_expr = expr.parameters
+                operation, inames, red_expr = expr.parameters
                 tag = None
             elif len(expr.parameters) == 4:
-                operation, iname, red_expr, tag = expr.parameters
+                operation, inames, red_expr, tag = expr.parameters
             else:
                 raise TypeError("reduce takes three or four arguments")
 
@@ -106,15 +112,27 @@ class FunctionToPrimitiveMapper(IdentityMapper):
             if not isinstance(operation, Variable):
                 raise TypeError("operation argument to reduce() must be a symbol")
             operation = operation.name
-            if not isinstance(iname, Variable):
-                raise TypeError("iname argument to reduce() must be a symbol")
-            iname = iname.name
+            if isinstance(inames, Variable):
+                inames = (inames,)
+
+            if not isinstance(inames, (tuple)):
+                raise TypeError("iname argument to reduce() must be a symbol "
+                        "or a tuple of symbols")
+
+            processed_inames = []
+            for iname in inames:
+                if not isinstance(iname, Variable):
+                    raise TypeError("iname argument to reduce() must be a symbol "
+                            "or a tuple or a tuple of symbols")
+
+                processed_inames.append(iname.name)
+
             if tag is not None:
                 if  not isinstance(tag, Variable):
                     raise TypeError("tag argument to reduce() must be a symbol")
                 tag = tag.name
 
-            return Reduction(operation, iname, red_expr, tag)
+            return Reduction(operation, tuple(processed_inames), red_expr, tag)
         else:
             return IdentityMapper.map_call(self, expr)
 
@@ -123,34 +141,18 @@ class FunctionToPrimitiveMapper(IdentityMapper):
 # {{{ reduction loop splitter
 
 class ReductionLoopSplitter(IdentityMapper):
-    def __init__(self, old_iname, outer_iname, inner_iname, do_warn=True):
+    def __init__(self, old_iname, outer_iname, inner_iname):
         self.old_iname = old_iname
         self.outer_iname = outer_iname
         self.inner_iname = inner_iname
-        self.do_warn = do_warn
 
     def map_reduction(self, expr):
-        if expr.iname == self.old_iname:
-            if self.do_warn:
-                from warnings import warn
-                from loopy import LoopyAdvisory
-                warn("The reduction loop for iname '%s' (with tag '%s') "
-                        "was split into two nested reductions with separate "
-                        "state variables. It might be advantageous to "
-                        "'realize' the reduction first before splitting "
-                        "this loop." % (expr.iname, expr.tag), LoopyAdvisory)
-
-            if expr.tag is not None:
-                outer_tag = expr.tag + "_outer"
-                inner_tag = expr.tag + "_inner"
-            else:
-                outer_tag = None
-                inner_tag = None
-
-            return Reduction(expr.operation, self.outer_iname,
-                    Reduction(expr.operation, self.inner_iname,
-                        expr.expr, inner_tag),
-                    outer_tag)
+        if self.old_iname in expr.inames:
+            new_inames = expr.inames[:]
+            new_inames.remove(self.old_iname)
+            new_inames.extend([self.outer_iname, self.inner_iname])
+            return Reduction(expr.operation, new_inames,
+                        expr.expr, expr.tag)
         else:
             return IdentityMapper.map_reduction(self, expr)
 
@@ -469,9 +471,24 @@ class IndexVariableFinder(CombineMapper):
         import operator
         return reduce(operator.or_, values, set())
 
+    def map_constant(self, expr):
+        return set()
+
+    def map_algebraic_leaf(self, expr):
+        return set()
+
     def map_subscript(self, expr):
         from pymbolic.mapper.dependency import DependencyMapper
-        return DependencyMapper()(expr)
+        idx_vars = DependencyMapper()(expr.index)
+
+        from pymbolic.primitives import Variable
+        result = set()
+        for idx_var in idx_vars:
+            if isinstance(idx_var, Variable):
+                result.add(idx_var.name)
+            else:
+                raise RuntimeError("index variable not understood: %s" % idx_var)
+        return result
 
 # }}}
 
diff --git a/test/test_matmul.py b/test/test_matmul.py
index b58edc582575aa2f09f41dcbd5eb6a7dde36f7e3..5e5564a143f15e710e220cbd84a42bfd89505720 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -203,7 +203,7 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
     knl = lp.LoopKernel(ctx.devices[0],
             "[n] -> {[i,j,k]: 0<=i,j,k<n}",
             [
-                "c[i, j] = reduce(sum, k, cse(a[i, k], lhsmat)*cse(b[k, j], rhsmat))"
+                "c[i, j] = reduce(sum_float32, k, cse(a[i, k], lhsmat)*cse(b[k, j], rhsmat))"
                 ],
             [
                 lp.ArrayArg("a", dtype, shape=(n, n), order=order),
@@ -220,7 +220,7 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
     for insn in knl.instructions:
         print insn
     print
-    knl = lp.realize_reduction(knl, "k", dtype)
+    knl = lp.realize_reduction(knl, "k")
     for insn in knl.instructions:
         print insn
     print