diff --git a/doc/misc.rst b/doc/misc.rst
index 2f8fac090455eea1be87c3b4eae7bfd72eba24ee..89279d9744d2f1cf4f080618d0d2b2e0f078a723 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -5,7 +5,9 @@ Installation
 
 This command should install :mod:`loopy`::
 
-    pip install loopy
+    pip install loo.py
+
+(Note the extra "."!)
 
 You may need to run this with :command:`sudo`.
 If you don't already have `pip <https://pypi.python.org/pypi/pip>`_,
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index ff27d84f55ea1b8b4222f18ba271723f450b1a42..33d40385b529f72e54da65238304e87bdb2cddab 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -270,9 +270,10 @@ Expressions
 Loopy's expressions are a slight superset of the expressions supported by
 :mod:`pymbolic`.
 
-* `if`
-* `reductions`
+* ``if``
+* ``reductions``
     * duplication of reduction inames
+    * ``reduce`` vs ``simul_reduce``
 * complex-valued arithmetic
 * tagging of array access and substitution rule use ("$")
 * ``indexof``, ``indexof_vec``
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 4421fd80f35cffdeebf37dbb93f668fc7bf48aa4..4275474d9ce74e04aa3c7ae69356f0672db2128c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -96,6 +96,7 @@ always see loopy's view of a kernel by printing it.
 
 .. doctest::
 
+    >>> knl = lp.set_options(knl, allow_terminal_colors=False)
     >>> print(knl)
     ---------------------------------------------------------------------------
     KERNEL: loopy_kernel
@@ -245,8 +246,6 @@ call :func:`loopy.generate_code`:
 .. doctest::
 
     >>> typed_knl = lp.add_dtypes(knl, dict(a=np.float32))
-    >>> typed_knl = lp.preprocess_kernel(typed_knl, device=ctx.devices[0])
-    >>> typed_knl = lp.get_one_scheduled_kernel(typed_knl)
     >>> code, _ = lp.generate_code(typed_knl)
     >>> print(code)
     #define lid(N) ((int) get_local_id(N))
diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py
index 199a4dd038b1b79e3364c9a0c053300c5407bff0..a402896c85baa49bc23bb5d770607cf73641c273 100644
--- a/examples/python/ispc-stream-harness.py
+++ b/examples/python/ispc-stream-harness.py
@@ -59,29 +59,6 @@ def main():
     with open("tasksys.cpp", "r") as ts_file:
         tasksys_source = ts_file.read()
 
-    if 0:
-        from loopy.target.ispc import ISPCTarget
-        stream_knl = lp.make_kernel(
-                "{[i]: 0<=i<n}",
-                "z[i] = x[i] + a*y[i]",
-                target=ISPCTarget(),
-                index_dtype=INDEX_DTYPE)
-
-        stream_knl = lp.add_and_infer_dtypes(stream_knl, {
-            "a": STREAM_DTYPE,
-            "x": STREAM_DTYPE,
-            "y": STREAM_DTYPE
-            })
-
-        stream_knl = lp.assume(stream_knl, "n>0")
-        stream_knl = lp.split_iname(stream_knl,
-                "i", 2**18, outer_tag="g.0", slabs=(0, 1))
-        stream_knl = lp.split_iname(stream_knl, "i_inner", 8, inner_tag="l.0")
-        stream_knl = lp.preprocess_kernel(stream_knl)
-        stream_knl = lp.get_one_scheduled_kernel(stream_knl)
-        stream_knl = lp.set_argument_order(stream_knl, "n,a,x,y,z")
-        ispc_code, arg_info = lp.generate_code(stream_knl)
-
     def make_knl(name, insn, vars):
         knl = lp.make_kernel(
                 "{[i]: 0<=i<n}",
@@ -120,7 +97,7 @@ def main():
                     ]
                     + (["--addressing=64"] if INDEX_DTYPE == np.int64 else [])
                     ),
-                ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc",
+                #ispc_bin="/home/andreask/pack/ispc-v1.9.0-linux/ispc",
                 quiet=False,
                 )
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index c71a03fec9715e40c371f57a3392aeded687c3bf..5e3ad508596a20cce8a084e4d70604cdafa47c79 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -60,7 +60,8 @@ from loopy.transform.iname import (
         split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames,
         rename_iname, link_inames, remove_unused_inames,
         split_reduction_inward, split_reduction_outward,
-        affine_map_inames, find_unused_axis_tag)
+        affine_map_inames, find_unused_axis_tag,
+        make_reduction_inames_unique)
 
 from loopy.transform.instruction import (
         find_instructions, map_instructions,
@@ -144,6 +145,7 @@ __all__ = [
         "rename_iname", "link_inames", "remove_unused_inames",
         "split_reduction_inward", "split_reduction_outward",
         "affine_map_inames", "find_unused_axis_tag",
+        "make_reduction_inames_unique",
 
         "add_prefetch", "change_arg_to_image", "tag_data_axes",
         "set_array_dim_names", "remove_unused_arguments",
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 2e136d7bb5264238f024775b4b9b9d31479bb914..91af52cde0000ce811172be53bf21e13ce5fde97 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -464,10 +464,15 @@ def generate_code(kernel, device=None):
         warn("passing 'device' to generate_code() is deprecated",
                 DeprecationWarning, stacklevel=2)
 
+    from loopy.kernel import kernel_state
+    if kernel.state == kernel_state.INITIAL:
+        from loopy.preprocess import preprocess_kernel
+        kernel = preprocess_kernel(kernel)
+
     if kernel.schedule is None:
         from loopy.schedule import get_one_scheduled_kernel
         kernel = get_one_scheduled_kernel(kernel)
-    from loopy.kernel import kernel_state
+
     if kernel.state != kernel_state.SCHEDULED:
         raise LoopyError("cannot generate code for a kernel that has not been "
                 "scheduled")
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 91c22dbeb9d7c1cb40abc83a52aab82e7d7d85b2..fb081a959523114ee5fc823eec59420f4adf3457 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1012,6 +1012,9 @@ class LoopKernel(RecordWithoutPickling):
 
         printed_insn_ids = set()
 
+        Fore = self.options._fore
+        Style = self.options._style
+
         def print_insn(insn):
             if insn.id in printed_insn_ids:
                 return
@@ -1035,7 +1038,7 @@ class LoopKernel(RecordWithoutPickling):
 
             loop_list = ",".join(sorted(kernel.insn_inames(insn)))
 
-            options = [insn.id]
+            options = [Fore.GREEN+insn.id+Style.RESET_ALL]
             if insn.priority:
                 options.append("priority=%d" % insn.priority)
             if insn.tags:
@@ -1048,12 +1051,15 @@ class LoopKernel(RecordWithoutPickling):
             if len(loop_list) > loop_list_width:
                 lines.append("[%s]" % loop_list)
                 lines.append("%s%s <- %s   # %s" % (
-                    (loop_list_width+2)*" ", lhs,
-                    rhs, ", ".join(options)))
+                    (loop_list_width+2)*" ", Fore.BLUE+lhs+Style.RESET_ALL,
+                    Fore.MAGENTA+rhs+Style.RESET_ALL,
+                    ", ".join(options)))
             else:
                 lines.append("[%s]%s%s <- %s   # %s" % (
                     loop_list, " "*(loop_list_width-len(loop_list)),
-                    lhs, rhs, ",".join(options)))
+                    Fore.BLUE + lhs + Style.RESET_ALL,
+                    Fore.MAGENTA+rhs+Style.RESET_ALL,
+                    ",".join(options)))
 
             lines.extend(trailing)
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 46b301b47c45430078f554a178448687eb490e94..cb72f283e2151fd8d097f19eb4e4876cc78ccd43 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -770,7 +770,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
         If *axis* is None, find a suitable axis automatically.
         """
         try:
-            desired_length = kernel.get_constant_iname_length(iname)
+            with isl.SuppressedWarnings(kernel.isl_context):
+                desired_length = kernel.get_constant_iname_length(iname)
         except isl.Error:
             # Likely unbounded, automatic assignment is not
             # going to happen for this iname.
@@ -882,7 +883,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 
             def get_iname_length(iname):
                 try:
-                    return kernel.get_constant_iname_length(iname)
+                    with isl.SuppressedWarnings(kernel.isl_context):
+                        return kernel.get_constant_iname_length(iname)
                 except isl.Error:
                     return -1
             # assign longest auto axis inames first
diff --git a/loopy/options.py b/loopy/options.py
index 049d21affd284d94f27416d018d38bdbd5b4bddc..9b27f111e2fb93b09090b35278a3753f9f1a3f82 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -27,6 +27,24 @@ from pytools import Record
 import re
 
 
+class _ColoramaStub(object):
+    """Stand-in for colorama's ``Fore``/``Back``/``Style`` objects.
+
+    Every ordinary attribute evaluates to the empty string, so code that
+    concatenates color codes into its output works unchanged (and emits no
+    escape sequences) when terminal colors are disabled.
+    """
+
+    def __getattr__(self, name):
+        # __getattr__ (unlike __getattribute__) is only consulted once normal
+        # lookup has failed, so __class__, __dict__ and friends stay intact.
+        # Special names must keep raising AttributeError: returning "" for,
+        # e.g., __deepcopy__ would make copy.deepcopy() try to call a string.
+        if name.startswith("__") and name.endswith("__"):
+            raise AttributeError(name)
+        return ""
+
+
 class Options(Record):
     """
     Unless otherwise specified, these options are Boolean-valued
@@ -105,6 +110,11 @@ class Options(Record):
 
         Options to pass to the OpenCL compiler when building the kernel.
         A list of strings.
+
+    .. attribute:: allow_terminal_colors
+
+        A :class:`bool`. Whether to allow colors in terminal output
+
     """
 
     def __init__(
@@ -124,6 +134,7 @@ class Options(Record):
             write_wrapper=False, highlight_wrapper=False,
             write_cl=False, highlight_cl=False,
             edit_cl=False, cl_build_options=[],
+            allow_terminal_colors=True
             ):
         Record.__init__(
                 self,
@@ -137,6 +148,7 @@ class Options(Record):
                 write_wrapper=write_wrapper, highlight_wrapper=highlight_wrapper,
                 write_cl=write_cl, highlight_cl=highlight_cl,
                 edit_cl=edit_cl, cl_build_options=cl_build_options,
+                allow_terminal_colors=allow_terminal_colors,
                 )
 
     def update(self, other):
@@ -150,6 +162,36 @@ class Options(Record):
         for field_name in sorted(self.__class__.fields):
             key_builder.rec(key_hash, getattr(self, field_name))
 
+    @property
+    def _fore(self):
+        """Return ``colorama.Fore``, or an all-empty stub if colors are off."""
+        if not self.allow_terminal_colors:
+            return _ColoramaStub()
+
+        # colorama is only imported when colors are actually enabled.
+        import colorama
+        return colorama.Fore
+
+    @property
+    def _back(self):
+        """Return ``colorama.Back``, or an all-empty stub if colors are off."""
+        if not self.allow_terminal_colors:
+            return _ColoramaStub()
+
+        # colorama is only imported when colors are actually enabled.
+        import colorama
+        return colorama.Back
+
+    @property
+    def _style(self):
+        """Return ``colorama.Style``, or an all-empty stub if colors are off."""
+        if not self.allow_terminal_colors:
+            return _ColoramaStub()
+
+        # colorama is only imported when colors are actually enabled.
+        import colorama
+        return colorama.Style
+
 
 KEY_VAL_RE = re.compile("^([a-zA-Z0-9]+)=(.*)$")
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index fe88118e018d5829e9ff2104b70940a39cb95ade..4c75cfd250807c6959c1e5167465d34b029d762e 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -66,6 +66,55 @@ def prepare_for_caching(kernel):
 # }}}
 
 
+# {{{ check reduction iname uniqueness
+
+def check_reduction_iname_uniqueness(kernel):
+    """Raise if an iname is shared between reductions in a disallowed way.
+
+    Tallies, over all instructions of *kernel*, how many reductions use each
+    iname. An iname appearing in more than one reduction is an error unless
+    every one of those reductions allows simultaneous use
+    (``allow_simultaneous``, i.e. was written as ``simul_reduce``).
+
+    :raises LoopyError: if such a disallowed multiple use is found.
+    """
+    total_uses = {}
+    nonsimul_uses = {}
+
+    def tally_reduction(expr, rec):
+        # Visit the reduced expression as well before counting this one.
+        rec(expr.expr)
+
+        for iname in expr.inames:
+            total_uses[iname] = total_uses.get(iname, 0) + 1
+            if not expr.allow_simultaneous:
+                nonsimul_uses[iname] = nonsimul_uses.get(iname, 0) + 1
+
+        return expr
+
+    from loopy.symbolic import ReductionCallbackMapper
+    tally_mapper = ReductionCallbackMapper(tally_reduction)
+
+    for insn in kernel.instructions:
+        # Run purely for the counting side effect; the result is not needed.
+        insn.with_transformed_expressions(tally_mapper)
+
+    for iname, count in six.iteritems(total_uses):
+        if count <= 1 or not nonsimul_uses.get(iname, 0):
+            continue
+
+        raise LoopyError("iname '%s' used in more than one reduction. "
+                "(%d of them, to be precise.) "
+                "Since this usage can easily cause loop scheduling "
+                "problems, this is prohibited by default. "
+                "Use loopy.make_reduction_inames_unique() to fix this. "
+                "If you are sure that this is OK, write the reduction "
+                "as 'simul_reduce(...)' instead of 'reduce(...)'"
+                % (iname, count))
+
+# }}}
+
+
 # {{{ infer types
 
 def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
@@ -677,11 +716,13 @@ def preprocess_kernel(kernel, device=None):
     kernel = expand_subst(kernel)
 
     # Ordering restriction:
-    # Type inference doesn't handle substitutions. Get them out of the
-    # way.
+    # Type inference and reduction iname uniqueness don't handle substitutions.
+    # Get them out of the way.
 
     kernel = infer_unknown_types(kernel, expect_completion=False)
 
+    check_reduction_iname_uniqueness(kernel)
+
     kernel = add_default_dependencies(kernel)
 
     # Ordering restrictions:
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 17c1ab3ce0bad0170f8f7b03e243862de33af21d..de71ffaed0d457f96ee6ddbdeeafac58c0959aa9 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -317,11 +317,54 @@ def group_insn_counts(kernel):
 
     return result
 
+
+def gen_dependencies_except(kernel, insn_id, except_insn_ids):
+    """Yield the IDs of the direct and indirect dependencies of *insn_id*.
+
+    Dependencies contained in *except_insn_ids* are skipped, and the search
+    does not continue through them. Each ID is yielded at most once, which
+    also keeps the traversal finite and linear in size when dependency
+    chains share instructions.
+    """
+    seen_ids = set()
+
+    def visit(cur_id):
+        for dep_id in kernel.id_to_insn[cur_id].depends_on:
+            if dep_id in except_insn_ids or dep_id in seen_ids:
+                continue
+
+            seen_ids.add(dep_id)
+            yield dep_id
+
+            for sub_dep_id in visit(dep_id):
+                yield sub_dep_id
+
+    return visit(insn_id)
+
 # }}}
 
 
 # {{{ debug help
 
+def format_insn_id(kernel, insn_id):
+    """Return *insn_id* wrapped in the kernel options' green color codes."""
+    Fore = kernel.options._fore
+    Style = kernel.options._style
+    return Fore.GREEN + insn_id + Style.RESET_ALL
+
+
+def format_insn(kernel, insn_id):
+    """Return a one-line ``[id] assignee <- expression`` rendering of the
+    instruction *insn_id*, colorized via the kernel options."""
+    insn = kernel.id_to_insn[insn_id]
+    Fore = kernel.options._fore
+    Style = kernel.options._style
+    return "[%s] %s%s%s <- %s%s%s" % (
+            format_insn_id(kernel, insn_id),
+            Fore.BLUE, str(insn.assignee), Style.RESET_ALL,
+            Fore.MAGENTA, str(insn.expression), Style.RESET_ALL)
+
+
 def dump_schedule(kernel, schedule):
     lines = []
     indent = ""
@@ -337,8 +366,7 @@ def dump_schedule(kernel, schedule):
         elif isinstance(sched_item, RunInstruction):
             insn = kernel.id_to_insn[sched_item.insn_id]
             if isinstance(insn, Assignment):
-                insn_str = "[%s] %s <- %s" % (
-                        insn.id, str(insn.assignee), str(insn.expression))
+                insn_str = format_insn(kernel, sched_item.insn_id)
             else:
                 insn_str = sched_item.insn_id
             lines.append(indent + insn_str)
@@ -482,6 +510,8 @@ def generate_loop_schedules_internal(
     # to give loops containing high-priority instructions a chance.
 
     kernel = sched_state.kernel
+    Fore = kernel.options._fore
+    Style = kernel.options._style
 
     if allow_boost is None:
         rec_allow_boost = None
@@ -510,7 +540,7 @@ def generate_loop_schedules_internal(
         print(dump_schedule(sched_state.kernel, sched_state.schedule))
         #print("boost allowed:", allow_boost)
         print(75*"=")
-        print("LOOP NEST MAP:")
+        print("LOOP NEST MAP (inner: outer):")
         for iname, val in six.iteritems(sched_state.loop_nest_around_map):
             print("%s : %s" % (iname, ", ".join(val)))
         print(75*"=")
@@ -549,7 +579,7 @@ def generate_loop_schedules_internal(
         if not is_ready:
             if debug_mode:
                 print("instruction '%s' is missing insn depedencies '%s'" % (
-                        insn.id, ",".join(
+                        format_insn(kernel, insn.id), ",".join(
                             insn.depends_on - sched_state.scheduled_insn_ids)))
             continue
 
@@ -570,10 +600,10 @@ def generate_loop_schedules_internal(
             if debug_mode:
                 if want-have:
                     print("instruction '%s' is missing inames '%s'"
-                            % (insn.id, ",".join(want-have)))
+                            % (format_insn(kernel, insn.id), ",".join(want-have)))
                 if have-want:
                     print("instruction '%s' won't work under inames '%s'"
-                            % (insn.id, ",".join(have-want)))
+                            % (format_insn(kernel, insn.id), ",".join(have-want)))
 
         # {{{ determine group-based readiness
 
@@ -595,7 +625,7 @@ def generate_loop_schedules_internal(
         # }}}
 
         if is_ready and debug_mode:
-            print("ready to schedule '%s'" % insn.id)
+            print("ready to schedule '%s'" % format_insn(kernel, insn.id))
 
         if is_ready and not debug_mode:
             iid_set = frozenset([insn.id])
@@ -660,7 +690,38 @@ def generate_loop_schedules_internal(
                 if last_entered_loop in kernel.insn_inames(insn):
                     if debug_mode:
                         print("cannot leave '%s' because '%s' still depends on it"
-                                % (last_entered_loop, insn.id))
+                                % (last_entered_loop, format_insn(kernel, insn.id)))
+
+                        # check if there's a dependency of insn that needs to be
+                        # outside of last_entered_loop.
+                        for subdep_id in gen_dependencies_except(kernel, insn_id,
+                                sched_state.unscheduled_insn_ids):
+                            subdep = kernel.id_to_insn[subdep_id]
+                            want = (kernel.insn_inames(subdep_id)
+                                    - sched_state.parallel_inames)
+                            if (
+                                    last_entered_loop not in want and
+                                    last_entered_loop not in subdep.boostable_into):
+                                print(
+                                    "%(warn)swarning:%(reset_all)s '%(iname)s', "
+                                    "which the schedule is "
+                                    "currently stuck inside of, seems mis-nested. "
+                                    "'%(subdep)s' must occur " "before '%(dep)s', "
+                                    "but '%(subdep)s' must be outside "
+                                    "'%(iname)s', whereas '%(dep)s' must be back "
+                                    "in it.%(reset_all)s\n"
+                                    "  %(subdep_i)s\n"
+                                    "  %(dep_i)s"
+                                    % {
+                                        "warn": Fore.RED + Style.BRIGHT,
+                                        "reset_all": Style.RESET_ALL,
+                                        "iname": last_entered_loop,
+                                        "subdep": format_insn_id(kernel, subdep_id),
+                                        "dep": format_insn_id(kernel, insn_id),
+                                        "subdep_i": format_insn(kernel, subdep_id),
+                                        "dep_i": format_insn(kernel, insn_id),
+                                        })
+
                     can_leave = False
                     break
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index f0a01463a9045ad93694bf26cf95f6cd3393d1fc..ab0743f5b3ba54f214512d7be1ea4db8caffd2c8 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -49,6 +49,7 @@ __doc__ = """
 
 """
 
+
 # {{{ ToCountMap
 
 class ToCountMap:
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 7adab80c68c38f900976eb1adcd90226f40a7d9b..b887c703420d092d7f3c0fc9c729dd1d1f942a76 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -79,7 +79,20 @@ class IdentityMapperMixin(object):
         return expr
 
     def map_reduction(self, expr, *args):
-        return Reduction(expr.operation, expr.inames, self.rec(expr.expr, *args))
+        mapped_inames = [self.rec(Variable(iname), *args) for iname in expr.inames]
+
+        new_inames = []
+        for iname, new_sym_iname in zip(expr.inames, mapped_inames):
+            if not isinstance(new_sym_iname, Variable):
+                from loopy.diagnostic import LoopyError
+                raise LoopyError("%s did not map iname '%s' to a variable"
+                        % (type(self).__name__, iname))
+
+            new_inames.append(new_sym_iname.name)
+
+        return Reduction(
+                expr.operation, tuple(new_inames), self.rec(expr.expr, *args),
+                allow_simultaneous=expr.allow_simultaneous)
 
     def map_tagged_variable(self, expr, *args):
         # leaf, doesn't change
@@ -146,7 +159,8 @@ class StringifyMapper(StringifyMapperBase):
         return "loc.%d" % expr.index
 
     def map_reduction(self, expr, prec):
-        return "reduce(%s, [%s], %s)" % (
+        return "%sreduce(%s, [%s], %s)" % (
+                "simul_" if expr.allow_simultaneous else "",
                 expr.operation, ", ".join(expr.inames), expr.expr)
 
     def map_tagged_variable(self, expr, prec):
@@ -346,11 +360,16 @@ class Reduction(AlgebraicLeaf):
 
         The expression (as a :class:`pymbolic.primitives.Expression`)
         on which reduction is performed.
+
+    .. attribute:: allow_simultaneous
+
+        A :class:`bool`. If not *True*, an iname is allowed to be used
+        in precisely one reduction, to avoid mis-nesting errors.
     """
 
-    init_arg_names = ("operation", "inames", "expr")
+    init_arg_names = ("operation", "inames", "expr", "allow_simultaneous")
 
-    def __init__(self, operation, inames, expr):
+    def __init__(self, operation, inames, expr, allow_simultaneous=False):
         if isinstance(inames, str):
             inames = tuple(iname.strip() for iname in inames.split(","))
 
@@ -378,9 +397,10 @@ class Reduction(AlgebraicLeaf):
         self.operation = operation
         self.inames = inames
         self.expr = expr
+        self.allow_simultaneous = allow_simultaneous
 
     def __getinitargs__(self):
-        return (self.operation, self.inames, self.expr)
+        return (self.operation, self.inames, self.expr, self.allow_simultaneous)
 
     def get_hash(self):
         return hash((self.__class__, self.operation, self.inames,
@@ -779,7 +799,8 @@ class FunctionToPrimitiveMapper(IdentityMapper):
     turns those into the actual pymbolic primitives used for that.
     """
 
-    def _parse_reduction(self, operation, inames, red_expr):
+    def _parse_reduction(self, operation, inames, red_expr,
+            allow_simultaneous=False):
         if isinstance(inames, Variable):
             inames = (inames,)
 
@@ -795,7 +816,8 @@ class FunctionToPrimitiveMapper(IdentityMapper):
 
             processed_inames.append(iname.name)
 
-        return Reduction(operation, tuple(processed_inames), red_expr)
+        return Reduction(operation, tuple(processed_inames), red_expr,
+                allow_simultaneous=allow_simultaneous)
 
     def map_call(self, expr):
         from loopy.library.reduction import parse_reduction_op
@@ -820,7 +842,7 @@ class FunctionToPrimitiveMapper(IdentityMapper):
             else:
                 raise TypeError("cse takes two arguments")
 
-        elif name == "reduce":
+        elif name in ["reduce", "simul_reduce"]:
             if len(expr.parameters) == 3:
                 operation, inames, red_expr = expr.parameters
 
@@ -829,7 +851,8 @@ class FunctionToPrimitiveMapper(IdentityMapper):
                             "must be a symbol")
 
                 operation = parse_reduction_op(operation.name)
-                return self._parse_reduction(operation, inames, self.rec(red_expr))
+                return self._parse_reduction(operation, inames, self.rec(red_expr),
+                        allow_simultaneous=(name == "simul_reduce"))
             else:
                 raise TypeError("invalid 'reduce' calling sequence")
 
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index 64332637910340d68cb035d64ad6f4f643c0b5c9..7b1deb7951392e2e0c46360f8fd979ebf5aedb37 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -353,6 +353,26 @@ def remove_unused_arguments(knl):
     for insn in exp_knl.instructions:
         refd_vars.update(insn.dependency_names())
 
+    from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag
+    from loopy.symbolic import get_dependencies
+    from itertools import chain
+
+    def tolerant_get_deps(expr):
+        if expr is None or expr is lp.auto:
+            return set()
+        return get_dependencies(expr)
+
+    for ary in chain(knl.args, six.itervalues(knl.temporary_variables)):
+        if isinstance(ary, ArrayBase):
+            refd_vars.update(
+                    tolerant_get_deps(ary.shape)
+                    | tolerant_get_deps(ary.offset))
+
+            for dim_tag in ary.dim_tags:
+                if isinstance(dim_tag, FixedStrideArrayDimTag):
+                    refd_vars.update(
+                            tolerant_get_deps(dim_tag.stride))
+
     for arg in knl.args:
         if arg.name in refd_vars:
             new_args.append(arg)
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index bf435d3fe08d022790bf31a4d583d4923f0bfeff..e44f8abe227d451e8e940708530f6c20566685e8 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -210,8 +210,9 @@ def _fuse_two_kernels(knla, knlb):
 
     from pymbolic.imperative.transform import \
             fuse_instruction_streams_with_unique_ids
-    new_instructions, _ = fuse_instruction_streams_with_unique_ids(
-            knla.instructions, knlb.instructions)
+    new_instructions, old_b_id_to_new_b_id = \
+            fuse_instruction_streams_with_unique_ids(
+                    knla.instructions, knlb.instructions)
 
     # {{{ fuse assumptions
 
@@ -283,12 +284,12 @@ def _fuse_two_kernels(knla, knlb):
                 "target",
                 knla.target,
                 knlb.target),
-            options=knla.options)
+            options=knla.options), old_b_id_to_new_b_id
 
 # }}}
 
 
-def fuse_kernels(kernels, suffixes=None):
+def fuse_kernels(kernels, suffixes=None, data_flow=None):
     """Return a kernel that performs all the operations in all entries
     of *kernels*.
 
@@ -296,6 +297,11 @@ def fuse_kernels(kernels, suffixes=None):
     :arg suffixes: If given, must be a list of strings of a length matching
         that of *kernels*. This will be used to disambiguate the names
         of temporaries, as described below.
+    :arg data_flow: A list of data dependencies
+        ``[(var_name, from_kernel, to_kernel), ...]``.
+        Based on this, the fuser will create dependencies between all
+        writers of *var_name* in ``kernels[from_kernel]`` to
+        readers of *var_name* in ``kernels[to_kernel]``.
 
     The components of the kernels are fused as follows:
 
@@ -321,9 +327,16 @@ def fuse_kernels(kernels, suffixes=None):
     *   The resulting kernel will contain all instructions from each entry
         of *kernels*. Clashing instruction IDs will be renamed to ensure
         uniqueness.
+
+    .. versionchanged:: 2016.2
+
+        *data_flow* was added in version 2016.2
     """
     kernels = list(kernels)
 
+    if data_flow is None:
+        data_flow = []
+
     if suffixes:
         suffixes = list(suffixes)
         if len(suffixes) != len(kernels):
@@ -356,9 +369,46 @@ def fuse_kernels(kernels, suffixes=None):
 
         # }}}
 
-    result = kernels.pop(0)
-    while kernels:
-        result = _fuse_two_kernels(result, kernels.pop(0))
+    kernel_insn_ids = []
+    result = None
+
+    for knlb in kernels:
+        if result is None:
+            result = knlb
+            kernel_insn_ids.append([
+                insn.id for insn in knlb.instructions])
+        else:
+            result, old_b_id_to_new_b_id = _fuse_two_kernels(
+                    knla=result,
+                    knlb=knlb)
+
+            kernel_insn_ids.append([
+                old_b_id_to_new_b_id[insn.id]
+                for insn in knlb.instructions])
+
+    # {{{ realize data_flow dependencies
+
+    id_to_insn = result.id_to_insn.copy()
+
+    for var_name, from_kernel, to_kernel in data_flow:
+        from_writer_ids = frozenset(
+                insn_id
+                for insn_id in kernel_insn_ids[from_kernel]
+                if var_name in id_to_insn[insn_id].assignee_var_names())
+
+        for insn_id in kernel_insn_ids[to_kernel]:
+            insn = id_to_insn[insn_id]
+            if var_name in insn.read_dependency_names():
+                insn = insn.copy(depends_on=insn.depends_on | from_writer_ids)
+
+            id_to_insn[insn_id] = insn
+
+    result = result.copy(instructions=[
+            id_to_insn[insn_id]
+            for insn_ids in kernel_insn_ids
+            for insn_id in insn_ids])
+
+    # }}}
 
     return result
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index c98ed26b41d69189edc8253449b226524a365367..b42b338a6856a0603b76d44604c7ea3da2f065a5 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -66,6 +66,8 @@ __doc__ = """
 
 .. autofunction:: find_unused_axis_tag
 
+.. autofunction:: make_reduction_inames_unique
+
 """
 
 
@@ -119,7 +121,8 @@ class _InameSplitter(RuleAwareIdentityMapper):
 
             from loopy.symbolic import Reduction
             return Reduction(expr.operation, tuple(new_inames),
-                        self.rec(expr.expr, expn_state))
+                        self.rec(expr.expr, expn_state),
+                        expr.allow_simultaneous)
         else:
             return super(_InameSplitter, self).map_reduction(expr, expn_state)
 
@@ -444,7 +447,8 @@ class _InameJoiner(RuleAwareSubstitutionMapper):
 
             from loopy.symbolic import Reduction
             return Reduction(expr.operation, tuple(new_inames),
-                        self.rec(expr.expr, expn_state))
+                        self.rec(expr.expr, expn_state),
+                        expr.allow_simultaneous)
         else:
             return super(_InameJoiner, self).map_reduction(expr, expn_state)
 
@@ -676,7 +680,8 @@ class _InameDuplicator(RuleAwareIdentityMapper):
 
             from loopy.symbolic import Reduction
             return Reduction(expr.operation, new_inames,
-                        self.rec(expr.expr, expn_state))
+                        self.rec(expr.expr, expn_state),
+                        expr.allow_simultaneous)
         else:
             return super(_InameDuplicator, self).map_reduction(expr, expn_state)
 
@@ -1074,11 +1079,14 @@ class _ReductionSplitter(RuleAwareIdentityMapper):
             if self.direction == "in":
                 return Reduction(expr.operation, tuple(leftover_inames),
                         Reduction(expr.operation, tuple(self.inames),
-                            self.rec(expr.expr, expn_state)))
+                            self.rec(expr.expr, expn_state),
+                            expr.allow_simultaneous),
+                        expr.allow_simultaneous)
             elif self.direction == "out":
                 return Reduction(expr.operation, tuple(self.inames),
                         Reduction(expr.operation, tuple(leftover_inames),
-                            self.rec(expr.expr, expn_state)))
+                            self.rec(expr.expr, expn_state),
+                            expr.allow_simultaneous), expr.allow_simultaneous)
             else:
                 assert False
         else:
@@ -1399,4 +1407,108 @@ def separate_loop_head_tail_slab(kernel, iname, head_it_count, tail_it_count):
 
 # }}}
 
+
+# {{{ make_reduction_inames_unique
+
+class _ReductionInameUniquifier(RuleAwareIdentityMapper):
+    def __init__(self, rule_mapping_context, inames, within):
+        super(_ReductionInameUniquifier, self).__init__(rule_mapping_context)
+        # inames: the only inames eligible for renaming, or None for all of them
+        self.inames = inames
+        self.old_to_new = []  # (old_iname, new_iname) pairs
+        self.within = within
+        # per-iname usage tallies (the nonsimultaneous one is never read here)
+        self.iname_to_red_count = {}
+        self.iname_to_nonsimultaneous_red_count = {}
+
+    def map_reduction(self, expr, expn_state):
+        within = self.within(
+                    expn_state.kernel,
+                    expn_state.instruction,
+                    expn_state.stack)
+        # tally every reduction visited, whether or not *within* matched it
+        for iname in expr.inames:
+            self.iname_to_red_count[iname] = (
+                    self.iname_to_red_count.get(iname, 0) + 1)
+            if not expr.allow_simultaneous:
+                self.iname_to_nonsimultaneous_red_count[iname] = (
+                    self.iname_to_nonsimultaneous_red_count.get(iname, 0) + 1)
+        # only matched reductions with allow_simultaneous unset are rewritten
+        if within and not expr.allow_simultaneous:
+            subst_dict = {}
+
+            from pymbolic import var
+            # an iname is kept if it is not selected or this is its first use
+            new_inames = []
+            for iname in expr.inames:
+                if (
+                        not (self.inames is None or iname in self.inames)
+                        or
+                        self.iname_to_red_count[iname] <= 1):
+                    new_inames.append(iname)
+                    continue
+                # repeated use: rename it here and remember the pair
+                new_iname = self.rule_mapping_context.make_unique_var_name(iname)
+                subst_dict[iname] = var(new_iname)
+                self.old_to_new.append((iname, new_iname))
+                new_inames.append(new_iname)
+            # substitute the new names into the body, then recurse into it
+            from loopy.symbolic import SubstitutionMapper
+            from pymbolic.mapper.substitutor import make_subst_func
+
+            from loopy.symbolic import Reduction
+            return Reduction(expr.operation, tuple(new_inames),
+                    self.rec(
+                        SubstitutionMapper(make_subst_func(subst_dict))(
+                            expr.expr),
+                        expn_state),
+                    expr.allow_simultaneous)
+        else:
+            return super(_ReductionInameUniquifier, self).map_reduction(
+                    expr, expn_state)
+
+
+def make_reduction_inames_unique(kernel, inames=None, within=None):
+    """Give non-simultaneous reductions fresh names for already-used inames.
+
+    :arg inames: if not *None*, only apply to these inames
+    :arg within: a stack match as understood by
+        :func:`loopy.context_matching.parse_stack_match`.
+
+    .. versionadded:: 2016.2
+    """
+    name_gen = kernel.get_var_name_generator()
+
+    from loopy.context_matching import parse_stack_match
+    within = parse_stack_match(within)
+
+    # {{{ change kernel
+    # rewrite the expressions; r_uniq.old_to_new collects the (old, new) pairs
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, name_gen)
+    r_uniq = _ReductionInameUniquifier(rule_mapping_context,
+            inames, within=within)
+
+    kernel = rule_mapping_context.finish_kernel(
+            r_uniq.map_kernel(kernel))
+
+    # }}}
+
+    # {{{ duplicate the inames
+    # the new names are only used in expressions so far; add them to the domains
+    for old_iname, new_iname in r_uniq.old_to_new:
+        from loopy.kernel.tools import DomainChanger
+        domch = DomainChanger(kernel, frozenset([old_iname]))
+
+        from loopy.isl_helpers import duplicate_axes
+        kernel = kernel.copy(
+                domains=domch.get_domains_with(
+                    duplicate_axes(domch.domain, [old_iname], [new_iname])))
+
+    # }}}
+
+    return kernel
+
+# }}}
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 9ce1f9c5420e8dbbc0d3a94b6c465e443c5f109d..e599c902227faf8d1292ece2307d097bc8fd7c19 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -349,7 +349,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
 
     usage_to_definition = {}
 
-    for insn in kernel.instructions:
+    for insn in dep_kernel.instructions:
         if lhs_name not in insn.read_dependency_names():
             continue
 
diff --git a/loopy/version.py b/loopy/version.py
index 9ad8ac19bebff7a712e91900815057155205ae57..adc069663503b200bcdd1638c05ae0ffae5f14df 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v18-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v19-islpy%s" % _islpy_version
diff --git a/setup.py b/setup.py
index 4fbb4e142b0680e28e88282409655c5a644a6f4e..5ed095315234339709309a1c55ec88c7fdab6bfa 100644
--- a/setup.py
+++ b/setup.py
@@ -40,8 +40,9 @@ setup(name="loo.py",
           "pytools>=2016.1",
           "pymbolic>=2016.2",
           "cgen>=2016.1",
-          "islpy>=2016.1",
+          "islpy>=2016.1.2",
           "six>=1.8.0",
+          "colorama",
           ],
 
       extras_require={
diff --git a/test/test_dg.py b/test/test_dg.py
index 63a961423d2f750a4c9a25fdcb5fb56a479d8a35..fafef86c35211183ebdaeb75acf2b664a36586a0 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -52,10 +52,10 @@ def test_dg_volume(ctx_factory):
             "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
             ],
             """
-                <> du_drst = sum(m, DrDsDt[n,m]*u[k,m])
-                <> dv_drst = sum(m, DrDsDt[n,m]*v[k,m])
-                <> dw_drst = sum(m, DrDsDt[n,m]*w[k,m])
-                <> dp_drst = sum(m, DrDsDt[n,m]*p[k,m])
+                <> du_drst = simul_reduce(sum, m, DrDsDt[n,m]*u[k,m])
+                <> dv_drst = simul_reduce(sum, m, DrDsDt[n,m]*v[k,m])
+                <> dw_drst = simul_reduce(sum, m, DrDsDt[n,m]*w[k,m])
+                <> dp_drst = simul_reduce(sum, m, DrDsDt[n,m]*p[k,m])
 
                 # volume flux
                 rhsu[k,n] = dot(drst_dx[k],dp_drst)
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 1fed3289aac1c184b2267e3425aed2d8023f9a03..606eec7667ca3d76a215c3b487e8c93bc371c36e 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -944,6 +944,27 @@ def test_double_sum(ctx_factory):
 
     n = 20
 
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i,j<n }",
+            [
+                "a = simul_reduce(sum, (i,j), i*j)",
+                "b = simul_reduce(sum, i, simul_reduce(sum, j, i*j))",
+                ],
+            assumptions="n>=1")
+
+    evt, (a, b) = knl(queue, n=n)
+
+    ref = sum(i*j for i in range(n) for j in range(n))
+    assert a.get() == ref
+    assert b.get() == ref
+
+
+def test_double_sum_made_unique(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 20
+
     knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
             [
@@ -952,9 +973,10 @@ def test_double_sum(ctx_factory):
                 ],
             assumptions="n>=1")
 
-    cknl = lp.CompiledKernel(ctx, knl)
+    knl = lp.make_reduction_inames_unique(knl)
+    print(knl)
 
-    evt, (a, b) = cknl(queue, n=n)
+    evt, (a, b) = knl(queue, n=n)
 
     ref = sum(i*j for i in range(n) for j in range(n))
     assert a.get() == ref
@@ -1098,8 +1120,8 @@ def test_arg_guessing_with_reduction(ctx_factory):
     knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
             """
-                a = 1.5 + sum((i,j), i*j)
-                d = 1.5 + sum((i,j), b[i,j])
+                a = 1.5 + simul_reduce(sum, (i,j), i*j)
+                d = 1.5 + simul_reduce(sum, (i,j), b[i,j])
                 b[i, j] = i*j
                 c[i+j, j] = b[j,i]
                 """,
@@ -1895,19 +1917,22 @@ def test_poisson(ctx_factory):
     sdim = 3
 
     knl = lp.make_kernel(
-            "{ [c,i,j,k,ell,ell2]: \
+            "{ [c,i,j,k,ell,ell2,ell3]: \
             0 <= c < nels and \
             0 <= i < nbf and \
             0 <= j < nbf and \
             0 <= k < nqp and \
-            0 <= ell < sdim and \
-            0 <= ell2 < sdim }",
+            0 <= ell,ell2 < sdim}",
             """
-            dpsi(bf,k0,dir) := sum(ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
-            Ael[c,i,j] = J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
+            dpsi(bf,k0,dir) := \
+                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
+            Ael[c,i,j] = \
+                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
             """,
             assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")
 
+    print(knl)
+
     knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)
 
     ref_knl = knl
@@ -1978,7 +2003,7 @@ def test_generate_c_snippet():
     u = var("u")
 
     from functools import partial
-    l_sum = partial(lp.Reduction, "sum")
+    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)
 
     Instr = lp.Assignment  # noqa
 
@@ -2269,7 +2294,10 @@ def test_finite_difference_expr_subst(ctx_factory):
             lp.GlobalArg("u", shape="n+2"),
             ])
 
-    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl])
+    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
+            data_flow=[
+                ("f", 1, 0)
+                ])
 
     fused_knl = lp.set_options(fused_knl, write_cl=True)
     evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 4350a3878469cb7ccd16ba5f0dda2d287d7f2136..3eacbaa2850b12ab1130a0f4b02ac5698bc9fab9 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -179,7 +179,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
 
     hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
           fetch_bounding_box=True, default_tag="for",
-          init_expression="0")
+          init_expression="0", store_expression="base + buffer")
 
     if opt_level == 5:
         tap_hsv = hsv
@@ -242,7 +242,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
     hsv = hsv.copy(name="horizontalStrongVolumeKernel")
 
     results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
-            do_check=False, quiet=True)
+            quiet=True)
 
     elapsed = results["elapsed_wall"]
 
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index a00fce1776dd8d0722e59ae544ce439568a66d5d..f4b91b236a16a8a30f2962d92b83780343fe801a 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -47,16 +47,16 @@ def test_tim2d(ctx_factory):
     knl = lp.make_kernel(
             "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
             [
-                "ur(a,b) := sum(o, D[a,o]*u[e,o,b])",
-                "us(a,b) := sum(o, D[b,o]*u[e,a,o])",
+                "ur(a,b) := simul_reduce(sum, o, D[a,o]*u[e,o,b])",
+                "us(a,b) := simul_reduce(sum, o, D[b,o]*u[e,a,o])",
 
                 #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",
 
                 "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)",
                 "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)",
                 "lap[e,i,j]  = "
-                "  sum(m, D[m,i]*Gux(m,j))"
-                "+ sum(m, D[m,j]*Guy(i,m))"
+                "  simul_reduce(sum, m, D[m,i]*Gux(m,j))"
+                "+ simul_reduce(sum, m, D[m,j]*Guy(i,m))"
 
             ],
             [