diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b6c1ccf8424b3989bef169c63c58b9a4344e2031..ea88059c4c0d74afa0b778e3d9755142cd419a03 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -24,9 +24,9 @@ Python 2.6 POCL:
   - pocl
   except:
   - tags
-Python 3.4 AMD CPU:
+Python 3.5 AMD CPU:
   script:
-  - export PY_EXE=python3.4
+  - export PY_EXE=python3.5
   - export PYOPENCL_TEST=amd:pu
   - export EXTRA_INSTALL="numpy mako"
   - export NO_DOCTESTS=1
@@ -34,7 +34,7 @@ Python 3.4 AMD CPU:
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
-  - python3.4
+  - python3.5
   - amd-cl-cpu
   except:
   - tags
diff --git a/build-helpers/make-linux-build-docker-inner.sh b/build-helpers/make-linux-build-docker-inner.sh
index 119609a3c1767dd1e5ce1781f27c946781d1a79b..a7f621b1ef21676898d2283d93f8a54f086e5d9d 100755
--- a/build-helpers/make-linux-build-docker-inner.sh
+++ b/build-helpers/make-linux-build-docker-inner.sh
@@ -8,7 +8,7 @@ cd /tmp/build
 
 useradd -d /home/user -m -s /bin/bash user
 
-yum install -y centos-release-SCL
+yum install -y centos-release-scl
 yum install -y git python27 python27-python-devel python27-numpy tar gcc gcc-c++ mercurial libffi-devel
 
 scl enable python27 /mnt/make-linux-build-docker-inner-part-2.sh
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 19d9ddbc72c6de90ad7cbfddc740d74a87b2d2a7..4322bf928d524bfa4b52ca9698fcfaefac93d804 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -62,7 +62,7 @@ from loopy.library.reduction import register_reduction_parser
 from loopy.transform.iname import (
         set_loop_priority,
         split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames,
-        rename_iname, link_inames, remove_unused_inames,
+        rename_iname, remove_unused_inames,
         split_reduction_inward, split_reduction_outward,
         affine_map_inames, find_unused_axis_tag,
         make_reduction_inames_unique)
@@ -159,7 +159,7 @@ __all__ = [
         "set_loop_priority",
         "split_iname", "chunk_iname", "join_inames", "tag_inames",
         "duplicate_inames",
-        "rename_iname", "link_inames", "remove_unused_inames",
+        "rename_iname", "remove_unused_inames",
         "split_reduction_inward", "split_reduction_outward",
         "affine_map_inames", "find_unused_axis_tag",
         "make_reduction_inames_unique",
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index bada80328c337cc5e756afaf05b180f2b027b1ca..b475a40f3580b3b0e3c746a0de90f33cf182aa4e 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -265,7 +265,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
             host_storage_array = np.empty(alloc_size, dtype)
             host_array = as_strided(
                     host_storage_array, shape, numpy_strides)
-            host_array[:] = host_contig_array
+            host_array[...] = host_contig_array
 
             host_contig_array = arg_desc.ref_storage_array.get()
             storage_array = cl_array.to_device(queue, host_storage_array)
diff --git a/loopy/check.py b/loopy/check.py
index 910ab24ab3f98a9b01953d30354eda077273423d..0ef1f163aaed071d5dc4d219017b1164342be651 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -59,47 +59,6 @@ def check_loop_priority_inames_known(kernel):
             raise LoopyError("unknown iname '%s' in loop priorities" % iname)
 
 
-def check_for_unused_hw_axes_in_insns(kernel):
-    # FIXME: This could be made specific to the current kernel piece.
-    group_size, local_size = kernel.get_grid_size_upper_bounds_as_exprs()
-
-    group_axes = set(ax for ax, length in enumerate(group_size))
-    local_axes = set(ax for ax, length in enumerate(local_size))
-
-    # alternative: just disregard length-1 dimensions?
-
-    from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag
-    for insn in kernel.instructions:
-        if insn.boostable:
-            continue
-
-        group_axes_used = set()
-        local_axes_used = set()
-
-        for iname in kernel.insn_inames(insn):
-            tag = kernel.iname_to_tag.get(iname)
-
-            if isinstance(tag, LocalIndexTag):
-                local_axes_used.add(tag.axis)
-            elif isinstance(tag, GroupIndexTag):
-                group_axes_used.add(tag.axis)
-            elif isinstance(tag, AutoLocalIndexTagBase):
-                raise LoopyError("auto local tag encountered")
-
-        if group_axes != group_axes_used:
-            raise LoopyError("instruction '%s' does not use all group hw axes "
-                    "(available: %s used:%s)"
-                    % (insn.id,
-                        ",".join(str(i) for i in group_axes),
-                        ",".join(str(i) for i in group_axes_used)))
-        if local_axes != local_axes_used:
-            raise LoopyError("instruction '%s' does not use all local hw axes"
-                    "(available: %s used:%s)"
-                    % (insn.id,
-                        ",".join(str(i) for i in local_axes),
-                        ",".join(str(i) for i in local_axes_used)))
-
-
 def check_for_double_use_of_hw_axes(kernel):
     from loopy.kernel.data import UniqueTag
 
@@ -122,7 +81,7 @@ def check_for_inactive_iname_access(kernel):
 
         if not expression_inames <= kernel.insn_inames(insn):
             raise LoopyError(
-                    "instructiosn '%s' references "
+                    "instruction '%s' references "
                     "inames that the instruction does not depend on"
                     % insn.id)
 
@@ -154,22 +113,13 @@ def _is_racing_iname_tag(tv, tag):
 
 
 def check_for_write_races(kernel):
-    from loopy.symbolic import DependencyMapper
     from loopy.kernel.data import ParallelTag
-    depmap = DependencyMapper(composite_leaves=False)
 
     iname_to_tag = kernel.iname_to_tag.get
     for insn in kernel.instructions:
-        for assignee_name, assignee_indices in insn.assignees_and_indices():
-            assignee_indices = depmap(assignee_indices)
-
-            def strip_var(expr):
-                from pymbolic.primitives import Variable
-                assert isinstance(expr, Variable)
-                return expr.name
-
-            assignee_indices = set(strip_var(index) for index in assignee_indices)
-
+        for assignee_name, assignee_indices in zip(
+                insn.assignee_var_names(),
+                insn.assignee_subscript_deps()):
             assignee_inames = assignee_indices & kernel.all_inames()
             if not assignee_inames <= kernel.insn_inames(insn):
                 raise LoopyError(
@@ -332,7 +282,7 @@ def check_bounds(kernel):
 
 def check_write_destinations(kernel):
     for insn in kernel.instructions:
-        for wvar, _ in insn.assignees_and_indices():
+        for wvar in insn.assignee_var_names():
             if wvar in kernel.all_inames():
                 raise LoopyError("iname '%s' may not be written" % wvar)
 
@@ -363,7 +313,6 @@ def pre_schedule_checks(kernel):
         check_for_double_use_of_hw_axes(kernel)
         check_insn_attributes(kernel)
         check_loop_priority_inames_known(kernel)
-        check_for_unused_hw_axes_in_insns(kernel)
         check_for_inactive_iname_access(kernel)
         check_for_write_races(kernel)
         check_for_data_dependent_parallel_bounds(kernel)
@@ -382,7 +331,89 @@ def pre_schedule_checks(kernel):
         raise
 
 
-# {{{ pre-code-generation checks
+# {{{ post-schedule / pre-code-generation checks
+
+def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
+    from loopy.schedule import (CallKernel, RunInstruction,
+            Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
+            get_insn_ids_for_block_at, gather_schedule_block)
+
+    if sched_index is None:
+        group_axes = set()
+        local_axes = set()
+
+        i = 0
+        loop_end_i = past_end_i = len(kernel.schedule)
+    else:
+        assert isinstance(kernel.schedule[sched_index], CallKernel)
+        _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
+        group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
+                get_insn_ids_for_block_at(kernel.schedule, sched_index))
+
+        group_axes = set(ax for ax, length in enumerate(group_size))
+        local_axes = set(ax for ax, length in enumerate(local_size))
+
+        i = sched_index + 1
+        assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel)
+        loop_end_i = past_end_i - 1
+
+    # alternative: just disregard length-1 dimensions?
+
+    from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag
+
+    while i < loop_end_i:
+        sched_item = kernel.schedule[i]
+        if isinstance(sched_item, CallKernel):
+            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i)
+
+        elif isinstance(sched_item, RunInstruction):
+            insn = kernel.id_to_insn[sched_item.insn_id]
+            i += 1
+
+            if insn.boostable:
+                continue
+
+            group_axes_used = set()
+            local_axes_used = set()
+
+            for iname in kernel.insn_inames(insn):
+                tag = kernel.iname_to_tag.get(iname)
+
+                if isinstance(tag, LocalIndexTag):
+                    local_axes_used.add(tag.axis)
+                elif isinstance(tag, GroupIndexTag):
+                    group_axes_used.add(tag.axis)
+                elif isinstance(tag, AutoLocalIndexTagBase):
+                    raise LoopyError("auto local tag encountered")
+
+            if group_axes != group_axes_used:
+                raise LoopyError("instruction '%s' does not use all group hw axes "
+                        "(available: %s used:%s)"
+                        % (insn.id,
+                            ",".join(str(i) for i in group_axes),
+                            ",".join(str(i) for i in group_axes_used)))
+            if local_axes != local_axes_used:
+                raise LoopyError("instruction '%s' does not use all local hw axes "
+                        "(available: %s used:%s)"
+                        % (insn.id,
+                            ",".join(str(i) for i in local_axes),
+                            ",".join(str(i) for i in local_axes_used)))
+
+        elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)):
+            i += 1
+            continue
+
+        else:
+            raise TypeError(
+                    "schedule item not understood: %s" % type(sched_item).__name__)
+
+    return past_end_i
+
+
+def check_for_unused_hw_axes_in_insns(kernel):
+    if kernel.schedule:
+        _check_for_unused_hw_axes_in_kernel_chunk(kernel)
+
 
 def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel):
     from loopy.kernel.data import ArrayBase, Assignment
@@ -459,6 +490,7 @@ def pre_codegen_checks(kernel):
     try:
         logger.info("pre-codegen check %s: start" % kernel.name)
 
+        check_for_unused_hw_axes_in_insns(kernel)
         check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel)
         kernel.target.pre_codegen_check(kernel)
         check_that_shapes_and_strides_are_arguments(kernel)
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 4438958e53d176535b13262a56f0b30a95d9faf7..2f266d2803f81985a79cd4793abbdb3c89bd1585 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -112,7 +112,7 @@ def generate_code_for_sched_index(codegen_state, sched_index):
 
         from loopy.codegen.result import generate_host_or_device_program
         codegen_result = generate_host_or_device_program(
-                new_codegen_state, sched_index + 1)
+                new_codegen_state, sched_index)
 
         glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                 get_insn_ids_for_block_at(kernel.schedule, sched_index))
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index db3d15184a94aa13e6a4b449a2036869e34ca566..8531b97a947f3c46b984cf7405b794bd68e2f638 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -122,7 +122,25 @@ def generate_assignment_instruction_code(codegen_state, insn):
 
     # }}}
 
-    (assignee_var_name, assignee_indices), = insn.assignees_and_indices()
+    from pymbolic.primitives import Variable, Subscript
+    from loopy.symbolic import LinearSubscript
+
+    lhs = insn.assignee
+    if isinstance(lhs, Variable):
+        assignee_var_name = lhs.name
+        assignee_indices = ()
+
+    elif isinstance(lhs, Subscript):
+        assignee_var_name = lhs.aggregate.name
+        assignee_indices = lhs.index_tuple
+
+    elif isinstance(lhs, LinearSubscript):
+        assignee_var_name = lhs.aggregate.name
+        assignee_indices = (lhs.index,)
+
+    else:
+        raise RuntimeError("invalid lvalue '%s'" % lhs)
+
     lhs_var = kernel.get_var_descriptor(assignee_var_name)
     lhs_dtype = lhs_var.dtype
 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index a9eb44f84d44015fa11ce83afe410e833bb8e214..648c3fe6f5b748dcc47de5ac972bb82ce605a9a9 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -233,17 +233,23 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     from loopy.kernel.data import (
             UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)
 
+    from loopy.schedule import get_insn_ids_for_block_at
+    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
+
     if hw_inames_left is None:
+        all_inames_by_insns = set()
+        for insn_id in insn_ids_for_block:
+            all_inames_by_insns |= kernel.insn_inames(insn_id)
+
         hw_inames_left = [iname
-                for iname in kernel.all_inames()
+                for iname in all_inames_by_insns
                 if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)]
 
     if not hw_inames_left:
         return next_func(codegen_state)
 
-    from loopy.schedule import get_insn_ids_for_block_at
     global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
-            get_insn_ids_for_block_at(kernel.schedule, schedule_index))
+            insn_ids_for_block)
 
     hw_inames_left = hw_inames_left[:]
     iname = hw_inames_left.pop()
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index ad92fcefa7db5e20cbc44d0cf643070b8f4c1832..82dbe09986b2567e9951b2446313a8ef5fde4f8c 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -241,19 +241,22 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
 
 def generate_host_or_device_program(codegen_state, schedule_index):
     ast_builder = codegen_state.ast_builder
-    temp_decls = ast_builder.get_temporary_decls(codegen_state)
+    temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index)
 
     from functools import partial
 
     from loopy.codegen.control import build_loop_nest
-    next_func = partial(build_loop_nest, schedule_index=schedule_index)
-
     if codegen_state.is_generating_device_code:
+        from loopy.schedule import CallKernel
+        assert isinstance(codegen_state.kernel.schedule[schedule_index], CallKernel)
+
         from loopy.codegen.loop import set_up_hw_parallel_loops
         codegen_result = set_up_hw_parallel_loops(
-                codegen_state, schedule_index, next_func=next_func)
+                codegen_state, schedule_index,
+                next_func=partial(build_loop_nest,
+                    schedule_index=schedule_index + 1))
     else:
-        codegen_result = next_func(codegen_state)
+        codegen_result = build_loop_nest(codegen_state, schedule_index)
 
     codegen_result = merge_codegen_results(
             codegen_state,
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index 2ea1bd40e540b24d5a08e4136ab603d6c5a422a1..c1d0e03633772bb7e3f86807f579e2b33793e6aa 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -23,6 +23,9 @@ THE SOFTWARE.
 """
 
 
+from pytools import MovedFunctionDeprecationWrapper
+
+
 # {{{ warnings
 
 class LoopyWarningBase(UserWarning):
@@ -47,7 +50,7 @@ class WriteRaceConditionWarning(LoopyWarning):
 # }}}
 
 
-def warn(kernel, id, text, type=LoopyWarning):
+def warn_with_kernel(kernel, id, text, type=LoopyWarning):
     from fnmatch import fnmatchcase
     for sw in kernel.silenced_warnings:
         if fnmatchcase(id, sw):
@@ -60,6 +63,9 @@ def warn(kernel, id, text, type=LoopyWarning):
     warn(text, type)
 
 
+warn = MovedFunctionDeprecationWrapper(warn_with_kernel)
+
+
 # {{{ errors
 
 class LoopyError(RuntimeError):
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 5ac63b56eb6347d93e65a3fa860840c37cabe146..bfd8ee284f0cf4439cb4409ba807777900be726a 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -669,9 +669,11 @@ class LoopKernel(RecordWithoutPickling):
         """Return a mapping from instruction ids to inames inside which
         they should be run.
         """
+        result = {}
+        for insn in self.instructions:
+            result[insn.id] = insn.forced_iname_deps
 
-        from loopy.kernel.tools import find_all_insn_inames
-        return find_all_insn_inames(self)
+        return result
 
     @memoize_method
     def all_referenced_inames(self):
@@ -681,11 +683,9 @@ class LoopKernel(RecordWithoutPickling):
         return result
 
     def insn_inames(self, insn):
-        from loopy.kernel.data import InstructionBase
-        if isinstance(insn, InstructionBase):
-            return self.all_insn_inames()[insn.id]
-        else:
-            return self.all_insn_inames()[insn]
+        if isinstance(insn, str):
+            insn = self.id_to_insn[insn]
+        return insn.forced_iname_deps
 
     @memoize_method
     def iname_to_insns(self):
@@ -790,7 +790,7 @@ class LoopKernel(RecordWithoutPickling):
         result = {}
 
         for insn in self.instructions:
-            for var_name, _ in insn.assignees_and_indices():
+            for var_name in insn.assignee_var_names():
                 result.setdefault(var_name, set()).add(insn.id)
 
         return result
@@ -807,7 +807,7 @@ class LoopKernel(RecordWithoutPickling):
         return frozenset(
                 var_name
                 for insn in self.instructions
-                for var_name, _ in insn.assignees_and_indices())
+                for var_name in insn.assignee_var_names())
 
     @memoize_method
     def get_temporary_to_base_storage_map(self):
@@ -918,9 +918,7 @@ class LoopKernel(RecordWithoutPickling):
 
         all_inames_by_insns = set()
         for insn_id in insn_ids:
-            insn = self.id_to_insn[insn_id]
-
-            all_inames_by_insns |= self.insn_inames(insn)
+            all_inames_by_insns |= self.insn_inames(insn_id)
 
         if not all_inames_by_insns <= self.all_inames():
             raise RuntimeError("some inames collected from instructions (%s) "
@@ -935,7 +933,7 @@ class LoopKernel(RecordWithoutPickling):
                 GroupIndexTag, LocalIndexTag,
                 AutoLocalIndexTagBase)
 
-        for iname in self.all_inames():
+        for iname in all_inames_by_insns:
             tag = self.iname_to_tag.get(iname)
 
             if isinstance(tag, GroupIndexTag):
@@ -1138,6 +1136,8 @@ class LoopKernel(RecordWithoutPickling):
                 options.append("groups=%s" % ":".join(insn.groups))
             if insn.conflicts_with_groups:
                 options.append("conflicts=%s" % ":".join(insn.conflicts_with_groups))
+            if insn.no_sync_with:
+                options.append("no_sync_with=%s" % ":".join(insn.no_sync_with))
 
             if len(loop_list) > loop_list_width:
                 lines.append("[%s]" % loop_list)
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index e25320729df3d5b4cceda46c9c406eb80e13ebdf..0040a6d9543952cb3615f10e86ea7fb32fd5cdab 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -146,18 +146,165 @@ def expand_defines_in_expr(expr, defines):
 # }}}
 
 
-# {{{ parse instructions
+# {{{ instruction options
+
+def get_default_insn_options_dict():
+    return {
+        "depends_on": None,
+        "depends_on_is_final": False,
+        "no_sync_with": None,
+        "groups": None,
+        "conflicts_with_groups": None,
+        "insn_id": None,
+        "inames_to_dup": [],
+        "priority": 0,
+        "forced_iname_deps_is_final": False,
+        "forced_iname_deps": frozenset(),
+        "predicates": frozenset(),
+        "tags": (),
+        "atomicity": (),
+        }
+
+
+def parse_insn_options(opt_dict, options_str, assignee_names=None):
+    if options_str is None:
+        return opt_dict
+
+    result = opt_dict.copy()
+
+    for option in options_str.split(","):
+        option = option.strip()
+        if not option:
+            raise RuntimeError("empty option supplied")
+
+        equal_idx = option.find("=")
+        if equal_idx == -1:
+            opt_key = option
+            opt_value = None
+        else:
+            opt_key = option[:equal_idx].strip()
+            opt_value = option[equal_idx+1:].strip()
+
+        if opt_key == "id" and opt_value is not None:
+            result["insn_id"] = intern(opt_value)
+        elif opt_key == "id_prefix" and opt_value is not None:
+            result["insn_id"] = UniqueName(opt_value)
+        elif opt_key == "priority" and opt_value is not None:
+            result["priority"] = int(opt_value)
+        elif opt_key == "dup" and opt_value is not None:
+            for value in opt_value.split(":"):
+                arrow_idx = value.find("->")
+                if arrow_idx >= 0:
+                    result["inames_to_dup"] = result["inames_to_dup"] + [
+                            (value[:arrow_idx], value[arrow_idx+2:])]
+                else:
+                    result["inames_to_dup"] = result["inames_to_dup"] + [(value, None)]
+
+        elif opt_key == "dep" and opt_value is not None:
+            if opt_value.startswith("*"):
+                result["depends_on_is_final"] = True
+                opt_value = (opt_value[1:]).strip()
+
+            result["depends_on"] = frozenset(
+                    intern(dep.strip()) for dep in opt_value.split(":")
+                    if dep.strip())
+
+        elif opt_key == "nosync" and opt_value is not None:
+            result["no_sync_with"] = frozenset(
+                    intern(dep.strip()) for dep in opt_value.split(":")
+                    if dep.strip())
+
+        elif opt_key == "groups" and opt_value is not None:
+            result["groups"] = frozenset(
+                    intern(grp.strip()) for grp in opt_value.split(":")
+                    if grp.strip())
+
+        elif opt_key == "conflicts" and opt_value is not None:
+            result["conflicts_with_groups"] = frozenset(
+                    intern(grp.strip()) for grp in opt_value.split(":")
+                    if grp.strip())
+
+        elif opt_key == "inames" and opt_value is not None:
+            if opt_value.startswith("+"):
+                result["forced_iname_deps_is_final"] = False
+                opt_value = (opt_value[1:]).strip()
+            else:
+                result["forced_iname_deps_is_final"] = True
+
+            result["forced_iname_deps"] = intern_frozenset_of_ids(
+                    opt_value.split(":"))
+
+        elif opt_key == "if" and opt_value is not None:
+            result["predicates"] = intern_frozenset_of_ids(opt_value.split(":"))
+
+        elif opt_key == "tags" and opt_value is not None:
+            result["tags"] = tuple(
+                    tag.strip() for tag in opt_value.split(":")
+                    if tag.strip())
+
+        elif opt_key == "atomic":
+            if assignee_names is None:
+                raise LoopyError("'atomic' option may only be specified "
+                        "on an actual instruction")
+            if len(assignee_names) != 1:
+                raise LoopyError("atomic operations with more than one "
+                        "left-hand side not supported")
+            assignee_name, = assignee_names
+
+            import loopy as lp
+            if opt_value is None:
+                result["atomicity"] = result["atomicity"] + (
+                        lp.AtomicUpdate(assignee_name),)
+            else:
+                for v in opt_value.split(":"):
+                    if v == "init":
+                        result["atomicity"] = result["atomicity"] + (
+                                lp.AtomicInit(assignee_name),)
+                    else:
+                        raise LoopyError("atomicity directive not "
+                                "understood: %s"
+                                % v)
+            del assignee_name
+
+        else:
+            raise ValueError(
+                    "unrecognized instruction option '%s' "
+                    "(maybe a missing/extraneous =value?)"
+                    % opt_key)
+
+    return result
+
+# }}}
+
+
+# {{{ parse one instruction
+
+WITH_OPTIONS_RE = re.compile(
+        "^"
+        r"\s*with\s*"
+        r"\{(?P<options>.+)\}"
+        r"\s*$")
+
+FOR_RE = re.compile(
+        "^"
+        r"\s*for\s*"
+        r"(?P<inames>[ ,\w]+)"
+        r"\s*$")
 
 INSN_RE = re.compile(
-        "\s*(?P<lhs>.+?)\s*(?<!\:)=\s*(?P<rhs>.+?)"
-        "\s*?(?:\{(?P<options>.+)\}\s*)?$"
-        )
+        "^"
+        r"\s*"
+        "(?P<lhs>.+?)"
+        r"\s*(?<!\:)=\s*"
+        "(?P<rhs>.+?)"
+        r"\s*?"
+        r"(?:\{(?P<options>.+)\}\s*)?$")
+
 SUBST_RE = re.compile(
-        r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$"
-        )
+        r"^\s*(?P<lhs>.+?)\s*:=\s*(?P<rhs>.+)\s*$")
 
 
-def parse_insn(insn):
+def parse_insn(groups, insn_options):
     """
     :return: a tuple ``(insn, inames_to_dup)``, where insn is a
         :class:`Assignment`, a :class:`CallInstruction`,
@@ -167,18 +314,6 @@ def parse_insn(insn):
 
     import loopy as lp
 
-    insn_match = INSN_RE.match(insn)
-    subst_match = SUBST_RE.match(insn)
-    if insn_match is not None and subst_match is not None:
-        raise RuntimeError("instruction parse error: %s" % insn)
-
-    if insn_match is not None:
-        groups = insn_match.groupdict()
-    elif subst_match is not None:
-        groups = subst_match.groupdict()
-    else:
-        raise RuntimeError("instruction parse error at '%s'" % insn)
-
     from loopy.symbolic import parse
     try:
         lhs = parse(groups["lhs"])
@@ -194,38 +329,9 @@ def parse_insn(insn):
                 "the following error occurred:" % groups["rhs"])
         raise
 
-    from pymbolic.primitives import Variable, Call, Subscript
+    from pymbolic.primitives import Variable, Subscript
     from loopy.symbolic import TypeAnnotation
 
-    # {{{ deal with subst rules
-
-    if subst_match is not None:
-        assert insn_match is None
-        if isinstance(lhs, Variable):
-            subst_name = lhs.name
-            arg_names = []
-        elif isinstance(lhs, Call):
-            if not isinstance(lhs.function, Variable):
-                raise RuntimeError("Invalid substitution rule left-hand side")
-            subst_name = lhs.function.name
-            arg_names = []
-
-            for i, arg in enumerate(lhs.parameters):
-                if not isinstance(arg, Variable):
-                    raise RuntimeError("Invalid substitution rule "
-                                    "left-hand side: %s--arg number %d "
-                                    "is not a variable" % (lhs, i))
-                arg_names.append(arg.name)
-        else:
-            raise RuntimeError("Invalid substitution rule left-hand side")
-
-        return SubstitutionRule(
-                name=subst_name,
-                arguments=tuple(arg_names),
-                expression=rhs), []
-
-    # }}}
-
     if not isinstance(lhs, tuple):
         lhs = (lhs,)
 
@@ -244,9 +350,10 @@ def parse_insn(insn):
         else:
             temp_var_types.append(None)
 
+        from loopy.symbolic import LinearSubscript
         if isinstance(lhs_i, Variable):
             assignee_names.append(lhs_i.name)
-        elif isinstance(lhs_i, Subscript):
+        elif isinstance(lhs_i, (Subscript, LinearSubscript)):
             assignee_names.append(lhs_i.aggregate.name)
         else:
             raise LoopyError("left hand side of assignment '%s' must "
@@ -258,170 +365,179 @@ def parse_insn(insn):
     temp_var_types = tuple(temp_var_types)
     del new_lhs
 
-    if insn_match is not None:
-        depends_on = None
-        depends_on_is_final = False
-        no_sync_with = None
-        insn_groups = None
-        conflicts_with_groups = None
-        insn_id = None
-        inames_to_dup = []
-        priority = 0
-        forced_iname_deps_is_final = False
-        forced_iname_deps = frozenset()
-        predicates = frozenset()
-        tags = ()
-        atomicity = ()
-
-        if groups["options"] is not None:
-            for option in groups["options"].split(","):
-                option = option.strip()
-                if not option:
-                    raise RuntimeError("empty option supplied")
-
-                equal_idx = option.find("=")
-                if equal_idx == -1:
-                    opt_key = option
-                    opt_value = None
-                else:
-                    opt_key = option[:equal_idx].strip()
-                    opt_value = option[equal_idx+1:].strip()
-
-                if opt_key == "id" and opt_value is not None:
-                    insn_id = intern(opt_value)
-                elif opt_key == "id_prefix" and opt_value is not None:
-                    insn_id = UniqueName(opt_value)
-                elif opt_key == "priority" and opt_value is not None:
-                    priority = int(opt_value)
-                elif opt_key == "dup" and opt_value is not None:
-                    for value in opt_value.split(":"):
-                        arrow_idx = value.find("->")
-                        if arrow_idx >= 0:
-                            inames_to_dup.append(
-                                    (value[:arrow_idx], value[arrow_idx+2:]))
-                        else:
-                            inames_to_dup.append((value, None))
-
-                elif opt_key == "dep" and opt_value is not None:
-                    if opt_value.startswith("*"):
-                        depends_on_is_final = True
-                        opt_value = (opt_value[1:]).strip()
-
-                    depends_on = frozenset(
-                            intern(dep.strip()) for dep in opt_value.split(":")
-                            if dep.strip())
-
-                elif opt_key == "nosync" and opt_value is not None:
-                    no_sync_with = frozenset(
-                            intern(dep.strip()) for dep in opt_value.split(":")
-                            if dep.strip())
-
-                elif opt_key == "groups" and opt_value is not None:
-                    insn_groups = frozenset(
-                            intern(grp.strip()) for grp in opt_value.split(":")
-                            if grp.strip())
-
-                elif opt_key == "conflicts" and opt_value is not None:
-                    conflicts_with_groups = frozenset(
-                            intern(grp.strip()) for grp in opt_value.split(":")
-                            if grp.strip())
-
-                elif opt_key == "inames" and opt_value is not None:
-                    if opt_value.startswith("+"):
-                        forced_iname_deps_is_final = False
-                        opt_value = (opt_value[1:]).strip()
-                    else:
-                        forced_iname_deps_is_final = True
+    insn_options = parse_insn_options(
+            insn_options,
+            groups["options"],
+            assignee_names=assignee_names)
 
-                    forced_iname_deps = intern_frozenset_of_ids(opt_value.split(":"))
+    insn_id = insn_options.pop("insn_id", None)
+    inames_to_dup = insn_options.pop("inames_to_dup", [])
 
-                elif opt_key == "if" and opt_value is not None:
-                    predicates = intern_frozenset_of_ids(opt_value.split(":"))
+    kwargs = dict(
+                id=(
+                    intern(insn_id)
+                    if isinstance(insn_id, str)
+                    else insn_id),
+                **insn_options)
 
-                elif opt_key == "tags" and opt_value is not None:
-                    tags = tuple(
-                            tag.strip() for tag in opt_value.split(":")
-                            if tag.strip())
+    from loopy.kernel.data import make_assignment
+    return make_assignment(
+            lhs, rhs, temp_var_types, **kwargs
+            ), inames_to_dup
 
-                elif opt_key == "atomic":
-                    if len(assignee_names) != 1:
-                        raise LoopyError("atomic operations with more than one "
-                                "left-hand side not supported")
-                    assignee_name, = assignee_names
+# }}}
 
-                    if opt_value is None:
-                        atomicity = atomicity + (
-                                lp.AtomicUpdate(assignee_name),)
-                    else:
-                        for v in opt_value.split(":"):
-                            if v == "init":
-                                atomicity = atomicity + (
-                                        lp.AtomicInit(assignee_name),)
-                            else:
-                                raise LoopyError("atomicity directive not "
-                                        "understood: %s"
-                                        % v)
-                    del assignee_name
 
-                else:
-                    raise ValueError(
-                            "unrecognized instruction option '%s' "
-                            "(maybe a missing/extraneous =value?)"
-                            % opt_key)
-
-        kwargs = dict(
-                    id=(
-                        intern(insn_id)
-                        if isinstance(insn_id, str)
-                        else insn_id),
-                    depends_on=depends_on,
-                    depends_on_is_final=depends_on_is_final,
-                    no_sync_with=no_sync_with,
-                    groups=insn_groups,
-                    conflicts_with_groups=conflicts_with_groups,
-                    forced_iname_deps_is_final=forced_iname_deps_is_final,
-                    forced_iname_deps=forced_iname_deps,
-                    priority=priority,
-                    predicates=predicates,
-                    tags=tags,
-                    atomicity=atomicity)
-
-        from loopy.kernel.data import make_assignment
-        return make_assignment(
-                lhs, rhs, temp_var_types, **kwargs
-                ), inames_to_dup
-
-
-def parse_if_necessary(insn, defines):
-    if isinstance(insn, InstructionBase):
-        yield insn.copy(
-                id=intern(insn.id) if isinstance(insn.id, str) else insn.id,
-                depends_on=frozenset(intern(dep) for dep in insn.depends_on),
-                groups=frozenset(intern(grp) for grp in insn.groups),
-                conflicts_with_groups=frozenset(
-                    intern(grp) for grp in insn.conflicts_with_groups),
-                forced_iname_deps=frozenset(
-                    intern(iname) for iname in insn.forced_iname_deps),
-                predicates=frozenset(
-                    intern(pred) for pred in insn.predicates),
-                ), []
-        return
-    elif not isinstance(insn, str):
-        raise TypeError("Instructions must be either an Instruction "
-                "instance or a parseable string. got '%s' instead."
-                % type(insn))
-
-    for insn in insn.split("\n"):
-        comment_start = insn.find("#")
-        if comment_start >= 0:
-            insn = insn[:comment_start]
-
-        insn = insn.strip()
-        if not insn:
+# {{{ parse_subst_rule
+
+def parse_subst_rule(groups):
+    from loopy.symbolic import parse
+    try:
+        lhs = parse(groups["lhs"])
+    except:
+        print("While parsing left hand side '%s', "
+                "the following error occurred:" % groups["lhs"])
+        raise
+
+    try:
+        rhs = parse(groups["rhs"])
+    except:
+        print("While parsing right hand side '%s', "
+                "the following error occurred:" % groups["rhs"])
+        raise
+
+    from pymbolic.primitives import Variable, Call
+    if isinstance(lhs, Variable):
+        subst_name = lhs.name
+        arg_names = []
+    elif isinstance(lhs, Call):
+        if not isinstance(lhs.function, Variable):
+            raise RuntimeError("Invalid substitution rule left-hand side")
+        subst_name = lhs.function.name
+        arg_names = []
+
+        for i, arg in enumerate(lhs.parameters):
+            if not isinstance(arg, Variable):
+                raise RuntimeError("Invalid substitution rule "
+                                "left-hand side: %s--arg number %d "
+                                "is not a variable" % (lhs, i))
+            arg_names.append(arg.name)
+    else:
+        raise RuntimeError("Invalid substitution rule left-hand side")
+
+    return SubstitutionRule(
+            name=subst_name,
+            arguments=tuple(arg_names),
+            expression=rhs)
+
+# }}}
+
+
+# {{{ parse_instructions
+
+def parse_instructions(instructions, defines):
+    if isinstance(instructions, str):
+        instructions = [instructions]
+
+    parsed_instructions = []
+
+    # {{{ pass 1: interning, comments, whitespace, defines
+
+    for insn in instructions:
+        if isinstance(insn, InstructionBase):
+            parsed_instructions.append(
+                    insn.copy(
+                        id=intern(insn.id) if isinstance(insn.id, str) else insn.id,
+                        depends_on=frozenset(intern(dep) for dep in insn.depends_on),
+                        groups=frozenset(intern(grp) for grp in insn.groups),
+                        conflicts_with_groups=frozenset(
+                            intern(grp) for grp in insn.conflicts_with_groups),
+                        forced_iname_deps=frozenset(
+                            intern(iname) for iname in insn.forced_iname_deps),
+                        predicates=frozenset(
+                            intern(pred) for pred in insn.predicates),
+                        ))
+            continue
+
+        elif not isinstance(insn, str):
+            raise TypeError("Instructions must be either an Instruction "
+                    "instance or a parseable string. got '%s' instead."
+                    % type(insn))
+
+        for insn in insn.split("\n"):
+            comment_start = insn.find("#")
+            if comment_start >= 0:
+                insn = insn[:comment_start]
+
+            insn = insn.strip()
+            if not insn:
+                continue
+
+            for sub_insn in expand_defines(insn, defines, single_valued=False):
+                parsed_instructions.append(sub_insn)
+
+    # }}}
+
+    instructions = parsed_instructions
+    parsed_instructions = []
+    substitutions = {}
+    inames_to_dup = []  # one for each parsed_instruction
+
+    # {{{ pass 2: parsing
+
+    insn_options_stack = [get_default_insn_options_dict()]
+
+    for insn in instructions:
+        if isinstance(insn, InstructionBase):
+            parsed_instructions.append(insn)
+            inames_to_dup.append([])
+            continue
+
+        with_options_match = WITH_OPTIONS_RE.match(insn)
+        if with_options_match is not None:
+            insn_options_stack.append(
+                    parse_insn_options(
+                        insn_options_stack[-1],
+                        with_options_match.group("options")))
+            continue
+
+        for_match = FOR_RE.match(insn)
+        if for_match is not None:
+            options = insn_options_stack[-1].copy()
+            options["forced_iname_deps"] = (
+                    options.get("forced_iname_deps", frozenset())
+                    | frozenset(
+                        iname.strip()
+                        for iname in for_match.group("inames").split(",")))
+            options["forced_iname_deps_is_final"] = True
+
+            insn_options_stack.append(options)
+            del options
+            continue
+
+        if insn == "end":
+            insn_options_stack.pop()
+            continue
+
+        subst_match = SUBST_RE.match(insn)
+        if subst_match is not None:
+            subst = parse_subst_rule(subst_match.groupdict())
+            substitutions[subst.name] = subst
             continue
 
-        for sub_insn in expand_defines(insn, defines, single_valued=False):
-            yield parse_insn(sub_insn)
+        insn_match = INSN_RE.match(insn)
+        if insn_match is not None:
+            insn, insn_inames_to_dup = parse_insn(
+                    insn_match.groupdict(), insn_options_stack[-1])
+            parsed_instructions.append(insn)
+            inames_to_dup.append(insn_inames_to_dup)
+            continue
+
+        raise RuntimeError("instruction parse error: %s" % insn)
+
+    # }}}
+
+    return parsed_instructions, inames_to_dup, substitutions
 
 # }}}
 
@@ -551,7 +667,7 @@ class ArgumentGuesser:
         from loopy.symbolic import get_dependencies
         for insn in instructions:
             if isinstance(insn, MultiAssignmentBase):
-                for assignee_var_name, _ in insn.assignees_and_indices():
+                for assignee_var_name in insn.assignee_var_names():
                     self.all_written_names.add(assignee_var_name)
                     self.all_names.update(get_dependencies(
                         self.submap(insn.assignees)))
@@ -619,8 +735,8 @@ class ArgumentGuesser:
 
         for insn in self.instructions:
             if isinstance(insn, MultiAssignmentBase):
-                for (assignee_var_name, _), temp_var_type in zip(
-                        insn.assignees_and_indices(),
+                for assignee_var_name, temp_var_type in zip(
+                        insn.assignee_var_names(),
                         insn.temp_var_types):
                     if temp_var_type is not None:
                         temp_var_names.add(assignee_var_name)
@@ -655,33 +771,6 @@ class ArgumentGuesser:
 # }}}
 
 
-# {{{ tag reduction inames as sequential
-
-def tag_reduction_inames_as_sequential(knl):
-    result = set()
-
-    for insn in knl.instructions:
-        result.update(insn.reduction_inames())
-
-    from loopy.kernel.data import ParallelTag, ForceSequentialTag
-
-    new_iname_to_tag = {}
-    for iname in result:
-        tag = knl.iname_to_tag.get(iname)
-        if tag is not None and isinstance(tag, ParallelTag):
-            raise RuntimeError("inconsistency detected: "
-                    "reduction iname '%s' has "
-                    "a parallel tag" % iname)
-
-        if tag is None:
-            new_iname_to_tag[iname] = ForceSequentialTag()
-
-    from loopy import tag_inames
-    return tag_inames(knl, new_iname_to_tag)
-
-# }}}
-
-
 # {{{ sanity checking
 
 def check_for_duplicate_names(knl):
@@ -740,7 +829,7 @@ def check_written_variable_names(knl):
             | set(six.iterkeys(knl.temporary_variables)))
 
     for insn in knl.instructions:
-        for var_name, _ in insn.assignees_and_indices():
+        for var_name in insn.assignee_var_names():
             if var_name not in admissible_vars:
                 raise RuntimeError("variable '%s' not declared or not "
                         "allowed for writing" % var_name)
@@ -837,8 +926,8 @@ def create_temporaries(knl, default_order):
 
     for insn in knl.instructions:
         if isinstance(insn, MultiAssignmentBase):
-            for (assignee_name, _), temp_var_type in zip(
-                    insn.assignees_and_indices(),
+            for assignee_name, temp_var_type in zip(
+                    insn.assignee_var_names(),
                     insn.temp_var_types):
 
                 if temp_var_type is None:
@@ -882,7 +971,6 @@ def determine_shapes_of_temporaries(knl):
     new_temp_vars = knl.temporary_variables.copy()
 
     from loopy.symbolic import AccessRangeMapper
-    from pymbolic import var
     import loopy as lp
 
     new_temp_vars = {}
@@ -890,10 +978,8 @@ def determine_shapes_of_temporaries(knl):
         if tv.shape is lp.auto or tv.base_indices is lp.auto:
             armap = AccessRangeMapper(knl, tv.name)
             for insn in knl.instructions:
-                for assignee_name, assignee_index in insn.assignees_and_indices():
-                    if assignee_index:
-                        armap(var(assignee_name).index(assignee_index),
-                                knl.insn_inames(insn))
+                for assignee in insn.assignees:
+                    armap(assignee, knl.insn_inames(insn))
 
             if armap.access_range is not None:
                 base_indices, shape = list(zip(*[
@@ -976,7 +1062,6 @@ def guess_arg_shape_if_requested(kernel, default_order):
                 from traceback import print_exc
                 print_exc()
 
-                from loopy.diagnostic import LoopyError
                 raise LoopyError(
                         "Failed to (automatically, as requested) find "
                         "shape/strides for argument '%s'. "
@@ -986,6 +1071,14 @@ def guess_arg_shape_if_requested(kernel, default_order):
 
             if armap.access_range is None:
                 if armap.bad_subscripts:
+                    from loopy.symbolic import LinearSubscript
+                    if any(isinstance(sub, LinearSubscript)
+                            for sub in armap.bad_subscripts):
+                        raise LoopyError("cannot determine access range for '%s': "
+                                "linear subscript(s) in '%s'"
+                                % (arg.name, ", ".join(
+                                        str(i) for i in armap.bad_subscripts)))
+
                     n_axes_in_subscripts = set(
                             len(sub.index_tuple) for sub in armap.bad_subscripts)
 
@@ -999,7 +1092,7 @@ def guess_arg_shape_if_requested(kernel, default_order):
                         # Leave shape undetermined--we can live with that for 1D.
                         shape = (None,)
                     else:
-                        raise RuntimeError("cannot determine access range for '%s': "
+                        raise LoopyError("cannot determine access range for '%s': "
                                 "undetermined index in subscript(s) '%s'"
                                 % (arg.name, ", ".join(
                                         str(i) for i in armap.bad_subscripts)))
@@ -1106,6 +1199,19 @@ def resolve_wildcard_deps(knl):
 # }}}
 
 
+# {{{ add inferred iname deps
+
+def add_inferred_inames(knl):
+    from loopy.kernel.tools import find_all_insn_inames
+    insn_inames = find_all_insn_inames(knl)
+
+    return knl.copy(instructions=[
+            insn.copy(forced_iname_deps=insn_inames[insn.id])
+            for insn in knl.instructions])
+
+# }}}
+
+
 # {{{ kernel creation top-level
 
 def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
@@ -1158,14 +1264,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     :arg preamble_generators: a list of functions of signature
         (seen_dtypes, seen_functions) where seen_functions is a set of
         (name, c_name, arg_dtypes), generating extra entries for *preambles*.
-    :arg defines: a dictionary of replacements to be made in instructions given
-        as strings before parsing. A macro instance intended to be replaced
-        should look like "MACRO" in the instruction code. The expansion given
-        in this parameter is allowed to be a list. In this case, instructions
-        are generated for *each* combination of macro values.
-
-        These defines may also be used in the domain and in argument shapes and
-        strides. They are expanded only upon kernel creation.
     :arg default_order: "C" (default) or "F"
     :arg default_offset: 0 or :class:`loopy.auto`. The default value of
         *offset* in :attr:`GlobalArg` for guessed arguments.
@@ -1199,6 +1297,12 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     flags = kwargs.pop("flags", None)
     target = kwargs.pop("target", None)
 
+    if defines:
+        from warnings import warn
+        warn("'defines' argument to make_kernel is deprecated. "
+                "Use lp.fix_parameters instead",
+                DeprecationWarning, stacklevel=2)
+
     if target is None:
         from loopy import _DEFAULT_TARGET
         target = _DEFAULT_TARGET
@@ -1256,34 +1360,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
 
     # }}}
 
-    # {{{ instruction/subst parsing
-
-    parsed_instructions = []
-    kwargs["substitutions"] = substitutions = {}
-    inames_to_dup = []
-
-    if isinstance(instructions, str):
-        instructions = [instructions]
-
-    for insn in instructions:
-        for new_insn, insn_inames_to_dup in parse_if_necessary(insn, defines):
-            if isinstance(new_insn, InstructionBase):
-                parsed_instructions.append(new_insn)
-
-                # Need to maintain 1-to-1 correspondence to instructions
-                inames_to_dup.append(insn_inames_to_dup)
-
-            elif isinstance(new_insn, SubstitutionRule):
-                substitutions[new_insn.name] = new_insn
-
-                assert not insn_inames_to_dup
-            else:
-                raise RuntimeError("unexpected type in instruction parsing")
-
-    instructions = parsed_instructions
-    del parsed_instructions
-
-    # }}}
+    instructions, inames_to_dup, substitutions = \
+            parse_instructions(instructions, defines)
 
     # {{{ find/create isl_context
 
@@ -1307,6 +1385,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     kernel_args = arg_guesser.convert_names_to_full_args(kernel_args)
     kernel_args = arg_guesser.guess_kernel_args_if_requested(kernel_args)
 
+    kwargs["substitutions"] = substitutions
+
     from loopy.kernel import LoopKernel
     knl = LoopKernel(domains, instructions, kernel_args,
             temporary_variables=temporary_variables,
@@ -1324,8 +1404,21 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     check_for_nonexistent_iname_deps(knl)
 
     knl = create_temporaries(knl, default_order)
+    # -------------------------------------------------------------------------
+    # Ordering dependency:
+    # -------------------------------------------------------------------------
+    # Must create temporaries before inferring inames (because those temporaries
+    # mediate dependencies that are then used for iname propagation.)
+    # -------------------------------------------------------------------------
+    # NOTE: add_inferred_inames will be phased out and throws warnings if it
+    # does something.
+    knl = add_inferred_inames(knl)
+    # -------------------------------------------------------------------------
+    # Ordering dependency:
+    # -------------------------------------------------------------------------
+    # Must infer inames before determining shapes.
+    # -------------------------------------------------------------------------
     knl = determine_shapes_of_temporaries(knl)
-    knl = tag_reduction_inames_as_sequential(knl)
     knl = expand_defines_in_shapes(knl, defines)
     knl = guess_arg_shape_if_requested(knl, default_order)
     knl = apply_default_order_to_args(knl, default_order)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 26f4f64a60e90aed59cedcbaa0e1cd2b117f6af1..5b6ff31cbcbdb404b7e812c6ae1f25a1a9323b88 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -506,7 +506,7 @@ class InstructionBase(Record):
 
         a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances
         with which no barrier synchronization is necessary, even given the existence
-        of a dependency chain and apparently conflicting writes
+        of a dependency chain and apparently conflicting accesses
 
     .. rubric:: Conditionals
 
@@ -542,7 +542,8 @@ class InstructionBase(Record):
 
         A :class:`set` of inames into which the instruction
         may need to be boosted, as a heuristic help for the scheduler.
-        Also allowed to be *None*.
+        Also allowed to be *None* to indicate that this hasn't been
+        decided yet.
 
     .. rubric:: Tagging
 
@@ -552,12 +553,11 @@ class InstructionBase(Record):
         of instructions.
 
     .. automethod:: __init__
-    .. automethod:: assignees_and_indices
-    .. automethod:: assignee_name
+    .. automethod:: assignee_var_names
+    .. automethod:: assignee_subscript_deps
     .. automethod:: with_transformed_expressions
     .. automethod:: write_dependency_names
     .. automethod:: dependency_names
-    .. automethod:: assignee_var_names
     .. automethod:: copy
     """
 
@@ -571,7 +571,8 @@ class InstructionBase(Record):
     def __init__(self, id, depends_on, depends_on_is_final,
             groups, conflicts_with_groups,
             no_sync_with,
-            forced_iname_deps_is_final, forced_iname_deps, priority,
+            forced_iname_deps_is_final, forced_iname_deps,
+            priority,
             boostable, boostable_into, predicates, tags,
             insn_deps=None, insn_deps_is_final=None):
 
@@ -658,10 +659,15 @@ class InstructionBase(Record):
     def reduction_inames(self):
         raise NotImplementedError
 
-    def assignees_and_indices(self):
-        """Return a list of tuples *(assignee_var_name, subscript)*
-        where assignee_var_name is a string representing an assigned
-        variable name and subscript is a :class:`tuple`.
+    def assignee_var_names(self):
+        """Return a tuple of assignee variable names, one
+        for each quantity being assigned to.
+        """
+        raise NotImplementedError
+
+    def assignee_subscript_deps(self):
+        """Return a tuple of sets of variable names referred to in the subscripts
+        of the quantities being assigned to, one for each assignee.
         """
         raise NotImplementedError
 
@@ -676,20 +682,20 @@ class InstructionBase(Record):
 
     @property
     def assignee_name(self):
-        """A convenience wrapper around :meth:`assignees_and_indices`
+        """A convenience wrapper around :meth:`assignee_var_names`
         that returns the the name of the variable being assigned.
         If more than one variable is being modified in the instruction,
         :raise:`ValueError` is raised.
         """
 
-        aai = self.assignees_and_indices()
+        names = self.assignee_var_names()
 
-        if len(aai) != 1:
+        if len(names) != 1:
             raise ValueError("expected exactly one assignment in instruction "
                     "on which assignee_name is being called, found %d"
-                    % len(aai))
+                    % len(names))
 
-        (name, _), = aai
+        name, = names
         return name
 
     @memoize_method
@@ -700,19 +706,15 @@ class InstructionBase(Record):
         """
 
         result = set()
-        for assignee, indices in self.assignees_and_indices():
-            result.add(assignee)
+        for assignee in self.assignees:
             from loopy.symbolic import get_dependencies
-            result.update(get_dependencies(indices))
+            result.update(get_dependencies(assignee))
 
         return frozenset(result)
 
     def dependency_names(self):
         return self.read_dependency_names() | self.write_dependency_names()
 
-    def assignee_var_names(self):
-        return (var_name for var_name, _ in self.assignees_and_indices())
-
     def get_str_options(self):
         result = []
 
@@ -730,6 +732,8 @@ class InstructionBase(Record):
 
         if self.depends_on:
             result.append("deps="+":".join(self.depends_on))
+        if self.no_sync_with:
+            result.append("nosync="+":".join(self.no_sync_with))
         if self.groups:
             result.append("groups=%s" % ":".join(self.groups))
         if self.conflicts_with_groups:
@@ -738,7 +742,7 @@ class InstructionBase(Record):
             result.append("priority=%d" % self.priority)
         if self.tags:
             result.append("tags=%s" % ":".join(self.tags))
-        if hasattr(self, "atomicity"):
+        if hasattr(self, "atomicity") and self.atomicity:
             result.append("atomic=%s" % ":".join(str(a) for a in self.atomicity))
 
         return result
@@ -806,15 +810,38 @@ class InstructionBase(Record):
 # }}}
 
 
-def _get_assignee_and_index(expr):
+def _get_assignee_var_name(expr):
     from pymbolic.primitives import Variable, Subscript
+    from loopy.symbolic import LinearSubscript
+
     if isinstance(expr, Variable):
-        return (expr.name, ())
+        return expr.name
+
     elif isinstance(expr, Subscript):
         agg = expr.aggregate
         assert isinstance(agg, Variable)
 
-        return (agg.name, expr.index_tuple)
+        return agg.name
+
+    elif isinstance(expr, LinearSubscript):
+        agg = expr.aggregate
+        assert isinstance(agg, Variable)
+
+        return agg.name
+    else:
+        raise RuntimeError("invalid lvalue '%s'" % expr)
+
+
+def _get_assignee_subscript_deps(expr):
+    from pymbolic.primitives import Variable, Subscript
+    from loopy.symbolic import LinearSubscript, get_dependencies
+
+    if isinstance(expr, Variable):
+        return frozenset()
+    elif isinstance(expr, Subscript):
+        return get_dependencies(expr.index)
+    elif isinstance(expr, LinearSubscript):
+        return get_dependencies(expr.index)
     else:
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
@@ -976,8 +1003,8 @@ class MultiAssignmentBase(InstructionBase):
     def read_dependency_names(self):
         from loopy.symbolic import get_dependencies
         result = get_dependencies(self.expression)
-        for _, subscript in self.assignees_and_indices():
-            result = result | get_dependencies(subscript)
+        for subscript_deps in self.assignee_subscript_deps():
+            result = result | subscript_deps
 
         processed_predicates = frozenset(
                 pred.lstrip("!") for pred in self.predicates)
@@ -1114,8 +1141,11 @@ class Assignment(MultiAssignmentBase):
     # {{{ implement InstructionBase interface
 
     @memoize_method
-    def assignees_and_indices(self):
-        return [_get_assignee_and_index(self.assignee)]
+    def assignee_var_names(self):
+        return (_get_assignee_var_name(self.assignee),)
+
+    def assignee_subscript_deps(self):
+        return (_get_assignee_subscript_deps(self.assignee),)
 
     def with_transformed_expressions(self, f, *args):
         return self.copy(
@@ -1258,8 +1288,13 @@ class CallInstruction(MultiAssignmentBase):
     # {{{ implement InstructionBase interface
 
     @memoize_method
-    def assignees_and_indices(self):
-        return [_get_assignee_and_index(a) for a in self.assignees]
+    def assignee_var_names(self):
+        return tuple(_get_assignee_var_name(a) for a in self.assignees)
+
+    def assignee_subscript_deps(self):
+        return tuple(
+                _get_assignee_subscript_deps(a)
+                for a in self.assignees)
 
     def with_transformed_expressions(self, f, *args):
         return self.copy(
@@ -1451,17 +1486,21 @@ class CInstruction(InstructionBase):
         for name, iname_expr in self.iname_exprs:
             result.update(get_dependencies(iname_expr))
 
-        for _, subscript in self.assignees_and_indices():
-            result.update(get_dependencies(subscript))
+        for subscript_deps in self.assignee_subscript_deps():
+            result.update(subscript_deps)
 
         return frozenset(result) | self.predicates
 
     def reduction_inames(self):
         return set()
 
-    def assignees_and_indices(self):
-        return [_get_assignee_and_index(expr)
-                for expr in self.assignees]
+    def assignee_var_names(self):
+        return tuple(_get_assignee_var_name(expr) for expr in self.assignees)
+
+    def assignee_subscript_deps(self):
+        return tuple(
+                _get_assignee_subscript_deps(a)
+                for a in self.assignees)
 
     def with_transformed_expressions(self, f, *args):
         return self.copy(
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 7862ceb1efbed4f70ebe0fa81d07de84b12fca60..a4e6ab0d6b7cc3e884eadd0f044429e186b9118f 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -30,7 +30,7 @@ from six.moves import intern
 import numpy as np
 import islpy as isl
 from islpy import dim_type
-from loopy.diagnostic import LoopyError
+from loopy.diagnostic import LoopyError, warn_with_kernel
 
 import logging
 logger = logging.getLogger(__name__)
@@ -125,7 +125,41 @@ def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):
 # }}}
 
 
-# {{{ find_all_insn_inames fixed point iteration
+# {{{ find_all_insn_inames fixed point iteration (deprecated)
+
+def guess_iname_deps_based_on_var_use(kernel, insn, insn_id_to_inames=None):
+    # For all variables that insn depends on, find the intersection
+    # of iname deps of all writers, and add those to insn's
+    # dependencies.
+
+    result = frozenset()
+
+    writer_map = kernel.writer_map()
+
+    for tv_name in (insn.read_dependency_names() & kernel.get_written_variables()):
+        tv_implicit_inames = None
+
+        for writer_id in writer_map[tv_name]:
+            writer_insn = kernel.id_to_insn[writer_id]
+            if insn_id_to_inames is None:
+                writer_inames = writer_insn.forced_iname_deps
+            else:
+                writer_inames = insn_id_to_inames[writer_id]
+
+            writer_implicit_inames = (
+                    writer_inames
+                    - (writer_insn.write_dependency_names() & kernel.all_inames()))
+            if tv_implicit_inames is None:
+                tv_implicit_inames = writer_implicit_inames
+            else:
+                tv_implicit_inames = (tv_implicit_inames
+                        & writer_implicit_inames)
+
+        if tv_implicit_inames is not None:
+            result = result | tv_implicit_inames
+
+    return result - insn.reduction_inames()
+
 
 def find_all_insn_inames(kernel):
     logger.debug("%s: find_all_insn_inames: start" % kernel.name)
@@ -166,8 +200,6 @@ def find_all_insn_inames(kernel):
         insn_id_to_inames[insn.id] = iname_deps
         insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()
 
-    written_vars = kernel.get_written_variables()
-
     # fixed point iteration until all iname dep sets have converged
 
     # Why is fixed point iteration necessary here? Consider the following
@@ -190,32 +222,22 @@ def find_all_insn_inames(kernel):
 
             # {{{ depdency-based propagation
 
-            # For all variables that insn depends on, find the intersection
-            # of iname deps of all writers, and add those to insn's
-            # dependencies.
-
-            for tv_name in (all_read_deps[insn.id] & written_vars):
-                implicit_inames = None
-
-                for writer_id in writer_map[tv_name]:
-                    writer_implicit_inames = (
-                            insn_id_to_inames[writer_id]
-                            - insn_assignee_inames[writer_id])
-                    if implicit_inames is None:
-                        implicit_inames = writer_implicit_inames
-                    else:
-                        implicit_inames = (implicit_inames
-                                & writer_implicit_inames)
-
-                inames_old = insn_id_to_inames[insn.id]
-                inames_new = (inames_old | implicit_inames) \
-                            - insn.reduction_inames()
-                insn_id_to_inames[insn.id] = inames_new
-
-                if inames_new != inames_old:
-                    did_something = True
-                    logger.debug("%s: find_all_insn_inames: %s -> %s (dep-based)" % (
-                        kernel.name, insn.id, ", ".join(sorted(inames_new))))
+            inames_old = insn_id_to_inames[insn.id]
+            inames_new = inames_old | guess_iname_deps_based_on_var_use(
+                    kernel, insn, insn_id_to_inames)
+
+            insn_id_to_inames[insn.id] = inames_new
+
+            if inames_new != inames_old:
+                did_something = True
+
+                warn_with_kernel(kernel, "inferred_iname",
+                        "The iname(s) '%s' on instruction '%s' was "
+                        "automatically added. "
+                        "This is deprecated. Please add the iname "
+                        "to the instruction "
+                        "explicitly, e.g. by adding '{inames=...}'. "
+                        % (inames_new-inames_old, insn.id))
 
             # }}}
 
@@ -245,8 +267,14 @@ def find_all_insn_inames(kernel):
             if inames_new != inames_old:
                 did_something = True
                 insn_id_to_inames[insn.id] = frozenset(inames_new)
-                logger.debug("%s: find_all_insn_inames: %s -> %s (domain-based)" % (
-                    kernel.name, insn.id, ", ".join(sorted(inames_new))))
+
+                warn_with_kernel(kernel, "inferred_iname",
+                        "The iname(s) '%s' on instruction '%s' was "
+                        "automatically added. "
+                        "This is deprecated. Please add the iname "
+                        "to the instruction "
+                        "explicitly, e.g. by adding '{inames=...}'. "
+                        % (inames_new-inames_old, insn.id))
 
             # }}}
 
diff --git a/loopy/maxima.py b/loopy/maxima.py
index 29f974ff9d1f6f0a6e6d4311eca88201559e6854..738df86c4b94988b57d6dc01bcc22bb0ca62ac21 100644
--- a/loopy/maxima.py
+++ b/loopy/maxima.py
@@ -60,7 +60,7 @@ def get_loopy_instructions_as_maxima(kernel, prefix):
     my_variable_names = (
             avn
             for insn in kernel.instructions
-            for avn, _ in insn.assignees_and_indices()
+            for avn in insn.assignee_var_names()
             )
 
     from pymbolic import var
@@ -89,7 +89,7 @@ def get_loopy_instructions_as_maxima(kernel, prefix):
             if dep not in written_insn_ids:
                 write_insn(dep)
 
-        (aname, _), = insn.assignees_and_indices()
+        aname, = insn.assignee_var_names()
         result.append("%s%s : %s;" % (
             prefix, aname,
             mstr(substitute(insn.expression))))
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 8b7be4f0407bb0b9e52f0b6915841af1c5dd7ac5..d333cdf18291546e3efaaa96a5468c9e9ed4a470 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -28,7 +28,10 @@ from loopy.diagnostic import (
         LoopyError, WriteRaceConditionWarning, warn,
         LoopyAdvisory, DependencyTypeInferenceFailure)
 
+import islpy as isl
+
 from pytools.persistent_dict import PersistentDict
+
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
 from loopy.kernel.data import make_assignment
@@ -137,8 +140,8 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
                 result_dtypes = type_inf_mapper(expr, multiple_types_ok=True)
 
                 result = None
-                for (assignee, _), comp_dtype in zip(
-                        writer_insn.assignees_and_indices(), result_dtypes):
+                for assignee, comp_dtype in zip(
+                        writer_insn.assignee_var_names(), result_dtypes):
                     if assignee == var_name:
                         result = comp_dtype
                         break
@@ -302,12 +305,11 @@ def _get_compute_inames_tagged(kernel, insn, tag_base):
 
 
 def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_name):
-    from loopy.symbolic import get_dependencies
-
     return set(iname
-            for aname, aindices in insn.assignees_and_indices()
-            for iname in get_dependencies(aindices)
-                & kernel.all_inames()
+            for aname, adeps in zip(
+                insn.assignee_var_names(),
+                insn.assignee_subscript_deps())
+            for iname in adeps & kernel.all_inames()
             if aname == tv_name
             if isinstance(kernel.iname_to_tag.get(iname), tag_base))
 
@@ -414,6 +416,9 @@ def find_temporary_scope(kernel):
 def add_default_dependencies(kernel):
     logger.debug("%s: default deps" % kernel.name)
 
+    from loopy.transform.subst import expand_subst
+    expanded_kernel = expand_subst(kernel)
+
     writer_map = kernel.writer_map()
 
     arg_names = set(arg.name for arg in kernel.args)
@@ -422,7 +427,7 @@ def add_default_dependencies(kernel):
 
     dep_map = dict(
             (insn.id, insn.read_dependency_names() & var_names)
-            for insn in kernel.instructions)
+            for insn in expanded_kernel.instructions)
 
     new_insns = []
     for insn in kernel.instructions:
@@ -463,7 +468,7 @@ def add_default_dependencies(kernel):
 
 # {{{ rewrite reduction to imperative form
 
-def realize_reduction(kernel, insn_id_filter=None):
+def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
     """Rewrites reductions into their imperative form. With *insn_id_filter*
     specified, operate only on the instruction with an instruction id matching
     *insn_id_filter*.
@@ -479,6 +484,7 @@ def realize_reduction(kernel, insn_id_filter=None):
     logger.debug("%s: realize reduction" % kernel.name)
 
     new_insns = []
+    new_iname_tags = {}
 
     insn_id_gen = kernel.get_instruction_id_generator()
 
@@ -488,30 +494,16 @@ def realize_reduction(kernel, insn_id_filter=None):
     from loopy.expression import TypeInferenceMapper
     type_inf_mapper = TypeInferenceMapper(kernel)
 
-    def map_reduction(expr, rec, multiple_values_ok=False):
-        # Only expand one level of reduction at a time, going from outermost to
-        # innermost. Otherwise we get the (iname + insn) dependencies wrong.
-
-        try:
-            arg_dtype = type_inf_mapper(expr.expr)
-        except DependencyTypeInferenceFailure:
-            raise LoopyError("failed to determine type of accumulator for "
-                    "reduction '%s'" % expr)
-
-        arg_dtype = arg_dtype.with_target(kernel.target)
-
-        reduction_dtypes = expr.operation.result_dtypes(
-                    kernel, arg_dtype, expr.inames)
-        reduction_dtypes = tuple(
-                dt.with_target(kernel.target) for dt in reduction_dtypes)
+    # {{{ sequential
 
-        ncomp = len(reduction_dtypes)
+    def map_reduction_seq(expr, rec, nresults, arg_dtype,
+            reduction_dtypes):
+        outer_insn_inames = temp_kernel.insn_inames(insn)
 
         from pymbolic import var
-
         acc_var_names = [
                 var_name_gen("acc_"+"_".join(expr.inames))
-                for i in range(ncomp)]
+                for i in range(nresults)]
         acc_vars = tuple(var(n) for n in acc_var_names)
 
         from loopy.kernel.data import TemporaryVariable, temp_var_scope
@@ -523,12 +515,6 @@ def realize_reduction(kernel, insn_id_filter=None):
                     dtype=dtype,
                     scope=temp_var_scope.PRIVATE)
 
-        outer_insn_inames = temp_kernel.insn_inames(insn)
-        bad_inames = frozenset(expr.inames) & outer_insn_inames
-        if bad_inames:
-            raise LoopyError("reduction used within loop(s) that it was "
-                    "supposed to reduce over: " + ", ".join(bad_inames))
-
         init_id = insn_id_gen(
                 "%s_%s_init" % (insn.id, "_".join(expr.inames)))
 
@@ -562,25 +548,284 @@ def realize_reduction(kernel, insn_id_filter=None):
 
         generated_insns.append(reduction_insn)
 
-        new_insn_depends_on.add(reduction_insn.id)
+        new_insn_add_depends_on.add(reduction_insn.id)
 
-        if multiple_values_ok:
-            return acc_vars
-        else:
+        if nresults == 1:
             assert len(acc_vars) == 1
             return acc_vars[0]
+        else:
+            return acc_vars
+
+    # }}}
+
+    # {{{ local-parallel
+
+    def _get_int_iname_size(iname):
+        from loopy.isl_helpers import static_max_of_pw_aff
+        from loopy.symbolic import pw_aff_to_expr
+        size = pw_aff_to_expr(
+                static_max_of_pw_aff(
+                    kernel.get_iname_bounds(iname).size,
+                    constants_only=True))
+        assert isinstance(size, six.integer_types)
+        return size
+
+    def _make_slab_set(iname, size):
+        v = isl.make_zero_and_vars([iname])
+        bs, = (
+                v[0].le_set(v[iname])
+                &
+                v[iname].lt_set(v[0] + size)).get_basic_sets()
+        return bs
+
+    def map_reduction_local(expr, rec, nresults, arg_dtype,
+            reduction_dtypes):
+        red_iname, = expr.inames
+
+        size = _get_int_iname_size(red_iname)
+
+        outer_insn_inames = temp_kernel.insn_inames(insn)
+
+        from loopy.kernel.data import LocalIndexTagBase
+        outer_local_inames = tuple(
+                oiname
+                for oiname in outer_insn_inames
+                if isinstance(
+                    kernel.iname_to_tag.get(oiname),
+                    LocalIndexTagBase))
+
+        from pymbolic import var
+        outer_local_iname_vars = tuple(
+                var(oiname) for oiname in outer_local_inames)
+
+        outer_local_iname_sizes = tuple(
+                _get_int_iname_size(oiname)
+                for oiname in outer_local_inames)
+
+        # {{{ add separate iname to carry out the reduction
+
+        # Doing this sheds any odd conditionals that may be active
+        # on our red_iname.
+
+        base_exec_iname = var_name_gen("red_"+red_iname)
+        domains.append(_make_slab_set(base_exec_iname, size))
+        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname]
+
+        # }}}
+
+        acc_var_names = [
+                var_name_gen("acc_"+red_iname)
+                for i in range(nresults)]
+        acc_vars = tuple(var(n) for n in acc_var_names)
+
+        from loopy.kernel.data import TemporaryVariable, temp_var_scope
+        for name, dtype in zip(acc_var_names, reduction_dtypes):
+            new_temporary_variables[name] = TemporaryVariable(
+                    name=name,
+                    shape=outer_local_iname_sizes + (size,),
+                    dtype=dtype,
+                    scope=temp_var_scope.LOCAL)
+
+        base_iname_deps = outer_insn_inames - frozenset(expr.inames)
+
+        neutral = expr.operation.neutral_element(arg_dtype, expr.inames)
+
+        init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
+        init_insn = make_assignment(
+                id=init_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
+                    for acc_var in acc_vars),
+                expression=neutral,
+                forced_iname_deps=base_iname_deps | frozenset([base_exec_iname]),
+                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
+                depends_on=frozenset())
+        generated_insns.append(init_insn)
+
+        transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname))
+        transfer_insn = make_assignment(
+                id=transfer_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars + (var(red_iname),)]
+                    for acc_var in acc_vars),
+                expression=expr.operation(
+                    arg_dtype, neutral, expr.expr, expr.inames),
+                forced_iname_deps=(
+                    (outer_insn_inames - frozenset(expr.inames))
+                    | frozenset([red_iname])),
+                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
+                depends_on=frozenset([init_id]) | insn.depends_on,
+                no_sync_with=frozenset([init_id]))
+        generated_insns.append(transfer_insn)
+
+        def _strip_if_scalar(c):
+            if len(acc_vars) == 1:
+                return c[0]
+            else:
+                return c
+
+        cur_size = 1
+        while cur_size < size:
+            cur_size *= 2
+
+        prev_id = transfer_id
+        bound = size
+
+        istage = 0
+        while cur_size > 1:
+
+            new_size = cur_size // 2
+            assert new_size * 2 == cur_size
+
+            stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
+            domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
+            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname]
+
+            stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
+            stage_insn = make_assignment(
+                    id=stage_id,
+                    assignees=tuple(
+                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
+                        for acc_var in acc_vars),
+                    expression=expr.operation(
+                        arg_dtype,
+                        _strip_if_scalar([
+                            acc_var[
+                                outer_local_iname_vars + (var(stage_exec_iname),)]
+                            for acc_var in acc_vars]),
+                        _strip_if_scalar([
+                            acc_var[
+                                outer_local_iname_vars + (
+                                    var(stage_exec_iname) + new_size,)]
+                            for acc_var in acc_vars]),
+                        expr.inames),
+                    forced_iname_deps=(
+                        base_iname_deps | frozenset([stage_exec_iname])),
+                    forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
+                    depends_on=frozenset([prev_id]),
+                    )
+
+            generated_insns.append(stage_insn)
+            prev_id = stage_id
+
+            cur_size = new_size
+            bound = cur_size
+            istage += 1
+
+        new_insn_add_depends_on.add(prev_id)
+        new_insn_add_no_sync_with.add(prev_id)
+        new_insn_add_forced_iname_deps.add(stage_exec_iname or base_exec_iname)
+
+        if nresults == 1:
+            assert len(acc_vars) == 1
+            return acc_vars[0][outer_local_iname_vars + (0,)]
+        else:
+            return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars]
+    # }}}
+
+    # {{{ seq/par dispatch
+
+    def map_reduction(expr, rec, nresults=1):
+        # Only expand one level of reduction at a time, going from outermost to
+        # innermost. Otherwise we get the (iname + insn) dependencies wrong.
+
+        try:
+            arg_dtype = type_inf_mapper(expr.expr)
+        except DependencyTypeInferenceFailure:
+            if unknown_types_ok:
+                arg_dtype = lp.auto
+
+                reduction_dtypes = (lp.auto,)*nresults
+
+            else:
+                raise LoopyError("failed to determine type of accumulator for "
+                        "reduction '%s'" % expr)
+        else:
+            arg_dtype = arg_dtype.with_target(kernel.target)
+
+            reduction_dtypes = expr.operation.result_dtypes(
+                        kernel, arg_dtype, expr.inames)
+            reduction_dtypes = tuple(
+                    dt.with_target(kernel.target) for dt in reduction_dtypes)
+
+        outer_insn_inames = temp_kernel.insn_inames(insn)
+        bad_inames = frozenset(expr.inames) & outer_insn_inames
+        if bad_inames:
+            raise LoopyError("reduction used within loop(s) that it was "
+                    "supposed to reduce over: " + ", ".join(bad_inames))
+
+        n_sequential = 0
+        n_local_par = 0
+
+        from loopy.kernel.data import (
+                LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
+                ParallelTag)
+        for iname in expr.inames:
+            iname_tag = kernel.iname_to_tag.get(iname)
+
+            if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
+                # These are nominally parallel, but we can live with
+                # them as sequential.
+                n_sequential += 1
+
+            elif isinstance(iname_tag, LocalIndexTagBase):
+                n_local_par += 1
+
+            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
+                raise LoopyError("the only form of parallelism supported "
+                        "by reductions is 'local'--found iname '%s' "
+                        "tagged '%s'"
+                        % (iname, type(iname_tag).__name__))
+
+            else:
+                n_sequential += 1
+
+        if n_local_par and n_sequential:
+            raise LoopyError("Reduction over '%s' contains both parallel and "
+                    "sequential inames. It must be split "
+                    "(using split_reduction_{in,out}ward) "
+                    "before code generation."
+                    % ", ".join(expr.inames))
+
+        if n_local_par > 1:
+            raise LoopyError("Reduction over '%s' contains more than "
+                    "one parallel iname. It must be split "
+                    "(using split_reduction_{in,out}ward) "
+                    "before code generation."
+                    % ", ".join(expr.inames))
+
+        if n_sequential:
+            assert n_local_par == 0
+            return map_reduction_seq(expr, rec, nresults, arg_dtype,
+                    reduction_dtypes)
+        elif n_local_par:
+            return map_reduction_local(expr, rec, nresults, arg_dtype,
+                    reduction_dtypes)
+        else:
+            from loopy.diagnostic import warn
+            warn(kernel, "empty_reduction",
+                    "Empty reduction found (no inames to reduce over). "
+                    "Eliminating.")
+
+            return expr.expr
+
+    # }}}
 
     from loopy.symbolic import ReductionCallbackMapper
     cb_mapper = ReductionCallbackMapper(map_reduction)
 
     insn_queue = kernel.instructions[:]
     insn_id_replacements = {}
+    domains = kernel.domains[:]
 
     temp_kernel = kernel
 
     import loopy as lp
     while insn_queue:
-        new_insn_depends_on = set()
+        new_insn_add_depends_on = set()
+        new_insn_add_no_sync_with = set()
+        new_insn_add_forced_iname_deps = set()
+
         generated_insns = []
 
         insn = insn_queue.pop(0)
@@ -590,10 +835,12 @@ def realize_reduction(kernel, insn_id_filter=None):
             new_insns.append(insn)
             continue
 
+        nresults = len(insn.assignees)
+
         # Run reduction expansion.
         from loopy.symbolic import Reduction
-        if isinstance(insn.expression, Reduction):
-            new_expressions = cb_mapper(insn.expression, multiple_values_ok=True)
+        if isinstance(insn.expression, Reduction) and nresults > 1:
+            new_expressions = cb_mapper(insn.expression, nresults=nresults)
         else:
             new_expressions = (cb_mapper(insn.expression),)
 
@@ -603,8 +850,12 @@ def realize_reduction(kernel, insn_id_filter=None):
 
             kwargs = insn.get_copy_kwargs(
                     depends_on=insn.depends_on
-                    | frozenset(new_insn_depends_on),
-                    forced_iname_deps=temp_kernel.insn_inames(insn))
+                    | frozenset(new_insn_add_depends_on),
+                    no_sync_with=insn.no_sync_with
+                    | frozenset(new_insn_add_no_sync_with),
+                    forced_iname_deps=(
+                        temp_kernel.insn_inames(insn)
+                        | new_insn_add_forced_iname_deps))
 
             kwargs.pop("id")
             kwargs.pop("expression")
@@ -624,8 +875,6 @@ def realize_reduction(kernel, insn_id_filter=None):
             insn_id_replacements[insn.id] = [
                     rinsn.id for rinsn in replacement_insns]
 
-            # FIXME: Track dep rename
-
             insn_queue = generated_insns + replacement_insns + insn_queue
 
             # The reduction expander needs an up-to-date kernel
@@ -633,27 +882,35 @@ def realize_reduction(kernel, insn_id_filter=None):
 
             temp_kernel = kernel.copy(
                     instructions=new_insns + insn_queue,
-                    temporary_variables=new_temporary_variables)
+                    temporary_variables=new_temporary_variables,
+                    domains=domains)
+            temp_kernel = lp.replace_instruction_ids(
+                    temp_kernel, insn_id_replacements)
 
         else:
             # nothing happened, we're done with insn
-            assert not new_insn_depends_on
+            assert not new_insn_add_depends_on
 
             new_insns.append(insn)
 
     kernel = kernel.copy(
             instructions=new_insns,
-            temporary_variables=new_temporary_variables)
+            temporary_variables=new_temporary_variables,
+            domains=domains)
+
+    kernel = lp.replace_instruction_ids(kernel, insn_id_replacements)
+
+    kernel = lp.tag_inames(kernel, new_iname_tags)
 
-    return lp.replace_instruction_ids(kernel, insn_id_replacements)
+    return kernel
 
 # }}}
 
 
-# {{{ find boostability of instructions
+# {{{ find idempotence ("boostability") of instructions
 
-def find_boostability(kernel):
-    logger.debug("%s: boostability" % kernel.name)
+def find_idempotence(kernel):
+    logger.debug("%s: idempotence" % kernel.name)
 
     writer_map = kernel.writer_map()
 
@@ -661,16 +918,20 @@ def find_boostability(kernel):
 
     var_names = arg_names | set(six.iterkeys(kernel.temporary_variables))
 
-    dep_map = dict(
+    reads_map = dict(
             (insn.id, insn.read_dependency_names() & var_names)
             for insn in kernel.instructions)
 
-    non_boostable_vars = set()
+    non_idempotently_updated_vars = set()
+
+    # FIXME: This can be made more efficient by simply starting
+    # from all written variables and not even considering
+    # instructions as the start of the first pass.
 
     new_insns = []
     for insn in kernel.instructions:
         all_my_var_writers = set()
-        for var in dep_map[insn.id]:
+        for var in reads_map[insn.id]:
             var_writers = writer_map.get(var, set())
             all_my_var_writers |= var_writers
 
@@ -680,7 +941,7 @@ def find_boostability(kernel):
             last_all_my_var_writers = all_my_var_writers
 
             for writer_insn_id in last_all_my_var_writers:
-                for var in dep_map[writer_insn_id]:
+                for var in reads_map[writer_insn_id]:
                     all_my_var_writers = \
                             all_my_var_writers | writer_map.get(var, set())
 
@@ -692,19 +953,20 @@ def find_boostability(kernel):
         boostable = insn.id not in all_my_var_writers
 
         if not boostable:
-            non_boostable_vars.update(
-                    var_name for var_name, _ in insn.assignees_and_indices())
+            non_idempotently_updated_vars.update(
+                    insn.assignee_var_names())
 
         insn = insn.copy(boostable=boostable)
 
         new_insns.append(insn)
 
-    # {{{ remove boostability from isns that access non-boostable vars
+    # {{{ remove boostability from insns that access non-idempotently updated vars
 
     new2_insns = []
     for insn in new_insns:
         accessed_vars = insn.dependency_names()
-        boostable = insn.boostable and not bool(non_boostable_vars & accessed_vars)
+        boostable = insn.boostable and not bool(
+                non_idempotently_updated_vars & accessed_vars)
         new2_insns.append(insn.copy(boostable=boostable))
 
     # }}}
@@ -827,7 +1089,7 @@ def preprocess_kernel(kernel, device=None):
     #   because it manipulates the depends_on field, which could prevent
     #   defaults from being applied.
 
-    kernel = realize_reduction(kernel)
+    kernel = realize_reduction(kernel, unknown_types_ok=False)
 
     # Ordering restriction:
     # add_axes_to_temporaries_for_ilp because reduction accumulators
@@ -837,7 +1099,7 @@ def preprocess_kernel(kernel, device=None):
     kernel = add_axes_to_temporaries_for_ilp_and_vec(kernel)
 
     kernel = find_temporary_scope(kernel)
-    kernel = find_boostability(kernel)
+    kernel = find_idempotence(kernel)
     kernel = limit_boostability(kernel)
 
     kernel = kernel.target.preprocess(kernel)
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 89c9ffcdaaceb78778231101411e1f017f7c4f85..8b2a8190cedb65c034e95a8f4cd6b0c9a2bd65e3 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -142,7 +142,7 @@ def generate_sub_sched_items(schedule, start_idx):
 def get_insn_ids_for_block_at(schedule, start_idx):
     return frozenset(
             sub_sched_item.insn_id
-            for sub_sched_item in generate_sub_sched_items(
+            for i, sub_sched_item in generate_sub_sched_items(
                 schedule, start_idx)
             if isinstance(sub_sched_item, RunInstruction))
 
@@ -569,7 +569,7 @@ def generate_loop_schedules_internal(
             print()
         print(75*"=")
         print("KERNEL:")
-        print(kernel)
+        print(kernel.stringify(with_dependencies=True))
         print(75*"=")
         print("CURRENT SCHEDULE:")
         print(dump_schedule(sched_state.kernel, sched_state.schedule))
@@ -1486,15 +1486,6 @@ def generate_loop_schedules(kernel, debug_args={}):
                     gen_sched = insert_barriers(kernel, gen_sched,
                             reverse=False, kind="global")
 
-                    """
-                    for sched_item in gen_sched:
-                        if (
-                                isinstance(sched_item, Barrier)
-                                and sched_item.kind == "global"):
-                            raise LoopyError("kernel requires a global barrier %s"
-                                    % sched_item.comment)
-                    """
-
                     logger.info("%s: barrier insertion: local" % kernel.name)
 
                     gen_sched = insert_barriers(kernel, gen_sched,
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 8a31f67fbf85bf8a370f17d2775569697f247502..932acab50b29163345585b10edca0e020f8183b8 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -72,7 +72,7 @@ class ToCountMap:
     def __radd__(self, other):
         if other != 0:
             raise ValueError("ToCountMap: Attempted to add ToCountMap "
-                                "to {} {}. ToCountMap may only be added to "
+                                "to {0} {1}. ToCountMap may only be added to "
                                 "0 and other ToCountMap objects."
                                 .format(type(other), other))
         return self
@@ -84,7 +84,7 @@ class ToCountMap:
                 for index in self.dict.keys()))
         else:
             raise ValueError("ToCountMap: Attempted to multiply "
-                                "ToCountMap by {} {}."
+                                "ToCountMap by {0} {1}."
                                 .format(type(other), other))
 
     __rmul__ = __mul__
@@ -490,7 +490,10 @@ def count(kernel, set):
     except AttributeError:
         pass
 
-    count = 0
+    count = isl.PwQPolynomial.zero(
+            set.space
+            .drop_dims(dim_type.set, 0, set.dim(dim_type.set))
+            .add_dims(dim_type.set, 1))
 
     set = set.make_disjoint()
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index cade5e5e060b26a8c6d44965e61500baa31e5c08..cef32851f7578c946a7e26f672dbcceb588a9edd 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1286,6 +1286,12 @@ class AccessRangeMapper(WalkMapper):
 
             self.access_range = self.access_range | access_range
 
+    def map_linear_subscript(self, expr, inames):
+        self.rec(expr.index, inames)
+
+        if expr.aggregate.name == self.arg_name:
+            self.bad_subscripts.append(expr)
+
     def map_reduction(self, expr, inames):
         return WalkMapper.map_reduction(self, expr, inames | set(expr.inames))
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 3ec3a50b11f72a2975ac4366d495326bfcb69b37..eb39539b9c489320b227da7c7397c0748a704159 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -156,7 +156,7 @@ class ASTBuilderBase(object):
     def generate_top_of_body(self, codegen_state):
         return []
 
-    def get_temporary_decls(self, codegen_state):
+    def get_temporary_decls(self, codegen_state, schedule_index):
         raise NotImplementedError
 
     def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
@@ -239,7 +239,7 @@ class DummyHostASTBuilder(ASTBuilderBase):
             schedule_index):
         return None
 
-    def get_temporary_decls(self, codegen_state):
+    def get_temporary_decls(self, codegen_state, schedule_index):
         return []
 
     def get_expression_to_code_mapper(self, codegen_state):
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 15156f2a13ca1cb60d8b64471852e253b59a8c2c..11e93df8d4f912a73b4064ba17404ec27e6356cc 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -236,7 +236,7 @@ class CASTBuilder(ASTBuilderBase):
                         [self.idi_to_cgen_declarator(codegen_state.kernel, idi)
                             for idi in codegen_state.implemented_data_info])
 
-    def get_temporary_decls(self, codegen_state):
+    def get_temporary_decls(self, codegen_state, schedule_index):
         from loopy.kernel.data import temp_var_scope
 
         kernel = codegen_state.kernel
@@ -264,7 +264,7 @@ class CASTBuilder(ASTBuilderBase):
                         temp_decls.append(
                                 self.wrap_temporary_decl(
                                     self.get_temporary_decl(
-                                        kernel, tv, idi), tv.scope))
+                                        kernel, schedule_index, tv, idi), tv.scope))
 
             else:
                 offset = 0
@@ -349,7 +349,7 @@ class CASTBuilder(ASTBuilderBase):
         return ExpressionToCMapper(
                 codegen_state, fortran_abi=self.target.fortran_abi)
 
-    def get_temporary_decl(self, knl, temp_var, decl_info):
+    def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info):
         temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
 
         if decl_info.shape:
@@ -402,8 +402,9 @@ class CASTBuilder(ASTBuilderBase):
         if isinstance(func_id, Variable):
             func_id = func_id.name
 
-        assignee_var_descriptors = [codegen_state.kernel.get_var_descriptor(a)
-                for a, _ in insn.assignees_and_indices()]
+        assignee_var_descriptors = [
+                codegen_state.kernel.get_var_descriptor(a)
+                for a in insn.assignee_var_names()]
 
         par_dtypes = tuple(ecm.infer_type(par) for par in parameters)
 
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 896ea9158223435e3bef933818fbf3bc51a424b4..7f19bdbdf838bcf43b66eb9c1c4fbf58699634a9 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -74,12 +74,19 @@ class ExprToISPCMapper(ExpressionToCMapper):
                         "for constant '%s'" % expr)
 
     def map_variable(self, expr, enclosing_prec, type_context):
-        if expr.name in self.kernel.temporary_variables:
-            gsize, lsize = self.kernel.get_grid_sizes_as_exprs()
+        tv = self.kernel.temporary_variables.get(expr.name)
+
+        from loopy.kernel.data import temp_var_scope
+        if tv is not None and tv.scope == temp_var_scope.PRIVATE:
+            # FIXME: This is a pretty coarse way of deciding what
+            # private temporaries get duplicated. Refine? (See also
+            # below in decl generation)
+            gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
             if lsize:
                 return "%s[programIndex]" % expr.name
             else:
                 return expr.name
+
         else:
             return super(ExprToISPCMapper, self).map_variable(
                     expr, enclosing_prec, type_context)
@@ -291,7 +298,7 @@ class ISPCASTBuilder(CASTBuilder):
         else:
             raise LoopyError("unknown barrier kind")
 
-    def get_temporary_decl(self, knl, temp_var, decl_info):
+    def get_temporary_decl(self, knl, sched_index, temp_var, decl_info):
         from loopy.target.c import POD  # uses the correct complex type
         temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
 
@@ -299,7 +306,10 @@ class ISPCASTBuilder(CASTBuilder):
 
         from loopy.kernel.data import temp_var_scope
         if temp_var.scope == temp_var_scope.PRIVATE:
-            gsize, lsize = knl.get_grid_sizes_as_exprs()
+            # FIXME: This is a pretty coarse way of deciding what
+            # private temporaries get duplicated. Refine? (See also
+            # above in expr to code mapper)
+            _, lsize = knl.get_grid_size_upper_bounds_as_exprs()
             shape = lsize + shape
 
         if shape:
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index c8f3b6b9e9dbd827e11b35af7189cd0c49b5bba3..fe38f9c7eb1a6a6cf30a04c4f34fad232e653e13 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -556,8 +556,10 @@ class OpenCLCASTBuilder(CASTBuilder):
                 cast_str = "(%s %s *) " % (var_kind, ctype)
 
             return Block([
-                POD(self, NumpyType(lhs_dtype.dtype), old_val_var),
-                POD(self, NumpyType(lhs_dtype.dtype), new_val_var),
+                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
+                    old_val_var),
+                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
+                    new_val_var),
                 DoWhile(
                     "%(func_name)s("
                     "%(cast_str)s&(%(lhs_expr)s), "
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index c36d78c4d78098217f2005184c2a5bdde2f59918..b570cf4ab7e4ed543213b73716b6271e61e6b69b 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -611,7 +611,7 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
         # no such thing in Python
         return None
 
-    def get_temporary_decls(self, codegen_state):
+    def get_temporary_decls(self, codegen_state, schedule_state):
         from genpy import Assign, Comment, Line
 
         def alloc_nbytes(tv):
diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py
index 2c50b3f11a34ced81d6a93eb467482110f495f2f..181f5241137cc1abe3256506f3ebd59327e4c4f8 100644
--- a/loopy/transform/arithmetic.py
+++ b/loopy/transform/arithmetic.py
@@ -136,9 +136,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
             raise ValueError("unexpected type of access_expr")
 
     def is_assignee(insn):
-        return any(
-                lhs == var_name
-                for lhs, sbscript in insn.assignees_and_indices())
+        return var_name in insn.assignee_var_names()
 
     def iterate_as(cls, expr):
         if isinstance(expr, cls):
@@ -237,7 +235,7 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
             new_insns.append(insn)
             continue
 
-        (_, index_key), = insn.assignees_and_indices()
+        index_key = extract_index_key(insn.assignee)
 
         lhs = insn.assignee
         rhs = insn.expression
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index 967e14de692ee96b0c01d2ea5bcf8b411890038b..97c15c6f234211204db8366c5e69462cec5c2e77 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -143,9 +143,17 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
     bvc = _BatchVariableChanger(rule_mapping_context,
             knl, batch_varying_args, batch_iname_expr,
             sequential=sequential)
-    return rule_mapping_context.finish_kernel(
+    kernel = rule_mapping_context.finish_kernel(
             bvc.map_kernel(knl))
 
+    batch_iname_set = frozenset([batch_iname])
+    kernel = kernel.copy(
+            instructions=[
+                insn.copy(forced_iname_deps=insn.forced_iname_deps | batch_iname_set)
+                for insn in kernel.instructions])
+
+    return kernel
+
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index a7d22b2d0f7e376086b3a42ac2f186e2805fda26..0c25b63930f99ebcb8b2b817367ec1572b15a1d4 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -239,8 +239,27 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         if not within(kernel, insn.id, ()):
             continue
 
-        for assignee, index in insn.assignees_and_indices():
-            if assignee == var_name:
+        from pymbolic.primitives import Variable, Subscript
+        from loopy.symbolic import LinearSubscript
+
+        for assignee in insn.assignees:
+            if isinstance(assignee, Variable):
+                assignee_name = assignee.name
+                index = ()
+
+            elif isinstance(assignee, Subscript):
+                assignee_name = assignee.aggregate.name
+                index = assignee.index_tuple
+
+            elif isinstance(assignee, LinearSubscript):
+                if assignee.aggregate.name == var_name:
+                    raise LoopyError("buffer_array may not be applied in the "
+                            "presence of linear write indexing into '%s'" % var_name)
+
+            else:
+                raise LoopyError("invalid lvalue '%s'" % assignee)
+
+            if assignee_name == var_name:
                 within_inames.update(
                         (get_dependencies(index) & kernel.all_inames())
                         - buffer_inames_set)
@@ -381,7 +400,9 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
     init_instruction = Assignment(id=init_insn_id,
                 assignee=buf_var_init,
                 expression=init_expression,
-                forced_iname_deps=frozenset(within_inames),
+                forced_iname_deps=(
+                    frozenset(within_inames)
+                    | frozenset(non1_init_inames)),
                 depends_on=frozenset(),
                 depends_on_is_final=True)
 
@@ -396,8 +417,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
     did_write = False
     for insn_id in aar.modified_insn_ids:
         insn = kernel.id_to_insn[insn_id]
-        if any(assignee_name == buf_var_name
-                for assignee_name, _ in insn.assignees_and_indices()):
+        if buf_var_name in insn.assignee_var_names():
             did_write = True
 
     # {{{ add init_insn_id to depends_on
@@ -457,9 +477,12 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         store_instruction = Assignment(
                     id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
                     depends_on=frozenset(aar.modified_insn_ids),
+                    no_sync_with=frozenset([init_insn_id]),
                     assignee=store_target,
                     expression=store_expression,
-                    forced_iname_deps=frozenset(within_inames))
+                    forced_iname_deps=(
+                        frozenset(within_inames)
+                        | frozenset(non1_store_inames)))
     else:
         did_write = False
 
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index 3db96712eab21d9ad4efed6fc0a3dde08d1b9f65..89ad1cdb52d69cf1bee981bc93679fb76883dc54 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -139,7 +139,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
         temporary_name=None,
         temporary_scope=None, temporary_is_local=None,
         footprint_subscripts=None,
-        fetch_bounding_box=False):
+        fetch_bounding_box=False,
+        fetch_outer_inames=None):
     """Prefetch all accesses to the variable *var_name*, with all accesses
     being swept through *sweep_inames*.
 
@@ -153,6 +154,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
         such as those occurring in dimension splits are recorded and also
         applied to these indices.
 
+    :arg fetch_outer_inames: The inames within which the fetch
+        instruction is nested. If *None*, make an educated guess.
+
     This function combines :func:`extract_subst` and :func:`precompute`.
     """
 
@@ -246,7 +250,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
             default_tag=default_tag, dtype=arg.dtype,
             fetch_bounding_box=fetch_bounding_box,
             temporary_name=temporary_name,
-            temporary_scope=temporary_scope, temporary_is_local=temporary_is_local)
+            temporary_scope=temporary_scope, temporary_is_local=temporary_is_local,
+            precompute_outer_inames=fetch_outer_inames)
 
     # {{{ remove inames that were temporarily added by slice sweeps
 
@@ -571,4 +576,69 @@ def set_temporary_scope(kernel, temp_var_names, scope):
 
 # }}}
 
+
+# {{{ reduction_arg_to_subst_rule
+
+def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None):
+    if isinstance(inames, str):
+        inames = [s.strip() for s in inames.split(",")]
+
+    inames_set = frozenset(inames)
+
+    substs = knl.substitutions.copy()
+
+    var_name_gen = knl.get_var_name_generator()
+
+    def map_reduction(expr, rec, nresults=1):
+        if frozenset(expr.inames) != inames_set:
+            return type(expr)(
+                    operation=expr.operation,
+                    inames=expr.inames,
+                    expr=rec(expr.expr),
+                    allow_simultaneous=expr.allow_simultaneous)
+
+        if subst_rule_name is None:
+            subst_rule_prefix = "red_%s_arg" % "_".join(inames)
+            my_subst_rule_name = var_name_gen(subst_rule_prefix)
+        else:
+            my_subst_rule_name = subst_rule_name
+
+        if my_subst_rule_name in substs:
+            raise LoopyError("substitution rule '%s' already exists"
+                    % my_subst_rule_name)
+
+        from loopy.kernel.data import SubstitutionRule
+        substs[my_subst_rule_name] = SubstitutionRule(
+                name=my_subst_rule_name,
+                arguments=tuple(inames),
+                expression=expr.expr)
+
+        from pymbolic import var
+        iname_vars = [var(iname) for iname in inames]
+
+        return type(expr)(
+                operation=expr.operation,
+                inames=expr.inames,
+                expr=var(my_subst_rule_name)(*iname_vars),
+                allow_simultaneous=expr.allow_simultaneous)
+
+    from loopy.symbolic import ReductionCallbackMapper
+    cb_mapper = ReductionCallbackMapper(map_reduction)
+
+    from loopy.kernel.data import MultiAssignmentBase
+
+    new_insns = []
+    for insn in knl.instructions:
+        if not isinstance(insn, MultiAssignmentBase):
+            new_insns.append(insn)
+        else:
+            new_insns.append(insn.copy(expression=cb_mapper(insn.expression)))
+
+    return knl.copy(
+            instructions=new_insns,
+            substitutions=substs)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py
index 6e9eb45bb58b7750c075ee7b79fb024e4cf4f24a..a8c3b5849d90466d1163f4cc47777e1253cae079 100644
--- a/loopy/transform/diff.py
+++ b/loopy/transform/diff.py
@@ -295,13 +295,24 @@ class DifferentiationContext(object):
         if not diff_expr:
             return None
 
-        (_, lhs_ind), = orig_writer_insn.assignees_and_indices()
+        assert isinstance(orig_writer_insn, lp.Assignment)
+        if isinstance(orig_writer_insn.assignee, p.Subscript):
+            lhs_ind = orig_writer_insn.assignee.index_tuple
+        elif isinstance(orig_writer_insn.assignee, p.Variable):
+            lhs_ind = ()
+        else:
+            raise LoopyError(
+                    "Unrecognized LHS type in differentiation: %s"
+                    % type(orig_writer_insn.assignee).__name__)
+
         new_insn_id = self.generate_instruction_id()
         insn = lp.Assignment(
                 id=new_insn_id,
                 assignee=var(new_var_name)[
                     lhs_ind + diff_iname_exprs],
-                expression=diff_expr)
+                expression=diff_expr,
+                forced_iname_deps=(
+                    orig_writer_insn.forced_iname_deps | frozenset(diff_inames)))
 
         self.new_instructions.append(insn)
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index e72897a0427c9818be26d42cda298f8d4f71bca9..4c3cd0a6988b269139eceb7028423407765282f6 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -48,8 +48,6 @@ __doc__ = """
 
 .. autofunction:: duplicate_inames
 
-.. undocumented .. autofunction:: link_inames
-
 .. autofunction:: rename_iname
 
 .. autofunction:: remove_unused_inames
@@ -890,114 +888,6 @@ def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None):
 # }}}
 
 
-# {{{ link inames
-
-def link_inames(knl, inames, new_iname, within=None, tag=None):
-    # {{{ normalize arguments
-
-    if isinstance(inames, str):
-        inames = inames.split(",")
-
-    var_name_gen = knl.get_var_name_generator()
-    new_iname = var_name_gen(new_iname)
-
-    # }}}
-
-    # {{{ ensure that each iname is used at most once in each instruction
-
-    inames_set = set(inames)
-
-    if 0:
-        # FIXME!
-        for insn in knl.instructions:
-            insn_inames = knl.insn_inames(insn.id) | insn.reduction_inames()
-
-            if len(insn_inames & inames_set) > 1:
-                raise LoopyError("To-be-linked inames '%s' are used in "
-                        "instruction '%s'. No more than one such iname can "
-                        "be used in one instruction."
-                        % (", ".join(insn_inames & inames_set), insn.id))
-
-    # }}}
-
-    from loopy.kernel.tools import DomainChanger
-    domch = DomainChanger(knl, tuple(inames))
-
-    # {{{ ensure that projections are identical
-
-    unrelated_dom_inames = list(
-            set(domch.domain.get_var_names(dim_type.set))
-            - inames_set)
-
-    domain = domch.domain
-
-    # move all inames to be linked to end to prevent shuffly confusion
-    for iname in inames:
-        dt, index = domain.get_var_dict()[iname]
-        assert dt == dim_type.set
-
-        # move to tail of param dim_type
-        domain = domain.move_dims(
-                    dim_type.param, domain.dim(dim_type.param),
-                    dt, index, 1)
-        # move to tail of set dim_type
-        domain = domain.move_dims(
-                    dim_type.set, domain.dim(dim_type.set),
-                    dim_type.param, domain.dim(dim_type.param)-1, 1)
-
-    projections = [
-            domch.domain.project_out_except(
-                unrelated_dom_inames + [iname], [dim_type.set])
-            for iname in inames]
-
-    all_equal = True
-    first_proj = projections[0]
-    for proj in projections[1:]:
-        all_equal = all_equal and (proj <= first_proj and first_proj <= proj)
-
-    if not all_equal:
-        raise LoopyError("Inames cannot be linked because their domain "
-                "constraints are not the same.")
-
-    del domain  # messed up for testing, do not use
-
-    # }}}
-
-    # change the domain
-    from loopy.isl_helpers import duplicate_axes
-    knl = knl.copy(
-            domains=domch.get_domains_with(
-                duplicate_axes(domch.domain, [inames[0]], [new_iname])))
-
-    # {{{ change the code
-
-    from pymbolic import var
-    subst_dict = dict((iname, var(new_iname)) for iname in inames)
-
-    from loopy.match import parse_stack_match
-    within = parse_stack_match(within)
-
-    from pymbolic.mapper.substitutor import make_subst_func
-    rule_mapping_context = SubstitutionRuleMappingContext(
-            knl.substitutions, var_name_gen)
-    ijoin = RuleAwareSubstitutionMapper(rule_mapping_context,
-                    make_subst_func(subst_dict), within)
-
-    knl = rule_mapping_context.finish_kernel(
-            ijoin.map_kernel(knl))
-
-    # }}}
-
-    knl = remove_unused_inames(knl, inames)
-
-    if tag is not None:
-        knl = tag_inames(knl, {new_iname: tag})
-
-    return knl
-
-# }}}
-
-
 # {{{ remove unused inames
 
 def remove_unused_inames(knl, inames=None):
@@ -1252,8 +1142,8 @@ def affine_map_inames(kernel, old_inames, new_inames, equations):
 
     # {{{ change domains
 
-    new_inames_set = set(new_inames)
-    old_inames_set = set(old_inames)
+    new_inames_set = frozenset(new_inames)
+    old_inames_set = frozenset(old_inames)
 
     new_domains = []
     for idom, dom in enumerate(kernel.domains):
@@ -1339,7 +1229,26 @@ def affine_map_inames(kernel, old_inames, new_inames, equations):
 
     # }}}
 
-    return kernel.copy(domains=new_domains)
+    # {{{ switch iname refs in instructions
+
+    def fix_iname_set(insn_id, inames):
+        if old_inames_set <= inames:
+            return (inames - old_inames_set) | new_inames_set
+        elif old_inames_set & inames:
+            raise LoopyError("instruction '%s' uses only a part (%s), not all, "
+                    "of the old inames"
+                    % (insn_id, ", ".join(old_inames_set & inames)))
+        else:
+            return inames
+
+    new_instructions = [
+            insn.copy(forced_iname_deps=fix_iname_set(
+                insn.id, insn.forced_iname_deps))
+            for insn in kernel.instructions]
+
+    # }}}
+
+    return kernel.copy(domains=new_domains, instructions=new_instructions)
 
 # }}}
 
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 8b2fa370ad4214cbe16caec61ff45b684ed0a889..e73e5c33c8703a4cdbb5fc20feca9caabeb077c1 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -27,7 +27,7 @@ import six  # noqa
 from loopy.diagnostic import LoopyError
 
 
-# {{{ instruction processing
+# {{{ find_instructions
 
 def find_instructions(kernel, insn_match):
     from loopy.match import parse_match
@@ -35,6 +35,11 @@ def find_instructions(kernel, insn_match):
     return [insn for insn in kernel.instructions if match(kernel, insn)]
 
 
+# }}}
+
+
+# {{{ map_instructions
+
 def map_instructions(kernel, insn_match, f):
     from loopy.match import parse_match
     match = parse_match(insn_match)
@@ -49,6 +54,10 @@ def map_instructions(kernel, insn_match, f):
 
     return kernel.copy(instructions=new_insns)
 
+# }}}
+
+
+# {{{ set_instruction_priority
 
 def set_instruction_priority(kernel, insn_match, priority):
     """Set the priority of instructions matching *insn_match* to *priority*.
@@ -62,6 +71,10 @@ def set_instruction_priority(kernel, insn_match, priority):
 
     return map_instructions(kernel, insn_match, set_prio)
 
+# }}}
+
+
+# {{{ add_dependency
 
 def add_dependency(kernel, insn_match, dependency):
     """Add the instruction dependency *dependency* to the instructions matched
@@ -87,6 +100,10 @@ def add_dependency(kernel, insn_match, dependency):
 
     return map_instructions(kernel, insn_match, add_dep)
 
+# }}}
+
+
+# {{{ remove_instructions
 
 def remove_instructions(kernel, insn_ids):
     """Return a new kernel with instructions in *insn_ids* removed.
@@ -124,6 +141,10 @@ def remove_instructions(kernel, insn_ids):
     return kernel.copy(
             instructions=new_insns)
 
+# }}}
+
+
+# {{{ replace_instruction_ids
 
 def replace_instruction_ids(kernel, replacements):
     new_insns = []
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index fd6f33efc95d568fbc553c761c64508f42c4832c..0772b3dc02e7f1bbd83ccfabd485497d93b57601 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -239,6 +239,7 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper):
 
 def precompute(kernel, subst_use, sweep_inames=[], within=None,
         storage_axes=None, temporary_name=None, precompute_inames=None,
+        precompute_outer_inames=None,
         storage_axis_to_tag={}, default_tag="l.auto", dtype=None,
         fetch_bounding_box=False,
         temporary_scope=None, temporary_is_local=None,
@@ -307,6 +308,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         tuple, in which case names will be automatically created.
         May also equivalently be a comma-separated string.
 
+    :arg precompute_outer_inames: The inames within which the compute
+        instruction is nested. If *None*, make an educated guess.
+
     :arg compute_insn_id: The ID of the instruction performing the precomputation.
 
     If `storage_axes` is not specified, it defaults to the arrangement
@@ -480,8 +484,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     from loopy.symbolic import SubstitutionRuleExpander
     submap = SubstitutionRuleExpander(kernel.substitutions)
 
-    value_inames = get_dependencies(
-            submap(subst.expression)
+    value_inames = (
+            get_dependencies(submap(subst.expression))
+            - frozenset(subst.arguments)
             ) & kernel.all_inames()
     if value_inames - expanding_usage_arg_deps < extra_storage_axes:
         raise RuntimeError("unreferenced sweep inames specified: "
@@ -728,8 +733,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     assignee = var(temporary_name)
 
     if non1_storage_axis_names:
-        assignee = assignee.index(
-                tuple(var(iname) for iname in non1_storage_axis_names))
+        assignee = assignee[
+                tuple(var(iname) for iname in non1_storage_axis_names)]
 
     # {{{ process substitutions on compute instruction
 
@@ -764,7 +769,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     compute_insn = Assignment(
             id=compute_insn_id,
             assignee=assignee,
-            expression=compute_expression)
+            expression=compute_expression,
+            # forced_iname_deps determined below
+            )
 
     # }}}
 
@@ -784,6 +791,29 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # }}}
 
+    # {{{ determine inames for compute insn
+
+    if precompute_outer_inames is None:
+        from loopy.kernel.tools import guess_iname_deps_based_on_var_use
+        precompute_outer_inames = (
+                    frozenset(non1_storage_axis_names)
+                    | frozenset(
+                        (expanding_usage_arg_deps | value_inames)
+                        - sweep_inames_set)
+                    | guess_iname_deps_based_on_var_use(kernel, compute_insn))
+    else:
+        if not isinstance(precompute_outer_inames, frozenset):
+            raise TypeError("precompute_outer_inames must be a frozenset")
+
+    kernel = kernel.copy(
+            instructions=[
+                insn.copy(forced_iname_deps=precompute_outer_inames)
+                if insn.id == compute_insn_id
+                else insn
+                for insn in kernel.instructions])
+
+    # }}}
+
     # {{{ set up temp variable
 
     import loopy as lp
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 63e70f8e386bfc6ecdddb4b8661e9f975206725e..27e98383ee7211643aecdafebc9bf63988c55e0c 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -391,11 +391,21 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
     for def_id, subst_name in six.iteritems(tts.definition_insn_id_to_subst_name):
         def_insn = kernel.id_to_insn[def_id]
 
-        (_, indices), = def_insn.assignees_and_indices()
+        from loopy.kernel.data import Assignment
+        assert isinstance(def_insn, Assignment)
+
+        from pymbolic.primitives import Variable, Subscript
+        if isinstance(def_insn.assignee, Subscript):
+            indices = def_insn.assignee.index_tuple
+        elif isinstance(def_insn.assignee, Variable):
+            indices = ()
+        else:
+            raise LoopyError(
+                    "Unrecognized LHS type: %s"
+                    % type(def_insn.assignee).__name__)
 
         arguments = []
 
-        from pymbolic.primitives import Variable
         for i in indices:
             if not isinstance(i, Variable):
                 raise LoopyError("In defining instruction '%s': "
@@ -460,6 +470,9 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
 # {{{ expand_subst
 
 def expand_subst(kernel, within=None):
+    if not kernel.substitutions:
+        return kernel
+
     logger.debug("%s: expand subst" % kernel.name)
 
     from loopy.symbolic import RuleAwareSubstitutionRuleExpander
diff --git a/loopy/version.py b/loopy/version.py
index cfaa4b9a01d4eeb57b4cdf1b64a114e3b8c3bb33..0fcb9c04d1439378a56f4f47cfd6b1a653a1b5c1 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v33-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v36-islpy%s" % _islpy_version
diff --git a/test/test_apps.py b/test/test_apps.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dfc6dc0c5edc22e8ccc8fdb791f4f7013901de5
--- /dev/null
+++ b/test/test_apps.py
@@ -0,0 +1,416 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pyopencl.clrandom  # noqa
+import pytest
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+__all__ = [
+        "pytest_generate_tests",
+        "cl"  # 'cl.create_some_context'
+        ]
+
+
+# {{{ convolutions
+
+def test_convolution(ctx_factory):
+    ctx = ctx_factory()
+
+    dtype = np.float32
+
+    knl = lp.make_kernel(
+        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
+                -f_w <= f_x,f_y <= f_w \
+                and 0 <= im_x < im_w and 0 <= im_y < im_h \
+                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
+                }",
+        """
+        out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
+            img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
+            * f[ifeat, f_w+f_x, f_w+f_y, icolor])
+        """,
+        [
+            lp.GlobalArg("f", dtype, shape=lp.auto),
+            lp.GlobalArg("img", dtype, shape=lp.auto),
+            lp.GlobalArg("out", dtype, shape=lp.auto),
+            "..."
+            ],
+        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
+        options="annotate_inames")
+
+    f_w = 3
+
+    knl = lp.fix_parameters(knl, f_w=f_w, ncolors=3)
+
+    ref_knl = knl
+
+    def variant_0(knl):
+        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
+        knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
+        return knl
+
+    def variant_1(knl):
+        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
+        knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
+        return knl
+
+    def variant_2(knl):
+        knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
+        knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.tag_inames(knl, dict(ifeat="g.2"))
+        knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
+        knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")
+        return knl
+
+    for variant in [
+            #variant_0,
+            #variant_1,
+            variant_2
+            ]:
+        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
+                parameters=dict(
+                    im_w=128, im_h=128, f_w=f_w,
+                    nfeats=3, nimgs=3
+                    ))
+
+
+def test_convolution_with_nonzero_base(ctx_factory):
+    # This is kept alive as a test for domains that don't start at zero.
+    # These are a bad idea for split_iname, which places its origin at zero
+    # and therefore produces a first block that is odd-sized.
+    #
+    # Therefore, for real tests, check test_convolution further up.
+
+    ctx = ctx_factory()
+
+    dtype = np.float32
+
+    knl = lp.make_kernel(
+        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
+                -f_w <= f_x,f_y <= f_w \
+                and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \
+                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
+                }",
+        """
+        out[iimg, ifeat, im_x-f_w, im_y-f_w] = sum((f_x, f_y, icolor), \
+            img[iimg, im_x-f_x, im_y-f_y, icolor] \
+            * f[ifeat, f_w+f_x, f_w+f_y, icolor])
+        """,
+        [
+            lp.GlobalArg("f", dtype, shape=lp.auto),
+            lp.GlobalArg("img", dtype, shape=lp.auto),
+            lp.GlobalArg("out", dtype, shape=lp.auto),
+            "..."
+            ],
+        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
+        options="annotate_inames")
+
+    knl = lp.fix_parameters(knl, ncolors=3)
+
+    ref_knl = knl
+
+    f_w = 3
+
+    def variant_0(knl):
+        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
+        knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
+        return knl
+
+    def variant_1(knl):
+        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
+        knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
+        return knl
+
+    for variant in [
+            variant_0,
+            variant_1,
+            ]:
+        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
+                parameters=dict(
+                    im_w=128, im_h=128, f_w=f_w,
+                    nfeats=12, nimgs=17
+                    ))
+
+# }}}
+
+
+def test_rob_stroud_bernstein(ctx_factory):
+    ctx = ctx_factory()
+
+    # NOTE: tmp would have to be zero-filled beforehand
+
+    knl = lp.make_kernel(
+            "{[el, i2, alpha1,alpha2]: \
+                    0 <= el < nels and \
+                    0 <= i2 < nqp1d and \
+                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
+            """
+            for el,i2
+                <> xi = qpts[1, i2]
+                <> s = 1-xi
+                <> r = xi/s
+                <> aind = 0 {id=aind_init}
+
+                for alpha1
+                    <> w = s**(deg-alpha1) {id=init_w}
+
+                    for alpha2
+                        tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
+                                {id=write_tmp}
+                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
+                                {id=update_w,dep=init_w:write_tmp}
+                        aind = aind + 1 \
+                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
+                    end
+                end
+            end
+            """,
+            [
+                # Must declare coeffs to have "no" shape, to keep loopy
+                # from trying to figure out the shape automatically.
+
+                lp.GlobalArg("coeffs", None, shape=None),
+                "..."
+                ],
+            assumptions="deg>=0 and nels>=1"
+            )
+
+    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
+    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
+    knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
+            slabs=(0, 1))
+    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
+
+    print(lp.CompiledKernel(ctx, knl).get_highlighted_code(
+            dict(
+                qpts=np.float32,
+                coeffs=np.float32,
+                tmp=np.float32,
+                )))
+
+
+def test_rob_stroud_bernstein_full(ctx_factory):
+    #logging.basicConfig(level=logging.DEBUG)
+    ctx = ctx_factory()
+
+    # NOTE: result would have to be zero-filled beforehand
+
+    knl = lp.make_kernel(
+        "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
+                0 <= el < nels and \
+                0 <= i2 < nqp1d and \
+                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
+                \
+                0 <= i1_2 < nqp1d and \
+                0 <= alpha1_2 <= deg and \
+                0 <= i2_2 < nqp1d \
+                }",
+        """
+        for el
+            for i2
+                <> xi = qpts[1, i2]
+                <> s = 1-xi
+                <> r = xi/s
+                <> aind = 0 {id=aind_init}
+
+                for alpha1
+                    <> w = s**(deg-alpha1) {id=init_w}
+
+                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
+                            {id=write_tmp}
+                    for alpha2
+                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
+                            {id=update_w,dep=init_w:write_tmp}
+                        aind = aind + 1 \
+                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
+                    end
+                end
+            end
+
+            for i1_2
+                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
+                <> s2 = 1-xi2
+                <> r2 = xi2/s2
+                <> w2 = s2**deg
+
+                for alpha1_2
+                    for i2_2
+                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
+                                w2 * tmp[alpha1_2, i2_2]
+                    end
+
+                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)
+                end
+            end
+        end
+        """,
+        [
+            # Must declare coeffs to have "no" shape, to keep loopy
+            # from trying to figure out the shape automatically.
+
+            lp.GlobalArg("coeffs", None, shape=None),
+            "..."
+            ],
+        assumptions="deg>=0 and nels>=1"
+        )
+
+    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
+
+    if 0:
+        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
+        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
+                slabs=(0, 1))
+        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
+
+    from pickle import dumps, loads
+    knl = loads(dumps(knl))
+
+    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(
+            dict(
+                qpts=np.float32,
+                tmp=np.float32,
+                coeffs=np.float32,
+                result=np.float32,
+                ))
+    print(knl)
+
+
+def test_stencil(ctx_factory):
+    ctx = ctx_factory()
+
+    # n=32 causes corner case behavior in size calculations for temporary (a
+    # non-unifiable, two-constant-segments PwAff as the base index)
+
+    n = 256
+    knl = lp.make_kernel(
+            "{[i,j]: 0<= i,j < %d}" % n,
+            [
+                "a_offset(ii, jj) := a[ii+1, jj+1]",
+                "z[i,j] = -2*a_offset(i,j)"
+                " + a_offset(i,j-1)"
+                " + a_offset(i,j+1)"
+                " + a_offset(i-1,j)"
+                " + a_offset(i+1,j)"
+                ],
+            [
+                lp.GlobalArg("a", np.float32, shape=(n+2, n+2,)),
+                lp.GlobalArg("z", np.float32, shape=(n+2, n+2,))
+                ])
+
+    ref_knl = knl
+
+    def variant_1(knl):
+        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
+        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"])
+        knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
+        return knl
+
+    def variant_2(knl):
+        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
+        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
+        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
+                fetch_bounding_box=True)
+        knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
+        return knl
+
+    for variant in [
+            #variant_1,
+            variant_2,
+            ]:
+        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
+                print_ref_code=False,
+                op_count=[n*n], op_label=["cells"])
+
+
+def test_stencil_with_overfetch(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<= i,j < n}",
+            [
+                "a_offset(ii, jj) := a[ii+2, jj+2]",
+                "z[i,j] = -2*a_offset(i,j)"
+                " + a_offset(i,j-1)"
+                " + a_offset(i,j+1)"
+                " + a_offset(i-1,j)"
+                " + a_offset(i+1,j)"
+
+                " + a_offset(i,j-2)"
+                " + a_offset(i,j+2)"
+                " + a_offset(i-2,j)"
+                " + a_offset(i+2,j)"
+                ],
+            assumptions="n>=1")
+
+    if ctx.devices[0].platform.name == "Portable Computing Language":
+        # https://github.com/pocl/pocl/issues/205
+        pytest.skip("takes very long to compile on pocl")
+
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+
+    ref_knl = knl
+
+    def variant_overfetch(knl):
+        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1",
+                slabs=(1, 1))
+        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0",
+               slabs=(1, 1))
+        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
+                fetch_bounding_box=True)
+        knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
+        return knl
+
+    for variant in [variant_overfetch]:
+        n = 200
+        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
+                print_ref_code=False,
+                op_count=[n*n], parameters=dict(n=n), op_label=["cells"])
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
diff --git a/test/test_dg.py b/test/test_dg.py
index fafef86c35211183ebdaeb75acf2b664a36586a0..d65c68ed4c729582d511064d6495535efcf7a9a4 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -72,8 +72,9 @@ def test_dg_volume(ctx_factory):
                     order=order),
                 lp.ValueArg("K", np.int32, approximately=1000),
                 ],
-            name="dg_volume", assumptions="K>=1",
-            defines=dict(Np=Np))
+            name="dg_volume", assumptions="K>=1")
+
+    knl = lp.fix_parameters(knl, Np=Np)
 
     seq_knl = knl
 
@@ -218,9 +219,10 @@ def no_test_dg_surface(ctx_factory):
                 lp.GlobalArg("LIFT", dtype, shape="Np, NfpNfaces", order="C"),
                 lp.ValueArg("K", np.int32, approximately=1000),
                 ],
-            name="dg_surface", assumptions="K>=1",
-            defines=dict(Np=Np, Nfp=Nfp, NfpNfaces=Nfaces*Nfp, nsurf_dofs=K*Nfp),
-            )
+            name="dg_surface", assumptions="K>=1")
+
+    knl = lp.fix_parameters(knl,
+            Np=Np, Nfp=Nfp, NfpNfaces=Nfaces*Nfp, nsurf_dofs=K*Nfp)
 
     seq_knl = knl
 
diff --git a/test/test_domain.py b/test/test_domain.py
new file mode 100644
index 0000000000000000000000000000000000000000..0631a08796eeda73376afcd1cd03a737a33241f6
--- /dev/null
+++ b/test/test_domain.py
@@ -0,0 +1,375 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+from six.moves import range
+
+import sys
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pyopencl.clrandom  # noqa
+import pytest
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+__all__ = [
+        "pytest_generate_tests",
+        "cl"  # 'cl.create_some_context'
+        ]
+
+
+def test_assume(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            "a[i] = a[i] + 1",
+            [lp.GlobalArg("a", np.float32, shape="n"), "..."])
+
+    knl = lp.split_iname(knl, "i", 16)
+    knl = lp.set_loop_priority(knl, "i_outer,i_inner")
+    knl = lp.assume(knl, "n mod 16 = 0")
+    knl = lp.assume(knl, "n > 10")
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+    kernel_gen = lp.generate_loop_schedules(knl)
+
+    for gen_knl in kernel_gen:
+        print(gen_knl)
+        compiled = lp.CompiledKernel(ctx, gen_knl)
+        print(compiled.get_code())
+        assert "if" not in compiled.get_code()
+
+
+def test_divisibility_assumption(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "[n] -> {[i]: 0<=i<n}",
+            [
+                "b[i] = 2*a[i]"
+                ],
+            [
+                lp.GlobalArg("a", np.float32, shape=("n",)),
+                lp.GlobalArg("b", np.float32, shape=("n",)),
+                lp.ValueArg("n", np.int32),
+                ],
+            assumptions="n>=1 and (exists zz: n = 16*zz)")
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 16)
+
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+    for k in lp.generate_loop_schedules(knl):
+        code = lp.generate_code(k)
+        assert "if" not in code
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl,
+            parameters={"n": 16**3})
+
+
+def test_eq_constraint(ctx_factory):
+    logging.basicConfig(level=logging.INFO)
+
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<= i,j < 32}",
+            [
+                "a[i] = b[i]"
+                ],
+            [
+                lp.GlobalArg("a", np.float32, shape=(1000,)),
+                lp.GlobalArg("b", np.float32, shape=(1000,))
+                ])
+
+    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
+    knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")
+
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+    kernel_gen = lp.generate_loop_schedules(knl)
+
+    for knl in kernel_gen:
+        print(lp.generate_code(knl))
+
+
+def test_dependent_loop_bounds(ctx_factory):
+    dtype = np.dtype(np.float32)
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            [
+                "{[i]: 0<=i<n}",
+                "{[jj]: 0<=jj<row_len}",
+                ],
+            [
+                "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]",
+                "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])",
+                ],
+            [
+                lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
+                lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
+                lp.GlobalArg("a_values", dtype),
+                lp.GlobalArg("a_sum", dtype, shape=lp.auto),
+                lp.ValueArg("n", np.int32),
+                ],
+            assumptions="n>=1 and row_len>=1")
+
+    cknl = lp.CompiledKernel(ctx, knl)
+    print("---------------------------------------------------")
+    print(cknl.get_highlighted_code())
+    print("---------------------------------------------------")
+
+
+def test_dependent_loop_bounds_2(ctx_factory):
+    dtype = np.dtype(np.float32)
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            [
+                "{[i]: 0<=i<n}",
+                "{[jj]: 0<=jj<row_len}",
+                ],
+            [
+                "<> row_start = a_rowstarts[i]",
+                "<> row_len = a_rowstarts[i+1] - row_start",
+                "ax[i] = sum(jj, a_values[[row_start+jj]])",
+                ],
+            [
+                lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
+                lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
+                lp.GlobalArg("a_values", dtype, strides=(1,)),
+                lp.GlobalArg("ax", dtype, shape=lp.auto),
+                lp.ValueArg("n", np.int32),
+                ],
+            assumptions="n>=1 and row_len>=1")
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
+            inner_tag="l.0")
+    cknl = lp.CompiledKernel(ctx, knl)
+    print("---------------------------------------------------")
+    print(cknl.get_highlighted_code())
+    print("---------------------------------------------------")
+
+
+def test_dependent_loop_bounds_3(ctx_factory):
+    # The point of this test is that it shows a dependency between
+    # domains that is exclusively mediated by the row_len temporary.
+    # It also makes sure that row_len gets read before any
+    # conditionals use it.
+
+    dtype = np.dtype(np.float32)
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            [
+                "{[i]: 0<=i<n}",
+                "{[jj]: 0<=jj<row_len}",
+                ],
+            [
+                "<> row_len = a_row_lengths[i]",
+                "a[i,jj] = 1",
+                ],
+            [
+                lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
+                lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
+                lp.ValueArg("n", np.int32),
+                ])
+
+    assert knl.parents_per_domain()[1] == 0
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
+            inner_tag="l.0")
+
+    cknl = lp.CompiledKernel(ctx, knl)
+    print("---------------------------------------------------")
+    print(cknl.get_highlighted_code())
+    print("---------------------------------------------------")
+
+    knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1",
+            inner_tag="l.1")
+
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+
+    import pytest
+    with pytest.raises(RuntimeError):
+        list(lp.generate_loop_schedules(knl_bad))
+
+
+def test_independent_multi_domain(ctx_factory):
+    dtype = np.dtype(np.float32)
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            [
+                "{[i]: 0<=i<n}",
+                "{[j]: 0<=j<n}",
+                ],
+            [
+                "a[i] = 1",
+                "b[j] = 2",
+                ],
+            [
+                lp.GlobalArg("a", dtype, shape=("n"), order="C"),
+                lp.GlobalArg("b", dtype, shape=("n"), order="C"),
+                lp.ValueArg("n", np.int32),
+                ])
+
+    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0",
+            inner_tag="l.0")
+    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0",
+            inner_tag="l.0")
+    assert knl.parents_per_domain() == 2*[None]
+
+    n = 50
+    cknl = lp.CompiledKernel(ctx, knl)
+    evt, (a, b) = cknl(queue, n=n, out_host=True)
+
+    assert a.shape == (50,)
+    assert b.shape == (50,)
+    assert (a == 1).all()
+    assert (b == 2).all()
+
+
+def test_equality_constraints(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+
+    order = "C"
+
+    n = 10
+
+    knl = lp.make_kernel([
+            "[n] -> {[i,j]: 0<=i,j<n }",
+            "{[k]: k =i+5 and k < n}",
+            ],
+            [
+                "a[i,j] = 5 {id=set_all}",
+                "b[i,k] = 22 {dep=set_all}",
+                ],
+            [
+                lp.GlobalArg("a,b", dtype, shape="n, n", order=order),
+                lp.ValueArg("n", np.int32, approximately=1000),
+                ],
+            name="equality_constraints", assumptions="n>=1")
+
+    seq_knl = knl
+
+    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
+    #print(knl)
+    #print(knl.domains[0].detect_equalities())
+
+    lp.auto_test_vs_ref(seq_knl, ctx, knl,
+            parameters=dict(n=n), print_ref_code=True)
+
+
+def test_stride(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+
+    order = "C"
+
+    n = 10
+
+    knl = lp.make_kernel([
+            "{[i]: 0<=i<n and (exists l: i = 2*l)}",
+            ],
+            [
+                "a[i] = 5",
+                ],
+            [
+                lp.GlobalArg("a", dtype, shape="n", order=order),
+                lp.ValueArg("n", np.int32, approximately=1000),
+                ],
+            assumptions="n>=1")
+
+    seq_knl = knl
+
+    lp.auto_test_vs_ref(seq_knl, ctx, knl,
+            parameters=dict(n=n))
+
+
+def test_domain_dependency_via_existentially_quantified_variable(ctx_factory):
+    dtype = np.float32
+    ctx = ctx_factory()
+
+    order = "C"
+
+    n = 10
+
+    knl = lp.make_kernel([
+            "{[i]: 0<=i<n }",
+            "{[k]: k=i and (exists l: k = 2*l) }",
+            ],
+            [
+                "a[i] = 5 {id=set}",
+                "b[k] = 6 {dep=set}",
+                ],
+            [
+                lp.GlobalArg("a,b", dtype, shape="n", order=order),
+                lp.ValueArg("n", np.int32, approximately=1000),
+                ],
+            assumptions="n>=1")
+
+    seq_knl = knl
+
+    lp.auto_test_vs_ref(seq_knl, ctx, knl,
+            parameters=dict(n=n))
+
+
+def test_triangle_domain(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i,j<n and i <= j}",
+            "a[i,j] = 17",
+            assumptions="n>=1")
+
+    print(knl)
+    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 0e0b59089fe1d3a1c1310bd0834a29ca751b8df0..b7a1e059cb4499fe136f962b5d5fdcd539fc05f1 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -285,7 +285,7 @@ def test_rank_one(ctx_factory):
     def variant_1(knl):
         knl = lp.add_prefetch(knl, "a")
         knl = lp.add_prefetch(knl, "b")
-        knl = knl.set_loop_priority(knl, ["i", "j"])
+        knl = lp.set_loop_priority(knl, ["i", "j"])
         return knl
 
     def variant_2(knl):
@@ -294,7 +294,6 @@ def test_rank_one(ctx_factory):
         knl = lp.split_iname(knl, "j", 16,
                 outer_tag="g.1", inner_tag="l.1")
 
-        knl = knl.set_loop_priority(knl, ["i", "j"])
         knl = lp.add_prefetch(knl, "a")
         knl = lp.add_prefetch(knl, "b")
         return knl
@@ -331,8 +330,7 @@ def test_rank_one(ctx_factory):
 
     seq_knl = knl
 
-    #for variant in [variant_1, variant_2, variant_3, variant_4]:
-    for variant in [variant_4]:
+    for variant in [variant_1, variant_2, variant_3, variant_4]:
         lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                 op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"],
                 parameters={"n": n})
@@ -604,7 +602,7 @@ def test_small_batched_matvec(ctx_factory):
     Np = 36  # noqa
 
     knl = lp.make_kernel(
-            "[K] -> {[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
+            "{[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
             [
                 "result[k, i] = sum(j, d[i, j]*f[k, j])"
                 ],
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 739a99b8bcd43d8940e8ccbb5ce460d1c6c2b44e..26f4f2faf3b83a24932cad0ac3941696ed3e2636 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -79,23 +79,6 @@ def test_complicated_subst(ctx_factory):
         assert substs_with_letter == how_many
 
 
-def test_extract_subst(ctx_factory):
-    knl = lp.make_kernel(
-            "{[i]: 0<=i<n}",
-            """
-                a[i] = 23*b[i]**2 + 25*b[i]**2
-                """)
-
-    knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", "alpha")
-
-    print(knl)
-
-    from loopy.symbolic import parse
-
-    insn, = knl.instructions
-    assert insn.expression == parse("bsquare(23) + bsquare(25)")
-
-
 def test_type_inference_no_artificial_doubles(ctx_factory):
     ctx = ctx_factory()
 
@@ -142,28 +125,6 @@ def test_sized_and_complex_literals(ctx_factory):
     lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5))
 
 
-def test_assume(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i]: 0<=i<n}",
-            "a[i] = a[i] + 1",
-            [lp.GlobalArg("a", np.float32, shape="n"), "..."])
-
-    knl = lp.split_iname(knl, "i", 16)
-    knl = lp.set_loop_priority(knl, "i_outer,i_inner")
-    knl = lp.assume(knl, "n mod 16 = 0")
-    knl = lp.assume(knl, "n > 10")
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        print(gen_knl)
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
-        assert "if" not in compiled.get_code()
-
-
 def test_simple_side_effect(ctx_factory):
     ctx = ctx_factory()
 
@@ -184,22 +145,6 @@ def test_simple_side_effect(ctx_factory):
         print(compiled.get_code())
 
 
-def test_nonsense_reduction(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i]: 0<=i<100}",
-            """
-                a[i] = sum(i, 2)
-                """,
-            [lp.GlobalArg("a", np.float32, shape=(100,))]
-            )
-
-    import pytest
-    with pytest.raises(RuntimeError):
-        knl = lp.preprocess_kernel(knl, ctx.devices[0])
-
-
 def test_owed_barriers(ctx_factory):
     ctx = ctx_factory()
 
@@ -243,56 +188,6 @@ def test_wg_too_small(ctx_factory):
             lp.CompiledKernel(ctx, gen_knl).get_code()
 
 
-def test_join_inames(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<=i,j<16}",
-            [
-                "b[i,j] = 2*a[i,j]"
-                ],
-            [
-                lp.GlobalArg("a", np.float32, shape=(16, 16,)),
-                lp.GlobalArg("b", np.float32, shape=(16, 16,))
-                ],
-            )
-
-    ref_knl = knl
-
-    knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"])
-    knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"])
-
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True)
-
-
-def test_divisibility_assumption(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "[n] -> {[i]: 0<=i<n}",
-            [
-                "b[i] = 2*a[i]"
-                ],
-            [
-                lp.GlobalArg("a", np.float32, shape=("n",)),
-                lp.GlobalArg("b", np.float32, shape=("n",)),
-                lp.ValueArg("n", np.int32),
-                ],
-            assumptions="n>=1 and (exists zz: n = 16*zz)")
-
-    ref_knl = knl
-
-    knl = lp.split_iname(knl, "i", 16)
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    for k in lp.generate_loop_schedules(knl):
-        code = lp.generate_code(k)
-        assert "if" not in code
-
-    lp.auto_test_vs_ref(ref_knl, ctx, knl,
-            parameters={"n": 16**3})
-
-
 def test_multi_cse(ctx_factory):
     ctx = ctx_factory()
 
@@ -315,149 +210,6 @@ def test_multi_cse(ctx_factory):
         print(compiled.get_code())
 
 
-def test_stencil(ctx_factory):
-    ctx = ctx_factory()
-
-    # n=32 causes corner case behavior in size calculations for temprorary (a
-    # non-unifiable, two-constant-segments PwAff as the base index)
-
-    n = 256
-    knl = lp.make_kernel(
-            "{[i,j]: 0<= i,j < %d}" % n,
-            [
-                "a_offset(ii, jj) := a[ii+1, jj+1]",
-                "z[i,j] = -2*a_offset(i,j)"
-                " + a_offset(i,j-1)"
-                " + a_offset(i,j+1)"
-                " + a_offset(i-1,j)"
-                " + a_offset(i+1,j)"
-                ],
-            [
-                lp.GlobalArg("a", np.float32, shape=(n+2, n+2,)),
-                lp.GlobalArg("z", np.float32, shape=(n+2, n+2,))
-                ])
-
-    ref_knl = knl
-
-    def variant_1(knl):
-        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
-        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
-        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"])
-        knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
-        return knl
-
-    def variant_2(knl):
-        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
-        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
-        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
-                fetch_bounding_box=True)
-        knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
-        return knl
-
-    for variant in [
-            #variant_1,
-            variant_2,
-            ]:
-        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
-                print_ref_code=False,
-                op_count=[n*n], op_label=["cells"])
-
-
-def test_stencil_with_overfetch(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<= i,j < n}",
-            [
-                "a_offset(ii, jj) := a[ii+2, jj+2]",
-                "z[i,j] = -2*a_offset(i,j)"
-                " + a_offset(i,j-1)"
-                " + a_offset(i,j+1)"
-                " + a_offset(i-1,j)"
-                " + a_offset(i+1,j)"
-
-                " + a_offset(i,j-2)"
-                " + a_offset(i,j+2)"
-                " + a_offset(i-2,j)"
-                " + a_offset(i+2,j)"
-                ],
-            assumptions="n>=1")
-
-    if ctx.devices[0].platform.name == "Portable Computing Language":
-        # https://github.com/pocl/pocl/issues/205
-        pytest.skip("takes very long to compile on pocl")
-
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
-
-    ref_knl = knl
-
-    def variant_overfetch(knl):
-        knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1",
-                slabs=(1, 1))
-        knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0",
-               slabs=(1, 1))
-        knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
-                fetch_bounding_box=True)
-        knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
-        return knl
-
-    for variant in [variant_overfetch]:
-        n = 200
-        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
-                print_ref_code=False,
-                op_count=[n*n], parameters=dict(n=n), op_label=["cells"])
-
-
-def test_eq_constraint(ctx_factory):
-    logging.basicConfig(level=logging.INFO)
-
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<= i,j < 32}",
-            [
-                "a[i] = b[i]"
-                ],
-            [
-                lp.GlobalArg("a", np.float32, shape=(1000,)),
-                lp.GlobalArg("b", np.float32, shape=(1000,))
-                ])
-
-    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
-    knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for knl in kernel_gen:
-        print(lp.generate_code(knl))
-
-
-def test_argmax(ctx_factory):
-    logging.basicConfig(level=logging.INFO)
-
-    dtype = np.dtype(np.float32)
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    n = 10000
-
-    knl = lp.make_kernel(
-            "{[i]: 0<=i<%d}" % n,
-            """
-            max_val, max_idx = argmax(i, fabs(a[i]))
-            """)
-
-    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
-    print(lp.preprocess_kernel(knl))
-    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
-
-    a = np.random.randn(10000).astype(dtype)
-    evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
-    assert max_val == np.max(np.abs(a))
-    assert max_idx == np.where(np.abs(a) == max_val)[-1]
-
-
 # {{{ code generator fuzzing
 
 def make_random_value():
@@ -571,552 +323,148 @@ def test_fuzz_code_generator(ctx_factory):
 # }}}
 
 
-def test_empty_reduction(ctx_factory):
+def test_bare_data_dependency(ctx_factory):
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel(
             [
-                "{[i]: 0<=i<20}",
-                "[i] -> {[j]: 0<=j<0}"
+                "[znirp] -> {[i]: 0<=i<znirp}",
                 ],
             [
-                "a[i] = sum(j, j)",
+                "<> znirp = n",
+                "a[i] = 1",
                 ],
             [
-                lp.GlobalArg("a", dtype, (20,)),
+                lp.GlobalArg("a", dtype, shape=("n"), order="C"),
+                lp.ValueArg("n", np.int32),
                 ])
+
     cknl = lp.CompiledKernel(ctx, knl)
+    n = 20000
+    evt, (a,) = cknl(queue, n=n, out_host=True)
 
-    evt, (a,) = cknl(queue)
+    assert a.shape == (n,)
+    assert (a == 1).all()
 
-    assert (a.get() == 0).all()
 
+# {{{ test race detection
 
-def test_nested_dependent_reduction(ctx_factory):
-    dtype = np.dtype(np.int32)
+@pytest.mark.skipif("sys.version_info < (2,6)")
+def test_ilp_write_race_detection_global(ctx_factory):
     ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel(
+            "[n] -> {[i,j]: 0<=i,j<n }",
             [
-                "{[i]: 0<=i<n}",
-                "{[j]: 0<=j<i+sumlen}"
+                "a[i] = 5+i+j",
                 ],
             [
-                "<> sumlen = l[i]",
-                "a[i] = sum(j, j)",
+                lp.GlobalArg("a", np.float32),
+                lp.ValueArg("n", np.int32, approximately=1000),
                 ],
-            [
-                lp.ValueArg("n", np.int32),
-                lp.GlobalArg("a", dtype, ("n",)),
-                lp.GlobalArg("l", np.int32, ("n",)),
-                ])
+            assumptions="n>=1")
 
-    cknl = lp.CompiledKernel(ctx, knl)
+    knl = lp.tag_inames(knl, dict(j="ilp"))
 
-    n = 330
-    l = np.arange(n, dtype=np.int32)
-    evt, (a,) = cknl(queue, l=l, n=n, out_host=True)
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+
+    with lp.CacheMode(False):
+        from loopy.diagnostic import WriteRaceConditionWarning
+        from warnings import catch_warnings
+        with catch_warnings(record=True) as warn_list:
+            list(lp.generate_loop_schedules(knl))
 
-    tgt_result = (2*l-1)*2*l/2
-    assert (a == tgt_result).all()
+            assert any(isinstance(w.message, WriteRaceConditionWarning)
+                    for w in warn_list)
 
 
-def test_multi_nested_dependent_reduction(ctx_factory):
-    dtype = np.dtype(np.int32)
+def test_ilp_write_race_avoidance_local(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
+            "{[i,j]: 0<=i<16 and 0<=j<17 }",
             [
-                "{[itgt]: 0 <= itgt < ntgts}",
-                "{[isrc_box]: 0 <= isrc_box < nboxes}",
-                "{[isrc]: 0 <= isrc < npart}"
-                ],
-            [
-                "<> npart = nparticles_per_box[isrc_box]",
-                "a[itgt] = sum((isrc_box, isrc), 1)",
-                ],
-            [
-                lp.ValueArg("n", np.int32),
-                lp.GlobalArg("a", dtype, ("n",)),
-                lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)),
-                lp.ValueArg("ntgts", np.int32),
-                lp.ValueArg("nboxes", np.int32),
+                "<> a[i] = 5+i+j",
                 ],
-            assumptions="ntgts>=1")
+            [])
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    print(cknl.get_code())
-    # FIXME: Actually test functionality.
+    knl = lp.tag_inames(knl, dict(i="l.0", j="ilp"))
+
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+    for k in lp.generate_loop_schedules(knl):
+        assert k.temporary_variables["a"].shape == (16, 17)
 
 
-def test_recursive_nested_dependent_reduction(ctx_factory):
-    dtype = np.dtype(np.int32)
+def test_ilp_write_race_avoidance_private(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
+            "{[j]: 0<=j<16 }",
             [
-                "{[itgt]: 0 <= itgt < ntgts}",
-                "{[isrc_box]: 0 <= isrc_box < nboxes}",
-                "{[isrc]: 0 <= isrc < npart}"
-                ],
-            [
-                "<> npart = nparticles_per_box[isrc_box]",
-                "<> boxsum = sum(isrc, isrc+isrc_box+itgt)",
-                "a[itgt] = sum(isrc_box, boxsum)",
-                ],
-            [
-                lp.ValueArg("n", np.int32),
-                lp.GlobalArg("a", dtype, ("n",)),
-                lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)),
-                lp.ValueArg("ntgts", np.int32),
-                lp.ValueArg("nboxes", np.int32),
+                "<> a = 5+j",
                 ],
-            assumptions="ntgts>=1")
-
-    cknl = lp.CompiledKernel(ctx, knl)
-    print(cknl.get_code())
-    # FIXME: Actually test functionality.
-
+            [])
 
-def test_dependent_loop_bounds(ctx_factory):
-    dtype = np.dtype(np.float32)
-    ctx = ctx_factory()
+    knl = lp.tag_inames(knl, dict(j="ilp"))
 
-    knl = lp.make_kernel(
-            [
-                "{[i]: 0<=i<n}",
-                "{[jj]: 0<=jj<row_len}",
-                ],
-            [
-                "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]",
-                "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])",
-                ],
-            [
-                lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
-                lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
-                lp.GlobalArg("a_values", dtype),
-                lp.GlobalArg("a_sum", dtype, shape=lp.auto),
-                lp.ValueArg("n", np.int32),
-                ],
-            assumptions="n>=1 and row_len>=1")
+    knl = lp.preprocess_kernel(knl, ctx.devices[0])
+    for k in lp.generate_loop_schedules(knl):
+        assert k.temporary_variables["a"].shape == (16,)
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
+# }}}
 
 
-def test_dependent_loop_bounds_2(ctx_factory):
-    dtype = np.dtype(np.float32)
+def test_write_parameter(ctx_factory):
+    dtype = np.float32
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
+            "{[i,j]: 0<=i,j<n }",
+            """
+                a = sum((i,j), i*j)
+                b = sum(i, sum(j, i*j))
+                n = 15
+                """,
             [
-                "{[i]: 0<=i<n}",
-                "{[jj]: 0<=jj<row_len}",
-                ],
-            [
-                "<> row_start = a_rowstarts[i]",
-                "<> row_len = a_rowstarts[i+1] - row_start",
-                "ax[i] = sum(jj, a_values[[row_start+jj]])",
-                ],
-            [
-                lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
-                lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
-                lp.GlobalArg("a_values", dtype, strides=(1,)),
-                lp.GlobalArg("ax", dtype, shape=lp.auto),
-                lp.ValueArg("n", np.int32),
+                lp.GlobalArg("a", dtype, shape=()),
+                lp.GlobalArg("b", dtype, shape=()),
+                lp.ValueArg("n", np.int32, approximately=1000),
                 ],
-            assumptions="n>=1 and row_len>=1")
+            assumptions="n>=1")
 
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
-            inner_tag="l.0")
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
+    import pytest
+    with pytest.raises(RuntimeError):
+        lp.CompiledKernel(ctx, knl).get_code()
 
 
-def test_dependent_loop_bounds_3(ctx_factory):
-    # The point of this test is that it shows a dependency between
-    # domains that is exclusively mediated by the row_len temporary.
-    # It also makes sure that row_len gets read before any
-    # conditionals use it.
+# {{{ arg guessing
 
-    dtype = np.dtype(np.float32)
+def test_arg_shape_guessing(ctx_factory):
     ctx = ctx_factory()
 
     knl = lp.make_kernel(
+            "{[i,j]: 0<=i,j<n }",
+            """
+                a = 1.5 + sum((i,j), i*j)
+                b[i, j] = i*j
+                c[i+j, j] = b[j,i]
+                """,
             [
-                "{[i]: 0<=i<n}",
-                "{[jj]: 0<=jj<row_len}",
-                ],
-            [
-                "<> row_len = a_row_lengths[i]",
-                "a[i,jj] = 1",
+                lp.GlobalArg("a", shape=lp.auto),
+                lp.GlobalArg("b", shape=lp.auto),
+                lp.GlobalArg("c", shape=lp.auto),
+                lp.ValueArg("n"),
                 ],
-            [
-                lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
-                lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
-                lp.ValueArg("n", np.int32),
-                ])
-
-    assert knl.parents_per_domain()[1] == 0
+            assumptions="n>=1")
 
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
-            inner_tag="l.0")
+    print(knl)
+    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
 
-    knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1",
-            inner_tag="l.1")
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-
-    import pytest
-    with pytest.raises(RuntimeError):
-        list(lp.generate_loop_schedules(knl_bad))
-
-
-def test_independent_multi_domain(ctx_factory):
-    dtype = np.dtype(np.float32)
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    knl = lp.make_kernel(
-            [
-                "{[i]: 0<=i<n}",
-                "{[j]: 0<=j<n}",
-                ],
-            [
-                "a[i] = 1",
-                "b[j] = 2",
-                ],
-            [
-                lp.GlobalArg("a", dtype, shape=("n"), order="C"),
-                lp.GlobalArg("b", dtype, shape=("n"), order="C"),
-                lp.ValueArg("n", np.int32),
-                ])
-
-    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0",
-            inner_tag="l.0")
-    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0",
-            inner_tag="l.0")
-    assert knl.parents_per_domain() == 2*[None]
-
-    n = 50
-    cknl = lp.CompiledKernel(ctx, knl)
-    evt, (a, b) = cknl(queue, n=n, out_host=True)
-
-    assert a.shape == (50,)
-    assert b.shape == (50,)
-    assert (a == 1).all()
-    assert (b == 2).all()
-
-
-def test_bare_data_dependency(ctx_factory):
-    dtype = np.dtype(np.float32)
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    knl = lp.make_kernel(
-            [
-                "[znirp] -> {[i]: 0<=i<znirp}",
-                ],
-            [
-                "<> znirp = n",
-                "a[i] = 1",
-                ],
-            [
-                lp.GlobalArg("a", dtype, shape=("n"), order="C"),
-                lp.ValueArg("n", np.int32),
-                ])
-
-    cknl = lp.CompiledKernel(ctx, knl)
-    n = 20000
-    evt, (a,) = cknl(queue, n=n, out_host=True)
-
-    assert a.shape == (n,)
-    assert (a == 1).all()
-
-
-def test_equality_constraints(ctx_factory):
-    dtype = np.float32
-    ctx = ctx_factory()
-
-    order = "C"
-
-    n = 10
-
-    knl = lp.make_kernel([
-            "[n] -> {[i,j]: 0<=i,j<n }",
-            "{[k]: k =i+5 and k < n}",
-            ],
-            [
-                "a[i,j] = 5 {id=set_all}",
-                "b[i,k] = 22 {dep=set_all}",
-                ],
-            [
-                lp.GlobalArg("a,b", dtype, shape="n, n", order=order),
-                lp.ValueArg("n", np.int32, approximately=1000),
-                ],
-            name="equality_constraints", assumptions="n>=1")
-
-    seq_knl = knl
-
-    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
-    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
-    #print(knl)
-    #print(knl.domains[0].detect_equalities())
-
-    lp.auto_test_vs_ref(seq_knl, ctx, knl,
-            parameters=dict(n=n), print_ref_code=True)
-
-
-def test_stride(ctx_factory):
-    dtype = np.float32
-    ctx = ctx_factory()
-
-    order = "C"
-
-    n = 10
-
-    knl = lp.make_kernel([
-            "{[i]: 0<=i<n and (exists l: i = 2*l)}",
-            ],
-            [
-                "a[i] = 5",
-                ],
-            [
-                lp.GlobalArg("a", dtype, shape="n", order=order),
-                lp.ValueArg("n", np.int32, approximately=1000),
-                ],
-            assumptions="n>=1")
-
-    seq_knl = knl
-
-    lp.auto_test_vs_ref(seq_knl, ctx, knl,
-            parameters=dict(n=n))
-
-
-def test_domain_dependency_via_existentially_quantified_variable(ctx_factory):
-    dtype = np.float32
-    ctx = ctx_factory()
-
-    order = "C"
-
-    n = 10
-
-    knl = lp.make_kernel([
-            "{[i]: 0<=i<n }",
-            "{[k]: k=i and (exists l: k = 2*l) }",
-            ],
-            [
-                "a[i] = 5 {id=set}",
-                "b[k] = 6 {dep=set}",
-                ],
-            [
-                lp.GlobalArg("a,b", dtype, shape="n", order=order),
-                lp.ValueArg("n", np.int32, approximately=1000),
-                ],
-            assumptions="n>=1")
-
-    seq_knl = knl
-
-    lp.auto_test_vs_ref(seq_knl, ctx, knl,
-            parameters=dict(n=n))
-
-
-def test_double_sum(ctx_factory):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    n = 20
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<=i,j<n }",
-            [
-                "a = simul_reduce(sum, (i,j), i*j)",
-                "b = simul_reduce(sum, i, simul_reduce(sum, j, i*j))",
-                ],
-            assumptions="n>=1")
-
-    evt, (a, b) = knl(queue, n=n)
-
-    ref = sum(i*j for i in range(n) for j in range(n))
-    assert a.get() == ref
-    assert b.get() == ref
-
-
-@pytest.mark.parametrize(("op_name", "np_op"), [
-    ("sum", np.sum),
-    ("product", np.prod),
-    ("min", np.min),
-    ("max", np.max),
-    ])
-def test_reduction_library(ctx_factory, op_name, np_op):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<=i<n and 0<=j<m }",
-            [
-                "res[i] = reduce(%s, j, a[i,j])" % op_name,
-                ],
-            assumptions="n>=1")
-
-    a = np.random.randn(20, 10)
-    evt, (res,) = knl(queue, a=a)
-
-    assert np.allclose(res, np_op(a, axis=1))
-
-
-def test_double_sum_made_unique(ctx_factory):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    n = 20
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<=i,j<n }",
-            [
-                "a = sum((i,j), i*j)",
-                "b = sum(i, sum(j, i*j))",
-                ],
-            assumptions="n>=1")
-
-    knl = lp.make_reduction_inames_unique(knl)
-    print(knl)
-
-    evt, (a, b) = knl(queue, n=n)
-
-    ref = sum(i*j for i in range(n) for j in range(n))
-    assert a.get() == ref
-    assert b.get() == ref
-
-
-# {{{ test race detection
-
-@pytest.mark.skipif("sys.version_info < (2,6)")
-def test_ilp_write_race_detection_global(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "[n] -> {[i,j]: 0<=i,j<n }",
-            [
-                "a[i] = 5+i+j",
-                ],
-            [
-                lp.GlobalArg("a", np.float32),
-                lp.ValueArg("n", np.int32, approximately=1000),
-                ],
-            assumptions="n>=1")
-
-    knl = lp.tag_inames(knl, dict(j="ilp"))
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-
-    with lp.CacheMode(False):
-        from loopy.diagnostic import WriteRaceConditionWarning
-        from warnings import catch_warnings
-        with catch_warnings(record=True) as warn_list:
-            list(lp.generate_loop_schedules(knl))
-
-            assert any(isinstance(w.message, WriteRaceConditionWarning)
-                    for w in warn_list)
-
-
-def test_ilp_write_race_avoidance_local(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<=i<16 and 0<=j<17 }",
-            [
-                "<> a[i] = 5+i+j",
-                ],
-            [])
-
-    knl = lp.tag_inames(knl, dict(i="l.0", j="ilp"))
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    for k in lp.generate_loop_schedules(knl):
-        assert k.temporary_variables["a"].shape == (16, 17)
-
-
-def test_ilp_write_race_avoidance_private(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[j]: 0<=j<16 }",
-            [
-                "<> a = 5+j",
-                ],
-            [])
-
-    knl = lp.tag_inames(knl, dict(j="ilp"))
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    for k in lp.generate_loop_schedules(knl):
-        assert k.temporary_variables["a"].shape == (16,)
-
-# }}}
-
-
-def test_write_parameter(ctx_factory):
-    dtype = np.float32
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<=i,j<n }",
-            """
-                a = sum((i,j), i*j)
-                b = sum(i, sum(j, i*j))
-                n = 15
-                """,
-            [
-                lp.GlobalArg("a", dtype, shape=()),
-                lp.GlobalArg("b", dtype, shape=()),
-                lp.ValueArg("n", np.int32, approximately=1000),
-                ],
-            assumptions="n>=1")
-
-    import pytest
-    with pytest.raises(RuntimeError):
-        lp.CompiledKernel(ctx, knl).get_code()
-
-
-# {{{ arg guessing
-
-def test_arg_shape_guessing(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<=i,j<n }",
-            """
-                a = 1.5 + sum((i,j), i*j)
-                b[i, j] = i*j
-                c[i+j, j] = b[j,i]
-                """,
-            [
-                lp.GlobalArg("a", shape=lp.auto),
-                lp.GlobalArg("b", shape=lp.auto),
-                lp.GlobalArg("c", shape=lp.auto),
-                lp.ValueArg("n"),
-                ],
-            assumptions="n>=1")
-
-    print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
-
-
-def test_arg_guessing(ctx_factory):
-    ctx = ctx_factory()
+def test_arg_guessing(ctx_factory):
+    ctx = ctx_factory()
 
     knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n }",
@@ -1169,18 +517,6 @@ def test_nonlinear_index(ctx_factory):
     print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
 
 
-def test_triangle_domain(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i,j]: 0<=i,j<n and i <= j}",
-            "a[i,j] = 17",
-            assumptions="n>=1")
-
-    print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
-
-
 def test_offsets_and_slicing(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -1227,144 +563,21 @@ def test_vector_ilp_with_prefetch(ctx_factory):
             "out[i] = 2*a[i]",
             [
                 # Tests that comma'd arguments interoperate with
-                # argument guessing.
-                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
-                "..."
-                ])
-
-    knl = lp.split_iname(knl, "i", 128, inner_tag="l.0")
-    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
-    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
-
-    cknl = lp.CompiledKernel(ctx, knl)
-    cknl.cl_kernel_info()
-
-    import re
-    code = cknl.get_code()
-    assert len(list(re.finditer("barrier", code))) == 1
-
-
-# {{{ convolutions
-
-def test_convolution(ctx_factory):
-    ctx = ctx_factory()
-
-    dtype = np.float32
-
-    knl = lp.make_kernel(
-        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
-                -f_w <= f_x,f_y <= f_w \
-                and 0 <= im_x < im_w and 0 <= im_y < im_h \
-                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
-                }",
-        """
-        out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
-            img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
-            * f[ifeat, f_w+f_x, f_w+f_y, icolor])
-        """,
-        [
-            lp.GlobalArg("f", dtype, shape=lp.auto),
-            lp.GlobalArg("img", dtype, shape=lp.auto),
-            lp.GlobalArg("out", dtype, shape=lp.auto),
-            "..."
-            ],
-        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
-        flags="annotate_inames",
-        defines=dict(ncolors=3))
-
-    f_w = 3
-
-    knl = lp.fix_parameters(knl, f_w=f_w)
-
-    ref_knl = knl
-
-    def variant_0(knl):
-        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
-        knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
-        return knl
-
-    def variant_1(knl):
-        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
-        knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
-        return knl
-
-    def variant_2(knl):
-        knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
-        knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
-        knl = lp.tag_inames(knl, dict(ifeat="g.2"))
-        knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
-        knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")
-        return knl
-
-    for variant in [
-            variant_0,
-            variant_1,
-            variant_2
-            ]:
-        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
-                parameters=dict(
-                    im_w=128, im_h=128, f_w=f_w,
-                    nfeats=3, nimgs=3
-                    ))
-
-
-def test_convolution_with_nonzero_base(ctx_factory):
-    # This is kept alive as a test for domains that don't start at zero.
-    # These are a bad idea for split_iname, which places its origin at zero
-    # and therefore produces a first block that is odd-sized.
-    #
-    # Therefore, for real tests, check test_convolution further up.
-
-    ctx = ctx_factory()
-
-    dtype = np.float32
-
-    knl = lp.make_kernel(
-        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
-                -f_w <= f_x,f_y <= f_w \
-                and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \
-                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
-                }",
-        """
-        out[iimg, ifeat, im_x-f_w, im_y-f_w] = sum((f_x, f_y, icolor), \
-            img[iimg, im_x-f_x, im_y-f_y, icolor] \
-            * f[ifeat, f_w+f_x, f_w+f_y, icolor])
-        """,
-        [
-            lp.GlobalArg("f", dtype, shape=lp.auto),
-            lp.GlobalArg("img", dtype, shape=lp.auto),
-            lp.GlobalArg("out", dtype, shape=lp.auto),
-            "..."
-            ],
-        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
-        flags="annotate_inames",
-        defines=dict(ncolors=3))
-
-    ref_knl = knl
-
-    f_w = 3
-
-    def variant_0(knl):
-        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
-        knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
-        return knl
+                # argument guessing.
+                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
+                "..."
+                ])
 
-    def variant_1(knl):
-        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
-        knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
-        return knl
+    knl = lp.split_iname(knl, "i", 128, inner_tag="l.0")
+    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
+    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
 
-    for variant in [
-            variant_0,
-            variant_1,
-            ]:
-        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
-                parameters=dict(
-                    im_w=128, im_h=128, f_w=f_w,
-                    nfeats=12, nimgs=17
-                    ))
+    cknl = lp.CompiledKernel(ctx, knl)
+    cknl.cl_kernel_info()
 
-# }}}
+    import re
+    code = cknl.get_code()
+    assert len(list(re.finditer("barrier", code))) == 1
 
 
 def test_c_instruction(ctx_factory):
@@ -1442,21 +655,6 @@ def test_inames_deps_from_write_subscript(ctx_factory):
     assert "i" in knl.insn_inames("myred")
 
 
-def test_split_reduction(ctx_factory):
-    knl = lp.make_kernel(
-            "{[i,j,k]: 0<=i,j,k<n}",
-            """
-                b = sum((i,j,k), a[i,j,k])
-                """,
-            [
-                lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a",
-                    None, shape=None),
-                "..."])
-
-    knl = lp.split_reduction_outward(knl, "j,k")
-    # FIXME: finish test
-
-
 def test_modulo_indexing(ctx_factory):
     ctx = ctx_factory()
 
@@ -1478,132 +676,6 @@ def test_modulo_indexing(ctx_factory):
                 )))
 
 
-def test_rob_stroud_bernstein(ctx_factory):
-    ctx = ctx_factory()
-
-    # NOTE: tmp would have to be zero-filled beforehand
-
-    knl = lp.make_kernel(
-            "{[el, i2, alpha1,alpha2]: \
-                    0 <= el < nels and \
-                    0 <= i2 < nqp1d and \
-                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
-            """
-                <> xi = qpts[1, i2] {inames=+el}
-                <> s = 1-xi
-                <> r = xi/s
-                <> aind = 0 {id=aind_init,inames=+i2:el}
-
-                <> w = s**(deg-alpha1) {id=init_w}
-
-                tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
-                        {id=write_tmp,inames=+alpha2}
-                w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
-                        {id=update_w,dep=init_w:write_tmp}
-                aind = aind + 1 \
-                        {id=aind_incr,\
-                        dep=aind_init:write_tmp:update_w, \
-                        inames=+el:i2:alpha1:alpha2}
-                """,
-            [
-                # Must declare coeffs to have "no" shape, to keep loopy
-                # from trying to figure it out the shape automatically.
-
-                lp.GlobalArg("coeffs", None, shape=None),
-                "..."
-                ],
-            assumptions="deg>=0 and nels>=1"
-            )
-
-    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
-    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
-    knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
-            slabs=(0, 1))
-    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
-
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code(
-            dict(
-                qpts=np.float32,
-                coeffs=np.float32,
-                tmp=np.float32,
-                )))
-
-
-def test_rob_stroud_bernstein_full(ctx_factory):
-    #logging.basicConfig(level=logging.DEBUG)
-    ctx = ctx_factory()
-
-    # NOTE: result would have to be zero-filled beforehand
-
-    knl = lp.make_kernel(
-            "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
-                    0 <= el < nels and \
-                    0 <= i2 < nqp1d and \
-                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
-                    \
-                    0 <= i1_2 < nqp1d and \
-                    0 <= alpha1_2 <= deg and \
-                    0 <= i2_2 < nqp1d \
-                    }",
-            """
-                <> xi = qpts[1, i2] {inames=+el}
-                <> s = 1-xi
-                <> r = xi/s
-                <> aind = 0 {id=aind_init,inames=+i2:el}
-
-                <> w = s**(deg-alpha1) {id=init_w}
-
-                <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
-                        {id=write_tmp,inames=+alpha2}
-                w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
-                        {id=update_w,dep=init_w:write_tmp}
-                aind = aind + 1 \
-                        {id=aind_incr,\
-                        dep=aind_init:write_tmp:update_w, \
-                        inames=+el:i2:alpha1:alpha2}
-
-                <> xi2 = qpts[0, i1_2] {dep=aind_incr,inames=+el}
-                <> s2 = 1-xi2
-                <> r2 = xi2/s2
-                <> w2 = s2**deg
-
-                result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
-                        w2 * tmp[alpha1_2, i2_2] \
-                        {inames=el:alpha1_2:i1_2:i2_2}
-
-                w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)
-                """,
-            [
-                # Must declare coeffs to have "no" shape, to keep loopy
-                # from trying to figure it out the shape automatically.
-
-                lp.GlobalArg("coeffs", None, shape=None),
-                "..."
-                ],
-            assumptions="deg>=0 and nels>=1"
-            )
-
-    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
-
-    if 0:
-        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
-        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
-                slabs=(0, 1))
-        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
-
-    from pickle import dumps, loads
-    knl = loads(dumps(knl))
-
-    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(
-            dict(
-                qpts=np.float32,
-                tmp=np.float32,
-                coeffs=np.float32,
-                result=np.float32,
-                ))
-    print(knl)
-
-
 @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16])
 def test_vector_types(ctx_factory, vec_len):
     ctx = ctx_factory()
@@ -1632,28 +704,6 @@ def test_vector_types(ctx_factory, vec_len):
                 ))
 
 
-def test_tag_data_axes(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{ [i,j,k]: 0<=i,j,k<n }",
-            "out[i,j,k] = 15")
-
-    ref_knl = knl
-
-    with pytest.raises(lp.LoopyError):
-        lp.tag_data_axes(knl, "out", "N1,N0,N5")
-
-    with pytest.raises(lp.LoopyError):
-        lp.tag_data_axes(knl, "out", "N1,N0,c")
-
-    knl = lp.tag_data_axes(knl, "out", "N1,N0,N2")
-    knl = lp.tag_inames(knl, dict(j="g.0", i="g.1"))
-
-    lp.auto_test_vs_ref(ref_knl, ctx, knl,
-            parameters=dict(n=20))
-
-
 def test_conditional(ctx_factory):
     #logging.basicConfig(level=logging.DEBUG)
     ctx = ctx_factory()
@@ -1768,431 +818,62 @@ def test_multiple_writes_to_local_temporary():
         "{[i,e]: 0<=i<5 and 0<=e<nelements}",
         """
         <> temp[i, 0] = 17
-        temp[i, 1] = 15
-        """)
-    knl = lp.tag_inames(knl, dict(i="l.0"))
-
-    knl = lp.preprocess_kernel(knl)
-    for k in lp.generate_loop_schedules(knl):
-        code, _ = lp.generate_code(k)
-        print(code)
-
-
-def test_fd_demo():
-    knl = lp.make_kernel(
-        "{[i,j]: 0<=i,j<n}",
-        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
-                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
-                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
-    #assumptions="n mod 16=0")
-    knl = lp.split_iname(knl,
-            "i", 16, outer_tag="g.1", inner_tag="l.1")
-    knl = lp.split_iname(knl,
-            "j", 16, outer_tag="g.0", inner_tag="l.0")
-    knl = lp.add_prefetch(knl, "u",
-            ["i_inner", "j_inner"],
-            fetch_bounding_box=True)
-
-    #n = 1000
-    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)
-
-    knl = lp.set_options(knl, write_cl=True)
-    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
-    code, inf = lp.generate_code(knl)
-    print(code)
-
-    assert "double" not in code
-
-
-def test_fd_1d(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-        "{[i]: 0<=i<n}",
-        "result[i] = u[i+1]-u[i]")
-
-    knl = lp.add_and_infer_dtypes(knl, {"u": np.float32})
-    ref_knl = knl
-
-    knl = lp.split_iname(knl, "i", 16)
-    knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j")
-    knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for")
-    knl = lp.assume(knl, "n mod 16 = 0")
-
-    lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
-            parameters=dict(n=2048))
-
-
-def test_make_copy_kernel(ctx_factory):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    intermediate_format = "f,f,sep"
-
-    a1 = np.random.randn(1024, 4, 3)
-
-    cknl1 = lp.make_copy_kernel(intermediate_format)
-
-    cknl1 = lp.fix_parameters(cknl1, n2=3)
-
-    cknl1 = lp.set_options(cknl1, write_cl=True)
-    evt, a2 = cknl1(queue, input=a1)
-
-    cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format)
-    cknl2 = lp.fix_parameters(cknl2, n2=3)
-
-    evt, a3 = cknl2(queue, input=a2)
-
-    assert (a1 == a3).all()
-
-
-def test_set_arg_order():
-    knl = lp.make_kernel(
-            "{ [i,j]: 0<=i,j<n }",
-            "out[i,j] = a[i]*b[j]")
-
-    knl = lp.set_argument_order(knl, "out,a,n,b")
-
-
-def test_affine_map_inames():
-    knl = lp.make_kernel(
-        "{[e, i,j,n]: 0<=e<E and 0<=i,j,n<N}",
-        "rhsQ[e, n+i, j] = rhsQ[e, n+i, j] - D[i, n]*x[i,j]")
-
-    knl = lp.affine_map_inames(knl,
-            "i", "i0",
-            "i0 = n+i")
-
-    print(knl)
-
-
-def test_precompute_confusing_subst_arguments(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-        "{[i,j]: 0<=i<n and 0<=j<5}",
-        """
-        D(i):=a[i+1]-a[i]
-        b[i,j] = D(j)
-        """)
-
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
-
-    ref_knl = knl
-
-    knl = lp.tag_inames(knl, dict(j="g.1"))
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
-
-    from loopy.symbolic import get_dependencies
-    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
-    knl = lp.precompute(knl, "D")
-
-    lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
-            parameters=dict(n=12345))
-
-
-def test_precompute_nested_subst(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-        "{[i,j]: 0<=i<n and 0<=j<5}",
-        """
-        E:=a[i]
-        D:=E*E
-        b[i] = D
-        """)
-
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
-
-    ref_knl = knl
-
-    knl = lp.tag_inames(knl, dict(j="g.1"))
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
-
-    from loopy.symbolic import get_dependencies
-    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
-    knl = lp.precompute(knl, "D", "i_inner")
-
-    # There's only one surviving 'E' rule.
-    assert len([
-        rule_name
-        for rule_name in knl.substitutions
-        if rule_name.startswith("E")]) == 1
-
-    # That rule should use the newly created prefetch inames,
-    # not the prior 'i_inner'
-    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)
-
-    lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
-            parameters=dict(n=12345))
-
-
-def test_poisson(ctx_factory):
-    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
-    ctx = ctx_factory()
-
-    nbf = 5
-    nqp = 5
-    sdim = 3
-
-    knl = lp.make_kernel(
-            "{ [c,i,j,k,ell,ell2,ell3]: \
-            0 <= c < nels and \
-            0 <= i < nbf and \
-            0 <= j < nbf and \
-            0 <= k < nqp and \
-            0 <= ell,ell2 < sdim}",
-            """
-            dpsi(bf,k0,dir) := \
-                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
-            Ael[c,i,j] = \
-                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
-            """,
-            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")
-
-    print(knl)
-
-    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)
-
-    ref_knl = knl
-
-    knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"])
-
-    def variant_1(knl):
-        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
-        knl = lp.set_loop_priority(knl, "c,i,j")
-        return knl
-
-    def variant_2(knl):
-        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
-        knl = lp.set_loop_priority(knl, "c,i,j")
-        return knl
-
-    def add_types(knl):
-        return lp.add_and_infer_dtypes(knl, dict(
-            w=np.float32,
-            J=np.float32,
-            DPsi=np.float32,
-            DFinv=np.float32,
-            ))
-
-    for variant in [
-            #variant_1,
-            variant_2
-            ]:
-        knl = variant(knl)
-
-        lp.auto_test_vs_ref(
-                add_types(ref_knl), ctx, add_types(knl),
-                parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
-
-
-def test_auto_test_can_detect_problems(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-        "{[i,j]: 0<=i,j<n}",
-        """
-        a[i,j] = 25
-        """)
-
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
-
-    ref_knl = knl
-
-    knl = lp.link_inames(knl, "i,j", "i0")
-
-    from loopy.diagnostic import AutomaticTestFailure
-    with pytest.raises(AutomaticTestFailure):
-        lp.auto_test_vs_ref(
-                ref_knl, ctx, knl,
-                parameters=dict(n=123))
-
-
-def test_generate_c_snippet():
-    from loopy.target.c import CTarget
-
-    from pymbolic import var
-    I = var("I")  # noqa
-    f = var("f")
-    df = var("df")
-    q_v = var("q_v")
-    eN = var("eN")  # noqa
-    k = var("k")
-    u = var("u")
-
-    from functools import partial
-    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)
-
-    Instr = lp.Assignment  # noqa
-
-    knl = lp.make_kernel(
-        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
-        [
-            Instr(f[I], l_sum(k, q_v[k, I]*u)),
-            Instr(df[I], l_sum(k, q_v[k, I])),
-            ],
-        [
-            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
-            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
-            lp.ValueArg("u", np.float64),
-            "...",
-            ],
-        target=CTarget(),
-        assumptions="nQuad>=1")
-
-    if 0:  # enable to play with prefetching
-        # (prefetch currently requires constant sizes)
-        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
-        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)
-
-    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
-    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")
-
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    print(lp.generate_body(knl))
-
-
-def test_precompute_with_preexisting_inames(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
-        """
-        result[e,i] = sum(j, D1[i,j]*u[e,j])
-        result2[e,i] = sum(k, D2[i,k]*u[e,k])
-        """)
-
-    knl = lp.add_and_infer_dtypes(knl, {
-        "u": np.float32,
-        "D1": np.float32,
-        "D2": np.float32,
-        })
-
-    knl = lp.fix_parameters(knl, n=13)
-
-    ref_knl = knl
-
-    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
-    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")
-
-    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
-            precompute_inames="ii,jj")
-    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
-            precompute_inames="ii,jj")
-
-    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")
-
-    lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
-            parameters=dict(E=200))
-
-
-def test_precompute_with_preexisting_inames_fail():
-    knl = lp.make_kernel(
-        "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}",
-        """
-        result[e,i] = sum(j, D1[i,j]*u[e,j])
-        result2[e,i] = sum(k, D2[i,k]*u[e,k])
-        """)
-
-    knl = lp.add_and_infer_dtypes(knl, {
-        "u": np.float32,
-        "D1": np.float32,
-        "D2": np.float32,
-        })
-
-    knl = lp.fix_parameters(knl, n=13)
-
-    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
-    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")
-
-    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
-            precompute_inames="ii,jj")
-    with pytest.raises(lp.LoopyError):
-        lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
-                precompute_inames="ii,jj")
-
-
-def test_vectorize(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-        "{[i]: 0<=i<n}",
-        """
-        <> temp = 2*b[i]
-        a[i] = temp
-        """)
-    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
-    knl = lp.set_array_dim_names(knl, "a,b", "i")
-    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
-            split_kwargs=dict(slabs=(0, 1)))
-
-    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
-    ref_knl = knl
-    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})
-
-    knl = lp.tag_inames(knl, {"i_inner": "vec"})
+        temp[i, 1] = 15
+        """)
+    knl = lp.tag_inames(knl, dict(i="l.0"))
 
     knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    code, inf = lp.generate_code(knl)
-
-    lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
-            parameters=dict(n=30))
+    for k in lp.generate_loop_schedules(knl):
+        code, _ = lp.generate_code(k)
+        print(code)
 
 
-def test_alias_temporaries(ctx_factory):
+def test_make_copy_kernel(ctx_factory):
     ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
 
-    knl = lp.make_kernel(
-        "{[i]: 0<=i<n}",
-        """
-        times2(i) := 2*a[i]
-        times3(i) := 3*a[i]
-        times4(i) := 4*a[i]
+    intermediate_format = "f,f,sep"
 
-        x[i] = times2(i)
-        y[i] = times3(i)
-        z[i] = times4(i)
-        """)
+    a1 = np.random.randn(1024, 4, 3)
 
-    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+    cknl1 = lp.make_copy_kernel(intermediate_format)
 
-    ref_knl = knl
+    cknl1 = lp.fix_parameters(cknl1, n2=3)
+
+    cknl1 = lp.set_options(cknl1, write_cl=True)
+    evt, a2 = cknl1(queue, input=a1)
 
-    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
+    cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format)
+    cknl2 = lp.fix_parameters(cknl2, n2=3)
 
-    knl = lp.precompute(knl, "times2", "i_inner")
-    knl = lp.precompute(knl, "times3", "i_inner")
-    knl = lp.precompute(knl, "times4", "i_inner")
+    evt, a3 = cknl2(queue, input=a2)
 
-    knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"])
+    assert (a1 == a3).all()
 
-    lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
-            parameters=dict(n=30))
 
+def test_auto_test_can_detect_problems(ctx_factory):
+    ctx = ctx_factory()
 
-def test_fusion():
-    exp_kernel = lp.make_kernel(
-         ''' { [i]: 0<=i<n } ''',
-         ''' exp[i] = pow(E, z[i])''',
-         assumptions="n>0")
+    ref_knl = lp.make_kernel(
+        "{[i,j]: 0<=i,j<n}",
+        """
+        a[i,j] = 25
+        """)
 
-    sum_kernel = lp.make_kernel(
-        '{ [j]: 0<=j<n }',
-        'out2 = sum(j, exp[j])',
-        assumptions='n>0')
+    knl = lp.make_kernel(
+        "{[i]: 0<=i<n}",
+        """
+        a[i,i] = 25
+        """)
 
-    knl = lp.fuse_kernels([exp_kernel, sum_kernel])
+    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(a=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
 
-    print(knl)
+    from loopy.diagnostic import AutomaticTestFailure
+    with pytest.raises(AutomaticTestFailure):
+        lp.auto_test_vs_ref(
+                ref_knl, ctx, knl,
+                parameters=dict(n=123))
 
 
 def test_sci_notation_literal(ctx_factory):
@@ -2210,37 +891,6 @@ def test_sci_notation_literal(ctx_factory):
     assert (np.abs(out.get() - 1e-12) < 1e-20).all()
 
 
-def test_rename_argument(ctx_factory):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    kernel = lp.make_kernel(
-         '''{ [i]: 0<=i<n }''',
-         '''out[i] = a + 2''')
-
-    kernel = lp.rename_argument(kernel, "a", "b")
-
-    evt, (out,) = kernel(queue, b=np.float32(12), n=20)
-
-    assert (np.abs(out.get() - 14) < 1e-8).all()
-
-
-def test_to_batched(ctx_factory):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    knl = lp.make_kernel(
-         ''' { [i,j]: 0<=i,j<n } ''',
-         ''' out[i] = sum(j, a[i,j]*x[j])''')
-
-    bknl = lp.to_batched(knl, "nbatches", "out,x")
-
-    a = np.random.randn(5, 5)
-    x = np.random.randn(7, 5)
-
-    bknl(queue, a=a, x=x)
-
-
 def test_variable_size_temporary():
     knl = lp.make_kernel(
          ''' { [i,j]: 0<=i,j<n } ''',
@@ -2295,57 +945,6 @@ def test_indexof_vec(ctx_factory):
     #assert np.array_equal(out.ravel(order="C"), np.arange(25))
 
 
-def test_finite_difference_expr_subst(ctx_factory):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
-    h = grid[1] - grid[0]
-    u = cl.clmath.sin(cl.array.to_device(queue, grid))
-
-    fin_diff_knl = lp.make_kernel(
-        "{[i]: 1<=i<=n}",
-        "out[i] = -(f[i+1] - f[i-1])/h",
-        [lp.GlobalArg("out", shape="n+2"), "..."])
-
-    flux_knl = lp.make_kernel(
-        "{[j]: 1<=j<=n}",
-        "f[j] = u[j]**2/2",
-        [
-            lp.GlobalArg("f", shape="n+2"),
-            lp.GlobalArg("u", shape="n+2"),
-            ])
-
-    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
-            data_flow=[
-                ("f", 1, 0)
-                ])
-
-    fused_knl = lp.set_options(fused_knl, write_cl=True)
-    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
-
-    fused_knl = lp.assignment_to_subst(fused_knl, "f")
-
-    fused_knl = lp.set_options(fused_knl, write_cl=True)
-
-    # This is the real test here: The automatically generated
-    # shape expressions are '2+n' and the ones above are 'n+2'.
-    # Is loopy smart enough to understand that these are equal?
-    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
-
-    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")
-
-    gpu_knl = lp.split_iname(
-            fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")
-
-    precomp_knl = lp.precompute(
-            gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)
-
-    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
-    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
-    evt, _ = precomp_knl(queue, u=u, h=h)
-
-
 def test_is_expression_equal():
     from loopy.symbolic import is_expression_equal
     from pymbolic import var
@@ -2359,92 +958,6 @@ def test_is_expression_equal():
     assert is_expression_equal((x+y)**2, x**2 + 2*x*y + y**2)
 
 
-def test_collect_common_factors(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{[i,j,k]: 0<=i,j<n}",
-            """
-            <float32> out_tmp = 0 {id=out_init,inames=i}
-            out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init}
-            out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] {id=out_up2,dep=out_init}
-            out[i] = out_tmp {dep=out_up1:out_up2}
-            """)
-    knl = lp.add_and_infer_dtypes(knl,
-            dict(a=np.float32, alpha=np.float32, b1=np.float32, b2=np.float32))
-
-    ref_knl = knl
-
-    knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0")
-    knl = lp.collect_common_factors_on_increment(knl, "out_tmp")
-
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=13))
-
-
-def test_ispc_target(occa_mode=False):
-    from loopy.target.ispc import ISPCTarget
-
-    knl = lp.make_kernel(
-            "{ [i]: 0<=i<n }",
-            "out[i] = 2*a[i]",
-            [
-                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
-                "..."
-                ],
-            target=ISPCTarget(occa_mode=occa_mode))
-
-    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
-    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
-    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
-
-    codegen_result = lp.generate_code_v2(
-                lp.get_one_scheduled_kernel(
-                    lp.preprocess_kernel(knl)))
-
-    print(codegen_result.device_code())
-    print(codegen_result.host_code())
-
-
-def test_cuda_target():
-    from loopy.target.cuda import CudaTarget
-
-    knl = lp.make_kernel(
-            "{ [i]: 0<=i<n }",
-            "out[i] = 2*a[i]",
-            [
-                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
-                "..."
-                ],
-            target=CudaTarget())
-
-    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
-    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
-    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
-
-    print(
-            lp.generate_code(
-                lp.get_one_scheduled_kernel(
-                    lp.preprocess_kernel(knl)))[0])
-
-
-def test_chunk_iname(ctx_factory):
-    ctx = ctx_factory()
-
-    knl = lp.make_kernel(
-            "{ [i]: 0<=i<n }",
-            "out[i] = 2*a[i]",
-            [
-                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
-                "..."
-                ],
-            assumptions="n>0")
-
-    ref_knl = knl
-    knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0")
-    knl = lp.set_loop_priority(knl, "i_outer, i_inner")
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130))
-
-
 @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
 def test_atomic(ctx_factory, dtype):
     ctx = ctx_factory()
@@ -2476,23 +989,6 @@ def test_atomic(ctx_factory, dtype):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
 
 
-def test_clamp(ctx_factory):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    n = 15 * 10**6
-    x = cl.clrandom.rand(queue, n, dtype=np.float32)
-
-    knl = lp.make_kernel(
-            "{ [i]: 0<=i<n }",
-            "out[i] = clamp(x[i], a, b)")
-
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
-    knl = lp.set_options(knl, write_cl=True)
-
-    evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
-
-
 def test_forced_iname_deps_and_reduction():
     # See https://github.com/inducer/loopy/issues/24
 
@@ -2528,42 +1024,6 @@ def test_forced_iname_deps_and_reduction():
     print(k.stringify(with_dependencies=True))
 
 
-@pytest.mark.parametrize("tp", ["f32", "f64"])
-def test_random123(ctx_factory, tp):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    import pyopencl.version  # noqa
-    if cl.version.VERSION < (2016, 2):
-        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")
-
-    n = 150000
-
-    knl = lp.make_kernel(
-            "{ [i]: 0<=i<n }",
-            """
-            <> key2 = make_uint2(i, 324830944) {inames=i}
-            <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i}
-            <> ctr = make_uint4(0, 1, 2, 3) {inames=i}
-            <> real, ctr = philox4x32_TYPE(ctr, key2)
-            <> imag, ctr = threefry4x32_TYPE(ctr, key4)
-
-            out[i, 0] = real.s0 + 1j * imag.s0
-            out[i, 1] = real.s1 + 1j * imag.s1
-            out[i, 2] = real.s2 + 1j * imag.s2
-            out[i, 3] = real.s3 + 1j * imag.s3
-            """.replace("TYPE", tp))
-
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
-    knl = lp.set_options(knl, write_cl=True)
-
-    evt, (out,) = knl(queue, n=n)
-
-    out = out.get()
-    assert (out < 1).all()
-    assert (0 <= out).all()
-
-
 def test_kernel_splitting(ctx_factory):
     ctx = ctx_factory()
 
@@ -2704,6 +1164,77 @@ def test_global_temporary(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
 
 
+def test_assign_to_linear_subscript(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl1 = lp.make_kernel(
+            "{ [i]: 0<=i<n}",
+            "a[i,i] = 1")
+    knl2 = lp.make_kernel(
+            "{ [i]: 0<=i<n}",
+            "a[[i*n + i]] = 1",
+            [lp.GlobalArg("a", shape="n,n"), "..."])
+
+    a1 = cl.array.zeros(queue, (10, 10), np.float32)
+    knl1(queue, a=a1)
+    a2 = cl.array.zeros(queue, (10, 10), np.float32)
+    knl2(queue, a=a2)
+
+    assert np.array_equal(a1.get(),  a2.get())
+
+
+def test_finite_difference_expr_subst(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
+    h = grid[1] - grid[0]
+    u = cl.clmath.sin(cl.array.to_device(queue, grid))
+
+    fin_diff_knl = lp.make_kernel(
+        "{[i]: 1<=i<=n}",
+        "out[i] = -(f[i+1] - f[i-1])/h",
+        [lp.GlobalArg("out", shape="n+2"), "..."])
+
+    flux_knl = lp.make_kernel(
+        "{[j]: 1<=j<=n}",
+        "f[j] = u[j]**2/2",
+        [
+            lp.GlobalArg("f", shape="n+2"),
+            lp.GlobalArg("u", shape="n+2"),
+            ])
+
+    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
+            data_flow=[
+                ("f", 1, 0)
+                ])
+
+    fused_knl = lp.set_options(fused_knl, write_cl=True)
+    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
+
+    fused_knl = lp.assignment_to_subst(fused_knl, "f")
+
+    fused_knl = lp.set_options(fused_knl, write_cl=True)
+
+    # This is the real test here: The automatically generated
+    # shape expressions are '2+n' and the ones above are 'n+2'.
+    # Is loopy smart enough to understand that these are equal?
+    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
+
+    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")
+
+    gpu_knl = lp.split_iname(
+            fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")
+
+    precomp_knl = lp.precompute(
+            gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)
+
+    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
+    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
+    evt, _ = precomp_knl(queue, u=u, h=h)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 3eacbaa2850b12ab1130a0f4b02ac5698bc9fab9..389151f173cc835c0e01f2cbc3c640952cf3c575 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -93,7 +93,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
     # turn the first reads into subst rules
     local_prep_var_names = set()
     for insn in lp.find_instructions(hsv, "tag:local_prep"):
-        (assignee, _), = insn.assignees_and_indices()
+        assignee, = insn.assignee_var_names()
         local_prep_var_names.add(assignee)
         hsv = lp.assignment_to_subst(hsv, assignee)
 
@@ -122,7 +122,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
                   ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                   ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                   ]:
-            (flux_var, _), = insn.assignees_and_indices()
+            flux_var, = insn.assignee_var_names()
             print(insn)
 
             reader, = lp.find_instructions(hsv,
diff --git a/test/test_reduction.py b/test/test_reduction.py
new file mode 100644
index 0000000000000000000000000000000000000000..b751dd515b5978ba3c9ccedab06016d19e746546
--- /dev/null
+++ b/test/test_reduction.py
@@ -0,0 +1,496 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pyopencl.clrandom  # noqa
+import pytest
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+__all__ = [
+        "pytest_generate_tests",
+        "cl"  # 'cl.create_some_context'
+        ]
+
+
+def test_nonsense_reduction(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<100}",
+            """
+                a[i] = sum(i, 2)
+                """,
+            [lp.GlobalArg("a", np.float32, shape=(100,))]
+            )
+
+    import pytest
+    with pytest.raises(RuntimeError):
+        knl = lp.preprocess_kernel(knl, ctx.devices[0])
+
+
+def test_empty_reduction(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            [
+                "{[i]: 0<=i<20}",
+                "[i] -> {[j]: 0<=j<0}"
+                ],
+            "a[i] = sum(j, j)",
+            )
+
+    knl = lp.realize_reduction(knl)
+    print(knl)
+
+    knl = lp.set_options(knl, write_cl=True)
+    evt, (a,) = knl(queue)
+
+    assert (a.get() == 0).all()
+
+
+def test_nested_dependent_reduction(ctx_factory):
+    dtype = np.dtype(np.int32)
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            [
+                "{[i]: 0<=i<n}",
+                "{[j]: 0<=j<i+sumlen}"
+                ],
+            [
+                "<> sumlen = l[i]",
+                "a[i] = sum(j, j)",
+                ],
+            [
+                lp.ValueArg("n", np.int32),
+                lp.GlobalArg("a", dtype, ("n",)),
+                lp.GlobalArg("l", np.int32, ("n",)),
+                ])
+
+    cknl = lp.CompiledKernel(ctx, knl)
+
+    n = 330
+    l = np.arange(n, dtype=np.int32)
+    evt, (a,) = cknl(queue, l=l, n=n, out_host=True)
+
+    tgt_result = (2*l-1)*2*l/2
+    assert (a == tgt_result).all()
+
+
+def test_multi_nested_dependent_reduction(ctx_factory):
+    dtype = np.dtype(np.int32)
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            [
+                "{[itgt]: 0 <= itgt < ntgts}",
+                "{[isrc_box]: 0 <= isrc_box < nboxes}",
+                "{[isrc]: 0 <= isrc < npart}"
+                ],
+            [
+                "<> npart = nparticles_per_box[isrc_box]",
+                "a[itgt] = sum((isrc_box, isrc), 1)",
+                ],
+            [
+                lp.ValueArg("n", np.int32),
+                lp.GlobalArg("a", dtype, ("n",)),
+                lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)),
+                lp.ValueArg("ntgts", np.int32),
+                lp.ValueArg("nboxes", np.int32),
+                ],
+            assumptions="ntgts>=1")
+
+    cknl = lp.CompiledKernel(ctx, knl)
+    print(cknl.get_code())
+    # FIXME: Actually test functionality.
+
+
+def test_recursive_nested_dependent_reduction(ctx_factory):
+    dtype = np.dtype(np.int32)
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            [
+                "{[itgt]: 0 <= itgt < ntgts}",
+                "{[isrc_box]: 0 <= isrc_box < nboxes}",
+                "{[isrc]: 0 <= isrc < npart}"
+                ],
+            [
+                "<> npart = nparticles_per_box[isrc_box]",
+                "<> boxsum = sum(isrc, isrc+isrc_box+itgt)",
+                "a[itgt] = sum(isrc_box, boxsum)",
+                ],
+            [
+                lp.ValueArg("n", np.int32),
+                lp.GlobalArg("a", dtype, ("n",)),
+                lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)),
+                lp.ValueArg("ntgts", np.int32),
+                lp.ValueArg("nboxes", np.int32),
+                ],
+            assumptions="ntgts>=1")
+
+    cknl = lp.CompiledKernel(ctx, knl)
+    print(cknl.get_code())
+    # FIXME: Actually test functionality.
+
+
+@pytest.mark.parametrize("size", [128, 5, 113, 67])
+def test_local_parallel_reduction(ctx_factory, size):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{[i, j]: 0 <= i < n and 0 <= j < 5}",
+            """
+            z[j] = sum(i, i+j)
+            """)
+
+    knl = lp.fix_parameters(knl, n=size)
+
+    ref_knl = knl
+
+    def variant0(knl):
+        return lp.tag_inames(knl, "i:l.0")
+
+    def variant1(knl):
+        return lp.tag_inames(knl, "i:l.0,j:l.1")
+
+    def variant2(knl):
+        return lp.tag_inames(knl, "i:l.0,j:g.0")
+
+    for variant in [
+            variant0,
+            variant1,
+            variant2
+            ]:
+        knl = variant(ref_knl)
+
+        lp.auto_test_vs_ref(ref_knl, ctx, knl)
+
+
+# FIXME: Make me a test
+@pytest.mark.parametrize("size", [10000])
+def no_test_global_parallel_reduction(ctx_factory, size):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{[i]: 0 <= i < n }",
+            """
+            <> key = make_uint2(i, 324830944)  {inames=i}
+            <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
+            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
+            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
+            """)
+
+    # ref_knl = knl
+
+    gsize = 128
+    knl = lp.split_iname(knl, "i", gsize * 20)
+    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
+    knl = lp.split_reduction_inward(knl, "i_inner_inner")
+    knl = lp.split_reduction_inward(knl, "i_inner_outer")
+    from loopy.transform.data import reduction_arg_to_subst_rule
+    knl = reduction_arg_to_subst_rule(knl, "i_outer")
+    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer")
+    print(knl)
+    1/0
+    knl = lp.realize_reduction(knl)
+
+    evt, (z,) = knl(queue, n=size)
+
+    #lp.auto_test_vs_ref(ref_knl, ctx, knl)
+
+
+@pytest.mark.parametrize("size", [10000])
+def test_global_parallel_reduction_simpler(ctx_factory, size):
+    ctx = ctx_factory()
+
+    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")
+
+    knl = lp.make_kernel(
+            "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}",
+            """
+            <> key = make_uint2(l+nl*g, 1234)  {inames=l:g}
+            <> ctr = make_uint4(0, 1, 2, 3)  {inames=l:g,id=init_ctr}
+            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
+
+            <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)
+
+            result = sum(j, tmp[j])
+            """)
+
+    ng = 50
+    knl = lp.fix_parameters(knl, ng=ng)
+
+    knl = lp.set_options(knl, write_cl=True)
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
+    knl = lp.split_reduction_outward(knl, "l_inner")
+    knl = lp.tag_inames(knl, "g:g.0,j:l.0")
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size})
+
+
+def test_argmax(ctx_factory):
+    logging.basicConfig(level=logging.INFO)
+
+    dtype = np.dtype(np.float32)
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 10000
+
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<%d}" % n,
+            """
+            max_val, max_idx = argmax(i, fabs(a[i]))
+            """)
+
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+    print(lp.preprocess_kernel(knl))
+    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
+
+    a = np.random.randn(10000).astype(dtype)
+    evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
+    assert max_val == np.max(np.abs(a))
+    assert max_idx == np.where(np.abs(a) == max_val)[-1]
+
+
+def test_simul_reduce(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 20
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i,j<n }",
+            [
+                "a = simul_reduce(sum, (i,j), i*j)",
+                "b = simul_reduce(sum, i, simul_reduce(sum, j, i*j))",
+                ],
+            assumptions="n>=1")
+
+    evt, (a, b) = knl(queue, n=n)
+
+    ref = sum(i*j for i in range(n) for j in range(n))
+    assert a.get() == ref
+    assert b.get() == ref
+
+
+@pytest.mark.parametrize(("op_name", "np_op"), [
+    ("sum", np.sum),
+    ("product", np.prod),
+    ("min", np.min),
+    ("max", np.max),
+    ])
+def test_reduction_library(ctx_factory, op_name, np_op):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i<n and 0<=j<m }",
+            [
+                "res[i] = reduce(%s, j, a[i,j])" % op_name,
+                ],
+            assumptions="n>=1")
+
+    a = np.random.randn(20, 10)
+    evt, (res,) = knl(queue, a=a)
+
+    assert np.allclose(res, np_op(a, axis=1))
+
+
+def test_split_reduction(ctx_factory):
+    knl = lp.make_kernel(
+            "{[i,j,k]: 0<=i,j,k<n}",
+            """
+                b = sum((i,j,k), a[i,j,k])
+                """,
+            [
+                lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a",
+                    None, shape=None),
+                "..."])
+
+    knl = lp.split_reduction_outward(knl, "j,k")
+    # FIXME: finish test
+
+
+def test_double_sum_made_unique(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 20
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i,j<n }",
+            [
+                "a = sum((i,j), i*j)",
+                "b = sum(i, sum(j, i*j))",
+                ],
+            assumptions="n>=1")
+
+    knl = lp.make_reduction_inames_unique(knl)
+    print(knl)
+
+    evt, (a, b) = knl(queue, n=n)
+
+    ref = sum(i*j for i in range(n) for j in range(n))
+    assert a.get() == ref
+    assert b.get() == ref
+
+
+def test_fd_demo():
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i,j<n}",
+        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
+                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
+                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
+    #assumptions="n mod 16=0")
+    knl = lp.split_iname(knl,
+            "i", 16, outer_tag="g.1", inner_tag="l.1")
+    knl = lp.split_iname(knl,
+            "j", 16, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.add_prefetch(knl, "u",
+            ["i_inner", "j_inner"],
+            fetch_bounding_box=True)
+
+    #n = 1000
+    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)
+
+    knl = lp.set_options(knl, write_cl=True)
+    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
+    code, inf = lp.generate_code(knl)
+    print(code)
+
+    assert "double" not in code
+
+
+def test_fd_1d(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i]: 0<=i<n}",
+        "result[i] = u[i+1]-u[i]")
+
+    knl = lp.add_and_infer_dtypes(knl, {"u": np.float32})
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 16)
+    knl = lp.extract_subst(knl, "u_acc", "u[j]", parameters="j")
+    knl = lp.precompute(knl, "u_acc", "i_inner", default_tag="for")
+    knl = lp.assume(knl, "n mod 16 = 0")
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=2048))
+
+
+def test_poisson_fem(ctx_factory):
+    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
+    ctx = ctx_factory()
+
+    nbf = 5
+    nqp = 5
+    sdim = 3
+
+    knl = lp.make_kernel(
+            "{ [c,i,j,k,ell,ell2,ell3]: \
+            0 <= c < nels and \
+            0 <= i < nbf and \
+            0 <= j < nbf and \
+            0 <= k < nqp and \
+            0 <= ell,ell2 < sdim}",
+            """
+            dpsi(bf,k0,dir) := \
+                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
+            Ael[c,i,j] = \
+                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
+            """,
+            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")
+
+    print(knl)
+
+    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)
+
+    ref_knl = knl
+
+    knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"])
+
+    def variant_1(knl):
+        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
+        knl = lp.set_loop_priority(knl, "c,i,j")
+        return knl
+
+    def variant_2(knl):
+        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
+        knl = lp.set_loop_priority(knl, "c,i,j")
+        return knl
+
+    def add_types(knl):
+        return lp.add_and_infer_dtypes(knl, dict(
+            w=np.float32,
+            J=np.float32,
+            DPsi=np.float32,
+            DFinv=np.float32,
+            ))
+
+    for variant in [
+            #variant_1,
+            variant_2
+            ]:
+        knl = variant(knl)
+
+        lp.auto_test_vs_ref(
+                add_types(ref_knl), ctx, add_types(knl),
+                parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
diff --git a/test/test_target.py b/test/test_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..67204d9c1fbc02624eebf230b87d33c114c21ef1
--- /dev/null
+++ b/test/test_target.py
@@ -0,0 +1,203 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pyopencl.clrandom  # noqa
+import pytest
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+__all__ = [
+        "pytest_generate_tests",
+        "cl"  # 'cl.create_some_context'
+        ]
+
+
+def test_ispc_target(occa_mode=False):
+    from loopy.target.ispc import ISPCTarget
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            "out[i] = 2*a[i]",
+            [
+                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
+                "..."
+                ],
+            target=ISPCTarget(occa_mode=occa_mode))
+
+    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
+    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
+    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
+
+    codegen_result = lp.generate_code_v2(
+                lp.get_one_scheduled_kernel(
+                    lp.preprocess_kernel(knl)))
+
+    print(codegen_result.device_code())
+    print(codegen_result.host_code())
+
+
+def test_cuda_target():
+    from loopy.target.cuda import CudaTarget
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            "out[i] = 2*a[i]",
+            [
+                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
+                "..."
+                ],
+            target=CudaTarget())
+
+    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
+    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
+    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
+
+    print(
+            lp.generate_code(
+                lp.get_one_scheduled_kernel(
+                    lp.preprocess_kernel(knl)))[0])
+
+
+def test_generate_c_snippet():
+    from loopy.target.c import CTarget
+
+    from pymbolic import var
+    I = var("I")  # noqa
+    f = var("f")
+    df = var("df")
+    q_v = var("q_v")
+    eN = var("eN")  # noqa
+    k = var("k")
+    u = var("u")
+
+    from functools import partial
+    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)
+
+    Instr = lp.Assignment  # noqa
+
+    knl = lp.make_kernel(
+        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
+        [
+            Instr(f[I], l_sum(k, q_v[k, I]*u)),
+            Instr(df[I], l_sum(k, q_v[k, I])),
+            ],
+        [
+            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
+            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
+            lp.ValueArg("u", np.float64),
+            "...",
+            ],
+        target=CTarget(),
+        assumptions="nQuad>=1")
+
+    if 0:  # enable to play with prefetching
+        # (prefetch currently requires constant sizes)
+        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
+        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)
+
+    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
+    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")
+
+    knl = lp.preprocess_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(knl)
+    print(lp.generate_body(knl))
+
+
+@pytest.mark.parametrize("tp", ["f32", "f64"])
+def test_random123(ctx_factory, tp):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    import pyopencl.version  # noqa
+    if cl.version.VERSION < (2016, 2):
+        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")
+
+    n = 150000
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            """
+            <> key2 = make_uint2(i, 324830944) {inames=i}
+            <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i}
+            <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
+            <> real, ctr = philox4x32_TYPE(ctr, key2)  {dep=init_ctr}
+            <> imag, ctr = threefry4x32_TYPE(ctr, key4)  {dep=init_ctr}
+
+            out[i, 0] = real.s0 + 1j * imag.s0
+            out[i, 1] = real.s1 + 1j * imag.s1
+            out[i, 2] = real.s2 + 1j * imag.s2
+            out[i, 3] = real.s3 + 1j * imag.s3
+            """.replace("TYPE", tp))
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.set_options(knl, write_cl=True)
+
+    evt, (out,) = knl(queue, n=n)
+
+    out = out.get()
+    assert (out < 1).all()
+    assert (0 <= out).all()
+
+
+def test_clamp(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 15 * 10**6
+    x = cl.clrandom.rand(queue, n, dtype=np.float32)
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            "out[i] = clamp(x[i], a, b)")
+
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.set_options(knl, write_cl=True)
+
+    evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker
diff --git a/test/test_transform.py b/test/test_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf9564274d12b2889d5010bbd807c4bd94fa149f
--- /dev/null
+++ b/test/test_transform.py
@@ -0,0 +1,412 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.clmath  # noqa
+import pyopencl.clrandom  # noqa
+import pytest
+
+import logging
+logger = logging.getLogger(__name__)
+
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
+from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+        as pytest_generate_tests
+
+__all__ = [
+        "pytest_generate_tests",
+        "cl"  # 'cl.create_some_context'
+        ]
+
+
+def test_chunk_iname(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            "out[i] = 2*a[i]",
+            [
+                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
+                "..."
+                ],
+            assumptions="n>0")
+
+    ref_knl = knl
+    knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0")
+    knl = lp.set_loop_priority(knl, "i_outer, i_inner")
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130))
+
+
+def test_collect_common_factors(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{[i,j,k]: 0<=i,j<n}",
+            """
+            <float32> out_tmp = 0 {id=out_init,inames=i}
+            out_tmp = out_tmp + alpha[i]*a[i,j]*b1[j] {id=out_up1,dep=out_init}
+            out_tmp = out_tmp + alpha[i]*a[j,i]*b2[j] {id=out_up2,dep=out_init}
+            out[i] = out_tmp {dep=out_up1:out_up2}
+            """)
+    knl = lp.add_and_infer_dtypes(knl,
+            dict(a=np.float32, alpha=np.float32, b1=np.float32, b2=np.float32))
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0")
+    knl = lp.collect_common_factors_on_increment(knl, "out_tmp")
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=13))
+
+
+def test_to_batched(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    knl = lp.make_kernel(
+         ''' { [i,j]: 0<=i,j<n } ''',
+         ''' out[i] = sum(j, a[i,j]*x[j])''')
+
+    bknl = lp.to_batched(knl, "nbatches", "out,x")
+
+    a = np.random.randn(5, 5)
+    x = np.random.randn(7, 5)
+
+    bknl(queue, a=a, x=x)
+
+
+def test_rename_argument(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    kernel = lp.make_kernel(
+         '''{ [i]: 0<=i<n }''',
+         '''out[i] = a + 2''')
+
+    kernel = lp.rename_argument(kernel, "a", "b")
+
+    evt, (out,) = kernel(queue, b=np.float32(12), n=20)
+
+    assert (np.abs(out.get() - 14) < 1e-8).all()
+
+
+def test_fusion():
+    exp_kernel = lp.make_kernel(
+         ''' { [i]: 0<=i<n } ''',
+         ''' exp[i] = pow(E, z[i])''',
+         assumptions="n>0")
+
+    sum_kernel = lp.make_kernel(
+        '{ [j]: 0<=j<n }',
+        'out2 = sum(j, exp[j])',
+        assumptions='n>0')
+
+    knl = lp.fuse_kernels([exp_kernel, sum_kernel])
+
+    print(knl)
+
+
+def test_alias_temporaries(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i]: 0<=i<n}",
+        """
+        times2(i) := 2*a[i]
+        times3(i) := 3*a[i]
+        times4(i) := 4*a[i]
+
+        x[i] = times2(i)
+        y[i] = times3(i)
+        z[i] = times4(i)
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+
+    ref_knl = knl
+
+    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
+
+    knl = lp.precompute(knl, "times2", "i_inner")
+    knl = lp.precompute(knl, "times3", "i_inner")
+    knl = lp.precompute(knl, "times4", "i_inner")
+
+    knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"])
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=30))
+
+
+def test_vectorize(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i]: 0<=i<n}",
+        """
+        <> temp = 2*b[i]
+        a[i] = temp
+        """)
+    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
+    knl = lp.set_array_dim_names(knl, "a,b", "i")
+    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
+            split_kwargs=dict(slabs=(0, 1)))
+
+    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
+    ref_knl = knl
+    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})
+
+    knl = lp.tag_inames(knl, {"i_inner": "vec"})
+
+    knl = lp.preprocess_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(knl)
+    code, inf = lp.generate_code(knl)
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=30))
+
+
+def test_extract_subst(ctx_factory):
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            """
+                a[i] = 23*b[i]**2 + 25*b[i]**2
+                """)
+
+    knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", "alpha")
+
+    print(knl)
+
+    from loopy.symbolic import parse
+
+    insn, = knl.instructions
+    assert insn.expression == parse("bsquare(23) + bsquare(25)")
+
+
+def test_join_inames(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0<=i,j<16}",
+            [
+                "b[i,j] = 2*a[i,j]"
+                ],
+            [
+                lp.GlobalArg("a", np.float32, shape=(16, 16,)),
+                lp.GlobalArg("b", np.float32, shape=(16, 16,))
+                ],
+            )
+
+    ref_knl = knl
+
+    knl = lp.add_prefetch(knl, "a", sweep_inames=["i", "j"])
+    knl = lp.join_inames(knl, ["a_dim_0", "a_dim_1"])
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, print_ref_code=True)
+
+
+def test_tag_data_axes(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{ [i,j,k]: 0<=i,j,k<n }",
+            "out[i,j,k] = 15")
+
+    ref_knl = knl
+
+    with pytest.raises(lp.LoopyError):
+        lp.tag_data_axes(knl, "out", "N1,N0,N5")
+
+    with pytest.raises(lp.LoopyError):
+        lp.tag_data_axes(knl, "out", "N1,N0,c")
+
+    knl = lp.tag_data_axes(knl, "out", "N1,N0,N2")
+    knl = lp.tag_inames(knl, dict(j="g.0", i="g.1"))
+
+    lp.auto_test_vs_ref(ref_knl, ctx, knl,
+            parameters=dict(n=20))
+
+
+def test_set_arg_order():
+    knl = lp.make_kernel(
+            "{ [i,j]: 0<=i,j<n }",
+            "out[i,j] = a[i]*b[j]")
+
+    knl = lp.set_argument_order(knl, "out,a,n,b")
+
+
+def test_affine_map_inames():
+    knl = lp.make_kernel(
+        "{[e, i,j,n]: 0<=e<E and 0<=i,j,n<N}",
+        "rhsQ[e, n+i, j] = rhsQ[e, n+i, j] - D[i, n]*x[i,j]")
+
+    knl = lp.affine_map_inames(knl,
+            "i", "i0",
+            "i0 = n+i")
+
+    print(knl)
+
+
+def test_precompute_confusing_subst_arguments(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i<n and 0<=j<5}",
+        """
+        D(i):=a[i+1]-a[i]
+        b[i,j] = D(j)
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+
+    ref_knl = knl
+
+    knl = lp.tag_inames(knl, dict(j="g.1"))
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+
+    from loopy.symbolic import get_dependencies
+    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
+    knl = lp.precompute(knl, "D")
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=12345))
+
+
+def test_precompute_nested_subst(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[i,j]: 0<=i<n and 0<=j<5}",
+        """
+        E:=a[i]
+        D:=E*E
+        b[i] = D
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+
+    ref_knl = knl
+
+    knl = lp.tag_inames(knl, dict(j="g.1"))
+    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+
+    from loopy.symbolic import get_dependencies
+    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
+    knl = lp.precompute(knl, "D", "i_inner")
+
+    # There's only one surviving 'E' rule.
+    assert len([
+        rule_name
+        for rule_name in knl.substitutions
+        if rule_name.startswith("E")]) == 1
+
+    # That rule should use the newly created prefetch inames,
+    # not the prior 'i_inner'
+    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(n=12345))
+
+
+def test_precompute_with_preexisting_inames(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
+        """
+        result[e,i] = sum(j, D1[i,j]*u[e,j])
+        result2[e,i] = sum(k, D2[i,k]*u[e,k])
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, {
+        "u": np.float32,
+        "D1": np.float32,
+        "D2": np.float32,
+        })
+
+    knl = lp.fix_parameters(knl, n=13)
+
+    ref_knl = knl
+
+    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
+    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")
+
+    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
+            precompute_inames="ii,jj")
+    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
+            precompute_inames="ii,jj")
+
+    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")
+
+    lp.auto_test_vs_ref(
+            ref_knl, ctx, knl,
+            parameters=dict(E=200))
+
+
+def test_precompute_with_preexisting_inames_fail():
+    knl = lp.make_kernel(
+        "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}",
+        """
+        result[e,i] = sum(j, D1[i,j]*u[e,j])
+        result2[e,i] = sum(k, D2[i,k]*u[e,k])
+        """)
+
+    knl = lp.add_and_infer_dtypes(knl, {
+        "u": np.float32,
+        "D1": np.float32,
+        "D2": np.float32,
+        })
+
+    knl = lp.fix_parameters(knl, n=13)
+
+    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
+    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")
+
+    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
+            precompute_inames="ii,jj")
+    with pytest.raises(lp.LoopyError):
+        lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
+                precompute_inames="ii,jj")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: foldmethod=marker