diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fb90b51291951d952bf30c24c3fa7c08030a53d5..e371192f98f3004c0c3949fe91a5ec48b9296862 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -3,6 +3,7 @@ Python 2.7 AMD CPU:
   - export PY_EXE=python2.7
   - export PYOPENCL_TEST=amd:pu
   - export EXTRA_INSTALL="numpy mako"
+  - export LOOPY_NO_CACHE=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -15,6 +16,7 @@ Python 2.6 AMD CPU:
   - export PY_EXE=python2.6
   - export PYOPENCL_TEST=amd:pu
   - export EXTRA_INSTALL="numpy mako"
+  - export LOOPY_NO_CACHE=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -28,6 +30,7 @@ Python 3.4 AMD CPU:
   - export PYOPENCL_TEST=amd:pu
   - export EXTRA_INSTALL="numpy mako"
   - export NO_DOCTESTS=1
+  - export LOOPY_NO_CACHE=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -40,6 +43,7 @@ Python 2.7 POCL:
   - export PY_EXE=python2.7
   - export PYOPENCL_TEST=portable
   - export EXTRA_INSTALL="numpy mako"
+  - export LOOPY_NO_CACHE=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
@@ -53,6 +57,7 @@ Python 2.7 with legacy PyOpenCL:
   - export PYOPENCL_TEST=amd:pu
   - export EXTRA_INSTALL="numpy mako"
   - export REQUIREMENTS_TXT="requirements-old-pyopencl.txt"
+  - export LOOPY_NO_CACHE=1
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
   tags:
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 391504b8cbe92185b5220eee2f78b5f7cae0fdd0..2df476e1f0efd732f89cd589f051f2a96c674b4e 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -1160,7 +1160,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     # If the rule survived past precompute() (i.e. some accesses fell outside
     # the footprint), get rid of it before moving on.
     if rule_name in new_kernel.substitutions:
-        return expand_subst(new_kernel, "id:"+rule_name)
+        return expand_subst(new_kernel, "... > id:"+rule_name)
     else:
         return new_kernel
 
diff --git a/loopy/buffer.py b/loopy/buffer.py
index fea87effcef266d9fbfb89363548a54d2d57455e..5677421de24ad2319ffcc7abf712f32c5b38132b 100644
--- a/loopy/buffer.py
+++ b/loopy/buffer.py
@@ -449,6 +449,9 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         from loopy.preprocess import prepare_for_caching
         buffer_array_cache[cache_key] = prepare_for_caching(kernel)
 
+    from loopy.kernel.tools import assign_automatic_axes
+    kernel = assign_automatic_axes(kernel)
+
     return kernel
 
 # vim: foldmethod=marker
diff --git a/loopy/context_matching.py b/loopy/context_matching.py
index a88e207002220a1be840114d71948869f566863d..45f9a4d74b353e5821d2a2ed3e410c44e3187eb8 100644
--- a/loopy/context_matching.py
+++ b/loopy/context_matching.py
@@ -98,7 +98,6 @@ class MatchExpressionBase(object):
         return not self.__eq__(other)
 
 
-
 class AllMatchExpression(MatchExpressionBase):
     def __call__(self, kernel, matchable):
         return True
diff --git a/loopy/fusion.py b/loopy/fusion.py
index 8845951ea293d5a0e66d457a4bcb8680db57623c..6974ee7fe863990bddc7480ee0c8abd50c49f9ab 100644
--- a/loopy/fusion.py
+++ b/loopy/fusion.py
@@ -32,6 +32,44 @@ from loopy.diagnostic import LoopyError
 from pymbolic import var
 
 
+def _apply_renames_in_exprs(kernel, var_renames):
+    """Return *kernel* mapped through the substitutions in *var_renames*."""
+    from loopy.symbolic import (
+            SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper)
+    from pymbolic.mapper.substitutor import make_subst_func
+    from loopy.context_matching import parse_stack_match
+
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    rename_mapper = RuleAwareSubstitutionMapper(
+            rule_mapping_context, make_subst_func(var_renames),
+            within=parse_stack_match(None))
+    return rename_mapper.map_kernel(kernel)
+
+
+def _rename_temporaries(kernel, suffix, all_identifiers):
+    """Rename temporaries of *kernel* whose names occur in *all_identifiers*."""
+    make_unique_name = kernel.get_var_name_generator()
+
+    renames = {}
+    renamed_temporaries = {}
+    for temp_var in six.itervalues(kernel.temporary_variables):
+        old_name = temp_var.name
+        new_name = old_name
+
+        # Names that do not occur in all_identifiers are kept as they are.
+        if old_name in all_identifiers:
+            new_name = make_unique_name(old_name+suffix)
+        if new_name != old_name:
+            renames[old_name] = var(new_name)
+
+        assert new_name not in renamed_temporaries
+        renamed_temporaries[new_name] = temp_var.copy(name=new_name)
+
+    return _apply_renames_in_exprs(
+            kernel.copy(temporary_variables=renamed_temporaries), renames)
+
+
 def _find_fusable_loop_domain_index(domain, other_domains):
     my_inames = set(domain.get_var_dict(dim_type.set))
 
@@ -168,22 +206,7 @@ def _fuse_two_kernels(knla, knlb):
 
     # }}}
 
-    # {{{ apply renames in kernel b
-
-    from loopy.symbolic import (
-            SubstitutionRuleMappingContext,
-            RuleAwareSubstitutionMapper)
-    from pymbolic.mapper.substitutor import make_subst_func
-    from loopy.context_matching import parse_stack_match
-
-    srmc = SubstitutionRuleMappingContext(
-            knlb.substitutions, knlb.get_var_name_generator())
-    subst_map = RuleAwareSubstitutionMapper(
-            srmc, make_subst_func(b_var_renames),
-            within=parse_stack_match(None))
-    knlb = subst_map.map_kernel(knlb)
-
-    # }}}
+    knlb = _apply_renames_in_exprs(knlb, b_var_renames)
 
     # {{{ fuse instructions
 
@@ -286,8 +309,41 @@ def _fuse_two_kernels(knla, knlb):
 # }}}
 
 
-def fuse_kernels(kernels):
+def fuse_kernels(kernels, suffixes=None):
     kernels = list(kernels)
+
+    if suffixes:
+        suffixes = list(suffixes)
+        if len(suffixes) != len(kernels):
+            raise ValueError("length of 'suffixes' must match "
+                    "length of 'kernels'")
+
+        # {{{ rename temporaries with suffixes
+
+        all_identifiers = [
+                kernel.all_variable_names()
+                for kernel in kernels]
+
+        from functools import reduce, partial
+        from operator import or_
+        merge_sets = partial(reduce, or_)
+
+        new_kernels = []
+        for i, (kernel, suffix) in enumerate(zip(kernels, suffixes)):
+            new_kernels.append(
+                    _rename_temporaries(
+                        kernel,
+                        suffix,
+                        merge_sets(
+                            all_identifiers[:i]
+                            +
+                            all_identifiers[i+1:])))
+
+        kernels = new_kernels
+        del new_kernels
+
+        # }}}
+
     result = kernels.pop(0)
     while kernels:
         result = _fuse_two_kernels(result, kernels.pop(0))
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 13afaa66d05b8dce89a2eb3f1f06e8b752dc5420..bbea32e228ad4508fe9de5c89b14d5c0202ac02f 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1012,7 +1012,7 @@ class LoopKernel(RecordWithoutPickling):
                 return
             printed_insn_ids.add(insn.id)
 
-            for dep_id in insn.insn_deps:
+            for dep_id in sorted(insn.insn_deps):
                 print_insn(kernel.id_to_insn[dep_id])
 
             if isinstance(insn, lp.ExpressionInstruction):
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index be6f32a9bf78fab306cb4acd3f45a1a4f2e66f34..769301ed99d8486a396dbf94b06bf361771596cb 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -578,4 +578,271 @@ def is_domain_dependent_on_inames(kernel, domain_index, inames):
 # }}}
 
 
+# {{{ assign automatic axes
+
+# {{{ rank inames by stride
+
+def get_auto_axis_iname_ranking_by_stride(kernel, insn):
+    from loopy.kernel.data import ImageArg, ValueArg
+    # Returns the inames of insn sorted by (aggregate stride, name), or None.
+    approximate_arg_values = {}
+    for arg in kernel.args:
+        if isinstance(arg, ValueArg):
+            if arg.approximately is not None:
+                approximate_arg_values[arg.name] = arg.approximately
+            else:
+                raise LoopyError("No approximate arg value specified for '%s'"
+                        % arg.name)
+
+    # {{{ find all array accesses in insn
+
+    from loopy.symbolic import ArrayAccessFinder
+    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))
+
+    from pymbolic.primitives import Subscript
+
+    if isinstance(insn.assignee, Subscript):
+        ary_acc_exprs.append(insn.assignee)
+
+    # }}}
+
+    # {{{ keep only accesses to kernel arguments that are not images
+
+    global_ary_acc_exprs = []
+
+    for aae in ary_acc_exprs:
+        ary_name = aae.aggregate.name
+        arg = kernel.arg_dict.get(ary_name)
+        if arg is None:
+            continue
+
+        if isinstance(arg, ImageArg):
+            continue
+
+        global_ary_acc_exprs.append(aae)
+
+    # }}}
+
+    # {{{ figure out automatic-axis inames
+
+    from loopy.kernel.data import AutoLocalIndexTagBase
+    auto_axis_inames = set(
+            iname
+            for iname in kernel.insn_inames(insn)
+            if isinstance(kernel.iname_to_tag.get(iname),
+                AutoLocalIndexTagBase))
+
+    # }}}
+
+    # {{{ figure out which iname should get mapped to local axis 0
+
+    # maps inames to "aggregate stride" (summed over all accesses)
+    aggregate_strides = {}
+
+    from loopy.symbolic import CoefficientCollector
+    from pymbolic.primitives import Variable
+
+    for aae in global_ary_acc_exprs:
+        index_expr = aae.index
+        if not isinstance(index_expr, tuple):
+            index_expr = (index_expr,)
+
+        ary_name = aae.aggregate.name
+        arg = kernel.arg_dict.get(ary_name)
+
+        if arg.dim_tags is None:
+            from warnings import warn
+            warn("Strides for '%s' are not known. Local axis assignment "
+                    "is likely suboptimal." % arg.name)
+            ary_strides = [1] * len(index_expr)
+        else:
+            ary_strides = []
+            from loopy.kernel.array import FixedStrideArrayDimTag
+            for dim_tag in arg.dim_tags:
+                if isinstance(dim_tag, FixedStrideArrayDimTag):
+                    ary_strides.append(dim_tag.stride)
+        # NOTE(review): non-fixed-stride dims are skipped, so zip below may misalign
+        # {{{ construct iname_to_stride_expr
+
+        iname_to_stride_expr = {}
+        for iexpr_i, stride in zip(index_expr, ary_strides):
+            if stride is None:
+                continue
+            coeffs = CoefficientCollector()(iexpr_i)
+            for var, coeff in six.iteritems(coeffs):
+                if (isinstance(var, Variable)
+                        and var.name in auto_axis_inames):
+                    # excludes '1', i.e. the constant; keep the smallest stride
+                    new_stride = coeff*stride
+                    old_stride = iname_to_stride_expr.get(var.name, None)
+                    if old_stride is None or new_stride < old_stride:
+                        iname_to_stride_expr[var.name] = new_stride
+
+        # }}}
+
+        from pymbolic import evaluate
+        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
+            stride = evaluate(stride_expr, approximate_arg_values)
+            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride
+
+    if aggregate_strides:
+        very_large_stride = int(np.iinfo(np.int32).max)  # for inames w/o a stride
+
+        return sorted((iname for iname in kernel.insn_inames(insn)),
+                key=lambda iname: (
+                    aggregate_strides.get(iname, very_large_stride),
+                    iname))
+    else:
+        return None
+
+    # }}}
+
+# }}}
+
+
+def assign_automatic_axes(kernel, axis=0, local_size=None):
+    logger.debug("%s: assign automatic axes" % kernel.name)
+    # Each call retags (or splits) at most one auto-tagged iname, then recurses.
+    from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag)
+
+    # Realize that at this point in time, axis lengths are already
+    # fixed. So we compute them once and pass them to our recursive
+    # copies.
+
+    if local_size is None:
+        _, local_size = kernel.get_grid_sizes_as_exprs(
+                ignore_auto=True)
+
+    # {{{ axis assignment helper function
+
+    def assign_axis(recursion_axis, iname, axis=None):
+        """Assign iname to local axis *axis* and start over by calling
+        the surrounding function assign_automatic_axes.
+
+        If *axis* is None, find a suitable axis automatically.
+        """
+        desired_length = kernel.get_constant_iname_length(iname)
+
+        if axis is None:
+            # {{{ find a suitable axis
+
+            shorter_possible_axes = []
+            test_axis = 0
+            while True:
+                if test_axis >= len(local_size):
+                    break
+                if test_axis in assigned_local_axes:
+                    test_axis += 1
+                    continue
+
+                if local_size[test_axis] < desired_length:
+                    shorter_possible_axes.append(test_axis)
+                    test_axis += 1
+                    continue
+                else:
+                    axis = test_axis
+                    break
+
+            # The loop above will find an unassigned local axis
+            # that has enough 'room' for the iname. In the same traversal,
+            # it also finds theoretically assignable axes that are shorter,
+            # in the variable shorter_possible_axes.
+
+            if axis is None and shorter_possible_axes:
+                # NOTE(review): ascending sort, [0] is the shortest (not longest)
+                shorter_possible_axes.sort(key=lambda ax: local_size[ax])
+                axis = shorter_possible_axes[0]
+
+            # }}}
+
+        if axis is None:
+            new_tag = None
+        else:
+            new_tag = LocalIndexTag(axis)
+            if desired_length > local_size[axis]:
+                from loopy import split_iname
+
+                # Don't be tempted to switch the outer tag to unroll--this may
+                # generate tons of code on some examples.
+
+                return assign_automatic_axes(
+                        split_iname(kernel, iname, inner_length=local_size[axis],
+                            outer_tag=None, inner_tag=new_tag,
+                            do_tagged_check=False),
+                        axis=recursion_axis, local_size=local_size)
+
+        if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase):
+            raise LoopyError("trying to reassign '%s'" % iname)
+
+        new_iname_to_tag = kernel.iname_to_tag.copy()
+        new_iname_to_tag[iname] = new_tag
+        return assign_automatic_axes(kernel.copy(iname_to_tag=new_iname_to_tag),
+                axis=recursion_axis, local_size=local_size)
+
+    # }}}
+
+    # {{{ main assignment loop
+
+    # assignment proceeds in one phase per axis, each time assigning the
+    # smallest-stride available iname to the current axis
+
+    import loopy as lp
+
+    for insn in kernel.instructions:
+        if not isinstance(insn, lp.ExpressionInstruction):
+            continue
+
+        auto_axis_inames = [
+                iname
+                for iname in kernel.insn_inames(insn)
+                if isinstance(kernel.iname_to_tag.get(iname),
+                    AutoLocalIndexTagBase)]
+
+        if not auto_axis_inames:
+            continue
+
+        assigned_local_axes = set()
+
+        for iname in kernel.insn_inames(insn):
+            tag = kernel.iname_to_tag.get(iname)
+            if isinstance(tag, LocalIndexTag):
+                assigned_local_axes.add(tag.axis)
+
+        if axis < len(local_size):
+            # "valid" pass: try to assign a given axis
+
+            if axis not in assigned_local_axes:
+                iname_ranking = get_auto_axis_iname_ranking_by_stride(kernel, insn)
+                if iname_ranking is not None:
+                    for iname in iname_ranking:
+                        prev_tag = kernel.iname_to_tag.get(iname)
+                        if isinstance(prev_tag, AutoLocalIndexTagBase):
+                            return assign_axis(axis, iname, axis)
+
+        else:
+            # "invalid" pass: There are still unassigned axes after the
+            #  numbered "valid" passes--assign the remainder by length.
+
+            # NOTE(review): descending sort + pop() takes the *shortest* iname
+            auto_axis_inames.sort(
+                            key=lambda iname: (kernel.get_constant_iname_length(iname), iname),
+                            reverse=True)
+
+            if auto_axis_inames:
+                return assign_axis(axis, auto_axis_inames.pop())
+
+    # }}}
+
+    # We've seen all instructions and not punted to recursion/restart because
+    # of a new axis assignment.
+
+    if axis >= len(local_size):
+        return kernel
+    else:
+        return assign_automatic_axes(kernel, axis=axis+1,
+                local_size=local_size)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 9a4330658515c0eccfc6566d1a478adb6afbecf2..d1176bc4af1d99b62860364702549b9a1b6b7eda 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -300,27 +300,27 @@ def parse_reduction_op(name):
 # }}}
 
 
-def reduction_function_mangler(target, func_id, arg_dtypes):
+def reduction_function_mangler(kernel, func_id, arg_dtypes):
     if isinstance(func_id, ArgExtFunction):
         from loopy.target.opencl import OpenCLTarget
-        if not isinstance(target, OpenCLTarget):
+        if not isinstance(kernel.target, OpenCLTarget):
             raise LoopyError("only OpenCL supported for now")
 
         op = func_id.reduction_op
-        return (op.result_dtype(target, func_id.scalar_dtype, func_id.inames),
+        return (op.result_dtype(kernel.target, func_id.scalar_dtype, func_id.inames),
                 "%s_%s" % (op.prefix(func_id.scalar_dtype), func_id.name))
 
     return None
 
 
-def reduction_preamble_generator(target, seen_dtypes, seen_functions):
+def reduction_preamble_generator(kernel, seen_dtypes, seen_functions):
     from loopy.target.opencl import OpenCLTarget
 
     for func in seen_functions:
         if isinstance(func.name, ArgExtFunction):
-            if not isinstance(target, OpenCLTarget):
+            if not isinstance(kernel.target, OpenCLTarget):
                 raise LoopyError("only OpenCL supported for now")
 
-            yield get_argext_preamble(target, func.name)
+            yield get_argext_preamble(kernel.target, func.name)
 
 # vim: fdm=marker
diff --git a/loopy/precompute.py b/loopy/precompute.py
index ee7f815cf90cd2e870af4b153435083f264503e3..18d066397547370e8ce70a9a8721e625a3fd713e 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -1,6 +1,4 @@
 from __future__ import division, absolute_import, print_function
-import six
-from six.moves import range, zip
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -25,6 +23,8 @@ THE SOFTWARE.
 """
 
 
+import six
+from six.moves import range, zip
 import islpy as isl
 from loopy.symbolic import (get_dependencies,
         RuleAwareIdentityMapper, RuleAwareSubstitutionMapper,
@@ -296,9 +296,14 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
         (such as size, type) are checked (and updated, if possible) to match
         its use.
     :arg precompute_inames:
+        A tuple of inames to be used to carry out the precomputation.
         If the specified inames do not already exist, they will be
         created. If they do already exist, their loop domain is verified
-        against the one required for this precomputation.
+        against the one required for this precomputation. This tuple may
+        be shorter than the (provided or automatically found) *storage_axes*
+        tuple, in which case names will be automatically created.
+        May also equivalently be a comma-separated string.
+
     :arg compute_insn_id: The ID of the instruction performing the precomputation.
 
     If `storage_axes` is not specified, it defaults to the arrangement
@@ -440,7 +445,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     # {{{ use given / find new storage_axes
 
     # extra axes made necessary because they don't occur in the arguments
-    extra_storage_axes = sweep_inames_set - expanding_usage_arg_deps
+    extra_storage_axes = set(sweep_inames_set - expanding_usage_arg_deps)
 
     from loopy.symbolic import SubstitutionRuleExpander
     submap = SubstitutionRuleExpander(kernel.substitutions)
@@ -456,9 +461,27 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     new_iname_to_tag = {}
 
     if storage_axes is None:
-        storage_axes = (
-                list(extra_storage_axes)
-                + list(range(len(subst.arguments))))
+        storage_axes = []
+
+        # Add sweep_inames (in given--rather than arbitrary--order) to
+        # storage_axes *if* they are part of extra_storage_axes.
+        for iname in sweep_inames:
+            if iname in extra_storage_axes:
+                extra_storage_axes.remove(iname)
+                storage_axes.append(iname)
+
+        if extra_storage_axes:
+            if (precompute_inames is not None
+                    and len(storage_axes) < len(precompute_inames)):
+                raise LoopyError("must specify a sufficient number of "
+                        "storage_axes to uniquely determine the meaning "
+                        "of the given precompute_inames. (%d storage_axes "
+                        "needed)" % len(precompute_inames))
+            storage_axes.extend(sorted(extra_storage_axes))
+
+        storage_axes.extend(range(len(subst.arguments)))
+
+    del extra_storage_axes
 
     prior_storage_axis_name_dict = {}
 
@@ -814,6 +837,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
     # }}}
 
     from loopy import tag_inames
-    return tag_inames(kernel, new_iname_to_tag)
+    kernel = tag_inames(kernel, new_iname_to_tag)
+
+    from loopy.kernel.tools import assign_automatic_axes
+    kernel = assign_automatic_axes(kernel)
+
+    return kernel
 
 # vim: foldmethod=marker
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 14940b529378a2caac44c2bf9f45180f51952ebc..b8330fe036b1d7d15dd6768aa9688e25aa35a8aa 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -24,7 +24,6 @@ THE SOFTWARE.
 
 
 import six
-import numpy as np
 from loopy.diagnostic import (
         LoopyError, WriteRaceConditionWarning, warn,
         LoopyAdvisory, DependencyTypeInferenceFailure)
@@ -763,269 +762,6 @@ def limit_boostability(kernel):
 # }}}
 
 
-# {{{ rank inames by stride
-
-def get_auto_axis_iname_ranking_by_stride(kernel, insn):
-    from loopy.kernel.data import ImageArg, ValueArg
-
-    approximate_arg_values = {}
-    for arg in kernel.args:
-        if isinstance(arg, ValueArg):
-            if arg.approximately is not None:
-                approximate_arg_values[arg.name] = arg.approximately
-            else:
-                raise LoopyError("No approximate arg value specified for '%s'"
-                        % arg.name)
-
-    # {{{ find all array accesses in insn
-
-    from loopy.symbolic import ArrayAccessFinder
-    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))
-
-    from pymbolic.primitives import Subscript
-
-    if isinstance(insn.assignee, Subscript):
-        ary_acc_exprs.append(insn.assignee)
-
-    # }}}
-
-    # {{{ filter array accesses to only the global ones
-
-    global_ary_acc_exprs = []
-
-    for aae in ary_acc_exprs:
-        ary_name = aae.aggregate.name
-        arg = kernel.arg_dict.get(ary_name)
-        if arg is None:
-            continue
-
-        if isinstance(arg, ImageArg):
-            continue
-
-        global_ary_acc_exprs.append(aae)
-
-    # }}}
-
-    # {{{ figure out automatic-axis inames
-
-    from loopy.kernel.data import AutoLocalIndexTagBase
-    auto_axis_inames = set(
-            iname
-            for iname in kernel.insn_inames(insn)
-            if isinstance(kernel.iname_to_tag.get(iname),
-                AutoLocalIndexTagBase))
-
-    # }}}
-
-    # {{{ figure out which iname should get mapped to local axis 0
-
-    # maps inames to "aggregate stride"
-    aggregate_strides = {}
-
-    from loopy.symbolic import CoefficientCollector
-    from pymbolic.primitives import Variable
-
-    for aae in global_ary_acc_exprs:
-        index_expr = aae.index
-        if not isinstance(index_expr, tuple):
-            index_expr = (index_expr,)
-
-        ary_name = aae.aggregate.name
-        arg = kernel.arg_dict.get(ary_name)
-
-        if arg.dim_tags is None:
-            from warnings import warn
-            warn("Strides for '%s' are not known. Local axis assignment "
-                    "is likely suboptimal." % arg.name)
-            ary_strides = [1] * len(index_expr)
-        else:
-            ary_strides = []
-            from loopy.kernel.array import FixedStrideArrayDimTag
-            for dim_tag in arg.dim_tags:
-                if isinstance(dim_tag, FixedStrideArrayDimTag):
-                    ary_strides.append(dim_tag.stride)
-
-        # {{{ construct iname_to_stride_expr
-
-        iname_to_stride_expr = {}
-        for iexpr_i, stride in zip(index_expr, ary_strides):
-            if stride is None:
-                continue
-            coeffs = CoefficientCollector()(iexpr_i)
-            for var, coeff in six.iteritems(coeffs):
-                if (isinstance(var, Variable)
-                        and var.name in auto_axis_inames):
-                    # excludes '1', i.e.  the constant
-                    new_stride = coeff*stride
-                    old_stride = iname_to_stride_expr.get(var.name, None)
-                    if old_stride is None or new_stride < old_stride:
-                        iname_to_stride_expr[var.name] = new_stride
-
-        # }}}
-
-        from pymbolic import evaluate
-        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
-            stride = evaluate(stride_expr, approximate_arg_values)
-            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride
-
-    if aggregate_strides:
-        very_large_stride = np.iinfo(np.int32).max
-
-        return sorted((iname for iname in kernel.insn_inames(insn)),
-                key=lambda iname: aggregate_strides.get(iname, very_large_stride))
-    else:
-        return None
-
-    # }}}
-
-# }}}
-
-
-# {{{ assign automatic axes
-
-def assign_automatic_axes(kernel, axis=0, local_size=None):
-    logger.debug("%s: assign automatic axes" % kernel.name)
-
-    from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag)
-
-    # Realize that at this point in time, axis lengths are already
-    # fixed. So we compute them once and pass them to our recursive
-    # copies.
-
-    if local_size is None:
-        _, local_size = kernel.get_grid_sizes_as_exprs(
-                ignore_auto=True)
-
-    # {{{ axis assignment helper function
-
-    def assign_axis(recursion_axis, iname, axis=None):
-        """Assign iname to local axis *axis* and start over by calling
-        the surrounding function assign_automatic_axes.
-
-        If *axis* is None, find a suitable axis automatically.
-        """
-        desired_length = kernel.get_constant_iname_length(iname)
-
-        if axis is None:
-            # {{{ find a suitable axis
-
-            shorter_possible_axes = []
-            test_axis = 0
-            while True:
-                if test_axis >= len(local_size):
-                    break
-                if test_axis in assigned_local_axes:
-                    test_axis += 1
-                    continue
-
-                if local_size[test_axis] < desired_length:
-                    shorter_possible_axes.append(test_axis)
-                    test_axis += 1
-                    continue
-                else:
-                    axis = test_axis
-                    break
-
-            # The loop above will find an unassigned local axis
-            # that has enough 'room' for the iname. In the same traversal,
-            # it also finds theoretically assignable axes that are shorter,
-            # in the variable shorter_possible_axes.
-
-            if axis is None and shorter_possible_axes:
-                # sort as longest first
-                shorter_possible_axes.sort(key=lambda ax: local_size[ax])
-                axis = shorter_possible_axes[0]
-
-            # }}}
-
-        if axis is None:
-            new_tag = None
-        else:
-            new_tag = LocalIndexTag(axis)
-            if desired_length > local_size[axis]:
-                from loopy import split_iname
-
-                # Don't be tempted to switch the outer tag to unroll--this may
-                # generate tons of code on some examples.
-
-                return assign_automatic_axes(
-                        split_iname(kernel, iname, inner_length=local_size[axis],
-                            outer_tag=None, inner_tag=new_tag,
-                            do_tagged_check=False),
-                        axis=recursion_axis, local_size=local_size)
-
-        if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase):
-            raise LoopyError("trying to reassign '%s'" % iname)
-
-        new_iname_to_tag = kernel.iname_to_tag.copy()
-        new_iname_to_tag[iname] = new_tag
-        return assign_automatic_axes(kernel.copy(iname_to_tag=new_iname_to_tag),
-                axis=recursion_axis, local_size=local_size)
-
-    # }}}
-
-    # {{{ main assignment loop
-
-    # assignment proceeds in one phase per axis, each time assigning the
-    # smallest-stride available iname to the current axis
-
-    import loopy as lp
-
-    for insn in kernel.instructions:
-        if not isinstance(insn, lp.ExpressionInstruction):
-            continue
-
-        auto_axis_inames = [
-                iname
-                for iname in kernel.insn_inames(insn)
-                if isinstance(kernel.iname_to_tag.get(iname),
-                    AutoLocalIndexTagBase)]
-
-        if not auto_axis_inames:
-            continue
-
-        assigned_local_axes = set()
-
-        for iname in kernel.insn_inames(insn):
-            tag = kernel.iname_to_tag.get(iname)
-            if isinstance(tag, LocalIndexTag):
-                assigned_local_axes.add(tag.axis)
-
-        if axis < len(local_size):
-            # "valid" pass: try to assign a given axis
-
-            if axis not in assigned_local_axes:
-                iname_ranking = get_auto_axis_iname_ranking_by_stride(kernel, insn)
-                if iname_ranking is not None:
-                    for iname in iname_ranking:
-                        prev_tag = kernel.iname_to_tag.get(iname)
-                        if isinstance(prev_tag, AutoLocalIndexTagBase):
-                            return assign_axis(axis, iname, axis)
-
-        else:
-            # "invalid" pass: There are still unassigned axis after the
-            #  numbered "valid" passes--assign the remainder by length.
-
-            # assign longest auto axis inames first
-            auto_axis_inames.sort(key=kernel.get_constant_iname_length, reverse=True)
-
-            if auto_axis_inames:
-                return assign_axis(axis, auto_axis_inames.pop())
-
-    # }}}
-
-    # We've seen all instructions and not punted to recursion/restart because
-    # of a new axis assignment.
-
-    if axis >= len(local_size):
-        return kernel
-    else:
-        return assign_automatic_axes(kernel, axis=axis+1,
-                local_size=local_size)
-
-# }}}
-
-
 preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
@@ -1058,6 +794,17 @@ def preprocess_kernel(kernel, device=None):
 
     logger.info("%s: preprocess start" % kernel.name)
 
+    # {{{ check that there are no l.auto-tagged inames
+
+    from loopy.kernel.data import AutoLocalIndexTagBase
+    for iname, tag in six.iteritems(kernel.iname_to_tag):
+        if (isinstance(tag, AutoLocalIndexTagBase)
+                 and iname in kernel.all_inames()):
+            raise LoopyError("kernel with automatically-assigned "
+                    "local axes passed to preprocessing")
+
+    # }}}
+
     from loopy.subst import expand_subst
     kernel = expand_subst(kernel)
 
@@ -1086,7 +833,6 @@ def preprocess_kernel(kernel, device=None):
 
     kernel = duplicate_private_temporaries_for_ilp_and_vec(kernel)
     kernel = mark_local_temporaries(kernel)
-    kernel = assign_automatic_axes(kernel)
     kernel = find_boostability(kernel)
     kernel = limit_boostability(kernel)
 
diff --git a/loopy/schedule.py b/loopy/schedule.py
index f22b95d45275d54d473a97a8f7a0dfde69555d6b..f276e2f1244b41429ff434cd7fa1ad5c32d7563a 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -1039,7 +1039,7 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
     raw = tgt_read & src_write
     war = tgt_write & src_read
 
-    for var_name in raw | war:
+    for var_name in sorted(raw | war):
         return DependencyRecord(
                 source=source,
                 target=target,
@@ -1050,7 +1050,7 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind):
     if source is target:
         return None
 
-    for var_name in waw:
+    for var_name in sorted(waw):
         return DependencyRecord(
                 source=source,
                 target=target,
@@ -1213,7 +1213,7 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0):
 
             # (for leading (before-first-barrier) bit of loop body)
             for insn_id in insn_ids_from_schedule(subresult[:first_barrier_index]):
-                search_set = candidates
+                search_set = sorted(candidates)
 
                 for dep_src_insn_id in search_set:
                     dep = get_barrier_needing_dependency(
@@ -1252,7 +1252,7 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0):
         elif isinstance(sched_item, RunInstruction):
             i += 1
 
-            search_set = candidates
+            search_set = sorted(candidates)
 
             for dep_src_insn_id in search_set:
                 dep = get_barrier_needing_dependency(
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 3311d23165b5f34f16c0c745218fd7da3a0ad2ac..d65440d57005183b2756040473080337257c5b52 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -153,7 +153,8 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase):
         if not isinstance(other, type(expr)):
             return self.treat_mismatch(expr, other, unis)
         if (expr.inames != other.inames
-                or type(expr.operation) != type(other.operation)):
+                or type(expr.operation) != type(other.operation)  # noqa
+                ):
             return []
 
         return self.rec(expr.expr, other.expr, unis)
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index ca71c21269add662dc1ef19a4437c9f297ec6477..628e5d9ac14714f58dd6b68e3f8b605880b1f19b 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -102,7 +102,9 @@ class CTarget(TargetBase):
                 sub_tp, sub_decl = self.subdecl.get_decl_pair()
                 return sub_tp, ("*const restrict %s" % sub_decl)
 
-        for tv in six.itervalues(kernel.temporary_variables):
+        for tv in sorted(
+                six.itervalues(kernel.temporary_variables),
+                key=lambda tv: tv.name):
             decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype)
 
             if not tv.base_storage:
@@ -166,7 +168,7 @@ class CTarget(TargetBase):
                             idi.dtype.itemsize
                             * product(si for si in idi.shape))
 
-        for bs_name, bs_sizes in six.iteritems(base_storage_sizes):
+        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
             bs_var_decl = POD(self, np.int8, bs_name)
             if base_storage_to_is_local[bs_name]:
                 bs_var_decl = CLLocal(bs_var_decl)
diff --git a/loopy/version.py b/loopy/version.py
index 9598697b09afc091741cea5d8da37917dd88ce9d..389659b86ac35b00cf14267b33b17b0d841d19c1 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v11-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v12-islpy%s" % _islpy_version
diff --git a/test/test_nbody.py b/test/test_nbody.py
index 65e5658b53f692d1142a8112849111e98014126d..540a8f5c3fac4be02bd4bf53f8d5cc92a195c64a 100644
--- a/test/test_nbody.py
+++ b/test/test_nbody.py
@@ -74,17 +74,17 @@ def test_nbody(ctx_factory):
                 outer_tag="g.0", inner_tag="l.0")
         knl = lp.split_iname(knl, "j", 256)
         knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
-                ["x_fetch_j", "x_fetch_k"])
+                ["x_fetch_j", "x_fetch_k"], default_tag=None)
+        knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0"))
         knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
-        knl = lp.tag_inames(knl, dict(x_fetch_k="unr"))
         knl = lp.set_loop_priority(knl, ["j_outer", "j_inner"])
         return knl
 
     n = 3000
 
     for variant in [
-            variant_1,
-            variant_cpu,
+            #variant_1,
+            #variant_cpu,
             variant_gpu
             ]:
         variant_knl = variant(knl)