diff --git a/loopy/buffer.py b/loopy/buffer.py
index fea87effcef266d9fbfb89363548a54d2d57455e..5677421de24ad2319ffcc7abf712f32c5b38132b 100644
--- a/loopy/buffer.py
+++ b/loopy/buffer.py
@@ -449,6 +449,9 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         from loopy.preprocess import prepare_for_caching
         buffer_array_cache[cache_key] = prepare_for_caching(kernel)
 
+    from loopy.kernel.tools import assign_automatic_axes
+    kernel = assign_automatic_axes(kernel)
+
     return kernel
 
 # vim: foldmethod=marker
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index be6f32a9bf78fab306cb4acd3f45a1a4f2e66f34..f6adfeebf79aef2e5f3b0ecc782ef15e0c0a2192 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -578,4 +578,269 @@ def is_domain_dependent_on_inames(kernel, domain_index, inames):
 # }}}
 
 
+# {{{ assign automatic axes
+
+# {{{ rank inames by stride
+
+def get_auto_axis_iname_ranking_by_stride(kernel, insn):
+    from loopy.kernel.data import ImageArg, ValueArg
+
+    approximate_arg_values = {}
+    for arg in kernel.args:
+        if isinstance(arg, ValueArg):
+            if arg.approximately is not None:
+                approximate_arg_values[arg.name] = arg.approximately
+            else:
+                raise LoopyError("No approximate arg value specified for '%s'"
+                        % arg.name)
+
+    # {{{ find all array accesses in insn
+
+    from loopy.symbolic import ArrayAccessFinder
+    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))
+
+    from pymbolic.primitives import Subscript
+
+    if isinstance(insn.assignee, Subscript):
+        ary_acc_exprs.append(insn.assignee)
+
+    # }}}
+
+    # {{{ filter array accesses to only the global ones
+
+    global_ary_acc_exprs = []
+
+    for aae in ary_acc_exprs:
+        ary_name = aae.aggregate.name
+        arg = kernel.arg_dict.get(ary_name)
+        if arg is None:
+            continue
+
+        if isinstance(arg, ImageArg):
+            continue
+
+        global_ary_acc_exprs.append(aae)
+
+    # }}}
+
+    # {{{ figure out automatic-axis inames
+
+    from loopy.kernel.data import AutoLocalIndexTagBase
+    auto_axis_inames = set(
+            iname
+            for iname in kernel.insn_inames(insn)
+            if isinstance(kernel.iname_to_tag.get(iname),
+                AutoLocalIndexTagBase))
+
+    # }}}
+
+    # {{{ figure out which iname should get mapped to local axis 0
+
+    # maps inames to "aggregate stride"
+    aggregate_strides = {}
+
+    from loopy.symbolic import CoefficientCollector
+    from pymbolic.primitives import Variable
+
+    for aae in global_ary_acc_exprs:
+        index_expr = aae.index
+        if not isinstance(index_expr, tuple):
+            index_expr = (index_expr,)
+
+        ary_name = aae.aggregate.name
+        arg = kernel.arg_dict.get(ary_name)
+
+        if arg.dim_tags is None:
+            from warnings import warn
+            warn("Strides for '%s' are not known. Local axis assignment "
+                    "is likely suboptimal." % arg.name)
+            ary_strides = [1] * len(index_expr)
+        else:
+            ary_strides = []
+            from loopy.kernel.array import FixedStrideArrayDimTag
+            for dim_tag in arg.dim_tags:
+                if isinstance(dim_tag, FixedStrideArrayDimTag):
+                    ary_strides.append(dim_tag.stride)
+
+        # {{{ construct iname_to_stride_expr
+
+        iname_to_stride_expr = {}
+        for iexpr_i, stride in zip(index_expr, ary_strides):
+            if stride is None:
+                continue
+            coeffs = CoefficientCollector()(iexpr_i)
+            for var, coeff in six.iteritems(coeffs):
+                if (isinstance(var, Variable)
+                        and var.name in auto_axis_inames):
+                    # excludes '1', i.e.  the constant
+                    new_stride = coeff*stride
+                    old_stride = iname_to_stride_expr.get(var.name, None)
+                    if old_stride is None or new_stride < old_stride:
+                        iname_to_stride_expr[var.name] = new_stride
+
+        # }}}
+
+        from pymbolic import evaluate
+        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
+            stride = evaluate(stride_expr, approximate_arg_values)
+            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride
+
+    if aggregate_strides:
+        very_large_stride = np.iinfo(np.int32).max
+
+        return sorted((iname for iname in kernel.insn_inames(insn)),
+                key=lambda iname: (
+                    aggregate_strides.get(iname, very_large_stride),
+                    iname))
+    else:
+        return None
+
+    # }}}
+
+# }}}
+
+
+def assign_automatic_axes(kernel, axis=0, local_size=None):
+    logger.debug("%s: assign automatic axes" % kernel.name)
+
+    from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag)
+
+    # Realize that at this point in time, axis lengths are already
+    # fixed. So we compute them once and pass them to our recursive
+    # copies.
+
+    if local_size is None:
+        _, local_size = kernel.get_grid_sizes_as_exprs(
+                ignore_auto=True)
+
+    # {{{ axis assignment helper function
+
+    def assign_axis(recursion_axis, iname, axis=None):
+        """Assign iname to local axis *axis* and start over by calling
+        the surrounding function assign_automatic_axes.
+
+        If *axis* is None, find a suitable axis automatically.
+        """
+        desired_length = kernel.get_constant_iname_length(iname)
+
+        if axis is None:
+            # {{{ find a suitable axis
+
+            shorter_possible_axes = []
+            test_axis = 0
+            while True:
+                if test_axis >= len(local_size):
+                    break
+                if test_axis in assigned_local_axes:
+                    test_axis += 1
+                    continue
+
+                if local_size[test_axis] < desired_length:
+                    shorter_possible_axes.append(test_axis)
+                    test_axis += 1
+                    continue
+                else:
+                    axis = test_axis
+                    break
+
+            # The loop above will find an unassigned local axis
+            # that has enough 'room' for the iname. In the same traversal,
+            # it also finds theoretically assignable axes that are shorter,
+            # in the variable shorter_possible_axes.
+
+            if axis is None and shorter_possible_axes:
+                # sort as longest first
+                shorter_possible_axes.sort(key=lambda ax: local_size[ax])
+                axis = shorter_possible_axes[0]
+
+            # }}}
+
+        if axis is None:
+            new_tag = None
+        else:
+            new_tag = LocalIndexTag(axis)
+            if desired_length > local_size[axis]:
+                from loopy import split_iname
+
+                # Don't be tempted to switch the outer tag to unroll--this may
+                # generate tons of code on some examples.
+
+                return assign_automatic_axes(
+                        split_iname(kernel, iname, inner_length=local_size[axis],
+                            outer_tag=None, inner_tag=new_tag,
+                            do_tagged_check=False),
+                        axis=recursion_axis, local_size=local_size)
+
+        if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase):
+            raise LoopyError("trying to reassign '%s'" % iname)
+
+        new_iname_to_tag = kernel.iname_to_tag.copy()
+        new_iname_to_tag[iname] = new_tag
+        return assign_automatic_axes(kernel.copy(iname_to_tag=new_iname_to_tag),
+                axis=recursion_axis, local_size=local_size)
+
+    # }}}
+
+    # {{{ main assignment loop
+
+    # assignment proceeds in one phase per axis, each time assigning the
+    # smallest-stride available iname to the current axis
+
+    import loopy as lp
+
+    for insn in kernel.instructions:
+        if not isinstance(insn, lp.ExpressionInstruction):
+            continue
+
+        auto_axis_inames = [
+                iname
+                for iname in kernel.insn_inames(insn)
+                if isinstance(kernel.iname_to_tag.get(iname),
+                    AutoLocalIndexTagBase)]
+
+        if not auto_axis_inames:
+            continue
+
+        assigned_local_axes = set()
+
+        for iname in kernel.insn_inames(insn):
+            tag = kernel.iname_to_tag.get(iname)
+            if isinstance(tag, LocalIndexTag):
+                assigned_local_axes.add(tag.axis)
+
+        if axis < len(local_size):
+            # "valid" pass: try to assign a given axis
+
+            if axis not in assigned_local_axes:
+                iname_ranking = get_auto_axis_iname_ranking_by_stride(kernel, insn)
+                if iname_ranking is not None:
+                    for iname in iname_ranking:
+                        prev_tag = kernel.iname_to_tag.get(iname)
+                        if isinstance(prev_tag, AutoLocalIndexTagBase):
+                            return assign_axis(axis, iname, axis)
+
+        else:
+            # "invalid" pass: There are still unassigned axis after the
+            #  numbered "valid" passes--assign the remainder by length.
+
+            # assign longest auto axis inames first
+            auto_axis_inames.sort(key=kernel.get_constant_iname_length, reverse=True)
+
+            if auto_axis_inames:
+                return assign_axis(axis, auto_axis_inames.pop())
+
+    # }}}
+
+    # We've seen all instructions and not punted to recursion/restart because
+    # of a new axis assignment.
+
+    if axis >= len(local_size):
+        return kernel
+    else:
+        return assign_automatic_axes(kernel, axis=axis+1,
+                local_size=local_size)
+
+# }}}
+
+
 # vim: foldmethod=marker
diff --git a/loopy/precompute.py b/loopy/precompute.py
index c0e1af93c21cc71a35e164e205ec0917c29f0e64..edd17fa348a54f2399d39f9cf8c130fabd0aa162 100644
--- a/loopy/precompute.py
+++ b/loopy/precompute.py
@@ -477,7 +477,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
                         "storage_axes to uniquely determine the meaning "
                         "of the given precompute_inames. (%d storage_axes "
                         "needed)" % len(precompute_inames))
-            storage_axes.extend(extra_storage_axes)
+            storage_axes.extend(sorted(extra_storage_axes))
 
         storage_axes.extend(range(len(subst.arguments)))
 
@@ -836,7 +836,13 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # }}}
 
+    print(new_iname_to_tag)
     from loopy import tag_inames
-    return tag_inames(kernel, new_iname_to_tag)
+    kernel = tag_inames(kernel, new_iname_to_tag)
+
+    from loopy.kernel.tools import assign_automatic_axes
+    kernel = assign_automatic_axes(kernel)
+
+    return kernel
 
 # vim: foldmethod=marker
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 14940b529378a2caac44c2bf9f45180f51952ebc..b8330fe036b1d7d15dd6768aa9688e25aa35a8aa 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -24,7 +24,6 @@ THE SOFTWARE.
 
 
 import six
-import numpy as np
 from loopy.diagnostic import (
         LoopyError, WriteRaceConditionWarning, warn,
         LoopyAdvisory, DependencyTypeInferenceFailure)
@@ -763,269 +762,6 @@ def limit_boostability(kernel):
 # }}}
 
 
-# {{{ rank inames by stride
-
-def get_auto_axis_iname_ranking_by_stride(kernel, insn):
-    from loopy.kernel.data import ImageArg, ValueArg
-
-    approximate_arg_values = {}
-    for arg in kernel.args:
-        if isinstance(arg, ValueArg):
-            if arg.approximately is not None:
-                approximate_arg_values[arg.name] = arg.approximately
-            else:
-                raise LoopyError("No approximate arg value specified for '%s'"
-                        % arg.name)
-
-    # {{{ find all array accesses in insn
-
-    from loopy.symbolic import ArrayAccessFinder
-    ary_acc_exprs = list(ArrayAccessFinder()(insn.expression))
-
-    from pymbolic.primitives import Subscript
-
-    if isinstance(insn.assignee, Subscript):
-        ary_acc_exprs.append(insn.assignee)
-
-    # }}}
-
-    # {{{ filter array accesses to only the global ones
-
-    global_ary_acc_exprs = []
-
-    for aae in ary_acc_exprs:
-        ary_name = aae.aggregate.name
-        arg = kernel.arg_dict.get(ary_name)
-        if arg is None:
-            continue
-
-        if isinstance(arg, ImageArg):
-            continue
-
-        global_ary_acc_exprs.append(aae)
-
-    # }}}
-
-    # {{{ figure out automatic-axis inames
-
-    from loopy.kernel.data import AutoLocalIndexTagBase
-    auto_axis_inames = set(
-            iname
-            for iname in kernel.insn_inames(insn)
-            if isinstance(kernel.iname_to_tag.get(iname),
-                AutoLocalIndexTagBase))
-
-    # }}}
-
-    # {{{ figure out which iname should get mapped to local axis 0
-
-    # maps inames to "aggregate stride"
-    aggregate_strides = {}
-
-    from loopy.symbolic import CoefficientCollector
-    from pymbolic.primitives import Variable
-
-    for aae in global_ary_acc_exprs:
-        index_expr = aae.index
-        if not isinstance(index_expr, tuple):
-            index_expr = (index_expr,)
-
-        ary_name = aae.aggregate.name
-        arg = kernel.arg_dict.get(ary_name)
-
-        if arg.dim_tags is None:
-            from warnings import warn
-            warn("Strides for '%s' are not known. Local axis assignment "
-                    "is likely suboptimal." % arg.name)
-            ary_strides = [1] * len(index_expr)
-        else:
-            ary_strides = []
-            from loopy.kernel.array import FixedStrideArrayDimTag
-            for dim_tag in arg.dim_tags:
-                if isinstance(dim_tag, FixedStrideArrayDimTag):
-                    ary_strides.append(dim_tag.stride)
-
-        # {{{ construct iname_to_stride_expr
-
-        iname_to_stride_expr = {}
-        for iexpr_i, stride in zip(index_expr, ary_strides):
-            if stride is None:
-                continue
-            coeffs = CoefficientCollector()(iexpr_i)
-            for var, coeff in six.iteritems(coeffs):
-                if (isinstance(var, Variable)
-                        and var.name in auto_axis_inames):
-                    # excludes '1', i.e.  the constant
-                    new_stride = coeff*stride
-                    old_stride = iname_to_stride_expr.get(var.name, None)
-                    if old_stride is None or new_stride < old_stride:
-                        iname_to_stride_expr[var.name] = new_stride
-
-        # }}}
-
-        from pymbolic import evaluate
-        for iname, stride_expr in six.iteritems(iname_to_stride_expr):
-            stride = evaluate(stride_expr, approximate_arg_values)
-            aggregate_strides[iname] = aggregate_strides.get(iname, 0) + stride
-
-    if aggregate_strides:
-        very_large_stride = np.iinfo(np.int32).max
-
-        return sorted((iname for iname in kernel.insn_inames(insn)),
-                key=lambda iname: aggregate_strides.get(iname, very_large_stride))
-    else:
-        return None
-
-    # }}}
-
-# }}}
-
-
-# {{{ assign automatic axes
-
-def assign_automatic_axes(kernel, axis=0, local_size=None):
-    logger.debug("%s: assign automatic axes" % kernel.name)
-
-    from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag)
-
-    # Realize that at this point in time, axis lengths are already
-    # fixed. So we compute them once and pass them to our recursive
-    # copies.
-
-    if local_size is None:
-        _, local_size = kernel.get_grid_sizes_as_exprs(
-                ignore_auto=True)
-
-    # {{{ axis assignment helper function
-
-    def assign_axis(recursion_axis, iname, axis=None):
-        """Assign iname to local axis *axis* and start over by calling
-        the surrounding function assign_automatic_axes.
-
-        If *axis* is None, find a suitable axis automatically.
-        """
-        desired_length = kernel.get_constant_iname_length(iname)
-
-        if axis is None:
-            # {{{ find a suitable axis
-
-            shorter_possible_axes = []
-            test_axis = 0
-            while True:
-                if test_axis >= len(local_size):
-                    break
-                if test_axis in assigned_local_axes:
-                    test_axis += 1
-                    continue
-
-                if local_size[test_axis] < desired_length:
-                    shorter_possible_axes.append(test_axis)
-                    test_axis += 1
-                    continue
-                else:
-                    axis = test_axis
-                    break
-
-            # The loop above will find an unassigned local axis
-            # that has enough 'room' for the iname. In the same traversal,
-            # it also finds theoretically assignable axes that are shorter,
-            # in the variable shorter_possible_axes.
-
-            if axis is None and shorter_possible_axes:
-                # sort as longest first
-                shorter_possible_axes.sort(key=lambda ax: local_size[ax])
-                axis = shorter_possible_axes[0]
-
-            # }}}
-
-        if axis is None:
-            new_tag = None
-        else:
-            new_tag = LocalIndexTag(axis)
-            if desired_length > local_size[axis]:
-                from loopy import split_iname
-
-                # Don't be tempted to switch the outer tag to unroll--this may
-                # generate tons of code on some examples.
-
-                return assign_automatic_axes(
-                        split_iname(kernel, iname, inner_length=local_size[axis],
-                            outer_tag=None, inner_tag=new_tag,
-                            do_tagged_check=False),
-                        axis=recursion_axis, local_size=local_size)
-
-        if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase):
-            raise LoopyError("trying to reassign '%s'" % iname)
-
-        new_iname_to_tag = kernel.iname_to_tag.copy()
-        new_iname_to_tag[iname] = new_tag
-        return assign_automatic_axes(kernel.copy(iname_to_tag=new_iname_to_tag),
-                axis=recursion_axis, local_size=local_size)
-
-    # }}}
-
-    # {{{ main assignment loop
-
-    # assignment proceeds in one phase per axis, each time assigning the
-    # smallest-stride available iname to the current axis
-
-    import loopy as lp
-
-    for insn in kernel.instructions:
-        if not isinstance(insn, lp.ExpressionInstruction):
-            continue
-
-        auto_axis_inames = [
-                iname
-                for iname in kernel.insn_inames(insn)
-                if isinstance(kernel.iname_to_tag.get(iname),
-                    AutoLocalIndexTagBase)]
-
-        if not auto_axis_inames:
-            continue
-
-        assigned_local_axes = set()
-
-        for iname in kernel.insn_inames(insn):
-            tag = kernel.iname_to_tag.get(iname)
-            if isinstance(tag, LocalIndexTag):
-                assigned_local_axes.add(tag.axis)
-
-        if axis < len(local_size):
-            # "valid" pass: try to assign a given axis
-
-            if axis not in assigned_local_axes:
-                iname_ranking = get_auto_axis_iname_ranking_by_stride(kernel, insn)
-                if iname_ranking is not None:
-                    for iname in iname_ranking:
-                        prev_tag = kernel.iname_to_tag.get(iname)
-                        if isinstance(prev_tag, AutoLocalIndexTagBase):
-                            return assign_axis(axis, iname, axis)
-
-        else:
-            # "invalid" pass: There are still unassigned axis after the
-            #  numbered "valid" passes--assign the remainder by length.
-
-            # assign longest auto axis inames first
-            auto_axis_inames.sort(key=kernel.get_constant_iname_length, reverse=True)
-
-            if auto_axis_inames:
-                return assign_axis(axis, auto_axis_inames.pop())
-
-    # }}}
-
-    # We've seen all instructions and not punted to recursion/restart because
-    # of a new axis assignment.
-
-    if axis >= len(local_size):
-        return kernel
-    else:
-        return assign_automatic_axes(kernel, axis=axis+1,
-                local_size=local_size)
-
-# }}}
-
-
 preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
@@ -1058,6 +794,17 @@ def preprocess_kernel(kernel, device=None):
 
     logger.info("%s: preprocess start" % kernel.name)
 
+    # {{{ check that there are no l.auto-tagged inames
+
+    from loopy.kernel.data import AutoLocalIndexTagBase
+    for iname, tag in six.iteritems(kernel.iname_to_tag):
+        if (isinstance(tag, AutoLocalIndexTagBase)
+                 and iname in kernel.all_inames()):
+            raise LoopyError("kernel with automatically-assigned "
+                    "local axes passed to preprocessing")
+
+    # }}}
+
     from loopy.subst import expand_subst
     kernel = expand_subst(kernel)
 
@@ -1086,7 +833,6 @@ def preprocess_kernel(kernel, device=None):
 
     kernel = duplicate_private_temporaries_for_ilp_and_vec(kernel)
     kernel = mark_local_temporaries(kernel)
-    kernel = assign_automatic_axes(kernel)
     kernel = find_boostability(kernel)
     kernel = limit_boostability(kernel)
 
diff --git a/loopy/version.py b/loopy/version.py
index 9598697b09afc091741cea5d8da37917dd88ce9d..389659b86ac35b00cf14267b33b17b0d841d19c1 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v11-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v12-islpy%s" % _islpy_version
diff --git a/test/test_nbody.py b/test/test_nbody.py
index 65e5658b53f692d1142a8112849111e98014126d..540a8f5c3fac4be02bd4bf53f8d5cc92a195c64a 100644
--- a/test/test_nbody.py
+++ b/test/test_nbody.py
@@ -74,17 +74,17 @@ def test_nbody(ctx_factory):
                 outer_tag="g.0", inner_tag="l.0")
         knl = lp.split_iname(knl, "j", 256)
         knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
-                ["x_fetch_j", "x_fetch_k"])
+                ["x_fetch_j", "x_fetch_k"], default_tag=None)
+        knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0"))
         knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
-        knl = lp.tag_inames(knl, dict(x_fetch_k="unr"))
         knl = lp.set_loop_priority(knl, ["j_outer", "j_inner"])
         return knl
 
     n = 3000
 
     for variant in [
-            variant_1,
-            variant_cpu,
+            #variant_1,
+            #variant_cpu,
             variant_gpu
             ]:
         variant_knl = variant(knl)