diff --git a/MEMO b/MEMO
index 0e3d65aad124a443f4e99260fac8ff931f954584..a5cbf93f05ce422468529ba8bfa140de3b713e4b 100644
--- a/MEMO
+++ b/MEMO
@@ -88,6 +88,10 @@ TODO
 - Better for loop bound generation
   -> Try a triangular loop
 
+- AUTO_PICK or AUTO_FIT
+
+- What if we run out of axes to assign for AUTO_PICK/AUTO_FIT
+
 Dealt with
 ^^^^^^^^^^
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index ce40f0d2b98dd139a41ac27a50e40d289173c83d..d3adafe88ce31bc54dd0bfddf62135155bbf098e 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -1,5 +1,4 @@
 from __future__ import division
-import isl
 
 def register_mpz_with_pymbolic():
     from pymbolic.primitives import register_constant_class
@@ -28,7 +27,7 @@ class LoopyAdvisory(UserWarning):
 
 from loopy.kernel import ScalarArg, ArrayArg, ImageArg
 
-from loopy.kernel import LoopKernel
+from loopy.kernel import LoopKernel, AutoFitLocalIndexTag
 from loopy.schedule import generate_loop_schedules
 from loopy.compiled import CompiledKernel, drive_timing_run
 
@@ -63,7 +62,7 @@ def split_dimension(kernel, iname, inner_length, padded_length=None,
         s.set_dim_name(dim_type.set, outer_var_nr, outer_iname)
         s.set_dim_name(dim_type.set, inner_var_nr, inner_iname)
 
-        from loopy.isl import make_slab
+        from loopy.isl_helpers import make_slab
 
         space = s.get_space()
         inner_constraint_set = (
@@ -156,7 +155,7 @@ def tag_dimensions(kernel, iname_to_tag):
 
 
 def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=None,
-        dup_iname_to_tag={}, new_inames=None):
+        dup_iname_to_tag={}, new_inames=None, default_tag_class=AutoFitLocalIndexTag):
     """
     :arg duplicate_inames: which inames are supposed to be separate loops
         in the CSE. Also determines index order of temporary array.
@@ -174,9 +173,8 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
         parallel_inames = duplicate_inames
 
     dup_iname_to_tag = dup_iname_to_tag.copy()
-    from loopy.kernel import TAG_AUTO_LOCAL_IDX
     for piname in parallel_inames:
-        dup_iname_to_tag[piname] = TAG_AUTO_LOCAL_IDX()
+        dup_iname_to_tag[piname] = default_tag_class()
 
     for diname in duplicate_inames:
         dup_iname_to_tag.setdefault(diname, None)
@@ -208,10 +206,9 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
 
     target_var_name = kernel.make_unique_var_name(cse_tag)
 
-    from loopy.kernel import (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX,
-            TAG_GROUP_IDX)
+    from loopy.kernel import (LocalIndexTagBase, GroupIndexTag)
     target_var_is_local = any(
-            isinstance(tag, (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX))
+            isinstance(tag, LocalIndexTagBase)
             for tag in dup_iname_to_tag.itervalues())
 
     cse_lookup_table = {}
@@ -234,7 +231,6 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
         if cse_result_insns:
             raise RuntimeError("CSE tag '%s' is not unique" % cse_tag)
 
-
         # {{{ decide what to do with each iname
 
         parent_inames = insn.all_inames()
@@ -251,9 +247,9 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
             else:
                 tag = kernel.iname_to_tag[iname]
 
-            if isinstance(tag, (TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX)):
+            if isinstance(tag, LocalIndexTagBase):
                 kind = "l"
-            elif isinstance(tag, TAG_GROUP_IDX):
+            elif isinstance(tag, GroupIndexTag):
                 kind = "g"
             else:
                 kind = "o"
@@ -389,7 +385,7 @@ def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=Non
         lower_bound_pw_aff = new_domain.dim_min(new_iname_to_dim[iname][1])
         upper_bound_pw_aff = new_domain.dim_max(new_iname_to_dim[iname][1])
 
-        from loopy.isl import static_max_of_pw_aff
+        from loopy.isl_helpers import static_max_of_pw_aff
         from loopy.symbolic import pw_aff_to_expr
 
         target_var_shape.append(static_max_of_pw_aff(
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index f022e60983eb9fd4b95f6301fe74ae24105863ad..fee274b465c2fc305ccc5b9afb0999d4c33ae65e 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -134,17 +134,17 @@ def make_initial_assignments(kernel):
 
     global_size, local_size = kernel.get_grid_sizes()
 
-    from loopy.kernel import TAG_LOCAL_IDX, TAG_GROUP_IDX
+    from loopy.kernel import LocalIndexTag, GroupIndexTag
     from pymbolic import var
 
     for iname in kernel.all_inames():
         tag = kernel.iname_to_tag.get(iname)
 
-        if isinstance(tag, TAG_LOCAL_IDX):
+        if isinstance(tag, LocalIndexTag):
             hw_axis_expr = var("lid")(tag.axis)
             hw_axis_size = local_size[tag.axis]
 
-        elif isinstance(tag, TAG_GROUP_IDX):
+        elif isinstance(tag, GroupIndexTag):
             hw_axis_expr = var("gid")(tag.axis)
             hw_axis_size = global_size[tag.axis]
 
diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py
index e3cd775a435d67831a9bf6355bfa97d186b3bcdd..b79ce784c9fe83fae0b260d62d8d9a87a040d4e7 100644
--- a/loopy/codegen/dispatch.py
+++ b/loopy/codegen/dispatch.py
@@ -12,7 +12,7 @@ def get_admissible_conditional_inames_for(kernel, sched_index):
     inames if there is a barrier nested somewhere within.
     """
 
-    from loopy.kernel import TAG_LOCAL_IDX, ParallelTag
+    from loopy.kernel import LocalIndexTag, ParallelTag
 
     from loopy.schedule import find_active_inames_at, has_barrier_within
     result = find_active_inames_at(kernel, sched_index)
@@ -21,7 +21,7 @@ def get_admissible_conditional_inames_for(kernel, sched_index):
 
     for iname, tag in kernel.iname_to_tag.iteritems():
         if isinstance(tag, ParallelTag):
-            if not has_barrier or not isinstance(tag, TAG_LOCAL_IDX):
+            if not has_barrier or not isinstance(tag, LocalIndexTag):
                 result.add(iname)
 
     return result
@@ -41,8 +41,8 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state):
                 generate_unroll_loop,
                 generate_sequential_loop_dim_code)
 
-        from loopy.kernel import TAG_UNROLL, SequentialTag
-        if isinstance(tag, TAG_UNROLL):
+        from loopy.kernel import UnrollTag, SequentialTag
+        if isinstance(tag, UnrollTag):
             func = generate_unroll_loop
         elif tag is None or isinstance(tag, SequentialTag):
             func = generate_sequential_loop_dim_code
@@ -213,7 +213,7 @@ def build_loop_nest(kernel, sched_index, codegen_state):
             # group only contains starting schedule item
             result = [generate_code_for_sched_index(kernel, sched_index, new_codegen_state)]
         else:
-            # recurse with a bigger iname count
+            # recurse with a bigger minimum iname count
             result = build_insn_group(sched_indices_and_cond_inames[0:idx],
                     new_codegen_state, len(current_iname_set)+1)
 
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 1d1bc55d517e12e13b70909894d5c91adcf6af90..3fd65c4351996678e4f60a0cda0c237392b93c79 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -43,7 +43,7 @@ class ILPInstance(Record):
 def generate_ilp_instances(kernel, insn, codegen_state):
     impl_domain = codegen_state.implemented_domain
 
-    from loopy.kernel import TAG_ILP
+    from loopy.kernel import IlpTag
 
     result = [ILPInstance(impl_domain, {}, frozenset())]
 
@@ -52,7 +52,7 @@ def generate_ilp_instances(kernel, insn, codegen_state):
     for iname in insn.all_inames():
         tag = kernel.iname_to_tag.get(iname)
 
-        if not isinstance(tag, TAG_ILP):
+        if not isinstance(tag, IlpTag):
             continue
 
         from warnings import warn
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 20405c0ff97c738b87136bc454c3b2b7bbda5e49..14d6c586263c685f1f23c4e15f25dd5d760f37d3 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -32,7 +32,7 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain):
 # {{{ conditional-minimizing slab decomposition
 
 def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
-    from loopy.isl import block_shift_constraint, negate_constraint
+    from loopy.isl_helpers import block_shift_constraint, negate_constraint
 
     ccm = codegen_state.c_code_mapper
     space = kernel.space
@@ -82,7 +82,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state):
 # {{{ unrolled/ILP loops
 
 def generate_unroll_loop(kernel, sched_index, codegen_state):
-    from loopy.isl import block_shift_constraint
+    from loopy.isl_helpers import block_shift_constraint
 
     from cgen import (POD, Line)
 
@@ -95,7 +95,7 @@ def generate_unroll_loop(kernel, sched_index, codegen_state):
             codegen_state.implemented_domain)
 
     bounds = kernel.get_iname_bounds(iname)
-    from loopy.isl import static_max_of_pw_aff
+    from loopy.isl_helpers import static_max_of_pw_aff
     from loopy.symbolic import pw_aff_to_expr
 
     length = int(pw_aff_to_expr(static_max_of_pw_aff(bounds.length)))
@@ -114,8 +114,8 @@ def generate_unroll_loop(kernel, sched_index, codegen_state):
                             block_shift_constraint(
                                 lower_cns, iname, -i, as_equality=True)))
 
-    from loopy.kernel import TAG_UNROLL
-    if isinstance(tag, TAG_UNROLL):
+    from loopy.kernel import UnrollTag
+    if isinstance(tag, UnrollTag):
         result = [POD(np.int32, iname), Line()]
 
         for i in range(length):
@@ -134,7 +134,7 @@ def generate_unroll_loop(kernel, sched_index, codegen_state):
 # {{{ parallel loop
 
 def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None):
-    from loopy.kernel import UniqueTag, HardwareParallelTag, TAG_LOCAL_IDX, TAG_GROUP_IDX
+    from loopy.kernel import UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag
 
     if hw_inames_left is None:
         hw_inames_left = [iname
@@ -160,9 +160,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=
 
     # {{{ 'implement' hardware axis boundaries
 
-    if isinstance(tag, TAG_LOCAL_IDX):
+    if isinstance(tag, LocalIndexTag):
         hw_axis_size = local_size[tag.axis]
-    elif isinstance(tag, TAG_GROUP_IDX):
+    elif isinstance(tag, GroupIndexTag):
         hw_axis_size = global_size[tag.axis]
     else:
         raise RuntimeError("unknown hardware parallel tag")
@@ -171,7 +171,7 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=
 
     bounds = kernel.get_iname_bounds(iname)
 
-    from loopy.isl import make_slab
+    from loopy.isl_helpers import make_slab
     slab = make_slab(kernel.space, iname,
             bounds.lower_bound_pw_aff, bounds.lower_bound_pw_aff+hw_axis_size)
     codegen_state = codegen_state.intersect(slab)
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 85daa16121cebd2a3e827dd643852cc02c3f82f0..a263ea8f2a0326bca77c56c84ba30ff7272da7e6 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -48,7 +48,7 @@ class CompiledKernel:
         else:
             self.size_args = size_args
 
-        from loopy.kernel import TAG_GROUP_IDX, TAG_LOCAL_IDX
+        from loopy.kernel import GroupIndexTag, LocalIndexTag
         gsize_expr, lsize_expr = kernel.get_grid_sizes_as_exprs()
 
         if not gsize_expr: gsize_expr = (1,)
diff --git a/loopy/isl.py b/loopy/isl_helpers.py
similarity index 100%
rename from loopy/isl.py
rename to loopy/isl_helpers.py
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 0da1d56714406193f23693965fa5a906834d0a2c..dcf14172d7879bc97c1442ed14901b8c180c73c6 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -30,12 +30,15 @@ class SequentialTag(IndexTag):
 class ParallelTag(IndexTag):
     pass
 
+class HardwareParallelTag(ParallelTag):
+    pass
+
 class UniqueTag(IndexTag):
     @property
     def key(self):
         return type(self)
 
-class HardwareParallelTag(ParallelTag, UniqueTag):
+class AxisTag(UniqueTag):
     __slots__ = ["axis"]
 
     def __init__(self, axis):
@@ -50,27 +53,31 @@ class HardwareParallelTag(ParallelTag, UniqueTag):
         return "%s.%d" % (
                 self.print_name, self.axis)
 
-#class MultiTag(IndexTag):
-    
-#class SplitTag(IndexTag):
-    
-
-
-class TAG_GROUP_IDX(HardwareParallelTag):
+class GroupIndexTag(HardwareParallelTag, AxisTag):
     print_name = "g"
 
-class TAG_LOCAL_IDX(HardwareParallelTag):
+class LocalIndexTagBase(HardwareParallelTag):
+    pass
+
+class LocalIndexTag(LocalIndexTagBase, AxisTag):
     print_name = "l"
 
-class TAG_AUTO_LOCAL_IDX(ParallelTag):
+class AutoLocalIndexTagBase(LocalIndexTagBase):
+    pass
+
+class AutoFitLocalIndexTag(AutoLocalIndexTagBase):
     def __str__(self):
-        return "l.auto"
+        return "l.fit"
 
-class TAG_ILP(ParallelTag):
+class AutoPickLocalIndexTag(AutoLocalIndexTagBase):
+    def __str__(self):
+        return "l.pick"
+
+class IlpTag(ParallelTag):
     def __str__(self):
         return "ilp"
 
-class TAG_UNROLL(IndexTag):
+class UnrollTag(IndexTag):
     def __str__(self):
         return "unr"
 
@@ -85,13 +92,19 @@ def parse_tag(tag):
         raise ValueError("cannot parse tag: %s" % tag)
 
     if tag in ["unr"]:
-        return TAG_UNROLL()
+        return UnrollTag()
     elif tag == "ilp":
-        return TAG_ILP()
+        return IlpTag()
     elif tag.startswith("g."):
-        return TAG_GROUP_IDX(int(tag[2:]))
+        return GroupIndexTag(int(tag[2:]))
     elif tag.startswith("l."):
-        return TAG_LOCAL_IDX(int(tag[2:]))
+        axis = tag[2:]
+        if axis == "pick":
+            return AutoPickLocalIndexTag()
+        elif axis == "fit":
+            return AutoFitLocalIndexTag()
+        else:
+            return LocalIndexTag(int(axis))
     else:
         raise ValueError("cannot parse tag: %s" % tag)
 
@@ -593,7 +606,7 @@ class LoopKernel(Record):
                 size=size)
 
     @memoize_method
-    def get_grid_sizes(self):
+    def get_grid_sizes(self, ignore_auto=False):
         all_inames_by_insns = set()
         for insn in self.instructions:
             all_inames_by_insns |= insn.all_inames()
@@ -606,18 +619,18 @@ class LoopKernel(Record):
         local_sizes = {}
 
         from loopy.kernel import (
-                TAG_GROUP_IDX, TAG_LOCAL_IDX,
-                TAG_AUTO_LOCAL_IDX)
+                GroupIndexTag, LocalIndexTag,
+                AutoLocalIndexTagBase)
 
         for iname in self.all_inames():
             tag = self.iname_to_tag.get(iname)
 
-            if isinstance(tag, TAG_GROUP_IDX):
+            if isinstance(tag, GroupIndexTag):
                 tgt_dict = global_sizes
-            elif isinstance(tag, TAG_LOCAL_IDX):
+            elif isinstance(tag, LocalIndexTag):
                 tgt_dict = local_sizes
-            elif isinstance(tag, TAG_AUTO_LOCAL_IDX):
-                raise RuntimeError("cannot find grid sizes if AUTO_LOCAL_IDX tags are "
+            elif isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto:
+                raise RuntimeError("cannot find grid sizes if automatic local index tags are "
                         "present")
             else:
                 tgt_dict = None
@@ -630,11 +643,11 @@ class LoopKernel(Record):
             if tag.axis in tgt_dict:
                 size = tgt_dict[tag.axis].max(size)
 
-            from loopy.isl import static_max_of_pw_aff
+            from loopy.isl_helpers import static_max_of_pw_aff
             try:
                 # insist block size is constant
                 size = static_max_of_pw_aff(size, 
-                        constants_only=isinstance(tag, TAG_LOCAL_IDX))
+                        constants_only=isinstance(tag, LocalIndexTag))
             except ValueError:
                 pass
 
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 5cbf88471067c6cffe4c58b9123e9e6549dda1ea..e0c36afb50d10710fa1ff81a5d5921b789f90003 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -100,13 +100,13 @@ def realize_reduction(kernel, inames=None, reduction_tag=None):
 
 
 
-def check_non_use_of_hw_axes(kernel):
+def check_for_unused_hw_axes(kernel):
     group_size, local_size = kernel.get_grid_sizes_as_exprs()
 
     group_axes = set(range(len(group_size)))
     local_axes = set(range(len(local_size)))
 
-    from loopy.kernel import TAG_LOCAL_IDX, TAG_AUTO_LOCAL_IDX, TAG_GROUP_IDX
+    from loopy.kernel import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag
     for insn in kernel.instructions:
         group_axes_used = set()
         local_axes_used = set()
@@ -114,11 +114,11 @@ def check_non_use_of_hw_axes(kernel):
         for iname in insn.all_inames():
             tag = kernel.iname_to_tag.get(iname)
 
-            if isinstance(tag, TAG_LOCAL_IDX):
+            if isinstance(tag, LocalIndexTag):
                 local_axes_used.add(tag.axis)
-            elif isinstance(tag, TAG_GROUP_IDX):
+            elif isinstance(tag, GroupIndexTag):
                 group_axes_used.add(tag.axis)
-            elif isinstance(tag, TAG_AUTO_LOCAL_IDX):
+            elif isinstance(tag, AutoLocalIndexTagBase):
                 raise RuntimeError("auto local tag encountered")
 
         if group_axes != group_axes_used:
@@ -131,13 +131,13 @@ def check_non_use_of_hw_axes(kernel):
 
 
 def check_double_use_of_hw_axes(kernel):
-    from loopy.kernel import HardwareParallelTag
+    from loopy.kernel import UniqueTag
 
     for insn in kernel.instructions:
         insn_tag_keys = set()
         for iname in insn.all_inames():
             tag = kernel.iname_to_tag.get(iname)
-            if isinstance(tag, HardwareParallelTag):
+            if isinstance(tag, UniqueTag):
                 key = tag.key
                 if key in insn_tag_keys:
                     raise RuntimeError("instruction '%s' has two "
@@ -398,7 +398,7 @@ def find_inadmissible_tag_keys(kernel, iname, iname_to_tag=None):
 
 def assign_automatic_axes(kernel):
     from loopy.kernel import (
-            TAG_AUTO_LOCAL_IDX, TAG_LOCAL_IDX)
+            TAG_AUTO_LOCAL_IDX, LocalIndexTag)
 
     new_iname_to_tag = kernel.iname_to_tag
 
@@ -418,14 +418,14 @@ def assign_automatic_axes(kernel):
 
             for iname in insn.all_inames():
                 tag = new_iname_to_tag.get(iname)
-                if isinstance(tag, TAG_LOCAL_IDX):
+                if isinstance(tag, LocalIndexTag):
                     local_assigned_axes.add(tag.axis)
 
             if 0 not in local_assigned_axes:
                 axis0_iname = guess_good_iname_for_axis_0(kernel, insn)
 
                 axis0_iname_tag = new_iname_to_tag.get(axis0_iname)
-                ax0_tag = TAG_LOCAL_IDX(0)
+                ax0_tag = LocalIndexTag(0)
                 if (isinstance(axis0_iname_tag, TAG_AUTO_LOCAL_IDX)
                         and ax0_tag.key not in find_inadmissible_tag_keys(
                             kernel, axis0_iname, new_iname_to_tag)):
@@ -442,7 +442,7 @@ def assign_automatic_axes(kernel):
                 while next_axis in local_assigned_axes:
                     next_axis += 1
 
-                new_iname_to_tag[iname] = TAG_LOCAL_IDX(next_axis)
+                new_iname_to_tag[iname] = LocalIndexTag(next_axis)
                 local_assigned_axes.add(next_axis)
 
     return kernel.copy(iname_to_tag=new_iname_to_tag)
@@ -716,7 +716,7 @@ def generate_loop_schedules(kernel):
 
     kernel = add_automatic_dependencies(kernel)
     kernel = assign_automatic_axes(kernel)
-    check_non_use_of_hw_axes(kernel)
+    check_for_unused_hw_axes(kernel)
 
     for gen_sched in generate_loop_schedules_internal(kernel):
         gen_sched, owed_barriers = insert_barriers(kernel, gen_sched)
diff --git a/test/test_matmul.py b/test/test_matmul.py
index 71b1d409e1f2d95c225544be1cf92c30f2a5947a..f861153c80e3aece10e8300a78b9591187fbf397 100644
--- a/test/test_matmul.py
+++ b/test/test_matmul.py
@@ -215,9 +215,9 @@ def test_plain_matrix_mul_new_ui(ctx_factory):
 
     knl = lp.split_dimension(knl, "i", 16,
             outer_tag="g.0", inner_tag="l.1", no_slabs=True)
-    knl = lp.split_dimension(knl, "j", 16,
-            outer_tag="g.1", inner_tag="l.0")
-    knl = lp.split_dimension(knl, "k", 16)
+    knl = lp.split_dimension(knl, "j", 8,
+            outer_tag="g.1", inner_tag="l.0", no_slabs=True)
+    knl = lp.split_dimension(knl, "k", 32, no_slabs=True)
 
     knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"])
     knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"])