diff --git a/loopy/check.py b/loopy/check.py
index 611043d53fc5a6b711f17795ffcbbefd0d94ed09..9a9ff1fd575883ad4ee212f9e17f7c0197781201 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -213,12 +213,15 @@ def check_for_write_races(kernel):
 
 
 def check_for_orphaned_user_hardware_axes(kernel):
-    from loopy.kernel.data import LocalIndexTag
+    from loopy.kernel.data import LocalIndexTag, check_iname_tags
     for axis in kernel.local_sizes:
         found = False
-        for tag in six.itervalues(kernel.iname_to_tag):
-            if isinstance(tag, LocalIndexTag) and tag.axis == axis:
-                found = True
+        for tags in six.itervalues(kernel.iname_to_tags):
+            for tag in tags:
+                if isinstance(tag, LocalIndexTag) and tag.axis == axis:
+                    found = True
+                    break
+            if found:
                 break
 
         if not found:
@@ -893,6 +896,8 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
 
     from islpy import align_two
 
+    from loopy.kernel.data import check_iname_tags
+
     last_idomains = None
     last_insn_inames = None
 
@@ -928,9 +933,8 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
         from loopy.kernel.data import LocalIndexTag
         if isinstance(insn, BarrierInstruction):
             # project out local-id-mapped inames, solves #94 on gitlab
-            non_lid_inames = frozenset(
-                [iname for iname in insn_inames if not isinstance(
-                    kernel.iname_to_tag.get(iname), LocalIndexTag)])
+            non_lid_inames = frozenset(iname for iname in insn_inames
+                if not check_iname_tags(kernel.iname_to_tags[iname], LocalIndexTag))
             insn_impl_domain = insn_impl_domain.project_out_except(
                 non_lid_inames, [dim_type.set])
 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 89e832127327356f2c51c84ba0dbef8e2a980a80..01f8a82554595d4eda9dfa899fa26dc262294259 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -254,11 +254,16 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     hw_inames_left = hw_inames_left[:]
     iname = hw_inames_left.pop()
 
-    tag = kernel.iname_to_tag.get(iname)
+    tags = kernel.iname_to_tags[iname]
 
     from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex
 
-    assert isinstance(tag, UniqueTag)
+    assert check_iname_tags(tags, UniqueTag)
+
+    if len(tags) > 1:
+        raise LoopyError("cannot have more than one UniqueTag")
+
+    tag, = tags
     if isinstance(tag, GroupIndexTag):
         hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
     elif isinstance(tag, LocalIndexTag):
@@ -267,10 +272,10 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
         raise RuntimeError("unexpected hw tag type")
 
     other_inames_with_same_tag = [
-            other_iname for other_iname in kernel.all_inames()
-            if isinstance(kernel.iname_to_tag.get(other_iname), UniqueTag)
-            and kernel.iname_to_tag.get(other_iname).key == tag.key
-            and other_iname != iname]
+        other_iname for other_iname in kernel.all_inames()
+        if check_iname_tags(kernel.iname_to_tags[other_iname], UniqueTag)
+           and any(_tag.key == tag.key for _tag in kernel.iname_to_tags[other_iname])
+           and other_iname != iname]
 
     # {{{ 'implement' hardware axis boundaries
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index c1d17371fd57f606d9bb3f758d7fe865585c83c9..7282542dccd8c70b8f0dd90f9c66f9f0dfb3992c 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -632,7 +632,7 @@ def is_domain_dependent_on_inames(kernel, domain_index, inames):
 # {{{ rank inames by stride
 
 def get_auto_axis_iname_ranking_by_stride(kernel, insn):
-    from loopy.kernel.data import ImageArg, ValueArg
+    from loopy.kernel.data import ImageArg, ValueArg, check_iname_tags
 
     approximate_arg_values = {}
     for arg in kernel.args:
@@ -677,10 +677,8 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 
     from loopy.kernel.data import AutoLocalIndexTagBase
     auto_axis_inames = set(
-            iname
-            for iname in kernel.insn_inames(insn)
-            if isinstance(kernel.iname_to_tag.get(iname),
-                AutoLocalIndexTagBase))
+            iname for iname in kernel.insn_inames(insn)
+            if check_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase))
 
     # }}}
 
@@ -752,8 +750,11 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 
 def assign_automatic_axes(kernel, axis=0, local_size=None):
     logger.debug("%s: assign automatic axes" % kernel.name)
+    # TODO: do the tag removal rigorously, might be easier after switching
+    # to set() from tuple()
 
-    from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag)
+    from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag,
+                                   check_iname_tags, get_iname_tags)
 
     # Realize that at this point in time, axis lengths are already
     # fixed. So we compute them once and pass them to our recursive
@@ -777,10 +778,10 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
         except isl.Error:
             # Likely unbounded, automatic assignment is not
             # going to happen for this iname.
-            new_iname_to_tag = kernel.iname_to_tag.copy()
-            new_iname_to_tag[iname] = None
+            new_iname_to_tags = kernel.iname_to_tags.copy()
+            new_iname_to_tags[iname] = tuple()
             return assign_automatic_axes(
-                    kernel.copy(iname_to_tag=new_iname_to_tag),
+                    kernel.copy(iname_to_tags=new_iname_to_tags),
                     axis=recursion_axis)
 
         if axis is None:
@@ -816,9 +817,9 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
             # }}}
 
         if axis is None:
-            new_tag = None
+            new_tag = tuple()
         else:
-            new_tag = LocalIndexTag(axis)
+            new_tag = (LocalIndexTag(axis),)
             if desired_length > local_size[axis]:
                 from loopy import split_iname
 
@@ -831,12 +832,12 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
                             do_tagged_check=False),
                         axis=recursion_axis, local_size=local_size)
 
-        if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase):
+        if not check_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase):
             raise LoopyError("trying to reassign '%s'" % iname)
 
-        new_iname_to_tag = kernel.iname_to_tag.copy()
-        new_iname_to_tag[iname] = new_tag
-        return assign_automatic_axes(kernel.copy(iname_to_tag=new_iname_to_tag),
+        new_iname_to_tags = kernel.iname_to_tags.copy()
+        new_iname_to_tags[iname] = new_tag
+        return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags),
                 axis=recursion_axis, local_size=local_size)
 
     # }}}
@@ -853,10 +854,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
             continue
 
         auto_axis_inames = [
-                iname
-                for iname in kernel.insn_inames(insn)
-                if isinstance(kernel.iname_to_tag.get(iname),
-                    AutoLocalIndexTagBase)]
+            iname for iname in kernel.insn_inames(insn)
+            if check_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase)]
 
         if not auto_axis_inames:
             continue
@@ -864,8 +863,11 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
         assigned_local_axes = set()
 
         for iname in kernel.insn_inames(insn):
-            tag = kernel.iname_to_tag.get(iname)
-            if isinstance(tag, LocalIndexTag):
+            tags = get(kernel.iname_to_tags[iname], LocalIndexTag)
+            if tags:
+                if len(tags) > 1:
+                    raise LoopyError("cannot have more than one LocalIndexTags")
+                tag, = tags
                 assigned_local_axes.add(tag.axis)
 
         if axis < len(local_size):
@@ -875,8 +877,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
                 iname_ranking = get_auto_axis_iname_ranking_by_stride(kernel, insn)
                 if iname_ranking is not None:
                     for iname in iname_ranking:
-                        prev_tag = kernel.iname_to_tag.get(iname)
-                        if isinstance(prev_tag, AutoLocalIndexTagBase):
+                        prev_tags = kernel.iname_to_tags[iname]
+                        if check_iname_tags(prev_tags, AutoLocalIndexTagBase):
                             return assign_axis(axis, iname, axis)
 
         else:
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index bd0a7c8ddbdc35bf39ba4f8700f8404132aa0f71..b20fbef91168ab4a5580b031d1cab4403bfec7c1 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -291,20 +291,20 @@ def _classify_reduction_inames(kernel, inames):
 
     from loopy.kernel.data import (
             LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
-            ConcurrentTag)
+            ConcurrentTag, check_iname_tags)
 
     for iname in inames:
-        iname_tag = kernel.iname_to_tag.get(iname)
+        iname_tags = kernel.iname_to_tags[iname]
 
-        if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
+        if check_iname_tags(iname_tags, (UnrollTag, UnrolledIlpTag)):
             # These are nominally parallel, but we can live with
             # them as sequential.
             sequential.append(iname)
 
-        elif isinstance(iname_tag, LocalIndexTagBase):
+        elif check_iname_tags(iname_tags, LocalIndexTagBase):
             local_par.append(iname)
 
-        elif isinstance(iname_tag, (ConcurrentTag, VectorizeTag)):
+        elif check_iname_tags(iname_tags, (ConcurrentTag, VectorizeTag)):
             nonlocal_par.append(iname)
 
         else:
@@ -912,6 +912,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
     outer (sweep) iname.
     """
 
+    # TODO: reassigning tags needs some thinking here
+
     logger.debug("%s: realize reduction" % kernel.name)
 
     new_insns = []
@@ -1134,13 +1136,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
-        from loopy.kernel.data import LocalIndexTagBase
-        outer_local_inames = tuple(
-                oiname
-                for oiname in outer_insn_inames
-                if isinstance(
-                    kernel.iname_to_tag.get(oiname),
-                    LocalIndexTagBase))
+        from loopy.kernel.data import LocalIndexTagBase, check_iname_tags
+        outer_local_inames = tuple(oiname for oiname in outer_insn_inames
+                if check_iname_tags(kernel.iname_to_tags[oiname], LocalIndexTagBase))
 
         from pymbolic import var
         outer_local_iname_vars = tuple(
@@ -1175,7 +1173,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         base_exec_iname = var_name_gen("red_"+red_iname)
         domains.append(_make_slab_set(base_exec_iname, size))
-        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname]
+        new_iname_tags[base_exec_iname] = kernel.iname_to_tags[red_iname]
 
         # }}}
 
@@ -1270,7 +1268,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
             stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
             domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
-            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname]
+            new_iname_tags[stage_exec_iname] = kernel.iname_to_tags[red_iname]
 
             stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
             stage_insn = make_assignment(
@@ -1473,13 +1471,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
-        from loopy.kernel.data import LocalIndexTagBase
-        outer_local_inames = tuple(
-                oiname
-                for oiname in outer_insn_inames
-                if isinstance(
-                    kernel.iname_to_tag.get(oiname),
-                    LocalIndexTagBase)
+        from loopy.kernel.data import LocalIndexTagBase, check_iname_tags
+        outer_local_inames = tuple(oiname for oiname in outer_insn_inames
+                if check_iname_tags(kernel.iname_to_tags[oiname], LocalIndexTagBase)
                 and oiname != sweep_iname)
 
         from pymbolic import var
@@ -1505,7 +1499,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         base_exec_iname = var_name_gen(sweep_iname + "__scan")
         domains.append(_make_slab_set(base_exec_iname, scan_size))
-        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname]
+        new_iname_tags[base_exec_iname] = kernel.iname_to_tags[sweep_iname]
 
         # }}}
 
@@ -1596,7 +1590,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
             stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage))
             domains.append(
                     _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size))
-            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname]
+            new_iname_tags[stage_exec_iname] = kernel.iname_to_tags[sweep_iname]
 
             for read_var, acc_var in zip(read_vars, acc_vars):
                 read_stage_id = insn_id_gen(
@@ -1746,7 +1740,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     "by reductions is 'local'--found iname(s) '%s' "
                     "respectively tagged '%s'"
                     % (", ".join(bad_inames),
-                       ", ".join(kernel.iname_to_tag[iname]
+                       ", ".join(tag.key for tag in kernel.iname_to_tags[iname]
                                  for iname in bad_inames)))
 
         if n_local_par == 0 and n_sequential == 0:
@@ -1784,7 +1778,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     _error_if_force_scan_on(LoopyError,
                             "Sweep iname '%s' has an unsupported parallel tag '%s' "
                             "- the only parallelism allowed is 'local'." %
-                            (sweep_iname, temp_kernel.iname_to_tag[sweep_iname]))
+                            (sweep_iname,
+                             ", ".join(tag.key
+                            for tag in temp_kernel.iname_to_tags[sweep_iname])))
                 elif parallel:
                     return map_scan_local(
                             expr, rec, nresults, arg_dtypes, reduction_dtypes,
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 6f4cc78b711196fbff2cff60920d178dd357a101..77c638128fe3dab63ee79f72bd14d70e0ca866bf 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -917,18 +917,22 @@ class GlobalMemAccessCounter(MemAccessCounter):
             index = (index,)
 
         from loopy.symbolic import get_dependencies
-        from loopy.kernel.data import LocalIndexTag, GroupIndexTag
+        from loopy.kernel.data import LocalIndexTag, GroupIndexTag, get_iname_tags
+
         my_inames = get_dependencies(index) & self.knl.all_inames()
 
         # find all local and global index tags and corresponding inames
         lid_to_iname = {}
         gid_to_iname = {}
         for iname in my_inames:
-            tag = self.knl.iname_to_tag.get(iname)
-            if isinstance(tag, LocalIndexTag):
-                lid_to_iname[tag.axis] = iname
-            elif isinstance(tag, GroupIndexTag):
-                gid_to_iname[tag.axis] = iname
+            tags = get_iname_tags(self.knl.iname_to_tags[iname],
+                                  (GroupIndexTag, LocalIndexTag))
+            if tags:
+                tag, = get_iname_tags(tags, (GroupIndexTag, LocalIndexTag), 1)
+                if isinstance(tag, LocalIndexTag):
+                    lid_to_iname[tag.axis] = iname
+                else:
+                    gid_to_iname[tag.axis] = iname
 
         # create lid_strides and gid_strides dicts
 
@@ -1177,14 +1181,18 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
     g_used = set()
     l_used = set()
 
-    from loopy.kernel.data import LocalIndexTag, GroupIndexTag
+    from loopy.kernel.data import (LocalIndexTag, GroupIndexTag,
+                                   get_iname_tags, check_iname_tags)
     for iname in knl.insn_inames(insn):
-        tag = knl.iname_to_tag.get(iname)
-
-        if isinstance(tag, LocalIndexTag):
-            l_used.add(tag.axis)
-        elif isinstance(tag, GroupIndexTag):
-            g_used.add(tag.axis)
+        tags = get_iname_tags(knl.iname_to_tags[iname], (LocalIndexTag, GroupIndexTag))
+        if tags:
+            if len(tags) > 1:
+                raise LoopyError("cannot have more than one UniqueTags")
+            tag, = tags
+            if isinstance(tag, LocalIndexTag):
+                l_used.add(tag.axis)
+            elif isinstance(tag, GroupIndexTag):
+                g_used.add(tag.axis)
 
     def mult_grid_factor(used_axes, size):
         result = 1
@@ -1213,9 +1221,9 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False)
     insn_inames = knl.insn_inames(insn)
 
     if disregard_local_axes:
-        from loopy.kernel.data import LocalIndexTag
+        from loopy.kernel.data import LocalIndexTag, check_iname_tags
         insn_inames = [iname for iname in insn_inames if not
-                       isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
+                check_iname_tags(kernel.iname_to_tags[iname], LocalIndexTag)]
 
     inames_domain = knl.get_inames_domain(insn_inames)
     domain = (inames_domain.project_out_except(