diff --git a/loopy/check.py b/loopy/check.py
index 9a9ff1fd575883ad4ee212f9e17f7c0197781201..bebd86fffe00d9374da4352206952f1e439ef883 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -168,7 +168,7 @@ def _is_racing_iname_tag(tv, tag):
 
 
 def check_for_write_races(kernel):
-    from loopy.kernel.data import ConcurrentTag, check_iname_tags
+    from loopy.kernel.data import ConcurrentTag, get_iname_tags
 
     for insn in kernel.instructions:
         for assignee_name, assignee_indices in zip(
@@ -186,8 +186,9 @@ def check_for_write_races(kernel):
                 # will cause write races.
 
                 raceable_parallel_insn_inames = set(
-                        iname for iname in kernel.insn_inames(insn)
-                        if check_iname_tags(kernel.iname_to_tags[iname], ConcurrentTag))
+                    iname for iname in kernel.insn_inames(insn)
+                    if get_iname_tags(kernel.iname_to_tags[iname],
+                                      ConcurrentTag))
 
             elif assignee_name in kernel.temporary_variables:
                 temp_var = kernel.temporary_variables[assignee_name]
@@ -213,7 +214,7 @@ def check_for_write_races(kernel):
 
 
 def check_for_orphaned_user_hardware_axes(kernel):
-    from loopy.kernel.data import LocalIndexTag, check_iname_tags
+    from loopy.kernel.data import LocalIndexTag
     for axis in kernel.local_sizes:
         found = False
         for tags in six.itervalues(kernel.iname_to_tags):
@@ -230,12 +231,12 @@ def check_for_orphaned_user_hardware_axes(kernel):
 
 
 def check_for_data_dependent_parallel_bounds(kernel):
-    from loopy.kernel.data import ConcurrentTag, check_iname_tags
+    from loopy.kernel.data import ConcurrentTag, get_iname_tags
 
     for i, dom in enumerate(kernel.domains):
         dom_inames = set(dom.get_var_names(dim_type.set))
         par_inames = set(iname for iname in dom_inames
-            if check_iname_tags(kernel.iname_to_tags[iname], ConcurrentTag))
+            if get_iname_tags(kernel.iname_to_tags[iname], ConcurrentTag))
 
         if not par_inames:
             continue
@@ -651,7 +652,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
     # alternative: just disregard length-1 dimensions?
 
     from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
-                        GroupIndexTag, check_iname_tags, get_iname_tags)
+                        GroupIndexTag, get_iname_tags)
 
     while i < loop_end_i:
         sched_item = kernel.schedule[i]
@@ -671,19 +672,13 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
             for iname in kernel.insn_inames(insn):
                 tags = kernel.iname_to_tags[iname]
 
-                if check_iname_tags(tags, LocalIndexTag):
-                    tags = get_iname_tags(tags, LocalIndexTag)
-                    if len(tags) > 1:
-                        raise LoopyError("Can only have one LocalIndexTag")
-                    tag, = tags
+                if get_iname_tags(tags, LocalIndexTag):
+                    tag, = get_iname_tags(tags, LocalIndexTag, 1)
                     local_axes_used.add(tag.axis)
-                elif check_iname_tags(tags, GroupIndexTag):
-                    tags = get_iname_tags(tags, GroupIndexTag)
-                    if len(tags) > 1:
-                        raise LoopyError("Can only have one GroupIndexTag")
-                    tag, = tags
+                elif get_iname_tags(tags, GroupIndexTag):
+                    tag, = get_iname_tags(tags, GroupIndexTag, 1)
                     group_axes_used.add(tag.axis)
-                elif check_iname_tags(tags, AutoLocalIndexTagBase):
+                elif get_iname_tags(tags, AutoLocalIndexTagBase):
                     raise LoopyError("auto local tag encountered")
 
             if group_axes != group_axes_used:
@@ -896,8 +891,6 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
 
     from islpy import align_two
 
-    from loopy.kernel.data import check_iname_tags
-
     last_idomains = None
     last_insn_inames = None
 
@@ -930,11 +923,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                 .project_out_except(insn_inames, [dim_type.set]))
 
         from loopy.kernel.instruction import BarrierInstruction
-        from loopy.kernel.data import LocalIndexTag
+        from loopy.kernel.data import LocalIndexTag, get_iname_tags
         if isinstance(insn, BarrierInstruction):
             # project out local-id-mapped inames, solves #94 on gitlab
             non_lid_inames = frozenset(iname for iname in insn_inames
-                if not check_iname_tags(kernel.iname_to_tags[iname], LocalIndexTag))
+                if not get_iname_tags(kernel.iname_to_tags[iname], LocalIndexTag))
             insn_impl_domain = insn_impl_domain.project_out_except(
                 non_lid_inames, [dim_type.set])
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 886f305a455f66b943660ff653c40ae632360243..240df24e5d2a6dc7ea620a6f2f290937b9119384 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -59,7 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
     from loopy.schedule import (
         find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
     from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
-                                   IlpBaseTag, check_iname_tags)
+                                   IlpBaseTag, get_iname_tags)
 
     result = find_active_inames_at(kernel, sched_index)
     crosses_barrier = has_barrier_within(kernel, sched_index)
@@ -98,9 +98,9 @@ def get_usable_inames_for_conditional(kernel, sched_index):
         #   at the innermost level of nesting.
 
         if (
-                check_iname_tags(tags, ConcurrentTag)
-                and not (check_iname_tags(tags, LocalIndexTagBase)
-                and crosses_barrier) and not check_iname_tags(tags, IlpBaseTag)
+                get_iname_tags(tags, ConcurrentTag)
+                and not (get_iname_tags(tags, LocalIndexTagBase)
+                and crosses_barrier) and not get_iname_tags(tags, IlpBaseTag)
         ):
             result.add(iname)
 
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 22f18760882ae984de56ffd050ee21573b46fbad..41b04e172a2afeef612d31bf2fff323d096945ec 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -41,7 +41,7 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index):
     kernel = codegen_state.kernel
 
     from loopy.kernel.data import (LocalIndexTag, HardwareConcurrentTag,
-                                   check_iname_tags)
+                                   get_iname_tags)
 
     from loopy.schedule import find_active_inames_at, has_barrier_within
     result = find_active_inames_at(kernel, sched_index)
@@ -49,9 +49,9 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index):
     has_barrier = has_barrier_within(kernel, sched_index)
 
     for iname, tags in six.iteritems(kernel.iname_to_tags):
-        if (check_iname_tags(tags, HardwareConcurrentTag)
+        if (get_iname_tags(tags, HardwareConcurrentTag)
                 and codegen_state.is_generating_device_code):
-            if not has_barrier or not check_iname_tags(tags, LocalIndexTag):
+            if not has_barrier or not get_iname_tags(tags, LocalIndexTag):
                 result.add(iname)
 
     return frozenset(result)
@@ -136,13 +136,14 @@ def generate_code_for_sched_index(codegen_state, sched_index):
                 generate_vectorize_loop,
                 generate_sequential_loop_dim_code)
 
-        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag,
-                LoopedIlpTag, VectorizeTag, InOrderSequentialSequentialTag, check_iname_tags)
-        if check_iname_tags(tags, (UnrollTag, UnrolledIlpTag)):
+        from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
+                ForceSequentialTag, LoopedIlpTag, VectorizeTag,
+                InOrderSequentialSequentialTag, get_iname_tags)
+        if get_iname_tags(tags, (UnrollTag, UnrolledIlpTag)):
             func = generate_unroll_loop
-        elif check_iname_tags(tags, VectorizeTag):
+        elif get_iname_tags(tags, VectorizeTag):
             func = generate_vectorize_loop
-        elif len(tags) == 0 or check_iname_tags(tags, (LoopedIlpTag,
+        elif len(tags) == 0 or get_iname_tags(tags, (LoopedIlpTag,
                     ForceSequentialTag, InOrderSequentialSequentialTag)):
             func = generate_sequential_loop_dim_code
         else:
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 01f8a82554595d4eda9dfa899fa26dc262294259..0efa96f967bc597d65ed3fd57c18301d67d6990b 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     kernel = codegen_state.kernel
 
     from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
-                LocalIndexTag, GroupIndexTag, check_iname_tags)
+                LocalIndexTag, GroupIndexTag, get_iname_tags)
 
     from loopy.schedule import get_insn_ids_for_block_at
     insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
@@ -242,7 +242,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
             all_inames_by_insns |= kernel.insn_inames(insn_id)
 
         hw_inames_left = [iname for iname in all_inames_by_insns
-                if check_iname_tags(kernel.iname_to_tags[iname],
+                if get_iname_tags(kernel.iname_to_tags[iname],
                                     HardwareConcurrentTag)]
 
     if not hw_inames_left:
@@ -258,12 +258,8 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
 
     from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex
 
-    assert check_iname_tags(tags, UniqueTag)
+    tag, = get_iname_tags(tags, UniqueTag, max_num=1, min_num=1)
 
-    if len(tags) > 1:
-        raise LoopyError("cannot have more than one UniqueTag")
-
-    tag, = tags
     if isinstance(tag, GroupIndexTag):
         hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
     elif isinstance(tag, LocalIndexTag):
@@ -271,11 +267,14 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     else:
         raise RuntimeError("unexpected hw tag type")
 
+    # TODO: get rid of None
+
     other_inames_with_same_tag = [
         other_iname for other_iname in kernel.all_inames()
-        if check_iname_tags(kernel.iname_to_tags[other_iname], UniqueTag)
-           and any(_tag.key == tag.key for _tag in kernel.iname_to_tags[other_iname])
-           and other_iname != iname]
+        if (get_iname_tags(kernel.iname_to_tags[other_iname], UniqueTag)
+            and other_iname != iname
+            and any(_tag.key == tag.key
+                    for _tag in kernel.iname_to_tags[other_iname]))]
 
     # {{{ 'implement' hardware axis boundaries
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index a69f0727db132aa5b34372cd4fd9f245a2270fc5..872365fca2c87a0c4bab3ca5569acc471364d63a 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -44,7 +44,7 @@ from loopy.library.function import (
 from loopy.diagnostic import CannotBranchDomainTree, LoopyError
 from loopy.tools import natsorted
 from loopy.diagnostic import StaticValueFindingError
-from loopy.kernel.data import check_iname_tags, get_iname_tags
+from loopy.kernel.data import get_iname_tags
 
 
 # {{{ unique var names
@@ -197,7 +197,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             assumptions=None,
             local_sizes={},
             temporary_variables={},
-            iname_to_tags=defaultdict(tuple),
+            iname_to_tags=defaultdict(set),
             substitutions={},
             function_manglers=[
                 default_function_mangler,
@@ -711,11 +711,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         from loopy.kernel.data import HardwareConcurrentTag
 
         for iname in cond_inames:
-            tags = self.iname_to_tags[iname]
-            if check_iname_tags(tags, HardwareConcurrentTag):
-                tags = get_iname_tags(tags, HardwareConcurrentTag)
-                if len(tags) > 1:
-                    raise LoopyError("cannot have more than one HardwareConcurentTags")
+            tags = get_iname_tags(self.iname_to_tags[iname],
+                                  HardwareConcurrentTag, 1)
+            if tags:
                 tag, = tags
                 tag_key_uses[tag.key].append(iname)
 
@@ -725,8 +723,10 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         multi_use_inames = set()
         for iname in cond_inames:
-            for tag in self.iname_to_tags[iname]:
-                if isinstance(tag, HardwareConcurrentTag) and tag.key in multi_use_keys:
+            # 'break' below must only end the scan of this iname's tags.
+            for tag in get_iname_tags(self.iname_to_tags[iname],
+                                      HardwareConcurrentTag):
+                if tag.key in multi_use_keys:
                     multi_use_inames.add(iname)
                     break
 
@@ -960,22 +960,17 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         for iname in all_inames_by_insns:
             tags = self.iname_to_tags[iname]
 
-            if check_iname_tags(tags, GroupIndexTag):
+            if get_iname_tags(tags, GroupIndexTag):
                 tgt_dict = global_sizes
-            elif check_iname_tags(tags, LocalIndexTag):
+            elif get_iname_tags(tags, LocalIndexTag):
                 tgt_dict = local_sizes
-            elif check_iname_tags(tags, AutoLocalIndexTagBase) and not ignore_auto:
+            elif get_iname_tags(tags, AutoLocalIndexTagBase) and not ignore_auto:
                 raise RuntimeError("cannot find grid sizes if automatic "
                         "local index tags are present")
             else:
                 continue
 
-            tags = get_iname_tags(tags, (GroupIndexTag, LocalIndexTag))
-
-            if len(tags) != 1:
-                raise LoopyError("Multiple axis tag not allowed")
-
-            tag, = tags
+            tag, = get_iname_tags(tags, (GroupIndexTag, LocalIndexTag), 1)
 
             size = self.get_iname_bounds(iname).size
 
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 9b66088e5b75ca6bad3d06d931112de29980b634..9250c5acf7304e19572a96ec214452a62d891254 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -55,19 +55,34 @@ class auto(object):  # noqa
 # {{{ iname tags
 
 
-def check_iname_tags(tags, tag_type):
-    return any([isinstance(tag, tag_type) for tag in tags])
-
-
-def get_iname_tags(tags, tag_type):
-    return tuple(tag for tag in tags if isinstance(tag, tag_type))
+def get_iname_tags(tags, tag_type, max_num=None, min_num=None):
+    """Return the set of those *tags* that are instances of *tag_type*.
+
+    :arg tags: an iterable of tags
+    :arg tag_type: a type or tuple of types, as accepted by
+        :func:`isinstance`
+    :arg max_num: if not *None*, the largest permitted number of matches
+    :arg min_num: if not *None*, the smallest permitted number of matches
+    :raises LoopyError: if the number of matching tags is outside the
+        bounds given by *min_num* and *max_num*
+    """
+    from loopy.diagnostic import LoopyError
+
+    result = set(tag for tag in tags if isinstance(tag, tag_type))
+    if max_num is not None and len(result) > max_num:
+        raise LoopyError("cannot have more than {0} tag(s) "
+                "of type(s): {1}".format(max_num, tag_type))
+    if min_num is not None and len(result) < min_num:
+        raise LoopyError("must have at least {0} tag(s) "
+                "of type(s): {1}".format(min_num, tag_type))
+    return result
 
 
 class IndexTag(ImmutableRecord):
     __slots__ = []
 
     def __hash__(self):
-        raise RuntimeError("use .key to hash index tags")
+        return hash(self.key)
 
     def update_persistent_hash(self, key_hash, key_builder):
         """Custom hash computation function for use with
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 505482dea66cf2a1849b262eb3d28b121f990b88..5be3375373ca322bb3126ebf1a4407235f0ca2eb 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -36,7 +36,7 @@ from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
 from pytools import memoize_on_first_arg
 from loopy.tools import natsorted
-from loopy.kernel.data import check_iname_tags
+from loopy.kernel.data import get_iname_tags
 
 import logging
 logger = logging.getLogger(__name__)
@@ -632,7 +632,7 @@ def is_domain_dependent_on_inames(kernel, domain_index, inames):
 # {{{ rank inames by stride
 
 def get_auto_axis_iname_ranking_by_stride(kernel, insn):
-    from loopy.kernel.data import ImageArg, ValueArg, check_iname_tags
+    from loopy.kernel.data import ImageArg, ValueArg, get_iname_tags
 
     approximate_arg_values = {}
     for arg in kernel.args:
@@ -677,8 +677,8 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 
     from loopy.kernel.data import AutoLocalIndexTagBase
     auto_axis_inames = set(
-            iname for iname in kernel.insn_inames(insn)
-            if check_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase))
+        iname for iname in kernel.insn_inames(insn)
+        if get_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase))
 
     # }}}
 
@@ -754,7 +754,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
     # to set() from tuple()
 
     from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag,
-                                   check_iname_tags, get_iname_tags)
+                                   get_iname_tags)
 
     # Realize that at this point in time, axis lengths are already
     # fixed. So we compute them once and pass them to our recursive
@@ -832,7 +832,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
                             do_tagged_check=False),
                         axis=recursion_axis, local_size=local_size)
 
-        if not check_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase):
+        if not get_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase):
             raise LoopyError("trying to reassign '%s'" % iname)
 
         new_iname_to_tags = kernel.iname_to_tags.copy()
@@ -855,7 +855,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 
         auto_axis_inames = [
             iname for iname in kernel.insn_inames(insn)
-            if check_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase)]
+            if get_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase)]
 
         if not auto_axis_inames:
             continue
@@ -878,7 +878,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
                 if iname_ranking is not None:
                     for iname in iname_ranking:
                         prev_tags = kernel.iname_to_tags[iname]
-                        if check_iname_tags(prev_tags, AutoLocalIndexTagBase):
+                        if get_iname_tags(prev_tags, AutoLocalIndexTagBase):
                             return assign_axis(axis, iname, axis)
 
         else:
@@ -1134,7 +1134,7 @@ def get_visual_iname_order_embedding(kernel):
     # nest.
     ilp_inames = frozenset(iname
         for iname in kernel.iname_to_tags
-        if check_iname_tags(kernel.iname_to_tags[iname], IlpBaseTag))
+        if get_iname_tags(kernel.iname_to_tags[iname], IlpBaseTag))
 
     iname_trie = SetTrie()
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index b20fbef91168ab4a5580b031d1cab4403bfec7c1..38b213c05632670ebf98fba0534fbffba5738fa5 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -34,7 +34,7 @@ from pytools.persistent_dict import WriteOncePersistentDict
 
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
-from loopy.kernel.data import make_assignment, check_iname_tags, get_iname_tags
+from loopy.kernel.data import make_assignment, get_iname_tags
 # for the benefit of loopy.statistics, for now
 from loopy.type_inference import infer_unknown_types
 
@@ -136,7 +136,7 @@ def check_reduction_iname_uniqueness(kernel):
 
 def _get_compute_inames_tagged(kernel, insn, tag_base):
     return set(iname for iname in kernel.insn_inames(insn.id)
-               if check_iname_tags(kernel.iname_to_tags[iname], tag_base))
+               if get_iname_tags(kernel.iname_to_tags[iname], tag_base))
 
 
 def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names):
@@ -146,7 +146,7 @@ def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names):
                 insn.assignee_subscript_deps())
             for iname in adeps & kernel.all_inames()
             if aname in tv_names
-            if check_iname_tags(kernel.iname_to_tags[iname], tag_base))
+            if get_iname_tags(kernel.iname_to_tags[iname], tag_base))
 
 
 def find_temporary_scope(kernel):
@@ -291,20 +291,20 @@ def _classify_reduction_inames(kernel, inames):
 
     from loopy.kernel.data import (
             LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
-            ConcurrentTag, check_iname_tags)
+            ConcurrentTag, get_iname_tags)
 
     for iname in inames:
         iname_tags = kernel.iname_to_tags[iname]
 
-        if check_iname_tags(iname_tags, (UnrollTag, UnrolledIlpTag)):
+        if get_iname_tags(iname_tags, (UnrollTag, UnrolledIlpTag)):
             # These are nominally parallel, but we can live with
             # them as sequential.
             sequential.append(iname)
 
-        elif check_iname_tags(iname_tags, LocalIndexTagBase):
+        elif get_iname_tags(iname_tags, LocalIndexTagBase):
             local_par.append(iname)
 
-        elif check_iname_tags(iname_tags, (ConcurrentTag, VectorizeTag)):
+        elif get_iname_tags(iname_tags, (ConcurrentTag, VectorizeTag)):
             nonlocal_par.append(iname)
 
         else:
@@ -1136,9 +1136,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
-        from loopy.kernel.data import LocalIndexTagBase, check_iname_tags
+        from loopy.kernel.data import LocalIndexTagBase, get_iname_tags
         outer_local_inames = tuple(oiname for oiname in outer_insn_inames
-                if check_iname_tags(kernel.iname_to_tags[oiname], LocalIndexTagBase))
+                if get_iname_tags(kernel.iname_to_tags[oiname], LocalIndexTagBase))
 
         from pymbolic import var
         outer_local_iname_vars = tuple(
@@ -1471,9 +1471,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
-        from loopy.kernel.data import LocalIndexTagBase, check_iname_tags
+        from loopy.kernel.data import LocalIndexTagBase, get_iname_tags
         outer_local_inames = tuple(oiname for oiname in outer_insn_inames
-                if check_iname_tags(kernel.iname_to_tags[oiname], LocalIndexTagBase)
+                if get_iname_tags(kernel.iname_to_tags[oiname], LocalIndexTagBase)
                 and oiname != sweep_iname)
 
         from pymbolic import var
@@ -1740,7 +1740,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     "by reductions is 'local'--found iname(s) '%s' "
                     "respectively tagged '%s'"
                     % (", ".join(bad_inames),
-                       ", ".join(tag.key for tag in kernel.iname_to_tags[iname]
+                       ", ".join(str(kernel.iname_to_tags[iname])
                                  for iname in bad_inames)))
 
         if n_local_par == 0 and n_sequential == 0:
@@ -2150,7 +2150,7 @@ def preprocess_kernel(kernel, device=None):
 
     from loopy.kernel.data import AutoLocalIndexTagBase
     for iname, tags in six.iteritems(kernel.iname_to_tags):
-        if (check_iname_tags(tags, AutoLocalIndexTagBase)
+        if (get_iname_tags(tags, AutoLocalIndexTagBase)
                  and iname in kernel.all_inames()):
             raise LoopyError("kernel with automatically-assigned "
                     "local axes passed to preprocessing")
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 68b3fb0cbad892745a775f6e1ef3f6b5e34a1c5a..bda316bc170b9099e41355a4a3670902f44461e6 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -213,12 +213,12 @@ def find_loop_nest_with_map(kernel):
     result = {}
 
     from loopy.kernel.data import (ConcurrentTag, IlpBaseTag, VectorizeTag,
-                                   check_iname_tags)
+                                   get_iname_tags)
 
     all_nonpar_inames = set(
-            iname for iname, tags in six.iteritems(kernel.iname_to_tags)
-            if tags and not check_iname_tags(tags,
-                (ConcurrentTag, IlpBaseTag, VectorizeTag)))
+            iname for iname in kernel.all_inames()
+            if not get_iname_tags(kernel.iname_to_tags[iname],
+                    (ConcurrentTag, IlpBaseTag, VectorizeTag)))
 
     iname_to_insns = kernel.iname_to_insns()
 
@@ -241,7 +241,7 @@ def find_loop_nest_around_map(kernel):
     iname_to_insns = kernel.iname_to_insns()
 
     # examine pairs of all inames--O(n**2), I know.
-    from loopy.kernel.data import IlpBaseTag, check_iname_tags
+    from loopy.kernel.data import IlpBaseTag, get_iname_tags
     for inner_iname in all_inames:
         result[inner_iname] = set()
         for outer_iname in all_inames:
@@ -249,7 +249,7 @@ def find_loop_nest_around_map(kernel):
                 continue
 
             tags = kernel.iname_to_tags[outer_iname]
-            if check_iname_tags(tags, IlpBaseTag):
+            if get_iname_tags(tags, IlpBaseTag):
                 # ILP tags are special because they are parallel tags
                 # and therefore 'in principle' nest around everything.
                 # But they're realized by the scheduler as a loop
@@ -279,10 +279,10 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
     result = {}
 
     from loopy.kernel.data import (ConcurrentTag, IlpBaseTag, VectorizeTag,
-                                   check_iname_tags)
+                                   get_iname_tags)
     for insn in kernel.instructions:
         for iname in kernel.insn_inames(insn):
-            if check_iname_tags(kernel.iname_to_tags[iname], ConcurrentTag):
+            if get_iname_tags(kernel.iname_to_tags[iname], ConcurrentTag):
                 continue
 
             iname_dep = result.setdefault(iname, set())
@@ -313,7 +313,8 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
                         continue
 
                     tags = kernel.iname_to_tags[dep_insn_iname]
-                    if check_iname_tags(tags, (ConcurrentTag, IlpBaseTag, VectorizeTag)):
+                    if get_iname_tags(tags,
+                                (ConcurrentTag, IlpBaseTag, VectorizeTag)):
                         # Parallel tags don't really nest, so we'll disregard
                         # them here.
                         continue
@@ -1878,19 +1879,19 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
         for insn_id in sched_item_to_insn_id(item))
 
     from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag,
-                                   check_iname_tags)
+                                   get_iname_tags)
     ilp_inames = set(
             iname
             for iname, tags in six.iteritems(kernel.iname_to_tags)
-            if check_iname_tags(tags, IlpBaseTag))
+            if get_iname_tags(tags, IlpBaseTag))
     vec_inames = set(
             iname
             for iname, tags in six.iteritems(kernel.iname_to_tags)
-            if check_iname_tags(tags, VectorizeTag))
+            if get_iname_tags(tags, VectorizeTag))
     parallel_inames = set(
             iname
             for iname, tags in six.iteritems(kernel.iname_to_tags)
-            if check_iname_tags(tags, ConcurrentTag))
+            if get_iname_tags(tags, ConcurrentTag))
 
     loop_nest_with_map = find_loop_nest_with_map(kernel)
     loop_nest_around_map = find_loop_nest_around_map(kernel)
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 77c638128fe3dab63ee79f72bd14d70e0ca866bf..4b0643873d100af365cbf2782f0c084f83b55760 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1182,12 +1182,11 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
     l_used = set()
 
     from loopy.kernel.data import (LocalIndexTag, GroupIndexTag,
-                                   get_iname_tags, check_iname_tags)
+                                   get_iname_tags)
     for iname in knl.insn_inames(insn):
-        tags = get_iname_tags(knl.iname_to_tags[iname], (LocalIndexTag, GroupIndexTag))
+        tags = get_iname_tags(knl.iname_to_tags[iname],
+                              (LocalIndexTag, GroupIndexTag), 1)
         if tags:
-            if len(tags) > 1:
-                raise LoopyError("cannot have more than one UniqueTags")
             tag, = tags
             if isinstance(tag, LocalIndexTag):
                 l_used.add(tag.axis)
@@ -1221,9 +1220,9 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False)
     insn_inames = knl.insn_inames(insn)
 
     if disregard_local_axes:
-        from loopy.kernel.data import LocalIndexTag, check_iname_tags
+        from loopy.kernel.data import LocalIndexTag, get_iname_tags
         insn_inames = [iname for iname in insn_inames if not
-                check_iname_tags(kernel.iname_to_tags[iname], LocalIndexTag)]
+                get_iname_tags(knl.iname_to_tags[iname], LocalIndexTag)]
 
     inames_domain = knl.get_inames_domain(insn_inames)
     domain = (inames_domain.project_out_except(
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 3fd2bc1ae6a439aeab1ff290e990143207f948d2..b7edc517bee38c3f7b6a6cdce5419863db5ffed8 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -418,33 +418,33 @@ class ISPCASTBuilder(CASTBuilder):
 
             new_terms = []
 
-            from loopy.kernel.data import LocalIndexTag, check_iname_tags, get_iname_tags
+            from loopy.kernel.data import LocalIndexTag, get_iname_tags
             from loopy.symbolic import get_dependencies
 
             saw_l0 = False
             for term in terms:
                 if (isinstance(term, Variable)
-                    and check_iname_tags(kernel.iname_to_tags[term.name], LocalIndexTag)):
-                        tags = get_iname_tags(kernel.iname_to_tags[term.name], LocalIndexTag)
-                        if len(tags) > 1:
-                            raise LoopyError("cannot have more than one LocalIndexTags")
-                        tag, = tags
+                        and get_iname_tags(
+                            kernel.iname_to_tags[term.name], LocalIndexTag)):
+                        tag, = get_iname_tags(kernel.iname_to_tags[term.name],
+                                              LocalIndexTag, 1)
                         if tag.axis == 0:
                             if saw_l0:
-                                raise LoopyError("streaming store must have stride 1 "
-                                        "in local index, got: %s" % subscript)
+                                raise LoopyError(
+                                    "streaming store must have stride 1 in "
+                                    "local index, got: %s" % subscript)
                             saw_l0 = True
                             continue
                 else:
                     for dep in get_dependencies(term):
-                        if check_iname_tags(kernel.iname_to_tags[dep], LocalIndexTag):
-                            tags = get_iname_tags(kernel.iname_to_tags[dep], LocalIndexTag)
-                            if len(tags) > 1:
-                                raise LoopyError("cannot have more than one LocalIndexTags")
-                            tag, = tags
+                        if get_iname_tags(
+                                kernel.iname_to_tags[dep], LocalIndexTag):
+                            tag, = get_iname_tags(kernel.iname_to_tags[dep],
+                                                  LocalIndexTag, 1)
                             if tag.axis == 0:
-                                raise LoopyError("streaming store must have stride 1 "
-                                        "in local index, got: %s" % subscript)
+                                raise LoopyError(
+                                    "streaming store must have stride 1 in "
+                                    "local index, got: %s" % subscript)
 
                     new_terms.append(term)
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 21c2f7eea5054db0ed393dd49c003ae8bcfa7ee6..365f2db773e63b69c75771cb45532c29416e03c4 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -33,7 +33,6 @@ from loopy.symbolic import (
         RuleAwareIdentityMapper, RuleAwareSubstitutionMapper,
         SubstitutionRuleMappingContext)
 from loopy.diagnostic import LoopyError
-from loopy.kernel.data import check_iname_tags, get_iname_tags
 
 
 __doc__ = """
@@ -177,9 +176,9 @@ def _split_iname_backend(kernel, split_iname,
     """
 
     existing_tags = kernel.iname_to_tags[split_iname]
-    from loopy.kernel.data import ForceSequentialTag, check_iname_tags
+    from loopy.kernel.data import ForceSequentialTag, get_iname_tags
     if (do_tagged_check and existing_tags
-            and not check_iname_tags(existing_tags, ForceSequentialTag)):
+            and not get_iname_tags(existing_tags, ForceSequentialTag)):
         raise LoopyError("cannot split already tagged iname '%s'" % split_iname)
 
     if split_iname not in kernel.all_inames():
@@ -648,8 +647,8 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
 
     iname_to_tag = [(iname, parse_tag(tag)) for iname, tag in iname_to_tag]
 
-    from loopy.kernel.data import (ConcurrentTag, AutoLocalIndexTagBase,
-            ForceSequentialTag)
+    from loopy.kernel.data import (ConcurrentTag, ForceSequentialTag,
+                                   get_iname_tags)
 
     # {{{ globbing
 
@@ -680,27 +679,27 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
 
     knl_iname_to_tags = kernel.iname_to_tags.copy()
     for iname, new_tag in six.iteritems(iname_to_tag):
+        if not new_tag:
+            continue
+
         old_tags = kernel.iname_to_tags[iname]
 
         if iname not in kernel.all_inames():
             raise ValueError("cannot tag '%s'--not known" % iname)
 
-        if isinstance(new_tag, ConcurrentTag) \
-                and check_iname_tags(old_tags, ForceSequentialTag):
+        if (isinstance(new_tag, ConcurrentTag)
+                and get_iname_tags(old_tags, ForceSequentialTag)):
             raise ValueError("cannot tag '%s' as parallel--"
                     "iname requires sequential execution" % iname)
 
-        if isinstance(new_tag, ForceSequentialTag) \
-                and check_iname_tags(old_tags, ConcurrentTag):
+        if (isinstance(new_tag, ForceSequentialTag)
+                and get_iname_tags(old_tags, ConcurrentTag)):
             raise ValueError("'%s' is already tagged as parallel, "
                     "but is now prohibited from being parallel "
                     "(likely because of participation in a precompute or "
                     "a reduction)" % iname)
 
-        if new_tag and all(tag.key != new_tag.key for tag in old_tags):
-            old_tags = old_tags + (new_tag,)
-
-        knl_iname_to_tags[iname] = old_tags
+        knl_iname_to_tags[iname] = old_tags.union([new_tag])
 
     return kernel.copy(iname_to_tags=knl_iname_to_tags)
 
@@ -982,9 +981,9 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
     # Get the duplication options as a tuple of iname and a set
     for iname, insns in _get_iname_duplication_options(insn_iname_sets):
         # Check whether this iname has a parallel tag and discard it if so
-        from loopy.kernel.data import ConcurrentTag, check_iname_tags
+        from loopy.kernel.data import ConcurrentTag, get_iname_tags
         if (iname in knl.iname_to_tags
-                    and check_iname_tags(knl.iname_to_tags[iname], ConcurrentTag)):
+                and get_iname_tags(knl.iname_to_tags[iname], ConcurrentTag)):
             continue
 
         # If we find a duplication option and to not use boostable_into
@@ -1501,7 +1500,7 @@ def find_unused_axis_tag(kernel, kind, insn_match=None):
     """
     used_axes = set()
 
-    from loopy.kernel.data import GroupIndexTag, LocalIndexTag, check_iname_tags
+    from loopy.kernel.data import GroupIndexTag, LocalIndexTag, get_iname_tags
 
     if isinstance(kind, str):
         found = False
@@ -1521,7 +1520,7 @@ def find_unused_axis_tag(kernel, kind, insn_match=None):
     for insn in insns:
         for iname in kernel.insn_inames(insn):
             dim_tags = kernel.iname_to_tags[iname]
-            if check_iname_tags(dim_tags, kind):
+            if get_iname_tags(dim_tags, kind):
                 used_axes.add(kind.axis)
 
     i = 0
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index d1c112eca6a44cdd0bbd2b2826c291cd52ba8a0a..4810784039899a471bbd23c4922438e75275db25 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -41,7 +41,6 @@ __doc__ = """
 # {{{ privatize temporaries with iname
 
 from loopy.symbolic import IdentityMapper
-from loopy.kernel.data import check_iname_tags, get_iname_tags
 
 
 class ExtraInameIndexInserter(IdentityMapper):
@@ -85,6 +84,5 @@ def privatize_temporaries_with_inames(
 
     Example::
 
-<<<<<<< HEAD:loopy/transform/privatize.py
         for imatrix, i
             acc = 0
@@ -95,6 +93,3 @@ def privatize_temporaries_with_inames(
-=======
-    from loopy.kernel.data import IlpBaseTag, VectorizeTag, check_iname_tags
->>>>>>> d4c1d2e... change tags from set to tuple:loopy/transform/ilp.py
 
     might become::
 
@@ -192,7 +187,7 @@ def privatize_temporaries_with_inames(
 
         dim_tags = ["c"] * (len(shape) + len(extra_shape))
         for i, iname in enumerate(inames):
-            if check_iname_tags(kernel.iname_to_tags[iname], VectorizeTag):
+            if get_iname_tags(kernel.iname_to_tags[iname], VectorizeTag):
                 dim_tags[len(shape) + i] = "vec"
 
         new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 962a83cd1098b48c7bcf21190609209dc79904b2..cd4c10272651128c4fe311c9e4af66798fa448df 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -403,26 +403,20 @@ class TemporarySaver(object):
                     continue
 
                 from loopy.kernel.data import (GroupIndexTag, LocalIndexTag,
-                        ConcurrentTag, get_iname_tags, check_iname_tags)
+                        ConcurrentTag, get_iname_tags)
 
-                if check_iname_tags(tags, GroupIndexTag):
-                    tags = get_iname_tags(tags, GroupIndexTag)
-                    if len(tags) > 1:
-                        raise LoopyError("cannot have more than one GroupIndexTags")
-                    tag, = tags
+                if get_iname_tags(tags, GroupIndexTag):
+                    tag, = get_iname_tags(tags, GroupIndexTag, 1)
                     my_group_tags.append(tag)
-                elif check_iname_tags(tags, LocalIndexTag):
-                    tags = get_iname_tags(tags, LocalIndexTag)
-                    if len(tags) > 1:
-                        raise LoopyError("cannot have more than one LocalIndexTags")
-                    tag, = tags
+                elif get_iname_tags(tags, LocalIndexTag):
+                    tag, = get_iname_tags(tags, LocalIndexTag, 1)
                     my_local_tags.append(tag)
-                elif check_iname_tags(tags, ConcurrentTag):
+                elif get_iname_tags(tags, ConcurrentTag):
                     raise LoopyError(
                         "iname '%s' is tagged with '%s' - only "
                         "group and local tags are supported for "
                         "auto save/reload of temporaries" %
-                        (iname, ", ".join(str(tag) for tag in tags)))
+                        (iname, tags))
 
             if group_tags is None:
                 group_tags = _sortedtags(my_group_tags)