diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ccb7cec080b3b4c04832ae079300d0e50c94a3f5..870c2a6bfd428657a8145bc1eeca3061bd3196f9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -60,16 +60,16 @@ Python 3.6 POCL Twice With Cache:
   except:
   - tags
 
-# PyPy AMD CPU:
+# PyPy POCL:
 #   script:
 #   - export PY_EXE=pypy
-#   - export PYOPENCL_TEST="amd:pu"
+#   - export PYOPENCL_TEST=portable
 #   - export EXTRA_INSTALL="numpy mako"
 #   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
 #   - ". ./build-and-test-py-project.sh"
 #   tags:
 #   - pypy
-#   - amd-cl-cpu
+#   - pocl
 #   except:
 #   - tags
 
@@ -87,19 +87,19 @@ Python 3.6 POCL Examples:
   except:
   - tags
 
-# CentOS binary:
-#   script:
-#   - (cd build-helpers; ./make-linux-build-docker.sh --nodate)
-#   - (cd ./build-helpers; ./loopy-centos6 ../examples/fortran/sparse.floopy)
-#   artifacts:
-#     expire_in: 4 weeks
-#     paths:
-#     - build-helpers/loopy-centos6
-#   tags:
-#   - docker
-#   only:
-#   - master
-#   retry: 2
+CentOS binary:
+  script:
+  - (cd build-helpers; ./make-linux-build-docker.sh --nodate)
+  - (cd ./build-helpers; ./loopy-centos6 ../examples/fortran/sparse.floopy)
+  artifacts:
+    expire_in: 4 weeks
+    paths:
+    - build-helpers/loopy-centos6
+  tags:
+  - docker
+  only:
+  - master
+  retry: 2
 
 Documentation:
   script:
diff --git a/loopy/check.py b/loopy/check.py
index 17703e90f2ca275363f6d4d16fa7f721264845d4..77e916328a4849d76c769f31c18d74935cca9a8a 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -208,15 +208,14 @@ def check_multiple_tags_allowed(kernel):
 
 
 def check_for_double_use_of_hw_axes(kernel):
-    from loopy.kernel.data import UniqueTag, filter_iname_tags_by_type
+    from loopy.kernel.data import UniqueTag
     from loopy.kernel.instruction import CallInstruction
     from loopy.kernel.function_interface import CallableKernel
 
     for insn in kernel.instructions:
         insn_tag_keys = set()
         for iname in kernel.insn_inames(insn):
-            tags = kernel.iname_to_tags[iname]
-            for tag in filter_iname_tags_by_type(tags, UniqueTag):
+            for tag in kernel.iname_tags_of_type(iname, UniqueTag):
                 key = tag.key
                 if key in insn_tag_keys:
                     raise LoopyError("instruction '%s' has multiple "
@@ -279,7 +278,7 @@ def _is_racing_iname_tag(tv, tag):
 
 
 def check_for_write_races(kernel):
-    from loopy.kernel.data import ConcurrentTag, filter_iname_tags_by_type
+    from loopy.kernel.data import ConcurrentTag
 
     for insn in kernel.instructions:
         for assignee_name, assignee_indices in zip(
@@ -298,15 +297,14 @@ def check_for_write_races(kernel):
 
                 raceable_parallel_insn_inames = set(
                     iname for iname in kernel.insn_inames(insn)
-                    if filter_iname_tags_by_type(kernel.iname_to_tags[iname],
-                                      ConcurrentTag))
+                    if kernel.iname_tags_of_type(iname, ConcurrentTag))
 
             elif assignee_name in kernel.temporary_variables:
                 temp_var = kernel.temporary_variables[assignee_name]
                 raceable_parallel_insn_inames = set(
                         iname for iname in kernel.insn_inames(insn)
                         if any(_is_racing_iname_tag(temp_var, tag)
-                            for tag in kernel.iname_to_tags[iname]))
+                            for tag in kernel.iname_tags(iname)))
 
             else:
                 raise LoopyError("invalid assignee name in instruction '%s'"
@@ -342,12 +340,13 @@ def check_for_orphaned_user_hardware_axes(kernel):
 
 
 def check_for_data_dependent_parallel_bounds(kernel):
-    from loopy.kernel.data import ConcurrentTag, filter_iname_tags_by_type
+    from loopy.kernel.data import ConcurrentTag
 
     for i, dom in enumerate(kernel.domains):
         dom_inames = set(dom.get_var_names(dim_type.set))
-        par_inames = set(iname for iname in dom_inames
-            if filter_iname_tags_by_type(kernel.iname_to_tags[iname], ConcurrentTag))
+        par_inames = set(
+                iname for iname in dom_inames
+                if kernel.iname_tags_of_type(iname, ConcurrentTag))
 
         if not par_inames:
             continue
@@ -774,7 +773,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
     # alternative: just disregard length-1 dimensions?
 
     from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase,
-                        GroupIndexTag, filter_iname_tags_by_type)
+                        GroupIndexTag)
 
     while i < loop_end_i:
         sched_item = kernel.schedule[i]
@@ -792,15 +791,18 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
             local_axes_used = set()
 
             for iname in kernel.insn_inames(insn):
-                tags = kernel.iname_to_tags[iname]
+                ltags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
+                gtags = kernel.iname_tags_of_type(iname, GroupIndexTag, max_num=1)
+                altags = kernel.iname_tags_of_type(
+                        iname, AutoLocalIndexTagBase, max_num=1)
 
-                if filter_iname_tags_by_type(tags, LocalIndexTag):
-                    tag, = filter_iname_tags_by_type(tags, LocalIndexTag, 1)
+                if ltags:
+                    tag, = ltags
                     local_axes_used.add(tag.axis)
-                elif filter_iname_tags_by_type(tags, GroupIndexTag):
-                    tag, = filter_iname_tags_by_type(tags, GroupIndexTag, 1)
+                elif gtags:
+                    tag, = gtags
                     group_axes_used.add(tag.axis)
-                elif filter_iname_tags_by_type(tags, AutoLocalIndexTagBase):
+                elif altags:
                     raise LoopyError("auto local tag encountered")
 
             if group_axes != group_axes_used:
@@ -1045,12 +1047,11 @@ def check_implemented_domains(kernel, implemented_domains, code=None):
                 .project_out_except(insn_inames, [dim_type.set]))
 
         from loopy.kernel.instruction import BarrierInstruction
-        from loopy.kernel.data import LocalIndexTag, filter_iname_tags_by_type
+        from loopy.kernel.data import LocalIndexTag
         if isinstance(insn, BarrierInstruction):
             # project out local-id-mapped inames, solves #94 on gitlab
             non_lid_inames = frozenset(iname for iname in insn_inames
-                if not filter_iname_tags_by_type(
-                    kernel.iname_to_tags[iname], LocalIndexTag))
+                if not kernel.iname_tags_of_type(iname, LocalIndexTag))
             insn_impl_domain = insn_impl_domain.project_out_except(
                 non_lid_inames, [dim_type.set])
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index a6b70359af614525255388ab8536e5fb903483e4..c946e09a086e574a2593d60f652a81773d95a1fe 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -59,7 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index):
     from loopy.schedule import (
         find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within)
     from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase,
-                                   IlpBaseTag, filter_iname_tags_by_type)
+                                   IlpBaseTag)
 
     result = find_active_inames_at(kernel, sched_index)
     crosses_barrier = has_barrier_within(kernel, sched_index)
@@ -88,8 +88,6 @@ def get_usable_inames_for_conditional(kernel, sched_index):
         for iname in kernel.insn_inames(insn))
 
     for iname in inames_for_subkernel:
-        tags = kernel.iname_to_tags[iname]
-
         # Parallel inames are defined within a subkernel, BUT:
         #
         # - local indices may not be used in conditionals that cross barriers.
@@ -98,10 +96,10 @@ def get_usable_inames_for_conditional(kernel, sched_index):
         #   at the innermost level of nesting.
 
         if (
-                filter_iname_tags_by_type(tags, ConcurrentTag)
-                and not (filter_iname_tags_by_type(tags, LocalIndexTagBase)
-                and crosses_barrier)
-                and not filter_iname_tags_by_type(tags, IlpBaseTag)
+                kernel.iname_tags_of_type(iname, ConcurrentTag)
+                and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase)
+                    and crosses_barrier)
+                and not kernel.iname_tags_of_type(iname, IlpBaseTag)
         ):
             result.add(iname)
 
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index af3a0cbd25ebc96ce9380bb46e767a7d5a953876..9969f6ad0ba51e3a0def3e11c83eb49a204c14ab 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -128,7 +128,7 @@ def generate_code_for_sched_index(codegen_state, sched_index):
             ])
 
     elif isinstance(sched_item, EnterLoop):
-        tags = kernel.iname_to_tags[sched_item.iname]
+        tags = kernel.iname_tags(sched_item.iname)
         tags = tuple(tag for tag in tags if tag)
 
         from loopy.codegen.loop import (
@@ -143,7 +143,7 @@ def generate_code_for_sched_index(codegen_state, sched_index):
             func = generate_unroll_loop
         elif filter_iname_tags_by_type(tags, VectorizeTag):
             func = generate_vectorize_loop
-        elif len(tags) == 0 or filter_iname_tags_by_type(tags, (LoopedIlpTag,
+        elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag,
                     ForceSequentialTag, InOrderSequentialSequentialTag)):
             func = generate_sequential_loop_dim_code
         else:
@@ -423,7 +423,7 @@ def build_loop_nest(codegen_state, schedule_index):
 
             # }}}
 
-            only_unshared_inames = kernel.remove_inames_for_shared_hw_axes(
+            only_unshared_inames = kernel._remove_inames_for_shared_hw_axes(
                     current_iname_set & used_inames)
 
             bounds_checks = bounds_check_cache(only_unshared_inames)
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 7b44fd7b27d0d1191778d7a01aa34904a013f808..ebddf315373403b8afaf174d149872f5dcb4518e 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     kernel = codegen_state.kernel
 
     from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
-                LocalIndexTag, GroupIndexTag, filter_iname_tags_by_type)
+                LocalIndexTag, GroupIndexTag)
 
     from loopy.schedule import get_insn_ids_for_block_at
     insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index)
@@ -242,8 +242,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
             all_inames_by_insns |= kernel.insn_inames(insn_id)
 
         hw_inames_left = [iname for iname in all_inames_by_insns
-                if filter_iname_tags_by_type(kernel.iname_to_tags[iname],
-                                        HardwareConcurrentTag)]
+                if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)]
 
     if not hw_inames_left:
         return next_func(codegen_state)
@@ -254,11 +253,9 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
     hw_inames_left = hw_inames_left[:]
     iname = hw_inames_left.pop()
 
-    tags = kernel.iname_to_tags[iname]
-
     from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex
 
-    tag, = filter_iname_tags_by_type(tags, UniqueTag, max_num=1, min_num=1)
+    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)
 
     if isinstance(tag, GroupIndexTag):
         hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
@@ -269,10 +266,11 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
 
     other_inames_with_same_tag = [
         other_iname for other_iname in kernel.all_inames()
-        if (filter_iname_tags_by_type(kernel.iname_to_tags[other_iname], UniqueTag)
+        if (kernel.iname_tags_of_type(other_iname, UniqueTag)
             and other_iname != iname
             and any(_tag.key == tag.key
-                    for _tag in kernel.iname_to_tags[other_iname] if _tag))]
+                    for _tag in kernel.iname_tags(other_iname)
+                    if _tag))]
 
     # {{{ 'implement' hardware axis boundaries
 
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index b752d8a13f1428c29d7372482c7939c4b524e8b8..4141ac4cb78ce049087047fac216afb92fb94a1b 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -104,6 +104,12 @@ class LoopKernel(ImmutableRecordWithoutPickling):
     """These correspond more or less directly to arguments of
     :func:`loopy.make_kernel`.
 
+    .. note::
+
+        This data structure and its attributes should be considered immutable,
+        even if it contains mutable data types. See :meth:`copy` for an easy
+        way of producing a modified copy.
+
     .. attribute:: domains
 
         a list of :class:`islpy.BasicSet` instances
@@ -198,25 +204,25 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     # {{{ constructor
 
-    def __init__(self, domains, instructions, args=[], schedule=None,
+    def __init__(self, domains, instructions, args=None, schedule=None,
             name="loopy_kernel",
-            preambles=[],
-            preamble_generators=[],
+            preambles=None,
+            preamble_generators=None,
             assumptions=None,
-            local_sizes={},
-            temporary_variables={},
-            iname_to_tags=defaultdict(set),
-            substitutions={},
-            function_manglers=[],
+            local_sizes=None,
+            temporary_variables=None,
+            iname_to_tags=None,
+            substitutions=None,
+            function_manglers=None,
             function_scopers=None,
             scoped_functions={},
             symbol_manglers=[],
 
-            iname_slab_increments={},
+            iname_slab_increments=None,
             loop_priority=frozenset(),
-            silenced_warnings=[],
+            silenced_warnings=None,
 
-            applied_iname_rewrites=[],
+            applied_iname_rewrites=None,
             cache_manager=None,
             index_dtype=np.int32,
             options=None,
@@ -233,10 +239,38 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             change. This provides a way to forward sub-kernel grid size requests.
         """
 
+        # {{{ process constructor arguments
+
+        if args is None:
+            args = []
+        if preambles is None:
+            preambles = []
+        if preamble_generators is None:
+            preamble_generators = []
+        if local_sizes is None:
+            local_sizes = {}
+        if temporary_variables is None:
+            temporary_variables = {}
+        if iname_to_tags is None:
+            iname_to_tags = {}
+        if substitutions is None:
+            substitutions = {}
+        if function_manglers is None:
+            function_manglers = []
+        if iname_slab_increments is None:
+            iname_slab_increments = {}
+
+        if silenced_warnings is None:
+            silenced_warnings = []
+        if applied_iname_rewrites is None:
+            applied_iname_rewrites = []
+
         if cache_manager is None:
             from loopy.kernel.tools import SetOperationCacheManager
             cache_manager = SetOperationCacheManager()
 
+        # }}}
+
         # {{{ process assumptions
 
         if assumptions is None:
@@ -274,6 +308,14 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 ]:
             raise ValueError("invalid value for 'state'")
 
+        from collections import defaultdict
+        assert not isinstance(iname_to_tags, defaultdict)
+
+        for iname, tags in six.iteritems(iname_to_tags):
+            # don't tolerate empty sets
+            assert tags
+            assert isinstance(tags, frozenset)
+
         assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains)
         assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT
 
@@ -319,24 +361,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     # }}}
 
-    # {{{ compatibility wrapper for iname_to_tag.get("iname")
-
-    @property
-    def iname_to_tag(self):
-        from warnings import warn
-        warn("Since version 2018.1, inames can hold multiple tags. Use "
-             "iname_to_tags['iname'] instead. iname_to_tag.get('iname') will be "
-             "deprecated at version 2019.0.", DeprecationWarning)
-        for iname, tags in six.iteritems(self.iname_to_tags):
-            if len(tags) > 1:
-                raise LoopyError(
-                    "iname {0} has multiple tags: {1}. "
-                    "Use iname_to_tags['iname'] instead.".format(iname, tags))
-        return dict((k, next(iter(v)))
-                    for k, v in six.iteritems(self.iname_to_tags) if v)
-
-    # }}}
-
     # {{{ function mangling/scoping
 
     def mangle_function(self, identifier, arg_dtypes, ast_builder=None):
@@ -682,6 +706,26 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     # {{{ iname wrangling
 
+    def iname_tags(self, iname):
+        return self.iname_to_tags.get(iname, frozenset())
+
+    def iname_tags_of_type(self, iname, tag_type_or_types,
+            max_num=None, min_num=None):
+        """Return the tags of *iname* that match *tag_type_or_types*. Raises an
+        exception if more than *max_num* or fewer than *min_num* tags are found.
+
+        :arg iname: the iname whose tags are filtered.
+        :arg tag_type_or_types: a subclass of :class:`loopy.kernel.data.IndexTag`,
+            or a tuple of such subclasses.
+        :arg max_num: the maximum number of tags expected to be found.
+        :arg min_num: the minimum number of tags expected to be found.
+        """
+
+        from loopy.kernel.data import filter_iname_tags_by_type
+        return filter_iname_tags_by_type(
+                self.iname_to_tags.get(iname, frozenset()),
+                tag_type_or_types, max_num=max_num, min_num=min_num)
+
     @memoize_method
     def all_inames(self):
         result = set()
@@ -748,7 +792,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         return result
 
     @memoize_method
-    def remove_inames_for_shared_hw_axes(self, cond_inames):
+    def _remove_inames_for_shared_hw_axes(self, cond_inames):
         """
         See if cond_inames contains references to two (or more) inames that
         boil down to the same tag. If so, exclude them. (We shouldn't be writing
@@ -761,8 +805,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         from loopy.kernel.data import HardwareConcurrentTag
 
         for iname in cond_inames:
-            tags = filter_iname_tags_by_type(self.iname_to_tags[iname],
-                                        HardwareConcurrentTag, 1)
+            tags = self.iname_tags_of_type(iname, HardwareConcurrentTag, max_num=1)
             if tags:
                 tag, = tags
                 tag_key_uses[tag.key].append(iname)
@@ -773,8 +816,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         multi_use_inames = set()
         for iname in cond_inames:
-            tags = filter_iname_tags_by_type(self.iname_to_tags[iname],
-                                        HardwareConcurrentTag)
+            tags = self.iname_tags_of_type(iname, HardwareConcurrentTag)
             if tags:
                 tag, = filter_iname_tags_by_type(tags, HardwareConcurrentTag, 1)
                 if tag.key in multi_use_keys:
@@ -782,6 +824,24 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return frozenset(cond_inames - multi_use_inames)
 
+    # {{{ compatibility wrapper for iname_to_tag.get("iname")
+
+    @property
+    def iname_to_tag(self):
+        from warnings import warn
+        warn("Since version 2018.1, inames can hold multiple tags. Use "
+             "iname_to_tags['iname'] instead. iname_to_tag.get('iname') will be "
+             "removed at version 2019.0.", DeprecationWarning)
+        for iname, tags in six.iteritems(self.iname_to_tags):
+            if len(tags) > 1:
+                raise LoopyError(
+                    "iname {0} has multiple tags: {1}. "
+                    "Use iname_to_tags['iname'] instead.".format(iname, tags))
+        return dict((k, next(iter(v)))
+                    for k, v in six.iteritems(self.iname_to_tags) if v)
+
+    # }}}
+
     # }}}
 
     # {{{ dependency wrangling
@@ -1019,21 +1079,25 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 AutoLocalIndexTagBase)
 
         for iname in all_inames_by_insns:
-            tags = self.iname_to_tags[iname]
+            tags = self.iname_tags_of_type(
+                    iname,
+                    (AutoLocalIndexTagBase, GroupIndexTag, LocalIndexTag), max_num=1)
 
-            if filter_iname_tags_by_type(tags, GroupIndexTag):
-                tgt_dict = global_sizes
-            elif filter_iname_tags_by_type(tags, LocalIndexTag):
-                tgt_dict = local_sizes
-            elif (filter_iname_tags_by_type(tags, AutoLocalIndexTagBase)
-                  and not ignore_auto):
+            if not tags:
+                continue
+
+            tag, = tags
+
+            if isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto:
                 raise RuntimeError("cannot find grid sizes if automatic "
                         "local index tags are present")
+            elif isinstance(tag, GroupIndexTag):
+                tgt_dict = global_sizes
+            elif isinstance(tag, LocalIndexTag):
+                tgt_dict = local_sizes
             else:
                 continue
 
-            tag, = filter_iname_tags_by_type(tags, (GroupIndexTag, LocalIndexTag), 1)
-
             size = self.get_iname_bounds(iname).size
 
             if tag.axis in tgt_dict:
@@ -1263,11 +1327,14 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             if show_labels:
                 lines.append("INAME IMPLEMENTATION TAGS:")
             for iname in natsorted(kernel.all_inames()):
-                if not kernel.iname_to_tags[iname]:
-                    tags = "None"
+                tags = kernel.iname_tags(iname)
+
+                if not tags:
+                    tags_str = "None"
                 else:
-                    tags = ", ".join(str(tag) for tag in kernel.iname_to_tags[iname])
-                line = "%s: %s" % (iname, tags)
+                    tags_str = ", ".join(str(tag) for tag in tags)
+
+                line = "%s: %s" % (iname, tags_str)
                 lines.append(line)
 
         if "variables" in what and kernel.temporary_variables:
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index b124a27edc9f85f9cd4fafc51a18e48f8b2034b2..83f98ecd19eab030ac902c1ea05b1ace02f1dad8 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -58,7 +58,7 @@ class auto(object):  # noqa
 def filter_iname_tags_by_type(tags, tag_type, max_num=None, min_num=None):
     """Return a subset of *tags* that matches type *tag_type*. Raises exception
     if the number of tags found were greater than *max_num* or less than
-    *min_num*W.
+    *min_num*.
 
     :arg tags: An iterable of tags.
     :arg tag_type: a subclass of :class:`loopy.kernel.data.IndexTag`.
@@ -67,14 +67,21 @@ def filter_iname_tags_by_type(tags, tag_type, max_num=None, min_num=None):
     """
 
     result = set(tag for tag in tags if isinstance(tag, tag_type))
-    if max_num:
+
+    def strify_tag_type():
+        if isinstance(tag_type, tuple):
+            return ", ".join(t.__name__ for t in tag_type)
+        else:
+            return tag_type.__name__
+
+    if max_num is not None:
         if len(result) > max_num:
-            raise LoopyError("cannot have more than {0} tags"
-                    "of type(s): {1}".format(max_num, tag_type))
-    if min_num:
+            raise LoopyError("cannot have more than {0} tags "
+                    "of type(s): {1}".format(max_num, strify_tag_type()))
+    if min_num is not None:
         if len(result) < min_num:
-            raise LoopyError("must have more than {0} tags"
-                    "of type(s): {1}".format(max_num, tag_type))
+            raise LoopyError("must have at least {0} tags "
+                    "of type(s): {1}".format(min_num, strify_tag_type()))
     return result
 
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 0d68c002e4d2f815a1ba9a1686a965dd0eb85da3..fb57133e9be1647570def6e4c1678c5ec7ea3532 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -36,7 +36,6 @@ from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
 from pytools import memoize_on_first_arg
 from loopy.tools import natsorted
-from loopy.kernel.data import filter_iname_tags_by_type
 
 import logging
 logger = logging.getLogger(__name__)
@@ -632,7 +631,7 @@ def is_domain_dependent_on_inames(kernel, domain_index, inames):
 # {{{ rank inames by stride
 
 def get_auto_axis_iname_ranking_by_stride(kernel, insn):
-    from loopy.kernel.data import ImageArg, ValueArg, filter_iname_tags_by_type
+    from loopy.kernel.data import ImageArg, ValueArg
 
     approximate_arg_values = {}
     for arg in kernel.args:
@@ -678,8 +677,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
     from loopy.kernel.data import AutoLocalIndexTagBase
     auto_axis_inames = set(
         iname for iname in kernel.insn_inames(insn)
-        if filter_iname_tags_by_type(
-            kernel.iname_to_tags[iname], AutoLocalIndexTagBase))
+        if kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase))
 
     # }}}
 
@@ -780,7 +778,15 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
             # Likely unbounded, automatic assignment is not
             # going to happen for this iname.
             new_iname_to_tags = kernel.iname_to_tags.copy()
-            new_iname_to_tags[iname] = set()
+            new_tags = new_iname_to_tags.get(iname, frozenset())
+            new_tags = frozenset(tag for tag in new_tags
+                    if not isinstance(tag, AutoLocalIndexTagBase))
+
+            if new_tags:
+                new_iname_to_tags[iname] = new_tags
+            else:
+                new_iname_to_tags.pop(iname, None)  # iname may be untagged
+
             return assign_automatic_axes(
                     kernel.copy(iname_to_tags=new_iname_to_tags),
                     axis=recursion_axis)
@@ -835,16 +841,24 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
                             do_tagged_check=False),
                         axis=recursion_axis, local_size=local_size)
 
-        if not filter_iname_tags_by_type(kernel.iname_to_tags[iname],
-                                    AutoLocalIndexTagBase):
+        if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase):
             raise LoopyError("trying to reassign '%s'" % iname)
 
         if new_tag:
-            new_tag = set([new_tag])
+            new_tag_set = frozenset([new_tag])
         else:
-            new_tag = set()
+            new_tag_set = frozenset()
         new_iname_to_tags = kernel.iname_to_tags.copy()
-        new_iname_to_tags[iname] = new_tag
+        new_tags = (
+                frozenset(tag for tag in new_iname_to_tags.get(iname, frozenset())
+                    if not isinstance(tag, AutoLocalIndexTagBase))
+                | new_tag_set)
+
+        if new_tags:
+            new_iname_to_tags[iname] = new_tags
+        else:
+            del new_iname_to_tags[iname]
+
         return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags),
                 axis=recursion_axis, local_size=local_size)
 
@@ -863,8 +877,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 
         auto_axis_inames = [
             iname for iname in kernel.insn_inames(insn)
-            if filter_iname_tags_by_type(kernel.iname_to_tags[iname],
-                                    AutoLocalIndexTagBase)]
+            if kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase)]
 
         if not auto_axis_inames:
             continue
@@ -872,11 +885,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
         assigned_local_axes = set()
 
         for iname in kernel.insn_inames(insn):
-            tags = filter_iname_tags_by_type(
-                kernel.iname_to_tags[iname], LocalIndexTag)
+            tags = kernel.iname_tags_of_type(iname, LocalIndexTag, max_num=1)
             if tags:
-                if len(tags) > 1:
-                    raise LoopyError("cannot have more than one LocalIndexTags")
                 tag, = tags
                 assigned_local_axes.add(tag.axis)
 
@@ -887,7 +897,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
                 iname_ranking = get_auto_axis_iname_ranking_by_stride(kernel, insn)
                 if iname_ranking is not None:
                     for iname in iname_ranking:
-                        prev_tags = kernel.iname_to_tags[iname]
+                        prev_tags = kernel.iname_tags(iname)
                         if filter_iname_tags_by_type(
                                 prev_tags, AutoLocalIndexTagBase):
                             return assign_axis(axis, iname, axis)
@@ -1145,7 +1155,7 @@ def get_visual_iname_order_embedding(kernel):
     # nest.
     ilp_inames = frozenset(iname
         for iname in kernel.iname_to_tags
-        if filter_iname_tags_by_type(kernel.iname_to_tags[iname], IlpBaseTag))
+        if kernel.iname_tags_of_type(iname, IlpBaseTag))
 
     iname_trie = SetTrie()
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 707676edeccbfdc36eced532e7ca1f20ac45a2b4..c4719ace5575efe2897b46963f8a7edd6bd38df1 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -141,7 +141,7 @@ def check_reduction_iname_uniqueness(kernel):
 
 def _get_compute_inames_tagged(kernel, insn, tag_base):
     return set(iname for iname in kernel.insn_inames(insn.id)
-               if filter_iname_tags_by_type(kernel.iname_to_tags[iname], tag_base))
+               if kernel.iname_tags_of_type(iname, tag_base))
 
 
 def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names):
@@ -151,7 +151,7 @@ def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names):
                 insn.assignee_subscript_deps())
             for iname in adeps & kernel.all_inames()
             if aname in tv_names
-            if filter_iname_tags_by_type(kernel.iname_to_tags[iname], tag_base))
+            if kernel.iname_tags_of_type(iname, tag_base))
 
 
 def find_temporary_scope(kernel):
@@ -299,7 +299,7 @@ def _classify_reduction_inames(kernel, inames):
             ConcurrentTag, filter_iname_tags_by_type)
 
     for iname in inames:
-        iname_tags = kernel.iname_to_tags[iname]
+        iname_tags = kernel.iname_tags(iname)
 
         if filter_iname_tags_by_type(iname_tags, (UnrollTag, UnrolledIlpTag)):
             # These are nominally parallel, but we can live with
@@ -1142,10 +1142,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
-        from loopy.kernel.data import LocalIndexTagBase, filter_iname_tags_by_type
+        from loopy.kernel.data import LocalIndexTagBase
         outer_local_inames = tuple(oiname for oiname in outer_insn_inames
-                if filter_iname_tags_by_type(
-                    kernel.iname_to_tags[oiname], LocalIndexTagBase))
+                if kernel.iname_tags_of_type(oiname, LocalIndexTagBase))
 
         from pymbolic import var
         outer_local_iname_vars = tuple(
@@ -1180,7 +1179,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         base_exec_iname = var_name_gen("red_"+red_iname)
         domains.append(_make_slab_set(base_exec_iname, size))
-        new_iname_tags[base_exec_iname] = kernel.iname_to_tags[red_iname]
+        new_iname_tags[base_exec_iname] = kernel.iname_tags(red_iname)
 
         # }}}
 
@@ -1275,7 +1274,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
             stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
             domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
-            new_iname_tags[stage_exec_iname] = kernel.iname_to_tags[red_iname]
+            new_iname_tags[stage_exec_iname] = kernel.iname_tags(red_iname)
 
             stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
             stage_insn = make_assignment(
@@ -1478,10 +1477,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
-        from loopy.kernel.data import LocalIndexTagBase, filter_iname_tags_by_type
+        from loopy.kernel.data import LocalIndexTagBase
         outer_local_inames = tuple(oiname for oiname in outer_insn_inames
-                if filter_iname_tags_by_type(kernel.iname_to_tags[oiname],
-                                        LocalIndexTagBase)
+                if kernel.iname_tags_of_type(oiname, LocalIndexTagBase)
                 and oiname != sweep_iname)
 
         from pymbolic import var
@@ -1507,7 +1505,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         base_exec_iname = var_name_gen(sweep_iname + "__scan")
         domains.append(_make_slab_set(base_exec_iname, scan_size))
-        new_iname_tags[base_exec_iname] = kernel.iname_to_tags[sweep_iname]
+        new_iname_tags[base_exec_iname] = kernel.iname_tags(sweep_iname)
 
         # }}}
 
@@ -1598,7 +1596,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
             stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage))
             domains.append(
                     _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size))
-            new_iname_tags[stage_exec_iname] = kernel.iname_to_tags[sweep_iname]
+            new_iname_tags[stage_exec_iname] = kernel.iname_tags(sweep_iname)
 
             for read_var, acc_var in zip(read_vars, acc_vars):
                 read_stage_id = insn_id_gen(
@@ -1748,7 +1746,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                     "by reductions is 'local'--found iname(s) '%s' "
                     "respectively tagged '%s'"
                     % (", ".join(bad_inames),
-                       ", ".join(str(kernel.iname_to_tags[iname])
+                       ", ".join(str(kernel.iname_tags(iname))
                                  for iname in bad_inames)))
 
         if n_local_par == 0 and n_sequential == 0:
@@ -1788,7 +1786,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                             "- the only parallelism allowed is 'local'." %
                             (sweep_iname,
                              ", ".join(tag.key
-                            for tag in temp_kernel.iname_to_tags[sweep_iname])))
+                            for tag in temp_kernel.iname_tags(sweep_iname))))
                 elif parallel:
                     return map_scan_local(
                             expr, rec, nresults, arg_dtypes, reduction_dtypes,
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 616c8e62a5900dea981ef4d1d5c12b5cf8925a27..440ac22cb890bd9f1b47f909ee96681c39c33975 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -212,12 +212,11 @@ def find_loop_nest_with_map(kernel):
     """
     result = {}
 
-    from loopy.kernel.data import (ConcurrentTag, IlpBaseTag, VectorizeTag,
-                                   filter_iname_tags_by_type)
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
 
     all_nonpar_inames = set(
             iname for iname in kernel.all_inames()
-            if not filter_iname_tags_by_type(kernel.iname_to_tags[iname],
+            if not kernel.iname_tags_of_type(iname,
                     (ConcurrentTag, IlpBaseTag, VectorizeTag)))
 
     iname_to_insns = kernel.iname_to_insns()
@@ -241,15 +240,14 @@ def find_loop_nest_around_map(kernel):
     iname_to_insns = kernel.iname_to_insns()
 
     # examine pairs of all inames--O(n**2), I know.
-    from loopy.kernel.data import IlpBaseTag, filter_iname_tags_by_type
+    from loopy.kernel.data import IlpBaseTag
     for inner_iname in all_inames:
         result[inner_iname] = set()
         for outer_iname in all_inames:
             if inner_iname == outer_iname:
                 continue
 
-            tags = kernel.iname_to_tags[outer_iname]
-            if filter_iname_tags_by_type(tags, IlpBaseTag):
+            if kernel.iname_tags_of_type(outer_iname, IlpBaseTag):
                 # ILP tags are special because they are parallel tags
                 # and therefore 'in principle' nest around everything.
                 # But they're realized by the scheduler as a loop
@@ -278,11 +276,10 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
 
     result = {}
 
-    from loopy.kernel.data import (ConcurrentTag, IlpBaseTag, VectorizeTag,
-                                   filter_iname_tags_by_type)
+    from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag
     for insn in kernel.instructions:
         for iname in kernel.insn_inames(insn):
-            if filter_iname_tags_by_type(kernel.iname_to_tags[iname], ConcurrentTag):
+            if kernel.iname_tags_of_type(iname, ConcurrentTag):
                 continue
 
             iname_dep = result.setdefault(iname, set())
@@ -312,8 +309,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map):
                         # -> safe.
                         continue
 
-                    tags = kernel.iname_to_tags[dep_insn_iname]
-                    if filter_iname_tags_by_type(tags,
+                    if kernel.iname_tags_of_type(dep_insn_iname,
                                 (ConcurrentTag, IlpBaseTag, VectorizeTag)):
                         # Parallel tags don't really nest, so we'll disregard
                         # them here.
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 9b44530d1e1eb1c7976383c4e2f414f3f1ef5f5d..521eaeb5a04cc244ce0f0fff511d273952ec8a2a 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -847,8 +847,7 @@ def _get_lid_and_gid_strides(knl, array, index):
     lid_to_iname = {}
     gid_to_iname = {}
     for iname in my_inames:
-        tags = filter_iname_tags_by_type(knl.iname_to_tags[iname],
-                              (GroupIndexTag, LocalIndexTag))
+        tags = knl.iname_tags_of_type(iname, (GroupIndexTag, LocalIndexTag))
         if tags:
             tag, = filter_iname_tags_by_type(
                 tags, (GroupIndexTag, LocalIndexTag), 1)
@@ -1203,11 +1202,10 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
     g_used = set()
     l_used = set()
 
-    from loopy.kernel.data import (LocalIndexTag, GroupIndexTag,
-                                   filter_iname_tags_by_type)
+    from loopy.kernel.data import LocalIndexTag, GroupIndexTag
     for iname in knl.insn_inames(insn):
-        tags = filter_iname_tags_by_type(knl.iname_to_tags[iname],
-                              (LocalIndexTag, GroupIndexTag), 1)
+        tags = knl.iname_tags_of_type(iname,
+                              (LocalIndexTag, GroupIndexTag), max_num=1)
         if tags:
             tag, = tags
             if isinstance(tag, LocalIndexTag):
@@ -1242,9 +1240,10 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False)
     insn_inames = knl.insn_inames(insn)
 
     if disregard_local_axes:
-        from loopy.kernel.data import LocalIndexTag, filter_iname_tags_by_type
-        insn_inames = [iname for iname in insn_inames if not
-                filter_iname_tags_by_type(knl.iname_to_tags[iname], LocalIndexTag)]
+        from loopy.kernel.data import LocalIndexTag
+        insn_inames = [iname
+                for iname in insn_inames
+                if not knl.iname_tags_of_type(iname, LocalIndexTag)]
 
     inames_domain = knl.get_inames_domain(insn_inames)
     domain = (inames_domain.project_out_except(
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 11fcf574707023a1096503e58f10ab63fb506b26..b2e4118d2107622470f3ebddf6983005a06b4a47 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -46,7 +46,8 @@ class vec:  # noqa
 def _create_vector_types():
     field_names = ["x", "y", "z", "w"]
 
-    if tuple.__itemsize__ * 8 == 32:
+    import sys
+    if sys.maxsize <= 2**33:
         long_dtype = np.int32
         ulong_dtype = np.uint32
     else:
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index a5d5bbf74ef62eacd2391e709230c020bb665d32..abe49a2414face070156463e7e11ae027e136ff0 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -431,10 +431,9 @@ class ISPCASTBuilder(CASTBuilder):
             saw_l0 = False
             for term in terms:
                 if (isinstance(term, Variable)
-                        and filter_iname_tags_by_type(
-                            kernel.iname_to_tags[term.name], LocalIndexTag)):
-                        tag, = filter_iname_tags_by_type(
-                            kernel.iname_to_tags[term.name], LocalIndexTag, 1)
+                            and kernel.iname_tags_of_type(term.name, LocalIndexTag)):
+                        tag, = kernel.iname_tags_of_type(
+                            term.name, LocalIndexTag, min_num=1, max_num=1)
                         if tag.axis == 0:
                             if saw_l0:
                                 raise LoopyError(
@@ -465,7 +464,7 @@ class ISPCASTBuilder(CASTBuilder):
 
             rhs_has_programindex = any(
                 isinstance(tag, LocalIndexTag) and tag.axis == 0
-                for tag in kernel.iname_to_tags[dep]
-                for dep in get_dependencies(insn.expression))
+                for dep in get_dependencies(insn.expression)
+                for tag in kernel.iname_tags(dep))
 
             if not rhs_has_programindex:
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 423ccfb5517f634434f3b3882b1d4e03926f79b7..2b618a464b5103ee28bceded07dc68f9c376c84d 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -177,7 +177,7 @@ def _split_iname_backend(kernel, split_iname,
         for syntax.
     """
 
-    existing_tags = kernel.iname_to_tags[split_iname]
+    existing_tags = kernel.iname_tags(split_iname)
     from loopy.kernel.data import ForceSequentialTag, filter_iname_tags_by_type
     if (do_tagged_check and existing_tags
             and not filter_iname_tags_by_type(existing_tags, ForceSequentialTag)):
@@ -610,9 +610,13 @@ def untag_inames(kernel, iname_to_untag, tag_type):
     """
 
     knl_iname_to_tags = kernel.iname_to_tags.copy()
-    old_tags = knl_iname_to_tags[iname_to_untag]
+    old_tags = knl_iname_to_tags.get(iname_to_untag, frozenset())
     old_tags = set(tag for tag in old_tags if not isinstance(tag, tag_type))
-    knl_iname_to_tags[iname_to_untag] = old_tags
+
+    if old_tags:
+        knl_iname_to_tags[iname_to_untag] = old_tags
+    else:
+        knl_iname_to_tags.pop(iname_to_untag, None)  # iname may be untagged
 
     return kernel.copy(iname_to_tags=knl_iname_to_tags)
 
@@ -671,7 +675,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
     def parse_tag(tag):
         if isinstance(tag, str):
             if tag.startswith("like."):
-                tags = kernel.iname_to_tags[tag[5:]]
+                tags = kernel.iname_tags(tag[5:])
                 if len(tags) == 0:
                     return None
                 if len(tags) == 1:
@@ -722,7 +726,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
         if not new_tag:
             continue
 
-        old_tags = kernel.iname_to_tags[iname]
+        old_tags = kernel.iname_tags(iname)
 
         if iname not in kernel.all_inames():
             raise ValueError("cannot tag '%s'--not known" % iname)
@@ -739,7 +743,7 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
                     "(likely because of participation in a precompute or "
                     "a reduction)" % iname)
 
-        knl_iname_to_tags[iname] = old_tags.union([new_tag])
+        knl_iname_to_tags[iname] = old_tags | frozenset([new_tag])
 
     return kernel.copy(iname_to_tags=knl_iname_to_tags)
 
@@ -992,12 +996,12 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
     Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be
     duplicated in a given kernel.
     """
-    from loopy.kernel.data import ConcurrentTag, filter_iname_tags_by_type
+    from loopy.kernel.data import ConcurrentTag
 
     concurrent_inames = set(
             iname
-            for iname in knl.all_inames() if filter_iname_tags_by_type(
-                knl.iname_to_tags[iname], ConcurrentTag))
+            for iname in knl.all_inames()
+            if knl.iname_tags_of_type(iname, ConcurrentTag))
 
     # First we extract the minimal necessary information from the kernel
     if use_boostable_into:
@@ -1021,8 +1025,8 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
     # Get the duplication options as a tuple of iname and a set
     for iname, insns in _get_iname_duplication_options(insn_iname_sets):
         # Check whether this iname has a parallel tag and discard it if so
-        if (iname in knl.iname_to_tags and filter_iname_tags_by_type(
-                knl.iname_to_tags[iname], ConcurrentTag)):
+        if (iname in knl.iname_to_tags
+                and knl.iname_tags_of_type(iname, ConcurrentTag)):
             continue
 
         # If we find a duplication option and to not use boostable_into
@@ -1539,8 +1543,7 @@ def find_unused_axis_tag(kernel, kind, insn_match=None):
     """
     used_axes = set()
 
-    from loopy.kernel.data import (GroupIndexTag, LocalIndexTag,
-                                   filter_iname_tags_by_type)
+    from loopy.kernel.data import GroupIndexTag, LocalIndexTag
 
     if isinstance(kind, str):
         found = False
@@ -1559,8 +1562,7 @@ def find_unused_axis_tag(kernel, kind, insn_match=None):
 
     for insn in insns:
         for iname in kernel.insn_inames(insn):
-            dim_tags = kernel.iname_to_tags[iname]
-            if filter_iname_tags_by_type(dim_tags, kind):
+            if kernel.iname_tags_of_type(iname, kind):
                 used_axes.add(kind.axis)
 
     i = 0
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index c953c1cee0b1e6930423ebf42d2fba55a602c3df..d4128bd115666cf66c6f06a40823ed9d5929faab 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -174,7 +174,7 @@ def privatize_temporaries_with_inames(
 
     # {{{ change temporary variables
 
-    from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type
+    from loopy.kernel.data import VectorizeTag
 
     new_temp_vars = kernel.temporary_variables.copy()
     for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname):
@@ -187,7 +187,7 @@ def privatize_temporaries_with_inames(
 
         dim_tags = ["c"] * (len(shape) + len(extra_shape))
         for i, iname in enumerate(inames):
-            if filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag):
+            if kernel.iname_tags_of_type(iname, VectorizeTag):
                 dim_tags[len(shape) + i] = "vec"
 
         new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 5c2d65062af547eed6232da3c8319dfe3ea0a8aa..0283b84f970a74d3e78d1f09d6f428c7daf5b7ee 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -245,7 +245,7 @@ class TemporarySaver(object):
         self.insns_to_insert = []
         self.insns_to_update = {}
         self.extra_args_to_add = {}
-        self.updated_iname_to_tags = defaultdict(set)
+        self.updated_iname_to_tags = {}
         self.updated_temporary_variables = {}
 
         # temporary name -> save or reload insn ids
@@ -397,7 +397,7 @@ class TemporarySaver(object):
             my_local_tags = []
 
             for iname in insn.within_inames:
-                tags = self.kernel.iname_to_tags[iname]
+                tags = self.kernel.iname_tags(iname)
 
                 if not tags:
                     continue
@@ -677,7 +677,7 @@ class TemporarySaver(object):
                 # If the temporary has local scope, then loads / stores can
                 # be done in parallel.
                 from loopy.kernel.data import AutoFitLocalIndexTag
-                iname_to_tags[new_iname] = set([AutoFitLocalIndexTag()])
+                iname_to_tags[new_iname] = frozenset([AutoFitLocalIndexTag()])
 
             dim_inames.append(new_iname)
 
@@ -707,7 +707,7 @@ class TemporarySaver(object):
                 &
                 aff[new_iname].lt_set(aff_from_expr(domain.space, dim)))
 
-            self.updated_iname_to_tags[new_iname] = set([hw_tag])
+            self.updated_iname_to_tags[new_iname] = frozenset([hw_tag])
             hw_inames.append(new_iname)
 
         # The operations on the domain above return a Set object, but the
diff --git a/loopy/version.py b/loopy/version.py
index 2f5006be32999362b87e0a17ec90337137463262..da28a3f0ac02133edaf2fa7c9e9eff5828c5b1ff 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -42,7 +42,7 @@ else:
 # }}}
 
 
-VERSION = (2017, 2, 1)
+VERSION = (2018, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS