diff --git a/loopy/check.py b/loopy/check.py
index 6a1e3dc33a33b826ad54c42a549b35ad275d9fe5..54ab043d6a38f36852920eb3008d26e28b5cedfb 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -505,22 +505,23 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel):
 # {{{ check that temporaries are defined in subkernels where used
 
 def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
-    from loopy.schedule.tools import InstructionQuery
     from loopy.kernel.data import temp_var_scope
+    from loopy.kernel.tools import get_subkernels
 
-    insn_query = InstructionQuery(kernel)
-
-    for subkernel in insn_query.subkernels():
+    for subkernel in get_subkernels(kernel):
         defined_base_storage = set()
 
-        for temporary in insn_query.temporaries_written_in_subkernel(subkernel):
+        from loopy.schedule.tools import (
+                temporaries_written_in_subkernel, temporaries_read_in_subkernel)
+
+        for temporary in temporaries_written_in_subkernel(kernel, subkernel):
             tval = kernel.temporary_variables[temporary]
             if tval.base_storage is not None:
                 defined_base_storage.add(tval.base_storage)
 
         for temporary in (
-                insn_query.temporaries_read_in_subkernel(subkernel) -
-                insn_query.temporaries_written_in_subkernel(subkernel)):
+                temporaries_read_in_subkernel(kernel, subkernel) -
+                temporaries_written_in_subkernel(kernel, subkernel)):
             tval = kernel.temporary_variables[temporary]
 
             if tval.initializer is not None:
@@ -530,16 +531,17 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
             if tval.base_storage is not None:
                 if tval.base_storage not in defined_base_storage:
                     from loopy.diagnostic import MissingDefinitionError
-                    raise MissingDefinitionError("temporary variable '%s' gets used "
-                        "in subkernel '%s' and neither it nor its aliases have a "
-                        "definition" % (temporary, subkernel))
+                    raise MissingDefinitionError("temporary variable '%s' gets "
+                            "used in subkernel '%s' and neither it nor its "
+                            "aliases have a definition" % (temporary, subkernel))
                 continue
 
             if tval.scope in (temp_var_scope.PRIVATE, temp_var_scope.LOCAL):
                 from loopy.diagnostic import MissingDefinitionError
-                raise MissingDefinitionError("temporary variable '%s' gets used in "
-                    "subkernel '%s' without a definition (maybe you forgot to call "
-                    "loopy.save_and_reload_temporaries?)" % (temporary, subkernel))
+                raise MissingDefinitionError("temporary variable '%s' gets used "
+                        "in subkernel '%s' without a definition (maybe you forgot "
+                        "to call loopy.save_and_reload_temporaries?)"
+                        % (temporary, subkernel))
 
 # }}}
 
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index 7cc381f11d1239cba5656a9dc7a04cddaa14a368..61f4b3a9b8c38dfc25ebc81243812aa963423f8a 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -63,13 +63,20 @@ def get_usable_inames_for_conditional(kernel, sched_index):
     result = find_active_inames_at(kernel, sched_index)
     crosses_barrier = has_barrier_within(kernel, sched_index)
 
-    # Find our containing subkernel, grab inames for all insns from there.
-
-    subkernel_index = sched_index
-    from loopy.schedule import CallKernel
-
-    while not isinstance(kernel.schedule[subkernel_index], CallKernel):
-        subkernel_index -= 1
+    # Find our containing subkernel. Grab inames for all insns from there.
+    within_subkernel = False
+
+    for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]):
+        from loopy.schedule import CallKernel, ReturnFromKernel
+        if isinstance(sched_item, CallKernel):
+            within_subkernel = True
+            subkernel_index = sched_item_index
+        elif isinstance(sched_item, ReturnFromKernel):
+            within_subkernel = False
+
+    if not within_subkernel:
+        # Outside all subkernels - use only inames available to host.
+        return frozenset(result)
 
     insn_ids_for_subkernel = get_insn_ids_for_block_at(
         kernel.schedule, subkernel_index)
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 0ebe90fbca0d31c05eaee64321e2b73709292331..36fbb49f4bb77c959877fb0bd21e1de6fb49c74b 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -594,6 +594,10 @@ def get_simple_strides(bset, key_by="name"):
     """
     result = {}
 
+    comp_div_set_pieces = convexify(bset.compute_divs()).get_basic_sets()
+    assert len(comp_div_set_pieces) == 1
+    bset, = comp_div_set_pieces
+
     lspace = bset.get_local_space()
     for idiv in range(lspace.dim(dim_type.div)):
         div = lspace.get_div(idiv)
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 324f7da1a21de0115ea060ff7ef55e52ab0913d4..e8c846fbc491b7049d7820e3ef14d9ed8071ded3 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -44,33 +44,49 @@ from loopy.diagnostic import CannotBranchDomainTree, LoopyError
 
 # {{{ unique var names
 
-def _is_var_name_conflicting_with_longer(name_a, name_b):
-    # Array dimensions implemented as separate arrays generate
-    # names by appending '_s<NUMBER>'. Make sure that no
-    # conflicts can arise from these names.
+class _UniqueVarNameGenerator(UniqueNameGenerator):
 
-    # Only deal with the case of b longer than a.
-    if not name_b.startswith(name_a):
-        return False
+    def __init__(self, existing_names=set(), forced_prefix=""):
+        super(_UniqueVarNameGenerator, self).__init__(existing_names, forced_prefix)
+        array_prefix_pattern = re.compile("(.*)_s[0-9]+$")
 
-    return re.match("^%s_s[0-9]+" % re.escape(name_b), name_a) is not None
+        array_prefixes = set()
+        for name in existing_names:
+            match = array_prefix_pattern.match(name)
+            if match is None:
+                continue
 
+            array_prefixes.add(match.group(1))
 
-def _is_var_name_conflicting(name_a, name_b):
-    if name_a == name_b:
-        return True
+        self.conflicting_array_prefixes = array_prefixes
+        self.array_prefix_pattern = array_prefix_pattern
 
-    return (
-            _is_var_name_conflicting_with_longer(name_a, name_b)
-            or _is_var_name_conflicting_with_longer(name_b, name_a))
+    def _name_added(self, name):
+        match = self.array_prefix_pattern.match(name)
+        if match is None:
+            return
 
+        self.conflicting_array_prefixes.add(match.group(1))
 
-class _UniqueVarNameGenerator(UniqueNameGenerator):
     def is_name_conflicting(self, name):
-        from pytools import any
-        return any(
-                _is_var_name_conflicting(name, other_name)
-                for other_name in self.existing_names)
+        if name in self.existing_names:
+            return True
+
+        # Array dimensions implemented as separate arrays generate
+        # names by appending '_s<NUMBER>'. Make sure that no
+        # conflicts can arise from these names.
+
+        # Case 1: a_s0 is already a name; we are trying to insert a
+        # Case 2: a is already a name; we are trying to insert a_s0
+
+        if name in self.conflicting_array_prefixes:
+            return True
+
+        match = self.array_prefix_pattern.match(name)
+        if match is None:
+            return False
+
+        return match.group(1) in self.existing_names
 
 # }}}
 
diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py
index 5de677e72708be844a5276b3d40ace8b1dad9da0..f9b08d3434556f912107726f125abbfa110f5676 100644
--- a/loopy/schedule/tools.py
+++ b/loopy/schedule/tools.py
@@ -23,10 +23,6 @@ THE SOFTWARE.
 """
 
 from loopy.kernel.data import temp_var_scope
-from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem,
-                            RunInstruction, Barrier)
-
-from pytools import memoize_method
 
 
 # {{{ block boundary finder
@@ -37,6 +33,7 @@ def get_block_boundaries(schedule):
     :class:`loopy.schedule.BlockBeginItem`s to
     :class:`loopy.schedule.BlockEndItem`s and vice versa.
     """
+    from loopy.schedule import (BeginBlockItem, EndBlockItem)
     block_bounds = {}
     active_blocks = []
     for idx, sched_item in enumerate(schedule):
@@ -51,109 +48,24 @@ def get_block_boundaries(schedule):
 # }}}
 
 
-# {{{ instruction query utility
-
-class InstructionQuery(object):
-
-    def __init__(self, kernel):
-        self.kernel = kernel
-        block_bounds = get_block_boundaries(kernel.schedule)
-        subkernel_slices = {}
-        from six import iteritems
-        for start, end in iteritems(block_bounds):
-            sched_item = kernel.schedule[start]
-            if isinstance(sched_item, CallKernel):
-                subkernel_slices[sched_item.kernel_name] = slice(start, end + 1)
-        self.subkernel_slices = subkernel_slices
-
-    @memoize_method
-    def subkernels(self):
-        return frozenset(self.subkernel_slices.keys())
-
-    @memoize_method
-    def insns_reading_or_writing(self, var):
-        return frozenset(insn.id for insn in self.kernel.instructions
-            if var in insn.read_dependency_names()
-                or var in insn.assignee_var_names())
-
-    @memoize_method
-    def insns_in_subkernel(self, subkernel):
-        return frozenset(sched_item.insn_id for sched_item
-            in self.kernel.schedule[self.subkernel_slices[subkernel]]
-            if isinstance(sched_item, RunInstruction))
-
-    @memoize_method
-    def temporaries_read_in_subkernel(self, subkernel):
-        return frozenset(
-            var
-            for insn in self.insns_in_subkernel(subkernel)
-            for var in self.kernel.id_to_insn[insn].read_dependency_names()
-            if var in self.kernel.temporary_variables)
-
-    @memoize_method
-    def temporaries_written_in_subkernel(self, subkernel):
-        return frozenset(
-            var
-            for insn in self.insns_in_subkernel(subkernel)
-            for var in self.kernel.id_to_insn[insn].assignee_var_names()
-            if var in self.kernel.temporary_variables)
-
-    @memoize_method
-    def temporaries_read_or_written_in_subkernel(self, subkernel):
-        return (
-            self.temporaries_read_in_subkernel(subkernel) |
-            self.temporaries_written_in_subkernel(subkernel))
-
-    @memoize_method
-    def inames_in_subkernel(self, subkernel):
-        subkernel_start = self.subkernel_slices[subkernel].start
-        return frozenset(self.kernel.schedule[subkernel_start].extra_inames)
-
-    @memoize_method
-    def pre_and_post_barriers(self, subkernel):
-        subkernel_start = self.subkernel_slices[subkernel].start
-        subkernel_end = self.subkernel_slices[subkernel].stop
-
-        def is_global_barrier(item):
-            return isinstance(item, Barrier) and item.kind == "global"
-
-        try:
-            pre_barrier = next(item for item in
-                    self.kernel.schedule[subkernel_start::-1]
-                    if is_global_barrier(item)).originating_insn_id
-        except StopIteration:
-            pre_barrier = None
-
-        try:
-            post_barrier = next(item for item in
-                    self.kernel.schedule[subkernel_end:]
-                    if is_global_barrier(item)).originating_insn_id
-        except StopIteration:
-            post_barrier = None
-
-        return (pre_barrier, post_barrier)
-
-    @memoize_method
-    def hw_inames(self, insn_id):
-        """
-        Return the inames that insn runs in and that are tagged as hardware
-        parallel.
-        """
-        from loopy.kernel.data import HardwareParallelTag
-        return set(iname for iname in self.kernel.insn_inames(insn_id)
-                   if isinstance(self.kernel.iname_to_tag.get(iname),
-                                 HardwareParallelTag))
-
-    @memoize_method
-    def common_hw_inames(self, insn_ids):
-        """
-        Return the common set of hardware parallel tagged inames among
-        the list of instructions.
-        """
-        # Get the list of hardware inames in which the temporary is defined.
-        if len(insn_ids) == 0:
-            return set()
-        return set.intersection(*(self.hw_inames(id) for id in insn_ids))
+# {{{ subkernel tools
+
+def temporaries_read_in_subkernel(kernel, subkernel):
+    from loopy.kernel.tools import get_subkernel_to_insn_id_map
+    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel]
+    return frozenset(tv
+            for insn_id in insn_ids
+            for tv in kernel.id_to_insn[insn_id].read_dependency_names()
+            if tv in kernel.temporary_variables)
+
+
+def temporaries_written_in_subkernel(kernel, subkernel):
+    from loopy.kernel.tools import get_subkernel_to_insn_id_map
+    insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel]
+    return frozenset(tv
+            for insn_id in insn_ids
+            for tv in kernel.id_to_insn[insn_id].write_dependency_names()
+            if tv in kernel.temporary_variables)
 
 # }}}
 
@@ -166,23 +78,27 @@ def add_extra_args_to_schedule(kernel):
     instructions in the schedule with global temporaries.
     """
     new_schedule = []
-
-    insn_query = InstructionQuery(kernel)
+    from loopy.schedule import CallKernel
 
     for sched_item in kernel.schedule:
         if isinstance(sched_item, CallKernel):
-            subrange_temporaries = (insn_query
-                .temporaries_read_or_written_in_subkernel(sched_item.kernel_name))
+            subkernel = sched_item.kernel_name
+
+            used_temporaries = (
+                    temporaries_read_in_subkernel(kernel, subkernel)
+                    | temporaries_written_in_subkernel(kernel, subkernel))
+
             more_args = set(tv
-                for tv in subrange_temporaries
-                if
-                kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL
-                and
-                kernel.temporary_variables[tv].initializer is None
-                and
-                tv not in sched_item.extra_args)
+                    for tv in used_temporaries
+                    if
+                    kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL
+                    and
+                    kernel.temporary_variables[tv].initializer is None
+                    and
+                    tv not in sched_item.extra_args)
+
             new_schedule.append(sched_item.copy(
-                extra_args=sched_item.extra_args + sorted(more_args)))
+                    extra_args=sched_item.extra_args + sorted(more_args)))
         else:
             new_schedule.append(sched_item)
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index fde8643bf92b7ad56bb47975fa7ede1bda9b399c..cb15eb55498bcafe4ae537747e387e47ddbd8254 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -66,16 +66,17 @@ class ToCountMap(object):
 
     """
 
-    def __init__(self, init_dict=None):
+    def __init__(self, init_dict=None, val_type=isl.PwQPolynomial):
         if init_dict is None:
             init_dict = {}
         self.count_map = init_dict
+        self.val_type = val_type
 
     def __add__(self, other):
         result = self.count_map.copy()
         for k, v in six.iteritems(other.count_map):
             result[k] = self.count_map.get(k, 0) + v
-        return ToCountMap(result)
+        return ToCountMap(result, self.val_type)
 
     def __radd__(self, other):
         if other != 0:
@@ -101,7 +102,11 @@ class ToCountMap(object):
         try:
             return self.count_map[index]
         except KeyError:
-            return isl.PwQPolynomial('{ 0 }')
+            # TODO: decide on the best zero value to return for a missing key.
+            if self.val_type is isl.PwQPolynomial:
+                return isl.PwQPolynomial('{ 0 }')
+            else:
+                return 0
 
     def __setitem__(self, index, value):
         self.count_map[index] = value
@@ -112,6 +117,9 @@ class ToCountMap(object):
     def __len__(self):
         return len(self.count_map)
 
+    def get(self, key, default=None):
+        return self.count_map.get(key, default)
+
     def items(self):
         return self.count_map.items()
 
@@ -122,7 +130,7 @@ class ToCountMap(object):
         return self.count_map.pop(item)
 
     def copy(self):
-        return ToCountMap(dict(self.count_map))
+        return ToCountMap(dict(self.count_map), self.val_type)
 
     def filter_by(self, **kwargs):
         """Remove items without specified key fields.
@@ -149,7 +157,7 @@ class ToCountMap(object):
 
         """
 
-        result_map = ToCountMap()
+        result_map = ToCountMap(val_type=self.val_type)
 
         from loopy.types import to_loopy_type
         if 'dtype' in kwargs.keys():
@@ -197,7 +205,7 @@ class ToCountMap(object):
 
         """
 
-        result_map = ToCountMap()
+        result_map = ToCountMap(val_type=self.val_type)
 
         # for each item in self.count_map, call func on the key
         for self_key, self_val in self.items():
@@ -252,7 +260,7 @@ class ToCountMap(object):
 
         """
 
-        result_map = ToCountMap()
+        result_map = ToCountMap(val_type=self.val_type)
 
         # make sure all item keys have same type
         if self.count_map:
@@ -315,23 +323,36 @@ class ToCountMap(object):
             bytes_processed = int(key.dtype.itemsize) * val
             result[key] = bytes_processed
 
+        # TODO: confirm that int is the right val_type for the byte counts.
+        result.val_type = int
+
         return result
 
     def sum(self):
         """Add all counts in ToCountMap.
 
-        :return: A :class:`islpy.PwQPolynomial` containing the sum of counts.
+        :return: A :class:`islpy.PwQPolynomial` or :class:`int` containing the sum of
+                 counts.
 
         """
-        total = isl.PwQPolynomial('{ 0 }')
+
+        if self.val_type is isl.PwQPolynomial:
+            total = isl.PwQPolynomial('{ 0 }')
+        else:
+            total = 0
+
         for k, v in self.items():
-            if not isinstance(v, isl.PwQPolynomial):
-                raise ValueError("ToCountMap: sum() encountered type {0} but "
-                                 "may only be used on PwQPolynomials."
-                                 .format(type(v)))
             total += v
         return total
 
+    #TODO test and document
+    def eval(self, params):
+        result = self.copy()
+        for key, val in self.items():
+            result[key] = val.eval_with_dict(params)
+        result.val_type = int
+        return result
+
     def eval_and_sum(self, params):
         """Add all counts in :class:`ToCountMap` and evaluate with provided
         parameter dict.
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index a19e06ecdf7c9966501ebb9600ea4e01614363f4..6077332c4fc4322ac7ffb02ade4a0e24c7066245 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -681,12 +681,18 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
                 dt, dim_idx = var_dict[primed_non1_saxis_names[i]]
                 mod_domain = mod_domain.set_dim_name(dt, dim_idx, saxis)
 
+        def add_assumptions(d):
+            assumption_non_param = isl.BasicSet.from_params(kernel.assumptions)
+            assumptions, domain = isl.align_two(assumption_non_param, d)
+            return assumptions & domain
+
         # {{{ check that we got the desired domain
 
-        check_domain = check_domain.project_out_except(
-                primed_non1_saxis_names, [isl.dim_type.set])
+        check_domain = add_assumptions(
+            check_domain.project_out_except(
+                primed_non1_saxis_names, [isl.dim_type.set]))
 
-        mod_check_domain = mod_domain
+        mod_check_domain = add_assumptions(mod_domain)
 
         # re-add the prime from the new variable
         var_dict = mod_check_domain.get_var_dict(isl.dim_type.set)
@@ -716,10 +722,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
         # project out the new names from the modified domain
         orig_domain_inames = list(domch.domain.get_var_dict(isl.dim_type.set))
-        mod_check_domain = mod_domain.project_out_except(
-                orig_domain_inames, [isl.dim_type.set])
+        mod_check_domain = add_assumptions(
+                mod_domain.project_out_except(
+                    orig_domain_inames, [isl.dim_type.set]))
 
-        check_domain = domch.domain
+        check_domain = add_assumptions(domch.domain)
 
         mod_check_domain, check_domain = isl.align_two(
                 mod_check_domain, check_domain)
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 8afc1695a38a37baf165f6ec6ef6567e2012173b..3d4f5c2d4765aa7cbf1e56c76d127bf8f4d61a06 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 
 from loopy.diagnostic import LoopyError
 import loopy as lp
+import six
 
 from loopy.kernel.data import auto, temp_var_scope
 from pytools import memoize_method, Record
@@ -32,7 +33,7 @@ from loopy.schedule import (
             EnterLoop, LeaveLoop, RunInstruction,
             CallKernel, ReturnFromKernel, Barrier)
 
-from loopy.schedule.tools import (get_block_boundaries, InstructionQuery)
+from loopy.schedule.tools import get_block_boundaries
 
 
 import logging
@@ -135,7 +136,7 @@ class LivenessAnalysis(object):
 
     @memoize_method
     def liveness(self):
-        logging.info("running liveness analysis")
+        logger.info("running liveness analysis")
         successors = self.get_successor_relation()
         gen, kill = self.get_gen_and_kill_sets()
 
@@ -152,7 +153,7 @@ class LivenessAnalysis(object):
                     lr[idx].live_out.update(lr[succ].live_in)
                 lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx])
 
-        logging.info("done running liveness analysis")
+        logger.info("done running liveness analysis")
 
         return lr
 
@@ -193,13 +194,9 @@ class TemporarySaver(object):
 
             The name of the new temporary.
 
-        .. attribute:: orig_temporary
+        .. attribute:: orig_temporary_name
 
-            The original temporary variable object.
-
-        .. attribute:: hw_inames
-
-            The common list of hw axes that define the original object.
+            The name of the original temporary variable.
 
         .. attribute:: hw_dims
 
@@ -207,6 +204,10 @@ class TemporarySaver(object):
             of the promoted temporary value, corresponding to
             hardware dimensions
 
+        .. attribute:: hw_tags
+
+            The tags for the inames associated with hw_dims.
+
         .. attribute:: non_hw_dims
 
             A list of expressions, to be added in front of the shape
@@ -214,9 +215,15 @@ class TemporarySaver(object):
             non-hardware dimensions
         """
 
-        @memoize_method
-        def as_variable(self):
-            temporary = self.orig_temporary
+        __slots__ = """
+                name
+                orig_temporary_name
+                hw_dims
+                hw_tags
+                non_hw_dims""".split()
+
+        def as_kernel_temporary(self, kernel):
+            temporary = kernel.temporary_variables[self.orig_temporary_name]
             from loopy.kernel.data import TemporaryVariable
             return TemporaryVariable(
                 name=self.name,
@@ -230,16 +237,215 @@ class TemporarySaver(object):
 
     def __init__(self, kernel):
         self.kernel = kernel
-        self.insn_query = InstructionQuery(kernel)
         self.var_name_gen = kernel.get_var_name_generator()
         self.insn_name_gen = kernel.get_instruction_id_generator()
+
         # These fields keep track of updates to the kernel.
         self.insns_to_insert = []
         self.insns_to_update = {}
         self.extra_args_to_add = {}
         self.updated_iname_to_tag = {}
         self.updated_temporary_variables = {}
-        self.saves_or_reloads_added = {}
+
+        # temporary name -> save or reload insn ids
+        from collections import defaultdict
+        self.temporary_to_save_ids = defaultdict(set)
+        self.temporary_to_reload_ids = defaultdict(set)
+        self.subkernel_to_newly_added_insn_ids = defaultdict(set)
+
+        # Maps names of base_storage to the name of the temporary
+        # representative chosen for saves/reloads
+        self.base_storage_to_representative = {}
+
+        from loopy.kernel.data import ValueArg
+        import islpy as isl
+        self.new_subdomain = (
+                isl.BasicSet.universe(
+                    isl.Space.create_from_names(
+                        isl.DEFAULT_CONTEXT,
+                        set=[],
+                        params=set(
+                            arg.name for arg in kernel.args
+                            if isinstance(arg, ValueArg)))))
+
+    def find_accessing_instructions_in_subkernel(self, temporary, subkernel):
+        # Find all accessing instructions in the subkernel. If base_storage is
+        # present, this includes instructions that access aliasing memory.
+
+        aliasing_names = set([temporary])
+        base_storage = self.kernel.temporary_variables[temporary].base_storage
+
+        if base_storage is not None:
+            aliasing_names |= self.base_storage_to_temporary_map[base_storage]
+
+        from loopy.kernel.tools import get_subkernel_to_insn_id_map
+        accessing_insns_in_subkernel = set()
+        subkernel_insns = get_subkernel_to_insn_id_map(self.kernel)[subkernel]
+
+        for name in aliasing_names:
+            try:
+                accessing_insns_in_subkernel |= (
+                        self.kernel.reader_map()[name] & subkernel_insns)
+            except KeyError:
+                pass
+
+            try:
+                accessing_insns_in_subkernel |= (
+                        self.kernel.writer_map()[name] & subkernel_insns)
+            except KeyError:
+                pass
+
+        return frozenset(accessing_insns_in_subkernel)
+
+    @property
+    @memoize_method
+    def base_storage_to_temporary_map(self):
+        from collections import defaultdict
+
+        result = defaultdict(set)
+
+        for temporary in six.itervalues(self.kernel.temporary_variables):
+            if temporary.base_storage is None:
+                continue
+            result[temporary.base_storage].add(temporary.name)
+
+        return result
+
+    @property
+    @memoize_method
+    def subkernel_to_slice_indices(self):
+        result = {}
+
+        for sched_item_idx, sched_item in enumerate(self.kernel.schedule):
+            if isinstance(sched_item, CallKernel):
+                start_idx = sched_item_idx
+            elif isinstance(sched_item, ReturnFromKernel):
+                result[sched_item.kernel_name] = (start_idx, 1 + sched_item_idx)
+
+        return result
+
+    @property
+    @memoize_method
+    def subkernel_to_surrounding_inames(self):
+        current_outer_inames = set()
+        within_subkernel = False
+        result = {}
+
+        for sched_item_idx, sched_item in enumerate(self.kernel.schedule):
+            if isinstance(sched_item, CallKernel):
+                within_subkernel = True
+                result[sched_item.kernel_name] = frozenset(current_outer_inames)
+            elif isinstance(sched_item, ReturnFromKernel):
+                within_subkernel = False
+            elif isinstance(sched_item, EnterLoop):
+                if not within_subkernel:
+                    current_outer_inames.add(sched_item.iname)
+            elif isinstance(sched_item, LeaveLoop):
+                current_outer_inames.discard(sched_item.iname)
+
+        return result
+
+    @memoize_method
+    def get_enclosing_global_barrier_pair(self, subkernel):
+        subkernel_start, subkernel_end = (
+            self.subkernel_to_slice_indices[subkernel])
+
+        def is_global_barrier(item):
+            return isinstance(item, Barrier) and item.kind == "global"
+
+        try:
+            pre_barrier = next(item for item in
+                self.kernel.schedule[subkernel_start::-1]
+                if is_global_barrier(item)).originating_insn_id
+        except StopIteration:
+            pre_barrier = None
+
+        try:
+            post_barrier = next(item for item in
+                self.kernel.schedule[subkernel_end:]
+                if is_global_barrier(item)).originating_insn_id
+        except StopIteration:
+            post_barrier = None
+
+        return (pre_barrier, post_barrier)
+
+    def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary):
+        """
+        Determine the sizes and tags of the hardware axes of the global storage
+        needed for saving and restoring the temporary across kernel calls, due
+        to hardware parallel inames (the inferred axes get prepended to the
+        shape of the temporary).
+
+        In the case of local temporaries, inames that are tagged
+        hw-local do not contribute to the global storage shape.
+        """
+        accessor_insn_ids = frozenset(
+            self.kernel.reader_map()[temporary.name]
+            | self.kernel.writer_map()[temporary.name])
+
+        group_tags = None
+        local_tags = None
+
+        def _sortedtags(tags):
+            return sorted(tags, key=lambda tag: tag.axis)
+
+        for insn_id in accessor_insn_ids:
+            insn = self.kernel.id_to_insn[insn_id]
+
+            my_group_tags = []
+            my_local_tags = []
+
+            for iname in insn.within_inames:
+                tag = self.kernel.iname_to_tag.get(iname)
+
+                if tag is None:
+                    continue
+
+                from loopy.kernel.data import (
+                    GroupIndexTag, LocalIndexTag, ParallelTag)
+
+                if isinstance(tag, GroupIndexTag):
+                    my_group_tags.append(tag)
+                elif isinstance(tag, LocalIndexTag):
+                    my_local_tags.append(tag)
+                elif isinstance(tag, ParallelTag):
+                    raise LoopyError(
+                        "iname '%s' is tagged with '%s' - only "
+                        "group and local tags are supported for "
+                        "auto save/reload of temporaries" %
+                        (iname, tag))
+
+            if group_tags is None:
+                group_tags = _sortedtags(my_group_tags)
+                local_tags = _sortedtags(my_local_tags)
+                group_tags_originating_insn_id = insn_id
+
+            if (
+                    group_tags != _sortedtags(my_group_tags)
+                    or local_tags != _sortedtags(my_local_tags)):
+                raise LoopyError(
+                    "inconsistent parallel tags across instructions that access "
+                    "'%s' (specifically, instruction '%s' has tags '%s' but "
+                    "instruction '%s' has tags '%s')"
+                    % (temporary.name,
+                       group_tags_originating_insn_id, group_tags + local_tags,
+                       insn_id, my_group_tags + my_local_tags))
+
+        if group_tags is None:
+            assert local_tags is None
+            return (), ()
+
+        group_sizes, local_sizes = (
+            self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids))
+
+        if temporary.scope == lp.temp_var_scope.LOCAL:
+            # Elide local axes in the save slot for local temporaries.
+            del local_tags[:]
+            local_sizes = ()
+
+        # We set hw_dims to be arranged according to the order:
+        #    g.0 < g.1 < ... < l.0 < l.1 < ...
+        return (group_sizes + local_sizes), tuple(group_tags + local_tags)
 
     @memoize_method
     def auto_promote_temporary(self, temporary_name):
@@ -255,52 +461,16 @@ class TemporarySaver(object):
             assert temporary.read_only
             return None
 
-        if temporary.base_storage is not None:
-            raise ValueError(
-                "Cannot promote temporaries with base_storage to global")
-
-        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
-        # is associated with. This is used for determining the shape of the
-        # global storage needed for saving and restoring the temporary across
-        # kernel calls.
-        #
-        # TODO: Make a policy decision about which dimensions to use. Currently,
-        # the code looks at each instruction that defines or uses the temporary,
-        # and takes the common set of hw-parallel tagged inames associated with
-        # these instructions.
-        #
-        # Furthermore, in the case of local temporaries, inames that are tagged
-        # hw-local do not contribute to the global storage shape.
-        hw_inames = self.insn_query.common_hw_inames(
-            self.insn_query.insns_reading_or_writing(temporary.name))
-
-        # We want hw_inames to be arranged according to the order:
-        #    g.0 < g.1 < ... < l.0 < l.1 < ...
-        # Sorting lexicographically accomplishes this.
-        hw_inames = sorted(hw_inames,
-            key=lambda iname: str(self.kernel.iname_to_tag[iname]))
-
-        # Calculate the sizes of the dimensions that get added in front for
-        # the global storage of the temporary.
-        hw_dims = []
-
-        backing_hw_inames = []
-
-        for iname in hw_inames:
-            tag = self.kernel.iname_to_tag[iname]
-            from loopy.kernel.data import LocalIndexTag
-            is_local_iname = isinstance(tag, LocalIndexTag)
-            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
-                # Restrict shape to that of group inames for locals.
-                continue
-            backing_hw_inames.append(iname)
-            from loopy.isl_helpers import static_max_of_pw_aff
-            from loopy.symbolic import aff_to_expr
-            hw_dims.append(
-                aff_to_expr(
-                    static_max_of_pw_aff(
-                        self.kernel.get_iname_bounds(iname).size, False)))
+        base_storage_conflict = (
+            self.base_storage_to_representative.get(
+                temporary.base_storage, temporary) is not temporary)
 
+        if base_storage_conflict:
+            raise NotImplementedError(
+                "tried to save/reload multiple temporaries with the "
+                "same base_storage; this is currently not supported")
+
+        hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary)
         non_hw_dims = temporary.shape
 
         if len(non_hw_dims) == 0 and len(hw_dims) == 0:
@@ -309,10 +479,14 @@ class TemporarySaver(object):
 
         backing_temporary = self.PromotedTemporary(
             name=self.var_name_gen(temporary.name + "_save_slot"),
-            orig_temporary=temporary,
-            hw_dims=tuple(hw_dims),
-            non_hw_dims=non_hw_dims,
-            hw_inames=backing_hw_inames)
+            orig_temporary_name=temporary.name,
+            hw_dims=hw_dims,
+            hw_tags=hw_tags,
+            non_hw_dims=non_hw_dims)
+
+        if temporary.base_storage is not None:
+            self.base_storage_to_representative[temporary.base_storage] = (
+                    backing_temporary)
 
         return backing_temporary
 
@@ -326,23 +500,16 @@ class TemporarySaver(object):
         if promoted_temporary is None:
             return
 
-        from loopy.kernel.tools import DomainChanger
-        dchg = DomainChanger(
-            self.kernel,
-            frozenset(
-                self.insn_query.inames_in_subkernel(subkernel) |
-                set(promoted_temporary.hw_inames)))
-
-        domain, hw_inames, dim_inames, iname_to_tag = \
+        new_subdomain, hw_inames, dim_inames, iname_to_tag = (
             self.augment_domain_for_save_or_reload(
-                dchg.domain, promoted_temporary, mode, subkernel)
+                self.new_subdomain, promoted_temporary, mode, subkernel))
 
-        self.kernel = dchg.get_kernel_with(domain)
+        self.new_subdomain = new_subdomain
 
         save_or_load_insn_id = self.insn_name_gen(
             "{name}.{mode}".format(name=temporary, mode=mode))
 
-        def subscript_or_var(agg, subscript=()):
+        def add_subscript_if_subscript_nonempty(agg, subscript=()):
             from pymbolic.primitives import Subscript, Variable
             if len(subscript) == 0:
                 return Variable(agg)
@@ -351,20 +518,22 @@ class TemporarySaver(object):
                     Variable(agg),
                     tuple(map(Variable, subscript)))
 
-        dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)]
+        orig_temporary = (
+            self.kernel.temporary_variables[
+                promoted_temporary.orig_temporary_name])
+        dim_inames_trunc = dim_inames[:len(orig_temporary.shape)]
 
         args = (
-            subscript_or_var(
-                temporary, dim_inames_trunc),
-            subscript_or_var(
-                promoted_temporary.name, hw_inames + dim_inames))
+            add_subscript_if_subscript_nonempty(
+                temporary, subscript=dim_inames_trunc),
+            add_subscript_if_subscript_nonempty(
+                promoted_temporary.name, subscript=hw_inames + dim_inames))
 
         if mode == "save":
             args = reversed(args)
 
-        accessing_insns_in_subkernel = (
-            self.insn_query.insns_reading_or_writing(temporary) &
-            self.insn_query.insns_in_subkernel(subkernel))
+        accessing_insns_in_subkernel = self.find_accessing_instructions_in_subkernel(
+                temporary, subkernel)
 
         if mode == "save":
             depends_on = accessing_insns_in_subkernel
@@ -373,7 +542,7 @@ class TemporarySaver(object):
             depends_on = frozenset()
             update_deps = accessing_insns_in_subkernel
 
-        pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel)
+        pre_barrier, post_barrier = self.get_enclosing_global_barrier_pair(subkernel)
 
         if pre_barrier is not None:
             depends_on |= set([pre_barrier])
@@ -387,16 +556,19 @@ class TemporarySaver(object):
             *args,
             id=save_or_load_insn_id,
             within_inames=(
-                self.insn_query.inames_in_subkernel(subkernel) |
-                frozenset(hw_inames + dim_inames)),
+                self.subkernel_to_surrounding_inames[subkernel]
+                | frozenset(hw_inames + dim_inames)),
             within_inames_is_final=True,
             depends_on=depends_on,
             boostable=False,
             boostable_into=frozenset())
 
-        if temporary not in self.saves_or_reloads_added:
-            self.saves_or_reloads_added[temporary] = set()
-        self.saves_or_reloads_added[temporary].add(save_or_load_insn_id)
+        if mode == "save":
+            self.temporary_to_save_ids[temporary].add(save_or_load_insn_id)
+        else:
+            self.temporary_to_reload_ids[temporary].add(save_or_load_insn_id)
+
+        self.subkernel_to_newly_added_insn_ids[subkernel].add(save_or_load_insn_id)
 
         self.insns_to_insert.append(save_or_load_insn)
 
@@ -405,8 +577,8 @@ class TemporarySaver(object):
             self.insns_to_update[insn_id] = insn.copy(
                 depends_on=insn.depends_on | frozenset([save_or_load_insn_id]))
 
-        self.updated_temporary_variables[promoted_temporary.name] = \
-            promoted_temporary.as_variable()
+        self.updated_temporary_variables[promoted_temporary.name] = (
+            promoted_temporary.as_kernel_temporary(self.kernel))
 
         self.updated_iname_to_tag.update(iname_to_tag)
 
@@ -416,15 +588,6 @@ class TemporarySaver(object):
 
         insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert)
 
-        # Add global no_sync_with between any added reloads and saves
-        from six import iteritems
-        for temporary, added_insns in iteritems(self.saves_or_reloads_added):
-            for insn_id in added_insns:
-                insn = insns_to_insert[insn_id]
-                insns_to_insert[insn_id] = insn.copy(
-                    no_sync_with=frozenset(
-                        (added_insn, "global") for added_insn in added_insns))
-
         for orig_insn in self.kernel.instructions:
             if orig_insn.id in self.insns_to_update:
                 new_instructions.append(self.insns_to_update[orig_insn.id])
@@ -436,12 +599,31 @@ class TemporarySaver(object):
         self.updated_iname_to_tag.update(self.kernel.iname_to_tag)
         self.updated_temporary_variables.update(self.kernel.temporary_variables)
 
+        new_domains = list(self.kernel.domains)
+        import islpy as isl
+        if self.new_subdomain.dim(isl.dim_type.set) > 0:
+            new_domains.append(self.new_subdomain)
+
         kernel = self.kernel.copy(
+            domains=new_domains,
             instructions=new_instructions,
             iname_to_tag=self.updated_iname_to_tag,
             temporary_variables=self.updated_temporary_variables,
             overridden_get_grid_sizes_for_insn_ids=None)
 
+        # Add nosync directives to any saves or reloads that were added with a
+        # potential dependency chain.
+        from loopy.kernel.tools import get_subkernels
+        for subkernel in get_subkernels(kernel):
+            relevant_insns = self.subkernel_to_newly_added_insn_ids[subkernel]
+
+            from itertools import product
+            for temporary in self.temporary_to_reload_ids:
+                for source, sink in product(
+                        relevant_insns & self.temporary_to_reload_ids[temporary],
+                        relevant_insns & self.temporary_to_save_ids[temporary]):
+                    kernel = lp.add_nosync(kernel, "global", source, sink)
+
         from loopy.kernel.tools import assign_automatic_axes
         return assign_automatic_axes(kernel)
 
@@ -456,22 +638,28 @@ class TemporarySaver(object):
         """
         Add new axes to the domain corresponding to the dimensions of
         `promoted_temporary`. These axes will be used in the save/
-        reload stage.
+        reload stage. These get prefixed onto the already existing axes.
         """
         assert mode in ("save", "reload")
         import islpy as isl
 
-        orig_temporary = promoted_temporary.orig_temporary
+        orig_temporary = (
+                self.kernel.temporary_variables[
+                    promoted_temporary.orig_temporary_name])
         orig_dim = domain.dim(isl.dim_type.set)
 
         # Tags for newly added inames
         iname_to_tag = {}
 
+        from loopy.symbolic import aff_from_expr
+
         # FIXME: Restrict size of new inames to access footprint.
 
         # Add dimension-dependent inames.
         dim_inames = []
-        domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims))
+        domain = domain.add(isl.dim_type.set,
+                            len(promoted_temporary.non_hw_dims)
+                            + len(promoted_temporary.hw_dims))
 
         for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims):
             new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}".
@@ -493,25 +681,31 @@ class TemporarySaver(object):
             # Add size information.
             aff = isl.affs_from_space(domain.space)
             domain &= aff[0].le_set(aff[new_iname])
-            from loopy.symbolic import aff_from_expr
             domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size))
 
-        # FIXME: Use promoted_temporary.hw_inames
-        hw_inames = []
+        dim_offset = orig_dim + len(promoted_temporary.non_hw_dims)
 
-        # Add hardware inames duplicates.
-        for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames):
+        hw_inames = []
+        # Add hardware dims.
+        for hw_iname_idx, (hw_tag, dim) in enumerate(
+                zip(promoted_temporary.hw_tags, promoted_temporary.hw_dims)):
             new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}".
                 format(name=orig_temporary.name,
                        mode=mode,
-                       dim=t_idx,
+                       dim=hw_iname_idx,
                        sk=subkernel))
-            hw_inames.append(new_iname)
-            iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname]
+            domain = domain.set_dim_name(
+                isl.dim_type.set, dim_offset + hw_iname_idx, new_iname)
 
-        from loopy.isl_helpers import duplicate_axes
-        domain = duplicate_axes(
-            domain, promoted_temporary.hw_inames, hw_inames)
+            aff = isl.affs_from_space(domain.space)
+            domain = (domain
+                &
+                aff[0].le_set(aff[new_iname])
+                &
+                aff[new_iname].lt_set(aff_from_expr(domain.space, dim)))
+
+            self.updated_iname_to_tag[new_iname] = hw_tag
+            hw_inames.append(new_iname)
 
         # The operations on the domain above return a Set object, but the
         # underlying domain should be expressible as a single BasicSet.
@@ -551,7 +745,8 @@ def save_and_reload_temporaries(knl):
     liveness = LivenessAnalysis(knl)
     saver = TemporarySaver(knl)
 
-    insn_query = InstructionQuery(knl)
+    from loopy.schedule.tools import (
+        temporaries_read_in_subkernel, temporaries_written_in_subkernel)
 
     for sched_idx, sched_item in enumerate(knl.schedule):
 
@@ -562,9 +757,10 @@ def save_and_reload_temporaries(knl):
                 # Kernel entry: nothing live
                 interesting_temporaries = set()
             else:
+                subkernel = sched_item.kernel_name
                 interesting_temporaries = (
-                    insn_query.temporaries_read_or_written_in_subkernel(
-                        sched_item.kernel_name))
+                    temporaries_read_in_subkernel(knl, subkernel)
+                    | temporaries_written_in_subkernel(knl, subkernel))
 
             for temporary in liveness[sched_idx].live_out & interesting_temporaries:
                 logger.info("reloading {0} at entry of {1}"
@@ -576,9 +772,9 @@ def save_and_reload_temporaries(knl):
                 # Kernel exit: nothing live
                 interesting_temporaries = set()
             else:
+                subkernel = sched_item.kernel_name
                 interesting_temporaries = (
-                    insn_query.temporaries_written_in_subkernel(
-                        sched_item.kernel_name))
+                    temporaries_written_in_subkernel(knl, subkernel))
 
             for temporary in liveness[sched_idx].live_in & interesting_temporaries:
                 logger.info("saving {0} before return of {1}"
diff --git a/loopy/version.py b/loopy/version.py
index 18f4aafdcbb622a9b309d141b362b035243301d1..36a48e2f2e6975c83b658a7f38c231e287fd74bc 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v61-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v62-islpy%s" % _islpy_version
\ No newline at end of file
diff --git a/setup.py b/setup.py
index a941eecd2b58daf413830fc22500179d3e8a8cf1..150cb1cc4bc6ee13a7d516ab09c8824d76a2c6c9 100644
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ setup(name="loo.py",
           ],
 
       install_requires=[
-          "pytools>=2016.2.6",
+          "pytools>=2017.1",
           "pymbolic>=2016.2",
           "genpy>=2016.1.2",
           "cgen>=2016.1",
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 58435efdc48ce39a538783d236ab8f4e647cfab9..d8bf76f96dddc19592066b49f73419a01efe8867 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1146,7 +1146,7 @@ def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False):
         1/0
 
     _, (out,) = knl(queue, out_host=True)
-    assert (out == out_expect).all()
+    assert (out == out_expect).all(), (out, out_expect)
 
 
 @pytest.mark.parametrize("hw_loop", [True, False])
@@ -1338,6 +1338,73 @@ def test_save_local_multidim_array(ctx_factory, debug=False):
     save_and_reload_temporaries_test(queue, knl, 1, debug)
 
 
+def test_save_with_base_storage(ctx_factory, debug=False):
+    """Check save/reload of the aliased local temporaries 'a' and 'b'."""
+    queue = cl.CommandQueue(ctx_factory())
+
+    # 'a' and 'b' are aliased below; the expected output (0..9) is what was
+    # stored through 'b', not the zeros assigned to 'a'.
+    knl = lp.make_kernel(
+            "{[i]: 0 <= i < 10}",
+            """
+            <>a[i] = 0
+            <>b[i] = i
+            ... gbarrier
+            out[i] = a[i]
+            """,
+            "...",
+            seq_dependencies=True)
+    knl = lp.tag_inames(knl, {"i": "l.0"})
+    for name in ("a", "b"):
+        knl = lp.set_temporary_scope(knl, name, "local")
+    knl = lp.alias_temporaries(
+            knl, ["a", "b"], synchronize_for_exclusive_use=False)
+
+    save_and_reload_temporaries_test(queue, knl, np.arange(10), debug)
+
+
+def test_save_ambiguous_storage_requirements():
+    """Expect LoopyError when the save slot needs of 'a' are ambiguous."""
+    from loopy.diagnostic import LoopyError
+
+    knl = lp.make_kernel(
+            "{[i,j]: 0 <= i < 10 and 0 <= j < 10}",
+            """
+            <>a[j] = j
+            ... gbarrier
+            out[i,j] = a[j]
+            """,
+            seq_dependencies=True)
+    knl = lp.tag_inames(knl, {"i": "g.0", "j": "l.0"})
+    knl = lp.duplicate_inames(knl, "j", within="writes:out", tags={"j": "l.0"})
+    knl = lp.set_temporary_scope(knl, "a", "local")
+
+    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+    # Only the save/reload transformation sits inside the raises block.
+    with pytest.raises(LoopyError):
+        lp.save_and_reload_temporaries(scheduled)
+
+
+def test_save_across_inames_with_same_tag(ctx_factory, debug=False):
+    """Save/reload 'a' when its reader uses a duplicated l.0 iname."""
+    queue = cl.CommandQueue(ctx_factory())
+
+    knl = lp.make_kernel(
+            "{[i]: 0 <= i < 10}",
+            """
+            <>a[i] = i
+            ... gbarrier
+            out[i] = a[i]
+            """,
+            "...",
+            seq_dependencies=True)
+    knl = lp.tag_inames(knl, {"i": "l.0"})
+    # Duplicate 'i' for the instruction reading 'a'; the copy is tagged l.0.
+    knl = lp.duplicate_inames(knl, "i", within="reads:a", tags={"i": "l.0"})
+
+    save_and_reload_temporaries_test(queue, knl, np.arange(10), debug)
+
+
 def test_missing_temporary_definition_detection():
     knl = lp.make_kernel(
             "{ [i]: 0<=i<10 }",
@@ -2231,6 +2298,43 @@ def test_struct_assignment(ctx_factory):
     knl(queue, N=200)
 
 
+def test_inames_conditional_generation(ctx_factory):
+    """Run a kernel with global barriers inside the 'k' and 'j' loops."""
+    knl = lp.make_kernel(
+            "{[i,j,k]: 0 < k < i and 0 < j < 10 and 0 < i < 10}",
+            """
+            for k
+                ... gbarrier
+                <>tmp1 = 0
+            end
+            for j
+                ... gbarrier
+                <>tmp2 = i
+            end
+            """,
+            "...",
+            seq_dependencies=True)
+    knl = lp.tag_inames(knl, {"i": "g.0"})
+
+    # Only successful execution is checked; no output values are compared.
+    with cl.CommandQueue(ctx_factory()) as queue:
+        knl(queue)
+
+
+def test_kernel_var_name_generator():
+    """The generator must not return 'a_s0' or 'b' unchanged."""
+    knl = lp.make_kernel(
+            "{[i]: 0 <= i <= 10}",
+            """
+            <>a = 0
+            <>b_s0 = 0
+            """)
+    name_gen = knl.get_var_name_generator()
+
+    for requested in ("a_s0", "b"):
+        assert name_gen(requested) != requested
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])