diff --git a/loopy/__init__.py b/loopy/__init__.py
index 2f8fd8f5df94bb9d2116802d9069ba5fcacccf8c..0dc0fdf76e7ee38c6b47e97912df0fb1f3127396 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -128,9 +128,10 @@ def split_dimension(kernel, split_iname, inner_length,
         new_expr = subst_mapper(rls(insn.expression))
 
         if split_iname in insn.forced_iname_deps:
-            new_forced_iname_deps = insn.forced_iname_deps.copy()
-            new_forced_iname_deps.remove(split_iname)
-            new_forced_iname_deps.update([outer_iname, inner_iname])
+            new_forced_iname_deps = (
+                    (insn.forced_iname_deps.copy()
+                    - frozenset([split_iname]))
+                    | frozenset([outer_iname, inner_iname]))
         else:
             new_forced_iname_deps = insn.forced_iname_deps
 
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 953ea4918a5d50e15ec3bb6e9d58a3a75daa1ff3..49ccbf055534dc65fd4d342b97a2d267a3b3bf32 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -303,11 +303,10 @@ def generate_code(kernel, with_annotation=False,
     else:
         body.append(gen_code.ast)
 
-    from loopy.symbolic import pw_aff_to_expr
     mod.append(
         FunctionBody(
             CLRequiredWorkGroupSize(
-                tuple(pw_aff_to_expr(sz) for sz in kernel.get_grid_sizes()[1]),
+                kernel.get_grid_sizes_as_exprs()[1],
                 CLKernel(FunctionDeclaration(
                     Value("void", kernel.name), args))),
             body))
diff --git a/loopy/cse.py b/loopy/cse.py
index 624ed6b5e34526f50e4262639accc2b4ba1befee..67e41cb9c1c4c198beb972dd19b12aa4f4b74864 100644
--- a/loopy/cse.py
+++ b/loopy/cse.py
@@ -166,12 +166,22 @@ def build_global_storage_to_sweep_map(invocation_descriptors,
 
 # {{{ compute storage bounds
 
-def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames,
+def find_var_base_indices_and_shape_from_inames(
+        domain, inames, cache_manager, context=None):
+    """Return (base_indices, shape) for *inames* within *domain*."""
+    if not inames:
+        # zip(*[]) is empty and could not be unpacked into two values
+        return [], []
+    return zip(*[cache_manager.base_index_and_length(domain, iname, context)
+            for iname in inames])
+
+
+def compute_bounds(kernel, sweep_domain, subst_name, stor2sweep, sweep_inames,
         storage_axis_names):
 
     # move non-sweep inames into parameter space
 
-    dup_sweep_index = kernel.space.dim(dim_type.out)
+    dup_sweep_index = sweep_domain.get_space().dim(dim_type.out)
     # map_space: [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep]
 
     sp = stor2sweep.get_space()
@@ -187,7 +197,6 @@ def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames,
                 "sweep did not result in a bounded storage domain"
                 % subst_name)
 
-    from loopy.kernel import find_var_base_indices_and_shape_from_inames
     return find_var_base_indices_and_shape_from_inames(
             storage_domain, [saxis+"'" for saxis in storage_axis_names],
             kernel.cache_manager, context=kernel.assumptions)
@@ -198,7 +207,7 @@ def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames,
 
 
 
-def get_access_info(kernel, subst_name,
+def get_access_info(kernel, sweep_domain, subst_name,
         storage_axis_names, storage_axis_sources,
         sweep_inames, invocation_descriptors):
 
@@ -206,9 +215,9 @@ def get_access_info(kernel, subst_name,
 
     primed_sweep_inames = [psin+"'" for psin in sweep_inames]
     from loopy.isl_helpers import duplicate_axes
-    dup_sweep_index = kernel.space.dim(dim_type.out)
+    dup_sweep_index = sweep_domain.space.dim(dim_type.out)
     domain_dup_sweep = duplicate_axes(
-            kernel.domain, sweep_inames,
+            sweep_domain, sweep_inames,
             primed_sweep_inames)
 
     prime_sweep_inames = SubstitutionMapper(make_subst_func(
@@ -221,7 +230,7 @@ def get_access_info(kernel, subst_name,
             storage_axis_names, storage_axis_sources, prime_sweep_inames)
 
     storage_base_indices, storage_shape = compute_bounds(
-            kernel, subst_name, stor2sweep, sweep_inames,
+            kernel, sweep_domain, subst_name, stor2sweep, sweep_inames,
             storage_axis_names)
 
     # compute augmented domain
@@ -588,9 +597,20 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[],
 
     # }}}
 
+    if sweep_inames:
+        leaf_domain_index = kernel.get_leaf_domain_index(frozenset(sweep_inames))
+        sweep_domain = kernel.domains[leaf_domain_index]
+
+        for iname in sweep_inames:
+            if kernel.get_home_domain_index(iname) != leaf_domain_index:
+                raise RuntimeError("sweep iname '%s' is not 'at home' in the "
+                        "sweep's leaf domain" % iname)
+    else:
+        sweep_domain = kernel.combine_domains(())
+
     (non1_storage_axis_names, new_domain,
-            storage_base_indices, non1_storage_base_indices, non1_storage_shape)= \
-                    get_access_info(kernel, subst_name,
+            storage_base_indices, non1_storage_base_indices, non1_storage_shape) = \
+                    get_access_info(kernel, sweep_domain, subst_name,
                             storage_axis_names, storage_axis_sources,
                             sweep_inames, invocation_descriptors)
 
@@ -598,7 +618,7 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[],
 
     if len(new_domain.get_basic_sets()) > 1:
         hull_new_domain = new_domain.simple_hull()
-        if hull_new_domain <= new_domain:
+        if isl.Set.from_basic_set(hull_new_domain) <= new_domain:
             new_domain = hull_new_domain
 
     new_domain = new_domain.coalesce()
@@ -793,8 +813,12 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[],
 
     # }}}
 
+    new_domains = kernel.domains[:]
+    if sweep_inames:
+        new_domains[leaf_domain_index] = new_domain
+
     return kernel.copy(
-            domain=new_domain,
+            domains=new_domains,
             instructions=new_insns,
             substitutions=new_substs,
             temporary_variables=new_temporary_variables,
diff --git a/loopy/kernel.py b/loopy/kernel.py
index 0c9714001fc52e68da9fd58fbf2730f72188cf2c..acbc018111b31de858b166b99dd952ffb052be5a 100644
--- a/loopy/kernel.py
+++ b/loopy/kernel.py
@@ -1090,6 +1090,9 @@ class LoopKernel(Record):
         return result
 
     def get_inames_domain(self, inames):
+        if not inames:
+            return self.combine_domains(())
+
         if isinstance(inames, str):
             inames = frozenset([inames])
         if not isinstance(inames, frozenset):
@@ -1101,17 +1104,43 @@ class LoopKernel(Record):
         return self._get_inames_domain_backend(inames)
 
     @memoize_method
-    def _get_inames_domain_backend(self, inames):
+    def get_leaf_domain_index(self, inames):
+        """Return the index of the leaf domain needed to cover all *inames*.
+        Raises RuntimeError if that would need a branch in the domain tree."""
         hdm = self._get_home_domain_map()
         ppd = self.all_parents_per_domain()
 
         domain_indices = set()
+        # remains None when *inames* is empty
+        leaf_domain_index = None
+
         for iname in inames:
             home_domain_index = hdm[iname]
+            if home_domain_index in domain_indices:
+                # nothing new: this domain is already accounted for
+                continue
+
+            leaf_domain_index = home_domain_index
+            # all domains seen so far must be among this one's parents
+            all_parents = set(ppd[home_domain_index])
+            if not domain_indices <= all_parents:
+                raise RuntimeError("iname set '%s' requires "
+                        "branch in domain tree (when adding '%s')"
+                        % (", ".join(inames), iname))
+
             domain_indices.add(home_domain_index)
-            domain_indices.update(ppd[home_domain_index])
+            domain_indices.update(all_parents)
 
-        return self.combine_domains(tuple(sorted(domain_indices)))
+        return leaf_domain_index
+
+    @memoize_method
+    def _get_inames_domain_backend(self, inames):
+        """Return combine_domains() of the leaf domain covering *inames*
+        together with everything all_parents_per_domain() lists for it."""
+        leaf = self.get_leaf_domain_index(inames)
+        needed_indices = self.all_parents_per_domain()[leaf] + [leaf]
+
+        return self.combine_domains(tuple(sorted(needed_indices)))
 
     # }}}
 
@@ -1193,15 +1222,8 @@ class LoopKernel(Record):
         """
         result = {}
 
-        admissible_vars = (
-                set(arg.name for arg in self.args)
-                | set(self.temporary_variables.iterkeys()))
-
         for insn in self.instructions:
             var_name = insn.get_assignee_var_name()
-
-            if var_name not in admissible_vars:
-                raise RuntimeError("variable '%s' not declared or not allowed for writing" % var_name)
             var_names = [var_name]
 
             for var_name in var_names:
@@ -1298,26 +1320,14 @@ class LoopKernel(Record):
 
     def find_var_base_indices_and_shape_from_inames(
             self, inames, cache_manager, context=None):
-        base_indices = []
-        shape = []
+        if not inames:
+            return [], []
 
-        for iname in inames:
-            domain = self.get_inames_domain(iname)
-            iname_to_dim = domain.space.get_var_dict()
-            lower_bound_pw_aff = cache_manager.dim_min(domain, iname_to_dim[iname][1])
-            upper_bound_pw_aff = cache_manager.dim_max(domain, iname_to_dim[iname][1])
-
-            from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
-            from loopy.symbolic import pw_aff_to_expr
-
-            shape.append(pw_aff_to_expr(static_max_of_pw_aff(
-                    upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True,
-                    context=context)))
-            base_indices.append(pw_aff_to_expr(
-                static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False,
-                    context=context)))
-
-        return base_indices, shape
+        # one (base_index, size) pair per iname; zip regroups them by kind
+        return zip(*[
+            cache_manager.base_index_and_length(
+                self.get_inames_domain(iname), iname, context)
+            for iname in inames])
 
     @memoize_method
     def get_constant_iname_length(self, iname):
@@ -1418,7 +1428,7 @@ class LoopKernel(Record):
 
         def tup_to_exprs(tup):
             from loopy.symbolic import pw_aff_to_expr
-            return tuple(pw_aff_to_expr(i) for i in tup)
+            return tuple(pw_aff_to_expr(i, int_ok=True) for i in tup)
 
         return tup_to_exprs(grid_size), tup_to_exprs(group_size)
 
@@ -1649,6 +1659,23 @@ class SetOperationCacheManager:
     def dim_max(self, set, *args):
         return self.op(set, "dim_max", set.dim_max, args)
 
+    def base_index_and_length(self, set, iname, context=None):
+        """Return ``(base_index, size)`` for *iname* in *set*."""
+        dim_idx = set.space.get_var_dict()[iname][1]
+        lower_pw_aff = self.dim_min(set, dim_idx)
+        upper_pw_aff = self.dim_max(set, dim_idx)
+
+        from loopy.isl_helpers import static_max_of_pw_aff, static_min_of_pw_aff
+        from loopy.symbolic import pw_aff_to_expr
+
+        # constants_only differs: True for the size, False for the base index
+        extent_pw_aff = upper_pw_aff - lower_pw_aff + 1
+        size = pw_aff_to_expr(static_max_of_pw_aff(
+                extent_pw_aff, constants_only=True, context=context))
+        base_index = pw_aff_to_expr(static_min_of_pw_aff(
+                lower_pw_aff, constants_only=False, context=context))
+        return base_index, size
+
 
 
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 92629cc160ab8c0482e8a400a117ba58680f46b5..29b2512ece0f22665d3f5ab36a75fdf6117babf9 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -432,10 +432,12 @@ def aff_to_expr(aff, except_name=None, error_on_name=None):
 
 
 
-def pw_aff_to_expr(pw_aff):
+def pw_aff_to_expr(pw_aff, int_ok=False):
     if isinstance(pw_aff, int):
-        from warnings import warn
-        warn("expected PwAff, got int", stacklevel=2)
+        if not int_ok:
+            from warnings import warn
+            warn("expected PwAff, got int", stacklevel=2)
+
         return pw_aff
 
     pieces = pw_aff.get_pieces()
diff --git a/test/test_loopy.py b/test/test_loopy.py
index d9811564fdcd4d7bcd19f6e623fbf55cc3835541..2c447e9cffcad0dfb44176934363479075c852d9 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -92,14 +92,15 @@ def test_stencil(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "{[i,j]: 0<= i,j < 32}",
             [
-                "[i] <float32> z[i,j] = -2*a[i,j]"
+                "[i] z[i,j] = -2*a[i,j]"
                     " + a[i,j-1]"
                     " + a[i,j+1]"
                     " + a[i-1,j]"
                     " + a[i+1,j]"
                 ],
             [
-                lp.GlobalArg("a", np.float32, shape=(32,32,))
+                lp.GlobalArg("a", np.float32, shape=(32,32,)),
+                lp.GlobalArg("z", np.float32, shape=(32,32,))
                 ])
 
 
@@ -305,13 +306,13 @@ def test_empty_reduction(ctx_factory):
 
 
 def test_nested_dependent_reduction(ctx_factory):
-    dtype = np.dtype(np.float32)
+    dtype = np.dtype(np.int32)
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel(ctx.devices[0],
             [
-                "{[i]: 0<=i<20}",
+                "{[i]: 0<=i<n}",
                 "{[j]: 0<=j<i+sumlen}"
                 ],
             [
@@ -319,14 +320,21 @@ def test_nested_dependent_reduction(ctx_factory):
                 "a[i] = sum(j, j)",
                 ],
             [
-                lp.GlobalArg("a", dtype, (20,)),
-                lp.GlobalArg("l", np.int32, (20,)),
+                lp.ScalarArg("n", np.int32),
+                lp.GlobalArg("a", dtype, ("n",)),
+                lp.GlobalArg("l", np.int32, ("n",)),
                 ])
 
     cknl = lp.CompiledKernel(ctx, knl)
-    cknl.print_code()
 
-    evt, (a,) = cknl(queue)
+    n = 330
+    l = np.arange(n, dtype=np.int32)
+    evt, (a,) = cknl(queue, l=l, n=n, out_host=True)
+
+    tgt_result = (2*l-1)*2*l/2
+    assert (a == tgt_result).all()
+
+
 
 
 
@@ -442,7 +450,7 @@ def test_dependent_loop_bounds_3(ctx_factory):
 
 
 
-def test_independent_multi_domains(ctx_factory):
+def test_independent_multi_domain(ctx_factory):
     dtype = np.dtype(np.float32)
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -453,26 +461,30 @@ def test_independent_multi_domains(ctx_factory):
                 "{[j]: 0<=j<n}",
                 ],
             [
-                "a[i,j] = 1",
+                "a[i] = 1",
+                "b[j] = 2",
                 ],
             [
-                lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
+                lp.GlobalArg("a", dtype, shape=("n"), order="C"),
+                lp.GlobalArg("b", dtype, shape=("n"), order="C"),
                 lp.ScalarArg("n", np.int32),
                 ])
 
 
     knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0",
             inner_tag="l.0")
-    knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1",
-            inner_tag="l.1")
+    knl = lp.split_dimension(knl, "j", 16, outer_tag="g.0",
+            inner_tag="l.0")
     assert knl.parents_per_domain() == 2*[None]
 
     n = 50
     cknl = lp.CompiledKernel(ctx, knl)
-    evt, (a,) = cknl(queue, n=n, out_host=True)
+    evt, (a, b) = cknl(queue, n=n, out_host=True)
 
-    assert a.shape == (50, 50)
+    assert a.shape == (50,)
+    assert b.shape == (50,)
     assert (a == 1).all()
+    assert (b == 2).all()