diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index ad80475c1d27f67b3df8a885f60dd96ff28efe6a..a334462049634fff1e3137ffd09acd3ef254bb51 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -220,7 +220,7 @@ def intersect_kernel_with_slab(kernel, slab, iname): domch = DomainChanger(kernel, (iname,)) orig_domain = domch.get_original_domain() - orig_domain, slab = isl.align_two(orig_domain, slab) + orig_domain, slab = isl.align_two(slab, orig_domain) return domch.get_kernel_with(orig_domain & slab) @@ -376,10 +376,10 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): # move inames that are usable into parameters moved_inames = [] - for iname in dom_and_slab.get_var_names(dim_type.set): - if iname in usable_inames: - moved_inames.append(iname) - dt, idx = dom_and_slab.get_var_dict()[iname] + for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)): + if das_iname in usable_inames: + moved_inames.append(das_iname) + dt, idx = dom_and_slab.get_var_dict()[das_iname] dom_and_slab = dom_and_slab.move_dims( dim_type.param, dom_and_slab.dim(dim_type.param), dt, idx, 1) @@ -422,8 +422,9 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): impl_lbound, impl_ubound) - for iname in moved_inames: - dt, idx = impl_loop.get_var_dict()[iname] + for moved_iname in moved_inames: + # move moved_iname to 'set' dim_type in impl_loop + dt, idx = impl_loop.get_var_dict()[moved_iname] impl_loop = impl_loop.move_dims( dim_type.set, impl_loop.dim(dim_type.set), dt, idx, 1) @@ -432,7 +433,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): codegen_state .intersect(impl_loop) .copy(kernel=intersect_kernel_with_slab( - kernel, slab, iname))) + kernel, slab, loop_iname))) inner = build_loop_nest(new_codegen_state, sched_index+1) diff --git a/test/test_loopy.py b/test/test_loopy.py index 4c3dbd6d0c723d7be14c3737ae63ad02c722d418..851a7f0762fcec3ccbb55399e183f5fb51322ac1 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1948,8 +1948,8 @@ def test_tight_loop_bounds_codegen(): for_loop = \ "for (int j = " \ - "(lid(0) == 0 && gid(0) == 0 ? 0 : -2 + 10 * gid(0) + 2 * lid(0)); " \ - "j <= (lid(0) == 0 && -1 + gid(0) == 0 ? 9 : 2 * lid(0)); ++j)" + "(gid(0) == 0 && lid(0) == 0 ? 0 : -2 + 2 * lid(0) + 10 * gid(0)); " \ + "j <= (-1 + gid(0) == 0 && lid(0) == 0 ? 9 : 2 * lid(0)); ++j)" assert for_loop in cgr.device_code()