diff --git a/loopy/__init__.py b/loopy/__init__.py index 2f8fd8f5df94bb9d2116802d9069ba5fcacccf8c..0dc0fdf76e7ee38c6b47e97912df0fb1f3127396 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -128,9 +128,10 @@ def split_dimension(kernel, split_iname, inner_length, new_expr = subst_mapper(rls(insn.expression)) if split_iname in insn.forced_iname_deps: - new_forced_iname_deps = insn.forced_iname_deps.copy() - new_forced_iname_deps.remove(split_iname) - new_forced_iname_deps.update([outer_iname, inner_iname]) + new_forced_iname_deps = ( + (insn.forced_iname_deps.copy() + - frozenset([split_iname])) + | frozenset([outer_iname, inner_iname])) else: new_forced_iname_deps = insn.forced_iname_deps diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 953ea4918a5d50e15ec3bb6e9d58a3a75daa1ff3..49ccbf055534dc65fd4d342b97a2d267a3b3bf32 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -303,11 +303,10 @@ def generate_code(kernel, with_annotation=False, else: body.append(gen_code.ast) - from loopy.symbolic import pw_aff_to_expr mod.append( FunctionBody( CLRequiredWorkGroupSize( - tuple(pw_aff_to_expr(sz) for sz in kernel.get_grid_sizes()[1]), + kernel.get_grid_sizes_as_exprs()[1], CLKernel(FunctionDeclaration( Value("void", kernel.name), args))), body)) diff --git a/loopy/cse.py b/loopy/cse.py index 624ed6b5e34526f50e4262639accc2b4ba1befee..67e41cb9c1c4c198beb972dd19b12aa4f4b74864 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -166,12 +166,22 @@ def build_global_storage_to_sweep_map(invocation_descriptors, # {{{ compute storage bounds -def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames, +def find_var_base_indices_and_shape_from_inames( + domain, inames, cache_manager, context=None): + base_indices_and_sizes = [ + cache_manager.base_index_and_length(domain, iname, context) + for iname in inames] + return zip(*base_indices_and_sizes) + + + + +def compute_bounds(kernel, sweep_domain, subst_name, stor2sweep, sweep_inames, storage_axis_names): # move non-sweep inames into parameter space - dup_sweep_index = kernel.space.dim(dim_type.out) + dup_sweep_index = sweep_domain.get_space().dim(dim_type.out) # map_space: [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep] sp = stor2sweep.get_space() @@ -187,7 +197,6 @@ def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames, "sweep did not result in a bounded storage domain" % subst_name) - from loopy.kernel import find_var_base_indices_and_shape_from_inames return find_var_base_indices_and_shape_from_inames( storage_domain, [saxis+"'" for saxis in storage_axis_names], kernel.cache_manager, context=kernel.assumptions) @@ -198,7 +207,7 @@ def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames, -def get_access_info(kernel, subst_name, +def get_access_info(kernel, sweep_domain, subst_name, storage_axis_names, storage_axis_sources, sweep_inames, invocation_descriptors): @@ -206,9 +215,9 @@ def get_access_info(kernel, subst_name, primed_sweep_inames = [psin+"'" for psin in sweep_inames] from loopy.isl_helpers import duplicate_axes - dup_sweep_index = kernel.space.dim(dim_type.out) + dup_sweep_index = sweep_domain.space.dim(dim_type.out) domain_dup_sweep = duplicate_axes( - kernel.domain, sweep_inames, + sweep_domain, sweep_inames, primed_sweep_inames) prime_sweep_inames = SubstitutionMapper(make_subst_func( @@ -221,7 +230,7 @@ def get_access_info(kernel, subst_name, storage_axis_names, storage_axis_sources, prime_sweep_inames) storage_base_indices, storage_shape = compute_bounds( - kernel, subst_name, stor2sweep, sweep_inames, + kernel, sweep_domain, subst_name, stor2sweep, sweep_inames, storage_axis_names) # compute augmented domain @@ -588,9 +597,20 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # }}} + if sweep_inames: + leaf_domain_index = kernel.get_leaf_domain_index(frozenset(sweep_inames)) + sweep_domain = kernel.domains[leaf_domain_index] + + for iname in sweep_inames: + if kernel.get_home_domain_index(iname) != leaf_domain_index: + raise RuntimeError("sweep iname '%s' is not 'at home' in the " + "sweep's leaf domain" % iname) + else: + sweep_domain = kernel.combine_domains(()) + (non1_storage_axis_names, new_domain, - storage_base_indices, non1_storage_base_indices, non1_storage_shape)= \ - get_access_info(kernel, subst_name, + storage_base_indices, non1_storage_base_indices, non1_storage_shape) = \ + get_access_info(kernel, sweep_domain, subst_name, storage_axis_names, storage_axis_sources, sweep_inames, invocation_descriptors) @@ -598,7 +618,7 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], if len(new_domain.get_basic_sets()) > 1: hull_new_domain = new_domain.simple_hull() - if hull_new_domain <= new_domain: + if isl.Set.from_basic_set(hull_new_domain) <= new_domain: new_domain = hull_new_domain new_domain = new_domain.coalesce() @@ -793,8 +813,12 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # }}} + new_domains = kernel.domains[:] + if sweep_inames: + new_domains[leaf_domain_index] = new_domain + return kernel.copy( - domain=new_domain, + domains=new_domains, instructions=new_insns, substitutions=new_substs, temporary_variables=new_temporary_variables, diff --git a/loopy/kernel.py b/loopy/kernel.py index 0c9714001fc52e68da9fd58fbf2730f72188cf2c..acbc018111b31de858b166b99dd952ffb052be5a 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -1090,6 +1090,9 @@ class LoopKernel(Record): return result def get_inames_domain(self, inames): + if not inames: + return self.combine_domains(()) + if isinstance(inames, str): inames = frozenset([inames]) if not isinstance(inames, frozenset): @@ -1101,17 +1104,43 @@ class LoopKernel(Record): return self._get_inames_domain_backend(inames) @memoize_method - def _get_inames_domain_backend(self, inames): + def get_leaf_domain_index(self, inames): + """Find the leaf of the domain tree needed to cover all inames.""" + hdm = self._get_home_domain_map() ppd = self.all_parents_per_domain() domain_indices = set() + + leaf_domain_index = None + for iname in inames: home_domain_index = hdm[iname] + if home_domain_index in domain_indices: + # nothin' new + continue + + leaf_domain_index = home_domain_index + + all_parents = set(ppd[home_domain_index]) + if not domain_indices <= all_parents: + raise RuntimeError("iname set '%s' requires " + "branch in domain tree (when adding '%s')" + % (", ".join(inames), iname)) + domain_indices.add(home_domain_index) - domain_indices.update(ppd[home_domain_index]) + domain_indices.update(all_parents) - return self.combine_domains(tuple(sorted(domain_indices))) + return leaf_domain_index + + @memoize_method + def _get_inames_domain_backend(self, inames): + leaf_dom_idx = self.get_leaf_domain_index(inames) + + return self.combine_domains(tuple(sorted( + self.all_parents_per_domain()[leaf_dom_idx] + + [leaf_dom_idx] + ))) # }}} @@ -1193,15 +1222,8 @@ class LoopKernel(Record): """ result = {} - admissible_vars = ( - set(arg.name for arg in self.args) - | set(self.temporary_variables.iterkeys())) - for insn in self.instructions: var_name = insn.get_assignee_var_name() - - if var_name not in admissible_vars: - raise RuntimeError("variable '%s' not declared or not allowed for writing" % var_name) var_names = [var_name] for var_name in var_names: @@ -1298,26 +1320,14 @@ class LoopKernel(Record): def find_var_base_indices_and_shape_from_inames( self, inames, cache_manager, context=None): - base_indices = [] - shape = [] + if not inames: + return [], [] - for iname in inames: - domain = self.get_inames_domain(iname) - iname_to_dim = domain.space.get_var_dict() - lower_bound_pw_aff = cache_manager.dim_min(domain, iname_to_dim[iname][1]) - upper_bound_pw_aff = cache_manager.dim_max(domain, iname_to_dim[iname][1]) - - from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff - from loopy.symbolic import pw_aff_to_expr - - shape.append(pw_aff_to_expr(static_max_of_pw_aff( - upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True, - context=context))) - base_indices.append(pw_aff_to_expr( - static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False, - context=context))) - - return base_indices, shape + base_indices_and_sizes = [ + cache_manager.base_index_and_length( + self.get_inames_domain(iname), iname, context) + for iname in inames] + return zip(*base_indices_and_sizes) @memoize_method def get_constant_iname_length(self, iname): @@ -1418,7 +1428,7 @@ class LoopKernel(Record): def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr - return tuple(pw_aff_to_expr(i) for i in tup) + return tuple(pw_aff_to_expr(i, int_ok=True) for i in tup) return tup_to_exprs(grid_size), tup_to_exprs(group_size) @@ -1649,6 +1659,23 @@ class SetOperationCacheManager: def dim_max(self, set, *args): return self.op(set, "dim_max", set.dim_max, args) + def base_index_and_length(self, set, iname, context=None): + iname_to_dim = set.space.get_var_dict() + lower_bound_pw_aff = self.dim_min(set, iname_to_dim[iname][1]) + upper_bound_pw_aff = self.dim_max(set, iname_to_dim[iname][1]) + + from loopy.isl_helpers import static_max_of_pw_aff, static_min_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + + size = pw_aff_to_expr(static_max_of_pw_aff( + upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True, + context=context)) + base_index = pw_aff_to_expr( + static_min_of_pw_aff(lower_bound_pw_aff, constants_only=False, + context=context)) + + return base_index, size + diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 92629cc160ab8c0482e8a400a117ba58680f46b5..29b2512ece0f22665d3f5ab36a75fdf6117babf9 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -432,10 +432,12 @@ def aff_to_expr(aff, except_name=None, error_on_name=None): -def pw_aff_to_expr(pw_aff): +def pw_aff_to_expr(pw_aff, int_ok=False): if isinstance(pw_aff, int): - from warnings import warn - warn("expected PwAff, got int", stacklevel=2) + if not int_ok: + from warnings import warn + warn("expected PwAff, got int", stacklevel=2) + return pw_aff pieces = pw_aff.get_pieces() diff --git a/test/test_loopy.py b/test/test_loopy.py index d9811564fdcd4d7bcd19f6e623fbf55cc3835541..2c447e9cffcad0dfb44176934363479075c852d9 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -92,14 +92,15 @@ def test_stencil(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "{[i,j]: 0<= i,j < 32}", [ - "[i] <float32> z[i,j] = -2*a[i,j]" + "[i] z[i,j] = -2*a[i,j]" " + a[i,j-1]" " + a[i,j+1]" " + a[i-1,j]" " + a[i+1,j]" ], [ - lp.GlobalArg("a", np.float32, shape=(32,32,)) + lp.GlobalArg("a", np.float32, shape=(32,32,)), + lp.GlobalArg("z", np.float32, shape=(32,32,)) ]) @@ -305,13 +306,13 @@ def test_empty_reduction(ctx_factory): def test_nested_dependent_reduction(ctx_factory): - dtype = np.dtype(np.float32) + dtype = np.dtype(np.int32) ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel(ctx.devices[0], [ - "{[i]: 0<=i<20}", + "{[i]: 0<=i<n}", "{[j]: 0<=j<i+sumlen}" ], [ @@ -319,14 +320,21 @@ def test_nested_dependent_reduction(ctx_factory): "a[i] = sum(j, j)", ], [ - lp.GlobalArg("a", dtype, (20,)), - lp.GlobalArg("l", np.int32, (20,)), + lp.ScalarArg("n", np.int32), + lp.GlobalArg("a", dtype, ("n",)), + lp.GlobalArg("l", np.int32, ("n",)), ]) cknl = lp.CompiledKernel(ctx, knl) - cknl.print_code() - evt, (a,) = cknl(queue) + n = 330 + l = np.arange(n, dtype=np.int32) + evt, (a,) = cknl(queue, l=l, n=n, out_host=True) + + tgt_result = (2*l-1)*2*l/2 + assert (a == tgt_result).all() + + @@ -442,7 +450,7 @@ def test_dependent_loop_bounds_3(ctx_factory): -def test_independent_multi_domains(ctx_factory): +def test_independent_multi_domain(ctx_factory): dtype = np.dtype(np.float32) ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -453,26 +461,30 @@ def test_independent_multi_domains(ctx_factory): "{[j]: 0<=j<n}", ], [ - "a[i,j] = 1", + "a[i] = 1", + "b[j] = 2", ], [ - lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), + lp.GlobalArg("a", dtype, shape=("n"), order="C"), + lp.GlobalArg("b", dtype, shape=("n"), order="C"), lp.ScalarArg("n", np.int32), ]) knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") - knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", - inner_tag="l.1") + knl = lp.split_dimension(knl, "j", 16, outer_tag="g.0", + inner_tag="l.0") assert knl.parents_per_domain() == 2*[None] n = 50 cknl = lp.CompiledKernel(ctx, knl) - evt, (a,) = cknl(queue, n=n, out_host=True) + evt, (a, b) = cknl(queue, n=n, out_host=True) - assert a.shape == (50, 50) + assert a.shape == (50,) + assert b.shape == (50,) assert (a == 1).all() + assert (b == 2).all()