From 76de8414ab14869c7752ead23ecdcfef334fc421 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 9 Aug 2012 11:29:32 -0400 Subject: [PATCH] More work towards getting multi-domain to work. --- MEMO | 5 -- loopy/__init__.py | 10 ++-- loopy/codegen/__init__.py | 2 +- loopy/codegen/loop.py | 16 +++--- loopy/compiled.py | 6 +-- loopy/creation.py | 2 +- loopy/kernel.py | 100 +++++++++++++++++++++++++------------- test/test_loopy.py | 7 +-- 8 files changed, 89 insertions(+), 59 deletions(-) diff --git a/MEMO b/MEMO index 4ba101b76..34286cc94 100644 --- a/MEMO +++ b/MEMO @@ -54,8 +54,6 @@ To-do - Kernel splitting (via what variables get computed in a kernel) -- test_loopy.py: test_empty_reduction - - What if no universally valid precompute base index expression is found? (test_intel_matrix_mul with n = 6*16, e.g.?) @@ -63,9 +61,6 @@ To-do - Expose iname-duplicate-and-rename as a primitive. -- Allow parameters to be varying during run-time, substituting values - that depend on other inames? 
- - Fix all tests - Scalar insn priority diff --git a/loopy/__init__.py b/loopy/__init__.py index aac8bc674..fe7f8859b 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -79,10 +79,10 @@ def split_dimension(kernel, split_iname, inner_length, if inner_iname is None: inner_iname = split_iname+"_inner" - outer_var_nr = kernel.space.dim(dim_type.set) - inner_var_nr = kernel.space.dim(dim_type.set)+1 - def process_set(s): + outer_var_nr = s.dim(dim_type.set) + inner_var_nr = s.dim(dim_type.set)+1 + s = s.add_dims(dim_type.set, 2) s = s.set_dim_name(dim_type.set, outer_var_nr, outer_iname) s = s.set_dim_name(dim_type.set, inner_var_nr, inner_iname) @@ -102,7 +102,7 @@ def split_dimension(kernel, split_iname, inner_length, .eliminate(name_dim_type, name_idx, 1) .remove_dims(name_dim_type, name_idx, 1)) - new_domain = process_set(kernel.domain) + new_domains = [process_set(dom) for dom in kernel.domains] from pymbolic import var inner = var(inner_iname) @@ -144,7 +144,7 @@ def split_dimension(kernel, split_iname, inner_length, iname_slab_increments[outer_iname] = slabs result = (kernel .map_expressions(subst_mapper, exclude_instructions=True) - .copy(domain=new_domain, + .copy(domains=new_domains, iname_slab_increments=iname_slab_increments, instructions=new_insns, applied_iname_rewrites=applied_iname_rewrites, diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 142a78fa5..c7135e550 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -338,7 +338,7 @@ def generate_code(kernel, with_annotation=False, for arg in kernel.args: seen_dtypes.add(arg.dtype) - for tv in kernel.temporary_variables: + for tv in kernel.temporary_variables.itervalues(): seen_dtypes.add(tv.dtype) preambles = kernel.preambles[:] diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 0267e7f20..d37a720ca 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -8,9 +8,7 @@ from loopy.codegen.control import build_loop_nest -def 
get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain): - iname_domain = kernel.get_inames_domain(iname) - +def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain, iname_domain): from loopy.codegen.bounds import get_bounds_constraints, get_defined_inames lower_constraints_orig, upper_constraints_orig, equality_constraints_orig = \ get_bounds_constraints(iname_domain, iname, @@ -34,8 +32,13 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain): # {{{ conditional-minimizing slab decomposition def get_slab_decomposition(kernel, iname, sched_index, codegen_state): + iname_domain = kernel.get_inames_domain(iname) + + if iname_domain.is_empty(): + return () + lb_cns_orig, ub_cns_orig = get_simple_loop_bounds(kernel, sched_index, iname, - codegen_state.implemented_domain) + codegen_state.implemented_domain, iname_domain) space = lb_cns_orig.space @@ -192,12 +195,13 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left= result = [] bounds = kernel.get_iname_bounds(iname) + domain = kernel.get_inames_domain(iname) from loopy.isl_helpers import make_slab from loopy.isl_helpers import static_value_of_pw_aff lower_bound = static_value_of_pw_aff(bounds.lower_bound_pw_aff, constants_only=False) - slab = make_slab(kernel.space, iname, + slab = make_slab(domain.get_space(), iname, lower_bound, lower_bound+hw_axis_size) codegen_state = codegen_state.intersect(slab) @@ -219,7 +223,7 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left= if len(slabs) == 1: cmt = None - new_kernel = kernel.copy(domain=kernel.domain & slab) + new_kernel = kernel.copy(domain=domain & slab) inner = set_up_hw_parallel_loops( new_kernel, sched_index, codegen_state, hw_inames_left) result.append(add_comment(cmt, inner)) diff --git a/loopy/compiled.py b/loopy/compiled.py index 187c17050..469db3217 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -157,7 +157,7 @@ class CompiledKernel: 
args = [] outputs = [] - encountered_non_numpy = False + encountered_numpy = False kwargs_copy = kwargs.copy() @@ -172,7 +172,7 @@ class CompiledKernel: # synchronous, so nothing to worry about val = cl_array.to_device(queue, val, allocator=allocator) elif val is not None: - encountered_non_numpy = True + encountered_numpy = True if val is None: if not is_written: @@ -209,7 +209,7 @@ class CompiledKernel: *args, g_times_l=True, wait_for=wait_for) - if out_host is None and not encountered_non_numpy: + if out_host is None and encountered_numpy: out_host = True if out_host: outputs = [o.get() for o in outputs] diff --git a/loopy/creation.py b/loopy/creation.py index f2a65064e..c4e44f1ff 100644 --- a/loopy/creation.py +++ b/loopy/creation.py @@ -16,7 +16,7 @@ def check_for_nonexistent_iname_deps(knl): set(insn.forced_iname_deps)-knl.all_inames()))) def check_for_multiple_writes_to_loop_bounds(knl): - from isl import dim_type + from islpy import dim_type domain_parameters = set() for dom in knl.domains: diff --git a/loopy/kernel.py b/loopy/kernel.py index b8b1b4b88..8d81d2aa0 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -115,14 +115,14 @@ def parse_tag(tag): # {{{ arguments class _ShapedArg(object): - def __init__(self, name, dtype, strides=None, shape=None, order="C", + def __init__(self, name, dtype, shape=None, strides=None, order="C", offset=0): """ All of the following are optional. Specify either strides or shape. + :arg shape: :arg strides: like numpy strides, but in multiples of data type size - :arg shape: :arg order: :arg offset: Offset from the beginning of the vector from which the strides are counted. @@ -707,8 +707,6 @@ class LoopKernel(Record): return result - # {{{ instruction parser - def parse_insn(insn): insn_match = INSN_RE.match(insn) subst_match = SUBST_RE.match(insn) @@ -949,7 +947,11 @@ class LoopKernel(Record): tree to the root. 
""" - domain_parents = [] + # The stack of iname sets records which inames are active + # as we step through the linear list of domains. It also + # determines the granularity of inames to be popped/deactivated + # if we ascend a level. + iname_set_stack = [] result = [] @@ -957,6 +959,11 @@ class LoopKernel(Record): parameters = set(dom.get_var_names(dim_type.param)) inames = set(dom.get_var_names(dim_type.set)) + # This next domain may be nested inside the previous domain. + # Or it may not, in which case we need to figure out how many + # levels of parents we need to discard in order to find the + # true parent. + discard_level_count = 0 while discard_level_count < len(iname_set_stack): last_inames = iname_set_stack[-1-discard_level_count] @@ -969,25 +976,17 @@ class LoopKernel(Record): if discard_level_count: iname_set_stack = iname_set_stack[:-discard_level_count] - if domain_parents: + if result: parent = len(result)-1 else: parent = None for i in range(discard_level_count): assert parent is not None - parent = domain_parents[parent] + parent = result[parent] # found this domain's parent - domain_parents.append(parent) - - # keep walking up tree to make result - dom_result = [] - while parent is not None: - dom_result.insert(0, parent) - parent = domain_parents[parent] - - result.append(dom_result) + result.append(parent) if iname_set_stack: parent_inames = iname_set_stack[-1] @@ -997,6 +996,29 @@ class LoopKernel(Record): return result + @memoize_method + def all_parents_per_domain(self): + """Return a list corresponding to self.domains (by index) + containing domain indices which are nested around this + domain. + + Each domain's nest list walks from the leaves of the nesting + tree to the root. 
+ """ + result = [] + + ppd = self.parents_per_domain() + for dom, parent in zip(self.domains, ppd): + # keep walking up tree to find *all* parents + dom_result = [] + while parent is not None: + dom_result.insert(0, parent) + parent = ppd[parent] + + result.append(dom_result) + + return result + @memoize_method def _get_home_domain_map(self): return dict( @@ -1009,7 +1031,12 @@ class LoopKernel(Record): @memoize_method def combine_domains(self, domains): - assert isinstance(domains, frozenset) # for caching + """ + :arg domains: domain indices of domains to be combined. More 'dominant' + domains (those which get most say on the actual dim_type of an iname) + must be later in the order. + """ + assert isinstance(domains, tuple) # for caching result = None assert domains @@ -1018,22 +1045,27 @@ class LoopKernel(Record): if result is None: result = dom else: - aligned_result, aligned_dom = isl.align_two(result, dom) + aligned_dom, aligned_result = isl.align_two( + dom, result, across_dim_types=True) result = aligned_result & aligned_dom return result - def get_effective_domain(self, domain_index): - return self.combine_domains( - frozenset([domain_index] - + self.get_parents_per_domain()[domain_index])) - def get_inames_domain(self, inames): if isinstance(inames, str): - inames = [inames] + inames = frozenset([inames]) + if not isinstance(inames, frozenset): + inames = frozenset(inames) + + from warnings import warn + warn("get_inames_domain did not get a frozenset", stacklevel=2) + return self._get_inames_domain_backend(inames) + + @memoize_method + def _get_inames_domain_backend(self, inames): hdm = self._get_home_domain_map() - ppd = self.parents_per_domain() + ppd = self.all_parents_per_domain() domain_indices = set() for iname in inames: @@ -1041,7 +1073,7 @@ class LoopKernel(Record): domain_indices.add(home_domain_index) domain_indices.update(ppd[home_domain_index]) - return self.combine_domains(frozenset(domain_indices)) + return 
self.combine_domains(tuple(sorted(domain_indices))) # }}} @@ -1197,18 +1229,20 @@ class LoopKernel(Record): @memoize_method def get_iname_bounds(self, iname): + domain = self.get_inames_domain(frozenset([iname])) + d_var_dict = domain.get_var_dict() + dom_intersect_assumptions = ( - isl.align_spaces(self.assumptions, self.domain) - & self.domain) + isl.align_spaces(self.assumptions, domain) & domain) lower_bound_pw_aff = ( self.cache_manager.dim_min( dom_intersect_assumptions, - self.iname_to_dim[iname][1]) + d_var_dict[iname][1]) .coalesce()) upper_bound_pw_aff = ( self.cache_manager.dim_max( dom_intersect_assumptions, - self.iname_to_dim[iname][1]) + d_var_dict[iname][1]) .coalesce()) class BoundsRecord(Record): @@ -1385,7 +1419,7 @@ class LoopKernel(Record): # {{{ examine domains for i_dom, (dom, parent_indices) in enumerate( - zip(self.domains, self.parents_per_domain())): + zip(self.domains, self.all_parents_per_domain())): for parent_index in parent_indices: for iname in dom.get_var_names(dim_type.set): parent = self.domains[parent_index] @@ -1420,8 +1454,8 @@ class LoopKernel(Record): lines.append(sep) lines.append("DOMAINS:") - for dom, parents in zip(self.domains, self.parents_per_domain()): - lines.append(str(dom)) + for dom, parents in zip(self.domains, self.all_parents_per_domain()): + lines.append(len(parents)*" " + str(dom)) if self.substitutions: lines.append(sep) diff --git a/test/test_loopy.py b/test/test_loopy.py index 015468497..7ade22467 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -294,13 +294,12 @@ def test_empty_reduction(ctx_factory): [ lp.GlobalArg("a", dtype, (20,)), ]) - print knl - cknl = lp.CompiledKernel(ctx, knl) - cknl.print_code() evt, (a,) = cknl(queue) + assert (a.get() == 0).all() + @@ -323,8 +322,6 @@ def test_nested_dependent_reduction(ctx_factory): lp.GlobalArg("a", dtype, (20,)), lp.GlobalArg("l", np.int32, (20,)), ]) - print knl - 1/0 cknl = lp.CompiledKernel(ctx, knl) cknl.print_code() -- GitLab