diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 840b2df418d694fcf4bce0c080282bc5ee782b7c..e657beecbc5453ae5b2390da5a958d2fc9a70771 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -60,6 +60,8 @@ def dump_space(ls): for dt in dim_type.names) +# {{{ make_slab + def make_slab(space, iname, start, stop): zero = isl.Aff.zero_on_domain(space) @@ -114,6 +116,8 @@ def make_slab_from_bound_pwaffs(space, iname, lbound, ubound): & iname_pwaff.le_set(ubound)) +# }}} + def iname_rel_aff(space, iname, rel, aff): """*aff*'s domain space is allowed to not match *space*.""" @@ -138,6 +142,8 @@ def iname_rel_aff(space, iname, rel, aff): raise ValueError("unknown value of 'rel': %s" % rel) +# {{{ static_*_of_pw_aff + def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context): if context is not None: context = isl.align_spaces(context, pw_aff.get_domain_space(), @@ -214,6 +220,10 @@ def static_value_of_pw_aff(pw_aff, constants_only, context=None): return static_extremum_of_pw_aff(pw_aff, constants_only, isl.PwAff.eq_set, "value", context) +# }}} + + +# {{{ duplicate_axes def duplicate_axes(isl_obj, duplicate_inames, new_inames): if isinstance(isl_obj, list): @@ -262,6 +272,8 @@ def duplicate_axes(isl_obj, duplicate_inames, new_inames): return moved_dims.intersect(more_dims) +# }}} + def is_nonnegative(expr, over_set): space = over_set.get_space() @@ -276,6 +288,8 @@ def is_nonnegative(expr, over_set): return over_set.intersect(expr_neg_set).is_empty() +# {{{ convexify + def convexify(domain): """Try a few ways to get *domain* to be a BasicSet, i.e. explicitly convex. @@ -312,6 +326,10 @@ def convexify(domain): print(" %s" % (isl.Set.from_basic_set(dbs).gist(domain))) raise NotImplementedError("Could not find convex representation of set") +# }}} + + +# {{{ boxify def boxify(cache_manager, domain, box_inames, context): var_dict = domain.get_var_dict(dim_type.set) @@ -357,6 +375,8 @@ def boxify(cache_manager, domain, box_inames, context): return convexify(result) +# }}} + def simplify_via_aff(expr): from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies @@ -512,6 +532,8 @@ def dim_max_with_elimination(obj, idx): # }}} +# {{{ get_simple_strides + def get_simple_strides(bset, key_by="name"): """Return a dictionary from inames to strides in bset. Each stride is returned as a :class:`islpy.Val`. If no stride can be determined, the @@ -570,4 +592,29 @@ def get_simple_strides(bset, key_by="name"): return result +# }}} + + +# {{{{ find_max_of_pwaff_with_params + +def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): + if n_allowed_params is None: + return pw_aff + + extra_dim_idx = pw_aff.dim(dim_type.param,) + pw_aff = pw_aff.add_dims(dim_type.param, 1) + + zero = isl.Aff.zero_on_domain(pw_aff.domain().space) + extra_dim = zero.set_coefficient_val(dim_type.param, extra_dim_idx, 1) + + pw_aff_set = pw_aff.eq_set(extra_dim) + + pw_aff_set = pw_aff_set.move_dims( + dim_type.set, 0, dim_type.param, n_allowed_params, + pw_aff_set.dim(dim_type.param) - n_allowed_params) + + return pw_aff_set.dim_max(pw_aff_set.dim(dim_type.set)-1) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 83119d770257790eddccb2189167213ed82d6221..d618d4b0dd9adc0ccb50827dce914571538a62a4 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1140,18 +1140,36 @@ class LoopKernel(RecordWithoutPickling): lines.append("INSTRUCTIONS:") loop_list_width = 35 - printed_insn_ids = set() + # {{{ topological sort - Fore = self.options._fore - Style = self.options._style + printed_insn_ids = set() + printed_insn_order = [] - def print_insn(insn): + def insert_insn_into_order(insn): if insn.id in printed_insn_ids: return printed_insn_ids.add(insn.id) for dep_id in sorted(insn.depends_on): - print_insn(kernel.id_to_insn[dep_id]) + insert_insn_into_order(kernel.id_to_insn[dep_id]) + + printed_insn_order.append(insn) + + for insn in kernel.instructions: + insert_insn_into_order(insn) + + # }}} + + import loopy as lp + + Fore = self.options._fore + Style = self.options._style + + from loopy.kernel.tools import draw_dependencies_as_unicode_arrows + for insn, (arrows, extender) in zip( + printed_insn_order, + draw_dependencies_as_unicode_arrows( + printed_insn_order, fore=Fore, style=Style)): if isinstance(insn, lp.MultiAssignmentBase): lhs = ", ".join(str(a) for a in insn.assignees) @@ -1206,13 +1224,15 @@ class LoopKernel(RecordWithoutPickling): core = Fore.MAGENTA+rhs+Style.RESET_ALL if len(loop_list) > loop_list_width: - lines.append("[%s]" % loop_list) - lines.append("%s%s # %s" % ( + lines.append("%s[%s]" % (arrows, loop_list)) + lines.append("%s%s%s # %s" % ( + extender, (loop_list_width+2)*" ", core, ", ".join(options))) else: - lines.append("[%s]%s%s # %s" % ( + lines.append("%s[%s]%s%s # %s" % ( + arrows, loop_list, " "*(loop_list_width-len(loop_list)), core, ",".join(options))) @@ -1223,10 +1243,6 @@ class LoopKernel(RecordWithoutPickling): lines.append(10*" " + "if (%s)" % " && ".join( [str(x) for x in insn.predicates])) - import loopy as lp - for insn in kernel.instructions: - print_insn(insn) - dep_lines = [] for insn in kernel.instructions: if insn.depends_on: diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 921d2538a5068a8c309fe9f412aba588aa6243f2..c54d1fc329a3a8797b17458dc40e489044e9374a 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -364,7 +364,7 @@ class InstructionBase(Record): raise RuntimeError("unexpected value for Instruction.boostable") if self.depends_on: - result.append("deps="+":".join(self.depends_on)) + result.append("dep="+":".join(self.depends_on)) if self.no_sync_with: result.append("nosync="+":".join(self.no_sync_with)) if self.groups: diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 54236efca651e676745c4198cf509b019b95c084..7e9bd549fede6abf6d4d5db99896063b34246793 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1,3 +1,4 @@ +# coding=utf-8 """Operations on the kernel object.""" from __future__ import division, absolute_import, print_function @@ -322,7 +323,13 @@ class SetOperationCacheManager: from loopy.isl_helpers import dim_max_with_elimination return self.op(set, "dim_max", dim_max_with_elimination, args) - def base_index_and_length(self, set, iname, context=None): + def base_index_and_length(self, set, iname, context=None, + n_allowed_params_in_length=None): + """ + :arg n_allowed_params_in_length: Simplifies the 'length' + argument so that only the first that many params + (in the domain of *set*) occur. + """ if not isinstance(iname, int): iname_to_dim = set.space.get_var_dict() idx = iname_to_dim[iname][1] @@ -336,7 +343,8 @@ class SetOperationCacheManager: from loopy.isl_helpers import ( static_max_of_pw_aff, static_min_of_pw_aff, - static_value_of_pw_aff) + static_value_of_pw_aff, + find_max_of_pwaff_with_params) from loopy.symbolic import pw_aff_to_expr # {{{ first: try to find static lower bound value @@ -351,11 +359,14 @@ class SetOperationCacheManager: if base_index_aff is not None: base_index = pw_aff_to_expr(base_index_aff) - size = pw_aff_to_expr(static_max_of_pw_aff( - upper_bound_pw_aff - base_index_aff + 1, constants_only=False, + length = find_max_of_pwaff_with_params( + upper_bound_pw_aff - base_index_aff + 1, + n_allowed_params_in_length) + length = pw_aff_to_expr(static_max_of_pw_aff( + length, constants_only=False, context=context)) - return base_index, size + return base_index, length # }}} @@ -367,11 +378,14 @@ class SetOperationCacheManager: base_index = pw_aff_to_expr(base_index_aff) - size = pw_aff_to_expr(static_max_of_pw_aff( - upper_bound_pw_aff - base_index_aff + 1, constants_only=False, + length = find_max_of_pwaff_with_params( + upper_bound_pw_aff - base_index_aff + 1, + n_allowed_params_in_length) + length = pw_aff_to_expr(static_max_of_pw_aff( + length, constants_only=False, context=context)) - return base_index, size + return base_index, length # }}} @@ -1083,4 +1097,139 @@ def guess_var_shape(kernel, var_name): # }}} + +# {{{ find_recursive_dependencies + +def find_recursive_dependencies(kernel, insn_ids): + queue = list(insn_ids) + + result = set(insn_ids) + + while queue: + new_queue = [] + + for insn_id in queue: + insn = kernel.id_to_insn[insn_id] + additionals = insn.depends_on - result + result.update(additionals) + new_queue.extend(additionals) + + queue = new_queue + + return result + +# }}} + + +# {{{ find_reverse_dependencies + +def find_reverse_dependencies(kernel, insn_ids): + """Finds a set of IDs of instructions that depend on one of the insn_ids. + + :arg insn_ids: a set of instruction IDs + """ + return frozenset( + insn.id + for insn in kernel.instructions + if insn.depends_on & insn_ids) + +# }}} + + +# {{{ draw_dependencies_as_unicode_arrows + +def draw_dependencies_as_unicode_arrows( + instructions, fore, style, flag_downward=True): + """ + :arg instructions: an ordered iterable of :class:`loopy.InstructionBase` + instances + :arg fore: if given, will be used like a :mod:`colorama` ``Fore`` object + to color-code dependencies. (E.g. red for downward edges) + :returns: A list of tuples (arrows, extender) with Unicode-drawn dependency + arrows, one per entry of *instructions*. *extender* can be used to + extend arrows below the line of an instruction. + """ + reverse_deps = {} + + for insn in instructions: + for dep in insn.depends_on: + reverse_deps.setdefault(dep, []).append(insn.id) + + # mapping of (from_id, to_id) tuples to column_index + dep_to_column = {} + + # {{{ find column assignments + + # mapping from column indices to (end_insn_id, updown) + columns_in_use = {} + + n_columns = [0] + + def find_free_column(): + i = 0 + while i in columns_in_use: + i += 1 + if i+1 > n_columns[0]: + n_columns[0] = i+1 + row.append(" ") + return i + + def do_flag_downward(s, updown): + if flag_downward and updown == "down": + return fore.RED+s+style.RESET_ALL + else: + return s + + def make_extender(): + result = n_columns[0] * [" "] + for col, (_, updown) in six.iteritems(columns_in_use): + result[col] = do_flag_downward("│", updown) + + return result + + rows = [] + for insn in instructions: + row = make_extender() + + for rdep in reverse_deps.get(insn.id, []): + assert rdep != insn.id + + dep_key = (rdep, insn.id) + if dep_key not in dep_to_column: + col = dep_to_column[dep_key] = find_free_column() + columns_in_use[col] = (rdep, "up") + row[col] = "↱" + + for dep in insn.depends_on: + assert dep != insn.id + dep_key = (insn.id, dep) + if dep_key not in dep_to_column: + col = dep_to_column[dep_key] = find_free_column() + columns_in_use[col] = (dep, "down") + row[col] = do_flag_downward("┌", "down") + + for col, (end, updown) in list(six.iteritems(columns_in_use)): + if insn.id == end: + del columns_in_use[col] + if updown == "up": + row[col] = "└" + else: + row[col] = do_flag_downward("↳", updown) + + extender = make_extender() + + rows.append(("".join(row), "".join(extender))) + + # }}} + + def extend_to_uniform_length(s): + return s + " "*(n_columns[0]-len(s)) + + return [ + (extend_to_uniform_length(row), + extend_to_uniform_length(extender)) + for row, extender in rows] + +# }}} + # vim: foldmethod=marker diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index 38e35a94190bff49e537de873f0278d4d8b7b38b..3c7bfed43b9bd02a4be3d71b2317cee94da75b4b 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -162,9 +162,12 @@ def build_global_storage_to_sweep_map(kernel, access_descriptors, # {{{ compute storage bounds def find_var_base_indices_and_shape_from_inames( - domain, inames, cache_manager, context=None): + domain, inames, cache_manager, context=None, + n_allowed_params_in_shape=None): base_indices_and_sizes = [ - cache_manager.base_index_and_length(domain, iname, context) + cache_manager.base_index_and_length( + domain, iname, context, + n_allowed_params_in_length=n_allowed_params_in_shape) for iname in inames] return list(zip(*base_indices_and_sizes)) @@ -183,7 +186,8 @@ def compute_bounds(kernel, domain, stor2sweep, return find_var_base_indices_and_shape_from_inames( storage_domain, [saxis+"'" for saxis in storage_axis_names], - kernel.cache_manager, context=kernel.assumptions) + kernel.cache_manager, context=kernel.assumptions, + n_allowed_params_in_shape=stor2sweep.dim(isl.dim_type.param)) # }}} diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 27fad67f8838622d24e276defc7b8c66eaf11306..5ab9dfab3c8ac0669c3e7eaf4091bb3ab4b0e2a2 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -136,7 +136,8 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): access_descriptors, array_base_map, storage_axis_names, storage_axis_sources, non1_storage_axis_names, - temporary_name, compute_insn_id): + temporary_name, compute_insn_id, compute_dep_id, + compute_read_variables): super(RuleInvocationReplacer, self).__init__(rule_mapping_context) self.subst_name = subst_name @@ -152,6 +153,10 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): self.temporary_name = temporary_name self.compute_insn_id = compute_insn_id + self.compute_dep_id = compute_dep_id + + self.compute_read_variables = compute_read_variables + self.compute_insn_depends_on = set() def map_substitution(self, name, tag, arguments, expn_state): if not ( @@ -219,6 +224,8 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): def map_kernel(self, kernel): new_insns = [] + excluded_insn_ids = set([self.compute_insn_id, self.compute_dep_id]) + for insn in kernel.instructions: self.replaced_something = False @@ -228,7 +235,17 @@ class RuleInvocationReplacer(RuleAwareIdentityMapper): insn = insn.copy( depends_on=( insn.depends_on - | frozenset([self.compute_insn_id]))) + | frozenset([self.compute_dep_id]))) + + for dep in insn.depends_on: + if dep in excluded_insn_ids: + continue + + dep_insn = kernel.id_to_insn[dep] + if (frozenset(dep_insn.assignee_var_names()) + & self.compute_read_variables): + self.compute_insn_depends_on.update( + insn.depends_on - excluded_insn_ids) new_insns.append(insn) @@ -777,25 +794,84 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, expression=compute_expression, # within_inames determined below ) + compute_dep_id = compute_insn_id + added_compute_insns = [compute_insn] + + if temporary_scope == temp_var_scope.GLOBAL: + barrier_insn_id = kernel.make_unique_instruction_id( + based_on=c_subst_name+"_b") + from loopy.kernel.instruction import BarrierInstruction + barrier_insn = BarrierInstruction( + id=barrier_insn_id, + depends_on=frozenset([compute_insn_id]), + kind="global") + compute_dep_id = barrier_insn_id + + added_compute_insns.append(barrier_insn) # }}} # {{{ substitute rule into expressions in kernel (if within footprint) + from loopy.symbolic import SubstitutionRuleExpander + expander = SubstitutionRuleExpander(kernel.substitutions) + invr = RuleInvocationReplacer(rule_mapping_context, subst_name, subst_tag, within, access_descriptors, abm, storage_axis_names, storage_axis_sources, non1_storage_axis_names, - temporary_name, compute_insn_id) + temporary_name, compute_insn_id, compute_dep_id, + compute_read_variables=get_dependencies(expander(compute_expression))) kernel = invr.map_kernel(kernel) kernel = kernel.copy( - instructions=[compute_insn] + kernel.instructions) + instructions=added_compute_insns + kernel.instructions) kernel = rule_mapping_context.finish_kernel(kernel) # }}} + # {{{ add dependencies to compute insn + + kernel = kernel.copy( + instructions=[ + insn.copy(depends_on=frozenset(invr.compute_insn_depends_on)) + if insn.id == compute_insn_id + else insn + for insn in kernel.instructions]) + + # }}} + + # {{{ propagate storage iname subst to dependencies of compute instructions + + from loopy.kernel.tools import find_recursive_dependencies + compute_deps = find_recursive_dependencies( + kernel, frozenset([compute_insn_id])) + + # FIXME: Need to verify that there are no outside dependencies + # on compute_deps + + prior_storage_axis_names = frozenset(storage_axis_subst_dict) + + new_insns = [] + for insn in kernel.instructions: + if (insn.id in compute_deps + and insn.within_inames & prior_storage_axis_names): + insn = (insn + .with_transformed_expressions( + lambda expr: expr_subst_map(expr, kernel, insn)) + .copy(within_inames=frozenset( + storage_axis_subst_dict.get(iname, var(iname)).name + for iname in insn.within_inames))) + + new_insns.append(insn) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + # {{{ determine inames for compute insn if precompute_outer_inames is None: diff --git a/test/test_reduction.py b/test/test_reduction.py index 68f6242440a14eeeb2144762f0d5175f2135ffa2..b78509b6318a984d117d00b1a6854d9611db80d1 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -214,18 +214,19 @@ def test_local_parallel_reduction(ctx_factory, size): lp.auto_test_vs_ref(ref_knl, ctx, knl) -# FIXME: Make me a test @pytest.mark.parametrize("size", [10000]) -def no_test_global_parallel_reduction(ctx_factory, size): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) +def test_global_parallel_reduction(ctx_factory, size): + # ctx = ctx_factory() + # queue = cl.CommandQueue(ctx) knl = lp.make_kernel( "{[i]: 0 <= i < n }", """ - <> key = make_uint2(i, 324830944) {inames=i} - <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} - <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} + for i + <> key = make_uint2(i, 324830944) {inames=i} + <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr} + <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr} + end z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3) """) @@ -238,12 +239,11 @@ def no_test_global_parallel_reduction(ctx_factory, size): knl = lp.split_reduction_inward(knl, "i_inner_outer") from loopy.transform.data import reduction_arg_to_subst_rule knl = reduction_arg_to_subst_rule(knl, "i_outer") - knl = lp.precompute(knl, "red_i_outer_arg", "i_outer") - print(knl) - 1/0 + knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", + temporary_scope=lp.temp_var_scope.GLOBAL) knl = lp.realize_reduction(knl) - evt, (z,) = knl(queue, n=size) + #evt, (z,) = knl(queue, n=size) #lp.auto_test_vs_ref(ref_knl, ctx, knl)