From 7a208d16410deaceef80a566e1f5c0c02bdaa37d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 8 Mar 2017 22:10:35 -0600 Subject: [PATCH] [ci skip] Three level scan: sort of working version. --- loopy/preprocess.py | 62 ++++++++++++++++++-------- loopy/schedule/__init__.py | 2 +- loopy/transform/reduction.py | 86 +++++++++++++++++++++++++++++------- loopy/transform/save.py | 55 +++++++++++++++++------ test/test_scan.py | 85 ++++++++++++++++++++++++++++++++--- 5 files changed, 235 insertions(+), 55 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ef49faa33..e32ad719d 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -363,7 +363,7 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): dim_type = isl.dim_type orig_domain = kernel.get_inames_domain( - (scan_param.sweep_iname, scan_param.scan_iname)) + frozenset((scan_param.sweep_iname, scan_param.scan_iname))) domain = _move_set_to_param_dims_except(orig_domain, (scan_param.sweep_iname, scan_param.scan_iname)) @@ -443,7 +443,7 @@ def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname)) except ValueError as v: - raise ValueError("Couldn't determine bounds for scan: %s" % e) + raise ValueError("Couldn't determine bounds for scan: %s" % v) try: stride = _try_infer_scan_stride( @@ -506,16 +506,21 @@ def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname): - # FIXME: use home domain of scan_iname... - domain = kernel.get_inames_domain((sweep_iname, scan_iname)) + domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname))) domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname)) - domain = domain.gist_params(domain.params()).project_out_except( - (sweep_iname,), (isl.dim_type.param,)) + var_dict = domain.get_var_dict() + sweep_idx = var_dict[sweep_iname][1] + scan_idx = var_dict[scan_iname][1] - sweep_lower_bound = domain.dim_min(domain.get_var_dict()[sweep_iname][1]) - sweep_upper_bound = domain.dim_max(domain.get_var_dict()[sweep_iname][1]) - scan_lower_bound = domain.dim_min(domain.get_var_dict()[scan_iname][1]) + domain = domain.gist_params(domain.params()) + + try: + sweep_lower_bound = domain.dim_min(sweep_idx) + sweep_upper_bound = domain.dim_max(sweep_idx) + scan_lower_bound = domain.dim_min(scan_idx) + except isl.Error as e: + raise ValueError("isl error: %s" % e) return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound) @@ -867,7 +872,11 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): - dependent_inames = frozenset(subdomain.get_var_names(isl.dim_type.param)) + # Intersect with inames, because we could have captured some kernel params + # in here too.. + dependent_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) idx, = kernel.get_leaf_domain_indices(dependent_inames) domains.insert(idx + 1, subdomain) @@ -1165,7 +1174,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, @memoize def get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride): - domain = kernel.get_inames_domain((scan_iname, sweep_iname)) + domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname))) tracking_iname = var_name_gen( "{scan_iname}_tracking_{sweep_iname}" @@ -1176,9 +1185,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, new_domain = _create_domain_for_sweep_tracking(domain, tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) - _insert_subdomain_into_domain_tree(kernel, domains, new_domain) + from loopy.kernel.tools import DomainChanger + domain_idx, = temp_kernel.get_leaf_domain_indices(frozenset([sweep_iname])) + + orig_domain = domains[domain_idx] + new_domain = isl.align_spaces(new_domain, domains[domain_idx], + obj_bigger_ok=True, + across_dim_types=True) + orig_domain = isl.align_spaces(orig_domain, new_domain) - return tracking_iname, new_domain + orig_domain &= new_domain + + domains[domain_idx] = orig_domain + + return tracking_iname def replace_var_within_expr(expr, from_var, to_var): from pymbolic.mapper.substitutor import make_subst_func @@ -1223,7 +1243,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, outer_insn_inames = temp_kernel.insn_inames(insn) inames_to_remove.add(scan_iname) - track_iname, track_iname_domain = ( + track_iname = ( get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride)) @@ -1303,7 +1323,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # TODO: rename red_iname = scan_iname - size = _get_int_iname_size(sweep_iname) + scan_size = _get_int_iname_size(sweep_iname) + + assert scan_size > 0 + + if scan_size == 1: + raise NotImplementedError("tell matt to fix this") outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1324,7 +1349,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _get_int_iname_size(oiname) for oiname in outer_local_inames) - track_iname, track_iname_domain = get_or_add_sweep_tracking_iname_and_domain( + track_iname = get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride) # {{{ add separate iname to carry out the scan @@ -1333,7 +1358,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # on our red_iname. base_exec_iname = var_name_gen("scan_"+sweep_iname) - domains.append(_make_slab_set(base_exec_iname, size)) + domains.append(_make_slab_set(base_exec_iname, scan_size)) new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname] # }}} @@ -1359,7 +1384,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, acc_var_names = make_temporaries( name_based_on="acc_"+scan_iname, nvars=nresults, - shape=outer_local_iname_sizes + (size,), + shape=outer_local_iname_sizes + (scan_size,), dtypes=reduction_dtypes, scope=temp_var_scope.LOCAL) @@ -1418,7 +1443,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: return c - scan_size = size prev_id = transfer_id istage = 0 diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index c078da2ec..10a19a3c7 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -324,7 +324,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): if not may_add_to_loop_dep_map: continue - logger.debug("{knl}: loop dependency map: iname '{iname}' " + print("{knl}: loop dependency map: iname '{iname}' " "depends on '{dep_insn}' via '{insn}'" .format( knl=kernel.name, diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 2fd086912..c46c9481a 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -197,11 +197,16 @@ def _expand_subst_within_expression(kernel, expr): def _add_global_barrier(kernel, source, sink, barrier_id): from loopy.kernel.instruction import BarrierInstruction + within_inames = ( + kernel.id_to_insn[source].within_inames + & kernel.id_to_insn[sink].within_inames) + barrier_insn = BarrierInstruction( id=barrier_id, depends_on=frozenset([source]), + within_inames = within_inames, kind="global") - + updated_sink = kernel.id_to_insn[sink] updated_sink = updated_sink.copy( depends_on=updated_sink.depends_on | frozenset([barrier_id])) @@ -211,6 +216,39 @@ def _add_global_barrier(kernel, source, sink, barrier_id): return kernel +def _get_scan_level(sweep_iname): + SWEEP_RE = r"l(\d+)_.*" + + import re + match_result = re.match(SWEEP_RE, sweep_iname) + + if match_result is None: + return 0 + + return int(match_result.group(1)) + + +def _get_base_iname(iname): + BASE_INAME_RE = r"l\d+_(.*)" + + import re + match_result = re.match(BASE_INAME_RE, iname) + + if match_result is None: + return iname + + base_iname = match_result.group(1) + + MODIFIERS = ("inner_", "outer_") + + for modifier in MODIFIERS: + if base_iname.startswith(modifier): + base_iname = base_iname[len(modifier):] + break + + return base_iname + + def make_two_level_scan( kernel, insn_id, scan_iname, @@ -262,10 +300,12 @@ def make_two_level_scan( var_name_gen = kernel.get_var_name_generator() insn_id_gen = kernel.get_instruction_id_generator() - level = 0 #scan_level or try_get_scan_level(sweep_iname) + level = _get_scan_level(sweep_iname) + base_scan_iname = _get_base_iname(scan_iname) + base_sweep_iname = _get_base_iname(sweep_iname) format_kwargs = { - "insn": insn_id, "iname": scan_iname, "sweep": sweep_iname, + "insn": insn_id, "iname": base_scan_iname, "sweep": base_sweep_iname, "level": level, "next_level": level + 1, "prefix": "l"} nonlocal_storage_name = var_name_gen( @@ -273,11 +313,11 @@ def make_two_level_scan( if inner_iname is None: inner_iname = var_name_gen( - "{prefix}{level}_inner_update_{sweep}".format(**format_kwargs)) + "{prefix}{level}_inner2_{sweep}".format(**format_kwargs)) if outer_iname is None: outer_iname = var_name_gen( - "{prefix}{level}_outer_update_{sweep}".format(**format_kwargs)) + "{prefix}{level}_outer2_{sweep}".format(**format_kwargs)) nonlocal_iname = var_name_gen( "{prefix}{level}_combine_{sweep}".format(**format_kwargs)) @@ -302,11 +342,11 @@ def make_two_level_scan( if local_storage_name is None: local_storage_name = var_name_gen( - "{prefix}{next_level}_{insn}".format(**format_kwargs)) + "{prefix}{next_level}l_{insn}".format(**format_kwargs)) if nonlocal_storage_name is None: nonlocal_storage_name = var_name_gen( - "{prefix}{level}_{insn}".format(**format_kwargs)) + "{prefix}{level}nl_{insn}".format(**format_kwargs)) local_scan_insn_id = insn_id_gen( "{iname}_local_scan".format(**format_kwargs)) @@ -388,15 +428,28 @@ def make_two_level_scan( var(subst_name)(var(outer_local_iname) * inner_length + var(inner_scan_iname))) + new_inames = ["temp"] + + kernel = lp.duplicate_inames(kernel, + (sweep_iname), + within="not id:*", + new_inames=new_inames) + kernel = lp.split_iname(kernel, sweep_iname, inner_length, inner_iname=inner_iname, outer_iname=outer_iname, inner_tag=inner_tag, outer_tag=outer_tag) + kernel = lp.split_iname(kernel, new_inames[0], inner_length, + inner_iname=inner_local_iname, outer_iname=outer_local_iname, + inner_tag=inner_local_tag, outer_tag=outer_local_tag) + + """ kernel = lp.duplicate_inames(kernel, (outer_iname, inner_iname), within="not id:*", new_inames=[outer_local_iname, inner_local_iname], tags={outer_iname: outer_local_tag, inner_iname: inner_local_tag}) + """ kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) @@ -423,13 +476,17 @@ def make_two_level_scan( frozenset(all_precompute_inames) - frozenset(precompute_inames)) + insn = kernel.id_to_insn[insn_id] + + within_inames = insn.within_inames - frozenset([outer_iname, inner_iname]) + from pymbolic import var kernel = lp.precompute(kernel, [var(local_subst_name)(var(outer_iname), var(inner_iname))], sweep_inames=sweep_inames, precompute_inames=precompute_inames, storage_axes=local_storage_axes, - precompute_outer_inames=precompute_outer_inames, + precompute_outer_inames=precompute_outer_inames | within_inames, temporary_name=local_storage_name, compute_insn_id=local_scan_insn_id) @@ -458,17 +515,13 @@ def make_two_level_scan( kernel = kernel.copy(temporary_variables=new_temporary_variables) - insn = kernel.id_to_insn[insn_id] - - # XXX: should not include sweep iname? - within_inames = insn.within_inames - from loopy.kernel.instruction import make_assignment nonlocal_init_head = make_assignment( id=nonlocal_init_head_insn_id, assignees=(var(nonlocal_storage_name)[0],), expression=0, - within_inames=frozenset([outer_local_iname,inner_local_iname]), + within_inames=( + within_inames | frozenset([outer_local_iname,inner_local_iname])), predicates=frozenset([var(inner_local_iname).eq(0)]), depends_on=frozenset([local_scan_insn_id])) @@ -482,7 +535,8 @@ def make_two_level_scan( (var(outer_local_iname),var(inner_local_iname)), strip_scalar=True)], no_sync_with=frozenset([(local_scan_insn_id, "local")]), - within_inames=frozenset([outer_local_iname,inner_local_iname]), + within_inames=( + within_inames | frozenset([outer_local_iname,inner_local_iname])), depends_on=frozenset([local_scan_insn_id]), predicates=frozenset([var(inner_local_iname).eq(inner_length - 1)])) @@ -507,7 +561,7 @@ def make_two_level_scan( scan.operation, (outer_scan_iname,), var(nonlocal_storage_name)[var(outer_scan_iname)]), - within_inames=frozenset([nonlocal_iname]), + within_inames=within_inames | frozenset([nonlocal_iname]), depends_on=frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id])) kernel = _update_instructions(kernel, (nonlocal_scan,), copy=False) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 29f4c0238..ccb7c1236 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -193,9 +193,9 @@ class TemporarySaver(object): The name of the new temporary. - .. attribute:: orig_temporary + .. attribute:: orig_temporary_name - The original temporary variable object. + The name of original temporary variable object. .. attribute:: hw_dims @@ -214,9 +214,10 @@ class TemporarySaver(object): non-hardware dimensions """ - @memoize_method - def as_variable(self): - temporary = self.orig_temporary + __slots__ = ["name", "orig_temporary_name", "hw_dims", "hw_tags", "non_hw_dims"] + + def as_kernel_temporary(self, kernel): + temporary = kernel.temporary_variables[self.orig_temporary_name] from loopy.kernel.data import TemporaryVariable return TemporaryVariable( name=self.name, @@ -239,7 +240,12 @@ class TemporarySaver(object): self.extra_args_to_add = {} self.updated_iname_to_tag = {} self.updated_temporary_variables = {} + # temporary name -> save or reload insns self.saves_or_reloads_added = {} + from collections import defaultdict + self.subkernel_to_saves = defaultdict(lambda: set()) + self.subkernel_to_reloads = defaultdict(lambda: set()) + self.base_storage_to_representative = {} def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary): """ @@ -256,6 +262,7 @@ class TemporarySaver(object): group_tags = None local_tags = None + originating_insn_id = None def _sortedtags(tags): return sorted(tags, key=lambda tag: tag.axis) @@ -286,13 +293,18 @@ class TemporarySaver(object): if group_tags is None: group_tags = _sortedtags(my_group_tags) local_tags = _sortedtags(my_local_tags) + originating_insn_id = insn_id if ( group_tags != _sortedtags(my_group_tags) or local_tags != _sortedtags(my_local_tags)): raise ValueError( - "inconsistent parallel tags across instructions that access '%s'" - % temporary.name) + "inconsistent parallel tags across instructions that access " + "'%s', instruction '%s' has tags '%s' but instruction '%s' " + "has tags '%s'" + % (temporary.name, + originating_insn_id, group_tags + local_tags, + insn_id, my_group_tags + my_local_tags)) if group_tags is None: assert local_tags is None @@ -324,9 +336,8 @@ class TemporarySaver(object): assert temporary.read_only return None - if temporary.base_storage is not None: - raise ValueError( - "Cannot promote temporaries with base_storage to global") + if temporary.base_storage in self.base_storage_to_representative: + return self.base_storage_to_representative[temporary.base_storage] hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) non_hw_dims = temporary.shape @@ -337,11 +348,14 @@ class TemporarySaver(object): backing_temporary = self.PromotedTemporary( name=self.var_name_gen(temporary.name + "_save_slot"), - orig_temporary=temporary, + orig_temporary_name=temporary.name, hw_dims=hw_dims, hw_tags=hw_tags, non_hw_dims=non_hw_dims) + if temporary.base_storage is not None: + self.base_storage_to_representative[temporary.base_storage] = backing_temporary + return backing_temporary def save_or_reload_impl(self, temporary, subkernel, mode, @@ -354,6 +368,18 @@ class TemporarySaver(object): if promoted_temporary is None: return + if mode == "save": + if promoted_temporary.name in self.subkernel_to_saves[subkernel]: + return + else: + self.subkernel_to_saves[subkernel].add(promoted_temporary.name) + + elif mode == "reload": + if promoted_temporary.name in self.subkernel_to_reloads[subkernel]: + return + else: + self.subkernel_to_reloads[subkernel].add(promoted_temporary.name) + from loopy.kernel.tools import DomainChanger dchg = DomainChanger( self.kernel, @@ -378,7 +404,8 @@ class TemporarySaver(object): Variable(agg), tuple(map(Variable, subscript))) - dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] + orig_temporary = self.kernel.temporary_variables[promoted_temporary.orig_temporary_name] + dim_inames_trunc = dim_inames[:len(orig_temporary.shape)] args = ( add_subscript_if_nonempty( @@ -433,7 +460,7 @@ class TemporarySaver(object): depends_on=insn.depends_on | frozenset([save_or_load_insn_id])) self.updated_temporary_variables[promoted_temporary.name] = \ - promoted_temporary.as_variable() + promoted_temporary.as_kernel_temporary(self.kernel) self.updated_iname_to_tag.update(iname_to_tag) @@ -488,7 +515,7 @@ class TemporarySaver(object): assert mode in ("save", "reload") import islpy as isl - orig_temporary = promoted_temporary.orig_temporary + orig_temporary = self.kernel.temporary_variables[promoted_temporary.orig_temporary_name] orig_dim = domain.dim(isl.dim_type.set) # Tags for newly added inames diff --git a/test/test_scan.py b/test/test_scan.py index ae046818b..60a2f4272 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -207,7 +207,7 @@ def test_local_parallel_scan(ctx_factory, n): "..." ) - knl = lp.fix_parameters(knl, n=16) + knl = lp.fix_parameters(knl, n=n) knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.realize_reduction(knl, force_scan=True) @@ -215,8 +215,8 @@ def test_local_parallel_scan(ctx_factory, n): knl = lp.add_dtypes(knl, dict(a=int)) - evt, (a,) = knl(queue, a=np.arange(16)) - assert (a == np.cumsum(np.arange(16)**2)).all() + evt, (a,) = knl(queue, a=np.arange(n)) + assert (a == np.cumsum(np.arange(n)**2)).all() def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory): @@ -369,7 +369,7 @@ def test_two_level_scan(ctx_getter): "{[i,j]: 0 <= i < 16 and 0 <= j <= i}", ], """ - out[i] = sum(j, 1) {id=insn} + out[i] = sum(j, j) {id=insn} """, "...") @@ -381,7 +381,7 @@ def test_two_level_scan(ctx_getter): knl, "insn", inner_length=4, scan_iname="j", sweep_iname="i", - local_storage_axes=(("l0_inner_update_i",)), + local_storage_axes=(("l0_inner2_i",)), inner_iname="l0_inner_update_i", inner_tag="l.0", outer_tag="g.0", @@ -418,6 +418,81 @@ def test_two_level_scan(ctx_getter): print(out.get()) +def test_three_level_scan(ctx_getter): + knl = lp.make_kernel( + [ + "{[i,j]: 0 <= i < 16 and 0 <= j <= i}", + ], + """ + out[i] = sum(j, j) {id=insn} + """, + "...") + + #knl = lp.tag_inames(knl, dict(i="l.0")) + + from loopy.transform.reduction import make_two_level_scan + + knl = make_two_level_scan( + knl, "insn", inner_length=4, + scan_iname="j", + sweep_iname="i", + local_storage_axes=(("l0_inner_update_i",)), + inner_iname="l0_inner_update_i", + inner_tag="l.0", + outer_tag="g.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + inner_local_tag=None, + outer_local_tag="g.0") + + knl = make_two_level_scan( + knl, "j_local_scan", inner_length=2, + scan_iname="l1_j", + sweep_iname="l1_inner_i", + inner_tag="for", + outer_tag="l.0", + nonlocal_tag="l.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.LOCAL, + inner_local_tag="for", + outer_local_tag="l.0") + + print(knl) + + knl = lp.realize_reduction(knl, force_scan=True) + + from loopy.transform.instruction import add_nosync_to_instructions + knl = add_nosync_to_instructions( + knl, + scope="global", + source="writes:acc_l0_j", + sink="reads:acc_l0_j") + + knl = lp.alias_temporaries(knl, ["l1l_insn", "l2l_j_local_scan"], synchronize_for_exclusive_use=False) + + print(knl.get_temporary_to_base_storage_map()) + + print(knl) + + from loopy.transform.save import save_and_reload_temporaries + + print(knl) + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + knl = save_and_reload_temporaries(knl) + knl = lp.get_one_scheduled_kernel(knl) + + print(knl) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q) + + print(out.get()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab