From f62b6b2405d18d24261f58a3eb5bc47802b7a5e3 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 22 Feb 2017 20:38:15 -0600 Subject: [PATCH 01/27] [WIP] Initial work towards scans in loopy. --- loopy/diagnostic.py | 4 + loopy/preprocess.py | 804 ++++++++++++++++++++++++++++++++++++++++++-- test/test_scan.py | 215 ++++++++++++ 3 files changed, 995 insertions(+), 28 deletions(-) create mode 100644 test/test_scan.py diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 15ab8a1ee..512e4ac86 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -103,6 +103,10 @@ class MissingDefinitionError(LoopyError): class UnscheduledInstructionError(LoopyError): pass + +class ReductionIsNotTriangularError(LoopyError): + pass + # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index db7792cce..393b4cd4c 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -30,6 +30,7 @@ from loopy.diagnostic import ( import islpy as isl +from pytools import memoize from pytools.persistent_dict import PersistentDict from loopy.tools import LoopyKeyBuilder @@ -272,7 +273,329 @@ def find_temporary_scope(kernel): # {{{ rewrite reduction to imperative form -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): + +# {{{ utils (not stateful) + +from collections import namedtuple + + +_InameClassification = namedtuple("_InameClassifiction", + "sequential, local_parallel, nonlocal_parallel") + + +def _classify_reduction_inames(kernel, inames): + sequential = [] + local_par = [] + nonlocal_par = [] + + from loopy.kernel.data import ( + LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, + ParallelTag) + + for iname in inames: + iname_tag = kernel.iname_to_tag.get(iname) + + if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): + # These are nominally parallel, but we can live with + # them as sequential. 
+ sequential.append(iname) + + elif isinstance(iname_tag, LocalIndexTagBase): + local_par.append(iname) + + elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): + nonlocal_par.append(iname) + + else: + sequential.append(iname) + + return _InameClassification(tuple(sequential), + tuple(local_par), + tuple(nonlocal_par)) + + +def _add_params_to_domain(domain, param_names): + dim_type = isl.dim_type + nparams_orig = domain.dim(dim_type.param) + domain = domain.add_dims(dim_type.param, len(param_names)) + + for param_idx, param_name in enumerate(param_names): + domain = domain.set_dim_name( + dim_type.param, param_idx + nparams_orig, param_name) + + return domain + + +def _check_reduction_is_triangular(kernel, expr, scan_info): + """ + This verifies that the domain for the scan and sweep inames is as follows: + + [scan_iname, sweep_iname]: + (sweep_min_value + <= sweep_iname + <= sweep_max_value) + and + (sweep_min_value + offset + <= scan_iname + <= stride * sweep_iname + offset) + """ + + dim_type = isl.dim_type + + domain = kernel.get_inames_domain( + (scan_info.sweep_iname, scan_info.scan_iname)) + + tri_domain = isl.BasicSet.universe(domain.params().space) + + sweep_iname = scan_info.sweep_iname + scan_iname = scan_info.scan_iname + + tri_domain = _add_params_to_domain(tri_domain, (sweep_iname, scan_iname)) + + affs = isl.affs_from_space(tri_domain.space) + + # Add sweep iname constraints + tri_domain &= affs[sweep_iname].ge_set(scan_info.sweep_lower_bound) + tri_domain &= affs[sweep_iname].le_set(scan_info.sweep_upper_bound) + + # Add scan iname constraints + offset = scan_info.offset + tri_domain &= affs[scan_iname].ge_set(scan_info.sweep_lower_bound + offset) + tri_domain &= affs[scan_iname].le_set( + scan_info.stride * affs[sweep_iname] + offset) + + # Gist against domain params + tri_domain = tri_domain.gist(domain.params()) + + # Move sweep and scan inames into the set + tri_domain = tri_domain.move_dims( + dim_type.set, 0, + dim_type.param, 
tri_domain.dim(dim_type.param) - 2, 2) + + tri_domain, domain = isl.align_two(tri_domain, domain) + + if domain != tri_domain: + # FIXME: Return a more descriptive error message. + return False, "domains are not equal" + else: + return True, "ok" + + +_ScanCandidateInfo = namedtuple( + "_ScanCandidateInfo", + "sweep_iname, scan_iname, sweep_lower_bound, " + "sweep_upper_bound, offset, stride") + + +def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): + from loopy.symbolic import Reduction + assert isinstance(expr, Reduction) + + if len(expr.inames) != 1: + raise ValueError( + "Multiple inames in reduction: '%s'" % (", ".join(expr.inames),)) + + scan_iname, = expr.inames + + from loopy.kernel.tools import DomainChanger + dchg = DomainChanger(kernel, (scan_iname,)) + domain = dchg.get_original_domain() + + if sweep_iname is None: + try: + sweep_iname = _try_infer_sweep_iname( + domain, scan_iname, kernel.all_inames()) + except ValueError as v: + raise ValueError("Couldn't determine a sweep iname for the scan: %s" % v) + + try: + sweep_lower_bound, sweep_upper_bound, offset = ( + _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname)) + except Exception as e: + raise ValueError("Couldn't determine bounds for scan: %s" % e) + + try: + stride = _try_infer_scan_stride( + kernel, scan_iname, sweep_iname, sweep_lower_bound) + except ValueError as v: + raise ValueError("Couldn't determine a scan stride: %s" % v) + + return _ScanCandidateInfo(sweep_iname, scan_iname, sweep_lower_bound, + sweep_upper_bound, offset, stride) + + +def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): + """ + The sweep iname is the outer iname which guides the scan. + + E.g. 
for a domain of {[i,j]: 0<=i 1: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' found " + "(via constraint '%s')" % (scan_iname, constr)) + + next_candidate = candidate_vars.pop() + + if sweep_iname_candidate is None: + sweep_iname_candidate = next_candidate + defining_constraint = constr + else: + # Check next_candidate consistency + if sweep_iname_candidate != next_candidate: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' " + "found (via constraints '%s', '%s')" % + (scan_iname, defining_constraint, constr)) + + if sweep_iname_candidate is None: + raise ValueError( + "Couldn't find any sweep iname candidates for " + "scan iname '%s'" % scan_iname) + + return sweep_iname_candidate + + +def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname): + sweep_bounds = kernel.get_iname_bounds(sweep_iname) + scan_bounds = kernel.get_iname_bounds(scan_iname) + scan_offset = scan_bounds.lower_bound_pw_aff - sweep_bounds.lower_bound_pw_aff + + return (sweep_bounds.lower_bound_pw_aff, + sweep_bounds.upper_bound_pw_aff, + scan_offset) + + +def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): + """The stride is the number of steps the scan iname takes per iteration + of the sweep iname. This is allowed to be an integer constant. + + E.g. 
for a domain of {[i,j]: 0<=i 1: + raise ValueError("range in multiple pieces: %s" % scan_iname_range) + + scan_iname_constr, scan_iname_aff = scan_iname_pieces[0] + + if not scan_iname_constr.plain_is_universe(): + raise ValueError("found constraints: %s" % scan_iname_constr) + + if scan_iname_aff.dim(dim_type.div): + raise ValueError("aff has div: %s" % scan_iname_aff) + + coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param) + + if len(coeffs) > 1: + raise ValueError("found more than one coeff: %s" % coeffs) + + if sweep_iname not in coeffs: + raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname) + + stride = coeffs[sweep_iname] + + if not stride.is_int(): + raise ValueError("stride not an integer: %s" % stride) + + if not stride.is_pos(): + raise ValueError("stride not positive: %s" % stride) + + return stride.to_python() + + +def _get_domain_with_iname_as_param(domain, iname): + dim_type = isl.dim_type + + if domain.find_dim_by_name(dim_type.param, iname) >= 0: + return domain + + iname_idx = domain.find_dim_by_name(dim_type.set, iname) + + assert iname_idx >= 0, (iname, domain) + + return domain.move_dims( + dim_type.param, domain.dim(dim_type.param), + dim_type.set, iname_idx, 1) + + +def _create_domain_for_sweep_tracking(orig_domain, + tracking_iname, sweep_iname, sweep_min_value, offset, stride): + dim_type = isl.dim_type + + subd = isl.BasicSet.universe(orig_domain.params().space) + + # Add tracking_iname and sweep iname. 
+ + subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname)) + + # Here we realize the domain: + # + # [params, sweep_iname] -> { + # [tracking_iname]: + # offset + stride * (sweep_iname - 1) < tracking_iname + # and tracking_iname <= stride * sweep_iname + offset + # and min_value + offset <= tracking_iname } + # + affs = isl.affs_from_space(subd.space) + + subd &= affs[tracking_iname].gt_set(stride * affs[sweep_iname] - stride + offset) + subd &= affs[tracking_iname].le_set(stride * affs[sweep_iname] + offset) + subd &= affs[tracking_iname].ge_set(sweep_min_value + offset) + + # Move tracking_iname into a set dim (NOT sweep iname). + subd = subd.move_dims( + dim_type.set, 0, + dim_type.param, subd.dim(dim_type.param) - 1, 1) + + subd, = subd.get_basic_sets() + + return subd + + +# }}} + + +def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, + automagic_scans_ok=True, force_scan=False, + force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -283,6 +606,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): If *insn_id_filter* is not given, all reductions in all instructions will be realized. + + If *automagic_scans_ok*, this function will attempt to rewrite triangular + reductions as scans automatically. + + If *force_scan* is *True*, this function will attempt to rewrite *all* + candidate reductions as scans and raise an error if this is not possible + (this is most useful combined with *insn_id_filter*). + + If *force_outer_iname_for_scan* is not *None*, this function will attempt + to realize candidate reductions as scans using the specified iname as the + outer (sweep) iname. 
""" logger.debug("%s: realize reduction" % kernel.name) @@ -295,9 +629,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): var_name_gen = kernel.get_var_name_generator() new_temporary_variables = kernel.temporary_variables.copy() + # Dummy inames to remove after scans have been realized + inames_to_remove = set() + from loopy.type_inference import TypeInferenceMapper type_inf_mapper = TypeInferenceMapper(kernel) + inames_added_for_scan = set() + # {{{ sequential def map_reduction_seq(expr, rec, nresults, arg_dtype, @@ -382,6 +721,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): v[iname].lt_set(v[0] + size)).get_basic_sets() return bs + def _make_slab_set_from_range(iname, lbound, ubound): + v = isl.make_zero_and_vars([iname]) + bs, = ( + v[iname].ge_set(v[0] + lbound) + & + v[iname].lt_set(v[0] + ubound)).get_basic_sets() + return bs + def map_reduction_local(expr, rec, nresults, arg_dtype, reduction_dtypes): red_iname, = expr.inames @@ -527,6 +874,337 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] # }}} + # {{{ scan utils (stateful) + + @memoize + def get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, offset, stride): + domain = kernel.get_inames_domain((scan_iname, sweep_iname)) + + tracking_iname = var_name_gen( + "{scan_iname}_tracking_{sweep_iname}" + .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + + inames_added_for_scan.add(tracking_iname) + + new_domain = _create_domain_for_sweep_tracking(domain, + tracking_iname, sweep_iname, sweep_min_value, offset, stride) + + domains.append(new_domain) + + return tracking_iname, new_domain + + def replace_scan_iname_with_tracking_iname(scan_iname, track_iname, expr): + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import ( + SubstitutionRuleMappingContext, 
RuleAwareSubstitutionMapper) + + rule_mapping_context = SubstitutionRuleMappingContext( + temp_kernel.substitutions, var_name_gen) + + from pymbolic import var + mapper = RuleAwareSubstitutionMapper( + rule_mapping_context, + make_subst_func({scan_iname: var(track_iname)}), + within=lambda *args: True) + + return mapper(expr, temp_kernel, None) + + # }}} + + # {{{ sequential scan + + def map_scan_seq(expr, rec, nresults, arg_dtype, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, offset, + stride): + outer_insn_inames = temp_kernel.insn_inames(insn) + inames_to_remove.add(scan_iname) + + track_iname, track_iname_domain = ( + get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, offset, stride)) + + from pymbolic import var + acc_var_names = [ + var_name_gen("acc_"+"_".join(expr.inames)) + for i in range(nresults)] + acc_vars = tuple(var(n) for n in acc_var_names) + + from loopy.kernel.data import TemporaryVariable, temp_var_scope + + for name, dtype in zip(acc_var_names, reduction_dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=(), + dtype=dtype, + scope=temp_var_scope.PRIVATE) + + init_id = insn_id_gen( + "%s_%s_init" % (insn.id, "_".join(expr.inames))) + + init_insn = make_assignment( + id=init_id, + assignees=acc_vars, + within_inames=outer_insn_inames - frozenset( + (sweep_iname,) + expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset(), + expression=expr.operation.neutral_element(arg_dtype, expr.inames)) + + generated_insns.append(init_insn) + + updated_inner_expr = replace_scan_iname_with_tracking_iname( + scan_iname, track_iname, expr.expr) + + updated_inames = tuple( + (set(expr.inames) - set([scan_iname])) | set([track_iname])) + + update_id = insn_id_gen( + based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) + + update_insn_iname_deps = temp_kernel.insn_inames(insn) | set([track_iname]) + if insn.within_inames_is_final: + 
update_insn_iname_deps = insn.within_inames | set([track_iname]) + + scan_insn = make_assignment( + id=update_id, + assignees=acc_vars, + expression=expr.operation( + arg_dtype, + acc_vars if len(acc_vars) > 1 else acc_vars[0], + updated_inner_expr, updated_inames), + depends_on=frozenset([init_insn.id]) | insn.depends_on, + within_inames=update_insn_iname_deps, + within_inames_is_final=insn.within_inames_is_final) + + generated_insns.append(scan_insn) + + new_insn_add_depends_on.add(scan_insn.id) + + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0] + else: + return acc_vars + + # }}} + + # {{{ local-parallel scan + + def map_scan_local(expr, rec, nresults, arg_dtype, + reduction_dtypes, sweep_iname, scan_iname, + sweep_min_value, offset, stride): + + # TODO: rename + red_iname = scan_iname + + size = _get_int_iname_size(sweep_iname) + + outer_insn_inames = temp_kernel.insn_inames(insn) + + from loopy.kernel.data import LocalIndexTagBase + outer_local_inames = tuple( + oiname + for oiname in outer_insn_inames + if isinstance( + kernel.iname_to_tag.get(oiname), + LocalIndexTagBase) + and oiname != sweep_iname) + + from pymbolic import var + outer_local_iname_vars = tuple( + var(oiname) for oiname in outer_local_inames) + + outer_local_iname_sizes = tuple( + _get_int_iname_size(oiname) + for oiname in outer_local_inames) + + track_iname, track_iname_domain = get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, offset, stride) + + # {{{ add separate iname to carry out the scan + + # Doing this sheds any odd conditionals that may be active + # on our red_iname. 
+ + base_exec_iname = var_name_gen("scan_"+sweep_iname) + domains.append(_make_slab_set(base_exec_iname, size)) + new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname] + + # }}} + + acc_var_names = [ + var_name_gen("acc_"+scan_iname) + for i in range(nresults)] + acc_vars = tuple(var(n) for n in acc_var_names) + + read_var_names = [ + var_name_gen("read_"+scan_iname) + for i in range(nresults)] + + read_vars = tuple(var(n) for n in read_var_names) + + from loopy.kernel.data import TemporaryVariable, temp_var_scope + for name, dtype in zip(acc_var_names, reduction_dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=outer_local_iname_sizes + (size,), + dtype=dtype, + scope=temp_var_scope.LOCAL) + + for name, dtype in zip(read_var_names, reduction_dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=(), + dtype=dtype, + scope=temp_var_scope.PRIVATE) + + base_iname_deps = (outer_insn_inames + - frozenset(expr.inames) - frozenset([sweep_iname])) + + neutral = expr.operation.neutral_element(arg_dtype, expr.inames) + + init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) + init_insn = make_assignment( + id=init_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(base_exec_iname),)] + for acc_var in acc_vars), + expression=neutral, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset()) + generated_insns.append(init_insn) + + # TODO: make a function.. 
+ + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import ( + SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) + + rule_mapping_context = SubstitutionRuleMappingContext( + temp_kernel.substitutions, var_name_gen) + + from pymbolic import var + mapper = RuleAwareSubstitutionMapper( + rule_mapping_context, + make_subst_func({red_iname: var(track_iname)}), + within=lambda *args: True) + + from loopy.symbolic import Reduction + + # TODO: change sweep iname to base exec iname... + + transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) + transfer_insn = make_assignment( + id=transfer_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(sweep_iname),)] + for acc_var in acc_vars), + expression=Reduction( + operation=expr.operation, + inames=(track_iname,), + expr=mapper(expr.expr, temp_kernel, None), + allow_simultaneous=False, + ), + within_inames=outer_insn_inames - frozenset(expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([init_id]) | insn.depends_on, + no_sync_with=frozenset([(init_id, "any")])) + generated_insns.append(transfer_insn) + + def _strip_if_scalar(c): + if len(acc_vars) == 1: + return c[0] + else: + return c + + scan_size = 1 + while scan_size < size: + scan_size *= 2 + + prev_id = transfer_id + + istage = 0 + cur_size = 1 + while cur_size != scan_size: + #new_size = cur_size // 2 + #assert new_size * 2 == cur_size + + stage_exec_iname = var_name_gen("scan_%s_s%d" % (red_iname, istage)) + domains.append( + _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) + new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] + + read_stage_id = insn_id_gen( + "scan_%s_read_stage_%d" % (red_iname, istage)) + read_stage_insn = make_assignment( + id=read_stage_id, + assignees=read_vars, + expression=_strip_if_scalar([ + acc_var[ + outer_local_iname_vars + ( + var(stage_exec_iname) - cur_size,)] + for acc_var in acc_vars]), + 
within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id])) + + generated_insns.append(read_stage_insn) + prev_id = read_stage_id + + write_stage_id = insn_id_gen( + "scan_%s_write_stage_%d" % (red_iname, istage)) + write_stage_insn = make_assignment( + id=write_stage_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars), + expression=expr.operation( + arg_dtype, + _strip_if_scalar([ + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars]), + _strip_if_scalar(read_vars), + expr.inames), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id]), + ) + + generated_insns.append(write_stage_insn) + prev_id = write_stage_id + + #cur_size = new_size + #bound = cur_size + cur_size *= 2 + istage += 1 + + new_insn_add_depends_on.add(prev_id) + new_insn_add_no_sync_with.add((prev_id, "any")) + + #output_iname = var_name_gen("scan_%s_output" % red_iname) + #domains.append(_make_slab_set(output_iname, scan_size)) + #new_iname_tags[output_iname] = kernel.iname_to_tag[sweep_iname] + #new_insn_add_within_inames.add(output_iname) + new_insn_add_within_inames.add(sweep_iname) + + output_idx = var(sweep_iname) + + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0][outer_local_iname_vars + (output_idx,)] + else: + return [acc_var[outer_local_iname_vars + (output_idx,)] + for acc_var in acc_vars] + + # }}} + # {{{ seq/par dispatch def map_reduction(expr, rec, nresults=1): @@ -558,31 +1236,41 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): raise LoopyError("reduction used within loop(s) that it was " "supposed to reduce over: " + ", ".join(bad_inames)) - n_sequential = 0 - n_local_par = 0 + iname_classes = _classify_reduction_inames(temp_kernel, 
expr.inames) - from loopy.kernel.data import ( - LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, - ParallelTag) - for iname in expr.inames: - iname_tag = kernel.iname_to_tag.get(iname) + n_sequential = len(iname_classes.sequential) + n_local_par = len(iname_classes.local_parallel) + n_nonlocal_par = len(iname_classes.nonlocal_parallel) + + really_force_scan = force_scan and ( + len(expr.inames) != 1 or expr.inames[0] not in inames_added_for_scan) - if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): - # These are nominally parallel, but we can live with - # them as sequential. - n_sequential += 1 + def _error_if_force_scan_on(cls, msg): + if really_force_scan: + raise cls(msg) - elif isinstance(iname_tag, LocalIndexTagBase): - n_local_par += 1 + may_be_implemented_as_scan = False + if force_scan or automagic_scans_ok: + from loopy.diagnostic import ReductionIsNotTriangularError - elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): - raise LoopyError("the only form of parallelism supported " - "by reductions is 'local'--found iname '%s' " - "tagged '%s'" - % (iname, type(iname_tag).__name__)) + try: + # Try to determine scan candidate information (sweep iname, scan + # iname, etc). + scan_info = _try_infer_scan_candidate_from_expr( + temp_kernel, expr, sweep_iname=force_outer_iname_for_scan) + + except ValueError as v: + error = str(v) else: - n_sequential += 1 + # Ensures the reduction is triangular (somewhat expensive). + may_be_implemented_as_scan, error = ( + _check_reduction_is_triangular(kernel, expr, scan_info)) + + if not may_be_implemented_as_scan: + _error_if_force_scan_on(ReductionIsNotTriangularError, error) + + # {{{ sanity checks if n_local_par and n_sequential: raise LoopyError("Reduction over '%s' contains both parallel and " @@ -598,14 +1286,16 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): "before code generation." 
% ", ".join(expr.inames)) - if n_sequential: - assert n_local_par == 0 - return map_reduction_seq(expr, rec, nresults, arg_dtype, - reduction_dtypes) - elif n_local_par: - return map_reduction_local(expr, rec, nresults, arg_dtype, - reduction_dtypes) - else: + if n_nonlocal_par: + bad_inames = iname_classes.nonlocal_parallel + raise LoopyError("the only form of parallelism supported " + "by reductions is 'local'--found iname(s) '%s' " + "respectively tagged '%s'" + % (", ".join(bad_inames), + ", ".join(kernel.iname_to_tag[iname] + for iname in bad_inames))) + + if n_local_par == 0 and n_sequential == 0: from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "empty_reduction", "Empty reduction found (no inames to reduce over). " @@ -613,6 +1303,62 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): return expr.expr + # }}} + + if may_be_implemented_as_scan: + assert force_scan or automagic_scans_ok + + if n_sequential: + sweep_iname = scan_info.sweep_iname + sweep_class = _classify_reduction_inames(kernel, (sweep_iname,)) + + sequential = sweep_iname in sweep_class.sequential + parallel = sweep_iname in sweep_class.local_parallel + bad_parallel = sweep_iname in sweep_class.nonlocal_parallel + + if sweep_iname not in outer_insn_inames: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' was detected, but is not an iname " + "for the instruction." % sweep_iname) + elif bad_parallel: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' has an unsupported parallel tag '%s' " + "- the only parallelism allowed is 'local'." 
% + (sweep_iname, sweep_class.nonlocal_parallel[0])) + elif parallel: + return map_scan_local( + expr, rec, nresults, arg_dtype, reduction_dtypes, + sweep_iname, scan_info.scan_iname, + scan_info.sweep_lower_bound, scan_info.offset, + scan_info.stride) + elif sequential: + return map_scan_seq( + expr, rec, nresults, arg_dtype, reduction_dtypes, + sweep_iname, scan_info.scan_iname, + scan_info.sweep_lower_bound, scan_info.offset, + scan_info.stride) + + # fallthrough to reduction implementation + + else: + assert n_local_par > 0 + scan_iname, = expr.inames + _error_if_force_scan_on(LoopyError, + "Scan iname '%s' is parallel tagged: this is not allowed " + "(only the sweep iname should be tagged if parallelism " + "is desired)." % scan_iname) + + # fallthrough to reduction implementation + + if n_sequential: + assert n_local_par == 0 + return map_reduction_seq( + expr, rec, nresults, arg_dtype, reduction_dtypes) + else: + assert n_local_par > 0 + return map_reduction_local( + expr, rec, nresults, arg_dtype, reduction_dtypes) + # }}} from loopy.symbolic import ReductionCallbackMapper @@ -718,6 +1464,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): kernel = lp.tag_inames(kernel, new_iname_tags) + # TODO: remove unused inames... 
+ return kernel # }}} diff --git a/test/test_scan.py b/test/test_scan.py new file mode 100644 index 000000000..3200e8c56 --- /dev/null +++ b/test/test_scan.py @@ -0,0 +1,215 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = """ +Copyright (C) 2012 Andreas Kloeckner +Copyright (C) 2016 Matt Wala +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +# More things to test. 
+# - test that dummy inames are removed +# - nested sequential/parallel scan +# - scan(a) + scan(b) +# - global parallel scan +# - segmented scan + + +@pytest.mark.parametrize("n", [1, 2, 3, 16]) +def test_sequential_scan(ctx_factory, n): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i {[i,j]: 0<=i {[i]: 0<=i {[i]: 0 <= i < n}", + "{[j]: 0 <= j <= i}", + "{[k]: 0 <= j <= k}" + ], + "a[i] = sum(j, sum(k, k))") +""" + + +def test_scan_unsupported_stride(): + knl = lp.make_kernel( + "{[i,j]: 0<=i<100 and 1<=j<=2*i}", + """ + a[i] = sum(j, j**2) {id=scan} + """ + ) + + with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError): + knl = lp.realize_reduction(knl, force_scan=True) + + +@pytest.mark.parametrize("n", [1, 2, 3, 16, 17]) +def test_local_parallel_scan(ctx_factory, n): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i {[i,j,k]: 0<=i 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker -- GitLab From c2868e8578e53c4dc7d211a5259538f735573abd Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 23 Feb 2017 12:52:25 -0600 Subject: [PATCH 02/27] Fix empty reduction issue, rename 'scan info' to 'scan parameter'. --- loopy/preprocess.py | 54 ++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 393b4cd4c..7d431ed90 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -326,9 +326,10 @@ def _add_params_to_domain(domain, param_names): return domain -def _check_reduction_is_triangular(kernel, expr, scan_info): - """ - This verifies that the domain for the scan and sweep inames is as follows: +def _check_reduction_is_triangular(kernel, expr, scan_param): + """Check whether the reduction within `expr` with scan parameters described by + the structure `scan_param` is triangular. 
This attempts to verify that the + domain for the scan and sweep inames is as follows: [scan_iname, sweep_iname]: (sweep_min_value @@ -343,26 +344,26 @@ def _check_reduction_is_triangular(kernel, expr, scan_info): dim_type = isl.dim_type domain = kernel.get_inames_domain( - (scan_info.sweep_iname, scan_info.scan_iname)) + (scan_param.sweep_iname, scan_param.scan_iname)) tri_domain = isl.BasicSet.universe(domain.params().space) - sweep_iname = scan_info.sweep_iname - scan_iname = scan_info.scan_iname + sweep_iname = scan_param.sweep_iname + scan_iname = scan_param.scan_iname tri_domain = _add_params_to_domain(tri_domain, (sweep_iname, scan_iname)) affs = isl.affs_from_space(tri_domain.space) # Add sweep iname constraints - tri_domain &= affs[sweep_iname].ge_set(scan_info.sweep_lower_bound) - tri_domain &= affs[sweep_iname].le_set(scan_info.sweep_upper_bound) + tri_domain &= affs[sweep_iname].ge_set(scan_param.sweep_lower_bound) + tri_domain &= affs[sweep_iname].le_set(scan_param.sweep_upper_bound) # Add scan iname constraints - offset = scan_info.offset - tri_domain &= affs[scan_iname].ge_set(scan_info.sweep_lower_bound + offset) + offset = scan_param.offset + tri_domain &= affs[scan_iname].ge_set(scan_param.sweep_lower_bound + offset) tri_domain &= affs[scan_iname].le_set( - scan_info.stride * affs[sweep_iname] + offset) + scan_param.stride * affs[sweep_iname] + offset) # Gist against domain params tri_domain = tri_domain.gist(domain.params()) @@ -381,13 +382,15 @@ def _check_reduction_is_triangular(kernel, expr, scan_info): return True, "ok" -_ScanCandidateInfo = namedtuple( - "_ScanCandidateInfo", +_ScanCandidateParameters = namedtuple( + "_ScanCandidateParameters", "sweep_iname, scan_iname, sweep_lower_bound, " "sweep_upper_bound, offset, stride") def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): + """Analyze `expr` and determine if it can be implemented as a scan. 
+ """ from loopy.symbolic import Reduction assert isinstance(expr, Reduction) @@ -420,13 +423,12 @@ def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): except ValueError as v: raise ValueError("Couldn't determine a scan stride: %s" % v) - return _ScanCandidateInfo(sweep_iname, scan_iname, sweep_lower_bound, + return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound, sweep_upper_bound, offset, stride) def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): - """ - The sweep iname is the outer iname which guides the scan. + """The sweep iname is the outer iname which guides the scan. E.g. for a domain of {[i,j]: 0<=i 1: raise ValueError("range in multiple pieces: %s" % scan_iname_range) + elif len(scan_iname_pieces) == 0: + raise ValueError("empty range found for iname '%s'" % scan_iname) scan_iname_constr, scan_iname_aff = scan_iname_pieces[0] @@ -1256,7 +1260,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, try: # Try to determine scan candidate information (sweep iname, scan # iname, etc). - scan_info = _try_infer_scan_candidate_from_expr( + scan_param = _try_infer_scan_candidate_from_expr( temp_kernel, expr, sweep_iname=force_outer_iname_for_scan) except ValueError as v: @@ -1265,7 +1269,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: # Ensures the reduction is triangular (somewhat expensive). 
may_be_implemented_as_scan, error = ( - _check_reduction_is_triangular(kernel, expr, scan_info)) + _check_reduction_is_triangular(kernel, expr, scan_param)) if not may_be_implemented_as_scan: _error_if_force_scan_on(ReductionIsNotTriangularError, error) @@ -1309,7 +1313,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, assert force_scan or automagic_scans_ok if n_sequential: - sweep_iname = scan_info.sweep_iname + sweep_iname = scan_param.sweep_iname sweep_class = _classify_reduction_inames(kernel, (sweep_iname,)) sequential = sweep_iname in sweep_class.sequential @@ -1328,15 +1332,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, elif parallel: return map_scan_local( expr, rec, nresults, arg_dtype, reduction_dtypes, - sweep_iname, scan_info.scan_iname, - scan_info.sweep_lower_bound, scan_info.offset, - scan_info.stride) + sweep_iname, scan_param.scan_iname, + scan_param.sweep_lower_bound, scan_param.offset, + scan_param.stride) elif sequential: return map_scan_seq( expr, rec, nresults, arg_dtype, reduction_dtypes, - sweep_iname, scan_info.scan_iname, - scan_info.sweep_lower_bound, scan_info.offset, - scan_info.stride) + sweep_iname, scan_param.scan_iname, + scan_param.sweep_lower_bound, scan_param.offset, + scan_param.stride) # fallthrough to reduction implementation -- GitLab From 611c5914c760261d3dd4711c8b7a5b254fcc687a Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 1 Mar 2017 01:30:44 -0600 Subject: [PATCH 03/27] Initial support for segmented scans. These changes also involve overhauling the reduction semantics to have tuple-based expressions. 
See also: #32 --- loopy/kernel/instruction.py | 3 +- loopy/library/reduction.py | 242 ++++++++++++++++++---- loopy/preprocess.py | 388 ++++++++++++++++++++---------------- loopy/symbolic.py | 57 +++--- loopy/transform/data.py | 16 +- loopy/transform/iname.py | 18 +- loopy/type_inference.py | 17 +- test/test_reduction.py | 4 +- test/test_scan.py | 110 +++++++++- 9 files changed, 594 insertions(+), 261 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index fdd8f1d37..fc1025861 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -658,7 +658,8 @@ class MultiAssignmentBase(InstructionBase): @memoize_method def reduction_inames(self): def map_reduction(expr, rec): - rec(expr.expr) + for sub_expr in expr.exprs: + rec(sub_expr) for iname in expr.inames: result.add(iname) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index d24b61c12..b6dbc4b43 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -36,15 +36,19 @@ class ReductionOperation(object): equality-comparable. 
""" - def result_dtypes(self, target, arg_dtype, inames): + def result_dtypes(self, target, *arg_dtypes): """ - :arg arg_dtype: may be None if not known + :arg arg_dtypes: may be None if not known :returns: None if not known, otherwise the returned type """ raise NotImplementedError - def neutral_element(self, dtype, inames): + @property + def arg_count(self): + raise NotImplementedError + + def neutral_element(self, *dtypes): raise NotImplementedError def __hash__(self): @@ -55,7 +59,7 @@ class ReductionOperation(object): # Force subclasses to override raise NotImplementedError - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): raise NotImplementedError def __ne__(self, other): @@ -87,7 +91,11 @@ class ScalarReductionOperation(ReductionOperation): """ self.forced_result_type = forced_result_type - def result_dtypes(self, kernel, arg_dtype, inames): + @property + def arg_count(self): + return 1 + + def result_dtypes(self, kernel, arg_dtype): if self.forced_result_type is not None: return (self.parse_result_type( kernel.target, self.forced_result_type),) @@ -114,18 +122,18 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return 0 - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return operand1 + operand2 class ProductReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return 1 - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return operand1 * operand2 @@ -166,32 +174,144 @@ def get_ge_neutral(dtype): class MaxReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return get_ge_neutral(dtype) - def __call__(self, dtype, 
operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return var("max")(operand1, operand2) class MinReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return get_le_neutral(dtype) - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return var("min")(operand1, operand2) +# {{{ segmented reduction + +class _SegmentedScalarReductionOperation(ReductionOperation): + def __init__(self, **kwargs): + self.inner_reduction = self.base_reduction_class(**kwargs) + + @property + def arg_count(self): + return 2 + + def prefix(self, scalar_dtype, segment_flag_dtype): + return "loopy_segmented_%s_%s_%s" % (self.which, + scalar_dtype.numpy_dtype.type.__name__, + segment_flag_dtype.numpy_dtype.type.__name__) + + def neutral_element(self, scalar_dtype, segment_flag_dtype): + return SegmentedFunction(self, (scalar_dtype, segment_flag_dtype), "init")() + + def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): + return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) + + (segment_flag_dtype,)) + + def __str__(self): + return "segmented_" + self.which + + def __hash__(self): + return hash(type(self)) + + def __eq__(self, other): + return type(self) == type(other) + + def __call__(self, dtypes, operand1, operand2): + return SegmentedFunction(self, dtypes, "update")(*(operand1 + operand2)) + + +class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): + base_reduction_class = SumReductionOperation + which = "sum" + op = "((%s) + (%s))" + + +class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): + base_reduction_class = ProductReductionOperation + op = "((%s) * (%s))" + which = "product" + + +class SegmentedFunction(FunctionIdentifier): + init_arg_names = ("reduction_op", "dtypes", "name") + + def __init__(self, reduction_op, dtypes, name): + """ + :arg dtypes: A :class:`tuple` 
of `(scalar_dtype, segment_flag_dtype)` + """ + self.reduction_op = reduction_op + self.dtypes = dtypes + self.name = name + + @property + def scalar_dtype(self): + return self.dtypes[0] + + @property + def segment_flag_dtype(self): + return self.dtypes[1] + + def __getinitargs__(self): + return (self.reduction_op, self.dtypes, self.name) + + +def get_segmented_function_preamble(kernel, func_id): + op = func_id.reduction_op + prefix = op.prefix(func_id.scalar_dtype, func_id.segment_flag_dtype) + + from pymbolic.mapper.c_code import CCodeMapper + + c_code_mapper = CCodeMapper() + + return (prefix, """ + inline %(scalar_t)s %(prefix)s_init(%(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = 0; + return %(neutral)s; + } + + inline %(scalar_t)s %(prefix)s_update( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag2; + return segment_flag2 ? op2 : %(combined)s; + } + """ % dict( + scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), + prefix=prefix, + segment_flag_t=kernel.target.dtype_to_typename( + func_id.segment_flag_dtype), + neutral=c_code_mapper( + op.inner_reduction.neutral_element(func_id.scalar_dtype)), + combined=op.op % ("op1", "op2"), + )) + + +# }}} + + # {{{ argmin/argmax class _ArgExtremumReductionOperation(ReductionOperation): - def prefix(self, dtype): - return "loopy_arg%s_%s" % (self.which, dtype.numpy_dtype.type.__name__) + def prefix(self, scalar_dtype, index_dtype): + return "loopy_arg%s_%s_%s" % (self.which, + index_dtype.numpy_dtype.type.__name__, + scalar_dtype.numpy_dtype.type.__name__) - def result_dtypes(self, kernel, dtype, inames): - return (dtype, kernel.index_dtype) + def result_dtypes(self, kernel, scalar_dtype, index_dtype): + return (scalar_dtype, index_dtype) - def neutral_element(self, dtype, inames): - return ArgExtFunction(self, dtype, "init", inames)() + def 
neutral_element(self, scalar_dtype, index_dtype): + return ArgExtFunction(self, (scalar_dtype, index_dtype), "init")() + + def __str__(self): + return self.which def __hash__(self): return hash(type(self)) @@ -199,11 +319,12 @@ class _ArgExtremumReductionOperation(ReductionOperation): def __eq__(self, other): return type(self) == type(other) - def __call__(self, dtype, operand1, operand2, inames): - iname, = inames + @property + def arg_count(self): + return 2 - return ArgExtFunction(self, dtype, "update", inames)( - *(operand1 + (operand2, var(iname)))) + def __call__(self, dtypes, operand1, operand2): + return ArgExtFunction(self, dtypes, "update")(*(operand1 + operand2)) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -219,21 +340,28 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): class ArgExtFunction(FunctionIdentifier): - init_arg_names = ("reduction_op", "scalar_dtype", "name", "inames") + init_arg_names = ("reduction_op", "dtypes", "name") - def __init__(self, reduction_op, scalar_dtype, name, inames): + def __init__(self, reduction_op, dtypes, name): self.reduction_op = reduction_op - self.scalar_dtype = scalar_dtype + self.dtypes = dtypes self.name = name - self.inames = inames + + @property + def scalar_dtype(self): + return self.dtypes[0] + + @property + def index_dtype(self): + return self.dtypes[1] def __getinitargs__(self): - return (self.reduction_op, self.scalar_dtype, self.name, self.inames) + return (self.reduction_op, self.dtypes, self.name) def get_argext_preamble(kernel, func_id): op = func_id.reduction_op - prefix = op.prefix(func_id.scalar_dtype) + prefix = op.prefix(func_id.scalar_dtype, func_id.index_dtype) from pymbolic.mapper.c_code import CCodeMapper @@ -267,7 +395,7 @@ def get_argext_preamble(kernel, func_id): """ % dict( scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), prefix=prefix, - index_t=kernel.target.dtype_to_typename(kernel.index_dtype), + 
index_t=kernel.target.dtype_to_typename(func_id.index_dtype), neutral=c_code_mapper(neutral(func_id.scalar_dtype)), comp=op.update_comparison, )) @@ -284,6 +412,8 @@ _REDUCTION_OPS = { "min": MinReductionOperation, "argmax": ArgMaxReductionOperation, "argmin": ArgMinReductionOperation, + "segmented_sum": SegmentedSumReductionOperation, + "segmented_product": SegmentedProductReductionOperation, } _REDUCTION_OP_PARSERS = [ @@ -333,9 +463,10 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_init" % op.prefix(func_id.scalar_dtype), + target_name="%s_init" % op.prefix( + func_id.scalar_dtype, func_id.index_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.inames), + kernel, func_id.scalar_dtype, func_id.index_dtype), arg_dtypes=(), ) @@ -348,9 +479,10 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_update" % op.prefix(func_id.scalar_dtype), + target_name="%s_update" % op.prefix( + func_id.scalar_dtype, func_id.index_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.inames), + kernel, func_id.scalar_dtype, func_id.index_dtype), arg_dtypes=( func_id.scalar_dtype, kernel.index_dtype, @@ -358,6 +490,42 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): kernel.index_dtype), ) + elif isinstance(func_id, SegmentedFunction) and func_id.name == "init": + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + op = func_id.reduction_op + + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="%s_init" % op.prefix( + func_id.scalar_dtype, func_id.segment_flag_dtype), + result_dtypes=op.result_dtypes( + kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), + arg_dtypes=(), + ) + + 
elif isinstance(func_id, SegmentedFunction) and func_id.name == "update": + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + op = func_id.reduction_op + + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="%s_update" % op.prefix( + func_id.scalar_dtype, func_id.segment_flag_dtype), + result_dtypes=op.result_dtypes( + kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), + arg_dtypes=( + func_id.scalar_dtype, + func_id.segment_flag_dtype, + func_id.scalar_dtype, + func_id.segment_flag_dtype), + ) + return None @@ -371,4 +539,10 @@ def reduction_preamble_generator(preamble_info): yield get_argext_preamble(preamble_info.kernel, func.name) + elif isinstance(func.name, SegmentedFunction): + if not isinstance(preamble_info.kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + yield get_segmented_function_preamble(preamble_info.kernel, func.name) + # vim: fdm=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index b6463de96..b2875b44e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -98,7 +98,9 @@ def check_reduction_iname_uniqueness(kernel): iname_to_nonsimultaneous_reduction_count = {} def map_reduction(expr, rec): - rec(expr.expr) + for sub_expr in expr.exprs: + rec(sub_expr) + for iname in expr.inames: iname_to_reduction_count[iname] = ( iname_to_reduction_count.get(iname, 0) + 1) @@ -531,6 +533,18 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): if len(coeffs) > 1: raise ValueError("found more than one coeff: %s" % coeffs) + if len(coeffs) == 0: + try: + scan_iname_aff.get_constant_val() + except: + raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff) + + # If this point is reached we're assuming the domain is of the form + # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value + # this function returns will 
be verified later by + # _check_reduction_is_triangular(). + return 1 + if sweep_iname not in coeffs: raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname) @@ -586,14 +600,55 @@ def _create_domain_for_sweep_tracking(orig_domain, # Move tracking_iname into a set dim (NOT sweep iname). subd = subd.move_dims( - dim_type.set, 0, - dim_type.param, subd.dim(dim_type.param) - 1, 1) + dim_type.set, 0, + dim_type.param, subd.dim(dim_type.param) - 1, 1) + + # Simplify (maybe). + orig_domain_with_sweep_param = ( + _get_domain_with_iname_as_param(orig_domain, sweep_iname)) + subd = subd.gist_params(orig_domain_with_sweep_param.params()) subd, = subd.get_basic_sets() return subd +def _strip_if_scalar(reference_exprs, expr): + if len(reference_exprs) == 1: + return expr[0] + else: + return expr + + +def _infer_arg_dtypes_and_reduction_dtypes(kernel, expr, unknown_types_ok): + arg_dtypes = [] + + from loopy.type_inference import TypeInferenceMapper + type_inf_mapper = TypeInferenceMapper(kernel) + import loopy as lp + + for sub_expr in expr.exprs: + try: + arg_dtype = type_inf_mapper(sub_expr) + except DependencyTypeInferenceFailure: + if unknown_types_ok: + arg_dtype = lp.auto + else: + raise LoopyError("failed to determine type of accumulator for " + "reduction sub-expression '%s'" % sub_expr) + else: + arg_dtype = arg_dtype.with_target(kernel.target) + + arg_dtypes.append(arg_dtype) + + reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes) + reduction_dtypes = tuple( + dt.with_target(kernel.target) + if dt is not lp.auto else dt + for dt in reduction_dtypes) + + return tuple(arg_dtypes), reduction_dtypes + # }}} @@ -636,32 +691,25 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Dummy inames to remove after scans have been realized inames_to_remove = set() - from loopy.type_inference import TypeInferenceMapper - type_inf_mapper = TypeInferenceMapper(kernel) - inames_added_for_scan = set() # {{{ sequential - def 
map_reduction_seq(expr, rec, nresults, arg_dtype, + def map_reduction_seq(expr, rec, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) + from loopy.kernel.data import temp_var_scope + acc_var_names = make_temporaries( + name_based_on="acc_"+"_".join(expr.inames), + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + from pymbolic import var - acc_var_names = [ - var_name_gen("acc_"+"_".join(expr.inames)) - for i in range(nresults)] acc_vars = tuple(var(n) for n in acc_var_names) - from loopy.kernel.data import TemporaryVariable, temp_var_scope - - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) - init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) @@ -671,7 +719,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset(), - expression=expr.operation.neutral_element(arg_dtype, expr.inames)) + expression=expr.operation.neutral_element(*arg_dtypes)) generated_insns.append(init_insn) @@ -686,9 +734,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, id=update_id, assignees=acc_vars, expression=expr.operation( - arg_dtype, - acc_vars if len(acc_vars) > 1 else acc_vars[0], - expr.expr, expr.inames), + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, expr.exprs)), depends_on=frozenset([init_insn.id]) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final) @@ -733,7 +781,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtype, + def 
map_reduction_local(expr, rec, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -757,6 +805,24 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _get_int_iname_size(oiname) for oiname in outer_local_inames) + from loopy.kernel.data import temp_var_scope + + neutral_var_names = make_temporaries( + name_based_on="neutral_"+red_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + acc_var_names = make_temporaries( + name_based_on="acc_"+red_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (size,), + dtypes=reduction_dtypes, + scope=temp_var_scope.LOCAL) + + acc_vars = tuple(var(n) for n in acc_var_names) + # {{{ add separate iname to carry out the reduction # Doing this sheds any odd conditionals that may be active @@ -768,31 +834,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # }}} - neutral_var_names = [ - var_name_gen("neutral_"+red_iname) - for i in range(nresults)] - acc_var_names = [ - var_name_gen("acc_"+red_iname) - for i in range(nresults)] - acc_vars = tuple(var(n) for n in acc_var_names) - - from loopy.kernel.data import TemporaryVariable, temp_var_scope - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=outer_local_iname_sizes + (size,), - dtype=dtype, - scope=temp_var_scope.LOCAL) - for name, dtype in zip(neutral_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) - base_iname_deps = outer_insn_inames - frozenset(expr.inames) - neutral = expr.operation.neutral_element(arg_dtype, expr.inames) + neutral = expr.operation.neutral_element(*arg_dtypes) init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) init_insn = make_assignment( @@ -806,12 +850,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, 
depends_on=frozenset()) generated_insns.append(init_insn) - def _strip_if_scalar(c): - if len(acc_vars) == 1: - return c[0] - else: - return c - init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) init_neutral_insn = make_assignment( id=init_neutral_id, @@ -829,9 +867,11 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), expression=expr.operation( - arg_dtype, - _strip_if_scalar(tuple(var(nvn) for nvn in neutral_var_names)), - expr.expr, expr.inames), + arg_dtypes, + _strip_if_scalar( + expr.exprs, + tuple(var(nvn) for nvn in neutral_var_names)), + _strip_if_scalar(expr.exprs, expr.exprs)), within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), @@ -864,17 +904,16 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), expression=expr.operation( - arg_dtype, - _strip_if_scalar(tuple( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( acc_var[ outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars)), - _strip_if_scalar(tuple( + _strip_if_scalar(acc_vars, tuple( acc_var[ outer_local_iname_vars + ( var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars)), - expr.inames), + for acc_var in acc_vars))), within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -899,7 +938,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] # }}} - # {{{ scan utils (stateful) + # {{{ utils (stateful) @memoize def get_or_add_sweep_tracking_iname_and_domain( @@ -919,7 +958,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return tracking_iname, new_domain - def replace_scan_iname_with_tracking_iname(scan_iname, 
track_iname, expr): + def replace_var_within_expr(expr, from_var, to_var): from pymbolic.mapper.substitutor import make_subst_func from loopy.symbolic import ( @@ -931,16 +970,32 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, from pymbolic import var mapper = RuleAwareSubstitutionMapper( rule_mapping_context, - make_subst_func({scan_iname: var(track_iname)}), + make_subst_func({from_var: var(to_var)}), within=lambda *args: True) return mapper(expr, temp_kernel, None) + def make_temporaries(name_based_on, nvars, shape, dtypes, scope): + var_names = [ + var_name_gen(name_based_on.format(index=i)) + for i in range(nvars)] + + from loopy.kernel.data import TemporaryVariable + + for name, dtype in zip(var_names, dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=shape, + dtype=dtype, + scope=scope) + + return var_names + # }}} # {{{ sequential scan - def map_scan_seq(expr, rec, nresults, arg_dtype, + def map_scan_seq(expr, rec, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, offset, stride): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -948,23 +1003,19 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, track_iname, track_iname_domain = ( get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, offset, stride)) + scan_iname, sweep_iname, sweep_min_value, offset, stride)) + + from loopy.kernel.data import temp_var_scope + acc_var_names = make_temporaries( + name_based_on="acc_" + scan_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) from pymbolic import var - acc_var_names = [ - var_name_gen("acc_"+"_".join(expr.inames)) - for i in range(nresults)] acc_vars = tuple(var(n) for n in acc_var_names) - from loopy.kernel.data import TemporaryVariable, temp_var_scope - - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = 
TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) - init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) @@ -975,15 +1026,18 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset(), - expression=expr.operation.neutral_element(arg_dtype, expr.inames)) + expression=expr.operation.neutral_element(*arg_dtypes)) generated_insns.append(init_insn) - updated_inner_expr = replace_scan_iname_with_tracking_iname( - scan_iname, track_iname, expr.expr) + updated_inner_exprs = tuple( + replace_var_within_expr(sub_expr, scan_iname, track_iname) + for sub_expr in expr.exprs) + """ updated_inames = tuple( - (set(expr.inames) - set([scan_iname])) | set([track_iname])) + (set(expr.inames) - set([scan_iname])) | set([track_iname])) + """ update_id = insn_id_gen( based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) @@ -996,9 +1050,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, id=update_id, assignees=acc_vars, expression=expr.operation( - arg_dtype, - acc_vars if len(acc_vars) > 1 else acc_vars[0], - updated_inner_expr, updated_inames), + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs)), depends_on=frozenset([init_insn.id]) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final) @@ -1017,7 +1071,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ local-parallel scan - def map_scan_local(expr, rec, nresults, arg_dtype, + def map_scan_local(expr, rec, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, offset, stride): @@ -1059,36 +1113,39 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # }}} - acc_var_names = [ - var_name_gen("acc_"+scan_iname) - for i in 
range(nresults)] - acc_vars = tuple(var(n) for n in acc_var_names) - - read_var_names = [ - var_name_gen("read_"+scan_iname) - for i in range(nresults)] + from loopy.kernel.data import temp_var_scope + + """ + neutral_var_names = make_temporaries( + name_based_on="neutral_"+scan_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + """ + + read_var_names = make_temporaries( + name_based_on="read_"+scan_iname+"_arg_{index}", + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + acc_var_names = make_temporaries( + name_based_on="acc_"+scan_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (size,), + dtypes=reduction_dtypes, + scope=temp_var_scope.LOCAL) + acc_vars = tuple(var(n) for n in acc_var_names) read_vars = tuple(var(n) for n in read_var_names) - - from loopy.kernel.data import TemporaryVariable, temp_var_scope - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=outer_local_iname_sizes + (size,), - dtype=dtype, - scope=temp_var_scope.LOCAL) - - for name, dtype in zip(read_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) + #neutral_vars = tuple(var(n) for n in neutral_var_names) base_iname_deps = (outer_insn_inames - frozenset(expr.inames) - frozenset([sweep_iname])) - neutral = expr.operation.neutral_element(arg_dtype, expr.inames) + neutral = expr.operation.neutral_element(*arg_dtypes) init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) init_insn = make_assignment( @@ -1102,21 +1159,22 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, depends_on=frozenset()) generated_insns.append(init_insn) - # TODO: make a function.. 
- - from pymbolic.mapper.substitutor import make_subst_func - - from loopy.symbolic import ( - SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) - - rule_mapping_context = SubstitutionRuleMappingContext( - temp_kernel.substitutions, var_name_gen) + """ + # XXX: Is this needed? + init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) + init_neutral_insn = make_assignment( + id=init_neutral_id, + assignees=tuple(var(nvn) for nvn in neutral_var_names), + expression=neutral, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset()) + generated_insns.append(init_neutral_insn) + """ - from pymbolic import var - mapper = RuleAwareSubstitutionMapper( - rule_mapping_context, - make_subst_func({red_iname: var(track_iname)}), - within=lambda *args: True) + updated_inner_exprs = tuple( + replace_var_within_expr(sub_expr, scan_iname, track_iname) + for sub_expr in expr.exprs) from loopy.symbolic import Reduction @@ -1131,7 +1189,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=Reduction( operation=expr.operation, inames=(track_iname,), - expr=mapper(expr.expr, temp_kernel, None), + exprs=updated_inner_exprs, allow_simultaneous=False, ), within_inames=outer_insn_inames - frozenset(expr.inames), @@ -1163,23 +1221,24 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] - read_stage_id = insn_id_gen( - "scan_%s_read_stage_%d" % (red_iname, istage)) - read_stage_insn = make_assignment( - id=read_stage_id, - assignees=read_vars, - expression=_strip_if_scalar([ - acc_var[ - outer_local_iname_vars + ( - var(stage_exec_iname) - cur_size,)] - for acc_var in acc_vars]), - within_inames=( - base_iname_deps | frozenset([stage_exec_iname])), - 
within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id])) - - generated_insns.append(read_stage_insn) - prev_id = read_stage_id + for read_var, acc_var in zip(read_vars, acc_vars): + read_stage_id = insn_id_gen( + "scan_%s_read_stage_%d" % (red_iname, istage)) + + read_stage_insn = make_assignment( + id=read_stage_id, + assignees=(read_var,), + expression=( + acc_var[ + outer_local_iname_vars + + (var(stage_exec_iname) - cur_size,)]), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id])) + + generated_insns.append(read_stage_insn) + prev_id = read_stage_id write_stage_id = insn_id_gen( "scan_%s_write_stage_%d" % (red_iname, istage)) @@ -1189,13 +1248,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), expression=expr.operation( - arg_dtype, - _strip_if_scalar([ + arg_dtypes, + _strip_if_scalar(tuple( acc_var[ outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars]), - _strip_if_scalar(read_vars), - expr.inames), + for acc_var in acc_vars)), + _strip_if_scalar(read_vars) + ), within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1236,24 +1295,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
- try: - arg_dtype = type_inf_mapper(expr.expr) - except DependencyTypeInferenceFailure: - if unknown_types_ok: - arg_dtype = lp.auto - - reduction_dtypes = (lp.auto,)*nresults - - else: - raise LoopyError("failed to determine type of accumulator for " - "reduction '%s'" % expr) - else: - arg_dtype = arg_dtype.with_target(kernel.target) - - reduction_dtypes = expr.operation.result_dtypes( - kernel, arg_dtype, expr.inames) - reduction_dtypes = tuple( - dt.with_target(kernel.target) for dt in reduction_dtypes) + arg_dtypes, reduction_dtypes = ( + _infer_arg_dtypes_and_reduction_dtypes( + temp_kernel, expr, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1290,7 +1334,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: # Ensures the reduction is triangular (somewhat expensive). may_be_implemented_as_scan, error = ( - _check_reduction_is_triangular(kernel, expr, scan_param)) + _check_reduction_is_triangular( + temp_kernel, expr, scan_param)) if not may_be_implemented_as_scan: _error_if_force_scan_on(ReductionIsNotTriangularError, error) @@ -1326,6 +1371,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, "Empty reduction found (no inames to reduce over). " "Eliminating.") + # FIXME: return neutral element... + return expr.expr # }}} @@ -1351,14 +1398,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, "- the only parallelism allowed is 'local'." 
% (sweep_iname, sweep_class.nonlocal_parallel[0])) elif parallel: + print(temp_kernel) return map_scan_local( - expr, rec, nresults, arg_dtype, reduction_dtypes, + expr, rec, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.offset, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, nresults, arg_dtype, reduction_dtypes, + expr, rec, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.offset, scan_param.stride) @@ -1378,11 +1426,11 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if n_sequential: assert n_local_par == 0 return map_reduction_seq( - expr, rec, nresults, arg_dtype, reduction_dtypes) + expr, rec, nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, nresults, arg_dtype, reduction_dtypes) + expr, rec, nresults, arg_dtypes, reduction_dtypes) # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 50c891be4..bbf798e5e 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -95,7 +95,8 @@ class IdentityMapperMixin(object): new_inames.append(new_sym_iname.name) return Reduction( - expr.operation, tuple(new_inames), self.rec(expr.expr, *args), + expr.operation, tuple(new_inames), + tuple(self.rec(e, *args) for e in expr.exprs), allow_simultaneous=expr.allow_simultaneous) def map_tagged_variable(self, expr, *args): @@ -144,7 +145,8 @@ class WalkMapper(WalkMapperBase): if not self.visit(expr): return - self.rec(expr.expr, *args) + for sub_expr in expr.exprs: + self.rec(sub_expr, *args) map_tagged_variable = WalkMapperBase.map_variable @@ -162,7 +164,7 @@ class CallbackMapper(CallbackMapperBase, IdentityMapper): class CombineMapper(CombineMapperBase): def map_reduction(self, expr): - return self.rec(expr.expr) + return self.combine(self.rec(sub_expr) for sub_expr in expr.exprs) map_linear_subscript = 
CombineMapperBase.map_subscript @@ -192,9 +194,11 @@ class StringifyMapper(StringifyMapperBase): return "loc.%d" % expr.index def map_reduction(self, expr, prec): + from pymbolic.mapper.stringifier import PREC_NONE return "%sreduce(%s, [%s], %s)" % ( "simul_" if expr.allow_simultaneous else "", - expr.operation, ", ".join(expr.inames), expr.expr) + expr.operation, ", ".join(expr.inames), + ", ".join(self.rec(e, PREC_NONE) for e in expr.exprs)) def map_tagged_variable(self, expr, prec): return "%s$%s" % (expr.name, expr.tag) @@ -225,7 +229,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): ): return [] - return self.rec(expr.expr, other.expr, unis) + return self.rec(expr.exprs, other.exprs, unis) def map_tagged_variable(self, expr, other, urecs): new_uni_record = self.unification_record_from_equation( @@ -258,7 +262,7 @@ class DependencyMapper(DependencyMapperBase): self.rec(child, *args) for child in expr.parameters) def map_reduction(self, expr): - return (self.rec(expr.expr) + return (self.combine(self.rec(sub_expr) for sub_expr in expr.exprs) - set(p.Variable(iname) for iname in expr.inames)) def map_tagged_variable(self, expr): @@ -440,10 +444,10 @@ class Reduction(p.Expression): a list of inames across which reduction on :attr:`expr` is being carried out. - .. attribute:: expr + .. attribute:: exprs - The expression (as a :class:`pymbolic.primitives.Expression`) - on which reduction is performed. + A :class:`tuple` of :class:`pymbolic.primitives.Expression`, + representing the expression(s) over which reduction is performed. .. attribute:: allow_simultaneous @@ -451,9 +455,9 @@ class Reduction(p.Expression): in precisely one reduction, to avoid mis-nesting errors. 
""" - init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") + init_arg_names = ("operation", "inames", "exprs", "allow_simultaneous") - def __init__(self, operation, inames, expr, allow_simultaneous=False): + def __init__(self, operation, inames, exprs, allow_simultaneous=False): if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -475,26 +479,28 @@ class Reduction(p.Expression): from loopy.library.reduction import parse_reduction_op operation = parse_reduction_op(operation) + if not isinstance(exprs, tuple): + exprs = (exprs,) + from loopy.library.reduction import ReductionOperation assert isinstance(operation, ReductionOperation) self.operation = operation self.inames = inames - self.expr = expr + self.exprs = exprs self.allow_simultaneous = allow_simultaneous def __getinitargs__(self): - return (self.operation, self.inames, self.expr, self.allow_simultaneous) + return (self.operation, self.inames, self.exprs, self.allow_simultaneous) def get_hash(self): - return hash((self.__class__, self.operation, self.inames, - self.expr)) + return hash((self.__class__, self.operation, self.inames, self.exprs)) def is_equal(self, other): return (other.__class__ == self.__class__ and other.operation == self.operation and other.inames == self.inames - and other.expr == self.expr) + and other.exprs == self.exprs) def stringifier(self): return StringifyMapper @@ -924,7 +930,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): turns those into the actual pymbolic primitives used for that. 
""" - def _parse_reduction(self, operation, inames, red_expr, + def _parse_reduction(self, operation, inames, red_exprs, allow_simultaneous=False): if isinstance(inames, p.Variable): inames = (inames,) @@ -941,7 +947,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): processed_inames.append(iname.name) - return Reduction(operation, tuple(processed_inames), red_expr, + return Reduction(operation, tuple(processed_inames), red_exprs, allow_simultaneous=allow_simultaneous) def map_call(self, expr): @@ -991,12 +997,17 @@ class FunctionToPrimitiveMapper(IdentityMapper): operation = parse_reduction_op(name) if operation: - if len(expr.parameters) != 2: + # arg_count counts arguments but not inames + if len(expr.parameters) != 1 + operation.arg_count: raise RuntimeError("invalid invocation of " - "reduction operation '%s'" % expr.function.name) + "reduction operation '%s': expected %d arguments, " + "got %d instead" % (expr.function.name, + 1 + operation.arg_count, + len(expr.parameters))) - inames, red_expr = expr.parameters - return self._parse_reduction(operation, inames, self.rec(red_expr)) + inames = expr.parameters[0] + red_exprs = tuple(self.rec(param) for param in expr.parameters[1:]) + return self._parse_reduction(operation, inames, red_exprs) else: return IdentityMapper.map_call(self, expr) @@ -1385,7 +1396,7 @@ class IndexVariableFinder(CombineMapper): return result def map_reduction(self, expr): - result = self.rec(expr.expr) + result = self.combine(self.rec(sub_expr) for sub_expr in expr.exprs) if not (expr.inames_set & result): raise RuntimeError("reduction '%s' does not depend on " diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 575311b11..4014b8575 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -683,7 +683,8 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule -def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None): +def 
reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None, + arg_number=0): if isinstance(inames, str): inames = [s.strip() for s in inames.split(",")] @@ -715,15 +716,24 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No substs[my_subst_rule_name] = SubstitutionRule( name=my_subst_rule_name, arguments=tuple(inames), - expression=expr.expr) + expression=expr.exprs[arg_number]) from pymbolic import var iname_vars = [var(iname) for iname in inames] + new_exprs = [] + for sub_expr_number, sub_expr in enumerate(expr.exprs): + if sub_expr_number == arg_number: + new_exprs.append(var(my_subst_rule_name)(*iname_vars)) + else: + new_exprs.append(sub_expr) + + new_exprs = tuple(new_exprs) + return type(expr)( operation=expr.operation, inames=expr.inames, - expr=var(my_subst_rule_name)(*iname_vars), + exprs=new_exprs, allow_simultaneous=expr.allow_simultaneous) from loopy.symbolic import ReductionCallbackMapper diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index c35b50643..300faa638 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -145,7 +145,8 @@ class _InameSplitter(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec(expr.expr, expn_state), + tuple(self.rec(sub_expr, expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous) else: return super(_InameSplitter, self).map_reduction(expr, expn_state) @@ -1191,13 +1192,15 @@ class _ReductionSplitter(RuleAwareIdentityMapper): if self.direction == "in": return Reduction(expr.operation, tuple(leftover_inames), Reduction(expr.operation, tuple(self.inames), - self.rec(expr.expr, expn_state), + tuple(self.rec(sub_expr, expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous), expr.allow_simultaneous) elif self.direction == "out": return Reduction(expr.operation, tuple(self.inames), Reduction(expr.operation, tuple(leftover_inames), - 
self.rec(expr.expr, expn_state), + tuple(self.rec(sub_expr, expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous)) else: assert False @@ -1589,10 +1592,11 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec( - SubstitutionMapper(make_subst_func(subst_dict))( - expr.expr), - expn_state), + tuple(self.rec( + SubstitutionMapper(make_subst_func(subst_dict))( + sub_expr), + expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous) else: return super(_ReductionInameUniquifier, self).map_reduction( diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 4c1e423e9..18f34cf17 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -352,28 +352,23 @@ class TypeInferenceMapper(CombineMapper): return [self.kernel.index_dtype] def map_reduction(self, expr, return_tuple=False): - rec_result = self.rec(expr.expr) + rec_results = tuple(self.rec(sub_expr) for sub_expr in expr.exprs) - if rec_result: - rec_result, = rec_result - result = expr.operation.result_dtypes( - self.kernel, rec_result, expr.inames) - else: - result = expr.operation.result_dtypes( - self.kernel, None, expr.inames) + result = expr.operation.result_dtypes(self.kernel, *rec_results) if result is None: return [] if return_tuple: - return [result] + from itertools import product + return list(product(*result)) else: if len(result) != 1 and not return_tuple: raise LoopyError("reductions with more or fewer than one " "return value may only be used in direct assignments") - return [result[0]] + return result[0] # FIXME: wtf is going on here # }}} @@ -381,6 +376,7 @@ class TypeInferenceMapper(CombineMapper): # {{{ infer single variable def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + print(kernel) if var_name in kernel.all_params(): return [kernel.index_dtype], [] @@ -429,6 +425,7 @@ def _infer_var_type(kernel, var_name, 
type_inf_mapper, subst_expander): if not dtype_sets: return None, type_inf_mapper.symbols_with_unknown_types + print("got dtype sets", dtype_sets) result = type_inf_mapper.combine(dtype_sets) return result, type_inf_mapper.symbols_with_unknown_types diff --git a/test/test_reduction.py b/test/test_reduction.py index 5887df7a6..290f3d483 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -297,7 +297,7 @@ def test_argmax(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i<%d}" % n, """ - max_val, max_idx = argmax(i, fabs(a[i])) + max_val, max_idx = argmax(i, fabs(a[i]), i) """) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) @@ -397,7 +397,7 @@ def test_parallel_multi_output_reduction(): knl = lp.make_kernel( "{[i]: 0<=i<128}", """ - max_val, max_indices = argmax(i, fabs(a[i])) + max_val, max_indices = argmax(i, fabs(a[i]), i) """) knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.realize_reduction(knl) diff --git a/test/test_scan.py b/test/test_scan.py index 3200e8c56..bcce9f34d 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -58,24 +58,28 @@ __all__ = [ # - scan(a) + scan(b) # - global parallel scan # - segmented scan +# - base_exec_iname different bounds from sweep iname + +# TO DO: +# segmented(...) syntax @pytest.mark.parametrize("n", [1, 2, 3, 16]) -def test_sequential_scan(ctx_factory, n): +@pytest.mark.parametrize("stride", [1, 2]) +def test_sequential_scan(ctx_factory, n, stride): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "[n] -> {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i {[i,j]: 0<=i_ = segmented_sum(j, arr[j], segflag[j])", + [ + lp.GlobalArg("arr", np.float32, shape=("n",)), + lp.GlobalArg("segflag", np.int32, shape=("n",)), + "..." 
+ ]) + + knl = lp.fix_parameters(knl, n=n) + knl = lp.tag_inames(knl, dict(i=iname_tag)) + knl = lp.realize_reduction(knl, force_scan=True) + + (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries) + + class SegmentGrouper(object): + + def __init__(self): + self.seg_idx = 0 + self.idx = 0 + + def __call__(self, key): + if self.idx in segment_boundaries_indices: + self.seg_idx += 1 + self.idx += 1 + return self.seg_idx + + from itertools import groupby + + expected = [np.cumsum(list(group)) + for _, group in groupby(arr, SegmentGrouper())] + actual = [np.array(list(group)) + for _, group in groupby(out, SegmentGrouper())] + + assert len(expected) == len(actual) == len(segment_boundaries_indices) + assert [(e == a).all() for e, a in zip(expected, actual)] + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 05e76e39f9bf1480999b87553a8f61bdfb3ae167 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 1 Mar 2017 01:32:20 -0600 Subject: [PATCH 04/27] Bump kernel version for reduction library changes. --- loopy/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/version.py b/loopy/version.py index 5c6ad47f8..77d0e21bd 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v59-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v60-islpy%s" % _islpy_version -- GitLab From 4e963c04c114bb3752466bbbb1e279be2cccec30 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 1 Mar 2017 12:24:31 -0600 Subject: [PATCH 05/27] Type inference fixes. 
--- loopy/type_inference.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 18f34cf17..da7a5b52d 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -352,23 +352,30 @@ class TypeInferenceMapper(CombineMapper): return [self.kernel.index_dtype] def map_reduction(self, expr, return_tuple=False): + """ + :arg return_tuple: If *True*, treat the type of the reduction expression + as a tuple type. Otherwise, the number of expressions being reduced over + must equal 1, and the type of the first expression is returned. + """ rec_results = tuple(self.rec(sub_expr) for sub_expr in expr.exprs) - result = expr.operation.result_dtypes(self.kernel, *rec_results) - - if result is None: + if any(len(rec_result) == 0 for rec_result in rec_results): return [] if return_tuple: from itertools import product - return list(product(*result)) + return list( + expr.operation.result_dtypes(self.kernel, *product_element) + for product_element in product(*rec_results)) else: - if len(result) != 1 and not return_tuple: + if len(rec_results) != 1: raise LoopyError("reductions with more or fewer than one " "return value may only be used in direct assignments") - return result[0] # FIXME: wtf is going on here + return list( + expr.operation.result_dtypes(self.kernel, rec_result)[0] + for rec_result in rec_results[0]) # }}} @@ -376,7 +383,6 @@ class TypeInferenceMapper(CombineMapper): # {{{ infer single variable def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): - print(kernel) if var_name in kernel.all_params(): return [kernel.index_dtype], [] @@ -425,7 +431,6 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if not dtype_sets: return None, type_inf_mapper.symbols_with_unknown_types - print("got dtype sets", dtype_sets) result = type_inf_mapper.combine(dtype_sets) return result, type_inf_mapper.symbols_with_unknown_types -- GitLab From 
9144e93ee82efb8033f57d5d41938585aec8f2f1 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 1 Mar 2017 12:24:41 -0600 Subject: [PATCH 06/27] Fix test. --- test/test_loopy.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 851a7f076..5e4d013b3 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1987,19 +1987,28 @@ def test_integer_reduction(ctx_factory): dtype=to_loopy_type(vtype), shape=lp.auto) - reductions = [('max', lambda x: x == np.max(var_int)), - ('min', lambda x: x == np.min(var_int)), - ('sum', lambda x: x == np.sum(var_int)), - ('product', lambda x: x == np.prod(var_int)), - ('argmax', lambda x: (x[0] == np.max(var_int) and - var_int[out[1]] == np.max(var_int))), - ('argmin', lambda x: (x[0] == np.min(var_int) and - var_int[out[1]] == np.min(var_int)))] - - for reduction, function in reductions: + from collections import namedtuple + ReductionTest = namedtuple('ReductionTest', 'kind, check, args') + + reductions = [ + ReductionTest('max', lambda x: x == np.max(var_int), args='var[k]'), + ReductionTest('min', lambda x: x == np.min(var_int), args='var[k]'), + ReductionTest('sum', lambda x: x == np.sum(var_int), args='var[k]'), + ReductionTest('product', lambda x: x == np.prod(var_int), args='var[k]'), + ReductionTest('argmax', + lambda x: ( + x[0] == np.max(var_int) and var_int[out[1]] == np.max(var_int)), + args='var[k], k'), + ReductionTest('argmin', + lambda x: ( + x[0] == np.min(var_int) and var_int[out[1]] == np.min(var_int)), + args='var[k], k') + ] + + for reduction, function, args in reductions: kstr = ("out" if 'arg' not in reduction else "out[0], out[1]") - kstr += ' = {0}(k, var[k])'.format(reduction) + kstr += ' = {0}(k, {1})'.format(reduction, args) knl = lp.make_kernel('{[k]: 0<=k Date: Wed, 1 Mar 2017 18:12:15 -0600 Subject: [PATCH 07/27] Reduction: Fix multi return value scoping for OpenCL (see also: #34). 
--- loopy/preprocess.py | 181 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index b2875b44e..ec3ed99f7 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -649,6 +649,183 @@ def _infer_arg_dtypes_and_reduction_dtypes(kernel, expr, unknown_types_ok): return tuple(arg_dtypes), reduction_dtypes + +def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): + """ + Multi assignment function calls are currently lowered into OpenCL so that + the function call: + + a, b = segmented_sum(x, y, z, w) + + becomes + + a = segmented_sum_mangled(x, y, z, w, &b). + + For OpenCL, the scope of "b" is significant, and the preamble generation + currently assumes the scope is always private. This function forces that to + be the case by introducing temporary assignments into the kernel. + """ + + insn_id_gen = kernel.get_instruction_id_generator() + var_name_gen = kernel.get_var_name_generator() + + new_or_updated_instructions = {} + new_temporaries = {} + + dep_map = dict( + (insn.id, insn.depends_on) for insn in kernel.instructions) + + inverse_dep_map = dict((insn.id, set()) for insn in kernel.instructions) + + import six + for insn_id, deps in six.iteritems(dep_map): + for dep in deps: + inverse_dep_map[dep].add(insn_id) + + del dep_map + + # {{{ utils + + def _add_to_no_sync_with(insn_id, new_no_sync_with_params): + insn = kernel.id_to_insn.get(insn_id) + insn = new_or_updated_instructions.get(insn_id, insn) + new_or_updated_instructions[insn_id] = ( + insn.copy( + no_sync_with=( + insn.no_sync_with | frozenset(new_no_sync_with_params)))) + + def _add_to_depends_on(insn_id, new_depends_on_params): + insn = kernel.id_to_insn.get(insn_id) + insn = new_or_updated_instructions.get(insn_id, insn) + new_or_updated_instructions[insn_id] = ( + insn.copy( + depends_on=insn.depends_on | frozenset(new_depends_on_params))) + + # }}} + + from loopy.kernel.instruction import 
CallInstruction + for insn in kernel.instructions: + if not isinstance(insn, CallInstruction): + continue + + if len(insn.assignees) <= 1: + continue + + assignees = insn.assignees + assignee_var_names = insn.assignee_var_names() + + new_assignees = [assignees[0]] + newly_added_assignments_ids = set() + needs_replacement = False + + last_added_insn_id = insn.id + + from loopy.kernel.data import temp_var_scope, TemporaryVariable + + # The first assignee is not passed by pointer, so we start + # by looking at the second assignee. + for assignee_nr, assignee_var_name, assignee in zip( + range(1, len(assignees)), + assignee_var_names[1:], + assignees[1:]): + + if ( + assignee_var_name in kernel.temporary_variables + and + (kernel.temporary_variables[assignee_var_name].scope + == temp_var_scope.PRIVATE)): + new_assignees.append(assignee) + continue + + needs_replacement = True + + # {{{ generate a new assignent instruction + + new_assignee_name = var_name_gen( + "{insn_id}_retval_{assignee_nr}" + .format(insn_id=insn.id, assignee_nr=assignee_nr)) + + new_assignment_id = insn_id_gen( + "{insn_id}_assign_retval_{assignee_nr}" + .format(insn_id=insn.id, assignee_nr=assignee_nr)) + + newly_added_assignments_ids.add(new_assignment_id) + + import loopy as lp + new_temporaries[new_assignee_name] = ( + TemporaryVariable( + name=new_assignee_name, + dtype=lp.auto, + scope=temp_var_scope.PRIVATE)) + + from pymbolic import var + new_assignee = var(new_assignee_name) + new_assignees.append(new_assignee) + + new_or_updated_instructions[new_assignment_id] = ( + make_assignment( + assignees=(assignee,), + expression=new_assignee, + id=new_assignment_id, + depends_on=frozenset([last_added_insn_id]), + depends_on_is_final=True, + no_sync_with=insn.no_sync_with, + predicates=insn.predicates, + within_inames=insn.within_inames)) + + last_added_insn_id = new_assignment_id + + # }}} + + if not needs_replacement: + continue + + # {{{ update originating instruction + + orig_insn = 
new_or_updated_instructions.get(insn.id, insn) + + new_or_updated_instructions[insn.id] = ( + orig_insn.copy(assignees=tuple(new_assignees))) + + _add_to_no_sync_with(insn.id, + [(id, "any") for id in newly_added_assignments_ids]) + + # }}} + + # {{{ squash spurious memory dependencies amongst new assignments + + for new_insn_id in newly_added_assignments_ids: + _add_to_no_sync_with(new_insn_id, + [(id, "any") for id in newly_added_assignments_ids]) + + # }}} + + # {{{ update instructions that depend on the originating instruction + + for inverse_dep in inverse_dep_map[insn.id]: + _add_to_depends_on(inverse_dep, newly_added_assignments_ids) + + for insn_id, scope in ( + new_or_updated_instructions[inverse_dep].no_sync_with): + if insn_id == insn.id: + _add_to_no_sync_with( + inverse_dep, + [(id, scope) for id in newly_added_assignments_ids]) + + # }}} + + new_temporary_variables = kernel.temporary_variables.copy() + new_temporary_variables.update(new_temporaries) + + new_instructions = ( + list(new_or_updated_instructions.values()) + + list(insn + for insn in kernel.instructions + if insn.id not in new_or_updated_instructions)) + + return kernel.copy(temporary_variables=new_temporary_variables, + instructions=new_instructions) + # }}} @@ -1539,6 +1716,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # TODO: remove unused inames... + kernel = ( + _hackily_ensure_multi_assignment_return_values_are_scoped_private( + kernel)) + return kernel # }}} -- GitLab From 8171a33336d077eb96b2a3eada9c1de1d5a912b6 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 2 Mar 2017 10:54:01 -0600 Subject: [PATCH 08/27] Private variable hack: Don't include self in newly added assignments. 
--- loopy/preprocess.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ec3ed99f7..87f58501f 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -769,7 +769,8 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): id=new_assignment_id, depends_on=frozenset([last_added_insn_id]), depends_on_is_final=True, - no_sync_with=insn.no_sync_with, + no_sync_with=( + insn.no_sync_with | frozenset([(insn.id, "any")])), predicates=insn.predicates, within_inames=insn.within_inames)) @@ -796,7 +797,9 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): for new_insn_id in newly_added_assignments_ids: _add_to_no_sync_with(new_insn_id, - [(id, "any") for id in newly_added_assignments_ids]) + [(id, "any") + for id in newly_added_assignments_ids + if id != new_insn_id]) # }}} -- GitLab From 29267fe50c014192bc06193ba4c0317319fdf1b9 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 2 Mar 2017 10:54:50 -0600 Subject: [PATCH 09/27] Get more scan tests to pass. 
--- loopy/preprocess.py | 12 ++---------- test/test_scan.py | 29 +++++++++++++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 87f58501f..a052dc075 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1384,18 +1384,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: return c - scan_size = 1 - while scan_size < size: - scan_size *= 2 - + scan_size = size prev_id = transfer_id istage = 0 cur_size = 1 - while cur_size != scan_size: - #new_size = cur_size // 2 - #assert new_size * 2 == cur_size - + while cur_size < scan_size: stage_exec_iname = var_name_gen("scan_%s_s%d" % (red_iname, istage)) domains.append( _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) @@ -1450,8 +1444,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, istage += 1 new_insn_add_depends_on.add(prev_id) - new_insn_add_no_sync_with.add((prev_id, "any")) - #output_iname = var_name_gen("scan_%s_output" % red_iname) #domains.append(_make_slab_set(output_iname, scan_size)) #new_iname_tags[output_iname] = kernel.iname_to_tag[sweep_iname] diff --git a/test/test_scan.py b/test/test_scan.py index bcce9f34d..e8ae00f90 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -57,7 +57,6 @@ __all__ = [ # - nested sequential/parallel scan # - scan(a) + scan(b) # - global parallel scan -# - segmented scan # - base_exec_iname different bounds from sweep iname # TO DO: @@ -182,19 +181,29 @@ def test_local_parallel_scan(ctx_factory, n): assert (a == np.cumsum(np.arange(16)**2)).all() -""" @pytest.mark.parametrize("sweep_iname_tag", ["for", "l.1"]) def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "[n] -> {[i,j,k]: 0<=i {[i,j]: 0<=i Date: Thu, 2 Mar 2017 15:14:23 -0600 Subject: [PATCH 10/27] Lower bound fixes for scan, more tests. 
--- loopy/preprocess.py | 96 +++++++++++++++++++++--------------------- test/test_reduction.py | 2 +- test/test_scan.py | 51 ++++++++++++++++++---- 3 files changed, 91 insertions(+), 58 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index a052dc075..5c9a27805 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -338,9 +338,9 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): <= sweep_iname <= sweep_max_value) and - (sweep_min_value + offset + (scan_min_value <= scan_iname - <= stride * sweep_iname + offset) + <= stride * (sweep_iname - sweep_min_value) + scan_min_value) """ dim_type = isl.dim_type @@ -362,10 +362,11 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): tri_domain &= affs[sweep_iname].le_set(scan_param.sweep_upper_bound) # Add scan iname constraints - offset = scan_param.offset - tri_domain &= affs[scan_iname].ge_set(scan_param.sweep_lower_bound + offset) + scan_min_value = scan_param.scan_lower_bound + tri_domain &= affs[scan_iname].ge_set(scan_min_value) tri_domain &= affs[scan_iname].le_set( - scan_param.stride * affs[sweep_iname] + offset) + scan_param.stride * (affs[sweep_iname] - scan_param.sweep_lower_bound) + + scan_min_value) # Gist against domain params tri_domain = tri_domain.gist(domain.params()) @@ -379,7 +380,8 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): if domain != tri_domain: # FIXME: Return a more descriptive error message. 
- return False, "domains are not equal" + return False, ( + "domains are not equal: expected '%s', got '%s'" % (tri_domain, domain)) else: return True, "ok" @@ -387,7 +389,7 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): _ScanCandidateParameters = namedtuple( "_ScanCandidateParameters", "sweep_iname, scan_iname, sweep_lower_bound, " - "sweep_upper_bound, offset, stride") + "sweep_upper_bound, scan_lower_bound, stride") def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): @@ -414,7 +416,7 @@ def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): raise ValueError("Couldn't determine a sweep iname for the scan: %s" % v) try: - sweep_lower_bound, sweep_upper_bound, offset = ( + sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname)) except Exception as e: raise ValueError("Couldn't determine bounds for scan: %s" % e) @@ -426,7 +428,7 @@ def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): raise ValueError("Couldn't determine a scan stride: %s" % v) return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound, - sweep_upper_bound, offset, stride) + sweep_upper_bound, scan_lower_bound, stride) def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): @@ -482,11 +484,10 @@ def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname): sweep_bounds = kernel.get_iname_bounds(sweep_iname) scan_bounds = kernel.get_iname_bounds(scan_iname) - scan_offset = scan_bounds.lower_bound_pw_aff - sweep_bounds.lower_bound_pw_aff return (sweep_bounds.lower_bound_pw_aff, sweep_bounds.upper_bound_pw_aff, - scan_offset) + scan_bounds.lower_bound_pw_aff) def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): @@ -508,7 +509,6 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): scan_iname_range = ( 
domain_with_sweep_param.dim_max(scan_iname_idx) - domain_with_sweep_param.dim_min(scan_iname_idx) - - sweep_lower_bound ).gist(domain_with_sweep_param.params()) scan_iname_pieces = scan_iname_range.get_pieces() @@ -530,9 +530,6 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param) - if len(coeffs) > 1: - raise ValueError("found more than one coeff: %s" % coeffs) - if len(coeffs) == 0: try: scan_iname_aff.get_constant_val() @@ -575,7 +572,7 @@ def _get_domain_with_iname_as_param(domain, iname): def _create_domain_for_sweep_tracking(orig_domain, - tracking_iname, sweep_iname, sweep_min_value, offset, stride): + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride): dim_type = isl.dim_type subd = isl.BasicSet.universe(orig_domain.params().space) @@ -586,17 +583,26 @@ def _create_domain_for_sweep_tracking(orig_domain, # Here we realize the domain: # - # [params, sweep_iname] -> { - # [tracking_iname]: - # offset + stride * (sweep_iname - 1) < tracking_iname - # and tracking_iname <= stride * sweep_iname + offset - # and min_value + offset <= tracking_iname } + # [..., i] -> { + # [j]: 0 <= j - l + # and + # j - l <= k * (i - m) + # and + # k * (i - m - 1) < j - l } + # where + # * i is the sweep iname + # * j is the tracking iname + # * k is the stride for the scan + # * l is the lower bound for the scan + # * m is the lower bound for the sweep iname # affs = isl.affs_from_space(subd.space) - subd &= affs[tracking_iname].gt_set(stride * affs[sweep_iname] - stride + offset) - subd &= affs[tracking_iname].le_set(stride * affs[sweep_iname] + offset) - subd &= affs[tracking_iname].ge_set(sweep_min_value + offset) + subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0]) + subd &= (affs[tracking_iname] - scan_min_value)\ + .le_set(stride * (affs[sweep_iname] - sweep_min_value)) + subd &= (affs[tracking_iname] - scan_min_value)\ + .gt_set(stride * 
(affs[sweep_iname] - sweep_min_value - 1)) # Move tracking_iname into a set dim (NOT sweep iname). subd = subd.move_dims( @@ -1122,7 +1128,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, @memoize def get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, offset, stride): + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride): domain = kernel.get_inames_domain((scan_iname, sweep_iname)) tracking_iname = var_name_gen( @@ -1132,7 +1138,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, inames_added_for_scan.add(tracking_iname) new_domain = _create_domain_for_sweep_tracking(domain, - tracking_iname, sweep_iname, sweep_min_value, offset, stride) + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) domains.append(new_domain) @@ -1176,14 +1182,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential scan def map_scan_seq(expr, rec, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, offset, - stride): + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): outer_insn_inames = temp_kernel.insn_inames(insn) inames_to_remove.add(scan_iname) track_iname, track_iname_domain = ( get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, offset, stride)) + scan_iname, sweep_iname, sweep_min_value, scan_min_value, + stride)) from loopy.kernel.data import temp_var_scope acc_var_names = make_temporaries( @@ -1253,7 +1260,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, def map_scan_local(expr, rec, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, - sweep_min_value, offset, stride): + sweep_min_value, scan_min_value, stride): # TODO: rename red_iname = scan_iname @@ -1280,7 +1287,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for oiname in 
outer_local_inames) track_iname, track_iname_domain = get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, offset, stride) + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride) # {{{ add separate iname to carry out the scan @@ -1339,32 +1346,21 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, depends_on=frozenset()) generated_insns.append(init_insn) - """ - # XXX: Is this needed? - init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) - init_neutral_insn = make_assignment( - id=init_neutral_id, - assignees=tuple(var(nvn) for nvn in neutral_var_names), - expression=neutral, - within_inames=base_iname_deps | frozenset([base_exec_iname]), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset()) - generated_insns.append(init_neutral_insn) - """ - updated_inner_exprs = tuple( replace_var_within_expr(sub_expr, scan_iname, track_iname) for sub_expr in expr.exprs) from loopy.symbolic import Reduction - # TODO: change sweep iname to base exec iname... 
+ from loopy.symbolic import pw_aff_to_expr + sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) transfer_insn = make_assignment( id=transfer_id, assignees=tuple( - acc_var[outer_local_iname_vars + (var(sweep_iname),)] + acc_var[outer_local_iname_vars + + (var(sweep_iname) - sweep_min_value_expr,)] for acc_var in acc_vars), expression=Reduction( operation=expr.operation, @@ -1450,7 +1446,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, #new_insn_add_within_inames.add(output_iname) new_insn_add_within_inames.add(sweep_iname) - output_idx = var(sweep_iname) + output_idx = var(sweep_iname) - sweep_min_value_expr if nresults == 1: assert len(acc_vars) == 1 @@ -1574,13 +1570,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return map_scan_local( expr, rec, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, - scan_param.sweep_lower_bound, scan_param.offset, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, scan_param.stride) elif sequential: return map_scan_seq( expr, rec, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, - scan_param.sweep_lower_bound, scan_param.offset, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, scan_param.stride) # fallthrough to reduction implementation diff --git a/test/test_reduction.py b/test/test_reduction.py index 290f3d483..83aac9d37 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -186,7 +186,7 @@ def test_local_parallel_reduction(ctx_factory, size): ctx = ctx_factory() knl = lp.make_kernel( - "{[i, j]: 0 <= i < n and 0 <= j < 5}", + "{[i, j]: 1 <= i < n and 0 <= j < 5}", """ z[j] = sum(i, i+j) """) diff --git a/test/test_scan.py b/test/test_scan.py index e8ae00f90..d77a82d59 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -81,24 +81,36 @@ def test_sequential_scan(ctx_factory, n, stride): assert 
(a.get() == np.cumsum(np.arange(stride*n)**2)[::stride]).all() -def test_scan_with_different_lower_bound_from_sweep(ctx_factory): +@pytest.mark.parametrize("sweep_lbound, scan_lbound", [ + (4, 0), + (3, 1), + (2, 2), + (1, 3), + (0, 4), + (5, -1), + ]) +def test_scan_with_different_lower_bound_from_sweep( + ctx_factory, sweep_lbound, scan_lbound): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "[n, lbound] -> {[i,j]: 0<=i " + "{[i,j]: sweep_lbound<=i {[i,j]: 1<=i Date: Thu, 2 Mar 2017 15:15:43 -0600 Subject: [PATCH 11/27] Undo test_reduction change. --- test/test_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_reduction.py b/test/test_reduction.py index 83aac9d37..290f3d483 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -186,7 +186,7 @@ def test_local_parallel_reduction(ctx_factory, size): ctx = ctx_factory() knl = lp.make_kernel( - "{[i, j]: 1 <= i < n and 0 <= j < 5}", + "{[i, j]: 0 <= i < n and 0 <= j < 5}", """ z[j] = sum(i, i+j) """) -- GitLab From 3db910efac3737f7e00e476e1c3c6b95604802b5 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 4 Mar 2017 02:39:21 -0600 Subject: [PATCH 12/27] [ci skip] Two level reduction + two level scan, semi-working version. 
--- loopy/__init__.py | 2 + loopy/preprocess.py | 76 +++-- loopy/transform/data.py | 4 +- loopy/transform/reduction.py | 611 +++++++++++++++++++++++++++++++++++ test/test_reduction.py | 28 +- test/test_scan.py | 101 ++++-- 6 files changed, 778 insertions(+), 44 deletions(-) create mode 100644 loopy/transform/reduction.py diff --git a/loopy/__init__.py b/loopy/__init__.py index 6cbb3362e..a10d94463 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -108,6 +108,8 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries +from loopy.transform.reduction import make_two_level_reduction + # }}} from loopy.type_inference import infer_unknown_types diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 5c9a27805..f139810f1 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -328,26 +328,50 @@ def _add_params_to_domain(domain, param_names): return domain +def _move_set_to_param_dims_except(domain, except_dims): + dim_type = isl.dim_type + + iname_idx = 0 + for iname in domain.get_var_names(dim_type.set): + if iname not in except_dims: + domain = domain.move_dims( + dim_type.param, 0, + dim_type.set, iname_idx, 1) + iname_idx -= 1 + iname_idx += 1 + + return domain + + def _check_reduction_is_triangular(kernel, expr, scan_param): """Check whether the reduction within `expr` with scan parameters described by the structure `scan_param` is triangular. 
This attempts to verify that the domain for the scan and sweep inames is as follows: - [scan_iname, sweep_iname]: - (sweep_min_value - <= sweep_iname - <= sweep_max_value) - and - (scan_min_value - <= scan_iname - <= stride * (sweep_iname - sweep_min_value) + scan_min_value) + [other inames] -> { + [scan_iname, sweep_iname]: + (sweep_min_value + <= sweep_iname + <= sweep_max_value) + and + (scan_min_value + <= scan_iname + <= stride * (sweep_iname - sweep_min_value) + scan_min_value) + } """ dim_type = isl.dim_type - domain = kernel.get_inames_domain( + orig_domain = kernel.get_inames_domain( + (scan_param.sweep_iname, scan_param.scan_iname)) + + domain = _move_set_to_param_dims_except(orig_domain, (scan_param.sweep_iname, scan_param.scan_iname)) + params_for_gisting = domain.params() + + domain = domain.gist_params(params_for_gisting) + tri_domain = isl.BasicSet.universe(domain.params().space) sweep_iname = scan_param.sweep_iname @@ -369,7 +393,7 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): + scan_min_value) # Gist against domain params - tri_domain = tri_domain.gist(domain.params()) + tri_domain = tri_domain.gist_params(params_for_gisting) # Move sweep and scan inames into the set tri_domain = tri_domain.move_dims( @@ -418,7 +442,7 @@ def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): try: sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname)) - except Exception as e: + except ValueError as v: raise ValueError("Couldn't determine bounds for scan: %s" % e) try: @@ -482,12 +506,18 @@ def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname): - sweep_bounds = kernel.get_iname_bounds(sweep_iname) - scan_bounds = kernel.get_iname_bounds(scan_iname) + # FIXME: use home domain of scan_iname... 
+ domain = kernel.get_inames_domain((sweep_iname, scan_iname)) + domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname)) - return (sweep_bounds.lower_bound_pw_aff, - sweep_bounds.upper_bound_pw_aff, - scan_bounds.lower_bound_pw_aff) + domain = domain.gist_params(domain.params()).project_out_except( + (sweep_iname,), (isl.dim_type.param,)) + + sweep_lower_bound = domain.dim_min(domain.get_var_dict()[sweep_iname][1]) + sweep_upper_bound = domain.dim_max(domain.get_var_dict()[sweep_iname][1]) + scan_lower_bound = domain.dim_min(domain.get_var_dict()[scan_iname][1]) + + return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound) def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): @@ -499,7 +529,7 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): dim_type = isl.dim_type domain = kernel.get_inames_domain((sweep_iname, scan_iname)) - domain_with_sweep_param = _get_domain_with_iname_as_param(domain, sweep_iname) + domain_with_sweep_param = _move_set_to_param_dims_except(domain, (scan_iname,)) scan_iname_idx = domain_with_sweep_param.find_dim_by_name( dim_type.set, scan_iname) @@ -659,11 +689,11 @@ def _infer_arg_dtypes_and_reduction_dtypes(kernel, expr, unknown_types_ok): def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): """ Multi assignment function calls are currently lowered into OpenCL so that - the function call: + the function call:: a, b = segmented_sum(x, y, z, w) - becomes + becomes:: a = segmented_sum_mangled(x, y, z, w, &b). 
@@ -835,6 +865,12 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): return kernel.copy(temporary_variables=new_temporary_variables, instructions=new_instructions) + +def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): + dependent_inames = frozenset(subdomain.get_var_names(isl.dim_type.param)) + idx, = kernel.get_leaf_domain_indices(dependent_inames) + domains.insert(idx + 1, subdomain) + # }}} @@ -1140,7 +1176,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, new_domain = _create_domain_for_sweep_tracking(domain, tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) - domains.append(new_domain) + _insert_subdomain_into_domain_tree(kernel, domains, new_domain) return tracking_iname, new_domain diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 4014b8575..c6ff596b0 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -694,12 +694,14 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No var_name_gen = knl.get_var_name_generator() + # XXX def map_reduction(expr, rec, nresults=1): if frozenset(expr.inames) != inames_set: + assert len(expr.exprs) == 1 return type(expr)( operation=expr.operation, inames=expr.inames, - expr=rec(expr.expr), + exprs=(rec(expr.exprs[0]),), allow_simultaneous=expr.allow_simultaneous) if subst_rule_name is None: diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py new file mode 100644 index 000000000..1693fb515 --- /dev/null +++ b/loopy/transform/reduction.py @@ -0,0 +1,611 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2017 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from loopy.diagnostic import LoopyError +import loopy as lp + +from loopy.kernel.data import auto, temp_var_scope +from pytools import memoize_method, Record +import islpy as isl + + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: make_two_level_reduction +.. autofunction:: make_two_level_scan +.. autofunction:: precompute_scan +""" + + +def make_two_level_reduction( + kernel, insn_id, inner_length, + nonlocal_storage_scope=None, + nonlocal_tag=None, + outer_tag=None, + inner_tag=None): + """ + Two level reduction, mediated through a "nonlocal" array. + + This turns a reduction of the form:: + + [...] result = reduce(i, f(i)) + + into:: + + i -> inner + inner_length * outer + + [..., nl] nonlocal[nl] = reduce(inner, f(nl, inner)) + [...] 
result = reduce(outer, nonlocal[outer]) + """ + + # {{{ sanity checks + + reduction = kernel.id_to_insn[insn_id].expression + reduction_iname, = reduction.inames + + # }}} + + # {{{ get stable names for everything + + var_name_gen = kernel.get_var_name_generator() + insn_id_gen = kernel.get_instruction_id_generator() + + format_kwargs = {"insn": insn_id, "iname": reduction_iname} + + nonlocal_storage_name = var_name_gen( + "{insn}_nonlocal".format(**format_kwargs)) + + inner_iname = var_name_gen( + "{iname}_inner".format(**format_kwargs)) + outer_iname = var_name_gen( + "{iname}_outer".format(**format_kwargs)) + nonlocal_iname = var_name_gen( + "{iname}_nonlocal".format(**format_kwargs)) + + inner_subst = var_name_gen( + "{insn}_inner_subst".format(**format_kwargs)) + + # }}} + + # First we split this iname. This results in (roughly) + # + # [...] result = reduce([outer, inner], f(outer, inner)) + # + # FIXME: within + + kernel = lp.split_iname(kernel, reduction_iname, inner_length, + outer_iname=outer_iname, inner_iname=inner_iname) + + # Next, we split the reduction inward and then extract a substitution + # rule for the reduction. This results in + # + # subst(outer) := reduce(inner, f(outer, inner)) + # [...] result = reduce([outer], subst(outer)) + # + # FIXME: within, insn_match... + + kernel = lp.split_reduction_inward(kernel, inner_iname) + from loopy.transform.data import reduction_arg_to_subst_rule + kernel = reduction_arg_to_subst_rule(kernel, outer_iname, + subst_rule_name=inner_subst) + + # Next, we precompute the inner iname into its own storage. + + # [...,nl] nonlocal[nl] = reduce(inner, f(nl, inner)) + # [...] 
result = reduce([outer], nonlocal[outer]) + + kernel = lp.precompute(kernel, inner_subst, + sweep_inames=[outer_iname], + precompute_inames=[nonlocal_iname], + temporary_name=nonlocal_storage_name, + temporary_scope=nonlocal_storage_scope) + + return kernel + + +def _update_instructions(kernel, id_to_new_insn, copy=True): + if not isinstance(id_to_new_insn, dict): + id_to_new_insn = dict((insn.id, insn) for insn in id_to_new_insn) + + new_instructions = ( + list(insn for insn in kernel.instructions + if insn.id not in id_to_new_insn) + + list(id_to_new_insn.values())) + + if copy: + kernel = kernel.copy() + + kernel.instructions = new_instructions + return kernel + + +def _make_slab_set(iname, size): + # FIXME: stolen from preprocess, should be its own thing... + v = isl.make_zero_and_vars([iname]) + bs, = ( + v[0].le_set(v[iname]) + & + v[iname].lt_set(v[0] + size)).get_basic_sets() + print("ADDING SLAB", bs) + return bs + + +def _add_scan_subdomain( + kernel, scan_iname, sweep_iname): + """ + Add the following domain to the kernel:: + + [sweep_iname] -> {[scan_iname] : 0 <= scan_iname <= sweep_iname } + """ + sp = ( + isl.Space.set_alloc(isl.DEFAULT_CONTEXT, 1, 1) + .set_dim_name(isl.dim_type.param, 0, sweep_iname) + .set_dim_name(isl.dim_type.set, 0, scan_iname)) + + affs = isl.affs_from_space(sp) + + subd, = ( + affs[scan_iname].le_set(affs[sweep_iname]) + & + affs[scan_iname].ge_set(affs[0])).get_basic_sets() + + sweep_idx, = kernel.get_leaf_domain_indices((sweep_iname,)) + + domains = list(kernel.domains) + domains.insert(sweep_idx + 1, subd) + + return kernel.copy(domains=domains) + + +def _expand_subst_within_expression(kernel, expr): + from loopy.symbolic import RuleAwareSubstitutionRuleExpander, SubstitutionRuleMappingContext + from loopy.match import parse_stack_match + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + submap = RuleAwareSubstitutionRuleExpander( + rule_mapping_context, + 
kernel.substitutions, + within=lambda *args: True + ) + return submap(expr, kernel, insn=None) + + +def make_two_level_scan( + kernel, insn_id, + scan_iname, + sweep_iname, + inner_length, + local_storage_name=None, + local_storage_scope=None, + local_storage_axes=None, + nonlocal_storage_name=None, + nonlocal_storage_scope=None, + nonlocal_tag=None, + outer_local_tag=None, + inner_local_tag=None, + inner_tag=None, + outer_tag=None, + inner_local_iname=None, + outer_local_iname=None): + """ + Two level scan, mediated through a "local" and "nonlocal" array. + + This turns a scan of the form:: + + [...,i] result = reduce(j, f(j)) + + into:: + + [...,l',l''] + [...,l'] nonlocal[0] = 0 + [...,l'] nonlocal[l'+1] = local[l',-1] + [...,nl] + [...,i',i''] result = nonlocal[i'] + local[i',i''] + """ + + # {{{ sanity checks + + insn = kernel.id_to_insn[insn_id] + scan = insn.expression + assert scan.inames[0] == scan_iname + assert len(scan.inames) == 1 + + # }}} + + # {{{ get stable names for everything + + var_name_gen = kernel.get_var_name_generator() + insn_id_gen = kernel.get_instruction_id_generator() + + format_kwargs = {"insn": insn_id, "iname": scan_iname, "sweep": sweep_iname} + + nonlocal_storage_name = var_name_gen( + "{insn}_nonlocal".format(**format_kwargs)) + + inner_iname = var_name_gen( + "{sweep}_inner".format(**format_kwargs)) + outer_iname = var_name_gen( + "{sweep}_outer".format(**format_kwargs)) + nonlocal_iname = var_name_gen( + "{sweep}_nonlocal".format(**format_kwargs)) + + if inner_local_iname is None: + inner_local_iname = var_name_gen( + "{sweep}_inner_local".format(**format_kwargs)) + + inner_scan_iname = var_name_gen( + "{iname}_inner".format(**format_kwargs)) + + outer_scan_iname = var_name_gen( + "{iname}_outer".format(**format_kwargs)) + + if outer_local_iname is None: + outer_local_iname = var_name_gen( + "{sweep}_outer_local".format(**format_kwargs)) + + subst_name = var_name_gen( + "{insn}_inner_subst".format(**format_kwargs)) + + 
local_subst_name = var_name_gen( + "{insn}_local_subst".format(**format_kwargs)) + + if local_storage_name is None: + local_storage_name = var_name_gen( + "{insn}_local".format(**format_kwargs)) + + if nonlocal_storage_name is None: + nonlocal_storage_name = var_name_gen( + "{insn}_nonlocal".format(**format_kwargs)) + + local_scan_insn_id = insn_id_gen( + "{iname}_local_scan".format(**format_kwargs)) + + nonlocal_scan_insn_id = insn_id_gen( + "{iname}_nonlocal_scan".format(**format_kwargs)) + + format_kwargs.update({"nonlocal": nonlocal_storage_name}) + + nonlocal_init_head_insn_id = insn_id_gen( + "{nonlocal}_init_head".format(**format_kwargs)) + + nonlocal_init_tail_insn_id = insn_id_gen( + "{nonlocal}_init_tail".format(**format_kwargs)) + + # }}} + + # Turn the scan into a substitution rule, replace the original scan with a + # nop and delete the scan iname. + # + # (The presence of the scan iname seems to be making precompute very confused.) + + from loopy.transform.data import reduction_arg_to_subst_rule + kernel = reduction_arg_to_subst_rule( + kernel, scan_iname, subst_rule_name=subst_name) + + from loopy.kernel.instruction import NoOpInstruction + # FIXME: this is stupid + kernel = _update_instructions(kernel, {insn_id: insn.copy(expression=0)}) + """ + {insn_id: NoOpInstruction( + id=insn_id, + depends_on=insn.depends_on, + groups=insn.groups, + conflicts_with_groups=insn.groups, + no_sync_with=insn.no_sync_with, + within_inames_is_final=insn.within_inames_is_final, + within_inames=insn.within_inames, + priority=insn.priority, + boostable=insn.boostable, + boostable_into=insn.boostable_into, + predicates=insn.predicates, + tags=insn.tags)}, + copy=False) + """ + + kernel = lp.remove_unused_inames(kernel, inames=(scan_iname,)) + + # Make sure we got rid of everything + assert scan_iname not in kernel.all_inames() + + # {{{ implement local scan + + from pymbolic import var + local_scan_expr = _expand_subst_within_expression(kernel, + 
var(subst_name)(var(outer_local_iname) * inner_length + + var(inner_scan_iname))) + + kernel = lp.split_iname(kernel, sweep_iname, inner_length, + inner_iname=inner_iname, outer_iname=outer_iname) + + print("SPLITTING INAME, GOT DOMAINS", kernel.domains) + + from loopy.kernel.data import SubstitutionRule + from loopy.symbolic import Reduction + + local_subst = SubstitutionRule( + name=local_subst_name, + arguments=(outer_iname, inner_iname), + expression=Reduction( + scan.operation, + (inner_scan_iname,), + local_scan_expr) + ) + + substitutions = kernel.substitutions.copy() + substitutions[local_subst_name] = local_subst + + kernel = kernel.copy(substitutions=substitutions) + + print(kernel) + + from pymbolic import var + kernel = lp.precompute( + kernel, + [var(local_subst_name)(var(outer_iname), var(inner_iname))], + storage_axes=(outer_iname, inner_iname), + sweep_inames=(outer_iname, inner_iname), + precompute_inames=(outer_local_iname, inner_local_iname), + temporary_name=local_storage_name, + compute_insn_id=local_scan_insn_id) + + kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) + + # }}} + + # {{{ implement local to nonlocal information transfer + + from loopy.symbolic import pw_aff_to_expr + nonlocal_storage_len_pw_aff = ( + # The 2 here is because the first element is 0. + 2 + kernel.get_iname_bounds(outer_iname).upper_bound_pw_aff) + + nonlocal_storage_len = pw_aff_to_expr(nonlocal_storage_len_pw_aff) + + if nonlocal_storage_name not in kernel.temporary_variables: + from loopy.kernel.data import TemporaryVariable + new_temporary_variables = kernel.temporary_variables.copy() + + new_temporary_variables[nonlocal_storage_name] = ( + TemporaryVariable( + nonlocal_storage_name, + shape=(nonlocal_storage_len,), + scope=lp.auto, + base_indices=lp.auto, + dtype=lp.auto)) + + kernel = kernel.copy(temporary_variables=new_temporary_variables) + + insn = kernel.id_to_insn[insn_id] + + # XXX: should not include sweep iname? 
+ within_inames = insn.within_inames + + from loopy.kernel.instruction import make_assignment + nonlocal_init_head = make_assignment( + id=nonlocal_init_head_insn_id, + assignees=(var(nonlocal_storage_name)[0],), + expression=0, + within_inames=frozenset([outer_local_iname]), + depends_on=frozenset([local_scan_insn_id])) + + final_element_indices = [] + + nonlocal_init_tail = make_assignment( + id=nonlocal_init_tail_insn_id, + assignees=(var(nonlocal_storage_name)[var(outer_local_iname) + 1],), + expression=var(local_storage_name)[var(outer_local_iname),inner_length - 1], + within_inames=frozenset([outer_local_iname]), + depends_on=frozenset([local_scan_insn_id])) + + kernel = _update_instructions(kernel, (nonlocal_init_head, nonlocal_init_tail), copy=False) + + # }}} + + # {{{ implement nonlocal scan + + kernel.domains.append(_make_slab_set(nonlocal_iname, nonlocal_storage_len)) + + kernel = _add_scan_subdomain(kernel, outer_scan_iname, nonlocal_iname) + + nonlocal_scan = make_assignment( + id=nonlocal_scan_insn_id, + assignees=(var(nonlocal_storage_name)[var(nonlocal_iname)],), + expression=Reduction( + scan.operation, + (outer_scan_iname,), + var(nonlocal_storage_name)[var(outer_scan_iname)]), + within_inames=frozenset([nonlocal_iname]), + depends_on=frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id])) + + kernel = _update_instructions(kernel, (nonlocal_scan,), copy=False) + + # }}} + + # {{{ replace scan with local + nonlocal + + updated_insn = insn.copy( + depends_on=insn.depends_on | frozenset([nonlocal_scan_insn_id]), + expression=var(nonlocal_storage_name)[var(outer_iname)] + var(local_storage_name)[var(outer_iname), var(inner_iname)]) + + kernel = _update_instructions(kernel, (updated_insn,), copy=False) + + # }}} + + return kernel + + +def precompute_scan( + kernel, insn_id, + sweep_iname, + scan_iname, + outer_inames=(), + temporary_scope=None, + temporary_name=None, + replace_insn_with_nop=False): + """ + Turn an expression-based scan 
into an array-based one. + + This takes a reduction of the form:: + + [...,sweep_iname] result = reduce(scan_iname, f(scan_iname)) + + and does essentially the following transformation:: + + [...,sweep_iname'] temp[sweep_iname'] = f(sweep_iname') + [...,sweep_iname] temp[sweep_iname] = reduce(scan_iname, temp[scan_iname]) + [...,sweep_iname] result = temp[sweep_iname] + + Note: this makes an explicit assumption that the sweep iname shares the + same bounds as the scan iname and the bounds start at 0. + """ + + # {{{ sanity checks + + insn = kernel.id_to_insn[insn_id] + scan = insn.expression + assert scan.inames[0] == scan_iname + assert len(scan.inames) == 1 + + # }}} + + # {{{ get a stable name for things + + var_name_gen = kernel.get_var_name_generator() + insn_id_gen = kernel.get_instruction_id_generator() + + format_kwargs = {"insn": insn_id, "iname": scan_iname} + + orig_subst_name = var_name_gen( + "{iname}_orig_subst".format(**format_kwargs)) + + scan_subst_name = var_name_gen( + "{iname}_subst".format(**format_kwargs)) + + precompute_insn = insn_id_gen( + "{insn}_precompute".format(**format_kwargs)) + + precompute_reduction_insn = insn_id_gen( + "{insn}_precompute_reduce".format(**format_kwargs)) + + if temporary_name is None: + temporary_name = var_name_gen( + "{insn}_precompute".format(**format_kwargs)) + + # }}} + + from loopy.transform.data import reduction_arg_to_subst_rule + kernel = reduction_arg_to_subst_rule( + kernel, scan_iname, subst_rule_name=orig_subst_name) + + # {{{ create our own variant of the substitution rule + + # FIXME: There has to be a better way of this. 
+ + orig_subst = kernel.substitutions[orig_subst_name] + + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import ( + SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, var_name_gen) + + from pymbolic import var + mapper = RuleAwareSubstitutionMapper( + rule_mapping_context, + make_subst_func({scan_iname: var(sweep_iname)}), + within=lambda *args: True) + + scan_subst = orig_subst.copy( + name=scan_subst_name, + arguments=outer_inames + (sweep_iname,), + expression=mapper(orig_subst.expression, kernel, None)) + + substitutions = kernel.substitutions.copy() + + substitutions[scan_subst_name] = scan_subst + + kernel = kernel.copy(substitutions=substitutions) + + # }}} + + print(kernel) + + # FIXME: multi assignments + from pymbolic import var + + # FIXME: Make a new precompute iname.... + + kernel = lp.precompute(kernel, + [var(scan_subst_name)( + *(tuple(var(o) for o in outer_inames) + + (var(sweep_iname),)))], + sweep_inames=outer_inames + (sweep_iname,), + precompute_inames=(sweep_iname,), + temporary_name=temporary_name, + temporary_scope=temporary_scope, + # FIXME: why on earth is this needed + compute_insn_id=precompute_insn) + + from loopy.kernel.instruction import make_assignment + + from loopy.symbolic import Reduction + precompute_reduction = insn.copy( + id=precompute_reduction_insn, + assignee=var(temporary_name)[var(sweep_iname)], + expression=Reduction( + operation=scan.operation, + inames=(scan_iname,), + exprs=(var(temporary_name)[var(scan_iname)],), + allow_simultaneous=False, + ), + depends_on=insn.depends_on | frozenset([precompute_insn])) + + kernel = kernel.copy(instructions=kernel.instructions + + [precompute_reduction]) + + new_insn = insn.copy( + expression=var(temporary_name)[var(sweep_iname)], + depends_on= + frozenset([precompute_reduction_insn]) | insn.depends_on) + + instructions = list(kernel.instructions) + + for 
i, insn in enumerate(instructions): + if insn.id == insn_id: + instructions[i] = new_insn + + kernel = kernel.copy(instructions=instructions) + + return kernel + + +# vim: foldmethod=marker diff --git a/test/test_reduction.py b/test/test_reduction.py index 290f3d483..96d85beb6 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -240,12 +240,38 @@ def test_global_parallel_reduction(ctx_factory, size): knl = lp.add_dependency( knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") - lp.auto_test_vs_ref( ref_knl, ctx, knl, parameters={"n": size}, print_ref_code=True) +def test_global_parallel_reduction_2(): + knl = lp.make_kernel( + "{[i]: 0 <= i < n }", + """ + # Using z[0] instead of z works around a bug in ancient PyOpenCL. + z[0] = sum(i, i/13) {id=reduce} + """) + + gsize = 128 + knl = lp.make_two_level_reduction(knl, + "reduce", + inner_length=gsize * 20, + nonlocal_tag="g.0", + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + outer_tag=None, + inner_tag=None) + + print(knl) + + knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") + knl = lp.split_reduction_inward(knl, "i_inner_inner") + + knl = lp.realize_reduction(knl) + + print(knl) + + @pytest.mark.parametrize("size", [1000]) def test_global_mc_parallel_reduction(ctx_factory, size): ctx = ctx_factory() diff --git a/test/test_scan.py b/test/test_scan.py index d77a82d59..aabfe3031 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -2,7 +2,7 @@ from __future__ import division, absolute_import, print_function __copyright__ = """ Copyright (C) 2012 Andreas Kloeckner -Copyright (C) 2016 Matt Wala +Copyright (C) 2016, 2017 Matt Wala """ __license__ = """ @@ -54,10 +54,8 @@ __all__ = [ # More things to test. # - test that dummy inames are removed -# - nested sequential/parallel scan # - scan(a) + scan(b) # - global parallel scan -# - base_exec_iname different bounds from sweep iname # TO DO: # segmented(...) 
syntax @@ -71,11 +69,14 @@ def test_sequential_scan(ctx_factory, n, stride): knl = lp.make_kernel( "[n] -> {[i,j]: 0<=i {[i]: 0<=i {[i]: 0 <= i < n}", - "{[j]: 0 <= j <= i}", - "{[k]: 0 <= j <= k}" + "[i] -> {[j]: 0 <= j <= i}", + "[i] -> {[k]: 0 <= k <= i}" ], - "a[i] = sum(j, sum(k, k))") -""" + """ + <>tmp[i] = sum(k, 1) + out[i] = sum(j, tmp[j]) + """) + knl = lp.fix_parameters(knl, n=10) + knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag)) + + knl = lp.realize_reduction(knl, force_scan=True) + + print(knl) + + evt, (out,) = knl(queue) -def test_scan_unsupported_stride(): + print(out) + + +def test_scan_not_triangular(): knl = lp.make_kernel( "{[i,j]: 0<=i<100 and 1<=j<=2*i}", """ - a[i] = sum(j, j**2) {id=scan} + a[i] = sum(j, j**2) """ ) @@ -177,19 +211,11 @@ def test_local_parallel_scan(ctx_factory, n): knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.realize_reduction(knl, force_scan=True) - print(knl) - knl = lp.realize_reduction(knl) knl = lp.add_dtypes(knl, dict(a=int)) - c = lp.generate_code_v2(knl) - - print(c.device_code()) evt, (a,) = knl(queue, a=np.arange(16)) - - print(a) - assert (a == np.cumsum(np.arange(16)**2)).all() @@ -291,8 +317,8 @@ def test_argmax(ctx_factory, i_tag): (16, (0, 5)), )) @pytest.mark.parametrize("iname_tag", ("for", "l.0")) -def test_segmented_scan(ctx_getter, n, segment_boundaries_indices, iname_tag): - ctx = ctx_getter() +def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag): + ctx = ctx_factory() queue = cl.CommandQueue(ctx) arr = np.ones(n, dtype=np.float32) @@ -337,6 +363,37 @@ def test_segmented_scan(ctx_getter, n, segment_boundaries_indices, iname_tag): assert [(e == a).all() for e, a in zip(expected, actual)] +def test_two_level_scan(ctx_getter): + knl = lp.make_kernel( + [ + "{[i,j]: 0 <= i < 256 and 0 <= j <= i}", + ], + """ + out[i] = sum(j, j) {id=scan} + """, + "...") + + #knl = lp.tag_inames(knl, dict(i="l.0")) + + from loopy.transform.reduction import make_two_level_scan + + knl = 
make_two_level_scan( + knl, "scan", inner_length=128, + scan_iname="j", + sweep_iname="i") + + knl = lp.realize_reduction(knl, force_scan=True) + + print(knl) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q) + + print(out.get()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From d39c0fb3e19b96aa69d9edf1fc1019e90bcba596 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 8 Mar 2017 00:24:21 -0600 Subject: [PATCH 13/27] [ci skip] Two level scan now can be done in parallel. --- loopy/kernel/__init__.py | 89 +++++++++ loopy/kernel/tools.py | 1 + loopy/preprocess.py | 17 +- loopy/schedule/tools.py | 11 -- loopy/transform/instruction.py | 69 ++++++- loopy/transform/reduction.py | 321 ++++++++++++++------------------- loopy/transform/save.py | 171 +++++++++++------- test/test_loopy.py | 44 +++++ test/test_scan.py | 32 +++- test/test_transform.py | 4 + 10 files changed, 478 insertions(+), 281 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 793d31791..dfe9c857c 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -823,6 +823,95 @@ class LoopKernel(ImmutableRecordWithoutPickling): return result + @property + @memoize_method + def global_barrier_order(self): + """Return a :class:`tuple` listing the ids of global barrier instructions + as they appear in order in the kernel. + + See also :class:`loopy.instruction.BarrierInstruction`. 
+ """ + barriers = [] + visiting = set() + visited = set() + + unvisited = set(insn.id for insn in self.instructions) + + while unvisited: + stack = [unvisited.pop()] + + while stack: + top = stack[-1] + + if top in visiting: + visiting.remove(top) + + from loopy.kernel.instruction import BarrierInstruction + insn = self.id_to_insn[top] + if isinstance(insn, BarrierInstruction): + if insn.kind == "global": + barriers.append(top) + + if top in visited: + stack.pop() + continue + + visited.add(top) + visiting.add(top) + + for child in self.id_to_insn[top].depends_on: + # Check for no cycles. + assert child not in visiting + stack.append(child) + + # Ensure this is the only possible order. + for prev_barrier, barrier in zip(barriers, barriers[1:]): + if prev_barrier not in self.recursive_insn_dep_map()[barrier]: + raise LoopyError( + "Unordered global barriers detected: '%s', '%s'" + % (barrier, prev_barrier)) + + return tuple(barriers) + + @memoize_method + def find_most_recent_global_barrier(self, insn_id): + """Return the id of the latest occurring global barrier which the + given instruction (indirectly or directly) depends on, or *None* if this + instruction does not depend on a global barrier. + + The return value is guaranteed to be unique because global barriers are + totally ordered within the kernel. 
+ """ + + if len(self.global_barrier_order) == 0: + return None + + insn = self.id_to_insn[insn_id] + + if len(insn.depends_on) == 0: + return None + + def is_barrier(my_insn_id): + insn = self.id_to_insn[my_insn_id] + from loopy.kernel.instruction import BarrierInstruction + return isinstance(insn, BarrierInstruction) and insn.kind == "global" + + global_barrier_to_ordinal = dict( + (b, i) for i, b in enumerate(self.global_barrier_order)) + + def get_barrier_ordinal(barrier_id): + return global_barrier_to_ordinal[barrier_id] if barrier_id is not None else -1 + + direct_barrier_dependencies = set( + dep for dep in insn.depends_on if is_barrier(dep)) + + if len(direct_barrier_dependencies) > 0: + return max(direct_barrier_dependencies, key=get_barrier_ordinal) + else: + return max((self.find_most_recent_global_barrier(dep) + for dep in insn.depends_on), + key=get_barrier_ordinal) + # }}} # {{{ argument wrangling diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 539bfbed0..d94136e43 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1354,4 +1354,5 @@ def draw_dependencies_as_unicode_arrows( # }}} + # vim: foldmethod=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f139810f1..ef49faa33 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1242,13 +1242,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) + init_insn_depends_on = frozenset() + + global_barrier = temp_kernel.find_most_recent_global_barrier(insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + init_insn = make_assignment( id=init_id, assignees=acc_vars, within_inames=outer_insn_inames - frozenset( (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(), + depends_on=init_insn_depends_on, expression=expr.operation.neutral_element(*arg_dtypes)) 
generated_insns.append(init_insn) @@ -1257,11 +1264,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, replace_var_within_expr(sub_expr, scan_iname, track_iname) for sub_expr in expr.exprs) - """ - updated_inames = tuple( - (set(expr.inames) - set([scan_iname])) | set([track_iname])) - """ - update_id = insn_id_gen( based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) @@ -1600,9 +1602,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _error_if_force_scan_on(LoopyError, "Sweep iname '%s' has an unsupported parallel tag '%s' " "- the only parallelism allowed is 'local'." % - (sweep_iname, sweep_class.nonlocal_parallel[0])) + (sweep_iname, temp_kernel.iname_to_tag[sweep_iname])) elif parallel: - print(temp_kernel) return map_scan_local( expr, rec, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 5de677e72..692e39028 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -144,17 +144,6 @@ class InstructionQuery(object): if isinstance(self.kernel.iname_to_tag.get(iname), HardwareParallelTag)) - @memoize_method - def common_hw_inames(self, insn_ids): - """ - Return the common set of hardware parallel tagged inames among - the list of instructions. - """ - # Get the list of hardware inames in which the temporary is defined. 
- if len(insn_ids) == 0: - return set() - return set.intersection(*(self.hw_inames(id) for id in insn_ids)) - # }}} diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 7c9c96886..9143052a4 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -34,7 +34,6 @@ def find_instructions(kernel, insn_match): match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] - # }}} @@ -207,4 +206,72 @@ def tag_instructions(kernel, new_tag, within=None): # }}} +# {{{ add nosync + +def add_nosync_to_instructions( + kernel, scope, source, sink, bidirectional=False): + """Add a *nosync* directive between *source* and *sink*. + + *source* and *sink* may be any instruction id match understood by + :func:`loopy.match.parse_match`. + + *scope* should be a valid nosync scope. + + If *bidirectional* is True, this adds a nosync to both the source + and sink instructions, otherwise the directive is only added to the + sink instructions. + + *nosync* attributes are only added if a dependency is present or if + the instruction pair is spread across a conflicting group. 
+ """ + + if isinstance(source, str) and source in kernel.id_to_insn: + sources = frozenset([source]) + else: + sources = frozenset( + source.id for source in find_instructions(kernel, source)) + + if isinstance(sink, str) and sink in kernel.id_to_insn: + sinks = frozenset([sink]) + else: + sinks = frozenset( + sink.id for sink in find_instructions(kernel, sink)) + + def insns_in_conflicting_groups(insn1_id, insn2_id): + insn1 = kernel.id_to_insn[insn1_id] + insn2 = kernel.id_to_insn[insn2_id] + return ( + bool(insn1.groups & insn2.conflicts_with_groups) + or + bool(insn2.groups & insn1.conflicts_with_groups)) + + from collections import defaultdict + nosync_to_add = defaultdict(lambda: set()) + + for sink in sinks: + for source in sources: + + needs_nosync = ( + source in kernel.recursive_insn_dep_map()[sink] + or insns_in_conflicting_groups(source, sink)) + + if not needs_nosync: + continue + + nosync_to_add[sink].add((source, scope)) + if bidirectional: + nosync_to_add[source].add((sink, scope)) + + new_instructions = list(kernel.instructions) + + for i, insn in enumerate(new_instructions): + if insn.id in nosync_to_add: + new_instructions[i] = insn.copy( + no_sync_with=insn.no_sync_with | frozenset(nosync_to_add[insn.id])) + + return kernel.copy(instructions=new_instructions) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 1693fb515..2fd086912 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -40,7 +40,6 @@ __doc__ = """ .. autofunction:: make_two_level_reduction .. autofunction:: make_two_level_scan -.. 
autofunction:: precompute_scan """ @@ -153,7 +152,6 @@ def _make_slab_set(iname, size): v[0].le_set(v[iname]) & v[iname].lt_set(v[0] + size)).get_basic_sets() - print("ADDING SLAB", bs) return bs @@ -197,6 +195,22 @@ def _expand_subst_within_expression(kernel, expr): return submap(expr, kernel, insn=None) +def _add_global_barrier(kernel, source, sink, barrier_id): + from loopy.kernel.instruction import BarrierInstruction + barrier_insn = BarrierInstruction( + id=barrier_id, + depends_on=frozenset([source]), + kind="global") + + updated_sink = kernel.id_to_insn[sink] + updated_sink = updated_sink.copy( + depends_on=updated_sink.depends_on | frozenset([barrier_id])) + + kernel = _update_instructions(kernel, (barrier_insn, updated_sink), copy=True) + + return kernel + + def make_two_level_scan( kernel, insn_id, scan_iname, @@ -212,8 +226,8 @@ def make_two_level_scan( inner_local_tag=None, inner_tag=None, outer_tag=None, - inner_local_iname=None, - outer_local_iname=None): + inner_iname=None, + outer_iname=None): """ Two level scan, mediated through a "local" and "nonlocal" array. @@ -232,6 +246,8 @@ def make_two_level_scan( # {{{ sanity checks + # FIXME: More sanity checks... 
+ insn = kernel.id_to_insn[insn_id] scan = insn.expression assert scan.inames[0] == scan_iname @@ -241,34 +257,42 @@ def make_two_level_scan( # {{{ get stable names for everything + # XXX: add inner_iname and outer_iname to var_name_gen if not none + var_name_gen = kernel.get_var_name_generator() insn_id_gen = kernel.get_instruction_id_generator() - format_kwargs = {"insn": insn_id, "iname": scan_iname, "sweep": sweep_iname} + level = 0 #scan_level or try_get_scan_level(sweep_iname) + + format_kwargs = { + "insn": insn_id, "iname": scan_iname, "sweep": sweep_iname, + "level": level, "next_level": level + 1, "prefix": "l"} nonlocal_storage_name = var_name_gen( - "{insn}_nonlocal".format(**format_kwargs)) + "{prefix}{level}_insn".format(**format_kwargs)) + + if inner_iname is None: + inner_iname = var_name_gen( + "{prefix}{level}_inner_update_{sweep}".format(**format_kwargs)) + + if outer_iname is None: + outer_iname = var_name_gen( + "{prefix}{level}_outer_update_{sweep}".format(**format_kwargs)) - inner_iname = var_name_gen( - "{sweep}_inner".format(**format_kwargs)) - outer_iname = var_name_gen( - "{sweep}_outer".format(**format_kwargs)) nonlocal_iname = var_name_gen( - "{sweep}_nonlocal".format(**format_kwargs)) + "{prefix}{level}_combine_{sweep}".format(**format_kwargs)) - if inner_local_iname is None: - inner_local_iname = var_name_gen( - "{sweep}_inner_local".format(**format_kwargs)) + inner_local_iname = var_name_gen( + "{prefix}{next_level}_inner_{sweep}".format(**format_kwargs)) inner_scan_iname = var_name_gen( - "{iname}_inner".format(**format_kwargs)) + "{prefix}{next_level}_{iname}".format(**format_kwargs)) outer_scan_iname = var_name_gen( - "{iname}_outer".format(**format_kwargs)) + "{prefix}{level}_{iname}".format(**format_kwargs)) - if outer_local_iname is None: - outer_local_iname = var_name_gen( - "{sweep}_outer_local".format(**format_kwargs)) + outer_local_iname = var_name_gen( + "{prefix}{next_level}_outer_{sweep}".format(**format_kwargs)) 
subst_name = var_name_gen( "{insn}_inner_subst".format(**format_kwargs)) @@ -278,11 +302,11 @@ def make_two_level_scan( if local_storage_name is None: local_storage_name = var_name_gen( - "{insn}_local".format(**format_kwargs)) + "{prefix}{next_level}_{insn}".format(**format_kwargs)) if nonlocal_storage_name is None: nonlocal_storage_name = var_name_gen( - "{insn}_nonlocal".format(**format_kwargs)) + "{prefix}{level}_{insn}".format(**format_kwargs)) local_scan_insn_id = insn_id_gen( "{iname}_local_scan".format(**format_kwargs)) @@ -300,6 +324,27 @@ def make_two_level_scan( # }}} + # {{{ utils + + if local_storage_axes is None: + local_storage_axes = (outer_iname, inner_iname) + + def pick_out_relevant_axes(full_indices, strip_scalar=False): + assert len(full_indices) == 2 + iname_to_index = dict(zip((outer_iname, inner_iname), full_indices)) + + result = [] + for iname in local_storage_axes: + result.append(iname_to_index[iname]) + + assert len(result) > 0 + + return tuple(result) if not (strip_scalar and len(result) == 1) else result[0] + + # }}} + + # {{{ prepare for two level scan + # Turn the scan into a substitution rule, replace the original scan with a # nop and delete the scan iname. 
# @@ -334,6 +379,8 @@ def make_two_level_scan( # Make sure we got rid of everything assert scan_iname not in kernel.all_inames() + # }}} + # {{{ implement local scan from pymbolic import var @@ -342,9 +389,16 @@ def make_two_level_scan( var(inner_scan_iname))) kernel = lp.split_iname(kernel, sweep_iname, inner_length, - inner_iname=inner_iname, outer_iname=outer_iname) + inner_iname=inner_iname, outer_iname=outer_iname, + inner_tag=inner_tag, outer_tag=outer_tag) - print("SPLITTING INAME, GOT DOMAINS", kernel.domains) + kernel = lp.duplicate_inames(kernel, + (outer_iname, inner_iname), + within="not id:*", + new_inames=[outer_local_iname, inner_local_iname], + tags={outer_iname: outer_local_tag, inner_iname: inner_local_tag}) + + kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) from loopy.kernel.data import SubstitutionRule from loopy.symbolic import Reduction @@ -353,37 +407,39 @@ def make_two_level_scan( name=local_subst_name, arguments=(outer_iname, inner_iname), expression=Reduction( - scan.operation, - (inner_scan_iname,), - local_scan_expr) - ) + scan.operation, (inner_scan_iname,), local_scan_expr)) substitutions = kernel.substitutions.copy() substitutions[local_subst_name] = local_subst kernel = kernel.copy(substitutions=substitutions) - print(kernel) + all_precompute_inames = (outer_local_iname, inner_local_iname) + + precompute_inames = pick_out_relevant_axes(all_precompute_inames) + sweep_inames = pick_out_relevant_axes((outer_iname, inner_iname)) + + precompute_outer_inames = ( + frozenset(all_precompute_inames) + - frozenset(precompute_inames)) from pymbolic import var - kernel = lp.precompute( - kernel, + kernel = lp.precompute(kernel, [var(local_subst_name)(var(outer_iname), var(inner_iname))], - storage_axes=(outer_iname, inner_iname), - sweep_inames=(outer_iname, inner_iname), - precompute_inames=(outer_local_iname, inner_local_iname), + sweep_inames=sweep_inames, + precompute_inames=precompute_inames, + 
storage_axes=local_storage_axes, + precompute_outer_inames=precompute_outer_inames, temporary_name=local_storage_name, compute_insn_id=local_scan_insn_id) - kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) - # }}} # {{{ implement local to nonlocal information transfer from loopy.symbolic import pw_aff_to_expr nonlocal_storage_len_pw_aff = ( - # The 2 here is because the first element is 0. + # FIXME: should be 1 + len, bounds check doesnt like this.. 2 + kernel.get_iname_bounds(outer_iname).upper_bound_pw_aff) nonlocal_storage_len = pw_aff_to_expr(nonlocal_storage_len_pw_aff) @@ -396,7 +452,7 @@ def make_two_level_scan( TemporaryVariable( nonlocal_storage_name, shape=(nonlocal_storage_len,), - scope=lp.auto, + scope=nonlocal_storage_scope, base_indices=lp.auto, dtype=lp.auto)) @@ -412,7 +468,8 @@ def make_two_level_scan( id=nonlocal_init_head_insn_id, assignees=(var(nonlocal_storage_name)[0],), expression=0, - within_inames=frozenset([outer_local_iname]), + within_inames=frozenset([outer_local_iname,inner_local_iname]), + predicates=frozenset([var(inner_local_iname).eq(0)]), depends_on=frozenset([local_scan_insn_id])) final_element_indices = [] @@ -420,11 +477,17 @@ def make_two_level_scan( nonlocal_init_tail = make_assignment( id=nonlocal_init_tail_insn_id, assignees=(var(nonlocal_storage_name)[var(outer_local_iname) + 1],), - expression=var(local_storage_name)[var(outer_local_iname),inner_length - 1], - within_inames=frozenset([outer_local_iname]), - depends_on=frozenset([local_scan_insn_id])) - - kernel = _update_instructions(kernel, (nonlocal_init_head, nonlocal_init_tail), copy=False) + expression=var(local_storage_name)[ + pick_out_relevant_axes( + (var(outer_local_iname),var(inner_local_iname)), + strip_scalar=True)], + no_sync_with=frozenset([(local_scan_insn_id, "local")]), + within_inames=frozenset([outer_local_iname,inner_local_iname]), + depends_on=frozenset([local_scan_insn_id]), + 
predicates=frozenset([var(inner_local_iname).eq(inner_length - 1)])) + + kernel = _update_instructions( + kernel, (nonlocal_init_head, nonlocal_init_tail), copy=False) # }}} @@ -432,6 +495,9 @@ def make_two_level_scan( kernel.domains.append(_make_slab_set(nonlocal_iname, nonlocal_storage_len)) + if nonlocal_tag is not None: + kernel = lp.tag_inames(kernel, {nonlocal_iname: nonlocal_tag}) + kernel = _add_scan_subdomain(kernel, outer_scan_iname, nonlocal_iname) nonlocal_scan = make_assignment( @@ -446,165 +512,40 @@ def make_two_level_scan( kernel = _update_instructions(kernel, (nonlocal_scan,), copy=False) - # }}} - - # {{{ replace scan with local + nonlocal - - updated_insn = insn.copy( - depends_on=insn.depends_on | frozenset([nonlocal_scan_insn_id]), - expression=var(nonlocal_storage_name)[var(outer_iname)] + var(local_storage_name)[var(outer_iname), var(inner_iname)]) - - kernel = _update_instructions(kernel, (updated_insn,), copy=False) + if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: + barrier_id = insn_id_gen("barrier_{insn}".format(**format_kwargs)) + kernel = _add_global_barrier(kernel, + source=nonlocal_init_tail_insn_id, + sink=nonlocal_scan_insn_id, + barrier_id=barrier_id) # }}} - return kernel - - -def precompute_scan( - kernel, insn_id, - sweep_iname, - scan_iname, - outer_inames=(), - temporary_scope=None, - temporary_name=None, - replace_insn_with_nop=False): - """ - Turn an expression-based scan into an array-based one. - - This takes a reduction of the form:: - - [...,sweep_iname] result = reduce(scan_iname, f(scan_iname)) - - and does essentially the following transformation:: - - [...,sweep_iname'] temp[sweep_iname'] = f(sweep_iname') - [...,sweep_iname] temp[sweep_iname] = reduce(scan_iname, temp[scan_iname]) - [...,sweep_iname] result = temp[sweep_iname] - - Note: this makes an explicit assumption that the sweep iname shares the - same bounds as the scan iname and the bounds start at 0. 
- """ - - # {{{ sanity checks - - insn = kernel.id_to_insn[insn_id] - scan = insn.expression - assert scan.inames[0] == scan_iname - assert len(scan.inames) == 1 - - # }}} - - # {{{ get a stable name for things - - var_name_gen = kernel.get_var_name_generator() - insn_id_gen = kernel.get_instruction_id_generator() - - format_kwargs = {"insn": insn_id, "iname": scan_iname} - - orig_subst_name = var_name_gen( - "{iname}_orig_subst".format(**format_kwargs)) - - scan_subst_name = var_name_gen( - "{iname}_subst".format(**format_kwargs)) - - precompute_insn = insn_id_gen( - "{insn}_precompute".format(**format_kwargs)) - - precompute_reduction_insn = insn_id_gen( - "{insn}_precompute_reduce".format(**format_kwargs)) - - if temporary_name is None: - temporary_name = var_name_gen( - "{insn}_precompute".format(**format_kwargs)) - - # }}} - - from loopy.transform.data import reduction_arg_to_subst_rule - kernel = reduction_arg_to_subst_rule( - kernel, scan_iname, subst_rule_name=orig_subst_name) - - # {{{ create our own variant of the substitution rule - - # FIXME: There has to be a better way of this. 
- - orig_subst = kernel.substitutions[orig_subst_name] - - from pymbolic.mapper.substitutor import make_subst_func - - from loopy.symbolic import ( - SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) + # {{{ replace scan with local + nonlocal - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, var_name_gen) + updated_depends_on = insn.depends_on | frozenset([nonlocal_scan_insn_id]) - from pymbolic import var - mapper = RuleAwareSubstitutionMapper( - rule_mapping_context, - make_subst_func({scan_iname: var(sweep_iname)}), - within=lambda *args: True) + if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: + barrier_id = insn_id_gen("barrier_{insn}".format(**format_kwargs)) + kernel = (_add_global_barrier(kernel, + source=nonlocal_scan_insn_id, sink=insn_id, barrier_id=barrier_id)) + updated_depends_on |= frozenset([barrier_id]) - scan_subst = orig_subst.copy( - name=scan_subst_name, - arguments=outer_inames + (sweep_iname,), - expression=mapper(orig_subst.expression, kernel, None)) + nonlocal_part = var(nonlocal_storage_name)[var(outer_iname)] - substitutions = kernel.substitutions.copy() + local_part = var(local_storage_name)[ + pick_out_relevant_axes( + (var(outer_iname), var(inner_iname)), strip_scalar=True)] - substitutions[scan_subst_name] = scan_subst + updated_insn = insn.copy( + depends_on=updated_depends_on, + # XXX: scan binary op + expression=nonlocal_part + local_part) - kernel = kernel.copy(substitutions=substitutions) + kernel = _update_instructions(kernel, (updated_insn,), copy=False) # }}} - print(kernel) - - # FIXME: multi assignments - from pymbolic import var - - # FIXME: Make a new precompute iname.... 
- - kernel = lp.precompute(kernel, - [var(scan_subst_name)( - *(tuple(var(o) for o in outer_inames) + - (var(sweep_iname),)))], - sweep_inames=outer_inames + (sweep_iname,), - precompute_inames=(sweep_iname,), - temporary_name=temporary_name, - temporary_scope=temporary_scope, - # FIXME: why on earth is this needed - compute_insn_id=precompute_insn) - - from loopy.kernel.instruction import make_assignment - - from loopy.symbolic import Reduction - precompute_reduction = insn.copy( - id=precompute_reduction_insn, - assignee=var(temporary_name)[var(sweep_iname)], - expression=Reduction( - operation=scan.operation, - inames=(scan_iname,), - exprs=(var(temporary_name)[var(scan_iname)],), - allow_simultaneous=False, - ), - depends_on=insn.depends_on | frozenset([precompute_insn])) - - kernel = kernel.copy(instructions=kernel.instructions + - [precompute_reduction]) - - new_insn = insn.copy( - expression=var(temporary_name)[var(sweep_iname)], - depends_on= - frozenset([precompute_reduction_insn]) | insn.depends_on) - - instructions = list(kernel.instructions) - - for i, insn in enumerate(instructions): - if insn.id == insn_id: - instructions[i] = new_insn - - kernel = kernel.copy(instructions=instructions) - return kernel diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 8afc1695a..29f4c0238 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -197,16 +197,16 @@ class TemporarySaver(object): The original temporary variable object. - .. attribute:: hw_inames - - The common list of hw axes that define the original object. - .. attribute:: hw_dims A list of expressions, to be added in front of the shape of the promoted temporary value, corresponding to hardware dimensions + .. attribute:: hw_tags + + The tags for the inames associated with hw_dims + .. 
attribute:: non_hw_dims A list of expressions, to be added in front of the shape @@ -241,6 +241,75 @@ class TemporarySaver(object): self.updated_temporary_variables = {} self.saves_or_reloads_added = {} + def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary): + """ + This is used for determining the amount of global storage needed for saving + and restoring the temporary across kernel calls, due to hardware + parallel inames (the inferred axes get prefixed to the number of + dimensions in the temporary). + + In the case of local temporaries, inames that are tagged + hw-local do not contribute to the global storage shape. + """ + accessor_insn_ids = ( + self.insn_query.insns_reading_or_writing(temporary.name)) + + group_tags = None + local_tags = None + + def _sortedtags(tags): + return sorted(tags, key=lambda tag: tag.axis) + + for insn_id in accessor_insn_ids: + insn = self.kernel.id_to_insn[insn_id] + + my_group_tags = [] + my_local_tags = [] + + for iname in insn.within_inames: + tag = self.kernel.iname_to_tag[iname] + + from loopy.kernel.data import ( + GroupIndexTag, LocalIndexTag, ParallelTag) + + if isinstance(tag, GroupIndexTag): + my_group_tags.append(tag) + elif isinstance(tag, LocalIndexTag): + my_local_tags.append(tag) + elif isinstance(tag, ParallelTag): + raise ValueError( + "iname '%s' is tagged with '%s' - only " + "local and global tags are supported for " + "auto saving of temporaries" % + (iname, tag)) + + if group_tags is None: + group_tags = _sortedtags(my_group_tags) + local_tags = _sortedtags(my_local_tags) + + if ( + group_tags != _sortedtags(my_group_tags) + or local_tags != _sortedtags(my_local_tags)): + raise ValueError( + "inconsistent parallel tags across instructions that access '%s'" + % temporary.name) + + if group_tags is None: + assert local_tags is None + return (), () + + group_sizes, local_sizes = ( + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + + if temporary.scope == lp.temp_var_scope.LOCAL: + 
# Elide local axes in the save slot for local temporaries. + del local_tags[:] + local_sizes = () + + # We set hw_dims to be arranged according to the order: + # g.0 < g.1 < ... < l.0 < l.1 < ... + return (group_sizes + local_sizes), tuple(group_tags + local_tags) + @memoize_method def auto_promote_temporary(self, temporary_name): temporary = self.kernel.temporary_variables[temporary_name] @@ -259,48 +328,7 @@ class TemporarySaver(object): raise ValueError( "Cannot promote temporaries with base_storage to global") - # `hw_inames`: The set of hw-parallel tagged inames that this temporary - # is associated with. This is used for determining the shape of the - # global storage needed for saving and restoring the temporary across - # kernel calls. - # - # TODO: Make a policy decision about which dimensions to use. Currently, - # the code looks at each instruction that defines or uses the temporary, - # and takes the common set of hw-parallel tagged inames associated with - # these instructions. - # - # Furthermore, in the case of local temporaries, inames that are tagged - # hw-local do not contribute to the global storage shape. - hw_inames = self.insn_query.common_hw_inames( - self.insn_query.insns_reading_or_writing(temporary.name)) - - # We want hw_inames to be arranged according to the order: - # g.0 < g.1 < ... < l.0 < l.1 < ... - # Sorting lexicographically accomplishes this. - hw_inames = sorted(hw_inames, - key=lambda iname: str(self.kernel.iname_to_tag[iname])) - - # Calculate the sizes of the dimensions that get added in front for - # the global storage of the temporary. - hw_dims = [] - - backing_hw_inames = [] - - for iname in hw_inames: - tag = self.kernel.iname_to_tag[iname] - from loopy.kernel.data import LocalIndexTag - is_local_iname = isinstance(tag, LocalIndexTag) - if is_local_iname and temporary.scope == temp_var_scope.LOCAL: - # Restrict shape to that of group inames for locals. 
- continue - backing_hw_inames.append(iname) - from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import aff_to_expr - hw_dims.append( - aff_to_expr( - static_max_of_pw_aff( - self.kernel.get_iname_bounds(iname).size, False))) - + hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) non_hw_dims = temporary.shape if len(non_hw_dims) == 0 and len(hw_dims) == 0: @@ -310,9 +338,9 @@ class TemporarySaver(object): backing_temporary = self.PromotedTemporary( name=self.var_name_gen(temporary.name + "_save_slot"), orig_temporary=temporary, - hw_dims=tuple(hw_dims), - non_hw_dims=non_hw_dims, - hw_inames=backing_hw_inames) + hw_dims=hw_dims, + hw_tags=hw_tags, + non_hw_dims=non_hw_dims) return backing_temporary @@ -330,8 +358,7 @@ class TemporarySaver(object): dchg = DomainChanger( self.kernel, frozenset( - self.insn_query.inames_in_subkernel(subkernel) | - set(promoted_temporary.hw_inames))) + self.insn_query.inames_in_subkernel(subkernel))) domain, hw_inames, dim_inames, iname_to_tag = \ self.augment_domain_for_save_or_reload( @@ -342,7 +369,7 @@ class TemporarySaver(object): save_or_load_insn_id = self.insn_name_gen( "{name}.{mode}".format(name=temporary, mode=mode)) - def subscript_or_var(agg, subscript=()): + def add_subscript_if_nonempty(agg, subscript=()): from pymbolic.primitives import Subscript, Variable if len(subscript) == 0: return Variable(agg) @@ -354,10 +381,10 @@ class TemporarySaver(object): dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] args = ( - subscript_or_var( - temporary, dim_inames_trunc), - subscript_or_var( - promoted_temporary.name, hw_inames + dim_inames)) + add_subscript_if_nonempty( + temporary, subscript=dim_inames_trunc), + add_subscript_if_nonempty( + promoted_temporary.name, subscript=hw_inames + dim_inames)) if mode == "save": args = reversed(args) @@ -471,7 +498,9 @@ class TemporarySaver(object): # Add dimension-dependent inames. 
dim_inames = [] - domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims)) + domain = domain.add(isl.dim_type.set, + len(promoted_temporary.non_hw_dims) + + len(promoted_temporary.hw_dims)) for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims): new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}". @@ -496,22 +525,30 @@ class TemporarySaver(object): from loopy.symbolic import aff_from_expr domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size)) - # FIXME: Use promoted_temporary.hw_inames - hw_inames = [] + dim_offset = orig_dim + len(promoted_temporary.non_hw_dims) - # Add hardware inames duplicates. - for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): + hw_inames = [] + # Add hardware dims. + for hw_iname_idx, (hw_tag, dim) in enumerate( + zip(promoted_temporary.hw_tags, promoted_temporary.hw_dims)): new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}". format(name=orig_temporary.name, mode=mode, - dim=t_idx, + dim=hw_iname_idx, sk=subkernel)) - hw_inames.append(new_iname) - iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname] + domain = domain.set_dim_name( + isl.dim_type.set, dim_offset + hw_iname_idx, new_iname) - from loopy.isl_helpers import duplicate_axes - domain = duplicate_axes( - domain, promoted_temporary.hw_inames, hw_inames) + aff = isl.affs_from_space(domain.space) + from loopy.symbolic import aff_from_expr + domain = (domain + & + aff[0].le_set(aff[new_iname]) + & + aff[new_iname].lt_set(aff_from_expr(domain.space, dim))) + + self.updated_iname_to_tag[new_iname] = hw_tag + hw_inames.append(new_iname) # The operations on the domain above return a Set object, but the # underlying domain should be expressible as a single BasicSet. 
diff --git a/test/test_loopy.py b/test/test_loopy.py index 5e4d013b3..1d1450fc0 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2117,6 +2117,50 @@ def test_barrier_insertion_near_bottom_of_loop(): assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) +def test_global_barrier_order_finding(): + knl = lp.make_kernel( + "{[i,itrip]: 0<=i z[i] = z[i+1] + z[i] {id=wr_z,dep=top} + <> v[i] = 11 {id=wr_v,dep=top} + ... gbarrier {dep=wr_z:wr_v,id=yoink} + z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink} + end + ... nop {id=nop} + ... gbarrier {dep=iupd,id=postloop} + z[i] = z[i] - z[i+1] + v[i] {id=zzzv,dep=postloop} + end + """) + + assert knl.global_barrier_order == ("top", "yoink", "postloop") + + for insn, barrier in ( + ("nop", None), + ("top", None), + ("wr_z", "top"), + ("wr_v", "top"), + ("yoink", "top"), + ("postloop", "yoink"), + ("zzzv", "postloop")): + assert knl.find_most_recent_global_barrier(insn) == barrier + + +def test_global_barrier_error_if_unordered(): + # FIXME: Should be illegal to declare this + knl = lp.make_kernel("{[i]: 0 <= i < 10}", + """ + ... gbarrier + ... 
gbarrier + """) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + knl.global_barrier_order + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_scan.py b/test/test_scan.py index aabfe3031..ae046818b 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -366,10 +366,10 @@ def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag): def test_two_level_scan(ctx_getter): knl = lp.make_kernel( [ - "{[i,j]: 0 <= i < 256 and 0 <= j <= i}", + "{[i,j]: 0 <= i < 16 and 0 <= j <= i}", ], """ - out[i] = sum(j, j) {id=scan} + out[i] = sum(j, 1) {id=insn} """, "...") @@ -378,12 +378,36 @@ def test_two_level_scan(ctx_getter): from loopy.transform.reduction import make_two_level_scan knl = make_two_level_scan( - knl, "scan", inner_length=128, + knl, "insn", inner_length=4, scan_iname="j", - sweep_iname="i") + sweep_iname="i", + local_storage_axes=(("l0_inner_update_i",)), + inner_iname="l0_inner_update_i", + inner_tag="l.0", + outer_tag="g.0", + local_storage_scope=lp.temp_var_scope.PRIVATE, + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + inner_local_tag="l.0", + outer_local_tag="g.0") + + print(knl) knl = lp.realize_reduction(knl, force_scan=True) + from loopy.transform.instruction import add_nosync_to_instructions + knl = add_nosync_to_instructions( + knl, + scope="global", + source="writes:acc_l0_j", + sink="reads:acc_l0_j") + + from loopy.transform.save import save_and_reload_temporaries + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + knl = save_and_reload_temporaries(knl) + knl = lp.get_one_scheduled_kernel(knl) + print(knl) c = ctx_getter() diff --git a/test/test_transform.py b/test/test_transform.py index ac5a26f6a..cf2dac48f 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -402,6 +402,10 @@ def test_precompute_with_preexisting_inames_fail(): precompute_inames="ii,jj") +def test_add_nosync_to_instructions(): + knl = 
lp.make_kernel("") + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 7a208d16410deaceef80a566e1f5c0c02bdaa37d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 8 Mar 2017 22:10:35 -0600 Subject: [PATCH 14/27] [ci skip] Three level scan: sort of working version. --- loopy/preprocess.py | 62 ++++++++++++++++++-------- loopy/schedule/__init__.py | 2 +- loopy/transform/reduction.py | 86 +++++++++++++++++++++++++++++------- loopy/transform/save.py | 55 +++++++++++++++++------ test/test_scan.py | 85 ++++++++++++++++++++++++++++++++--- 5 files changed, 235 insertions(+), 55 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ef49faa33..e32ad719d 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -363,7 +363,7 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): dim_type = isl.dim_type orig_domain = kernel.get_inames_domain( - (scan_param.sweep_iname, scan_param.scan_iname)) + frozenset((scan_param.sweep_iname, scan_param.scan_iname))) domain = _move_set_to_param_dims_except(orig_domain, (scan_param.sweep_iname, scan_param.scan_iname)) @@ -443,7 +443,7 @@ def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname)) except ValueError as v: - raise ValueError("Couldn't determine bounds for scan: %s" % e) + raise ValueError("Couldn't determine bounds for scan: %s" % v) try: stride = _try_infer_scan_stride( @@ -506,16 +506,21 @@ def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname): - # FIXME: use home domain of scan_iname... 
- domain = kernel.get_inames_domain((sweep_iname, scan_iname)) + domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname))) domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname)) - domain = domain.gist_params(domain.params()).project_out_except( - (sweep_iname,), (isl.dim_type.param,)) + var_dict = domain.get_var_dict() + sweep_idx = var_dict[sweep_iname][1] + scan_idx = var_dict[scan_iname][1] - sweep_lower_bound = domain.dim_min(domain.get_var_dict()[sweep_iname][1]) - sweep_upper_bound = domain.dim_max(domain.get_var_dict()[sweep_iname][1]) - scan_lower_bound = domain.dim_min(domain.get_var_dict()[scan_iname][1]) + domain = domain.gist_params(domain.params()) + + try: + sweep_lower_bound = domain.dim_min(sweep_idx) + sweep_upper_bound = domain.dim_max(sweep_idx) + scan_lower_bound = domain.dim_min(scan_idx) + except isl.Error as e: + raise ValueError("isl error: %s" % e) return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound) @@ -867,7 +872,11 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): - dependent_inames = frozenset(subdomain.get_var_names(isl.dim_type.param)) + # Intersect with inames, because we could have captured some kernel params + # in here too.. 
+ dependent_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) idx, = kernel.get_leaf_domain_indices(dependent_inames) domains.insert(idx + 1, subdomain) @@ -1165,7 +1174,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, @memoize def get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride): - domain = kernel.get_inames_domain((scan_iname, sweep_iname)) + domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname))) tracking_iname = var_name_gen( "{scan_iname}_tracking_{sweep_iname}" @@ -1176,9 +1185,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, new_domain = _create_domain_for_sweep_tracking(domain, tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) - _insert_subdomain_into_domain_tree(kernel, domains, new_domain) + from loopy.kernel.tools import DomainChanger + domain_idx, = temp_kernel.get_leaf_domain_indices(frozenset([sweep_iname])) + + orig_domain = domains[domain_idx] + new_domain = isl.align_spaces(new_domain, domains[domain_idx], + obj_bigger_ok=True, + across_dim_types=True) + orig_domain = isl.align_spaces(orig_domain, new_domain) - return tracking_iname, new_domain + orig_domain &= new_domain + + domains[domain_idx] = orig_domain + + return tracking_iname def replace_var_within_expr(expr, from_var, to_var): from pymbolic.mapper.substitutor import make_subst_func @@ -1223,7 +1243,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, outer_insn_inames = temp_kernel.insn_inames(insn) inames_to_remove.add(scan_iname) - track_iname, track_iname_domain = ( + track_iname = ( get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride)) @@ -1303,7 +1323,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # TODO: rename red_iname = scan_iname - size = 
_get_int_iname_size(sweep_iname) + scan_size = _get_int_iname_size(sweep_iname) + + assert scan_size > 0 + + if scan_size == 1: + raise NotImplementedError("tell matt to fix this") outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1324,7 +1349,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _get_int_iname_size(oiname) for oiname in outer_local_inames) - track_iname, track_iname_domain = get_or_add_sweep_tracking_iname_and_domain( + track_iname = get_or_add_sweep_tracking_iname_and_domain( scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride) # {{{ add separate iname to carry out the scan @@ -1333,7 +1358,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # on our red_iname. base_exec_iname = var_name_gen("scan_"+sweep_iname) - domains.append(_make_slab_set(base_exec_iname, size)) + domains.append(_make_slab_set(base_exec_iname, scan_size)) new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname] # }}} @@ -1359,7 +1384,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, acc_var_names = make_temporaries( name_based_on="acc_"+scan_iname, nvars=nresults, - shape=outer_local_iname_sizes + (size,), + shape=outer_local_iname_sizes + (scan_size,), dtypes=reduction_dtypes, scope=temp_var_scope.LOCAL) @@ -1418,7 +1443,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: return c - scan_size = size prev_id = transfer_id istage = 0 diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index c078da2ec..10a19a3c7 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -324,7 +324,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): if not may_add_to_loop_dep_map: continue - logger.debug("{knl}: loop dependency map: iname '{iname}' " + print("{knl}: loop dependency map: iname '{iname}' " "depends on '{dep_insn}' via '{insn}'" .format( knl=kernel.name, diff --git 
a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 2fd086912..c46c9481a 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -197,11 +197,16 @@ def _expand_subst_within_expression(kernel, expr): def _add_global_barrier(kernel, source, sink, barrier_id): from loopy.kernel.instruction import BarrierInstruction + within_inames = ( + kernel.id_to_insn[source].within_inames + & kernel.id_to_insn[sink].within_inames) + barrier_insn = BarrierInstruction( id=barrier_id, depends_on=frozenset([source]), + within_inames = within_inames, kind="global") - + updated_sink = kernel.id_to_insn[sink] updated_sink = updated_sink.copy( depends_on=updated_sink.depends_on | frozenset([barrier_id])) @@ -211,6 +216,39 @@ def _add_global_barrier(kernel, source, sink, barrier_id): return kernel +def _get_scan_level(sweep_iname): + SWEEP_RE = r"l(\d+)_.*" + + import re + match_result = re.match(SWEEP_RE, sweep_iname) + + if match_result is None: + return 0 + + return int(match_result.group(1)) + + +def _get_base_iname(iname): + BASE_INAME_RE = r"l\d+_(.*)" + + import re + match_result = re.match(BASE_INAME_RE, iname) + + if match_result is None: + return iname + + base_iname = match_result.group(1) + + MODIFIERS = ("inner_", "outer_") + + for modifier in MODIFIERS: + if base_iname.startswith(modifier): + base_iname = base_iname[len(modifier):] + break + + return base_iname + + def make_two_level_scan( kernel, insn_id, scan_iname, @@ -262,10 +300,12 @@ def make_two_level_scan( var_name_gen = kernel.get_var_name_generator() insn_id_gen = kernel.get_instruction_id_generator() - level = 0 #scan_level or try_get_scan_level(sweep_iname) + level = _get_scan_level(sweep_iname) + base_scan_iname = _get_base_iname(scan_iname) + base_sweep_iname = _get_base_iname(sweep_iname) format_kwargs = { - "insn": insn_id, "iname": scan_iname, "sweep": sweep_iname, + "insn": insn_id, "iname": base_scan_iname, "sweep": base_sweep_iname, "level": level, "next_level": 
level + 1, "prefix": "l"} nonlocal_storage_name = var_name_gen( @@ -273,11 +313,11 @@ def make_two_level_scan( if inner_iname is None: inner_iname = var_name_gen( - "{prefix}{level}_inner_update_{sweep}".format(**format_kwargs)) + "{prefix}{level}_inner2_{sweep}".format(**format_kwargs)) if outer_iname is None: outer_iname = var_name_gen( - "{prefix}{level}_outer_update_{sweep}".format(**format_kwargs)) + "{prefix}{level}_outer2_{sweep}".format(**format_kwargs)) nonlocal_iname = var_name_gen( "{prefix}{level}_combine_{sweep}".format(**format_kwargs)) @@ -302,11 +342,11 @@ def make_two_level_scan( if local_storage_name is None: local_storage_name = var_name_gen( - "{prefix}{next_level}_{insn}".format(**format_kwargs)) + "{prefix}{next_level}l_{insn}".format(**format_kwargs)) if nonlocal_storage_name is None: nonlocal_storage_name = var_name_gen( - "{prefix}{level}_{insn}".format(**format_kwargs)) + "{prefix}{level}nl_{insn}".format(**format_kwargs)) local_scan_insn_id = insn_id_gen( "{iname}_local_scan".format(**format_kwargs)) @@ -388,15 +428,28 @@ def make_two_level_scan( var(subst_name)(var(outer_local_iname) * inner_length + var(inner_scan_iname))) + new_inames = ["temp"] + + kernel = lp.duplicate_inames(kernel, + (sweep_iname), + within="not id:*", + new_inames=new_inames) + kernel = lp.split_iname(kernel, sweep_iname, inner_length, inner_iname=inner_iname, outer_iname=outer_iname, inner_tag=inner_tag, outer_tag=outer_tag) + kernel = lp.split_iname(kernel, new_inames[0], inner_length, + inner_iname=inner_local_iname, outer_iname=outer_local_iname, + inner_tag=inner_local_tag, outer_tag=outer_local_tag) + + """ kernel = lp.duplicate_inames(kernel, (outer_iname, inner_iname), within="not id:*", new_inames=[outer_local_iname, inner_local_iname], tags={outer_iname: outer_local_tag, inner_iname: inner_local_tag}) + """ kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) @@ -423,13 +476,17 @@ def make_two_level_scan( 
frozenset(all_precompute_inames) - frozenset(precompute_inames)) + insn = kernel.id_to_insn[insn_id] + + within_inames = insn.within_inames - frozenset([outer_iname, inner_iname]) + from pymbolic import var kernel = lp.precompute(kernel, [var(local_subst_name)(var(outer_iname), var(inner_iname))], sweep_inames=sweep_inames, precompute_inames=precompute_inames, storage_axes=local_storage_axes, - precompute_outer_inames=precompute_outer_inames, + precompute_outer_inames=precompute_outer_inames | within_inames, temporary_name=local_storage_name, compute_insn_id=local_scan_insn_id) @@ -458,17 +515,13 @@ def make_two_level_scan( kernel = kernel.copy(temporary_variables=new_temporary_variables) - insn = kernel.id_to_insn[insn_id] - - # XXX: should not include sweep iname? - within_inames = insn.within_inames - from loopy.kernel.instruction import make_assignment nonlocal_init_head = make_assignment( id=nonlocal_init_head_insn_id, assignees=(var(nonlocal_storage_name)[0],), expression=0, - within_inames=frozenset([outer_local_iname,inner_local_iname]), + within_inames=( + within_inames | frozenset([outer_local_iname,inner_local_iname])), predicates=frozenset([var(inner_local_iname).eq(0)]), depends_on=frozenset([local_scan_insn_id])) @@ -482,7 +535,8 @@ def make_two_level_scan( (var(outer_local_iname),var(inner_local_iname)), strip_scalar=True)], no_sync_with=frozenset([(local_scan_insn_id, "local")]), - within_inames=frozenset([outer_local_iname,inner_local_iname]), + within_inames=( + within_inames | frozenset([outer_local_iname,inner_local_iname])), depends_on=frozenset([local_scan_insn_id]), predicates=frozenset([var(inner_local_iname).eq(inner_length - 1)])) @@ -507,7 +561,7 @@ def make_two_level_scan( scan.operation, (outer_scan_iname,), var(nonlocal_storage_name)[var(outer_scan_iname)]), - within_inames=frozenset([nonlocal_iname]), + within_inames=within_inames | frozenset([nonlocal_iname]), depends_on=frozenset([nonlocal_init_tail_insn_id, 
nonlocal_init_head_insn_id])) kernel = _update_instructions(kernel, (nonlocal_scan,), copy=False) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 29f4c0238..ccb7c1236 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -193,9 +193,9 @@ class TemporarySaver(object): The name of the new temporary. - .. attribute:: orig_temporary + .. attribute:: orig_temporary_name - The original temporary variable object. + The name of original temporary variable object. .. attribute:: hw_dims @@ -214,9 +214,10 @@ class TemporarySaver(object): non-hardware dimensions """ - @memoize_method - def as_variable(self): - temporary = self.orig_temporary + __slots__ = ["name", "orig_temporary_name", "hw_dims", "hw_tags", "non_hw_dims"] + + def as_kernel_temporary(self, kernel): + temporary = kernel.temporary_variables[self.orig_temporary_name] from loopy.kernel.data import TemporaryVariable return TemporaryVariable( name=self.name, @@ -239,7 +240,12 @@ class TemporarySaver(object): self.extra_args_to_add = {} self.updated_iname_to_tag = {} self.updated_temporary_variables = {} + # temporary name -> save or reload insns self.saves_or_reloads_added = {} + from collections import defaultdict + self.subkernel_to_saves = defaultdict(lambda: set()) + self.subkernel_to_reloads = defaultdict(lambda: set()) + self.base_storage_to_representative = {} def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary): """ @@ -256,6 +262,7 @@ class TemporarySaver(object): group_tags = None local_tags = None + originating_insn_id = None def _sortedtags(tags): return sorted(tags, key=lambda tag: tag.axis) @@ -286,13 +293,18 @@ class TemporarySaver(object): if group_tags is None: group_tags = _sortedtags(my_group_tags) local_tags = _sortedtags(my_local_tags) + originating_insn_id = insn_id if ( group_tags != _sortedtags(my_group_tags) or local_tags != _sortedtags(my_local_tags)): raise ValueError( - "inconsistent parallel tags across instructions that access '%s'" - % 
temporary.name) + "inconsistent parallel tags across instructions that access " + "'%s', instruction '%s' has tags '%s' but instruction '%s' " + "has tags '%s'" + % (temporary.name, + originating_insn_id, group_tags + local_tags, + insn_id, my_group_tags + my_local_tags)) if group_tags is None: assert local_tags is None @@ -324,9 +336,8 @@ class TemporarySaver(object): assert temporary.read_only return None - if temporary.base_storage is not None: - raise ValueError( - "Cannot promote temporaries with base_storage to global") + if temporary.base_storage in self.base_storage_to_representative: + return self.base_storage_to_representative[temporary.base_storage] hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) non_hw_dims = temporary.shape @@ -337,11 +348,14 @@ class TemporarySaver(object): backing_temporary = self.PromotedTemporary( name=self.var_name_gen(temporary.name + "_save_slot"), - orig_temporary=temporary, + orig_temporary_name=temporary.name, hw_dims=hw_dims, hw_tags=hw_tags, non_hw_dims=non_hw_dims) + if temporary.base_storage is not None: + self.base_storage_to_representative[temporary.base_storage] = backing_temporary + return backing_temporary def save_or_reload_impl(self, temporary, subkernel, mode, @@ -354,6 +368,18 @@ class TemporarySaver(object): if promoted_temporary is None: return + if mode == "save": + if promoted_temporary.name in self.subkernel_to_saves[subkernel]: + return + else: + self.subkernel_to_saves[subkernel].add(promoted_temporary.name) + + elif mode == "reload": + if promoted_temporary.name in self.subkernel_to_reloads[subkernel]: + return + else: + self.subkernel_to_reloads[subkernel].add(promoted_temporary.name) + from loopy.kernel.tools import DomainChanger dchg = DomainChanger( self.kernel, @@ -378,7 +404,8 @@ class TemporarySaver(object): Variable(agg), tuple(map(Variable, subscript))) - dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] + orig_temporary = 
self.kernel.temporary_variables[promoted_temporary.orig_temporary_name] + dim_inames_trunc = dim_inames[:len(orig_temporary.shape)] args = ( add_subscript_if_nonempty( @@ -433,7 +460,7 @@ class TemporarySaver(object): depends_on=insn.depends_on | frozenset([save_or_load_insn_id])) self.updated_temporary_variables[promoted_temporary.name] = \ - promoted_temporary.as_variable() + promoted_temporary.as_kernel_temporary(self.kernel) self.updated_iname_to_tag.update(iname_to_tag) @@ -488,7 +515,7 @@ class TemporarySaver(object): assert mode in ("save", "reload") import islpy as isl - orig_temporary = promoted_temporary.orig_temporary + orig_temporary = self.kernel.temporary_variables[promoted_temporary.orig_temporary_name] orig_dim = domain.dim(isl.dim_type.set) # Tags for newly added inames diff --git a/test/test_scan.py b/test/test_scan.py index ae046818b..60a2f4272 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -207,7 +207,7 @@ def test_local_parallel_scan(ctx_factory, n): "..." 
) - knl = lp.fix_parameters(knl, n=16) + knl = lp.fix_parameters(knl, n=n) knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.realize_reduction(knl, force_scan=True) @@ -215,8 +215,8 @@ def test_local_parallel_scan(ctx_factory, n): knl = lp.add_dtypes(knl, dict(a=int)) - evt, (a,) = knl(queue, a=np.arange(16)) - assert (a == np.cumsum(np.arange(16)**2)).all() + evt, (a,) = knl(queue, a=np.arange(n)) + assert (a == np.cumsum(np.arange(n)**2)).all() def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory): @@ -369,7 +369,7 @@ def test_two_level_scan(ctx_getter): "{[i,j]: 0 <= i < 16 and 0 <= j <= i}", ], """ - out[i] = sum(j, 1) {id=insn} + out[i] = sum(j, j) {id=insn} """, "...") @@ -381,7 +381,7 @@ def test_two_level_scan(ctx_getter): knl, "insn", inner_length=4, scan_iname="j", sweep_iname="i", - local_storage_axes=(("l0_inner_update_i",)), + local_storage_axes=(("l0_inner2_i",)), inner_iname="l0_inner_update_i", inner_tag="l.0", outer_tag="g.0", @@ -418,6 +418,81 @@ def test_two_level_scan(ctx_getter): print(out.get()) +def test_three_level_scan(ctx_getter): + knl = lp.make_kernel( + [ + "{[i,j]: 0 <= i < 16 and 0 <= j <= i}", + ], + """ + out[i] = sum(j, j) {id=insn} + """, + "...") + + #knl = lp.tag_inames(knl, dict(i="l.0")) + + from loopy.transform.reduction import make_two_level_scan + + knl = make_two_level_scan( + knl, "insn", inner_length=4, + scan_iname="j", + sweep_iname="i", + local_storage_axes=(("l0_inner_update_i",)), + inner_iname="l0_inner_update_i", + inner_tag="l.0", + outer_tag="g.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + inner_local_tag=None, + outer_local_tag="g.0") + + knl = make_two_level_scan( + knl, "j_local_scan", inner_length=2, + scan_iname="l1_j", + sweep_iname="l1_inner_i", + inner_tag="for", + outer_tag="l.0", + nonlocal_tag="l.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.LOCAL, + inner_local_tag="for", + 
outer_local_tag="l.0") + + print(knl) + + knl = lp.realize_reduction(knl, force_scan=True) + + from loopy.transform.instruction import add_nosync_to_instructions + knl = add_nosync_to_instructions( + knl, + scope="global", + source="writes:acc_l0_j", + sink="reads:acc_l0_j") + + knl = lp.alias_temporaries(knl, ["l1l_insn", "l2l_j_local_scan"], synchronize_for_exclusive_use=False) + + print(knl.get_temporary_to_base_storage_map()) + + print(knl) + + from loopy.transform.save import save_and_reload_temporaries + + print(knl) + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + knl = save_and_reload_temporaries(knl) + knl = lp.get_one_scheduled_kernel(knl) + + print(knl) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q) + + print(out.get()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From f8e5383a222c5febb77629d476d87a1e8fd7b47b Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 10 Mar 2017 01:46:58 -0600 Subject: [PATCH 15/27] [ci skip] Three level scan improvements. 
--- loopy/preprocess.py | 159 ++++++++++++++++++++++------------ loopy/schedule/__init__.py | 2 +- loopy/transform/precompute.py | 6 +- loopy/transform/reduction.py | 136 ++++++++++++++++------------- loopy/transform/save.py | 67 ++++++++------ 5 files changed, 226 insertions(+), 144 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e32ad719d..ce69efa66 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -343,13 +343,21 @@ def _move_set_to_param_dims_except(domain, except_dims): return domain +def _domain_depends_on_given_set_dims(domain, set_dim_names): + set_dim_names = frozenset(set_dim_names) + + return any( + set_dim_names & set(constr.get_coefficients_by_name()) + for constr in domain.get_constraints()) + + def _check_reduction_is_triangular(kernel, expr, scan_param): """Check whether the reduction within `expr` with scan parameters described by the structure `scan_param` is triangular. This attempts to verify that the domain for the scan and sweep inames is as follows: - [other inames] -> { - [scan_iname, sweep_iname]: + [params] -> { + [other inames..., scan_iname, sweep_iname]: (sweep_min_value <= sweep_iname <= sweep_max_value) @@ -357,6 +365,8 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): (scan_min_value <= scan_iname <= stride * (sweep_iname - sweep_min_value) + scan_min_value) + and + (irrelevant constraints) } """ @@ -365,49 +375,74 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): orig_domain = kernel.get_inames_domain( frozenset((scan_param.sweep_iname, scan_param.scan_iname))) - domain = _move_set_to_param_dims_except(orig_domain, - (scan_param.sweep_iname, scan_param.scan_iname)) + sweep_iname = scan_param.sweep_iname + scan_iname = scan_param.scan_iname + affs = isl.affs_from_space(orig_domain.space) - params_for_gisting = domain.params() + sweep_lower_bound = isl.align_spaces( + scan_param.sweep_lower_bound, + affs[0], + across_dim_types=True) - domain = 
domain.gist_params(params_for_gisting) + sweep_upper_bound = isl.align_spaces( + scan_param.sweep_upper_bound, + affs[0], + across_dim_types=True) - tri_domain = isl.BasicSet.universe(domain.params().space) + scan_lower_bound = isl.align_spaces( + scan_param.scan_lower_bound, + affs[0], + across_dim_types=True) - sweep_iname = scan_param.sweep_iname - scan_iname = scan_param.scan_iname + """ + print("SWEEP AND SCAN INAMES", sweep_iname, scan_iname) + print("SWEEP UPPER BOUND", sweep_upper_bound) + print("SCAN LOWER BOUND", scan_lower_bound) + print("SWEEP LOWER BOUND", sweep_lower_bound) + """ - tri_domain = _add_params_to_domain(tri_domain, (sweep_iname, scan_iname)) + from itertools import product - affs = isl.affs_from_space(tri_domain.space) + for (sweep_lb_domain, sweep_lb_aff), \ + (sweep_ub_domain, sweep_ub_aff), \ + (scan_lb_domain, scan_lb_aff) in \ + product(sweep_lower_bound.get_pieces(), + sweep_upper_bound.get_pieces(), + scan_lower_bound.get_pieces()): - # Add sweep iname constraints - tri_domain &= affs[sweep_iname].ge_set(scan_param.sweep_lower_bound) - tri_domain &= affs[sweep_iname].le_set(scan_param.sweep_upper_bound) + # Assumptions inherited from the domains of the pwaffs + assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain - # Add scan iname constraints - scan_min_value = scan_param.scan_lower_bound - tri_domain &= affs[scan_iname].ge_set(scan_min_value) - tri_domain &= affs[scan_iname].le_set( - scan_param.stride * (affs[sweep_iname] - scan_param.sweep_lower_bound) - + scan_min_value) + # Sweep iname constraints + hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff) + hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff) - # Gist against domain params - tri_domain = tri_domain.gist_params(params_for_gisting) + # Scan iname constraints + hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff) + hyp_domain &= affs[scan_iname].le_set( + scan_param.stride * (affs[sweep_iname] - sweep_lb_aff) + + scan_lb_aff) - # Move sweep and scan inames 
into the set - tri_domain = tri_domain.move_dims( - dim_type.set, 0, - dim_type.param, tri_domain.dim(dim_type.param) - 2, 2) + hyp_domain, = (hyp_domain & assumptions).get_basic_sets() + test_domain, = (orig_domain & assumptions).get_basic_sets() - tri_domain, domain = isl.align_two(tri_domain, domain) + """ + print("ASSUMPTIONS", assumptions) + print("HYP", hyp_domain) + print("TEST", test_domain) + print("HYP AGAINST TEST", hyp_domain.gist(test_domain)) + print("TEST AGAINST HYP", test_domain.gist(hyp_domain)) + """ - if domain != tri_domain: - # FIXME: Return a more descriptive error message. - return False, ( - "domains are not equal: expected '%s', got '%s'" % (tri_domain, domain)) - else: - return True, "ok" + if _domain_depends_on_given_set_dims(hyp_domain.gist(test_domain), + (sweep_iname, scan_iname)): + return False, "cond1" + + if _domain_depends_on_given_set_dims(test_domain.gist(hyp_domain), + (sweep_iname, scan_iname)): + return False, "cond2" + + return True, "ok" _ScanCandidateParameters = namedtuple( @@ -416,7 +451,7 @@ _ScanCandidateParameters = namedtuple( "sweep_upper_bound, scan_lower_bound, stride") -def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): +def _try_infer_scan_candidate_from_expr(kernel, expr, within_inames, sweep_iname=None): """Analyze `expr` and determine if it can be implemented as a scan. 
""" from loopy.symbolic import Reduction @@ -437,19 +472,19 @@ def _try_infer_scan_candidate_from_expr(kernel, expr, sweep_iname=None): sweep_iname = _try_infer_sweep_iname( domain, scan_iname, kernel.all_inames()) except ValueError as v: - raise ValueError("Couldn't determine a sweep iname for the scan: %s" % v) + raise ValueError("Couldn't determine a sweep iname for the scan expression '%s': %s" % (expr, v)) try: sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( - _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname)) + _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames)) except ValueError as v: - raise ValueError("Couldn't determine bounds for scan: %s" % v) + raise ValueError("Couldn't determine bounds for the scan with expression '%s' (sweep iname: '%s', scan iname: '%s'): %s" % (expr, sweep_iname, scan_iname, v)) try: stride = _try_infer_scan_stride( kernel, scan_iname, sweep_iname, sweep_lower_bound) except ValueError as v: - raise ValueError("Couldn't determine a scan stride: %s" % v) + raise ValueError("Couldn't determine a scan stride for the scan with expression '%s' (sweep iname: '%s', scan iname: '%s'): %s" % (expr, sweep_iname, scan_iname, v)) return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound, sweep_upper_bound, scan_lower_bound, stride) @@ -505,7 +540,7 @@ def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): return sweep_iname_candidate -def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname): +def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames): domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname))) domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname)) @@ -513,7 +548,8 @@ def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname): sweep_idx = var_dict[sweep_iname][1] scan_idx = var_dict[scan_iname][1] - domain = domain.gist_params(domain.params()) + domain = 
domain.project_out_except( + within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,)) try: sweep_lower_bound = domain.dim_min(sweep_idx) @@ -536,6 +572,9 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): domain = kernel.get_inames_domain((sweep_iname, scan_iname)) domain_with_sweep_param = _move_set_to_param_dims_except(domain, (scan_iname,)) + domain_with_sweep_param = domain_with_sweep_param.project_out_except( + (sweep_iname, scan_iname), (dim_type.set, dim_type.param)) + scan_iname_idx = domain_with_sweep_param.find_dim_by_name( dim_type.set, scan_iname) @@ -1173,18 +1212,16 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, @memoize def get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride): + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + tracking_iname): domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname))) - tracking_iname = var_name_gen( - "{scan_iname}_tracking_{sweep_iname}" - .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) - inames_added_for_scan.add(tracking_iname) new_domain = _create_domain_for_sweep_tracking(domain, tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) + """ from loopy.kernel.tools import DomainChanger domain_idx, = temp_kernel.get_leaf_domain_indices(frozenset([sweep_iname])) @@ -1195,8 +1232,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, orig_domain = isl.align_spaces(orig_domain, new_domain) orig_domain &= new_domain + """ + _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain) - domains[domain_idx] = orig_domain + #domains[domain_idx] = orig_domain return tracking_iname @@ -1243,10 +1282,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, outer_insn_inames = temp_kernel.insn_inames(insn) inames_to_remove.add(scan_iname) - track_iname = ( - 
get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, scan_min_value, - stride)) + track_iname = var_name_gen( + "{sweep_iname}__seq_scan" + .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + + get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, + stride, track_iname) from loopy.kernel.data import temp_var_scope acc_var_names = make_temporaries( @@ -1349,15 +1391,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _get_int_iname_size(oiname) for oiname in outer_local_inames) - track_iname = get_or_add_sweep_tracking_iname_and_domain( - scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride) + track_iname = var_name_gen( + "{sweep_iname}__pre_scan" + .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + + get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + track_iname) # {{{ add separate iname to carry out the scan # Doing this sheds any odd conditionals that may be active # on our red_iname. - base_exec_iname = var_name_gen("scan_"+sweep_iname) + base_exec_iname = var_name_gen(sweep_iname + "__scan") domains.append(_make_slab_set(base_exec_iname, scan_size)) new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname] @@ -1448,7 +1495,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, istage = 0 cur_size = 1 while cur_size < scan_size: - stage_exec_iname = var_name_gen("scan_%s_s%d" % (red_iname, istage)) + stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) domains.append( _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] @@ -1525,6 +1572,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Only expand one level of reduction at a time, going from outermost to # innermost. 
Otherwise we get the (iname + insn) dependencies wrong. + #print(temp_kernel) + arg_dtypes, reduction_dtypes = ( _infer_arg_dtypes_and_reduction_dtypes( temp_kernel, expr, unknown_types_ok)) @@ -1556,7 +1605,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Try to determine scan candidate information (sweep iname, scan # iname, etc). scan_param = _try_infer_scan_candidate_from_expr( - temp_kernel, expr, sweep_iname=force_outer_iname_for_scan) + temp_kernel, expr, outer_insn_inames, sweep_iname=force_outer_iname_for_scan) except ValueError as v: error = str(v) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 10a19a3c7..c078da2ec 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -324,7 +324,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): if not may_add_to_loop_dep_map: continue - print("{knl}: loop dependency map: iname '{iname}' " + logger.debug("{knl}: loop dependency map: iname '{iname}' " "depends on '{dep_insn}' via '{insn}'" .format( knl=kernel.name, diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index a19e06ecd..e6a329386 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -725,11 +725,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, mod_check_domain, check_domain) # The modified domain can't get bigger by adding constraints - assert mod_check_domain <= check_domain + assert mod_check_domain.gist_params(kernel.assumptions) <= check_domain.gist_params(kernel.assumptions) if not check_domain <= mod_check_domain: - print(check_domain) - print(mod_check_domain) + print(check_domain.gist_params(kernel.assumptions)) + print(mod_check_domain.gist_params(kernel.assumptions)) raise LoopyError("original domain got shrunk by applying the precompute") # }}} diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index c46c9481a..97ff545b0 100644 --- 
a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -145,16 +145,47 @@ def _update_instructions(kernel, id_to_new_insn, copy=True): return kernel -def _make_slab_set(iname, size): +def _make_slab_set(iname, size, space): # FIXME: stolen from preprocess, should be its own thing... - v = isl.make_zero_and_vars([iname]) + from pymbolic.mapper.dependency import DependencyMapper + + space = size.get_domain_space() + + new_space = (size.get_domain_space() + .add_dims(isl.dim_type.set, 1) + .set_dim_name(isl.dim_type.set, space.dim(isl.dim_type.set), iname)) + + v = isl.affs_from_space(new_space) + + size = isl.align_spaces(size, v[0]) + bs, = ( v[0].le_set(v[iname]) & v[iname].lt_set(v[0] + size)).get_basic_sets() + return bs +def _add_subdomain_to_kernel(kernel, subdomain): + domains = list(kernel.domains) + dep_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) & kernel.all_inames()) + + indices = kernel.get_leaf_domain_indices(dep_inames) + + if len(indices) == 0: + domains.append(subdomain) + elif len(indices) == 1: + idx, = indices + domains.insert(idx + 1, subdomain) + else: + print(indices) + raise ValueError("more than 1 leaf index") + + return kernel.copy(domains=domains) + + def _add_scan_subdomain( kernel, scan_iname, sweep_iname): """ @@ -174,12 +205,7 @@ def _add_scan_subdomain( & affs[scan_iname].ge_set(affs[0])).get_basic_sets() - sweep_idx, = kernel.get_leaf_domain_indices((sweep_iname,)) - - domains = list(kernel.domains) - domains.insert(sweep_idx + 1, subd) - - return kernel.copy(domains=domains) + return _add_subdomain_to_kernel(kernel, subd) def _expand_subst_within_expression(kernel, expr): @@ -217,7 +243,7 @@ def _add_global_barrier(kernel, source, sink, barrier_id): def _get_scan_level(sweep_iname): - SWEEP_RE = r"l(\d+)_.*" + SWEEP_RE = r".*__l(\d+)(?:_outer)?" 
import re match_result = re.match(SWEEP_RE, sweep_iname) @@ -229,7 +255,7 @@ def _get_scan_level(sweep_iname): def _get_base_iname(iname): - BASE_INAME_RE = r"l\d+_(.*)" + BASE_INAME_RE = r"(.*)__l\d+(?:_outer)?" import re match_result = re.match(BASE_INAME_RE, iname) @@ -237,16 +263,7 @@ def _get_base_iname(iname): if match_result is None: return iname - base_iname = match_result.group(1) - - MODIFIERS = ("inner_", "outer_") - - for modifier in MODIFIERS: - if base_iname.startswith(modifier): - base_iname = base_iname[len(modifier):] - break - - return base_iname + return match_result.group(1) def make_two_level_scan( @@ -303,36 +320,34 @@ def make_two_level_scan( level = _get_scan_level(sweep_iname) base_scan_iname = _get_base_iname(scan_iname) base_sweep_iname = _get_base_iname(sweep_iname) + base_insn_id = _get_base_iname(insn_id) format_kwargs = { - "insn": insn_id, "iname": base_scan_iname, "sweep": base_sweep_iname, + "insn": base_insn_id, "iname": base_scan_iname, "sweep": base_sweep_iname, "level": level, "next_level": level + 1, "prefix": "l"} - nonlocal_storage_name = var_name_gen( - "{prefix}{level}_insn".format(**format_kwargs)) - if inner_iname is None: inner_iname = var_name_gen( - "{prefix}{level}_inner2_{sweep}".format(**format_kwargs)) + "{sweep}__l{level}".format(**format_kwargs)) if outer_iname is None: outer_iname = var_name_gen( - "{prefix}{level}_outer2_{sweep}".format(**format_kwargs)) + "{sweep}__l{level}_outer".format(**format_kwargs)) nonlocal_iname = var_name_gen( - "{prefix}{level}_combine_{sweep}".format(**format_kwargs)) + "{sweep}__l{level}_nonloc".format(**format_kwargs)) inner_local_iname = var_name_gen( - "{prefix}{next_level}_inner_{sweep}".format(**format_kwargs)) + "{sweep}__l{next_level}".format(**format_kwargs)) inner_scan_iname = var_name_gen( - "{prefix}{next_level}_{iname}".format(**format_kwargs)) - - outer_scan_iname = var_name_gen( - "{prefix}{level}_{iname}".format(**format_kwargs)) + 
"{iname}__l{next_level}".format(**format_kwargs)) outer_local_iname = var_name_gen( - "{prefix}{next_level}_outer_{sweep}".format(**format_kwargs)) + "{sweep}__l{next_level}_outer".format(**format_kwargs)) + + outer_scan_iname = var_name_gen( + "{iname}__l{level}".format(**format_kwargs)) subst_name = var_name_gen( "{insn}_inner_subst".format(**format_kwargs)) @@ -342,17 +357,17 @@ def make_two_level_scan( if local_storage_name is None: local_storage_name = var_name_gen( - "{prefix}{next_level}l_{insn}".format(**format_kwargs)) + "{insn}__l{next_level}".format(**format_kwargs)) if nonlocal_storage_name is None: nonlocal_storage_name = var_name_gen( - "{prefix}{level}nl_{insn}".format(**format_kwargs)) + "{insn}__l{level}_outer".format(**format_kwargs)) local_scan_insn_id = insn_id_gen( - "{iname}_local_scan".format(**format_kwargs)) + "{insn}__l{next_level}".format(**format_kwargs)) nonlocal_scan_insn_id = insn_id_gen( - "{iname}_nonlocal_scan".format(**format_kwargs)) + "{insn}__l{level}".format(**format_kwargs)) format_kwargs.update({"nonlocal": nonlocal_storage_name}) @@ -394,9 +409,11 @@ def make_two_level_scan( kernel = reduction_arg_to_subst_rule( kernel, scan_iname, subst_rule_name=subst_name) + from loopy.kernel.instruction import NoOpInstruction # FIXME: this is stupid kernel = _update_instructions(kernel, {insn_id: insn.copy(expression=0)}) + """ {insn_id: NoOpInstruction( id=insn_id, @@ -425,33 +442,13 @@ def make_two_level_scan( from pymbolic import var local_scan_expr = _expand_subst_within_expression(kernel, - var(subst_name)(var(outer_local_iname) * inner_length + + var(subst_name)(var(outer_iname) * inner_length + var(inner_scan_iname))) - new_inames = ["temp"] - - kernel = lp.duplicate_inames(kernel, - (sweep_iname), - within="not id:*", - new_inames=new_inames) - kernel = lp.split_iname(kernel, sweep_iname, inner_length, inner_iname=inner_iname, outer_iname=outer_iname, inner_tag=inner_tag, outer_tag=outer_tag) - kernel = lp.split_iname(kernel, 
new_inames[0], inner_length, - inner_iname=inner_local_iname, outer_iname=outer_local_iname, - inner_tag=inner_local_tag, outer_tag=outer_local_tag) - - """ - kernel = lp.duplicate_inames(kernel, - (outer_iname, inner_iname), - within="not id:*", - new_inames=[outer_local_iname, inner_local_iname], - tags={outer_iname: outer_local_tag, inner_iname: inner_local_tag}) - """ - - kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) from loopy.kernel.data import SubstitutionRule from loopy.symbolic import Reduction @@ -467,6 +464,8 @@ def make_two_level_scan( kernel = kernel.copy(substitutions=substitutions) + outer_local_iname = outer_iname + all_precompute_inames = (outer_local_iname, inner_local_iname) precompute_inames = pick_out_relevant_axes(all_precompute_inames) @@ -481,8 +480,10 @@ def make_two_level_scan( within_inames = insn.within_inames - frozenset([outer_iname, inner_iname]) from pymbolic import var + kernel = lp.precompute(kernel, - [var(local_subst_name)(var(outer_iname), var(inner_iname))], + [var(local_subst_name)( + var(outer_iname), var(inner_iname))], sweep_inames=sweep_inames, precompute_inames=precompute_inames, storage_axes=local_storage_axes, @@ -490,14 +491,27 @@ def make_two_level_scan( temporary_name=local_storage_name, compute_insn_id=local_scan_insn_id) + kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) + # }}} # {{{ implement local to nonlocal information transfer + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import aff_to_expr, pw_aff_to_expr + nonlocal_storage_len_pw_aff = 2 + static_max_of_pw_aff( + kernel.get_iname_bounds(outer_iname).size, + constants_only=False) + + """ + nonlocal_storage_len = ( + kernel.get_constant_iname_length(outer_iname) + from loopy.symbolic import pw_aff_to_expr nonlocal_storage_len_pw_aff = ( # FIXME: should be 1 + len, bounds check doesnt like this.. 
2 + kernel.get_iname_bounds(outer_iname).upper_bound_pw_aff) + """ nonlocal_storage_len = pw_aff_to_expr(nonlocal_storage_len_pw_aff) @@ -547,7 +561,9 @@ def make_two_level_scan( # {{{ implement nonlocal scan - kernel.domains.append(_make_slab_set(nonlocal_iname, nonlocal_storage_len)) + outer_idx, = kernel.get_leaf_domain_indices((outer_iname,)) + subd = _make_slab_set(nonlocal_iname, nonlocal_storage_len_pw_aff, nonlocal_storage_len_pw_aff.space) + kernel = _add_subdomain_to_kernel(kernel, subd) if nonlocal_tag is not None: kernel = lp.tag_inames(kernel, {nonlocal_iname: nonlocal_tag}) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index ccb7c1236..24360a808 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -234,19 +234,33 @@ class TemporarySaver(object): self.insn_query = InstructionQuery(kernel) self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() + # These fields keep track of updates to the kernel. self.insns_to_insert = [] self.insns_to_update = {} self.extra_args_to_add = {} self.updated_iname_to_tag = {} self.updated_temporary_variables = {} + # temporary name -> save or reload insns self.saves_or_reloads_added = {} + from collections import defaultdict self.subkernel_to_saves = defaultdict(lambda: set()) self.subkernel_to_reloads = defaultdict(lambda: set()) self.base_storage_to_representative = {} + from loopy.kernel.data import ValueArg + import islpy as isl + self.new_subdomain = ( + isl.BasicSet.universe( + isl.Space.create_from_names( + isl.DEFAULT_CONTEXT, + set=[], + params=set( + arg.name for arg in kernel.args + if isinstance(arg, ValueArg))))) + def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary): """ This is used for determining the amount of global storage needed for saving @@ -257,12 +271,13 @@ class TemporarySaver(object): In the case of local temporaries, inames that are tagged hw-local do not contribute to the global storage shape. 
""" + accessor_insn_ids = ( self.insn_query.insns_reading_or_writing(temporary.name)) group_tags = None local_tags = None - originating_insn_id = None + group_tag_originating_insn_id = None def _sortedtags(tags): return sorted(tags, key=lambda tag: tag.axis) @@ -274,7 +289,10 @@ class TemporarySaver(object): my_local_tags = [] for iname in insn.within_inames: - tag = self.kernel.iname_to_tag[iname] + tag = self.kernel.iname_to_tag.get(iname) + + if tag is None: + continue from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, ParallelTag) @@ -286,24 +304,24 @@ class TemporarySaver(object): elif isinstance(tag, ParallelTag): raise ValueError( "iname '%s' is tagged with '%s' - only " - "local and global tags are supported for " - "auto saving of temporaries" % + "group and local tags are supported for " + "auto save/reload of temporaries" % (iname, tag)) if group_tags is None: group_tags = _sortedtags(my_group_tags) local_tags = _sortedtags(my_local_tags) - originating_insn_id = insn_id + group_tags_originating_insn_id = insn_id if ( group_tags != _sortedtags(my_group_tags) or local_tags != _sortedtags(my_local_tags)): raise ValueError( "inconsistent parallel tags across instructions that access " - "'%s', instruction '%s' has tags '%s' but instruction '%s' " - "has tags '%s'" + "'%s' (specifically, instruction '%s' has tags '%s' but " + "instruction '%s' has tags '%s')" % (temporary.name, - originating_insn_id, group_tags + local_tags, + group_tags_originating_insn_id, group_tags + local_tags, insn_id, my_group_tags + my_local_tags)) if group_tags is None: @@ -337,6 +355,7 @@ class TemporarySaver(object): return None if temporary.base_storage in self.base_storage_to_representative: + # FIXME: Pick the representative with the largest size... 
return self.base_storage_to_representative[temporary.base_storage] hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) @@ -347,7 +366,7 @@ class TemporarySaver(object): non_hw_dims = (1,) backing_temporary = self.PromotedTemporary( - name=self.var_name_gen(temporary.name + "_save_slot"), + name=self.var_name_gen(temporary.name + "__save_slot"), orig_temporary_name=temporary.name, hw_dims=hw_dims, hw_tags=hw_tags, @@ -371,31 +390,23 @@ class TemporarySaver(object): if mode == "save": if promoted_temporary.name in self.subkernel_to_saves[subkernel]: return - else: - self.subkernel_to_saves[subkernel].add(promoted_temporary.name) + self.subkernel_to_saves[subkernel].add(promoted_temporary.name) elif mode == "reload": if promoted_temporary.name in self.subkernel_to_reloads[subkernel]: return - else: - self.subkernel_to_reloads[subkernel].add(promoted_temporary.name) + self.subkernel_to_reloads[subkernel].add(promoted_temporary.name) - from loopy.kernel.tools import DomainChanger - dchg = DomainChanger( - self.kernel, - frozenset( - self.insn_query.inames_in_subkernel(subkernel))) - - domain, hw_inames, dim_inames, iname_to_tag = \ + new_subdomain, hw_inames, dim_inames, iname_to_tag = \ self.augment_domain_for_save_or_reload( - dchg.domain, promoted_temporary, mode, subkernel) + self.new_subdomain, promoted_temporary, mode, subkernel) - self.kernel = dchg.get_kernel_with(domain) + self.new_subdomain = new_subdomain save_or_load_insn_id = self.insn_name_gen( "{name}.{mode}".format(name=temporary, mode=mode)) - def add_subscript_if_nonempty(agg, subscript=()): + def add_subscript_if_subscript_nonempty(agg, subscript=()): from pymbolic.primitives import Subscript, Variable if len(subscript) == 0: return Variable(agg) @@ -408,9 +419,9 @@ class TemporarySaver(object): dim_inames_trunc = dim_inames[:len(orig_temporary.shape)] args = ( - add_subscript_if_nonempty( + add_subscript_if_subscript_nonempty( temporary, subscript=dim_inames_trunc), - 
add_subscript_if_nonempty( + add_subscript_if_subscript_nonempty( promoted_temporary.name, subscript=hw_inames + dim_inames)) if mode == "save": @@ -490,7 +501,13 @@ class TemporarySaver(object): self.updated_iname_to_tag.update(self.kernel.iname_to_tag) self.updated_temporary_variables.update(self.kernel.temporary_variables) + new_domains = list(self.kernel.domains) + import islpy as isl + if self.new_subdomain.dim(isl.dim_type.set) > 0: + new_domains.append(self.new_subdomain) + kernel = self.kernel.copy( + domains=new_domains, instructions=new_instructions, iname_to_tag=self.updated_iname_to_tag, temporary_variables=self.updated_temporary_variables, -- GitLab From 5cd9be0358077be892fa40f2d7aa5d77719e6844 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 10 Mar 2017 12:54:42 -0600 Subject: [PATCH 16/27] Make tests pass. --- loopy/preprocess.py | 23 +++++---- test/test_scan.py | 118 +++++++++++++++++++++++++------------------- 2 files changed, 81 insertions(+), 60 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ce69efa66..41e67452e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -478,7 +478,8 @@ def _try_infer_scan_candidate_from_expr(kernel, expr, within_inames, sweep_iname sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames)) except ValueError as v: - raise ValueError("Couldn't determine bounds for the scan with expression '%s' (sweep iname: '%s', scan iname: '%s'): %s" % (expr, sweep_iname, scan_iname, v)) + raise ValueError("Couldn't determine bounds for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" % (expr, sweep_iname, scan_iname, v)) try: stride = _try_infer_scan_stride( @@ -580,10 +581,13 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): # Should be equal to k * sweep_iname, where k is the stride. 
- scan_iname_range = ( - domain_with_sweep_param.dim_max(scan_iname_idx) - - domain_with_sweep_param.dim_min(scan_iname_idx) - ).gist(domain_with_sweep_param.params()) + try: + scan_iname_range = ( + domain_with_sweep_param.dim_max(scan_iname_idx) + - domain_with_sweep_param.dim_min(scan_iname_idx) + ).gist(domain_with_sweep_param.params()) + except isl.Error as e: + raise ValueError("isl error: '%s'" % e) scan_iname_pieces = scan_iname_range.get_pieces() @@ -1370,7 +1374,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, assert scan_size > 0 if scan_size == 1: - raise NotImplementedError("tell matt to fix this") + return map_reduction_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1482,6 +1487,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([init_id]) | insn.depends_on, no_sync_with=frozenset([(init_id, "any")])) + generated_insns.append(transfer_insn) def _strip_if_scalar(c): @@ -1494,6 +1500,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, istage = 0 cur_size = 1 + while cur_size < scan_size: stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) domains.append( @@ -1549,10 +1556,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, istage += 1 new_insn_add_depends_on.add(prev_id) - #output_iname = var_name_gen("scan_%s_output" % red_iname) - #domains.append(_make_slab_set(output_iname, scan_size)) - #new_iname_tags[output_iname] = kernel.iname_to_tag[sweep_iname] - #new_insn_add_within_inames.add(output_iname) new_insn_add_within_inames.add(sweep_iname) output_idx = var(sweep_iname) - sweep_min_value_expr diff --git a/test/test_scan.py b/test/test_scan.py index 60a2f4272..5ea203fff 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -32,6 +32,7 @@ import pyopencl as cl import pyopencl.clmath 
# noqa import pyopencl.clrandom # noqa import pytest +from pytools import memoize import logging logger = logging.getLogger(__name__) @@ -126,7 +127,7 @@ def test_automatic_scan_detection(): ) cgr = lp.generate_code_v2(knl) - assert "tracking" in cgr.device_code() + assert "scan" in cgr.device_code() def test_selective_scan_realization(): @@ -363,43 +364,41 @@ def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag): assert [(e == a).all() for e, a in zip(expected, actual)] -def test_two_level_scan(ctx_getter): +# {{{ two and three level scan getters + +@memoize +def _get_two_level_scan_kernel(g_size): knl = lp.make_kernel( [ - "{[i,j]: 0 <= i < 16 and 0 <= j <= i}", + "[n] -> {[i,j]: 0 <= i < n and 0 <= j <= i}", ], """ - out[i] = sum(j, j) {id=insn} + out[i] = sum(j, a[j]) {id=insn} """, "...") - #knl = lp.tag_inames(knl, dict(i="l.0")) - from loopy.transform.reduction import make_two_level_scan - knl = make_two_level_scan( - knl, "insn", inner_length=4, + knl, "insn", inner_length=g_size, scan_iname="j", sweep_iname="i", - local_storage_axes=(("l0_inner2_i",)), - inner_iname="l0_inner_update_i", + local_storage_axes=(("i__l0",)), + inner_iname="i__l0", inner_tag="l.0", outer_tag="g.0", - local_storage_scope=lp.temp_var_scope.PRIVATE, + local_storage_scope=lp.temp_var_scope.LOCAL, nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, inner_local_tag="l.0", outer_local_tag="g.0") - print(knl) - knl = lp.realize_reduction(knl, force_scan=True) from loopy.transform.instruction import add_nosync_to_instructions knl = add_nosync_to_instructions( knl, scope="global", - source="writes:acc_l0_j", - sink="reads:acc_l0_j") + source="writes:acc_j__l0", + sink="reads:acc_j__l0") from loopy.transform.save import save_and_reload_temporaries @@ -408,47 +407,40 @@ def test_two_level_scan(ctx_getter): knl = save_and_reload_temporaries(knl) knl = lp.get_one_scheduled_kernel(knl) - print(knl) - - c = ctx_getter() - q = cl.CommandQueue(c) - - _, (out,) = knl(q) - - 
print(out.get()) + return knl -def test_three_level_scan(ctx_getter): +@memoize +def _get_three_level_scan_kernel(g_size, p_size): knl = lp.make_kernel( [ - "{[i,j]: 0 <= i < 16 and 0 <= j <= i}", + "[n] -> {[i,j]: 0 <= i < n and 0 <= j <= i}", ], """ - out[i] = sum(j, j) {id=insn} + out[i] = sum(j, a[j]) {id=insn} """, "...") - #knl = lp.tag_inames(knl, dict(i="l.0")) - from loopy.transform.reduction import make_two_level_scan - knl = make_two_level_scan( - knl, "insn", inner_length=4, + knl, "insn", inner_length=g_size, scan_iname="j", sweep_iname="i", - local_storage_axes=(("l0_inner_update_i",)), - inner_iname="l0_inner_update_i", - inner_tag="l.0", + local_storage_axes=(("i__l0",)), + inner_iname="i__l0", + inner_tag=None, outer_tag="g.0", local_storage_scope=lp.temp_var_scope.LOCAL, nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, inner_local_tag=None, outer_local_tag="g.0") + knl = lp.tag_inames(knl, dict(i__l0="l.0")) + knl = make_two_level_scan( - knl, "j_local_scan", inner_length=2, - scan_iname="l1_j", - sweep_iname="l1_inner_i", + knl, "insn__l1", inner_length=p_size, + scan_iname="j__l1", + sweep_iname="i__l1", inner_tag="for", outer_tag="l.0", nonlocal_tag="l.0", @@ -457,40 +449,66 @@ def test_three_level_scan(ctx_getter): inner_local_tag="for", outer_local_tag="l.0") - print(knl) - knl = lp.realize_reduction(knl, force_scan=True) from loopy.transform.instruction import add_nosync_to_instructions knl = add_nosync_to_instructions( knl, scope="global", - source="writes:acc_l0_j", - sink="reads:acc_l0_j") + source="writes:acc_j__l0", + sink="reads:acc_j__l0") - knl = lp.alias_temporaries(knl, ["l1l_insn", "l2l_j_local_scan"], synchronize_for_exclusive_use=False) - - print(knl.get_temporary_to_base_storage_map()) - - print(knl) + knl = lp.alias_temporaries(knl, + ("insn__l1", "insn__l2"), + synchronize_for_exclusive_use=False) from loopy.transform.save import save_and_reload_temporaries - print(knl) - knl = lp.preprocess_kernel(knl) knl = 
lp.get_one_scheduled_kernel(knl) knl = save_and_reload_temporaries(knl) knl = lp.get_one_scheduled_kernel(knl) - print(knl) + return knl + +# }}} + + +@pytest.mark.parametrize("input_len", + (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 32)) +@pytest.mark.parametrize("g_size", (16,)) +def test_two_level_scan(ctx_getter, input_len, g_size): + knl = _get_two_level_scan_kernel(g_size) + + import numpy as np + np.random.seed(0) + a = np.random.randint(low=0, high=100, size=input_len) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q, a=a) + + assert (out == np.cumsum(a)).all() + + +@pytest.mark.parametrize("input_len", + (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 32)) +@pytest.mark.parametrize("g_size", (16,)) +@pytest.mark.parametrize("p_size", (4,)) +def test_three_level_scan(ctx_getter, g_size, p_size, input_len): + knl = _get_three_level_scan_kernel(g_size, p_size) + + import numpy as np + np.random.seed(0) + a = np.random.randint(low=0, high=100, size=input_len) c = ctx_getter() q = cl.CommandQueue(c) - _, (out,) = knl(q) + _, (out,) = knl(q, a=a) - print(out.get()) + assert (out == np.cumsum(a)).all() if __name__ == "__main__": -- GitLab From cdfc1303cd297013f69d0a6f3a3aa7a7857bf29c Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Mar 2017 19:23:48 -0500 Subject: [PATCH 17/27] Make more tests pass. 
--- loopy/kernel/__init__.py | 3 +- loopy/preprocess.py | 278 +++++++++++++++++++++-------------- loopy/transform/reduction.py | 215 ++++++++++++++++++--------- test/test_scan.py | 13 +- test/test_transform.py | 3 +- 5 files changed, 331 insertions(+), 181 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index dfe9c857c..079d5c460 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1057,7 +1057,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): try: # insist block size is constant size = static_max_of_pw_aff(size, - constants_only=isinstance(tag, LocalIndexTag)) + constants_only=isinstance(tag, LocalIndexTag), + context=self.assumptions) except ValueError: pass diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 41e67452e..94facdedd 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -291,8 +291,8 @@ def _classify_reduction_inames(kernel, inames): nonlocal_par = [] from loopy.kernel.data import ( - LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, - ParallelTag) + LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, + ParallelTag) for iname in inames: iname_tag = kernel.iname_to_tag.get(iname) @@ -311,9 +311,8 @@ def _classify_reduction_inames(kernel, inames): else: sequential.append(iname) - return _InameClassification(tuple(sequential), - tuple(local_par), - tuple(nonlocal_par)) + return _InameClassification( + tuple(sequential), tuple(local_par), tuple(nonlocal_par)) def _add_params_to_domain(domain, param_names): @@ -501,8 +500,8 @@ def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): for constr in constrs: candidate_vars = set([ - var for var in constr.get_var_dict() - if var in candidate_inames]) + var for var in constr.get_var_dict() + if var in candidate_inames]) # Irrelevant constraint - skip if scan_iname not in candidate_vars: @@ -570,7 +569,7 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): """ dim_type = isl.dim_type 
- domain = kernel.get_inames_domain((sweep_iname, scan_iname)) + domain = kernel.get_inames_domain(frozenset([sweep_iname, scan_iname])) domain_with_sweep_param = _move_set_to_param_dims_except(domain, (scan_iname,)) domain_with_sweep_param = domain_with_sweep_param.project_out_except( @@ -806,12 +805,12 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): from loopy.kernel.data import temp_var_scope, TemporaryVariable - # The first assignee is not passed by pointer, so we start - # by looking at the second assignee. + FIRST_POINTER_ASSIGNEE_IDX = 1 # noqa + for assignee_nr, assignee_var_name, assignee in zip( - range(1, len(assignees)), - assignee_var_names[1:], - assignees[1:]): + range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)), + assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:], + assignees[FIRST_POINTER_ASSIGNEE_IDX:]): if ( assignee_var_name in kernel.temporary_variables @@ -916,7 +915,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # Intersect with inames, because we could have captured some kernel params - # in here too.. + # in here too... 
dependent_inames = ( frozenset(subdomain.get_var_names(isl.dim_type.param)) & kernel.all_inames()) @@ -981,6 +980,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, dtypes=reduction_dtypes, scope=temp_var_scope.PRIVATE) + init_insn_depends_on = frozenset() + + global_barrier = temp_kernel.find_most_recent_global_barrier(insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + from pymbolic import var acc_vars = tuple(var(n) for n in acc_var_names) @@ -992,7 +998,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(), + depends_on=init_insn_depends_on, expression=expr.operation.neutral_element(*arg_dtypes)) generated_insns.append(init_insn) @@ -1225,21 +1231,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, new_domain = _create_domain_for_sweep_tracking(domain, tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) - """ - from loopy.kernel.tools import DomainChanger - domain_idx, = temp_kernel.get_leaf_domain_indices(frozenset([sweep_iname])) - - orig_domain = domains[domain_idx] - new_domain = isl.align_spaces(new_domain, domains[domain_idx], - obj_bigger_ok=True, - across_dim_types=True) - orig_domain = isl.align_spaces(orig_domain, new_domain) - - orig_domain &= new_domain - """ _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain) - - #domains[domain_idx] = orig_domain return tracking_iname @@ -1346,6 +1338,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _strip_if_scalar(acc_vars, updated_inner_exprs)), depends_on=frozenset([init_insn.id]) | insn.depends_on, within_inames=update_insn_iname_deps, + no_sync_with=insn.no_sync_with, within_inames_is_final=insn.within_inames_is_final) generated_insns.append(scan_insn) @@ -1366,9 
+1359,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): - # TODO: rename - red_iname = scan_iname - scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 @@ -1407,7 +1397,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ add separate iname to carry out the scan # Doing this sheds any odd conditionals that may be active - # on our red_iname. + # on our scan_iname. base_exec_iname = var_name_gen(sweep_iname + "__scan") domains.append(_make_slab_set(base_exec_iname, scan_size)) @@ -1417,39 +1407,48 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, from loopy.kernel.data import temp_var_scope - """ - neutral_var_names = make_temporaries( - name_based_on="neutral_"+scan_iname, - nvars=nresults, - shape=(), - dtypes=reduction_dtypes, - scope=temp_var_scope.PRIVATE) - """ + LOCAL_SCAN_SUBSTAGES = 1 - read_var_names = make_temporaries( - name_based_on="read_"+scan_iname+"_arg_{index}", - nvars=nresults, - shape=(), - dtypes=reduction_dtypes, - scope=temp_var_scope.PRIVATE) + read_var_names_by_substage = [] + + for i in range(LOCAL_SCAN_SUBSTAGES): + substage_suffix = "" if i == 0 else ("_substage%d" % i) + + read_var_names_by_substage.append( + make_temporaries( + name_based_on=( + "read_" + scan_iname + "_arg_{index}" + substage_suffix), + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE)) acc_var_names = make_temporaries( - name_based_on="acc_"+scan_iname, + name_based_on="acc_" + scan_iname, nvars=nresults, shape=outer_local_iname_sizes + (scan_size,), dtypes=reduction_dtypes, scope=temp_var_scope.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) - read_vars = tuple(var(n) for n in read_var_names) - #neutral_vars = tuple(var(n) for n in neutral_var_names) + + read_vars_by_substage = [ + tuple(var(n) for n in read_var_names_by_substage[i]) + for 
i in range(len(read_var_names_by_substage))] base_iname_deps = (outer_insn_inames - frozenset(expr.inames) - frozenset([sweep_iname])) neutral = expr.operation.neutral_element(*arg_dtypes) - init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) + init_insn_depends_on = insn.depends_on + + global_barrier = temp_kernel.find_most_recent_global_barrier(insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + init_id = insn_id_gen("%s_%s_init" % (insn.id, scan_iname)) init_insn = make_assignment( id=init_id, assignees=tuple( @@ -1458,7 +1457,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, expression=neutral, within_inames=base_iname_deps | frozenset([base_exec_iname]), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset()) + depends_on=init_insn_depends_on) generated_insns.append(init_insn) updated_inner_exprs = tuple( @@ -1470,7 +1469,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, from loopy.symbolic import pw_aff_to_expr sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) - transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) + transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, scan_iname)) transfer_insn = make_assignment( id=transfer_id, assignees=tuple( @@ -1486,7 +1485,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([init_id]) | insn.depends_on, - no_sync_with=frozenset([(init_id, "any")])) + no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with) generated_insns.append(transfer_insn) @@ -1499,60 +1498,125 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, prev_id = transfer_id istage = 0 - cur_size = 1 - - while cur_size < scan_size: - stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) - 
domains.append( - _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) - new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] - - for read_var, acc_var in zip(read_vars, acc_vars): - read_stage_id = insn_id_gen( - "scan_%s_read_stage_%d" % (red_iname, istage)) - - read_stage_insn = make_assignment( - id=read_stage_id, - assignees=(read_var,), - expression=( + curr_stride = 1 + substage_chunk_size = scan_size // LOCAL_SCAN_SUBSTAGES + + substage_chunks = list(zip( + range(0, scan_size, substage_chunk_size), + range(substage_chunk_size, + (LOCAL_SCAN_SUBSTAGES+1)*substage_chunk_size, + substage_chunk_size))) + + # Fix up last one. + substage_chunks[-1] = (substage_chunks[-1][0], scan_size) + + # Parallel scan algorithm: + # - I add to myself the item that's to the left of me; + # - I add to myself the item that's 2 to the left of me; + # - I add to myself the item that's 4 to the left of me; + # - etc. + while curr_stride < scan_size: + # Lowers a single parallel iteration of the local scan. + # + # This is divided into a "read stage" followed by a "write stage" + + # Add inames. 
+ + substage_exec_inames = [] + substage_suffixes = [] + substage_kept_indices = [] + + for isubstage, chunk in enumerate(substage_chunks): + substage_min, substage_max = chunk + + if substage_max <= curr_stride: + continue + + substage_kept_indices.append(isubstage) + + substage_suffix = ("_chunk%d" % isubstage) if isubstage > 0 else "" + substage_suffixes.append(substage_suffix) + + substage_exec_iname = var_name_gen( + "%s__scan_s%d%s" % (sweep_iname, istage, substage_suffix)) + substage_exec_inames.append(substage_exec_iname) + new_iname_tags[substage_exec_iname] = kernel.iname_to_tag[sweep_iname] + + domains.append( + _make_slab_set_from_range( + substage_exec_iname, + max(curr_stride, substage_min), + substage_max)) + + # Read stage + for isubstage, suffix, substage_exec_iname in zip( + substage_kept_indices, + substage_suffixes, + substage_exec_inames): + + read_vars = read_vars_by_substage[isubstage] + + for read_var, acc_var in zip(read_vars, acc_vars): + read_stage_id = insn_id_gen( + "scan_%s_read_stage_%d%s" + % (scan_iname, istage, substage_suffix)) + + read_stage_insn = make_assignment( + id=read_stage_id, + assignees=(read_var,), + expression=( + acc_var[ + outer_local_iname_vars + + (var(substage_exec_iname) - curr_stride,)]), + within_inames=( + base_iname_deps | frozenset([substage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id])) + + generated_insns.append(read_stage_insn) + prev_id = read_stage_id + + last_write_id = None + + for isubstage, suffix, substage_exec_iname in zip( + substage_kept_indices, + substage_suffixes, + substage_exec_inames): + + read_vars = read_vars_by_substage[isubstage] + + write_stage_id = insn_id_gen( + "scan_%s_write_stage_%d%s" % (scan_iname, istage, substage_suffix)) + write_stage_insn = make_assignment( + id=write_stage_id, + no_sync_with=frozenset( + [(last_write_id, "local")] + if last_write_id is not None + else []), + assignees=tuple( + 
acc_var[outer_local_iname_vars + (var(substage_exec_iname),)] + for acc_var in acc_vars), + expression=expr.operation( + arg_dtypes, + _strip_if_scalar(tuple( acc_var[ - outer_local_iname_vars - + (var(stage_exec_iname) - cur_size,)]), + outer_local_iname_vars + (var(substage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(read_vars) + ), within_inames=( - base_iname_deps | frozenset([stage_exec_iname])), + base_iname_deps | frozenset([substage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id])) - - generated_insns.append(read_stage_insn) - prev_id = read_stage_id + depends_on=frozenset([prev_id]), + ) - write_stage_id = insn_id_gen( - "scan_%s_write_stage_%d" % (red_iname, istage)) - write_stage_insn = make_assignment( - id=write_stage_id, - assignees=tuple( - acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)), - _strip_if_scalar(read_vars) - ), - within_inames=( - base_iname_deps | frozenset([stage_exec_iname])), - within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id]), - ) - - generated_insns.append(write_stage_insn) - prev_id = write_stage_id + generated_insns.append(write_stage_insn) + last_write_id = write_stage_id + prev_id = write_stage_id - #cur_size = new_size - #bound = cur_size - cur_size *= 2 + #curr_stride = new_size + #bound = curr_stride + curr_stride *= 2 istage += 1 new_insn_add_depends_on.add(prev_id) @@ -1575,8 +1639,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
- #print(temp_kernel) - arg_dtypes, reduction_dtypes = ( _infer_arg_dtypes_and_reduction_dtypes( temp_kernel, expr, unknown_types_ok)) @@ -1608,7 +1670,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Try to determine scan candidate information (sweep iname, scan # iname, etc). scan_param = _try_infer_scan_candidate_from_expr( - temp_kernel, expr, outer_insn_inames, sweep_iname=force_outer_iname_for_scan) + temp_kernel, expr, outer_insn_inames, + sweep_iname=force_outer_iname_for_scan) except ValueError as v: error = str(v) @@ -1653,7 +1716,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, "Empty reduction found (no inames to reduce over). " "Eliminating.") - # FIXME: return neutral element... + # FIXME: return a neutral element. return expr.expr @@ -1662,6 +1725,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if may_be_implemented_as_scan: assert force_scan or automagic_scans_ok + # We require the "scan" iname to be tagged sequential. if n_sequential: sweep_iname = scan_param.sweep_iname sweep_class = _classify_reduction_inames(kernel, (sweep_iname,)) diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 97ff545b0..461d41068 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -23,11 +23,7 @@ THE SOFTWARE. 
""" -from loopy.diagnostic import LoopyError import loopy as lp - -from loopy.kernel.data import auto, temp_var_scope -from pytools import memoize_method, Record import islpy as isl @@ -74,7 +70,6 @@ def make_two_level_reduction( # {{{ get stable names for everything var_name_gen = kernel.get_var_name_generator() - insn_id_gen = kernel.get_instruction_id_generator() format_kwargs = {"insn": insn_id, "iname": reduction_iname} @@ -132,7 +127,7 @@ def make_two_level_reduction( def _update_instructions(kernel, id_to_new_insn, copy=True): if not isinstance(id_to_new_insn, dict): id_to_new_insn = dict((insn.id, insn) for insn in id_to_new_insn) - + new_instructions = ( list(insn for insn in kernel.instructions if insn.id not in id_to_new_insn) @@ -145,17 +140,22 @@ def _update_instructions(kernel, id_to_new_insn, copy=True): return kernel -def _make_slab_set(iname, size, space): - # FIXME: stolen from preprocess, should be its own thing... - from pymbolic.mapper.dependency import DependencyMapper +def _make_slab_set(iname, size): + # FIXME: There is a very similar identically named function in + # preprocess. Refactor. - space = size.get_domain_space() + if not isinstance(size, (isl.PwAff, isl.Aff)): + from loopy.symbolic import pwaff_from_expr + size = pwaff_from_expr( + isl.Space.params_alloc(isl.DEFAULT_CONTEXT, 0), size) - new_space = (size.get_domain_space() + base_space = size.get_domain_space() + + space = (base_space .add_dims(isl.dim_type.set, 1) - .set_dim_name(isl.dim_type.set, space.dim(isl.dim_type.set), iname)) + .set_dim_name(isl.dim_type.set, base_space.dim(isl.dim_type.set), iname)) - v = isl.affs_from_space(new_space) + v = isl.affs_from_space(space) size = isl.align_spaces(size, v[0]) @@ -169,8 +169,10 @@ def _make_slab_set(iname, size, space): def _add_subdomain_to_kernel(kernel, subdomain): domains = list(kernel.domains) + # Filter out value parameters. 
dep_inames = ( - frozenset(subdomain.get_var_names(isl.dim_type.param)) & kernel.all_inames()) + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) indices = kernel.get_leaf_domain_indices(dep_inames) @@ -209,8 +211,9 @@ def _add_scan_subdomain( def _expand_subst_within_expression(kernel, expr): - from loopy.symbolic import RuleAwareSubstitutionRuleExpander, SubstitutionRuleMappingContext - from loopy.match import parse_stack_match + from loopy.symbolic import ( + RuleAwareSubstitutionRuleExpander, SubstitutionRuleMappingContext) + rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) submap = RuleAwareSubstitutionRuleExpander( @@ -223,21 +226,28 @@ def _expand_subst_within_expression(kernel, expr): def _add_global_barrier(kernel, source, sink, barrier_id): from loopy.kernel.instruction import BarrierInstruction - within_inames = ( - kernel.id_to_insn[source].within_inames - & kernel.id_to_insn[sink].within_inames) + + sources = (source,) if isinstance(source, str) else source + sinks = (sink,) if isinstance(sink, str) else sink + + within_inames = kernel.id_to_insn[sources[0]].within_inames + from itertools import chain + for iname in chain(sources[1:], sinks): + within_inames &= kernel.id_to_insn[iname].within_inames barrier_insn = BarrierInstruction( id=barrier_id, - depends_on=frozenset([source]), - within_inames = within_inames, + depends_on=frozenset(sources), + within_inames=within_inames, kind="global") - - updated_sink = kernel.id_to_insn[sink] - updated_sink = updated_sink.copy( - depends_on=updated_sink.depends_on | frozenset([barrier_id])) - kernel = _update_instructions(kernel, (barrier_insn, updated_sink), copy=True) + sink_insns = (kernel.id_to_insn[sink] for sink in sinks) + updated_sinks = ( + sink.copy(depends_on=sink.depends_on | frozenset([barrier_id])) + for sink in sink_insns) + + kernel = _update_instructions( + kernel, chain([barrier_insn], updated_sinks), 
copy=True) return kernel @@ -293,8 +303,8 @@ def make_two_level_scan( into:: [...,l',l''] - [...,l'] nonlocal[0] = 0 - [...,l'] nonlocal[l'+1] = local[l',-1] + [...,nlinit] nonlocal[0] = 0 + [...,nlinit] nonlocal[nlinit+1] = local[nlinit,-1] [...,nl] [...,i',i''] result = nonlocal[i'] + local[i',i''] """ @@ -312,8 +322,6 @@ def make_two_level_scan( # {{{ get stable names for everything - # XXX: add inner_iname and outer_iname to var_name_gen if not none - var_name_gen = kernel.get_var_name_generator() insn_id_gen = kernel.get_instruction_id_generator() @@ -323,16 +331,37 @@ def make_two_level_scan( base_insn_id = _get_base_iname(insn_id) format_kwargs = { - "insn": base_insn_id, "iname": base_scan_iname, "sweep": base_sweep_iname, - "level": level, "next_level": level + 1, "prefix": "l"} + "insn": base_insn_id, + "iname": base_scan_iname, + "sweep": base_sweep_iname, + "level": level, + "next_level": level + 1} if inner_iname is None: inner_iname = var_name_gen( "{sweep}__l{level}".format(**format_kwargs)) + else: + var_name_gen.add_name(inner_iname) if outer_iname is None: outer_iname = var_name_gen( "{sweep}__l{level}_outer".format(**format_kwargs)) + else: + var_name_gen.add_iname(outer_iname) + + """ + nonlocal_init_head_outer_iname = var_name_gen( + "{sweep}__l{level}_nlhead_outer".format(**format_kwargs)) + + nonlocal_init_head_inner_iname = var_name_gen( + "{sweep}__l{level}_nlhead_inner".format(**format_kwargs)) + """ + + nonlocal_init_tail_outer_iname = var_name_gen( + "{sweep}__l{level}_nltail_outer".format(**format_kwargs)) + + nonlocal_init_tail_inner_iname = var_name_gen( + "{sweep}__l{level}_nltail_inner".format(**format_kwargs)) nonlocal_iname = var_name_gen( "{sweep}__l{level}_nonloc".format(**format_kwargs)) @@ -358,10 +387,14 @@ def make_two_level_scan( if local_storage_name is None: local_storage_name = var_name_gen( "{insn}__l{next_level}".format(**format_kwargs)) + else: + var_name_gen.add_name(local_storage_name) if nonlocal_storage_name is 
None: nonlocal_storage_name = var_name_gen( "{insn}__l{level}_outer".format(**format_kwargs)) + else: + var_name_gen.add_name(nonlocal_storage_name) local_scan_insn_id = insn_id_gen( "{insn}__l{next_level}".format(**format_kwargs)) @@ -394,7 +427,9 @@ def make_two_level_scan( assert len(result) > 0 - return tuple(result) if not (strip_scalar and len(result) == 1) else result[0] + return (tuple(result) + if not (strip_scalar and len(result) == 1) + else result[0]) # }}} @@ -409,12 +444,10 @@ def make_two_level_scan( kernel = reduction_arg_to_subst_rule( kernel, scan_iname, subst_rule_name=subst_name) - - from loopy.kernel.instruction import NoOpInstruction - # FIXME: this is stupid kernel = _update_instructions(kernel, {insn_id: insn.copy(expression=0)}) """ + from loopy.kernel.instruction import NoOpInstruction {insn_id: NoOpInstruction( id=insn_id, depends_on=insn.depends_on, @@ -441,6 +474,10 @@ def make_two_level_scan( # {{{ implement local scan from pymbolic import var + + # FIXME: This can probably be done using split_reduction_inward() + # and will end up looking as less of a mess that way. 
+ local_scan_expr = _expand_subst_within_expression(kernel, var(subst_name)(var(outer_iname) * inner_length + var(inner_scan_iname))) @@ -449,7 +486,6 @@ def make_two_level_scan( inner_iname=inner_iname, outer_iname=outer_iname, inner_tag=inner_tag, outer_tag=outer_tag) - from loopy.kernel.data import SubstitutionRule from loopy.symbolic import Reduction @@ -471,12 +507,16 @@ def make_two_level_scan( precompute_inames = pick_out_relevant_axes(all_precompute_inames) sweep_inames = pick_out_relevant_axes((outer_iname, inner_iname)) + storage_axis_to_tag = { + outer_iname: outer_local_tag, + inner_iname: inner_local_tag, + outer_local_iname: outer_local_tag, + inner_local_iname: inner_local_tag} + precompute_outer_inames = ( - frozenset(all_precompute_inames) - - frozenset(precompute_inames)) + frozenset(all_precompute_inames) - frozenset(precompute_inames)) insn = kernel.id_to_insn[insn_id] - within_inames = insn.within_inames - frozenset([outer_iname, inner_iname]) from pymbolic import var @@ -487,10 +527,17 @@ def make_two_level_scan( sweep_inames=sweep_inames, precompute_inames=precompute_inames, storage_axes=local_storage_axes, + storage_axis_to_tag=storage_axis_to_tag, precompute_outer_inames=precompute_outer_inames | within_inames, temporary_name=local_storage_name, compute_insn_id=local_scan_insn_id) + compute_insn_with_deps = kernel.id_to_insn[local_scan_insn_id] + compute_insn_with_deps = compute_insn_with_deps.copy( + depends_on=compute_insn_with_deps.depends_on | insn.depends_on) + + kernel = _update_instructions(kernel, (compute_insn_with_deps,)) + kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) # }}} @@ -498,24 +545,39 @@ def make_two_level_scan( # {{{ implement local to nonlocal information transfer from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import aff_to_expr, pw_aff_to_expr - nonlocal_storage_len_pw_aff = 2 + static_max_of_pw_aff( + from loopy.symbolic import pw_aff_to_expr + + 
local_storage_local_axis_len = ( + kernel.temporary_variables[local_storage_name].shape[-1]) + + nonlocal_storage_len_pw_aff = static_max_of_pw_aff( kernel.get_iname_bounds(outer_iname).size, constants_only=False) - """ - nonlocal_storage_len = ( - kernel.get_constant_iname_length(outer_iname) + # FIXME: this shouldn't have to have an extra element. + nonlocal_storage_len = pw_aff_to_expr(1 + nonlocal_storage_len_pw_aff) + + nonlocal_tail_inner_subd = _make_slab_set(nonlocal_init_tail_inner_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_tail_inner_subd) + nonlocal_tail_outer_subd = _make_slab_set( + nonlocal_init_tail_outer_iname, nonlocal_storage_len_pw_aff) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_tail_outer_subd) - from loopy.symbolic import pw_aff_to_expr - nonlocal_storage_len_pw_aff = ( - # FIXME: should be 1 + len, bounds check doesnt like this.. - 2 + kernel.get_iname_bounds(outer_iname).upper_bound_pw_aff) + """ + nonlocal_head_inner_subd = _make_slab_set(nonlocal_init_head_inner_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_inner_subd) + nonlocal_head_outer_subd = _make_slab_set(nonlocal_init_head_outer_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_outer_subd) """ - nonlocal_storage_len = pw_aff_to_expr(nonlocal_storage_len_pw_aff) + kernel = lp.tag_inames(kernel, { + #nonlocal_init_head_outer_iname: outer_local_tag, + #nonlocal_init_head_inner_iname: inner_local_tag, + nonlocal_init_tail_outer_iname: outer_local_tag, + nonlocal_init_tail_inner_iname: inner_local_tag}) if nonlocal_storage_name not in kernel.temporary_variables: + from loopy.kernel.data import TemporaryVariable new_temporary_variables = kernel.temporary_variables.copy() @@ -530,46 +592,59 @@ def make_two_level_scan( kernel = kernel.copy(temporary_variables=new_temporary_variables) from loopy.kernel.instruction import make_assignment + + # FIXME: neutral element... 
nonlocal_init_head = make_assignment( id=nonlocal_init_head_insn_id, assignees=(var(nonlocal_storage_name)[0],), expression=0, within_inames=( - within_inames | frozenset([outer_local_iname,inner_local_iname])), - predicates=frozenset([var(inner_local_iname).eq(0)]), + within_inames | frozenset([nonlocal_init_tail_outer_iname, + nonlocal_init_tail_inner_iname])), + no_sync_with=frozenset([(nonlocal_init_tail_insn_id, "any")]), + predicates=(var(nonlocal_init_tail_inner_iname).eq(0), + var(nonlocal_init_tail_outer_iname).eq(0)), depends_on=frozenset([local_scan_insn_id])) - final_element_indices = [] - nonlocal_init_tail = make_assignment( id=nonlocal_init_tail_insn_id, - assignees=(var(nonlocal_storage_name)[var(outer_local_iname) + 1],), + assignees=( + var(nonlocal_storage_name)[ + var(nonlocal_init_tail_outer_iname) + 1],), expression=var(local_storage_name)[ pick_out_relevant_axes( - (var(outer_local_iname),var(inner_local_iname)), + (var(nonlocal_init_tail_outer_iname), + var(nonlocal_init_tail_inner_iname) + + local_storage_local_axis_len - 1), strip_scalar=True)], - no_sync_with=frozenset([(local_scan_insn_id, "local")]), + no_sync_with=frozenset([(nonlocal_init_head_insn_id, "any")]), within_inames=( - within_inames | frozenset([outer_local_iname,inner_local_iname])), - depends_on=frozenset([local_scan_insn_id]), - predicates=frozenset([var(inner_local_iname).eq(inner_length - 1)])) + within_inames | frozenset([nonlocal_init_tail_outer_iname, + nonlocal_init_tail_inner_iname])), + depends_on=frozenset([local_scan_insn_id])) kernel = _update_instructions( kernel, (nonlocal_init_head, nonlocal_init_tail), copy=False) + # The write race warnings are spurious - the inner iname is length + # 1, so there's really no write race at all here. 
+ kernel = kernel.copy( + silenced_warnings=kernel.silenced_warnings + + ["write_race(%s)" % nonlocal_init_tail_insn_id] + + ["write_race(%s)" % nonlocal_init_head_insn_id]) + # }}} # {{{ implement nonlocal scan - outer_idx, = kernel.get_leaf_domain_indices((outer_iname,)) - subd = _make_slab_set(nonlocal_iname, nonlocal_storage_len_pw_aff, nonlocal_storage_len_pw_aff.space) + subd = _make_slab_set(nonlocal_iname, nonlocal_storage_len_pw_aff) kernel = _add_subdomain_to_kernel(kernel, subd) if nonlocal_tag is not None: kernel = lp.tag_inames(kernel, {nonlocal_iname: nonlocal_tag}) kernel = _add_scan_subdomain(kernel, outer_scan_iname, nonlocal_iname) - + nonlocal_scan = make_assignment( id=nonlocal_scan_insn_id, assignees=(var(nonlocal_storage_name)[var(nonlocal_iname)],), @@ -578,14 +653,16 @@ def make_two_level_scan( (outer_scan_iname,), var(nonlocal_storage_name)[var(outer_scan_iname)]), within_inames=within_inames | frozenset([nonlocal_iname]), - depends_on=frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id])) + depends_on=( + frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id]))) kernel = _update_instructions(kernel, (nonlocal_scan,), copy=False) if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: - barrier_id = insn_id_gen("barrier_{insn}".format(**format_kwargs)) + barrier_id = insn_id_gen( + "{insn}_nonlocal_init_barrier".format(**format_kwargs)) kernel = _add_global_barrier(kernel, - source=nonlocal_init_tail_insn_id, + source=(nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id), sink=nonlocal_scan_insn_id, barrier_id=barrier_id) @@ -593,10 +670,12 @@ def make_two_level_scan( # {{{ replace scan with local + nonlocal - updated_depends_on = insn.depends_on | frozenset([nonlocal_scan_insn_id]) + updated_depends_on = (insn.depends_on + | frozenset([nonlocal_scan_insn_id, local_scan_insn_id])) if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: - barrier_id = insn_id_gen("barrier_{insn}".format(**format_kwargs)) + 
barrier_id = insn_id_gen( + "{insn}_nonlocal_scan_barrier".format(**format_kwargs)) kernel = (_add_global_barrier(kernel, source=nonlocal_scan_insn_id, sink=insn_id, barrier_id=barrier_id)) updated_depends_on |= frozenset([barrier_id]) diff --git a/test/test_scan.py b/test/test_scan.py index 5ea203fff..b9d5a21bc 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -216,6 +216,8 @@ def test_local_parallel_scan(ctx_factory, n): knl = lp.add_dtypes(knl, dict(a=int)) + print(knl) + evt, (a,) = knl(queue, a=np.arange(n)) assert (a == np.cumsum(np.arange(n)**2)).all() @@ -375,7 +377,8 @@ def _get_two_level_scan_kernel(g_size): """ out[i] = sum(j, a[j]) {id=insn} """, - "...") + "...", + assumptions="n > 0") from loopy.transform.reduction import make_two_level_scan knl = make_two_level_scan( @@ -419,7 +422,8 @@ def _get_three_level_scan_kernel(g_size, p_size): """ out[i] = sum(j, a[j]) {id=insn} """, - "...") + "...", + assumptions="n > 0") from loopy.transform.reduction import make_two_level_scan knl = make_two_level_scan( @@ -435,8 +439,6 @@ def _get_three_level_scan_kernel(g_size, p_size): inner_local_tag=None, outer_local_tag="g.0") - knl = lp.tag_inames(knl, dict(i__l0="l.0")) - knl = make_two_level_scan( knl, "insn__l1", inner_length=p_size, scan_iname="j__l1", @@ -449,6 +451,9 @@ def _get_three_level_scan_kernel(g_size, p_size): inner_local_tag="for", outer_local_tag="l.0") + knl = lp.tag_inames(knl, dict(i__l0="l.0", + i__l0_nltail_inner="l.0")) + knl = lp.realize_reduction(knl, force_scan=True) from loopy.transform.instruction import add_nosync_to_instructions diff --git a/test/test_transform.py b/test/test_transform.py index cf2dac48f..0d63ba284 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -403,7 +403,8 @@ def test_precompute_with_preexisting_inames_fail(): def test_add_nosync_to_instructions(): - knl = lp.make_kernel("") + # FIXME: Write test. 
+ pass if __name__ == "__main__": -- GitLab From e84b161f99054b4e259682fc48e214fa2a596919 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Mar 2017 19:40:16 -0500 Subject: [PATCH 18/27] Fix doctest for temporary saving naming convention change. --- doc/tutorial.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 942c7d56e..b98331264 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1228,18 +1228,18 @@ put those instructions into the schedule. --------------------------------------------------------------------------- TEMPORARIES: tmp: type: np:dtype('int32'), shape: () scope:private - tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) scope:global + tmp__save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) scope:global --------------------------------------------------------------------------- ... --------------------------------------------------------------------------- SCHEDULE: - 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) + 0: CALL KERNEL rotate_v2(extra_args=['tmp__save_slot'], extra_inames=[]) 1: [maketmp] tmp <- arr[i_inner + i_outer*16] - 2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp + 2: [tmp.save] tmp__save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp 3: RETURN FROM KERNEL rotate_v2 4: ---BARRIER:global--- - 5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[]) - 6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] + 5: CALL KERNEL rotate_v2_0(extra_args=['tmp__save_slot'], extra_inames=[]) + 6: [tmp.reload] tmp <- tmp__save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] 7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp 8: RETURN FROM KERNEL rotate_v2_0 
--------------------------------------------------------------------------- @@ -1264,19 +1264,19 @@ The kernel translates into two OpenCL kernels. #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp__save_slot) { int tmp; tmp = arr[16 * gid(0) + lid(0)]; - tmp_save_slot[16 * gid(0) + lid(0)] = tmp; + tmp__save_slot[16 * gid(0) + lid(0)] = tmp; } - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp__save_slot) { int tmp; - tmp = tmp_save_slot[16 * gid(0) + lid(0)]; + tmp = tmp__save_slot[16 * gid(0) + lid(0)]; arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; } -- GitLab From 54160ac7bcc8b58bc3c1b65e7cca22d4dda5b8d0 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Mar 2017 20:25:03 -0500 Subject: [PATCH 19/27] Flake8 pacification. 
--- loopy/__init__.py | 2 + loopy/kernel/__init__.py | 4 +- loopy/preprocess.py | 225 +++++++++++++--------------------- loopy/transform/precompute.py | 6 +- loopy/transform/reduction.py | 4 +- loopy/transform/save.py | 19 +-- 6 files changed, 104 insertions(+), 156 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a10d94463..329313b8d 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -209,6 +209,8 @@ __all__ = [ "assume", "fix_parameters", + "make_two_level_reduction", + "save_and_reload_temporaries", # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 079d5c460..3145970c7 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -900,7 +900,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): (b, i) for i, b in enumerate(self.global_barrier_order)) def get_barrier_ordinal(barrier_id): - return global_barrier_to_ordinal[barrier_id] if barrier_id is not None else -1 + return (global_barrier_to_ordinal[barrier_id] + if barrier_id is not None + else -1) direct_barrier_dependencies = set( dep for dep in insn.depends_on if is_barrier(dep)) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 94facdedd..479bbfc61 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -369,8 +369,6 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): } """ - dim_type = isl.dim_type - orig_domain = kernel.get_inames_domain( frozenset((scan_param.sweep_iname, scan_param.scan_iname))) @@ -450,7 +448,8 @@ _ScanCandidateParameters = namedtuple( "sweep_upper_bound, scan_lower_bound, stride") -def _try_infer_scan_candidate_from_expr(kernel, expr, within_inames, sweep_iname=None): +def _try_infer_scan_candidate_from_expr( + kernel, expr, within_inames, sweep_iname=None): """Analyze `expr` and determine if it can be implemented as a scan. 
""" from loopy.symbolic import Reduction @@ -471,20 +470,28 @@ def _try_infer_scan_candidate_from_expr(kernel, expr, within_inames, sweep_iname sweep_iname = _try_infer_sweep_iname( domain, scan_iname, kernel.all_inames()) except ValueError as v: - raise ValueError("Couldn't determine a sweep iname for the scan expression '%s': %s" % (expr, v)) + raise ValueError( + "Couldn't determine a sweep iname for the scan " + "expression '%s': %s" % (expr, v)) try: sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( - _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames)) + _try_infer_scan_and_sweep_bounds( + kernel, scan_iname, sweep_iname, within_inames)) except ValueError as v: - raise ValueError("Couldn't determine bounds for the scan with expression '%s' " - "(sweep iname: '%s', scan iname: '%s'): %s" % (expr, sweep_iname, scan_iname, v)) + raise ValueError( + "Couldn't determine bounds for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) try: stride = _try_infer_scan_stride( kernel, scan_iname, sweep_iname, sweep_lower_bound) except ValueError as v: - raise ValueError("Couldn't determine a scan stride for the scan with expression '%s' (sweep iname: '%s', scan iname: '%s'): %s" % (expr, sweep_iname, scan_iname, v)) + raise ValueError( + "Couldn't determine a scan stride for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound, sweep_upper_bound, scan_lower_bound, stride) @@ -1117,7 +1124,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps = outer_insn_inames - frozenset(expr.inames) neutral = expr.operation.neutral_element(*arg_dtypes) - init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) init_insn = make_assignment( id=init_id, @@ -1232,7 +1238,7 @@ def realize_reduction(kernel, 
insn_id_filter=None, unknown_types_ok=True, tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain) - + return tracking_iname def replace_var_within_expr(expr, from_var, to_var): @@ -1407,34 +1413,32 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, from loopy.kernel.data import temp_var_scope - LOCAL_SCAN_SUBSTAGES = 1 - - read_var_names_by_substage = [] - - for i in range(LOCAL_SCAN_SUBSTAGES): - substage_suffix = "" if i == 0 else ("_substage%d" % i) + """ + neutral_var_names = make_temporaries( + name_based_on="neutral_"+scan_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + """ - read_var_names_by_substage.append( - make_temporaries( - name_based_on=( - "read_" + scan_iname + "_arg_{index}" + substage_suffix), - nvars=nresults, - shape=(), - dtypes=reduction_dtypes, - scope=temp_var_scope.PRIVATE)) + read_var_names = make_temporaries( + name_based_on="read_"+scan_iname+"_arg_{index}", + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) acc_var_names = make_temporaries( - name_based_on="acc_" + scan_iname, + name_based_on="acc_"+scan_iname, nvars=nresults, shape=outer_local_iname_sizes + (scan_size,), dtypes=reduction_dtypes, scope=temp_var_scope.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) - - read_vars_by_substage = [ - tuple(var(n) for n in read_var_names_by_substage[i]) - for i in range(len(read_var_names_by_substage))] + read_vars = tuple(var(n) for n in read_var_names) + #neutral_vars = tuple(var(n) for n in neutral_var_names) base_iname_deps = (outer_insn_inames - frozenset(expr.inames) - frozenset([sweep_iname])) @@ -1498,125 +1502,60 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, prev_id = transfer_id istage = 0 - curr_stride = 1 - substage_chunk_size = scan_size // LOCAL_SCAN_SUBSTAGES - - substage_chunks = list(zip( - 
range(0, scan_size, substage_chunk_size), - range(substage_chunk_size, - (LOCAL_SCAN_SUBSTAGES+1)*substage_chunk_size, - substage_chunk_size))) - - # Fix up last one. - substage_chunks[-1] = (substage_chunks[-1][0], scan_size) - - # Parallel scan algorithm: - # - I add to myself the item that's to the left of me; - # - I add to myself the item that's 2 to the left of me; - # - I add to myself the item that's 4 to the left of me; - # - etc. - while curr_stride < scan_size: - # Lowers a single parallel iteration of the local scan. - # - # This is divided into a "read stage" followed by a "write stage" - - # Add inames. - - substage_exec_inames = [] - substage_suffixes = [] - substage_kept_indices = [] - - for isubstage, chunk in enumerate(substage_chunks): - substage_min, substage_max = chunk - - if substage_max <= curr_stride: - continue - - substage_kept_indices.append(isubstage) - - substage_suffix = ("_chunk%d" % isubstage) if isubstage > 0 else "" - substage_suffixes.append(substage_suffix) - - substage_exec_iname = var_name_gen( - "%s__scan_s%d%s" % (sweep_iname, istage, substage_suffix)) - substage_exec_inames.append(substage_exec_iname) - new_iname_tags[substage_exec_iname] = kernel.iname_to_tag[sweep_iname] - - domains.append( - _make_slab_set_from_range( - substage_exec_iname, - max(curr_stride, substage_min), - substage_max)) - - # Read stage - for isubstage, suffix, substage_exec_iname in zip( - substage_kept_indices, - substage_suffixes, - substage_exec_inames): - - read_vars = read_vars_by_substage[isubstage] - - for read_var, acc_var in zip(read_vars, acc_vars): - read_stage_id = insn_id_gen( - "scan_%s_read_stage_%d%s" - % (scan_iname, istage, substage_suffix)) - - read_stage_insn = make_assignment( - id=read_stage_id, - assignees=(read_var,), - expression=( - acc_var[ - outer_local_iname_vars - + (var(substage_exec_iname) - curr_stride,)]), - within_inames=( - base_iname_deps | frozenset([substage_exec_iname])), - 
within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id])) - - generated_insns.append(read_stage_insn) - prev_id = read_stage_id - - last_write_id = None - - for isubstage, suffix, substage_exec_iname in zip( - substage_kept_indices, - substage_suffixes, - substage_exec_inames): - - read_vars = read_vars_by_substage[isubstage] - - write_stage_id = insn_id_gen( - "scan_%s_write_stage_%d%s" % (scan_iname, istage, substage_suffix)) - write_stage_insn = make_assignment( - id=write_stage_id, - no_sync_with=frozenset( - [(last_write_id, "local")] - if last_write_id is not None - else []), - assignees=tuple( - acc_var[outer_local_iname_vars + (var(substage_exec_iname),)] - for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(tuple( + cur_size = 1 + + while cur_size < scan_size: + stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) + domains.append( + _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) + new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] + + for read_var, acc_var in zip(read_vars, acc_vars): + read_stage_id = insn_id_gen( + "scan_%s_read_stage_%d" % (scan_iname, istage)) + + read_stage_insn = make_assignment( + id=read_stage_id, + assignees=(read_var,), + expression=( acc_var[ - outer_local_iname_vars + (var(substage_exec_iname),)] - for acc_var in acc_vars)), - _strip_if_scalar(read_vars) - ), + outer_local_iname_vars + + (var(stage_exec_iname) - cur_size,)]), within_inames=( - base_iname_deps | frozenset([substage_exec_iname])), + base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset([prev_id]), - ) + depends_on=frozenset([prev_id])) + + generated_insns.append(read_stage_insn) + prev_id = read_stage_id + + write_stage_id = insn_id_gen( + "scan_%s_write_stage_%d" % (scan_iname, istage)) + write_stage_insn = make_assignment( + id=write_stage_id, + assignees=tuple( + 
acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars), + expression=expr.operation( + arg_dtypes, + _strip_if_scalar(tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(read_vars) + ), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id]), + ) - generated_insns.append(write_stage_insn) - last_write_id = write_stage_id - prev_id = write_stage_id + generated_insns.append(write_stage_insn) + prev_id = write_stage_id - #curr_stride = new_size - #bound = curr_stride - curr_stride *= 2 + #cur_size = new_size + #bound = cur_size + cur_size *= 2 istage += 1 new_insn_add_depends_on.add(prev_id) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index e6a329386..a19e06ecd 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -725,11 +725,11 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, mod_check_domain, check_domain) # The modified domain can't get bigger by adding constraints - assert mod_check_domain.gist_params(kernel.assumptions) <= check_domain.gist_params(kernel.assumptions) + assert mod_check_domain <= check_domain if not check_domain <= mod_check_domain: - print(check_domain.gist_params(kernel.assumptions)) - print(mod_check_domain.gist_params(kernel.assumptions)) + print(check_domain) + print(mod_check_domain) raise LoopyError("original domain got shrunk by applying the precompute") # }}} diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 461d41068..8c99932ed 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -253,7 +253,7 @@ def _add_global_barrier(kernel, source, sink, barrier_id): def _get_scan_level(sweep_iname): - SWEEP_RE = r".*__l(\d+)(?:_outer)?" + SWEEP_RE = r".*__l(\d+)(?:_outer)?" 
# noqa import re match_result = re.match(SWEEP_RE, sweep_iname) @@ -265,7 +265,7 @@ def _get_scan_level(sweep_iname): def _get_base_iname(iname): - BASE_INAME_RE = r"(.*)__l\d+(?:_outer)?" + BASE_INAME_RE = r"(.*)__l\d+(?:_outer)?" # noqa import re match_result = re.match(BASE_INAME_RE, iname) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 24360a808..efe0e83f4 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -214,7 +214,8 @@ class TemporarySaver(object): non-hardware dimensions """ - __slots__ = ["name", "orig_temporary_name", "hw_dims", "hw_tags", "non_hw_dims"] + __slots__ = ["name", "orig_temporary_name", "hw_dims", "hw_tags", + "non_hw_dims"] def as_kernel_temporary(self, kernel): temporary = kernel.temporary_variables[self.orig_temporary_name] @@ -277,7 +278,6 @@ class TemporarySaver(object): group_tags = None local_tags = None - group_tag_originating_insn_id = None def _sortedtags(tags): return sorted(tags, key=lambda tag: tag.axis) @@ -373,7 +373,8 @@ class TemporarySaver(object): non_hw_dims=non_hw_dims) if temporary.base_storage is not None: - self.base_storage_to_representative[temporary.base_storage] = backing_temporary + self.base_storage_to_representative[temporary.base_storage] = ( + backing_temporary) return backing_temporary @@ -415,7 +416,9 @@ class TemporarySaver(object): Variable(agg), tuple(map(Variable, subscript))) - orig_temporary = self.kernel.temporary_variables[promoted_temporary.orig_temporary_name] + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) dim_inames_trunc = dim_inames[:len(orig_temporary.shape)] args = ( @@ -532,12 +535,16 @@ class TemporarySaver(object): assert mode in ("save", "reload") import islpy as isl - orig_temporary = self.kernel.temporary_variables[promoted_temporary.orig_temporary_name] + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) orig_dim = domain.dim(isl.dim_type.set) # 
Tags for newly added inames iname_to_tag = {} + from loopy.symbolic import aff_from_expr + # FIXME: Restrict size of new inames to access footprint. # Add dimension-dependent inames. @@ -566,7 +573,6 @@ class TemporarySaver(object): # Add size information. aff = isl.affs_from_space(domain.space) domain &= aff[0].le_set(aff[new_iname]) - from loopy.symbolic import aff_from_expr domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size)) dim_offset = orig_dim + len(promoted_temporary.non_hw_dims) @@ -584,7 +590,6 @@ class TemporarySaver(object): isl.dim_type.set, dim_offset + hw_iname_idx, new_iname) aff = isl.affs_from_space(domain.space) - from loopy.symbolic import aff_from_expr domain = (domain & aff[0].le_set(aff[new_iname]) -- GitLab From cd4f8f507dd6d70158f4c4e7440f28d2232ab292 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 12 Mar 2017 20:46:48 -0500 Subject: [PATCH 20/27] Fix another flake8 warning. --- loopy/transform/instruction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 9143052a4..f30e94594 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -266,8 +266,8 @@ def add_nosync_to_instructions( for i, insn in enumerate(new_instructions): if insn.id in nosync_to_add: - new_instructions[i] = insn.copy( - no_sync_with=insn.no_sync_with | frozenset(nosync_to_add[insn.id])) + new_instructions[i] = insn.copy(no_sync_with=insn.no_sync_with + | frozenset(nosync_to_add[insn.id])) return kernel.copy(instructions=new_instructions) -- GitLab From a5d483b47f6e50364a064c85afcb6aa588027129 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 13 Mar 2017 13:35:57 -0500 Subject: [PATCH 21/27] Scan code generation: Remove some barriers. 
--- loopy/preprocess.py | 39 ++++++++++++++++---------------- loopy/transform/instruction.py | 41 ++++++++++++++++++++++------------ loopy/transform/reduction.py | 3 +++ 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 479bbfc61..c70b13b25 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -391,13 +391,6 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): affs[0], across_dim_types=True) - """ - print("SWEEP AND SCAN INAMES", sweep_iname, scan_iname) - print("SWEEP UPPER BOUND", sweep_upper_bound) - print("SCAN LOWER BOUND", scan_lower_bound) - print("SWEEP LOWER BOUND", sweep_lower_bound) - """ - from itertools import product for (sweep_lb_domain, sweep_lb_aff), \ @@ -423,21 +416,21 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): hyp_domain, = (hyp_domain & assumptions).get_basic_sets() test_domain, = (orig_domain & assumptions).get_basic_sets() - """ - print("ASSUMPTIONS", assumptions) - print("HYP", hyp_domain) - print("TEST", test_domain) - print("HYP AGAINST TEST", hyp_domain.gist(test_domain)) - print("TEST AGAINST HYP", test_domain.gist(hyp_domain)) - """ - - if _domain_depends_on_given_set_dims(hyp_domain.gist(test_domain), + hyp_gist_against_test = hyp_domain.gist(test_domain) + if _domain_depends_on_given_set_dims(hyp_gist_against_test, (sweep_iname, scan_iname)): - return False, "cond1" + return False, ( + "gist of hypothesis against test domain " + "has sweep or scan dependent constraints: '%s'" + % hyp_gist_against_test) - if _domain_depends_on_given_set_dims(test_domain.gist(hyp_domain), + test_gist_against_hyp = test_domain.gist(hyp_domain) + if _domain_depends_on_given_set_dims(test_gist_against_hyp, (sweep_iname, scan_iname)): - return False, "cond2" + return False, ( + "gist of test against hypothesis domain " + "has sweep or scan dependent constraint: '%s'" + % test_gist_against_hyp) return True, "ok" @@ -1526,6 +1519,14 @@ def 
realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([prev_id])) + if cur_size == 1: + # Performance hack: don't add a barrier here with transfer_insn. + # XXX: If the lowering logic changes, this could be brittle. + read_stage_insn = read_stage_insn.copy( + no_sync_with=( + read_stage_insn.no_sync_with + | frozenset([(transfer_id, "any")]))) + generated_insns.append(read_stage_insn) prev_id = read_stage_id diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index f30e94594..e3f3eb4ad 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -170,6 +170,7 @@ def replace_instruction_ids(kernel, replacements): for insn in kernel.instructions: changed = False new_depends_on = [] + new_no_sync_with = [] for dep in insn.depends_on: if dep in replacements: @@ -178,8 +179,18 @@ def replace_instruction_ids(kernel, replacements): else: new_depends_on.append(dep) + for insn_id, scope in insn.no_sync_with: + if insn_id in replacements: + new_no_sync_with.extend( + (repl, scope) for repl in replacements[insn_id]) + changed = True + else: + new_no_sync_with.append((insn_id, scope)) + new_insns.append( - insn.copy(depends_on=frozenset(new_depends_on)) + insn.copy( + depends_on=frozenset(new_depends_on), + no_sync_with=frozenset(new_no_sync_with)) if changed else insn) return kernel.copy(instructions=new_insns) @@ -210,19 +221,21 @@ def tag_instructions(kernel, new_tag, within=None): def add_nosync_to_instructions( kernel, scope, source, sink, bidirectional=False): - """Add a *nosync* directive between *source* and *sync*. - - *source* and *sink* may be any instruction id match understood by - :func:`loopy.match.parse_match`. - - *scope* should be a valid nosync scope. - - If *bidirectional* is True, this adds a nosync to both the source - and sink instructions, otherwise the directive is only added to the - sink instructions. 
- - *nosync* attributes are only added if a dependency is present or if - the instruction pair is spread across a conflicting group. + """Add a *no_sync_with* directive between *source* and *sink*. + *no_sync_with* is only added if a dependency is present or if the + instruction pair is in a conflicting group. + + :arg kernel: + :arg source: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. + :arg sink: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. + :arg scope: A string which is a valid *no_sync_with* scope. + :arg bidirectional: A :class:`bool`. If *True*, add a *no_sync_with* + to both the source and sink instructions, otherwise the directive + is only added to the sink instructions. + + :return: The updated kernel """ if isinstance(source, str) and source in kernel.id_to_insn: diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 8c99932ed..c63ac9d72 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -687,6 +687,9 @@ def make_two_level_scan( (var(outer_iname), var(inner_iname)), strip_scalar=True)] updated_insn = insn.copy( + no_sync_with=( + insn.no_sync_with + | frozenset([(nonlocal_scan_insn_id, "any")])), depends_on=updated_depends_on, # XXX: scan binary op expression=nonlocal_part + local_part) -- GitLab From be08c8e4b97148cdc00cfff634ec42a2c311957f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 14 Mar 2017 19:06:18 -0500 Subject: [PATCH 22/27] Add a test for force_outer_iname_for_scan. 
--- loopy/preprocess.py | 2 +- test/test_scan.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c70b13b25..f2213d942 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -430,7 +430,7 @@ def _check_reduction_is_triangular(kernel, expr, scan_param): return False, ( "gist of test against hypothesis domain " "has sweep or scan dependent constraint: '%s'" - % test_gist_against_hyp) + % test_gist_against_hyp) return True, "ok" diff --git a/test/test_scan.py b/test/test_scan.py index b9d5a21bc..71fe559fa 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -134,6 +134,20 @@ def test_selective_scan_realization(): pass +def test_force_outer_iname_for_scan(): + knl = lp.make_kernel( + "[n] -> {[i,j,k]: 0<=k Date: Fri, 17 Mar 2017 20:47:45 -0500 Subject: [PATCH 23/27] - Rename add_nosync_to_instructions to add_nosync() - Refactor to get rid of the InstructionQuery class (it duplicated a lot of functionality with other things). 
--- loopy/__init__.py | 3 +- loopy/check.py | 27 +++--- loopy/kernel/__init__.py | 36 ++++++++ loopy/schedule/tools.py | 139 +++++++--------------------- loopy/transform/instruction.py | 15 +-- loopy/transform/save.py | 163 ++++++++++++++++++++++----------- test/test_scan.py | 20 +++- test/test_transform.py | 37 +++++++- 8 files changed, 252 insertions(+), 188 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 329313b8d..a5d06881e 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -75,7 +75,8 @@ from loopy.transform.instruction import ( set_instruction_priority, add_dependency, remove_instructions, replace_instruction_ids, - tag_instructions) + tag_instructions, + add_nosync) from loopy.transform.data import ( add_prefetch, change_arg_to_image, diff --git a/loopy/check.py b/loopy/check.py index 6a1e3dc33..68ca4a2b3 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -505,22 +505,22 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # {{{ check that temporaries are defined in subkernels where used def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): - from loopy.schedule.tools import InstructionQuery from loopy.kernel.data import temp_var_scope - insn_query = InstructionQuery(kernel) - - for subkernel in insn_query.subkernels(): + for subkernel in kernel.subkernels: defined_base_storage = set() - for temporary in insn_query.temporaries_written_in_subkernel(subkernel): + from loopy.schedule.tools import ( + temporaries_written_in_subkernel, temporaries_read_in_subkernel) + + for temporary in temporaries_written_in_subkernel(kernel, subkernel): tval = kernel.temporary_variables[temporary] if tval.base_storage is not None: defined_base_storage.add(tval.base_storage) for temporary in ( - insn_query.temporaries_read_in_subkernel(subkernel) - - insn_query.temporaries_written_in_subkernel(subkernel)): + temporaries_read_in_subkernel(kernel, subkernel) - + temporaries_written_in_subkernel(kernel, 
subkernel)): tval = kernel.temporary_variables[temporary] if tval.initializer is not None: @@ -530,16 +530,17 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): if tval.base_storage is not None: if tval.base_storage not in defined_base_storage: from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used " - "in subkernel '%s' and neither it nor its aliases have a " - "definition" % (temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets " + "used in subkernel '%s' and neither it nor its " + "aliases have a definition" % (temporary, subkernel)) continue if tval.scope in (temp_var_scope.PRIVATE, temp_var_scope.LOCAL): from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used in " - "subkernel '%s' without a definition (maybe you forgot to call " - "loopy.save_and_reload_temporaries?)" % (temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets used " + "in subkernel '%s' without a definition (maybe you forgot " + "to call loopy.save_and_reload_temporaries?)" + % (temporary, subkernel)) # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 3145970c7..dea9c93b9 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -786,6 +786,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): for var_name in insn.read_dependency_names() & admissible_vars: result.setdefault(var_name, set()).add(insn.id) + return result + @memoize_method def writer_map(self): """ @@ -914,6 +916,40 @@ class LoopKernel(ImmutableRecordWithoutPickling): for dep in insn.depends_on), key=get_barrier_ordinal) + @property + @memoize_method + def subkernels(self): + return tuple(self.subkernel_to_insn_ids.keys()) + + @property + @memoize_method + def subkernel_to_insn_ids(self): + if self.state != kernel_state.SCHEDULED: + raise LoopyError("Kernel must be scheduled") + + 
from loopy.schedule import ( + sched_item_to_insn_id, CallKernel, ReturnFromKernel) + + from collections import OrderedDict + result = OrderedDict() + + subkernel = None + + for sched_item in self.schedule: + if isinstance(sched_item, CallKernel): + subkernel = sched_item.kernel_name + result[subkernel] = set() + + if isinstance(sched_item, ReturnFromKernel): + subkernel = None + + if subkernel is not None: + for insn_id in sched_item_to_insn_id(sched_item): + result[subkernel].add(insn_id) + + return OrderedDict( + (subkernel, frozenset(ids)) for subkernel, ids in result.items()) + # }}} # {{{ argument wrangling diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 692e39028..e058fe30f 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -23,10 +23,6 @@ THE SOFTWARE. """ from loopy.kernel.data import temp_var_scope -from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem, - RunInstruction, Barrier) - -from pytools import memoize_method # {{{ block boundary finder @@ -37,6 +33,7 @@ def get_block_boundaries(schedule): :class:`loopy.schedule.BlockBeginItem`s to :class:`loopy.schedule.BlockEndItem`s and vice versa. 
""" + from loopy.schedule import (BeginBlockItem, EndBlockItem) block_bounds = {} active_blocks = [] for idx, sched_item in enumerate(schedule): @@ -51,98 +48,20 @@ def get_block_boundaries(schedule): # }}} -# {{{ instruction query utility - -class InstructionQuery(object): - - def __init__(self, kernel): - self.kernel = kernel - block_bounds = get_block_boundaries(kernel.schedule) - subkernel_slices = {} - from six import iteritems - for start, end in iteritems(block_bounds): - sched_item = kernel.schedule[start] - if isinstance(sched_item, CallKernel): - subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) - self.subkernel_slices = subkernel_slices - - @memoize_method - def subkernels(self): - return frozenset(self.subkernel_slices.keys()) - - @memoize_method - def insns_reading_or_writing(self, var): - return frozenset(insn.id for insn in self.kernel.instructions - if var in insn.read_dependency_names() - or var in insn.assignee_var_names()) - - @memoize_method - def insns_in_subkernel(self, subkernel): - return frozenset(sched_item.insn_id for sched_item - in self.kernel.schedule[self.subkernel_slices[subkernel]] - if isinstance(sched_item, RunInstruction)) - - @memoize_method - def temporaries_read_in_subkernel(self, subkernel): - return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].read_dependency_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_written_in_subkernel(self, subkernel): - return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].assignee_var_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_read_or_written_in_subkernel(self, subkernel): - return ( - self.temporaries_read_in_subkernel(subkernel) | - self.temporaries_written_in_subkernel(subkernel)) - - @memoize_method - def inames_in_subkernel(self, subkernel): - subkernel_start = 
self.subkernel_slices[subkernel].start - return frozenset(self.kernel.schedule[subkernel_start].extra_inames) - - @memoize_method - def pre_and_post_barriers(self, subkernel): - subkernel_start = self.subkernel_slices[subkernel].start - subkernel_end = self.subkernel_slices[subkernel].stop - - def is_global_barrier(item): - return isinstance(item, Barrier) and item.kind == "global" - - try: - pre_barrier = next(item for item in - self.kernel.schedule[subkernel_start::-1] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - pre_barrier = None - - try: - post_barrier = next(item for item in - self.kernel.schedule[subkernel_end:] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - post_barrier = None - - return (pre_barrier, post_barrier) - - @memoize_method - def hw_inames(self, insn_id): - """ - Return the inames that insn runs in and that are tagged as hardware - parallel. - """ - from loopy.kernel.data import HardwareParallelTag - return set(iname for iname in self.kernel.insn_inames(insn_id) - if isinstance(self.kernel.iname_to_tag.get(iname), - HardwareParallelTag)) +# {{{ subkernel tools + +def temporaries_read_in_subkernel(kernel, subkernel): + return frozenset(tv + for insn_id in kernel.subkernel_to_insn_ids[subkernel] + for tv in kernel.id_to_insn[insn_id].read_dependency_names() + if tv in kernel.temporary_variables) + + +def temporaries_written_in_subkernel(kernel, subkernel): + return frozenset(tv + for insn_id in kernel.subkernel_to_insn_ids[subkernel] + for tv in kernel.id_to_insn[insn_id].write_dependency_names() + if tv in kernel.temporary_variables) # }}} @@ -155,23 +74,27 @@ def add_extra_args_to_schedule(kernel): instructions in the schedule with global temporaries. 
""" new_schedule = [] - - insn_query = InstructionQuery(kernel) + from loopy.schedule import CallKernel for sched_item in kernel.schedule: if isinstance(sched_item, CallKernel): - subrange_temporaries = (insn_query - .temporaries_read_or_written_in_subkernel(sched_item.kernel_name)) + subkernel = sched_item.kernel_name + + used_temporaries = ( + temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel)) + more_args = set(tv - for tv in subrange_temporaries - if - kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL - and - kernel.temporary_variables[tv].initializer is None - and - tv not in sched_item.extra_args) + for tv in used_temporaries + if + kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL + and + kernel.temporary_variables[tv].initializer is None + and + tv not in sched_item.extra_args) + new_schedule.append(sched_item.copy( - extra_args=sched_item.extra_args + sorted(more_args))) + extra_args=sched_item.extra_args + sorted(more_args))) else: new_schedule.append(sched_item) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index e3f3eb4ad..410274f90 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -219,11 +219,11 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync -def add_nosync_to_instructions( - kernel, scope, source, sink, bidirectional=False): +def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): """Add a *no_sync_with* directive between *source* and *sink*. - *no_sync_with* is only added if a dependency is present or if the - instruction pair is in a conflicting group. + *no_sync_with* is only added if a (syntactic) dependency edge + is present or if the instruction pair is in a conflicting group + (this does not check for memory dependencies). 
:arg kernel: :arg source: Either a single instruction id, or any instruction id @@ -234,6 +234,9 @@ def add_nosync_to_instructions( :arg bidirectional: A :class:`bool`. If *True*, add a *no_sync_with* to both the source and sink instructions, otherwise the directive is only added to the sink instructions. + :arg force: A :class:`bool`. If *True*, will add a *no_sync_with* + even without the presence of a syntactic dependency edge/ + conflicting instruction group. :return: The updated kernel """ @@ -259,12 +262,12 @@ def add_nosync_to_instructions( bool(insn2.groups & insn1.conflicts_with_groups)) from collections import defaultdict - nosync_to_add = defaultdict(lambda: set()) + nosync_to_add = defaultdict(set) for sink in sinks: for source in sources: - needs_nosync = ( + needs_nosync = force or ( source in kernel.recursive_insn_dep_map()[sink] or insns_in_conflicting_groups(source, sink)) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index efe0e83f4..1c431fa10 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -32,7 +32,7 @@ from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier) -from loopy.schedule.tools import (get_block_boundaries, InstructionQuery) +from loopy.schedule.tools import get_block_boundaries import logging @@ -232,7 +232,6 @@ class TemporarySaver(object): def __init__(self, kernel): self.kernel = kernel - self.insn_query = InstructionQuery(kernel) self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -243,12 +242,14 @@ class TemporarySaver(object): self.updated_iname_to_tag = {} self.updated_temporary_variables = {} - # temporary name -> save or reload insns - self.saves_or_reloads_added = {} - + # temporary name -> save or reload insn ids from collections import defaultdict - self.subkernel_to_saves = defaultdict(lambda: set()) - self.subkernel_to_reloads = defaultdict(lambda: set()) + 
self.temporary_to_save_ids = defaultdict(set) + self.temporary_to_reload_ids = defaultdict(set) + self.subkernel_to_newly_added_insn_ids = defaultdict(set) + + # Maps names of base_storage to the name of the temporary + # representative chosen for saves/reloads self.base_storage_to_representative = {} from loopy.kernel.data import ValueArg @@ -262,6 +263,64 @@ class TemporarySaver(object): arg.name for arg in kernel.args if isinstance(arg, ValueArg))))) + @property + @memoize_method + def subkernel_to_slice_indices(self): + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + start_idx = sched_item_idx + elif isinstance(sched_item, ReturnFromKernel): + result[sched_item.kernel_name] = (start_idx, 1 + sched_item_idx) + + return result + + @property + @memoize_method + def subkernel_to_surrounding_inames(self): + current_outer_inames = set() + within_subkernel = False + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + within_subkernel = True + result[sched_item.kernel_name] = frozenset(current_outer_inames) + elif isinstance(sched_item, ReturnFromKernel): + within_subkernel = False + elif isinstance(sched_item, EnterLoop): + if not within_subkernel: + current_outer_inames.add(sched_item.iname) + elif isinstance(sched_item, LeaveLoop): + current_outer_inames.discard(sched_item.iname) + + return result + + @memoize_method + def get_defining_global_barrier_pair(self, subkernel): + subkernel_start, subkernel_end = ( + self.subkernel_to_slice_indices[subkernel]) + + def is_global_barrier(item): + return isinstance(item, Barrier) and item.kind == "global" + + try: + pre_barrier = next(item for item in + self.kernel.schedule[subkernel_start::-1] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + pre_barrier = None + + try: + post_barrier = next(item for item in + self.kernel.schedule[subkernel_end:] + 
if is_global_barrier(item)).originating_insn_id + except StopIteration: + post_barrier = None + + return (pre_barrier, post_barrier) + def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary): """ This is used for determining the amount of global storage needed for saving @@ -272,9 +331,9 @@ class TemporarySaver(object): In the case of local temporaries, inames that are tagged hw-local do not contribute to the global storage shape. """ - - accessor_insn_ids = ( - self.insn_query.insns_reading_or_writing(temporary.name)) + accessor_insn_ids = frozenset( + self.kernel.reader_map()[temporary.name] + | self.kernel.writer_map()[temporary.name]) group_tags = None local_tags = None @@ -355,8 +414,9 @@ class TemporarySaver(object): return None if temporary.base_storage in self.base_storage_to_representative: - # FIXME: Pick the representative with the largest size... - return self.base_storage_to_representative[temporary.base_storage] + # XXX: Todo: Warn about multiple base_storage + #repr = self.base_storage_to_representative[temporary.base_storage] + pass hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) non_hw_dims = temporary.shape @@ -388,19 +448,9 @@ class TemporarySaver(object): if promoted_temporary is None: return - if mode == "save": - if promoted_temporary.name in self.subkernel_to_saves[subkernel]: - return - self.subkernel_to_saves[subkernel].add(promoted_temporary.name) - - elif mode == "reload": - if promoted_temporary.name in self.subkernel_to_reloads[subkernel]: - return - self.subkernel_to_reloads[subkernel].add(promoted_temporary.name) - - new_subdomain, hw_inames, dim_inames, iname_to_tag = \ + new_subdomain, hw_inames, dim_inames, iname_to_tag = ( self.augment_domain_for_save_or_reload( - self.new_subdomain, promoted_temporary, mode, subkernel) + self.new_subdomain, promoted_temporary, mode, subkernel)) self.new_subdomain = new_subdomain @@ -417,8 +467,8 @@ class TemporarySaver(object): tuple(map(Variable, subscript))) 
orig_temporary = ( - self.kernel.temporary_variables[ - promoted_temporary.orig_temporary_name]) + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) dim_inames_trunc = dim_inames[:len(orig_temporary.shape)] args = ( @@ -430,9 +480,10 @@ class TemporarySaver(object): if mode == "save": args = reversed(args) - accessing_insns_in_subkernel = ( - self.insn_query.insns_reading_or_writing(temporary) & - self.insn_query.insns_in_subkernel(subkernel)) + accessing_insns_in_subkernel = (frozenset( + self.kernel.reader_map()[temporary] + | self.kernel.writer_map()[temporary]) + & self.kernel.subkernel_to_insn_ids[subkernel]) if mode == "save": depends_on = accessing_insns_in_subkernel @@ -441,7 +492,7 @@ class TemporarySaver(object): depends_on = frozenset() update_deps = accessing_insns_in_subkernel - pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel) + pre_barrier, post_barrier = self.get_defining_global_barrier_pair(subkernel) if pre_barrier is not None: depends_on |= set([pre_barrier]) @@ -455,16 +506,19 @@ class TemporarySaver(object): *args, id=save_or_load_insn_id, within_inames=( - self.insn_query.inames_in_subkernel(subkernel) | - frozenset(hw_inames + dim_inames)), + self.subkernel_to_surrounding_inames[subkernel] + | frozenset(hw_inames + dim_inames)), within_inames_is_final=True, depends_on=depends_on, boostable=False, boostable_into=frozenset()) - if temporary not in self.saves_or_reloads_added: - self.saves_or_reloads_added[temporary] = set() - self.saves_or_reloads_added[temporary].add(save_or_load_insn_id) + if mode == "save": + self.temporary_to_save_ids[temporary].add(save_or_load_insn_id) + else: + self.temporary_to_reload_ids[temporary].add(save_or_load_insn_id) + + self.subkernel_to_newly_added_insn_ids[subkernel].add(save_or_load_insn_id) self.insns_to_insert.append(save_or_load_insn) @@ -473,8 +527,8 @@ class TemporarySaver(object): self.insns_to_update[insn_id] = insn.copy( depends_on=insn.depends_on 
| frozenset([save_or_load_insn_id])) - self.updated_temporary_variables[promoted_temporary.name] = \ - promoted_temporary.as_kernel_temporary(self.kernel) + self.updated_temporary_variables[promoted_temporary.name] = ( + promoted_temporary.as_kernel_temporary(self.kernel)) self.updated_iname_to_tag.update(iname_to_tag) @@ -484,15 +538,6 @@ class TemporarySaver(object): insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) - # Add global no_sync_with between any added reloads and saves - from six import iteritems - for temporary, added_insns in iteritems(self.saves_or_reloads_added): - for insn_id in added_insns: - insn = insns_to_insert[insn_id] - insns_to_insert[insn_id] = insn.copy( - no_sync_with=frozenset( - (added_insn, "global") for added_insn in added_insns)) - for orig_insn in self.kernel.instructions: if orig_insn.id in self.insns_to_update: new_instructions.append(self.insns_to_update[orig_insn.id]) @@ -516,6 +561,18 @@ class TemporarySaver(object): temporary_variables=self.updated_temporary_variables, overridden_get_grid_sizes_for_insn_ids=None) + # Add nosync directives to any saves or reloads that were added with a + # potential dependency chain. + for subkernel in self.kernel.subkernels: + relevant_insns = self.subkernel_to_newly_added_insn_ids[subkernel] + + from itertools import product + for temporary in self.temporary_to_reload_ids: + for source, sink in product( + relevant_insns & self.temporary_to_reload_ids[temporary], + relevant_insns & self.temporary_to_save_ids[temporary]): + kernel = lp.add_nosync(kernel, "global", source, sink) + from loopy.kernel.tools import assign_automatic_axes return assign_automatic_axes(kernel) @@ -530,7 +587,7 @@ class TemporarySaver(object): """ Add new axes to the domain corresponding to the dimensions of `promoted_temporary`. These axes will be used in the save/ - reload stage. + reload stage. These get prefixed onto the already existing axes. 
""" assert mode in ("save", "reload") import islpy as isl @@ -637,7 +694,8 @@ def save_and_reload_temporaries(knl): liveness = LivenessAnalysis(knl) saver = TemporarySaver(knl) - insn_query = InstructionQuery(knl) + from loopy.schedule.tools import ( + temporaries_read_in_subkernel, temporaries_written_in_subkernel) for sched_idx, sched_item in enumerate(knl.schedule): @@ -648,9 +706,10 @@ def save_and_reload_temporaries(knl): # Kernel entry: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_read_or_written_in_subkernel( - sched_item.kernel_name)) + temporaries_read_in_subkernel(knl, subkernel) + | temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {0} at entry of {1}" @@ -662,9 +721,9 @@ def save_and_reload_temporaries(knl): # Kernel exit: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_written_in_subkernel( - sched_item.kernel_name)) + temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: logger.info("saving {0} before return of {1}" diff --git a/test/test_scan.py b/test/test_scan.py index 71fe559fa..5c84d6e4d 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -54,8 +54,8 @@ __all__ = [ # More things to test. 
-# - test that dummy inames are removed # - scan(a) + scan(b) +# - test for badly tagged inames # - global parallel scan # TO DO: @@ -410,8 +410,7 @@ def _get_two_level_scan_kernel(g_size): knl = lp.realize_reduction(knl, force_scan=True) - from loopy.transform.instruction import add_nosync_to_instructions - knl = add_nosync_to_instructions( + knl = lp.add_nosync( knl, scope="global", source="writes:acc_j__l0", @@ -470,8 +469,7 @@ def _get_three_level_scan_kernel(g_size, p_size): knl = lp.realize_reduction(knl, force_scan=True) - from loopy.transform.instruction import add_nosync_to_instructions - knl = add_nosync_to_instructions( + knl = lp.add_nosync( knl, scope="global", source="writes:acc_j__l0", @@ -493,6 +491,8 @@ def _get_three_level_scan_kernel(g_size, p_size): # }}} +# TODO: Test everything from the matrix +# (l.0, seq) x (l.0, seq) @pytest.mark.parametrize("input_len", (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 32)) @pytest.mark.parametrize("g_size", (16,)) @@ -530,6 +530,16 @@ def test_three_level_scan(ctx_getter, g_size, p_size, input_len): assert (out == np.cumsum(a)).all() +def test_scan_extra_constraints_on_domain(): + knl = lp.make_kernel( + "{[i,j,k]: 0<=i 1: exec(sys.argv[1]) diff --git a/test/test_transform.py b/test/test_transform.py index 0d63ba284..b5fcdf04c 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -402,9 +402,40 @@ def test_precompute_with_preexisting_inames_fail(): precompute_inames="ii,jj") -def test_add_nosync_to_instructions(): - # FIXME: Write test. 
- pass +def test_add_nosync(): + orig_knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + <>tmp[i] = 10 {id=insn1} + <>tmp2[i] = 10 {id=insn2} + + <>tmp3[2*i] = 0 {id=insn3} + <>tmp4 = 1 + tmp3[2*i] {id=insn4} + + <>tmp5[i] = 0 {id=insn5,groups=g1} + tmp5[i] = 1 {id=insn6,conflicts=g1} + """) + + orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") + orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + + # No dependency present - don't add nosync + knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2") + assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + + # Dependency present + knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == knl.id_to_insn["insn3"].no_sync_with + assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Bidirectional + knl = lp.add_nosync( + orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with + assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Groups + knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with if __name__ == "__main__": -- GitLab From f0044292654dbf7aa1aed1e54e5233005b1824ac Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 17 Mar 2017 20:55:07 -0500 Subject: [PATCH 24/27] Save and reload: For now, make sure we only save/reload one representative per base_storage class (see also: #42). 
--- loopy/transform/save.py | 16 ++++++++++------ test/test_loopy.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 1c431fa10..fa98f478d 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -298,7 +298,7 @@ class TemporarySaver(object): return result @memoize_method - def get_defining_global_barrier_pair(self, subkernel): + def get_enclosing_global_barrier_pair(self, subkernel): subkernel_start, subkernel_end = ( self.subkernel_to_slice_indices[subkernel]) @@ -413,10 +413,14 @@ class TemporarySaver(object): assert temporary.read_only return None - if temporary.base_storage in self.base_storage_to_representative: - # XXX: Todo: Warn about multiple base_storage - #repr = self.base_storage_to_representative[temporary.base_storage] - pass + base_storage_conflict = ( + self.base_storage_to_representative.get( + temporary.base_storage, temporary) is not temporary) + + if base_storage_conflict: + raise NotImplementedError( + "tried to save/reload multiple temporaries with the " + "same base_storage; this is currently not supported") hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) non_hw_dims = temporary.shape @@ -492,7 +496,7 @@ class TemporarySaver(object): depends_on = frozenset() update_deps = accessing_insns_in_subkernel - pre_barrier, post_barrier = self.get_defining_global_barrier_pair(subkernel) + pre_barrier, post_barrier = self.get_enclosing_global_barrier_pair(subkernel) if pre_barrier is not None: depends_on |= set([pre_barrier]) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1d1450fc0..19a001084 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2161,6 +2161,25 @@ def test_global_barrier_error_if_unordered(): knl.global_barrier_order +def test_multi_base_storage_save_and_reload_not_supported(): + # FIXME: This ought to work, change the test when it does. 
+ knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + <>a[0] = 1 + <>b[0] = 2 + ... gbarrier + out = a[0] + b[0] + """, + seq_dependencies=True) + + knl = lp.alias_temporaries(knl, ("a", "b"), synchronize_for_exclusive_use=False) + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + + with pytest.raises(NotImplementedError): + lp.save_and_reload_temporaries(knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From e26ce8e211e39ab41e804686f5d9d0e8a24949ec Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 17 Mar 2017 21:04:05 -0500 Subject: [PATCH 25/27] Fix flake8 issue. --- loopy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/__init__.py b/loopy/__init__.py index a5d06881e..206cf49ff 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -192,6 +192,7 @@ __all__ = [ "remove_instructions", "replace_instruction_ids", "tag_instructions", + "add_nosync", "extract_subst", "expand_subst", "assignment_to_subst", "find_rules_matching", "find_one_rule_matching", -- GitLab From caff210970936ac5f4449222a193112af7ee5a16 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 17 Mar 2017 21:09:44 -0500 Subject: [PATCH 26/27] Work around ancient python's lack of OrderedDict. 
--- loopy/kernel/__init__.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index dea9c93b9..cc3414489 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -919,7 +919,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): @property @memoize_method def subkernels(self): - return tuple(self.subkernel_to_insn_ids.keys()) + if self.state != kernel_state.SCHEDULED: + raise LoopyError("Kernel must be scheduled") + + from loopy.schedule import CallKernel + + return tuple(sched_item.kernel_name + for sched_item in self.schedule + if isinstance(sched_item, CallKernel)) @property @memoize_method @@ -930,10 +937,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): from loopy.schedule import ( sched_item_to_insn_id, CallKernel, ReturnFromKernel) - from collections import OrderedDict - result = OrderedDict() - subkernel = None + result = {} for sched_item in self.schedule: if isinstance(sched_item, CallKernel): @@ -947,8 +952,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): for insn_id in sched_item_to_insn_id(sched_item): result[subkernel].add(insn_id) - return OrderedDict( - (subkernel, frozenset(ids)) for subkernel, ids in result.items()) + for subkernel in result: + result[subkernel] = frozenset(result[subkernel]) + + return result # }}} -- GitLab From ff43c0955764d214d899904ee15244c9b9d34df6 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 26 Mar 2017 19:55:24 -0500 Subject: [PATCH 27/27] ISL: Be quieter when failing to find an upper/lower bound. Without this, we occasionally get the following error written to stderr: ``` isl/isl_tab_pip.c:499: unbounded optimum ``` for certain inputs to the automagic scan detection. Since the inputs are valid kernels, this is undesirable. 
--- loopy/isl_helpers.py | 9 +++++++++ loopy/preprocess.py | 18 +++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 602830de3..0b3068a46 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -29,6 +29,7 @@ from six.moves import range, zip from loopy.diagnostic import StaticValueFindingError +import contextlib import islpy as isl from islpy import dim_type @@ -60,6 +61,14 @@ def dump_space(ls): for dt in dim_type.names) +@contextlib.contextmanager +def no_stderr_output_from_isl(ctx): + prev_on_error = ctx.get_on_error() + ctx.set_on_error(isl.on_error.CONTINUE) + yield + ctx.set_on_error(prev_on_error) + + # {{{ make_slab def make_slab(space, iname, start, stop): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f2213d942..c2aaf9cd1 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -552,9 +552,11 @@ def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_ina within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,)) try: - sweep_lower_bound = domain.dim_min(sweep_idx) - sweep_upper_bound = domain.dim_max(sweep_idx) - scan_lower_bound = domain.dim_min(scan_idx) + from loopy.isl_helpers import no_stderr_output_from_isl + with no_stderr_output_from_isl(domain.get_ctx()): + sweep_lower_bound = domain.dim_min(sweep_idx) + sweep_upper_bound = domain.dim_max(sweep_idx) + scan_lower_bound = domain.dim_min(scan_idx) except isl.Error as e: raise ValueError("isl error: %s" % e) @@ -581,10 +583,12 @@ def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): # Should be equal to k * sweep_iname, where k is the stride. 
try: - scan_iname_range = ( - domain_with_sweep_param.dim_max(scan_iname_idx) - - domain_with_sweep_param.dim_min(scan_iname_idx) - ).gist(domain_with_sweep_param.params()) + from loopy.isl_helpers import no_stderr_output_from_isl + with no_stderr_output_from_isl(domain_with_sweep_param.get_ctx()): + scan_iname_range = ( + domain_with_sweep_param.dim_max(scan_iname_idx) + - domain_with_sweep_param.dim_min(scan_iname_idx) + ).gist(domain_with_sweep_param.params()) except isl.Error as e: raise ValueError("isl error: '%s'" % e) -- GitLab