diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ae70a0d6c07e6b922871c6293162321ea335f80a..d561652d605b256fc7da3b0544efcc308594e091 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1465,6 +1465,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): + print("sweep and scan inames", sweep_iname, scan_iname) scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 2b5068b46fc416627065ebc755b76918a2617f2f..dd2fe1fa5e364c750ca254e0eee25d6e8747f89f 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -334,9 +334,10 @@ class ISPCASTBuilder(CASTBuilder): result = [] from cgen import Statement as S, Block if lsize: + # FIXME: not sure result.append( S( - "assert(programCount == (%s))" + "assert(programCount >= (%s))" % ecm(lsize[0], PREC_NONE))) arg_names, arg_decls = self._arg_names_and_decls(codegen_state, extra_args) diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 7b52d8fff3da7d55198555a17c75d368ad61d894..be7f612ea0015166787c6e6ae0e55e285dcd3ab1 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -302,12 +302,11 @@ def make_two_level_scan( nonlocal_scan_storage_name=None, nonlocal_storage_scope=None, nonlocal_tag=None, - outer_local_tag=None, - inner_local_tag=None, - inner_tag=None, - outer_tag=None, - inner_iname=None, - outer_iname=None): + slow_local_tag=None, + fast_local_tag=None, + fast_sweep_iname=None, + slow_sweep_iname=None, + local_scan_uses_fast_axis=True): """Two level scan, mediated through a "local" and "nonlocal" array. This turns a scan of the form:: @@ -322,10 +321,23 @@ def make_two_level_scan( [...,nl] [...,i',i''] result = nonlocal[i'] + local[i',i''] + *sweep_iname* will be split into *fast_sweep_iname* and *slow_sweep_iname*. + The names of *fast_sweep_iname* and *slow_sweep_iname* are supplied so that + they can be passed to *local_storage_axes* if needed. + + :arg nonlocal_storage_name: The nonlocal storage that is an input to the + nonlocal scan. + :arg nonlocal_scan_storage_name: The nonlocal storage that is an output of + the nonlocal scan. :arg local_storage_axes: A tuple of inames. For each iname, a corresponding axis will be added to the temporary array that does the local part of the scan (the "local" array). May be *None*, in which case it is automatically inferred from the tags of the inames. + :arg local_scan_uses_fast_axis: Whether the local scan should be a scan over + the fast axis of the split iname or the slow axis. A local scan + over the fast axis is a "small" scan, and is typically implemented as a + local-parallel scan. A local scan over the slow axis is a "big" scan + and is typically implemented as a sequential scan. """ # TODO: Test that this works even when doing split scans in a loop @@ -359,17 +371,17 @@ def make_two_level_scan( "level": level, "next_level": level + 1} - if inner_iname is None: - inner_iname = var_name_gen( + if fast_sweep_iname is None: + fast_sweep_iname = var_name_gen( "{sweep}__l{level}".format(**format_kwargs)) else: - var_name_gen.add_name(inner_iname) + var_name_gen.add_name(fast_sweep_iname) - if outer_iname is None: - outer_iname = var_name_gen( + if slow_sweep_iname is None: + slow_sweep_iname = var_name_gen( "{sweep}__l{level}_outer".format(**format_kwargs)) else: - var_name_gen.add_iname(outer_iname) + var_name_gen.add_iname(slow_sweep_iname) """ nonlocal_init_head_outer_iname = var_name_gen( @@ -391,16 +403,16 @@ def make_two_level_scan( nonlocal_iname = var_name_gen( "{sweep}__l{level}_nonloc".format(**format_kwargs)) - inner_local_iname = var_name_gen( + fast_local_iname = var_name_gen( "{sweep}__l{next_level}".format(**format_kwargs)) - inner_scan_iname = var_name_gen( + fast_scan_iname = var_name_gen( "{iname}__l{next_level}".format(**format_kwargs)) - outer_local_iname = var_name_gen( + slow_local_iname = var_name_gen( "{sweep}__l{next_level}_outer".format(**format_kwargs)) - outer_scan_iname = var_name_gen( + slow_scan_iname = var_name_gen( "{iname}__l{level}".format(**format_kwargs)) subst_name = var_name_gen( @@ -449,8 +461,8 @@ def make_two_level_scan( auto_local_storage_axes = [ iname for iname, tag in [ - (outer_iname, outer_tag), - (inner_iname, inner_tag)] + (slow_sweep_iname, slow_local_tag), + (fast_sweep_iname, fast_local_tag)] # ">" is "more global" # In a way, global inames are automatically part of an access to a @@ -471,7 +483,8 @@ def make_two_level_scan( def pick_out_relevant_axes(full_indices, strip_scalar=False): assert len(full_indices) == 2 - iname_to_index = dict(zip((outer_iname, inner_iname), full_indices)) + iname_to_index = dict( + zip((slow_sweep_iname, fast_sweep_iname), full_indices)) result = [] for iname in local_storage_axes: @@ -532,52 +545,61 @@ def make_two_level_scan( # FIXME: This can probably be done using split_reduction_inward() # and will end up looking like less of a mess that way. + if local_scan_uses_fast_axis: + subst_expr = var(slow_sweep_iname) * inner_length + var(fast_scan_iname) + else: + subst_expr = var(slow_scan_iname) * inner_length + var(fast_sweep_iname) + local_scan_expr = _expand_subst_within_expression(kernel, - var(subst_name)(var(outer_iname) * inner_length + - var(inner_scan_iname))) + var(subst_name)(subst_expr)) kernel = lp.split_iname(kernel, sweep_iname, inner_length, - inner_iname=inner_iname, outer_iname=outer_iname, - inner_tag=inner_tag, outer_tag=outer_tag) + inner_iname=fast_sweep_iname, outer_iname=slow_sweep_iname, + inner_tag=fast_local_tag, outer_tag=slow_local_tag) from loopy.kernel.data import SubstitutionRule from loopy.symbolic import Reduction + local_scan_iname = ( + fast_scan_iname + if local_scan_uses_fast_axis + else slow_scan_iname) + + local_subst_arguments = (slow_sweep_iname, fast_sweep_iname) + local_subst = SubstitutionRule( name=local_subst_name, - arguments=(outer_iname, inner_iname), + arguments=local_subst_arguments, expression=Reduction( - scan.operation, (inner_scan_iname,), local_scan_expr)) + scan.operation, (local_scan_iname,), local_scan_expr)) substitutions = kernel.substitutions.copy() substitutions[local_subst_name] = local_subst kernel = kernel.copy(substitutions=substitutions) - outer_local_iname = outer_iname - - all_precompute_inames = (outer_local_iname, inner_local_iname) + all_precompute_inames = (slow_local_iname, fast_local_iname) precompute_inames = pick_out_relevant_axes(all_precompute_inames) - sweep_inames = pick_out_relevant_axes((outer_iname, inner_iname)) + sweep_inames = pick_out_relevant_axes((slow_sweep_iname, fast_sweep_iname)) storage_axis_to_tag = { - outer_iname: outer_local_tag, - inner_iname: inner_local_tag, - outer_local_iname: outer_local_tag, - inner_local_iname: inner_local_tag} + slow_sweep_iname: slow_local_tag, + fast_sweep_iname: fast_local_tag, + slow_local_iname: slow_local_tag, + fast_local_iname: fast_local_tag} precompute_outer_inames = ( frozenset(all_precompute_inames) - frozenset(precompute_inames)) within_inames = ( kernel.id_to_insn[insn_id].within_inames - - frozenset([outer_iname, inner_iname])) + - frozenset([slow_sweep_iname, fast_sweep_iname])) from pymbolic import var local_precompute_xform_info = lp.precompute(kernel, [var(local_subst_name)( - var(outer_iname), var(inner_iname))], + var(slow_sweep_iname), var(fast_sweep_iname))], sweep_inames=sweep_inames, precompute_inames=precompute_inames, storage_axes=local_storage_axes, @@ -601,12 +623,22 @@ def make_two_level_scan( kernel = _update_instructions(kernel, (compute_insn_with_deps,)) - kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) + local_sweep_iname = ( + fast_local_iname + if local_scan_uses_fast_axis + else slow_local_iname) + + kernel = _add_scan_subdomain(kernel, local_scan_iname, local_sweep_iname) # }}} + nonlocal_sweep_iname = ( + slow_sweep_iname + if local_scan_uses_fast_axis + else fast_sweep_iname) + from loopy.kernel.data import ConcurrentTag - if not isinstance(kernel.iname_to_tag[outer_iname], ConcurrentTag): + if not isinstance(kernel.iname_to_tag[nonlocal_sweep_iname], ConcurrentTag): # FIXME raise NotImplementedError("outer iname must currently be concurrent because " "it occurs in the local scan and the final addition and one of " @@ -618,11 +650,14 @@ def make_two_level_scan( from loopy.isl_helpers import static_max_of_pw_aff from loopy.symbolic import pw_aff_to_expr + # FIXME: Not sure if this is the right thing to do. local_storage_local_axis_len = ( - kernel.temporary_variables[local_storage_name].shape[-1]) + kernel.temporary_variables[local_storage_name].shape[-1] + if local_scan_uses_fast_axis + else kernel.temporary_variables[local_storage_name].shape[0]) nonlocal_storage_len_pw_aff = static_max_of_pw_aff( - kernel.get_iname_bounds(outer_iname).size, + kernel.get_iname_bounds(nonlocal_sweep_iname).size, constants_only=False) # FIXME: this shouldn't have to have an extra element. @@ -646,11 +681,16 @@ def make_two_level_scan( kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_outer_subd) """ + # FIXME: This was commented out so that the nonlocal init part is + # sequential, as a workaround for ISPC. This should just get its own + # parameter controlling the tag of the local-to-nonlocal transfer. + """ kernel = lp.tag_inames(kernel, { - #nonlocal_init_head_outer_iname: outer_local_tag, - #nonlocal_init_head_inner_iname: inner_local_tag, - nonlocal_init_tail_outer_iname: outer_local_tag, - nonlocal_init_tail_inner_iname: inner_local_tag}) + #nonlocal_init_head_outer_iname: slow_local_tag, + #nonlocal_init_head_inner_iname: fast_local_tag, + nonlocal_init_tail_outer_iname: fast_local_tag, + nonlocal_init_tail_inner_iname: slow_local_tag}) + """ for nls_name in [nonlocal_storage_name, nonlocal_scan_storage_name]: if nls_name not in kernel.temporary_variables: @@ -685,17 +725,23 @@ def make_two_level_scan( var(nonlocal_init_tail_outer_iname).eq(0)), depends_on=frozenset([local_scan_dep_id])) + if local_scan_uses_fast_axis: + nonlocal_init_tail_index = ( + var(nonlocal_init_tail_outer_iname), + var(nonlocal_init_tail_inner_iname) + + local_storage_local_axis_len - 1) + else: + nonlocal_init_tail_index = ( + local_storage_local_axis_len - 1, + var(nonlocal_init_tail_outer_iname)) + nonlocal_init_tail = make_assignment( id=nonlocal_init_tail_insn_id, assignees=( var(nonlocal_storage_name)[ var(nonlocal_init_tail_outer_iname) + 1],), expression=var(local_storage_name)[ - pick_out_relevant_axes( - (var(nonlocal_init_tail_outer_iname), - var(nonlocal_init_tail_inner_iname) - + local_storage_local_axis_len - 1), - strip_scalar=True)], + pick_out_relevant_axes(nonlocal_init_tail_index, strip_scalar=True)], no_sync_with=frozenset([(nonlocal_init_head_insn_id, "any")]), within_inames=( within_inames | frozenset([nonlocal_init_tail_outer_iname, @@ -724,15 +770,20 @@ def make_two_level_scan( if nonlocal_tag is not None: kernel = lp.tag_inames(kernel, {nonlocal_iname: nonlocal_tag}) - kernel = _add_scan_subdomain(kernel, outer_scan_iname, nonlocal_iname) + nonlocal_scan_iname = ( + slow_scan_iname + if local_scan_uses_fast_axis + else fast_scan_iname) + + kernel = _add_scan_subdomain(kernel, nonlocal_scan_iname, nonlocal_iname) nonlocal_scan = make_assignment( id=nonlocal_scan_insn_id, assignees=(var(nonlocal_scan_storage_name)[var(nonlocal_iname)],), expression=Reduction( scan.operation, - (outer_scan_iname,), - var(nonlocal_storage_name)[var(outer_scan_iname)]), + (nonlocal_scan_iname,), + var(nonlocal_storage_name)[var(nonlocal_scan_iname)]), within_inames=within_inames | frozenset([nonlocal_iname]), depends_on=( frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id]))) @@ -761,11 +812,11 @@ def make_two_level_scan( source=nonlocal_scan_insn_id, sink=insn_id, barrier_id=barrier_id)) updated_depends_on |= frozenset([barrier_id]) - nonlocal_part = var(nonlocal_scan_storage_name)[var(outer_iname)] + nonlocal_part = var(nonlocal_scan_storage_name)[var(nonlocal_sweep_iname)] local_part = var(local_storage_name)[ pick_out_relevant_axes( - (var(outer_iname), var(inner_iname)), strip_scalar=True)] + (var(slow_sweep_iname), var(fast_sweep_iname)), strip_scalar=True)] updated_insn = insn.copy( no_sync_with=(