From f2c3a75e2dfb7d9d184d7314e245e6249d7efd6e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Oct 2017 22:16:09 -0500 Subject: [PATCH 1/5] Rename inner_iname/outer_iname to inner_sweep_iname/outer_sweep_iname. --- loopy/transform/reduction.py | 50 +++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 7b52d8fff..8dbd45a7e 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -306,8 +306,8 @@ def make_two_level_scan( inner_local_tag=None, inner_tag=None, outer_tag=None, - inner_iname=None, - outer_iname=None): + inner_sweep_iname=None, + outer_sweep_iname=None): """Two level scan, mediated through a "local" and "nonlocal" array. This turns a scan of the form:: @@ -326,6 +326,7 @@ def make_two_level_scan( axis will be added to the temporary array that does the local part of the scan (the "local" array). May be *None*, in which case it is automatically inferred from the tags of the inames. + :arg inner_sweep_iname: The sweep (guiding) iname for the innermost scan. """ # TODO: Test that this works even when doing split scans in a loop @@ -359,17 +360,17 @@ def make_two_level_scan( "level": level, "next_level": level + 1} - if inner_iname is None: - inner_iname = var_name_gen( + if inner_sweep_iname is None: + inner_sweep_iname = var_name_gen( "{sweep}__l{level}".format(**format_kwargs)) else: - var_name_gen.add_name(inner_iname) + var_name_gen.add_name(inner_sweep_iname) - if outer_iname is None: - outer_iname = var_name_gen( + if outer_sweep_iname is None: + outer_sweep_iname = var_name_gen( "{sweep}__l{level}_outer".format(**format_kwargs)) else: - var_name_gen.add_iname(outer_iname) + var_name_gen.add_iname(outer_sweep_iname) """ nonlocal_init_head_outer_iname = var_name_gen( @@ -449,8 +450,8 @@ def make_two_level_scan( auto_local_storage_axes = [ iname for iname, tag in [ - (outer_iname, outer_tag), - (inner_iname, inner_tag)] + (outer_sweep_iname, outer_tag), + (inner_sweep_iname, inner_tag)] # ">" is "more global" # In a way, global inames are automatically part of an access to a @@ -471,7 +472,8 @@ def make_two_level_scan( def pick_out_relevant_axes(full_indices, strip_scalar=False): assert len(full_indices) == 2 - iname_to_index = dict(zip((outer_iname, inner_iname), full_indices)) + iname_to_index = dict( + zip((outer_sweep_iname, inner_sweep_iname), full_indices)) result = [] for iname in local_storage_axes: @@ -533,11 +535,11 @@ def make_two_level_scan( # and will end up looking like less of a mess that way. local_scan_expr = _expand_subst_within_expression(kernel, - var(subst_name)(var(outer_iname) * inner_length + + var(subst_name)(var(outer_sweep_iname) * inner_length + var(inner_scan_iname))) kernel = lp.split_iname(kernel, sweep_iname, inner_length, - inner_iname=inner_iname, outer_iname=outer_iname, + inner_iname=inner_sweep_iname, outer_iname=outer_sweep_iname, inner_tag=inner_tag, outer_tag=outer_tag) from loopy.kernel.data import SubstitutionRule @@ -545,7 +547,7 @@ def make_two_level_scan( local_subst = SubstitutionRule( name=local_subst_name, - arguments=(outer_iname, inner_iname), + arguments=(outer_sweep_iname, inner_sweep_iname), expression=Reduction( scan.operation, (inner_scan_iname,), local_scan_expr)) @@ -554,16 +556,16 @@ def make_two_level_scan( kernel = kernel.copy(substitutions=substitutions) - outer_local_iname = outer_iname + outer_local_iname = outer_sweep_iname all_precompute_inames = (outer_local_iname, inner_local_iname) precompute_inames = pick_out_relevant_axes(all_precompute_inames) - sweep_inames = pick_out_relevant_axes((outer_iname, inner_iname)) + sweep_inames = pick_out_relevant_axes((outer_sweep_iname, inner_sweep_iname)) storage_axis_to_tag = { - outer_iname: outer_local_tag, - inner_iname: inner_local_tag, + outer_sweep_iname: outer_local_tag, + inner_sweep_iname: inner_local_tag, outer_local_iname: outer_local_tag, inner_local_iname: inner_local_tag} @@ -571,13 +573,13 @@ def make_two_level_scan( frozenset(all_precompute_inames) - frozenset(precompute_inames)) within_inames = ( kernel.id_to_insn[insn_id].within_inames - - frozenset([outer_iname, inner_iname])) + - frozenset([outer_sweep_iname, inner_sweep_iname])) from pymbolic import var local_precompute_xform_info = lp.precompute(kernel, [var(local_subst_name)( - var(outer_iname), var(inner_iname))], + var(outer_sweep_iname), var(inner_sweep_iname))], sweep_inames=sweep_inames, precompute_inames=precompute_inames, storage_axes=local_storage_axes, @@ -606,7 +608,7 @@ def make_two_level_scan( # }}} from loopy.kernel.data import ConcurrentTag - if not isinstance(kernel.iname_to_tag[outer_iname], ConcurrentTag): + if not isinstance(kernel.iname_to_tag[outer_sweep_iname], ConcurrentTag): # FIXME raise NotImplementedError("outer iname must currently be concurrent because " "it occurs in the local scan and the final addition and one of " @@ -622,7 +624,7 @@ def make_two_level_scan( kernel.temporary_variables[local_storage_name].shape[-1]) nonlocal_storage_len_pw_aff = static_max_of_pw_aff( - kernel.get_iname_bounds(outer_iname).size, + kernel.get_iname_bounds(outer_sweep_iname).size, constants_only=False) # FIXME: this shouldn't have to have an extra element. @@ -761,11 +763,11 @@ def make_two_level_scan( source=nonlocal_scan_insn_id, sink=insn_id, barrier_id=barrier_id)) updated_depends_on |= frozenset([barrier_id]) - nonlocal_part = var(nonlocal_scan_storage_name)[var(outer_iname)] + nonlocal_part = var(nonlocal_scan_storage_name)[var(outer_sweep_iname)] local_part = var(local_storage_name)[ pick_out_relevant_axes( - (var(outer_iname), var(inner_iname)), strip_scalar=True)] + (var(outer_sweep_iname), var(inner_sweep_iname)), strip_scalar=True)] updated_insn = insn.copy( no_sync_with=( -- GitLab From 72bad067349ecea82972ec8375a7961f8d4d4971 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 21 Oct 2017 22:55:05 -0500 Subject: [PATCH 2/5] More renaming of inner/outer into fast/slow. --- loopy/transform/reduction.py | 106 +++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 47 deletions(-) diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 8dbd45a7e..339e9fc92 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -302,12 +302,15 @@ def make_two_level_scan( nonlocal_scan_storage_name=None, nonlocal_storage_scope=None, nonlocal_tag=None, - outer_local_tag=None, - inner_local_tag=None, - inner_tag=None, - outer_tag=None, - inner_sweep_iname=None, - outer_sweep_iname=None): + # FIXME: Not sure if these two pairs are necessary. It would seem that + # only one is necessary. + slow_local_tag=None, + fast_local_tag=None, + slow_tag=None, + fast_tag=None, + fast_sweep_iname=None, + slow_sweep_iname=None, + inner_scan_uses_fast_axis=True): """Two level scan, mediated through a "local" and "nonlocal" array. This turns a scan of the form:: @@ -326,7 +329,6 @@ def make_two_level_scan( axis will be added to the temporary array that does the local part of the scan (the "local" array). May be *None*, in which case it is automatically inferred from the tags of the inames. - :arg inner_sweep_iname: The sweep (guiding) iname for the innermost scan. """ # TODO: Test that this works even when doing split scans in a loop @@ -360,17 +362,17 @@ def make_two_level_scan( "level": level, "next_level": level + 1} - if inner_sweep_iname is None: - inner_sweep_iname = var_name_gen( + if fast_sweep_iname is None: + fast_sweep_iname = var_name_gen( "{sweep}__l{level}".format(**format_kwargs)) else: - var_name_gen.add_name(inner_sweep_iname) + var_name_gen.add_name(fast_sweep_iname) - if outer_sweep_iname is None: - outer_sweep_iname = var_name_gen( + if slow_sweep_iname is None: + slow_sweep_iname = var_name_gen( "{sweep}__l{level}_outer".format(**format_kwargs)) else: - var_name_gen.add_iname(outer_sweep_iname) + var_name_gen.add_iname(slow_sweep_iname) """ nonlocal_init_head_outer_iname = var_name_gen( @@ -392,16 +394,16 @@ def make_two_level_scan( nonlocal_iname = var_name_gen( "{sweep}__l{level}_nonloc".format(**format_kwargs)) - inner_local_iname = var_name_gen( + fast_local_iname = var_name_gen( "{sweep}__l{next_level}".format(**format_kwargs)) - inner_scan_iname = var_name_gen( + fast_scan_iname = var_name_gen( "{iname}__l{next_level}".format(**format_kwargs)) - outer_local_iname = var_name_gen( + slow_local_iname = var_name_gen( "{sweep}__l{next_level}_outer".format(**format_kwargs)) - outer_scan_iname = var_name_gen( + slow_scan_iname = var_name_gen( "{iname}__l{level}".format(**format_kwargs)) subst_name = var_name_gen( @@ -450,8 +452,8 @@ def make_two_level_scan( auto_local_storage_axes = [ iname for iname, tag in [ - (outer_sweep_iname, outer_tag), - (inner_sweep_iname, inner_tag)] + (slow_sweep_iname, slow_tag), + (fast_sweep_iname, fast_tag)] # ">" is "more global" # In a way, global inames are automatically part of an access to a @@ -473,7 +475,7 @@ def make_two_level_scan( def pick_out_relevant_axes(full_indices, strip_scalar=False): assert len(full_indices) == 2 iname_to_index = dict( - zip((outer_sweep_iname, inner_sweep_iname), full_indices)) + zip((slow_sweep_iname, fast_sweep_iname), full_indices)) result = [] for iname in local_storage_axes: @@ -534,52 +536,62 @@ def make_two_level_scan( # FIXME: This can probably be done using split_reduction_inward() # and will end up looking like less of a mess that way. + if inner_scan_uses_fast_axis: + subst_expr = var(slow_sweep_iname) * inner_length + var(fast_scan_iname) + else: + subst_expr = var(fast_scan_iname) * inner_length + var(fast_sweep_iname) + local_scan_expr = _expand_subst_within_expression(kernel, - var(subst_name)(var(outer_sweep_iname) * inner_length + - var(inner_scan_iname))) + var(subst_name)(subst_expr)) kernel = lp.split_iname(kernel, sweep_iname, inner_length, - inner_iname=inner_sweep_iname, outer_iname=outer_sweep_iname, - inner_tag=inner_tag, outer_tag=outer_tag) + inner_iname=fast_sweep_iname, outer_iname=slow_sweep_iname, + inner_tag=fast_tag, outer_tag=slow_tag) from loopy.kernel.data import SubstitutionRule from loopy.symbolic import Reduction + local_reduction_iname = ( + fast_scan_iname + if inner_scan_uses_fast_axis + else slow_scan_iname) + local_subst = SubstitutionRule( name=local_subst_name, - arguments=(outer_sweep_iname, inner_sweep_iname), + arguments=(slow_sweep_iname, fast_sweep_iname), expression=Reduction( - scan.operation, (inner_scan_iname,), local_scan_expr)) + scan.operation, (local_reduction_iname,), local_scan_expr)) substitutions = kernel.substitutions.copy() substitutions[local_subst_name] = local_subst kernel = kernel.copy(substitutions=substitutions) - outer_local_iname = outer_sweep_iname + if inner_scan_uses_fast_axis: + slow_local_iname = slow_sweep_iname - all_precompute_inames = (outer_local_iname, inner_local_iname) + all_precompute_inames = (slow_local_iname, fast_local_iname) precompute_inames = pick_out_relevant_axes(all_precompute_inames) - sweep_inames = pick_out_relevant_axes((outer_sweep_iname, inner_sweep_iname)) + sweep_inames = pick_out_relevant_axes((slow_sweep_iname, fast_sweep_iname)) storage_axis_to_tag = { - outer_sweep_iname: outer_local_tag, - inner_sweep_iname: inner_local_tag, - outer_local_iname: outer_local_tag, - inner_local_iname: inner_local_tag} + slow_sweep_iname: slow_local_tag, + fast_sweep_iname: fast_local_tag, + slow_local_iname: slow_local_tag, + fast_local_iname: fast_local_tag} precompute_outer_inames = ( frozenset(all_precompute_inames) - frozenset(precompute_inames)) within_inames = ( kernel.id_to_insn[insn_id].within_inames - - frozenset([outer_sweep_iname, inner_sweep_iname])) + - frozenset([slow_sweep_iname, fast_sweep_iname])) from pymbolic import var local_precompute_xform_info = lp.precompute(kernel, [var(local_subst_name)( - var(outer_sweep_iname), var(inner_sweep_iname))], + var(slow_sweep_iname), var(fast_sweep_iname))], sweep_inames=sweep_inames, precompute_inames=precompute_inames, storage_axes=local_storage_axes, @@ -603,12 +615,12 @@ def make_two_level_scan( kernel = _update_instructions(kernel, (compute_insn_with_deps,)) - kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) + kernel = _add_scan_subdomain(kernel, fast_scan_iname, fast_local_iname) # }}} from loopy.kernel.data import ConcurrentTag - if not isinstance(kernel.iname_to_tag[outer_sweep_iname], ConcurrentTag): + if not isinstance(kernel.iname_to_tag[slow_sweep_iname], ConcurrentTag): # FIXME raise NotImplementedError("outer iname must currently be concurrent because " "it occurs in the local scan and the final addition and one of " @@ -624,7 +636,7 @@ def make_two_level_scan( kernel.temporary_variables[local_storage_name].shape[-1]) nonlocal_storage_len_pw_aff = static_max_of_pw_aff( - kernel.get_iname_bounds(outer_sweep_iname).size, + kernel.get_iname_bounds(slow_sweep_iname).size, constants_only=False) # FIXME: this shouldn't have to have an extra element. @@ -649,10 +661,10 @@ def make_two_level_scan( """ kernel = lp.tag_inames(kernel, { - #nonlocal_init_head_outer_iname: outer_local_tag, - #nonlocal_init_head_inner_iname: inner_local_tag, - nonlocal_init_tail_outer_iname: outer_local_tag, - nonlocal_init_tail_inner_iname: inner_local_tag}) + #nonlocal_init_head_outer_iname: slow_local_tag, + #nonlocal_init_head_inner_iname: fast_local_tag, + nonlocal_init_tail_outer_iname: slow_local_tag, + nonlocal_init_tail_inner_iname: fast_local_tag}) for nls_name in [nonlocal_storage_name, nonlocal_scan_storage_name]: if nls_name not in kernel.temporary_variables: @@ -726,15 +738,15 @@ def make_two_level_scan( if nonlocal_tag is not None: kernel = lp.tag_inames(kernel, {nonlocal_iname: nonlocal_tag}) - kernel = _add_scan_subdomain(kernel, outer_scan_iname, nonlocal_iname) + kernel = _add_scan_subdomain(kernel, slow_scan_iname, nonlocal_iname) nonlocal_scan = make_assignment( id=nonlocal_scan_insn_id, assignees=(var(nonlocal_scan_storage_name)[var(nonlocal_iname)],), expression=Reduction( scan.operation, - (outer_scan_iname,), - var(nonlocal_storage_name)[var(outer_scan_iname)]), + (slow_scan_iname,), + var(nonlocal_storage_name)[var(slow_scan_iname)]), within_inames=within_inames | frozenset([nonlocal_iname]), depends_on=( frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id]))) @@ -763,11 +775,11 @@ def make_two_level_scan( source=nonlocal_scan_insn_id, sink=insn_id, barrier_id=barrier_id)) updated_depends_on |= frozenset([barrier_id]) - nonlocal_part = var(nonlocal_scan_storage_name)[var(outer_sweep_iname)] + nonlocal_part = var(nonlocal_scan_storage_name)[var(slow_sweep_iname)] local_part = var(local_storage_name)[ pick_out_relevant_axes( - (var(outer_sweep_iname), var(inner_sweep_iname)), strip_scalar=True)] + (var(slow_sweep_iname), var(fast_sweep_iname)), strip_scalar=True)] updated_insn = insn.copy( no_sync_with=( -- GitLab From 8a92ee72581ffb3fb0f4ef2cb1a43b17784848b3 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 22 Oct 2017 01:18:44 -0500 Subject: [PATCH 3/5] WIP --- loopy/preprocess.py | 1 + loopy/target/ispc.py | 3 +- loopy/transform/reduction.py | 96 +++++++++++++++++++++++------------- 3 files changed, 65 insertions(+), 35 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ae70a0d6c..d561652d6 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1465,6 +1465,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): + print("sweep and scan inames", sweep_iname, scan_iname) scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 2b5068b46..dd2fe1fa5 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -334,9 +334,10 @@ class ISPCASTBuilder(CASTBuilder): result = [] from cgen import Statement as S, Block if lsize: + # FIXME: not sure result.append( S( - "assert(programCount == (%s))" + "assert(programCount >= (%s))" % ecm(lsize[0], PREC_NONE))) arg_names, arg_decls = self._arg_names_and_decls(codegen_state, extra_args) diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 339e9fc92..6ae2adb5e 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -206,8 +206,7 @@ def _add_scan_subdomain( [sweep_iname] -> {[scan_iname] : 0 <= scan_iname <= sweep_iname } """ sp = ( - isl.Space.set_alloc(isl.DEFAULT_CONTEXT, 1, 1) - .set_dim_name(isl.dim_type.param, 0, sweep_iname) + isl.Space.set_alloc(isl.DEFAULT_CONTEXT, 1, 1) .set_dim_name(isl.dim_type.param, 0, sweep_iname) .set_dim_name(isl.dim_type.set, 0, scan_iname)) affs = isl.affs_from_space(sp) @@ -302,15 +301,11 @@ def make_two_level_scan( nonlocal_scan_storage_name=None, nonlocal_storage_scope=None, nonlocal_tag=None, - # FIXME: Not sure if these two pairs are necessary. It would seem that - # only one is necessary. slow_local_tag=None, fast_local_tag=None, - slow_tag=None, - fast_tag=None, fast_sweep_iname=None, slow_sweep_iname=None, - inner_scan_uses_fast_axis=True): + local_scan_uses_fast_axis=True): """Two level scan, mediated through a "local" and "nonlocal" array. This turns a scan of the form:: @@ -325,6 +320,14 @@ def make_two_level_scan( [...,nl] [...,i',i''] result = nonlocal[i'] + local[i',i''] + *sweep_iname* will be split into *fast_sweep_iname* and *slow_sweep_iname*. + The names of *fast_sweep_iname* and *slow_sweep_iname* are supplied so that + they can be passed to *local_storage_axes* if needed. + + :arg nonlocal_storage_name: The nonlocal storage that is an input to the + nonlocal scan. + :arg nonlocal_scan_storage_name: The nonlocal storage that is an output of + the nonlocal scan. :arg local_storage_axes: A tuple of inames. For each iname, a corresponding axis will be added to the temporary array that does the local part of the scan (the "local" array). May be *None*, in which case it is @@ -452,8 +455,8 @@ def make_two_level_scan( auto_local_storage_axes = [ iname for iname, tag in [ - (slow_sweep_iname, slow_tag), - (fast_sweep_iname, fast_tag)] + (slow_sweep_iname, slow_local_tag), + (fast_sweep_iname, fast_local_tag)] # ">" is "more global" # In a way, global inames are automatically part of an access to a @@ -536,40 +539,39 @@ def make_two_level_scan( # FIXME: This can probably be done using split_reduction_inward() # and will end up looking like less of a mess that way. - if inner_scan_uses_fast_axis: + if local_scan_uses_fast_axis: subst_expr = var(slow_sweep_iname) * inner_length + var(fast_scan_iname) else: - subst_expr = var(fast_scan_iname) * inner_length + var(fast_sweep_iname) + subst_expr = var(slow_scan_iname) * inner_length + var(fast_sweep_iname) local_scan_expr = _expand_subst_within_expression(kernel, var(subst_name)(subst_expr)) kernel = lp.split_iname(kernel, sweep_iname, inner_length, inner_iname=fast_sweep_iname, outer_iname=slow_sweep_iname, - inner_tag=fast_tag, outer_tag=slow_tag) + inner_tag=fast_local_tag, outer_tag=slow_local_tag) from loopy.kernel.data import SubstitutionRule from loopy.symbolic import Reduction - local_reduction_iname = ( + local_scan_iname = ( fast_scan_iname - if inner_scan_uses_fast_axis + if local_scan_uses_fast_axis else slow_scan_iname) + local_subst_arguments = (slow_sweep_iname, fast_sweep_iname) + local_subst = SubstitutionRule( name=local_subst_name, - arguments=(slow_sweep_iname, fast_sweep_iname), + arguments=local_subst_arguments, expression=Reduction( - scan.operation, (local_reduction_iname,), local_scan_expr)) + scan.operation, (local_scan_iname,), local_scan_expr)) substitutions = kernel.substitutions.copy() substitutions[local_subst_name] = local_subst kernel = kernel.copy(substitutions=substitutions) - if inner_scan_uses_fast_axis: - slow_local_iname = slow_sweep_iname - all_precompute_inames = (slow_local_iname, fast_local_iname) precompute_inames = pick_out_relevant_axes(all_precompute_inames) @@ -615,12 +617,22 @@ def make_two_level_scan( kernel = _update_instructions(kernel, (compute_insn_with_deps,)) - kernel = _add_scan_subdomain(kernel, fast_scan_iname, fast_local_iname) + local_sweep_iname = ( + fast_local_iname + if local_scan_uses_fast_axis + else slow_local_iname) + + kernel = _add_scan_subdomain(kernel, local_scan_iname, local_sweep_iname) # }}} + nonlocal_sweep_iname = ( + slow_sweep_iname + if local_scan_uses_fast_axis + else fast_sweep_iname) + from loopy.kernel.data import ConcurrentTag - if not isinstance(kernel.iname_to_tag[slow_sweep_iname], ConcurrentTag): + if not isinstance(kernel.iname_to_tag[nonlocal_sweep_iname], ConcurrentTag): # FIXME raise NotImplementedError("outer iname must currently be concurrent because " "it occurs in the local scan and the final addition and one of " @@ -632,11 +644,14 @@ def make_two_level_scan( from loopy.isl_helpers import static_max_of_pw_aff from loopy.symbolic import pw_aff_to_expr + # FIXME: Not sure if this works. local_storage_local_axis_len = ( - kernel.temporary_variables[local_storage_name].shape[-1]) + kernel.temporary_variables[local_storage_name].shape[-1] + if local_scan_uses_fast_axis + else kernel.temporary_variables[local_storage_name].shape[0]) nonlocal_storage_len_pw_aff = static_max_of_pw_aff( - kernel.get_iname_bounds(slow_sweep_iname).size, + kernel.get_iname_bounds(nonlocal_sweep_iname).size, constants_only=False) # FIXME: this shouldn't have to have an extra element. @@ -660,11 +675,13 @@ def make_two_level_scan( kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_outer_subd) """ + """ kernel = lp.tag_inames(kernel, { #nonlocal_init_head_outer_iname: slow_local_tag, #nonlocal_init_head_inner_iname: fast_local_tag, - nonlocal_init_tail_outer_iname: slow_local_tag, - nonlocal_init_tail_inner_iname: fast_local_tag}) + nonlocal_init_tail_outer_iname: fast_local_tag, + nonlocal_init_tail_inner_iname: slow_local_tag}) + """ for nls_name in [nonlocal_storage_name, nonlocal_scan_storage_name]: if nls_name not in kernel.temporary_variables: @@ -699,17 +716,23 @@ def make_two_level_scan( var(nonlocal_init_tail_outer_iname).eq(0)), depends_on=frozenset([local_scan_dep_id])) + if local_scan_uses_fast_axis: + nonlocal_init_tail_index = ( + var(nonlocal_init_tail_outer_iname), + var(nonlocal_init_tail_inner_iname) + + local_storage_local_axis_len - 1) + else: + nonlocal_init_tail_index = ( + local_storage_local_axis_len - 1, + var(nonlocal_init_tail_outer_iname)) + nonlocal_init_tail = make_assignment( id=nonlocal_init_tail_insn_id, assignees=( var(nonlocal_storage_name)[ var(nonlocal_init_tail_outer_iname) + 1],), expression=var(local_storage_name)[ - pick_out_relevant_axes( - (var(nonlocal_init_tail_outer_iname), - var(nonlocal_init_tail_inner_iname) - + local_storage_local_axis_len - 1), - strip_scalar=True)], + pick_out_relevant_axes(nonlocal_init_tail_index, strip_scalar=True)], no_sync_with=frozenset([(nonlocal_init_head_insn_id, "any")]), within_inames=( within_inames | frozenset([nonlocal_init_tail_outer_iname, @@ -738,15 +761,20 @@ def make_two_level_scan( if nonlocal_tag is not None: kernel = lp.tag_inames(kernel, {nonlocal_iname: nonlocal_tag}) - kernel = _add_scan_subdomain(kernel, slow_scan_iname, nonlocal_iname) + nonlocal_scan_iname = ( + slow_scan_iname + if local_scan_uses_fast_axis + else fast_scan_iname) + + kernel = _add_scan_subdomain(kernel, nonlocal_scan_iname, nonlocal_iname) nonlocal_scan = make_assignment( id=nonlocal_scan_insn_id, assignees=(var(nonlocal_scan_storage_name)[var(nonlocal_iname)],), expression=Reduction( scan.operation, - (slow_scan_iname,), - var(nonlocal_storage_name)[var(slow_scan_iname)]), + (nonlocal_scan_iname,), + var(nonlocal_storage_name)[var(nonlocal_scan_iname)]), within_inames=within_inames | frozenset([nonlocal_iname]), depends_on=( frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id]))) @@ -775,7 +803,7 @@ def make_two_level_scan( source=nonlocal_scan_insn_id, sink=insn_id, barrier_id=barrier_id)) updated_depends_on |= frozenset([barrier_id]) - nonlocal_part = var(nonlocal_scan_storage_name)[var(slow_sweep_iname)] + nonlocal_part = var(nonlocal_scan_storage_name)[var(nonlocal_sweep_iname)] local_part = var(local_storage_name)[ pick_out_relevant_axes( -- GitLab From 86250a4f0e1709631e111489a3f49dd818d55fd5 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 22 Oct 2017 01:51:57 -0500 Subject: [PATCH 4/5] Add comments. --- loopy/transform/reduction.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 6ae2adb5e..58e9ab99c 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -206,7 +206,8 @@ def _add_scan_subdomain( [sweep_iname] -> {[scan_iname] : 0 <= scan_iname <= sweep_iname } """ sp = ( - isl.Space.set_alloc(isl.DEFAULT_CONTEXT, 1, 1) .set_dim_name(isl.dim_type.param, 0, sweep_iname) + isl.Space.set_alloc(isl.DEFAULT_CONTEXT, 1, 1) + .set_dim_name(isl.dim_type.param, 0, sweep_iname) .set_dim_name(isl.dim_type.set, 0, scan_iname)) affs = isl.affs_from_space(sp) @@ -644,7 +645,7 @@ def make_two_level_scan( from loopy.isl_helpers import static_max_of_pw_aff from loopy.symbolic import pw_aff_to_expr - # FIXME: Not sure if this works. + # FIXME: Not sure if this is the right thing to do. local_storage_local_axis_len = ( kernel.temporary_variables[local_storage_name].shape[-1] if local_scan_uses_fast_axis @@ -675,6 +676,9 @@ def make_two_level_scan( kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_outer_subd) """ + # FIXME: This was commented out so that the nonlocal init part is + # sequential, as a workaround for ISPC. This should just get its own + # parameter controlling the tag of the local-to-nonlocal transfer. """ kernel = lp.tag_inames(kernel, { #nonlocal_init_head_outer_iname: slow_local_tag, -- GitLab From 7ec2b00dd3c7131f036ca181f3d672bdda01b937 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 22 Oct 2017 02:05:45 -0500 Subject: [PATCH 5/5] Document local_scan_uses_fast_axis --- loopy/transform/reduction.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py index 58e9ab99c..be7f612ea 100644 --- a/loopy/transform/reduction.py +++ b/loopy/transform/reduction.py @@ -333,6 +333,11 @@ def make_two_level_scan( axis will be added to the temporary array that does the local part of the scan (the "local" array). May be *None*, in which case it is automatically inferred from the tags of the inames. + :arg local_scan_uses_fast_axis: Whether the local scan should be a scan over + the fast axis of the split iname or the slow axis. A local scan + over the fast axis is a "small" scan, and is typically implemented as a + local-parallel scan. A local scan over the slow axis is a "big" scan + and is typically implemented as a sequential scan. """ # TODO: Test that this works even when doing split scans in a loop -- GitLab