diff --git a/loopy/check.py b/loopy/check.py index 611043d53fc5a6b711f17795ffcbbefd0d94ed09..9a9ff1fd575883ad4ee212f9e17f7c0197781201 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -213,12 +213,15 @@ def check_for_write_races(kernel): def check_for_orphaned_user_hardware_axes(kernel): - from loopy.kernel.data import LocalIndexTag + from loopy.kernel.data import LocalIndexTag, check_iname_tags for axis in kernel.local_sizes: found = False - for tag in six.itervalues(kernel.iname_to_tag): - if isinstance(tag, LocalIndexTag) and tag.axis == axis: - found = True + for tags in six.itervalues(kernel.iname_to_tags): + for tag in tags: + if isinstance(tag, LocalIndexTag) and tag.axis == axis: + found = True + break + if found: break if not found: @@ -893,6 +896,8 @@ def check_implemented_domains(kernel, implemented_domains, code=None): from islpy import align_two + from loopy.kernel.data import check_iname_tags + last_idomains = None last_insn_inames = None @@ -928,9 +933,8 @@ def check_implemented_domains(kernel, implemented_domains, code=None): from loopy.kernel.data import LocalIndexTag if isinstance(insn, BarrierInstruction): # project out local-id-mapped inames, solves #94 on gitlab - non_lid_inames = frozenset( - [iname for iname in insn_inames if not isinstance( - kernel.iname_to_tag.get(iname), LocalIndexTag)]) + non_lid_inames = frozenset(iname for iname in insn_inames + if not check_iname_tags(kernel.iname_to_tags[iname], LocalIndexTag)) insn_impl_domain = insn_impl_domain.project_out_except( non_lid_inames, [dim_type.set]) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 89e832127327356f2c51c84ba0dbef8e2a980a80..01f8a82554595d4eda9dfa899fa26dc262294259 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -254,11 +254,16 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() - tag = kernel.iname_to_tag.get(iname) + tags = kernel.iname_to_tags[iname] from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex - assert isinstance(tag, UniqueTag) + assert check_iname_tags(tags, UniqueTag) + + if len(tags) > 1: + raise LoopyError("cannot have more than one UniqueTag") + + tag, = tags if isinstance(tag, GroupIndexTag): hw_axis_expr = GroupHardwareAxisIndex(tag.axis) elif isinstance(tag, LocalIndexTag): @@ -267,10 +272,10 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, raise RuntimeError("unexpected hw tag type") other_inames_with_same_tag = [ - other_iname for other_iname in kernel.all_inames() - if isinstance(kernel.iname_to_tag.get(other_iname), UniqueTag) - and kernel.iname_to_tag.get(other_iname).key == tag.key - and other_iname != iname] + other_iname for other_iname in kernel.all_inames() + if check_iname_tags(kernel.iname_to_tags[other_iname], UniqueTag) + and any(_tag.key == tag.key for _tag in kernel.iname_to_tags[other_iname]) + and other_iname != iname] # {{{ 'implement' hardware axis boundaries diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index c1d17371fd57f606d9bb3f758d7fe865585c83c9..7282542dccd8c70b8f0dd90f9c66f9f0dfb3992c 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -632,7 +632,7 @@ def is_domain_dependent_on_inames(kernel, domain_index, inames): # {{{ rank inames by stride def get_auto_axis_iname_ranking_by_stride(kernel, insn): - from loopy.kernel.data import ImageArg, ValueArg + from loopy.kernel.data import ImageArg, ValueArg, check_iname_tags approximate_arg_values = {} for arg in kernel.args: @@ -677,10 +677,8 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): from loopy.kernel.data import AutoLocalIndexTagBase auto_axis_inames = set( - iname - for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), - AutoLocalIndexTagBase)) + iname for iname in kernel.insn_inames(insn) + if check_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase)) # }}} @@ -752,8 +750,11 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): def assign_automatic_axes(kernel, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) + # TODO: do the tag removal rigorously, might be easier after switching + # to set() from tuple() - from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag) + from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag, + check_iname_tags, get_iname_tags) # Realize that at this point in time, axis lengths are already # fixed. So we compute them once and pass them to our recursive @@ -777,10 +778,10 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): except isl.Error: # Likely unbounded, automatic assignment is not # going to happen for this iname. - new_iname_to_tag = kernel.iname_to_tag.copy() - new_iname_to_tag[iname] = None + new_iname_to_tags = kernel.iname_to_tags.copy() + new_iname_to_tags[iname] = tuple() return assign_automatic_axes( - kernel.copy(iname_to_tag=new_iname_to_tag), + kernel.copy(iname_to_tags=new_iname_to_tags), axis=recursion_axis) if axis is None: @@ -816,9 +817,9 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): # }}} if axis is None: - new_tag = None + new_tag = tuple() else: - new_tag = LocalIndexTag(axis) + new_tag = (LocalIndexTag(axis),) if desired_length > local_size[axis]: from loopy import split_iname @@ -831,12 +832,12 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): do_tagged_check=False), axis=recursion_axis, local_size=local_size) - if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase): + if not check_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase): raise LoopyError("trying to reassign '%s'" % iname) - new_iname_to_tag = kernel.iname_to_tag.copy() - new_iname_to_tag[iname] = new_tag - return assign_automatic_axes(kernel.copy(iname_to_tag=new_iname_to_tag), + new_iname_to_tags = kernel.iname_to_tags.copy() + new_iname_to_tags[iname] = new_tag + return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), axis=recursion_axis, local_size=local_size) # }}} @@ -853,10 +854,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): continue auto_axis_inames = [ - iname - for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), - AutoLocalIndexTagBase)] + iname for iname in kernel.insn_inames(insn) + if check_iname_tags(kernel.iname_to_tags[iname], AutoLocalIndexTagBase)] if not auto_axis_inames: continue @@ -864,8 +863,11 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): assigned_local_axes = set() for iname in kernel.insn_inames(insn): - tag = kernel.iname_to_tag.get(iname) - if isinstance(tag, LocalIndexTag): + tags = get(kernel.iname_to_tags[iname], LocalIndexTag) + if tags: + if len(tags) > 1: + raise LoopyError("cannot have more than one LocalIndexTags") + tag, = tags assigned_local_axes.add(tag.axis) if axis < len(local_size): @@ -875,8 +877,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): iname_ranking = get_auto_axis_iname_ranking_by_stride(kernel, insn) if iname_ranking is not None: for iname in iname_ranking: - prev_tag = kernel.iname_to_tag.get(iname) - if isinstance(prev_tag, AutoLocalIndexTagBase): + prev_tags = kernel.iname_to_tags[iname] + if check_iname_tags(prev_tags, AutoLocalIndexTagBase): return assign_axis(axis, iname, axis) else: diff --git a/loopy/preprocess.py b/loopy/preprocess.py index bd0a7c8ddbdc35bf39ba4f8700f8404132aa0f71..b20fbef91168ab4a5580b031d1cab4403bfec7c1 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -291,20 +291,20 @@ def _classify_reduction_inames(kernel, inames): from loopy.kernel.data import ( LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, - ConcurrentTag) + ConcurrentTag, check_iname_tags) for iname in inames: - iname_tag = kernel.iname_to_tag.get(iname) + iname_tags = kernel.iname_to_tags[iname] - if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): + if check_iname_tags(iname_tags, (UnrollTag, UnrolledIlpTag)): # These are nominally parallel, but we can live with # them as sequential. sequential.append(iname) - elif isinstance(iname_tag, LocalIndexTagBase): + elif check_iname_tags(iname_tags, LocalIndexTagBase): local_par.append(iname) - elif isinstance(iname_tag, (ConcurrentTag, VectorizeTag)): + elif check_iname_tags(iname_tags, (ConcurrentTag, VectorizeTag)): nonlocal_par.append(iname) else: @@ -912,6 +912,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, outer (sweep) iname. """ + # TODO: reassigning tags needs some thinking here + logger.debug("%s: realize reduction" % kernel.name) new_insns = [] @@ -1134,13 +1136,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, outer_insn_inames = temp_kernel.insn_inames(insn) - from loopy.kernel.data import LocalIndexTagBase - outer_local_inames = tuple( - oiname - for oiname in outer_insn_inames - if isinstance( - kernel.iname_to_tag.get(oiname), - LocalIndexTagBase)) + from loopy.kernel.data import LocalIndexTagBase, check_iname_tags + outer_local_inames = tuple(oiname for oiname in outer_insn_inames + if check_iname_tags(kernel.iname_to_tags[oiname], LocalIndexTagBase)) from pymbolic import var outer_local_iname_vars = tuple( @@ -1175,7 +1173,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_exec_iname = var_name_gen("red_"+red_iname) domains.append(_make_slab_set(base_exec_iname, size)) - new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname] + new_iname_tags[base_exec_iname] = kernel.iname_to_tags[red_iname] # }}} @@ -1270,7 +1268,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage)) domains.append(_make_slab_set(stage_exec_iname, bound-new_size)) - new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname] + new_iname_tags[stage_exec_iname] = kernel.iname_to_tags[red_iname] stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) stage_insn = make_assignment( @@ -1473,13 +1471,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, outer_insn_inames = temp_kernel.insn_inames(insn) - from loopy.kernel.data import LocalIndexTagBase - outer_local_inames = tuple( - oiname - for oiname in outer_insn_inames - if isinstance( - kernel.iname_to_tag.get(oiname), - LocalIndexTagBase) + from loopy.kernel.data import LocalIndexTagBase, check_iname_tags + outer_local_inames = tuple(oiname for oiname in outer_insn_inames + if check_iname_tags(kernel.iname_to_tags[oiname], LocalIndexTagBase) and oiname != sweep_iname) from pymbolic import var @@ -1505,7 +1499,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_exec_iname = var_name_gen(sweep_iname + "__scan") domains.append(_make_slab_set(base_exec_iname, scan_size)) - new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname] + new_iname_tags[base_exec_iname] = kernel.iname_to_tags[sweep_iname] # }}} @@ -1596,7 +1590,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) domains.append( _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) - new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] + new_iname_tags[stage_exec_iname] = kernel.iname_to_tags[sweep_iname] for read_var, acc_var in zip(read_vars, acc_vars): read_stage_id = insn_id_gen( @@ -1746,7 +1740,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, "by reductions is 'local'--found iname(s) '%s' " "respectively tagged '%s'" % (", ".join(bad_inames), - ", ".join(kernel.iname_to_tag[iname] + ", ".join(tag.key for tag in kernel.iname_to_tags[iname] for iname in bad_inames))) if n_local_par == 0 and n_sequential == 0: @@ -1784,7 +1778,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _error_if_force_scan_on(LoopyError, "Sweep iname '%s' has an unsupported parallel tag '%s' " "- the only parallelism allowed is 'local'." % - (sweep_iname, temp_kernel.iname_to_tag[sweep_iname])) + (sweep_iname, + ", ".join(tag.key + for tag in temp_kernel.iname_to_tags[sweep_iname]))) elif parallel: return map_scan_local( expr, rec, nresults, arg_dtypes, reduction_dtypes, diff --git a/loopy/statistics.py b/loopy/statistics.py index 6f4cc78b711196fbff2cff60920d178dd357a101..77c638128fe3dab63ee79f72bd14d70e0ca866bf 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -917,18 +917,22 @@ class GlobalMemAccessCounter(MemAccessCounter): index = (index,) from loopy.symbolic import get_dependencies - from loopy.kernel.data import LocalIndexTag, GroupIndexTag + from loopy.kernel.data import LocalIndexTag, GroupIndexTag, get_iname_tags + my_inames = get_dependencies(index) & self.knl.all_inames() # find all local and global index tags and corresponding inames lid_to_iname = {} gid_to_iname = {} for iname in my_inames: - tag = self.knl.iname_to_tag.get(iname) - if isinstance(tag, LocalIndexTag): - lid_to_iname[tag.axis] = iname - elif isinstance(tag, GroupIndexTag): - gid_to_iname[tag.axis] = iname + tags = get_iname_tags(self.knl.iname_to_tags[iname], + (GroupIndexTag, LocalIndexTag)) + if tags: + tag, = get_iname_tags(tags, (GroupIndexTag, LocalIndexTag), 1) + if isinstance(tag, LocalIndexTag): + lid_to_iname[tag.axis] = iname + else: + gid_to_iname[tag.axis] = iname # create lid_strides and gid_strides dicts @@ -1177,14 +1181,18 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): g_used = set() l_used = set() - from loopy.kernel.data import LocalIndexTag, GroupIndexTag + from loopy.kernel.data import (LocalIndexTag, GroupIndexTag, + get_iname_tags, check_iname_tags) for iname in knl.insn_inames(insn): - tag = knl.iname_to_tag.get(iname) - - if isinstance(tag, LocalIndexTag): - l_used.add(tag.axis) - elif isinstance(tag, GroupIndexTag): - g_used.add(tag.axis) + tags = get_iname_tags(knl.iname_to_tags[iname], (LocalIndexTag, GroupIndexTag)) + if tags: + if len(tags) > 1: + raise LoopyError("cannot have more than one UniqueTags") + tag, = tags + if isinstance(tag, LocalIndexTag): + l_used.add(tag.axis) + elif isinstance(tag, GroupIndexTag): + g_used.add(tag.axis) def mult_grid_factor(used_axes, size): result = 1 @@ -1213,9 +1221,9 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) insn_inames = knl.insn_inames(insn) if disregard_local_axes: - from loopy.kernel.data import LocalIndexTag + from loopy.kernel.data import LocalIndexTag, check_iname_tags insn_inames = [iname for iname in insn_inames if not - isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] + check_iname_tags(kernel.iname_to_tags[iname], LocalIndexTag)] inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except(