diff --git a/loopy/__init__.py b/loopy/__init__.py index 6e221b24c4f82e2f3c172783af5a26024228648e..54c3523d5107d5a8516e1cf7cf7a6bbceef1b991 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -68,7 +68,7 @@ from loopy.library.reduction import register_reduction_parser from loopy.version import VERSION, MOST_RECENT_LANGUAGE_VERSION from loopy.transform.iname import ( - set_loop_priority, prioritize_loops, + set_loop_priority, prioritize_loops, untag_inames, split_iname, chunk_iname, join_inames, tag_inames, duplicate_inames, rename_iname, remove_unused_inames, split_reduction_inward, split_reduction_outward, @@ -177,7 +177,7 @@ __all__ = [ # {{{ transforms - "set_loop_priority", "prioritize_loops", + "set_loop_priority", "prioritize_loops", "untag_inames", "split_iname", "chunk_iname", "join_inames", "tag_inames", "duplicate_inames", "rename_iname", "remove_unused_inames", diff --git a/loopy/check.py b/loopy/check.py index 146391bf2533e35e7bc2f2091c9968fb5b321b6f..17b1186abfd1da7b0154d39869626c114b92cceb 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -113,14 +113,28 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def check_multiple_tags_allowed(kernel): + from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag, + UnrollTag, ForceSequentialTag, IlpBaseTag, filter_iname_tags_by_type) + illegal_combinations = [ + (GroupIndexTag, LocalIndexTag, VectorizeTag, UnrollTag, ForceSequentialTag), + (IlpBaseTag, ForceSequentialTag) + ] + for iname, tags in six.iteritems(kernel.iname_to_tags): + for comb in illegal_combinations: + if len(filter_iname_tags_by_type(tags, comb)) > 1: + raise LoopyError("iname {0} has illegal combination of " + "tags: {1}".format(iname, tags)) + + def check_for_double_use_of_hw_axes(kernel): - from loopy.kernel.data import UniqueTag + from loopy.kernel.data import UniqueTag, filter_iname_tags_by_type for insn in kernel.instructions: insn_tag_keys = set() 
for iname in kernel.insn_inames(insn): - tag = kernel.iname_to_tag.get(iname) - if isinstance(tag, UniqueTag): + tags = kernel.iname_to_tags[iname] + for tag in filter_iname_tags_by_type(tags, UniqueTag): key = tag.key if key in insn_tag_keys: raise LoopyError("instruction '%s' has multiple " @@ -168,9 +182,8 @@ def _is_racing_iname_tag(tv, tag): def check_for_write_races(kernel): - from loopy.kernel.data import ConcurrentTag + from loopy.kernel.data import ConcurrentTag, filter_iname_tags_by_type - iname_to_tag = kernel.iname_to_tag.get for insn in kernel.instructions: for assignee_name, assignee_indices in zip( insn.assignee_var_names(), @@ -187,16 +200,16 @@ def check_for_write_races(kernel): # will cause write races. raceable_parallel_insn_inames = set( - iname - for iname in kernel.insn_inames(insn) - if isinstance(iname_to_tag(iname), ConcurrentTag)) + iname for iname in kernel.insn_inames(insn) + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], + ConcurrentTag)) elif assignee_name in kernel.temporary_variables: temp_var = kernel.temporary_variables[assignee_name] raceable_parallel_insn_inames = set( - iname - for iname in kernel.insn_inames(insn) - if _is_racing_iname_tag(temp_var, iname_to_tag(iname))) + iname for iname in kernel.insn_inames(insn) + if any(_is_racing_iname_tag(temp_var, tag) + for tag in kernel.iname_to_tags[iname])) else: raise LoopyError("invalid assignee name in instruction '%s'" @@ -218,9 +231,12 @@ def check_for_orphaned_user_hardware_axes(kernel): from loopy.kernel.data import LocalIndexTag for axis in kernel.local_sizes: found = False - for tag in six.itervalues(kernel.iname_to_tag): - if isinstance(tag, LocalIndexTag) and tag.axis == axis: - found = True + for tags in six.itervalues(kernel.iname_to_tags): + for tag in tags: + if isinstance(tag, LocalIndexTag) and tag.axis == axis: + found = True + break + if found: break if not found: @@ -229,13 +245,12 @@ def check_for_orphaned_user_hardware_axes(kernel): def 
check_for_data_dependent_parallel_bounds(kernel): - from loopy.kernel.data import ConcurrentTag + from loopy.kernel.data import ConcurrentTag, filter_iname_tags_by_type for i, dom in enumerate(kernel.domains): dom_inames = set(dom.get_var_names(dim_type.set)) - par_inames = set(iname - for iname in dom_inames - if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag)) + par_inames = set(iname for iname in dom_inames + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], ConcurrentTag)) if not par_inames: continue @@ -600,6 +615,7 @@ def pre_schedule_checks(kernel): check_for_double_use_of_hw_axes(kernel) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) + check_multiple_tags_allowed(kernel) check_for_inactive_iname_access(kernel) check_for_write_races(kernel) check_for_data_dependent_parallel_bounds(kernel) @@ -650,7 +666,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): # alternative: just disregard length-1 dimensions? - from loopy.kernel.data import LocalIndexTag, AutoLocalIndexTagBase, GroupIndexTag + from loopy.kernel.data import (LocalIndexTag, AutoLocalIndexTagBase, + GroupIndexTag, filter_iname_tags_by_type) while i < loop_end_i: sched_item = kernel.schedule[i] @@ -668,13 +685,15 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): local_axes_used = set() for iname in kernel.insn_inames(insn): - tag = kernel.iname_to_tag.get(iname) + tags = kernel.iname_to_tags[iname] - if isinstance(tag, LocalIndexTag): + if filter_iname_tags_by_type(tags, LocalIndexTag): + tag, = filter_iname_tags_by_type(tags, LocalIndexTag, 1) local_axes_used.add(tag.axis) - elif isinstance(tag, GroupIndexTag): + elif filter_iname_tags_by_type(tags, GroupIndexTag): + tag, = filter_iname_tags_by_type(tags, GroupIndexTag, 1) group_axes_used.add(tag.axis) - elif isinstance(tag, AutoLocalIndexTagBase): + elif filter_iname_tags_by_type(tags, AutoLocalIndexTagBase): raise LoopyError("auto local tag encountered") 
if group_axes != group_axes_used: @@ -919,12 +938,12 @@ def check_implemented_domains(kernel, implemented_domains, code=None): .project_out_except(insn_inames, [dim_type.set])) from loopy.kernel.instruction import BarrierInstruction - from loopy.kernel.data import LocalIndexTag + from loopy.kernel.data import LocalIndexTag, filter_iname_tags_by_type if isinstance(insn, BarrierInstruction): # project out local-id-mapped inames, solves #94 on gitlab - non_lid_inames = frozenset( - [iname for iname in insn_inames if not isinstance( - kernel.iname_to_tag.get(iname), LocalIndexTag)]) + non_lid_inames = frozenset(iname for iname in insn_inames + if not filter_iname_tags_by_type( + kernel.iname_to_tags[iname], LocalIndexTag)) insn_impl_domain = insn_impl_domain.project_out_except( non_lid_inames, [dim_type.set]) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index f398a063dc41f3f82267f6d4850158e4c45f4733..a6b70359af614525255388ab8536e5fb903483e4 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -58,7 +58,8 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai def get_usable_inames_for_conditional(kernel, sched_index): from loopy.schedule import ( find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) - from loopy.kernel.data import ConcurrentTag, LocalIndexTagBase, IlpBaseTag + from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, + IlpBaseTag, filter_iname_tags_by_type) result = find_active_inames_at(kernel, sched_index) crosses_barrier = has_barrier_within(kernel, sched_index) @@ -87,7 +88,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): for iname in kernel.insn_inames(insn)) for iname in inames_for_subkernel: - tag = kernel.iname_to_tag.get(iname) + tags = kernel.iname_to_tags[iname] # Parallel inames are defined within a subkernel, BUT: # @@ -97,10 +98,11 @@ def get_usable_inames_for_conditional(kernel, sched_index): # at the innermost level of nesting. 
if ( - isinstance(tag, ConcurrentTag) - and not (isinstance(tag, LocalIndexTagBase) and crosses_barrier) - and not isinstance(tag, IlpBaseTag) - ): + filter_iname_tags_by_type(tags, ConcurrentTag) + and not (filter_iname_tags_by_type(tags, LocalIndexTagBase) + and crosses_barrier) + and not filter_iname_tags_by_type(tags, IlpBaseTag) + ): result.add(iname) return frozenset(result) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e3e209726879741c31d686f2a6530e1b7ec67b97..fcf8ea3b4c4a3c6140612c5e2d943d4bceaca9d1 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -40,17 +40,18 @@ def get_admissible_conditional_inames_for(codegen_state, sched_index): kernel = codegen_state.kernel - from loopy.kernel.data import LocalIndexTag, HardwareConcurrentTag + from loopy.kernel.data import (LocalIndexTag, HardwareConcurrentTag, + filter_iname_tags_by_type) from loopy.schedule import find_active_inames_at, has_barrier_within result = find_active_inames_at(kernel, sched_index) has_barrier = has_barrier_within(kernel, sched_index) - for iname, tag in six.iteritems(kernel.iname_to_tag): - if (isinstance(tag, HardwareConcurrentTag) + for iname, tags in six.iteritems(kernel.iname_to_tags): + if (filter_iname_tags_by_type(tags, HardwareConcurrentTag) and codegen_state.is_generating_device_code): - if not has_barrier or not isinstance(tag, LocalIndexTag): + if not has_barrier or not filter_iname_tags_by_type(tags, LocalIndexTag): result.add(iname) return frozenset(result) @@ -127,25 +128,28 @@ def generate_code_for_sched_index(codegen_state, sched_index): ]) elif isinstance(sched_item, EnterLoop): - tag = kernel.iname_to_tag.get(sched_item.iname) + tags = kernel.iname_to_tags[sched_item.iname] + tags = tuple(tag for tag in tags if tag) from loopy.codegen.loop import ( generate_unroll_loop, generate_vectorize_loop, generate_sequential_loop_dim_code) - from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag, - LoopedIlpTag, 
VectorizeTag, InOrderSequentialSequentialTag) - if isinstance(tag, (UnrollTag, UnrolledIlpTag)): + from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, + ForceSequentialTag, LoopedIlpTag, VectorizeTag, + InOrderSequentialSequentialTag, filter_iname_tags_by_type) + if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)): func = generate_unroll_loop - elif isinstance(tag, VectorizeTag): + elif filter_iname_tags_by_type(tags, VectorizeTag): func = generate_vectorize_loop - elif tag is None or isinstance(tag, ( - LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)): + elif len(tags) == 0 or filter_iname_tags_by_type(tags, (LoopedIlpTag, + ForceSequentialTag, InOrderSequentialSequentialTag)): func = generate_sequential_loop_dim_code else: raise RuntimeError("encountered (invalid) EnterLoop " - "for '%s', tagged '%s'" % (sched_item.iname, tag)) + "for '%s', tagged '%s'" + % (sched_item.iname, ", ".join(str(tag) for tag in tags))) return func(codegen_state, sched_index) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 1db7b0445efd2a2e27e761164fa919647df37a07..7b44fd7b27d0d1191778d7a01aa34904a013f808 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -230,8 +230,8 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, hw_inames_left=None): kernel = codegen_state.kernel - from loopy.kernel.data import ( - UniqueTag, HardwareConcurrentTag, LocalIndexTag, GroupIndexTag) + from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag, + LocalIndexTag, GroupIndexTag, filter_iname_tags_by_type) from loopy.schedule import get_insn_ids_for_block_at insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) @@ -241,9 +241,9 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, for insn_id in insn_ids_for_block: all_inames_by_insns |= kernel.insn_inames(insn_id) - hw_inames_left = [iname - for iname in all_inames_by_insns - if 
isinstance(kernel.iname_to_tag.get(iname), HardwareConcurrentTag)] + hw_inames_left = [iname for iname in all_inames_by_insns + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], + HardwareConcurrentTag)] if not hw_inames_left: return next_func(codegen_state) @@ -254,11 +254,12 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() - tag = kernel.iname_to_tag.get(iname) + tags = kernel.iname_to_tags[iname] from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex - assert isinstance(tag, UniqueTag) + tag, = filter_iname_tags_by_type(tags, UniqueTag, max_num=1, min_num=1) + if isinstance(tag, GroupIndexTag): hw_axis_expr = GroupHardwareAxisIndex(tag.axis) elif isinstance(tag, LocalIndexTag): @@ -267,10 +268,11 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, raise RuntimeError("unexpected hw tag type") other_inames_with_same_tag = [ - other_iname for other_iname in kernel.all_inames() - if isinstance(kernel.iname_to_tag.get(other_iname), UniqueTag) - and kernel.iname_to_tag.get(other_iname).key == tag.key - and other_iname != iname] + other_iname for other_iname in kernel.all_inames() + if (filter_iname_tags_by_type(kernel.iname_to_tags[other_iname], UniqueTag) + and other_iname != iname + and any(_tag.key == tag.key + for _tag in kernel.iname_to_tags[other_iname] if _tag))] # {{{ 'implement' hardware axis boundaries diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 72d3a7dba117e1f005a887d9a6c9a04bb7a37588..429961a71449e7b92cf211ce9ab14a68e21f42e9 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -27,6 +27,8 @@ THE SOFTWARE. 
import six from six.moves import range, zip, intern +from collections import defaultdict + import numpy as np from pytools import ImmutableRecordWithoutPickling, ImmutableRecord, memoize_method import islpy as isl @@ -42,6 +44,7 @@ from loopy.library.function import ( from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted from loopy.diagnostic import StaticValueFindingError +from loopy.kernel.data import filter_iname_tags_by_type # {{{ unique var names @@ -137,10 +140,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): :class:`loopy.TemporaryVariable` instances. - .. attribute:: iname_to_tag + .. attribute:: iname_to_tags A :class:`dict` mapping inames (as strings) - to instances of :class:`loopy.kernel.data.IndexTag`. + to a set of instances of :class:`loopy.kernel.data.IndexTag`. + .. versionadded:: 2018.1 .. attribute:: function_manglers .. attribute:: symbol_manglers @@ -194,7 +198,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): assumptions=None, local_sizes={}, temporary_variables={}, - iname_to_tag={}, + iname_to_tags=defaultdict(set), substitutions={}, function_manglers=[ default_function_mangler, @@ -280,7 +284,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): silenced_warnings=silenced_warnings, temporary_variables=temporary_variables, local_sizes=local_sizes, - iname_to_tag=iname_to_tag, + iname_to_tags=iname_to_tags, substitutions=substitutions, cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, @@ -298,6 +302,24 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} + # {{{ compatibility wrapper for iname_to_tag.get("iname") + + @property + def iname_to_tag(self): + from warnings import warn + warn("Since version 2018.1, inames can hold multiple tags. Use " + "iname_to_tags['iname'] instead.
iname_to_tag.get('iname') will be " + "deprecated at version 2019.0.", DeprecationWarning, stacklevel=2) + for iname, tags in six.iteritems(self.iname_to_tags): + if len(tags) > 1: + raise LoopyError( + "iname {0} has multiple tags: {1}. " + "Use iname_to_tags['iname'] instead.".format(iname, tags)) + return dict((k, next(iter(v))) + for k, v in six.iteritems(self.iname_to_tags) if v) + + # }}} + # {{{ function mangling def mangle_function(self, identifier, arg_dtypes, ast_builder=None): @@ -703,15 +725,16 @@ class LoopKernel(ImmutableRecordWithoutPickling): the other inames as well.) """ - tag_key_uses = {} + tag_key_uses = defaultdict(list) from loopy.kernel.data import HardwareConcurrentTag for iname in cond_inames: - tag = self.iname_to_tag.get(iname) - - if isinstance(tag, HardwareConcurrentTag): - tag_key_uses.setdefault(tag.key, []).append(iname) + tags = filter_iname_tags_by_type(self.iname_to_tags[iname], + HardwareConcurrentTag, 1) + if tags: + tag, = tags + tag_key_uses[tag.key].append(iname) multi_use_keys = set( key for key, user_inames in six.iteritems(tag_key_uses) @@ -719,9 +742,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): multi_use_inames = set() for iname in cond_inames: - tag = self.iname_to_tag.get(iname) - if isinstance(tag, HardwareConcurrentTag) and tag.key in multi_use_keys: - multi_use_inames.add(iname) + tags = filter_iname_tags_by_type(self.iname_to_tags[iname], + HardwareConcurrentTag) + if tags: + tag, = filter_iname_tags_by_type(tags, HardwareConcurrentTag, 1) + if tag.key in multi_use_keys: + multi_use_inames.add(iname) return frozenset(cond_inames - multi_use_inames) @@ -951,21 +977,21 @@ class LoopKernel(ImmutableRecordWithoutPickling): AutoLocalIndexTagBase) for iname in all_inames_by_insns: - tag = self.iname_to_tag.get(iname) + tags = self.iname_to_tags[iname] - if isinstance(tag, GroupIndexTag): + if filter_iname_tags_by_type(tags, GroupIndexTag): tgt_dict = global_sizes - elif isinstance(tag, LocalIndexTag): + elif
filter_iname_tags_by_type(tags, LocalIndexTag): tgt_dict = local_sizes - elif isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto: + elif (filter_iname_tags_by_type(tags, AutoLocalIndexTagBase) + and not ignore_auto): raise RuntimeError("cannot find grid sizes if automatic " "local index tags are present") else: - tgt_dict = None - - if tgt_dict is None: continue + tag, = filter_iname_tags_by_type(tags, (GroupIndexTag, LocalIndexTag), 1) + size = self.get_iname_bounds(iname).size if tag.axis in tgt_dict: @@ -1171,7 +1197,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): if show_labels: lines.append("INAME IMPLEMENTATION TAGS:") for iname in natsorted(kernel.all_inames()): - line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname)) + if not kernel.iname_to_tags[iname]: + tags = "None" + else: + tags = ", ".join(str(tag) for tag in kernel.iname_to_tags[iname]) + line = "%s: %s" % (iname, tags) lines.append(line) if "variables" in what and kernel.temporary_variables: @@ -1349,7 +1379,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "assumptions", "local_sizes", "temporary_variables", - "iname_to_tag", + "iname_to_tags", "substitutions", "iname_slab_increments", "loop_priority", diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index c90e8a64b6f47a87e87c5e64d2ef930232d34894..35a8e3b1d8f02ad5e260af8f42f5b7ba9815d6c3 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -54,11 +54,38 @@ class auto(object): # noqa # {{{ iname tags + +def filter_iname_tags_by_type(tags, tag_type, max_num=None, min_num=None): + """Return a subset of *tags* that matches type *tag_type*. Raises exception + if the number of tags found is greater than *max_num* or less than + *min_num*. + + :arg tags: An iterable of tags. + :arg tag_type: a :class:`~loopy.kernel.data.IndexTag` subclass or tuple thereof. + :arg max_num: the maximum number of tags expected to be found. + :arg min_num: the minimum number of tags expected to be found.
+ """ + + result = set(tag for tag in tags if isinstance(tag, tag_type)) + if max_num is not None: + if len(result) > max_num: + raise LoopyError("cannot have more than {0} tags " + "of type(s): {1}".format(max_num, tag_type)) + if min_num is not None: + if len(result) < min_num: + raise LoopyError("must have at least {0} tags " + "of type(s): {1}".format(min_num, tag_type)) + return result + + class IndexTag(ImmutableRecord): __slots__ = [] def __hash__(self): - raise RuntimeError("use .key to hash index tags") + return hash(self.key) + + def __lt__(self, other): + return self.__hash__() < other.__hash__() def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index ec26916f35c2ec67fb43185ed5cbc911de271869..3f8d118c41780f1545f91e16ef6524f62ee03a16 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -36,6 +36,7 @@ from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg from loopy.tools import natsorted +from loopy.kernel.data import filter_iname_tags_by_type import logging logger = logging.getLogger(__name__) @@ -631,7 +632,7 @@ def is_domain_dependent_on_inames(kernel, domain_index, inames): # {{{ rank inames by stride def get_auto_axis_iname_ranking_by_stride(kernel, insn): - from loopy.kernel.data import ImageArg, ValueArg + from loopy.kernel.data import ImageArg, ValueArg, filter_iname_tags_by_type approximate_arg_values = {} for arg in kernel.args: @@ -676,10 +677,9 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): from loopy.kernel.data import AutoLocalIndexTagBase auto_axis_inames = set( - iname - for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), - AutoLocalIndexTagBase)) + iname for iname in kernel.insn_inames(insn) + if filter_iname_tags_by_type( + kernel.iname_to_tags[iname], AutoLocalIndexTagBase)) # }}} @@ -751,8 +751,11 @@ def
get_auto_axis_iname_ranking_by_stride(kernel, insn): def assign_automatic_axes(kernel, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) + # TODO: do the tag removal rigorously, might be easier after switching + # to set() from tuple() - from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag) + from loopy.kernel.data import (AutoLocalIndexTagBase, LocalIndexTag, + filter_iname_tags_by_type) # Realize that at this point in time, axis lengths are already # fixed. So we compute them once and pass them to our recursive @@ -776,10 +779,10 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): except isl.Error: # Likely unbounded, automatic assignment is not # going to happen for this iname. - new_iname_to_tag = kernel.iname_to_tag.copy() - new_iname_to_tag[iname] = None + new_iname_to_tags = kernel.iname_to_tags.copy() + new_iname_to_tags[iname] = set() return assign_automatic_axes( - kernel.copy(iname_to_tag=new_iname_to_tag), + kernel.copy(iname_to_tags=new_iname_to_tags), axis=recursion_axis) if axis is None: @@ -819,23 +822,30 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: - from loopy import split_iname + from loopy import split_iname, untag_inames # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. 
return assign_automatic_axes( - split_iname(kernel, iname, inner_length=local_size[axis], + split_iname( + untag_inames(kernel, iname, AutoLocalIndexTagBase), + iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), axis=recursion_axis, local_size=local_size) - if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase): + if not filter_iname_tags_by_type(kernel.iname_to_tags[iname], + AutoLocalIndexTagBase): raise LoopyError("trying to reassign '%s'" % iname) - new_iname_to_tag = kernel.iname_to_tag.copy() - new_iname_to_tag[iname] = new_tag - return assign_automatic_axes(kernel.copy(iname_to_tag=new_iname_to_tag), + if new_tag: + new_tag = set([new_tag]) + else: + new_tag = set() + new_iname_to_tags = kernel.iname_to_tags.copy() + new_iname_to_tags[iname] = new_tag + return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), axis=recursion_axis, local_size=local_size) # }}} @@ -852,10 +862,9 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): continue auto_axis_inames = [ - iname - for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), - AutoLocalIndexTagBase)] + iname for iname in kernel.insn_inames(insn) + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], + AutoLocalIndexTagBase)] if not auto_axis_inames: continue @@ -863,8 +872,12 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): assigned_local_axes = set() for iname in kernel.insn_inames(insn): - tag = kernel.iname_to_tag.get(iname) - if isinstance(tag, LocalIndexTag): + tags = filter_iname_tags_by_type( + kernel.iname_to_tags[iname], LocalIndexTag) + if tags: + if len(tags) > 1: + raise LoopyError("cannot have more than one LocalIndexTags") + tag, = tags assigned_local_axes.add(tag.axis) if axis < len(local_size): @@ -874,8 +887,9 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): iname_ranking = get_auto_axis_iname_ranking_by_stride(kernel, insn) if iname_ranking 
is not None: for iname in iname_ranking: - prev_tag = kernel.iname_to_tag.get(iname) - if isinstance(prev_tag, AutoLocalIndexTagBase): + prev_tags = kernel.iname_to_tags[iname] + if filter_iname_tags_by_type( + prev_tags, AutoLocalIndexTagBase): return assign_axis(axis, iname, axis) else: @@ -1129,9 +1143,9 @@ def get_visual_iname_order_embedding(kernel): from loopy.kernel.data import IlpBaseTag # Ignore ILP tagged inames, since they do not have to form a strict loop # nest. - ilp_inames = frozenset( - iname for iname in kernel.iname_to_tag - if isinstance(kernel.iname_to_tag[iname], IlpBaseTag)) + ilp_inames = frozenset(iname + for iname in kernel.iname_to_tags + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], IlpBaseTag)) iname_trie = SetTrie() diff --git a/loopy/preprocess.py b/loopy/preprocess.py index a5284dc74ddcba6ffb527bbf9df07d81f31916e3..0b19ff416b2ca15e9d626a1644b9223367da2f69 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -34,7 +34,7 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION -from loopy.kernel.data import make_assignment +from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types @@ -135,9 +135,8 @@ def check_reduction_iname_uniqueness(kernel): # {{{ decide temporary scope def _get_compute_inames_tagged(kernel, insn, tag_base): - return set(iname - for iname in kernel.insn_inames(insn.id) - if isinstance(kernel.iname_to_tag.get(iname), tag_base)) + return set(iname for iname in kernel.insn_inames(insn.id) + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], tag_base)) def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names): @@ -147,7 +146,7 @@ def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names): insn.assignee_subscript_deps()) for iname in adeps & kernel.all_inames() if aname in 
tv_names - if isinstance(kernel.iname_to_tag.get(iname), tag_base)) + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], tag_base)) def find_temporary_scope(kernel): @@ -292,20 +291,20 @@ def _classify_reduction_inames(kernel, inames): from loopy.kernel.data import ( LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, - ConcurrentTag) + ConcurrentTag, filter_iname_tags_by_type) for iname in inames: - iname_tag = kernel.iname_to_tag.get(iname) + iname_tags = kernel.iname_to_tags[iname] - if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): + if filter_iname_tags_by_type(iname_tags, (UnrollTag, UnrolledIlpTag)): # These are nominally parallel, but we can live with # them as sequential. sequential.append(iname) - elif isinstance(iname_tag, LocalIndexTagBase): + elif filter_iname_tags_by_type(iname_tags, LocalIndexTagBase): local_par.append(iname) - elif isinstance(iname_tag, (ConcurrentTag, VectorizeTag)): + elif filter_iname_tags_by_type(iname_tags, (ConcurrentTag, VectorizeTag)): nonlocal_par.append(iname) else: @@ -1135,13 +1134,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, outer_insn_inames = temp_kernel.insn_inames(insn) - from loopy.kernel.data import LocalIndexTagBase - outer_local_inames = tuple( - oiname - for oiname in outer_insn_inames - if isinstance( - kernel.iname_to_tag.get(oiname), - LocalIndexTagBase)) + from loopy.kernel.data import LocalIndexTagBase, filter_iname_tags_by_type + outer_local_inames = tuple(oiname for oiname in outer_insn_inames + if filter_iname_tags_by_type( + kernel.iname_to_tags[oiname], LocalIndexTagBase)) from pymbolic import var outer_local_iname_vars = tuple( @@ -1176,7 +1172,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_exec_iname = var_name_gen("red_"+red_iname) domains.append(_make_slab_set(base_exec_iname, size)) - new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname] + new_iname_tags[base_exec_iname] = 
kernel.iname_to_tags[red_iname] # }}} @@ -1271,7 +1267,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage)) domains.append(_make_slab_set(stage_exec_iname, bound-new_size)) - new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname] + new_iname_tags[stage_exec_iname] = kernel.iname_to_tags[red_iname] stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) stage_insn = make_assignment( @@ -1474,13 +1470,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, outer_insn_inames = temp_kernel.insn_inames(insn) - from loopy.kernel.data import LocalIndexTagBase - outer_local_inames = tuple( - oiname - for oiname in outer_insn_inames - if isinstance( - kernel.iname_to_tag.get(oiname), - LocalIndexTagBase) + from loopy.kernel.data import LocalIndexTagBase, filter_iname_tags_by_type + outer_local_inames = tuple(oiname for oiname in outer_insn_inames + if filter_iname_tags_by_type(kernel.iname_to_tags[oiname], + LocalIndexTagBase) and oiname != sweep_iname) from pymbolic import var @@ -1506,7 +1499,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_exec_iname = var_name_gen(sweep_iname + "__scan") domains.append(_make_slab_set(base_exec_iname, scan_size)) - new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname] + new_iname_tags[base_exec_iname] = kernel.iname_to_tags[sweep_iname] # }}} @@ -1597,7 +1590,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) domains.append( _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) - new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] + new_iname_tags[stage_exec_iname] = kernel.iname_to_tags[sweep_iname] for read_var, acc_var in zip(read_vars, acc_vars): read_stage_id = insn_id_gen( @@ -1747,7 +1740,7 @@ def realize_reduction(kernel, 
insn_id_filter=None, unknown_types_ok=True, "by reductions is 'local'--found iname(s) '%s' " "respectively tagged '%s'" % (", ".join(bad_inames), - ", ".join(kernel.iname_to_tag[iname] + ", ".join(str(kernel.iname_to_tags[iname]) for iname in bad_inames))) if n_local_par == 0 and n_sequential == 0: @@ -1785,7 +1778,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _error_if_force_scan_on(LoopyError, "Sweep iname '%s' has an unsupported parallel tag '%s' " "- the only parallelism allowed is 'local'." % - (sweep_iname, temp_kernel.iname_to_tag[sweep_iname])) + (sweep_iname, + ", ".join(tag.key + for tag in temp_kernel.iname_to_tags[sweep_iname]))) elif parallel: return map_scan_local( expr, rec, nresults, arg_dtypes, reduction_dtypes, @@ -2154,8 +2149,8 @@ def preprocess_kernel(kernel, device=None): # {{{ check that there are no l.auto-tagged inames from loopy.kernel.data import AutoLocalIndexTagBase - for iname, tag in six.iteritems(kernel.iname_to_tag): - if (isinstance(tag, AutoLocalIndexTagBase) + for iname, tags in six.iteritems(kernel.iname_to_tags): + if (filter_iname_tags_by_type(tags, AutoLocalIndexTagBase) and iname in kernel.all_inames()): raise LoopyError("kernel with automatically-assigned " "local axes passed to preprocessing") diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 3c9a6baed99b9e3267608570aaa8dac5c73ddff7..616c8e62a5900dea981ef4d1d5c12b5cf8925a27 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -212,22 +212,20 @@ def find_loop_nest_with_map(kernel): """ result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import (ConcurrentTag, IlpBaseTag, VectorizeTag, + filter_iname_tags_by_type) - all_nonpar_inames = set([ - iname - for iname in kernel.all_inames() - if not isinstance(kernel.iname_to_tag.get(iname), - (ConcurrentTag, IlpBaseTag, VectorizeTag))]) + all_nonpar_inames = set( + iname for iname in 
kernel.all_inames() + if not filter_iname_tags_by_type(kernel.iname_to_tags[iname], + (ConcurrentTag, IlpBaseTag, VectorizeTag))) iname_to_insns = kernel.iname_to_insns() for iname in all_nonpar_inames: - result[iname] = set([ - other_iname + result[iname] = set(other_iname for insn in iname_to_insns[iname] - for other_iname in kernel.insn_inames(insn) & all_nonpar_inames - ]) + for other_iname in kernel.insn_inames(insn) & all_nonpar_inames) return result @@ -243,15 +241,15 @@ def find_loop_nest_around_map(kernel): iname_to_insns = kernel.iname_to_insns() # examine pairs of all inames--O(n**2), I know. - from loopy.kernel.data import IlpBaseTag + from loopy.kernel.data import IlpBaseTag, filter_iname_tags_by_type for inner_iname in all_inames: result[inner_iname] = set() for outer_iname in all_inames: if inner_iname == outer_iname: continue - tag = kernel.iname_to_tag.get(outer_iname) - if isinstance(tag, IlpBaseTag): + tags = kernel.iname_to_tags[outer_iname] + if filter_iname_tags_by_type(tags, IlpBaseTag): # ILP tags are special because they are parallel tags # and therefore 'in principle' nest around everything. # But they're realized by the scheduler as a loop @@ -280,10 +278,11 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import (ConcurrentTag, IlpBaseTag, VectorizeTag, + filter_iname_tags_by_type) for insn in kernel.instructions: for iname in kernel.insn_inames(insn): - if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag): + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], ConcurrentTag): continue iname_dep = result.setdefault(iname, set()) @@ -313,8 +312,9 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): # -> safe. 
continue - tag = kernel.iname_to_tag.get(dep_insn_iname) - if isinstance(tag, (ConcurrentTag, IlpBaseTag, VectorizeTag)): + tags = kernel.iname_to_tags[dep_insn_iname] + if filter_iname_tags_by_type(tags, + (ConcurrentTag, IlpBaseTag, VectorizeTag)): # Parallel tags don't really nest, so we'll disregard # them here. continue @@ -1878,18 +1878,20 @@ def generate_loop_schedules_inner(kernel, debug_args={}): for item in preschedule for insn_id in sched_item_to_insn_id(item)) - from loopy.kernel.data import IlpBaseTag, ConcurrentTag, VectorizeTag + from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag, + filter_iname_tags_by_type) ilp_inames = set( iname - for iname in kernel.all_inames() - if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag)) + for iname, tags in six.iteritems(kernel.iname_to_tags) + if filter_iname_tags_by_type(tags, IlpBaseTag)) vec_inames = set( iname - for iname in kernel.all_inames() - if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag)) + for iname, tags in six.iteritems(kernel.iname_to_tags) + if filter_iname_tags_by_type(tags, VectorizeTag)) parallel_inames = set( - iname for iname in kernel.all_inames() - if isinstance(kernel.iname_to_tag.get(iname), ConcurrentTag)) + iname + for iname, tags in six.iteritems(kernel.iname_to_tags) + if filter_iname_tags_by_type(tags, ConcurrentTag)) loop_nest_with_map = find_loop_nest_with_map(kernel) loop_nest_around_map = find_loop_nest_around_map(kernel) diff --git a/loopy/statistics.py b/loopy/statistics.py index 6f4cc78b711196fbff2cff60920d178dd357a101..97eded2e36d967c3c7a6b01b3f30a52b08d3c933 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -917,18 +917,24 @@ class GlobalMemAccessCounter(MemAccessCounter): index = (index,) from loopy.symbolic import get_dependencies - from loopy.kernel.data import LocalIndexTag, GroupIndexTag + from loopy.kernel.data import (LocalIndexTag, GroupIndexTag, + filter_iname_tags_by_type) + my_inames = get_dependencies(index) & 
self.knl.all_inames() # find all local and global index tags and corresponding inames lid_to_iname = {} gid_to_iname = {} for iname in my_inames: - tag = self.knl.iname_to_tag.get(iname) - if isinstance(tag, LocalIndexTag): - lid_to_iname[tag.axis] = iname - elif isinstance(tag, GroupIndexTag): - gid_to_iname[tag.axis] = iname + tags = filter_iname_tags_by_type(self.knl.iname_to_tags[iname], + (GroupIndexTag, LocalIndexTag)) + if tags: + tag, = filter_iname_tags_by_type( + tags, (GroupIndexTag, LocalIndexTag), 1) + if isinstance(tag, LocalIndexTag): + lid_to_iname[tag.axis] = iname + else: + gid_to_iname[tag.axis] = iname # create lid_strides and gid_strides dicts @@ -1177,14 +1183,17 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): g_used = set() l_used = set() - from loopy.kernel.data import LocalIndexTag, GroupIndexTag + from loopy.kernel.data import (LocalIndexTag, GroupIndexTag, + filter_iname_tags_by_type) for iname in knl.insn_inames(insn): - tag = knl.iname_to_tag.get(iname) - - if isinstance(tag, LocalIndexTag): - l_used.add(tag.axis) - elif isinstance(tag, GroupIndexTag): - g_used.add(tag.axis) + tags = filter_iname_tags_by_type(knl.iname_to_tags[iname], + (LocalIndexTag, GroupIndexTag), 1) + if tags: + tag, = tags + if isinstance(tag, LocalIndexTag): + l_used.add(tag.axis) + elif isinstance(tag, GroupIndexTag): + g_used.add(tag.axis) def mult_grid_factor(used_axes, size): result = 1 @@ -1213,9 +1222,9 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) insn_inames = knl.insn_inames(insn) if disregard_local_axes: - from loopy.kernel.data import LocalIndexTag + from loopy.kernel.data import LocalIndexTag, filter_iname_tags_by_type insn_inames = [iname for iname in insn_inames if not - isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] + filter_iname_tags_by_type(knl.iname_to_tags[iname], LocalIndexTag)] inames_domain = knl.get_inames_domain(insn_inames) domain = 
(inames_domain.project_out_except( diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 45a59847ba9f175df5ca1be46aa78566b2aab03b..8e07eb692214d627e4a86dd883198f4a9b72406d 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -418,28 +418,33 @@ class ISPCASTBuilder(CASTBuilder): new_terms = [] - from loopy.kernel.data import LocalIndexTag + from loopy.kernel.data import LocalIndexTag, filter_iname_tags_by_type from loopy.symbolic import get_dependencies saw_l0 = False for term in terms: if (isinstance(term, Variable) - and isinstance( - kernel.iname_to_tag.get(term.name), LocalIndexTag) - and kernel.iname_to_tag.get(term.name).axis == 0): - if saw_l0: - raise LoopyError("streaming store must have stride 1 " - "in local index, got: %s" % subscript) - saw_l0 = True - continue + and filter_iname_tags_by_type( + kernel.iname_to_tags[term.name], LocalIndexTag)): + tag, = filter_iname_tags_by_type( + kernel.iname_to_tags[term.name], LocalIndexTag, 1) + if tag.axis == 0: + if saw_l0: + raise LoopyError( + "streaming store must have stride 1 in " + "local index, got: %s" % subscript) + saw_l0 = True + continue else: for dep in get_dependencies(term): - if ( - isinstance( - kernel.iname_to_tag.get(dep), LocalIndexTag) - and kernel.iname_to_tag.get(dep).axis == 0): - raise LoopyError("streaming store must have stride 1 " - "in local index, got: %s" % subscript) + if filter_iname_tags_by_type( + kernel.iname_to_tags[dep], LocalIndexTag): + tag, = filter_iname_tags_by_type( + kernel.iname_to_tags[dep], LocalIndexTag, 1) + if tag.axis == 0: + raise LoopyError( + "streaming store must have stride 1 in " + "local index, got: %s" % subscript) new_terms.append(term) @@ -452,10 +457,9 @@ class ISPCASTBuilder(CASTBuilder): "data type") rhs_has_programindex = any( - isinstance( - kernel.iname_to_tag.get(dep), LocalIndexTag) - and kernel.iname_to_tag.get(dep).axis == 0 - for dep in get_dependencies(insn.expression)) + isinstance(tag, LocalIndexTag) and tag.axis == 0 
+ for tag in kernel.iname_to_tags[dep] + for dep in get_dependencies(insn.expression)) if not rhs_has_programindex: rhs_code = "broadcast(%s, 0)" % rhs_code diff --git a/loopy/tools.py b/loopy/tools.py index 288e0c3c4c6035612a3368a6348f624090ea9c16..15d2a859a9cbe7c7a4e0711e705c6ccce6fff61b 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -75,6 +75,8 @@ class LoopyKeyBuilder(KeyBuilderBase): for dict_key in sorted(six.iterkeys(key)): self.rec(key_hash, (dict_key, key[dict_key])) + update_for_defaultdict = update_for_dict + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 77c2d3adecb6db4c0b77c9eb32983c9c04067c43..8f8593c2c03542f75e721f9037577507dd70eef6 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -248,10 +248,10 @@ def _fuse_two_kernels(knla, knlb): local_sizes=_merge_dicts( "local size", knla.local_sizes, knlb.local_sizes), temporary_variables=new_temporaries, - iname_to_tag=_merge_dicts( + iname_to_tags=_merge_dicts( "iname-to-tag mapping", - knla.iname_to_tag, - knlb.iname_to_tag), + knla.iname_to_tags, + knlb.iname_to_tags), substitutions=_merge_dicts( "substitution", knla.substitutions, diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 45d0dc9daee03632e2da33d870faa99e3f92598f..423ccfb5517f634434f3b3882b1d4e03926f79b7 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -44,6 +44,8 @@ __doc__ = """ .. autofunction:: join_inames +.. autofunction:: untag_inames + .. autofunction:: tag_inames .. autofunction:: duplicate_inames @@ -175,11 +177,10 @@ def _split_iname_backend(kernel, split_iname, for syntax. 
""" - existing_tag = kernel.iname_to_tag.get(split_iname) - from loopy.kernel.data import ForceSequentialTag - if do_tagged_check and ( - existing_tag is not None - and not isinstance(existing_tag, ForceSequentialTag)): + existing_tags = kernel.iname_to_tags[split_iname] + from loopy.kernel.data import ForceSequentialTag, filter_iname_tags_by_type + if (do_tagged_check and existing_tags + and not filter_iname_tags_by_type(existing_tags, ForceSequentialTag)): raise LoopyError("cannot split already tagged iname '%s'" % split_iname) if split_iname not in kernel.all_inames(): @@ -294,7 +295,7 @@ def _split_iname_backend(kernel, split_iname, kernel = ins.map_kernel(kernel) kernel = rule_mapping_context.finish_kernel(kernel) - if existing_tag is not None: + for existing_tag in existing_tags: kernel = tag_inames(kernel, {outer_iname: existing_tag, inner_iname: existing_tag}) @@ -596,25 +597,49 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): # }}} +# {{{ untag inames + +def untag_inames(kernel, iname_to_untag, tag_type): + """ + Remove tags on *iname_to_untag* which matches *tag_type*. + + :arg iname_to_untag: iname as string. + :arg tag_type: a subclass of :class:`loopy.kernel.data.IndexTag`. + + .. versionadded:: 2018.1 + """ + + knl_iname_to_tags = kernel.iname_to_tags.copy() + old_tags = knl_iname_to_tags[iname_to_untag] + old_tags = set(tag for tag in old_tags if not isinstance(tag, tag_type)) + knl_iname_to_tags[iname_to_untag] = old_tags + + return kernel.copy(iname_to_tags=knl_iname_to_tags) + +# }}} + + # {{{ tag inames def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): """Tag an iname :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given - as an instance of a subclass of :class:`loopy.kernel.data.IndexTag` or - as a string as shown in :ref:`iname-tags`. May also be a dictionary - for backwards compatibility. *iname* may also be a wildcard using ``*`` - and ``?``. 
+ as an instance of a subclass of :class:`loopy.kernel.data.IndexTag` or an + iterable of which, or as a string as shown in :ref:`iname-tags`. May also + be a dictionary for backwards compatibility. *iname* may also be a wildcard + using ``*`` and ``?``. .. versionchanged:: 2016.3 Added wildcards. + + .. versionchanged:: 2018.1 + + Added iterable of tags """ - if isinstance(iname_to_tag, dict): - iname_to_tag = list(six.iteritems(iname_to_tag)) - elif isinstance(iname_to_tag, str): + if isinstance(iname_to_tag, str): def parse_kv(s): colon_index = s.find(":") if colon_index == -1: @@ -626,12 +651,33 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): parse_kv(s) for s in iname_to_tag.split(",") if s.strip()] + # convert dict to list of tuples + if isinstance(iname_to_tag, dict): + iname_to_tag = list(six.iteritems(iname_to_tag)) + + # flatten iterables of tags for each iname + from collections import Iterable + unpack_iname_to_tag = [] + for iname, tags in iname_to_tag: + if isinstance(tags, Iterable) and not isinstance(tags, str): + for tag in tags: + unpack_iname_to_tag.append((iname, tag)) + else: + unpack_iname_to_tag.append((iname, tags)) + iname_to_tag = unpack_iname_to_tag + from loopy.kernel.data import parse_tag as inner_parse_tag def parse_tag(tag): if isinstance(tag, str): if tag.startswith("like."): - return kernel.iname_to_tag.get(tag[5:]) + tags = kernel.iname_to_tags[tag[5:]] + if len(tags) == 0: + return None + if len(tags) == 1: + return tags[0] + else: + raise LoopyError("cannot use like for multiple tags (for now)") elif tag == "unused.g": return find_unused_axis_tag(kernel, "g") elif tag == "unused.l": @@ -641,8 +687,8 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): iname_to_tag = [(iname, parse_tag(tag)) for iname, tag in iname_to_tag] - from loopy.kernel.data import (ConcurrentTag, AutoLocalIndexTagBase, - ForceSequentialTag) + from loopy.kernel.data import (ConcurrentTag, 
ForceSequentialTag, + filter_iname_tags_by_type) # {{{ globbing @@ -671,41 +717,31 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): # }}} - knl_iname_to_tag = kernel.iname_to_tag.copy() + knl_iname_to_tags = kernel.iname_to_tags.copy() for iname, new_tag in six.iteritems(iname_to_tag): - old_tag = kernel.iname_to_tag.get(iname) - - retag_ok = False - - if isinstance(old_tag, (AutoLocalIndexTagBase, ForceSequentialTag)): - retag_ok = True + if not new_tag: + continue - if not retag_ok and old_tag is not None and new_tag is None: - raise ValueError("cannot untag iname '%s'" % iname) + old_tags = kernel.iname_to_tags[iname] if iname not in kernel.all_inames(): raise ValueError("cannot tag '%s'--not known" % iname) - if isinstance(new_tag, ConcurrentTag) \ - and isinstance(old_tag, ForceSequentialTag): + if (isinstance(new_tag, ConcurrentTag) + and filter_iname_tags_by_type(old_tags, ForceSequentialTag)): raise ValueError("cannot tag '%s' as parallel--" "iname requires sequential execution" % iname) - if isinstance(new_tag, ForceSequentialTag) \ - and isinstance(old_tag, ConcurrentTag): + if (isinstance(new_tag, ForceSequentialTag) + and filter_iname_tags_by_type(old_tags, ConcurrentTag)): raise ValueError("'%s' is already tagged as parallel, " "but is now prohibited from being parallel " "(likely because of participation in a precompute or " "a reduction)" % iname) - if (not retag_ok) and (not force) \ - and old_tag is not None and (old_tag != new_tag): - raise LoopyError("'%s' is already tagged '%s'--cannot retag" - % (iname, old_tag)) - - knl_iname_to_tag[iname] = new_tag + knl_iname_to_tags[iname] = old_tags.union([new_tag]) - return kernel.copy(iname_to_tag=knl_iname_to_tag) + return kernel.copy(iname_to_tags=knl_iname_to_tags) # }}} @@ -956,12 +992,12 @@ def get_iname_duplication_options(knl, use_boostable_into=False): Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. 
""" - from loopy.kernel.data import ConcurrentTag + from loopy.kernel.data import ConcurrentTag, filter_iname_tags_by_type concurrent_inames = set( iname - for iname in knl.all_inames() - if isinstance(knl.iname_to_tag.get(iname), ConcurrentTag)) + for iname in knl.all_inames() if filter_iname_tags_by_type( + knl.iname_to_tags[iname], ConcurrentTag)) # First we extract the minimal necessary information from the kernel if use_boostable_into: @@ -985,8 +1021,8 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # Get the duplication options as a tuple of iname and a set for iname, insns in _get_iname_duplication_options(insn_iname_sets): # Check whether this iname has a parallel tag and discard it if so - if (iname in knl.iname_to_tag - and isinstance(knl.iname_to_tag[iname], ConcurrentTag)): + if (iname in knl.iname_to_tags and filter_iname_tags_by_type( + knl.iname_to_tags[iname], ConcurrentTag)): continue # If we find a duplication option and to not use boostable_into @@ -1503,7 +1539,8 @@ def find_unused_axis_tag(kernel, kind, insn_match=None): """ used_axes = set() - from loopy.kernel.data import GroupIndexTag, LocalIndexTag + from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, + filter_iname_tags_by_type) if isinstance(kind, str): found = False @@ -1522,9 +1559,8 @@ def find_unused_axis_tag(kernel, kind, insn_match=None): for insn in insns: for iname in kernel.insn_inames(insn): - dim_tag = kernel.iname_to_tag.get(iname) - - if isinstance(dim_tag, kind): + dim_tags = kernel.iname_to_tags[iname] + if filter_iname_tags_by_type(dim_tags, kind): used_axes.add(kind.axis) i = 0 diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py index 47f64815e19c7bdcfc00ade1c069a157f4d5cbea..c953c1cee0b1e6930423ebf42d2fba55a602c3df 100644 --- a/loopy/transform/privatize.py +++ b/loopy/transform/privatize.py @@ -101,9 +101,9 @@ def privatize_temporaries_with_inames( end facilitating loop interchange of the *imatrix* loop. - .. 
versionadded:: 2018.1 """ + if isinstance(privatizing_inames, str): privatizing_inames = frozenset( s.strip() @@ -174,7 +174,7 @@ def privatize_temporaries_with_inames( # {{{ change temporary variables - from loopy.kernel.data import VectorizeTag + from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type new_temp_vars = kernel.temporary_variables.copy() for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname): @@ -187,7 +187,7 @@ def privatize_temporaries_with_inames( dim_tags = ["c"] * (len(shape) + len(extra_shape)) for i, iname in enumerate(inames): - if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag): + if filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag): dim_tags[len(shape) + i] = "vec" new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape, diff --git a/loopy/transform/save.py b/loopy/transform/save.py index e3d8368a78bd2415d4f1f64846e9198081b48cd6..dfdd7a1545a527cde7fe74e98ba9a51f7826cdd9 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -241,14 +241,14 @@ class TemporarySaver(object): self.insn_name_gen = kernel.get_instruction_id_generator() # These fields keep track of updates to the kernel. 
+ from collections import defaultdict self.insns_to_insert = [] self.insns_to_update = {} self.extra_args_to_add = {} - self.updated_iname_to_tag = {} + self.updated_iname_to_tags = defaultdict(set) self.updated_temporary_variables = {} # temporary name -> save or reload insn ids - from collections import defaultdict self.temporary_to_save_ids = defaultdict(set) self.temporary_to_reload_ids = defaultdict(set) self.subkernel_to_newly_added_insn_ids = defaultdict(set) @@ -397,24 +397,26 @@ class TemporarySaver(object): my_local_tags = [] for iname in insn.within_inames: - tag = self.kernel.iname_to_tag.get(iname) + tags = self.kernel.iname_to_tags[iname] - if tag is None: + if not tags: continue - from loopy.kernel.data import ( - GroupIndexTag, LocalIndexTag, ConcurrentTag) + from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, + ConcurrentTag, filter_iname_tags_by_type) - if isinstance(tag, GroupIndexTag): + if filter_iname_tags_by_type(tags, GroupIndexTag): + tag, = filter_iname_tags_by_type(tags, GroupIndexTag, 1) my_group_tags.append(tag) - elif isinstance(tag, LocalIndexTag): + elif filter_iname_tags_by_type(tags, LocalIndexTag): + tag, = filter_iname_tags_by_type(tags, LocalIndexTag, 1) my_local_tags.append(tag) - elif isinstance(tag, ConcurrentTag): + elif filter_iname_tags_by_type(tags, ConcurrentTag): raise LoopyError( "iname '%s' is tagged with '%s' - only " "group and local tags are supported for " "auto save/reload of temporaries" % - (iname, tag)) + (iname, tags)) if group_tags is None: group_tags = _sortedtags(my_group_tags) @@ -501,7 +503,7 @@ class TemporarySaver(object): if promoted_temporary is None: return - new_subdomain, hw_inames, dim_inames, iname_to_tag = ( + new_subdomain, hw_inames, dim_inames, iname_to_tags = ( self.augment_domain_for_save_or_reload( self.new_subdomain, promoted_temporary, mode, subkernel)) @@ -581,7 +583,7 @@ class TemporarySaver(object): self.updated_temporary_variables[promoted_temporary.name] = ( 
promoted_temporary.as_kernel_temporary(self.kernel)) - self.updated_iname_to_tag.update(iname_to_tag) + self.updated_iname_to_tags.update(iname_to_tags) @memoize_method def finish(self): @@ -597,7 +599,7 @@ class TemporarySaver(object): new_instructions.extend( sorted(insns_to_insert.values(), key=lambda insn: insn.id)) - self.updated_iname_to_tag.update(self.kernel.iname_to_tag) + self.updated_iname_to_tags.update(self.kernel.iname_to_tags) self.updated_temporary_variables.update(self.kernel.temporary_variables) new_domains = list(self.kernel.domains) @@ -608,7 +610,7 @@ class TemporarySaver(object): kernel = self.kernel.copy( domains=new_domains, instructions=new_instructions, - iname_to_tag=self.updated_iname_to_tag, + iname_to_tags=self.updated_iname_to_tags, temporary_variables=self.updated_temporary_variables, overridden_get_grid_sizes_for_insn_ids=None) @@ -650,7 +652,7 @@ class TemporarySaver(object): orig_dim = domain.dim(isl.dim_type.set) # Tags for newly added inames - iname_to_tag = {} + iname_to_tags = {} from loopy.symbolic import aff_from_expr @@ -675,7 +677,7 @@ class TemporarySaver(object): # If the temporary has local scope, then loads / stores can # be done in parallel. from loopy.kernel.data import AutoFitLocalIndexTag - iname_to_tag[new_iname] = AutoFitLocalIndexTag() + iname_to_tags[new_iname] = set([AutoFitLocalIndexTag()]) dim_inames.append(new_iname) @@ -705,7 +707,7 @@ class TemporarySaver(object): & aff[new_iname].lt_set(aff_from_expr(domain.space, dim))) - self.updated_iname_to_tag[new_iname] = hw_tag + self.updated_iname_to_tags[new_iname] = set([hw_tag]) hw_inames.append(new_iname) # The operations on the domain above return a Set object, but the @@ -713,7 +715,7 @@ class TemporarySaver(object): domain_list = domain.get_basic_set_list() assert domain_list.n_basic_set() == 1 domain = domain_list.get_basic_set(0) - return domain, hw_inames, dim_inames, iname_to_tag + return domain, hw_inames, dim_inames, iname_to_tags # }}}