From 164ad102c682ed3ad5b04ce6f84c3174e184b6a7 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 25 Apr 2017 12:29:44 -0500 Subject: [PATCH 1/6] [WIP] Improve save and reload towards global scan (see: #62). * Closes #40, by changing the way storage is computed for save and reload by using the tags of the accessing instructions, not the inames. * Allow a single representative per base_storage equivalence class to be saved and reloaded (see also: #42). * Removes InstructionQuery class from schedule tools. Still needs: tests --- loopy/check.py | 28 +-- loopy/schedule/tools.py | 154 ++++------------ loopy/transform/save.py | 385 ++++++++++++++++++++++++++++------------ 3 files changed, 318 insertions(+), 249 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 6a1e3dc33..54ab043d6 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -505,22 +505,23 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # {{{ check that temporaries are defined in subkernels where used def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): - from loopy.schedule.tools import InstructionQuery from loopy.kernel.data import temp_var_scope + from loopy.kernel.tools import get_subkernels - insn_query = InstructionQuery(kernel) - - for subkernel in insn_query.subkernels(): + for subkernel in get_subkernels(kernel): defined_base_storage = set() - for temporary in insn_query.temporaries_written_in_subkernel(subkernel): + from loopy.schedule.tools import ( + temporaries_written_in_subkernel, temporaries_read_in_subkernel) + + for temporary in temporaries_written_in_subkernel(kernel, subkernel): tval = kernel.temporary_variables[temporary] if tval.base_storage is not None: defined_base_storage.add(tval.base_storage) for temporary in ( - insn_query.temporaries_read_in_subkernel(subkernel) - - insn_query.temporaries_written_in_subkernel(subkernel)): + temporaries_read_in_subkernel(kernel, subkernel) - + temporaries_written_in_subkernel(kernel, 
subkernel)): tval = kernel.temporary_variables[temporary] if tval.initializer is not None: @@ -530,16 +531,17 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): if tval.base_storage is not None: if tval.base_storage not in defined_base_storage: from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used " - "in subkernel '%s' and neither it nor its aliases have a " - "definition" % (temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets " + "used in subkernel '%s' and neither it nor its " + "aliases have a definition" % (temporary, subkernel)) continue if tval.scope in (temp_var_scope.PRIVATE, temp_var_scope.LOCAL): from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used in " - "subkernel '%s' without a definition (maybe you forgot to call " - "loopy.save_and_reload_temporaries?)" % (temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets used " + "in subkernel '%s' without a definition (maybe you forgot " + "to call loopy.save_and_reload_temporaries?)" + % (temporary, subkernel)) # }}} diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 5de677e72..f9b08d343 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -23,10 +23,6 @@ THE SOFTWARE. """ from loopy.kernel.data import temp_var_scope -from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem, - RunInstruction, Barrier) - -from pytools import memoize_method # {{{ block boundary finder @@ -37,6 +33,7 @@ def get_block_boundaries(schedule): :class:`loopy.schedule.BlockBeginItem`s to :class:`loopy.schedule.BlockEndItem`s and vice versa. 
""" + from loopy.schedule import (BeginBlockItem, EndBlockItem) block_bounds = {} active_blocks = [] for idx, sched_item in enumerate(schedule): @@ -51,109 +48,24 @@ def get_block_boundaries(schedule): # }}} -# {{{ instruction query utility - -class InstructionQuery(object): - - def __init__(self, kernel): - self.kernel = kernel - block_bounds = get_block_boundaries(kernel.schedule) - subkernel_slices = {} - from six import iteritems - for start, end in iteritems(block_bounds): - sched_item = kernel.schedule[start] - if isinstance(sched_item, CallKernel): - subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) - self.subkernel_slices = subkernel_slices - - @memoize_method - def subkernels(self): - return frozenset(self.subkernel_slices.keys()) - - @memoize_method - def insns_reading_or_writing(self, var): - return frozenset(insn.id for insn in self.kernel.instructions - if var in insn.read_dependency_names() - or var in insn.assignee_var_names()) - - @memoize_method - def insns_in_subkernel(self, subkernel): - return frozenset(sched_item.insn_id for sched_item - in self.kernel.schedule[self.subkernel_slices[subkernel]] - if isinstance(sched_item, RunInstruction)) - - @memoize_method - def temporaries_read_in_subkernel(self, subkernel): - return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].read_dependency_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_written_in_subkernel(self, subkernel): - return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].assignee_var_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_read_or_written_in_subkernel(self, subkernel): - return ( - self.temporaries_read_in_subkernel(subkernel) | - self.temporaries_written_in_subkernel(subkernel)) - - @memoize_method - def inames_in_subkernel(self, subkernel): - subkernel_start = 
self.subkernel_slices[subkernel].start - return frozenset(self.kernel.schedule[subkernel_start].extra_inames) - - @memoize_method - def pre_and_post_barriers(self, subkernel): - subkernel_start = self.subkernel_slices[subkernel].start - subkernel_end = self.subkernel_slices[subkernel].stop - - def is_global_barrier(item): - return isinstance(item, Barrier) and item.kind == "global" - - try: - pre_barrier = next(item for item in - self.kernel.schedule[subkernel_start::-1] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - pre_barrier = None - - try: - post_barrier = next(item for item in - self.kernel.schedule[subkernel_end:] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - post_barrier = None - - return (pre_barrier, post_barrier) - - @memoize_method - def hw_inames(self, insn_id): - """ - Return the inames that insn runs in and that are tagged as hardware - parallel. - """ - from loopy.kernel.data import HardwareParallelTag - return set(iname for iname in self.kernel.insn_inames(insn_id) - if isinstance(self.kernel.iname_to_tag.get(iname), - HardwareParallelTag)) - - @memoize_method - def common_hw_inames(self, insn_ids): - """ - Return the common set of hardware parallel tagged inames among - the list of instructions. - """ - # Get the list of hardware inames in which the temporary is defined. 
- if len(insn_ids) == 0: - return set() - return set.intersection(*(self.hw_inames(id) for id in insn_ids)) +# {{{ subkernel tools + +def temporaries_read_in_subkernel(kernel, subkernel): + from loopy.kernel.tools import get_subkernel_to_insn_id_map + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + return frozenset(tv + for insn_id in insn_ids + for tv in kernel.id_to_insn[insn_id].read_dependency_names() + if tv in kernel.temporary_variables) + + +def temporaries_written_in_subkernel(kernel, subkernel): + from loopy.kernel.tools import get_subkernel_to_insn_id_map + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + return frozenset(tv + for insn_id in insn_ids + for tv in kernel.id_to_insn[insn_id].write_dependency_names() + if tv in kernel.temporary_variables) # }}} @@ -166,23 +78,27 @@ def add_extra_args_to_schedule(kernel): instructions in the schedule with global temporaries. """ new_schedule = [] - - insn_query = InstructionQuery(kernel) + from loopy.schedule import CallKernel for sched_item in kernel.schedule: if isinstance(sched_item, CallKernel): - subrange_temporaries = (insn_query - .temporaries_read_or_written_in_subkernel(sched_item.kernel_name)) + subkernel = sched_item.kernel_name + + used_temporaries = ( + temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel)) + more_args = set(tv - for tv in subrange_temporaries - if - kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL - and - kernel.temporary_variables[tv].initializer is None - and - tv not in sched_item.extra_args) + for tv in used_temporaries + if + kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL + and + kernel.temporary_variables[tv].initializer is None + and + tv not in sched_item.extra_args) + new_schedule.append(sched_item.copy( - extra_args=sched_item.extra_args + sorted(more_args))) + extra_args=sched_item.extra_args + sorted(more_args))) else: new_schedule.append(sched_item) diff --git 
a/loopy/transform/save.py b/loopy/transform/save.py index 8afc1695a..0e63e1a2b 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -32,7 +32,7 @@ from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier) -from loopy.schedule.tools import (get_block_boundaries, InstructionQuery) +from loopy.schedule.tools import get_block_boundaries import logging @@ -193,13 +193,9 @@ class TemporarySaver(object): The name of the new temporary. - .. attribute:: orig_temporary + .. attribute:: orig_temporary_name - The original temporary variable object. - - .. attribute:: hw_inames - - The common list of hw axes that define the original object. + The name of original temporary variable object. .. attribute:: hw_dims @@ -207,6 +203,10 @@ class TemporarySaver(object): of the promoted temporary value, corresponding to hardware dimensions + .. attribute:: hw_tags + + The tags for the inames associated with hw_dims + .. attribute:: non_hw_dims A list of expressions, to be added in front of the shape @@ -214,9 +214,11 @@ class TemporarySaver(object): non-hardware dimensions """ - @memoize_method - def as_variable(self): - temporary = self.orig_temporary + __slots__ = ["name", "orig_temporary_name", "hw_dims", "hw_tags", + "non_hw_dims"] + + def as_kernel_temporary(self, kernel): + temporary = kernel.temporary_variables[self.orig_temporary_name] from loopy.kernel.data import TemporaryVariable return TemporaryVariable( name=self.name, @@ -230,16 +232,172 @@ class TemporarySaver(object): def __init__(self, kernel): self.kernel = kernel - self.insn_query = InstructionQuery(kernel) self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() + # These fields keep track of updates to the kernel. 
self.insns_to_insert = [] self.insns_to_update = {} self.extra_args_to_add = {} self.updated_iname_to_tag = {} self.updated_temporary_variables = {} - self.saves_or_reloads_added = {} + + # temporary name -> save or reload insn ids + from collections import defaultdict + self.temporary_to_save_ids = defaultdict(set) + self.temporary_to_reload_ids = defaultdict(set) + self.subkernel_to_newly_added_insn_ids = defaultdict(set) + + # Maps names of base_storage to the name of the temporary + # representative chosen for saves/reloads + self.base_storage_to_representative = {} + + from loopy.kernel.data import ValueArg + import islpy as isl + self.new_subdomain = ( + isl.BasicSet.universe( + isl.Space.create_from_names( + isl.DEFAULT_CONTEXT, + set=[], + params=set( + arg.name for arg in kernel.args + if isinstance(arg, ValueArg))))) + + @property + @memoize_method + def subkernel_to_slice_indices(self): + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + start_idx = sched_item_idx + elif isinstance(sched_item, ReturnFromKernel): + result[sched_item.kernel_name] = (start_idx, 1 + sched_item_idx) + + return result + + @property + @memoize_method + def subkernel_to_surrounding_inames(self): + current_outer_inames = set() + within_subkernel = False + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + within_subkernel = True + result[sched_item.kernel_name] = frozenset(current_outer_inames) + elif isinstance(sched_item, ReturnFromKernel): + within_subkernel = False + elif isinstance(sched_item, EnterLoop): + if not within_subkernel: + current_outer_inames.add(sched_item.iname) + elif isinstance(sched_item, LeaveLoop): + current_outer_inames.discard(sched_item.iname) + + return result + + @memoize_method + def get_enclosing_global_barrier_pair(self, subkernel): + subkernel_start, subkernel_end = ( + 
self.subkernel_to_slice_indices[subkernel]) + + def is_global_barrier(item): + return isinstance(item, Barrier) and item.kind == "global" + + try: + pre_barrier = next(item for item in + self.kernel.schedule[subkernel_start::-1] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + pre_barrier = None + + try: + post_barrier = next(item for item in + self.kernel.schedule[subkernel_end:] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + post_barrier = None + + return (pre_barrier, post_barrier) + + def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary): + """ + This is used for determining the amount of global storage needed for saving + and restoring the temporary across kernel calls, due to hardware + parallel inames (the inferred axes get prefixed to the number of + dimensions in the temporary). + + In the case of local temporaries, inames that are tagged + hw-local do not contribute to the global storage shape. + """ + accessor_insn_ids = frozenset( + self.kernel.reader_map()[temporary.name] + | self.kernel.writer_map()[temporary.name]) + + group_tags = None + local_tags = None + + def _sortedtags(tags): + return sorted(tags, key=lambda tag: tag.axis) + + for insn_id in accessor_insn_ids: + insn = self.kernel.id_to_insn[insn_id] + + my_group_tags = [] + my_local_tags = [] + + for iname in insn.within_inames: + tag = self.kernel.iname_to_tag.get(iname) + + if tag is None: + continue + + from loopy.kernel.data import ( + GroupIndexTag, LocalIndexTag, ParallelTag) + + if isinstance(tag, GroupIndexTag): + my_group_tags.append(tag) + elif isinstance(tag, LocalIndexTag): + my_local_tags.append(tag) + elif isinstance(tag, ParallelTag): + raise ValueError( + "iname '%s' is tagged with '%s' - only " + "group and local tags are supported for " + "auto save/reload of temporaries" % + (iname, tag)) + + if group_tags is None: + group_tags = _sortedtags(my_group_tags) + local_tags = _sortedtags(my_local_tags) + 
group_tags_originating_insn_id = insn_id + + if ( + group_tags != _sortedtags(my_group_tags) + or local_tags != _sortedtags(my_local_tags)): + raise ValueError( + "inconsistent parallel tags across instructions that access " + "'%s' (specifically, instruction '%s' has tags '%s' but " + "instruction '%s' has tags '%s')" + % (temporary.name, + group_tags_originating_insn_id, group_tags + local_tags, + insn_id, my_group_tags + my_local_tags)) + + if group_tags is None: + assert local_tags is None + return (), () + + group_sizes, local_sizes = ( + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + + if temporary.scope == lp.temp_var_scope.LOCAL: + # Elide local axes in the save slot for local temporaries. + del local_tags[:] + local_sizes = () + + # We set hw_dims to be arranged according to the order: + # g.0 < g.1 < ... < l.0 < l.1 < ... + return (group_sizes + local_sizes), tuple(group_tags + local_tags) @memoize_method def auto_promote_temporary(self, temporary_name): @@ -255,52 +413,16 @@ class TemporarySaver(object): assert temporary.read_only return None - if temporary.base_storage is not None: - raise ValueError( - "Cannot promote temporaries with base_storage to global") - - # `hw_inames`: The set of hw-parallel tagged inames that this temporary - # is associated with. This is used for determining the shape of the - # global storage needed for saving and restoring the temporary across - # kernel calls. - # - # TODO: Make a policy decision about which dimensions to use. Currently, - # the code looks at each instruction that defines or uses the temporary, - # and takes the common set of hw-parallel tagged inames associated with - # these instructions. - # - # Furthermore, in the case of local temporaries, inames that are tagged - # hw-local do not contribute to the global storage shape. 
- hw_inames = self.insn_query.common_hw_inames( - self.insn_query.insns_reading_or_writing(temporary.name)) - - # We want hw_inames to be arranged according to the order: - # g.0 < g.1 < ... < l.0 < l.1 < ... - # Sorting lexicographically accomplishes this. - hw_inames = sorted(hw_inames, - key=lambda iname: str(self.kernel.iname_to_tag[iname])) - - # Calculate the sizes of the dimensions that get added in front for - # the global storage of the temporary. - hw_dims = [] - - backing_hw_inames = [] - - for iname in hw_inames: - tag = self.kernel.iname_to_tag[iname] - from loopy.kernel.data import LocalIndexTag - is_local_iname = isinstance(tag, LocalIndexTag) - if is_local_iname and temporary.scope == temp_var_scope.LOCAL: - # Restrict shape to that of group inames for locals. - continue - backing_hw_inames.append(iname) - from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import aff_to_expr - hw_dims.append( - aff_to_expr( - static_max_of_pw_aff( - self.kernel.get_iname_bounds(iname).size, False))) + base_storage_conflict = ( + self.base_storage_to_representative.get( + temporary.base_storage, temporary) is not temporary) + + if base_storage_conflict: + raise NotImplementedError( + "tried to save/reload multiple temporaries with the " + "same base_storage; this is currently not supported") + hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) non_hw_dims = temporary.shape if len(non_hw_dims) == 0 and len(hw_dims) == 0: @@ -308,11 +430,15 @@ class TemporarySaver(object): non_hw_dims = (1,) backing_temporary = self.PromotedTemporary( - name=self.var_name_gen(temporary.name + "_save_slot"), - orig_temporary=temporary, - hw_dims=tuple(hw_dims), - non_hw_dims=non_hw_dims, - hw_inames=backing_hw_inames) + name=self.var_name_gen(temporary.name + "__save_slot"), + orig_temporary_name=temporary.name, + hw_dims=hw_dims, + hw_tags=hw_tags, + non_hw_dims=non_hw_dims) + + if temporary.base_storage is not None: + 
self.base_storage_to_representative[temporary.base_storage] = ( + backing_temporary) return backing_temporary @@ -326,23 +452,16 @@ class TemporarySaver(object): if promoted_temporary is None: return - from loopy.kernel.tools import DomainChanger - dchg = DomainChanger( - self.kernel, - frozenset( - self.insn_query.inames_in_subkernel(subkernel) | - set(promoted_temporary.hw_inames))) - - domain, hw_inames, dim_inames, iname_to_tag = \ + new_subdomain, hw_inames, dim_inames, iname_to_tag = ( self.augment_domain_for_save_or_reload( - dchg.domain, promoted_temporary, mode, subkernel) + self.new_subdomain, promoted_temporary, mode, subkernel)) - self.kernel = dchg.get_kernel_with(domain) + self.new_subdomain = new_subdomain save_or_load_insn_id = self.insn_name_gen( "{name}.{mode}".format(name=temporary, mode=mode)) - def subscript_or_var(agg, subscript=()): + def add_subscript_if_subscript_nonempty(agg, subscript=()): from pymbolic.primitives import Subscript, Variable if len(subscript) == 0: return Variable(agg) @@ -351,20 +470,25 @@ class TemporarySaver(object): Variable(agg), tuple(map(Variable, subscript))) - dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) + dim_inames_trunc = dim_inames[:len(orig_temporary.shape)] args = ( - subscript_or_var( - temporary, dim_inames_trunc), - subscript_or_var( - promoted_temporary.name, hw_inames + dim_inames)) + add_subscript_if_subscript_nonempty( + temporary, subscript=dim_inames_trunc), + add_subscript_if_subscript_nonempty( + promoted_temporary.name, subscript=hw_inames + dim_inames)) if mode == "save": args = reversed(args) - accessing_insns_in_subkernel = ( - self.insn_query.insns_reading_or_writing(temporary) & - self.insn_query.insns_in_subkernel(subkernel)) + from loopy.kernel.tools import get_subkernel_to_insn_id_map + accessing_insns_in_subkernel = (frozenset( + 
self.kernel.reader_map()[temporary] + | self.kernel.writer_map()[temporary]) + & get_subkernel_to_insn_id_map(self.kernel)[subkernel]) if mode == "save": depends_on = accessing_insns_in_subkernel @@ -373,7 +497,7 @@ class TemporarySaver(object): depends_on = frozenset() update_deps = accessing_insns_in_subkernel - pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel) + pre_barrier, post_barrier = self.get_enclosing_global_barrier_pair(subkernel) if pre_barrier is not None: depends_on |= set([pre_barrier]) @@ -387,16 +511,19 @@ class TemporarySaver(object): *args, id=save_or_load_insn_id, within_inames=( - self.insn_query.inames_in_subkernel(subkernel) | - frozenset(hw_inames + dim_inames)), + self.subkernel_to_surrounding_inames[subkernel] + | frozenset(hw_inames + dim_inames)), within_inames_is_final=True, depends_on=depends_on, boostable=False, boostable_into=frozenset()) - if temporary not in self.saves_or_reloads_added: - self.saves_or_reloads_added[temporary] = set() - self.saves_or_reloads_added[temporary].add(save_or_load_insn_id) + if mode == "save": + self.temporary_to_save_ids[temporary].add(save_or_load_insn_id) + else: + self.temporary_to_reload_ids[temporary].add(save_or_load_insn_id) + + self.subkernel_to_newly_added_insn_ids[subkernel].add(save_or_load_insn_id) self.insns_to_insert.append(save_or_load_insn) @@ -405,8 +532,8 @@ class TemporarySaver(object): self.insns_to_update[insn_id] = insn.copy( depends_on=insn.depends_on | frozenset([save_or_load_insn_id])) - self.updated_temporary_variables[promoted_temporary.name] = \ - promoted_temporary.as_variable() + self.updated_temporary_variables[promoted_temporary.name] = ( + promoted_temporary.as_kernel_temporary(self.kernel)) self.updated_iname_to_tag.update(iname_to_tag) @@ -416,15 +543,6 @@ class TemporarySaver(object): insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) - # Add global no_sync_with between any added reloads and saves - from six import 
iteritems - for temporary, added_insns in iteritems(self.saves_or_reloads_added): - for insn_id in added_insns: - insn = insns_to_insert[insn_id] - insns_to_insert[insn_id] = insn.copy( - no_sync_with=frozenset( - (added_insn, "global") for added_insn in added_insns)) - for orig_insn in self.kernel.instructions: if orig_insn.id in self.insns_to_update: new_instructions.append(self.insns_to_update[orig_insn.id]) @@ -436,12 +554,31 @@ class TemporarySaver(object): self.updated_iname_to_tag.update(self.kernel.iname_to_tag) self.updated_temporary_variables.update(self.kernel.temporary_variables) + new_domains = list(self.kernel.domains) + import islpy as isl + if self.new_subdomain.dim(isl.dim_type.set) > 0: + new_domains.append(self.new_subdomain) + kernel = self.kernel.copy( + domains=new_domains, instructions=new_instructions, iname_to_tag=self.updated_iname_to_tag, temporary_variables=self.updated_temporary_variables, overridden_get_grid_sizes_for_insn_ids=None) + # Add nosync directives to any saves or reloads that were added with a + # potential dependency chain. + from loopy.kernel.tools import get_subkernels + for subkernel in get_subkernels(kernel): + relevant_insns = self.subkernel_to_newly_added_insn_ids[subkernel] + + from itertools import product + for temporary in self.temporary_to_reload_ids: + for source, sink in product( + relevant_insns & self.temporary_to_reload_ids[temporary], + relevant_insns & self.temporary_to_save_ids[temporary]): + kernel = lp.add_nosync(kernel, "global", source, sink) + from loopy.kernel.tools import assign_automatic_axes return assign_automatic_axes(kernel) @@ -456,22 +593,28 @@ class TemporarySaver(object): """ Add new axes to the domain corresponding to the dimensions of `promoted_temporary`. These axes will be used in the save/ - reload stage. + reload stage. These get prefixed onto the already existing axes. 
""" assert mode in ("save", "reload") import islpy as isl - orig_temporary = promoted_temporary.orig_temporary + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) orig_dim = domain.dim(isl.dim_type.set) # Tags for newly added inames iname_to_tag = {} + from loopy.symbolic import aff_from_expr + # FIXME: Restrict size of new inames to access footprint. # Add dimension-dependent inames. dim_inames = [] - domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims)) + domain = domain.add(isl.dim_type.set, + len(promoted_temporary.non_hw_dims) + + len(promoted_temporary.hw_dims)) for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims): new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}". @@ -493,25 +636,31 @@ class TemporarySaver(object): # Add size information. aff = isl.affs_from_space(domain.space) domain &= aff[0].le_set(aff[new_iname]) - from loopy.symbolic import aff_from_expr domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size)) - # FIXME: Use promoted_temporary.hw_inames - hw_inames = [] + dim_offset = orig_dim + len(promoted_temporary.non_hw_dims) - # Add hardware inames duplicates. - for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): + hw_inames = [] + # Add hardware dims. + for hw_iname_idx, (hw_tag, dim) in enumerate( + zip(promoted_temporary.hw_tags, promoted_temporary.hw_dims)): new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}". 
format(name=orig_temporary.name, mode=mode, - dim=t_idx, + dim=hw_iname_idx, sk=subkernel)) - hw_inames.append(new_iname) - iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname] + domain = domain.set_dim_name( + isl.dim_type.set, dim_offset + hw_iname_idx, new_iname) - from loopy.isl_helpers import duplicate_axes - domain = duplicate_axes( - domain, promoted_temporary.hw_inames, hw_inames) + aff = isl.affs_from_space(domain.space) + domain = (domain + & + aff[0].le_set(aff[new_iname]) + & + aff[new_iname].lt_set(aff_from_expr(domain.space, dim))) + + self.updated_iname_to_tag[new_iname] = hw_tag + hw_inames.append(new_iname) # The operations on the domain above return a Set object, but the # underlying domain should be expressible as a single BasicSet. @@ -551,7 +700,8 @@ def save_and_reload_temporaries(knl): liveness = LivenessAnalysis(knl) saver = TemporarySaver(knl) - insn_query = InstructionQuery(knl) + from loopy.schedule.tools import ( + temporaries_read_in_subkernel, temporaries_written_in_subkernel) for sched_idx, sched_item in enumerate(knl.schedule): @@ -562,9 +712,10 @@ def save_and_reload_temporaries(knl): # Kernel entry: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_read_or_written_in_subkernel( - sched_item.kernel_name)) + temporaries_read_in_subkernel(knl, subkernel) + | temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {0} at entry of {1}" @@ -576,9 +727,9 @@ def save_and_reload_temporaries(knl): # Kernel exit: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_written_in_subkernel( - sched_item.kernel_name)) + temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: logger.info("saving {0} before 
return of {1}" -- GitLab From 60743a9ebd4dc0b30dc47f312dcb26361335826e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 25 Apr 2017 12:36:36 -0500 Subject: [PATCH 2/6] Liveness analysis: fix logger (closes #54). --- loopy/transform/save.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 0e63e1a2b..2d89da288 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -135,7 +135,7 @@ class LivenessAnalysis(object): @memoize_method def liveness(self): - logging.info("running liveness analysis") + logger.info("running liveness analysis") successors = self.get_successor_relation() gen, kill = self.get_gen_and_kill_sets() @@ -152,7 +152,7 @@ class LivenessAnalysis(object): lr[idx].live_out.update(lr[succ].live_in) lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx]) - logging.info("done running liveness analysis") + logger.info("done running liveness analysis") return lr -- GitLab From 059862d5bd5ccd68a22f9f1d8b3a881bea731257 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 12 May 2017 15:07:12 -0500 Subject: [PATCH 3/6] Use previous naming convention for save slot. 
--- loopy/transform/save.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 2d89da288..a2e7a4d5b 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -214,8 +214,12 @@ class TemporarySaver(object): non-hardware dimensions """ - __slots__ = ["name", "orig_temporary_name", "hw_dims", "hw_tags", - "non_hw_dims"] + __slots__ = """ + name + orig_temporary_name + hw_dims + hw_tags + non_hw_dims""".split() def as_kernel_temporary(self, kernel): temporary = kernel.temporary_variables[self.orig_temporary_name] @@ -430,7 +434,7 @@ class TemporarySaver(object): non_hw_dims = (1,) backing_temporary = self.PromotedTemporary( - name=self.var_name_gen(temporary.name + "__save_slot"), + name=self.var_name_gen(temporary.name + "_save_slot"), orig_temporary_name=temporary.name, hw_dims=hw_dims, hw_tags=hw_tags, -- GitLab From 482c6d74f8cc94777c66f2e0ab4dda523aeacc62 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 23 May 2017 13:19:08 -0500 Subject: [PATCH 4/6] Save and reload: Ensure that instructions that access aliasing memory get included in dependency calculations. Add a save and reload test that uses base_storage. --- loopy/transform/save.py | 51 +++++++++++++++++++++++++++++++++++++---- test/test_loopy.py | 27 +++++++++++++++++++++- 2 files changed, 72 insertions(+), 6 deletions(-) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index a2e7a4d5b..d3c4b9092 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -25,6 +25,7 @@ THE SOFTWARE. from loopy.diagnostic import LoopyError import loopy as lp +import six from loopy.kernel.data import auto, temp_var_scope from pytools import memoize_method, Record @@ -267,6 +268,49 @@ class TemporarySaver(object): arg.name for arg in kernel.args if isinstance(arg, ValueArg))))) + def find_accessing_instructions_in_subkernel(self, temporary, subkernel): + # Find all accessing instructions in the subkernel. 
If base_storage is + # present, this includes instructions that access aliasing memory. + + aliasing_names = set([temporary]) + base_storage = self.kernel.temporary_variables[temporary].base_storage + + if base_storage is not None: + aliasing_names |= self.base_storage_to_temporary_map[base_storage] + + from loopy.kernel.tools import get_subkernel_to_insn_id_map + accessing_insns_in_subkernel = set() + subkernel_insns = get_subkernel_to_insn_id_map(self.kernel)[subkernel] + + for name in aliasing_names: + try: + accessing_insns_in_subkernel |= ( + self.kernel.reader_map()[name] & subkernel_insns) + except KeyError: + pass + + try: + accessing_insns_in_subkernel |= ( + self.kernel.writer_map()[name] & subkernel_insns) + except KeyError: + pass + + return frozenset(accessing_insns_in_subkernel) + + @property + @memoize_method + def base_storage_to_temporary_map(self): + from collections import defaultdict + + result = defaultdict(set) + + for temporary in six.itervalues(self.kernel.temporary_variables): + if temporary.base_storage is None: + continue + result[temporary.base_storage].add(temporary.name) + + return result + @property @memoize_method def subkernel_to_slice_indices(self): @@ -488,11 +532,8 @@ class TemporarySaver(object): if mode == "save": args = reversed(args) - from loopy.kernel.tools import get_subkernel_to_insn_id_map - accessing_insns_in_subkernel = (frozenset( - self.kernel.reader_map()[temporary] - | self.kernel.writer_map()[temporary]) - & get_subkernel_to_insn_id_map(self.kernel)[subkernel]) + accessing_insns_in_subkernel = self.find_accessing_instructions_in_subkernel( + temporary, subkernel) if mode == "save": depends_on = accessing_insns_in_subkernel diff --git a/test/test_loopy.py b/test/test_loopy.py index 4bb6a2726..e424e063f 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1146,7 +1146,7 @@ def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False): 1/0 _, (out,) = knl(queue, out_host=True) - assert (out == 
out_expect).all() + assert (out == out_expect).all(), (out, out_expect) @pytest.mark.parametrize("hw_loop", [True, False]) @@ -1338,6 +1338,31 @@ def test_save_local_multidim_array(ctx_factory, debug=False): save_and_reload_temporaries_test(queue, knl, 1, debug) +def test_save_with_base_storage(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0 <= i < 10}", + """ + <>a[i] = 0 + <>b[i] = i + ... gbarrier + out[i] = a[i] + """, + "...", + seq_dependencies=True) + + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.set_temporary_scope(knl, "a", "local") + knl = lp.set_temporary_scope(knl, "b", "local") + + knl = lp.alias_temporaries(knl, ["a", "b"], + synchronize_for_exclusive_use=False) + + save_and_reload_temporaries_test(queue, knl, np.arange(10), debug) + + def test_missing_temporary_definition_detection(): knl = lp.make_kernel( "{ [i]: 0<=i<10 }", -- GitLab From cd37f6011ac2feb65faac18fd2e3b6ddd5e15415 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 23 May 2017 13:25:26 -0500 Subject: [PATCH 5/6] ValueError -> LoopyError --- loopy/transform/save.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index d3c4b9092..3d4f5c2d4 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -409,7 +409,7 @@ class TemporarySaver(object): elif isinstance(tag, LocalIndexTag): my_local_tags.append(tag) elif isinstance(tag, ParallelTag): - raise ValueError( + raise LoopyError( "iname '%s' is tagged with '%s' - only " "group and local tags are supported for " "auto save/reload of temporaries" % @@ -423,7 +423,7 @@ class TemporarySaver(object): if ( group_tags != _sortedtags(my_group_tags) or local_tags != _sortedtags(my_local_tags)): - raise ValueError( + raise LoopyError( "inconsistent parallel tags across instructions that access " "'%s' (specifically, instruction '%s' has tags '%s' but " "instruction '%s' has tags '%s')" -- 
GitLab From 33079bda3b2c1d56dc9b0fa1ee4347666775bf73 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 23 May 2017 13:33:07 -0500 Subject: [PATCH 6/6] Save and reload: Add more tests related to iname tagging. --- test/test_loopy.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/test/test_loopy.py b/test/test_loopy.py index e424e063f..e62e52a6c 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1363,6 +1363,48 @@ def test_save_with_base_storage(ctx_factory, debug=False): save_and_reload_temporaries_test(queue, knl, np.arange(10), debug) +def test_save_ambiguous_storage_requirements(): + knl = lp.make_kernel( + "{[i,j]: 0 <= i < 10 and 0 <= j < 10}", + """ + <>a[j] = j + ... gbarrier + out[i,j] = a[j] + """, + seq_dependencies=True) + + knl = lp.tag_inames(knl, dict(i="g.0", j="l.0")) + knl = lp.duplicate_inames(knl, "j", within="writes:out", tags={"j": "l.0"}) + knl = lp.set_temporary_scope(knl, "a", "local") + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + lp.save_and_reload_temporaries(knl) + + +def test_save_across_inames_with_same_tag(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0 <= i < 10}", + """ + <>a[i] = i + ... gbarrier + out[i] = a[i] + """, + "...", + seq_dependencies=True) + + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.duplicate_inames(knl, "i", within="reads:a", tags={"i": "l.0"}) + + save_and_reload_temporaries_test(queue, knl, np.arange(10), debug) + + def test_missing_temporary_definition_detection(): knl = lp.make_kernel( "{ [i]: 0<=i<10 }", -- GitLab