From 6d8dac69d48bbbd58f6e72a7d06f085aa19f5ca0 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 3 Nov 2016 12:06:33 -0500 Subject: [PATCH 01/22] [WIP] --- loopy/schedule/__init__.py | 139 ++++++- loopy/schedule/device_mapping.py | 525 +------------------------ loopy/target/c/compyte | 2 +- loopy/transform/spill.py | 655 +++++++++++++++++++++++++++++++ test/test_loopy.py | 6 +- 5 files changed, 793 insertions(+), 534 deletions(-) create mode 100644 loopy/transform/spill.py diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index d6eb1a32c..79d644ccf 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -519,6 +519,23 @@ class SchedulerState(Record): .. attribute:: unscheduled_insn_ids + .. attribute:: preschedule + + A sequence of schedule items that must be inserted into the + schedule, maintaining the same ordering + + .. attribute:: prescheduled_insn_ids + + A :class:`frozenset` of any instruction that started prescheduled + + .. attribute:: prescheduled_inames + + A :class:`frozenset` of any iname that started prescheduled + + .. attribute:: can_schedule_insns + + Whether an instruction can be the next item scheduled + .. attribute:: group_insn_counts A mapping from instruction group names to the number of instructions @@ -560,6 +577,11 @@ def generate_loop_schedules_internal( active_inames_set = frozenset(sched_state.active_inames) + next_preschedule_item = ( + sched_state.preschedule[0] + if len(sched_state.preschedule) > 0 + else None) + # {{{ decide about debug mode debug_mode = False @@ -578,6 +600,10 @@ def generate_loop_schedules_internal( print(75*"=") print("CURRENT SCHEDULE:") print(dump_schedule(sched_state.kernel, sched_state.schedule)) + if sched_state.preschedule: + print(75*"=") + print("PRESCHEDULED ITEMS AWAITING SCHEDULING:") + print(dump_schedule(sched_state.kernel, sched_state.preschedule)) #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP (inner: outer):") @@ -593,6 +619,32 @@ def generate_loop_schedules_internal( # }}} + # {{{ see if we have reached the start/end of kernel in the preschedule + + if isinstance(next_preschedule_item, CallKernel): + assert sched_state.can_schedule_insns == False + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:], + can_schedule_insns=True), + allow_boost=rec_allow_boost, + debug=debug): + yield result + + if isinstance(next_preschedule_item, ReturnFromKernel): + assert sched_state.can_schedule_insns == True + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:], + can_schedule_insns=False), + allow_boost=rec_allow_boost, + debug=debug): + yield result + + # }}} + # {{{ see if any insns are ready to be scheduled now # Also take note of insns that have a chance of being schedulable inside @@ -608,9 +660,16 @@ def generate_loop_schedules_internal( # schedule generation order. return (insn.priority, len(active_groups & insn.groups), insn.id) - insn_ids_to_try = sorted(sched_state.unscheduled_insn_ids, + insn_ids_to_try = sorted( + # Non-prescheduled instructions go first. + sched_state.unscheduled_insn_ids - sched_state.prescheduled_insn_ids, key=insn_sort_key, reverse=True) + insn_ids_to_try.extend( + item.insn_id + for item in sched_state.preschedule + if isinstance(item, RunInstruction)) + for insn_id in insn_ids_to_try: insn = kernel.id_to_insn[insn_id] @@ -646,6 +705,18 @@ def generate_loop_schedules_internal( print("instruction '%s' won't work under inames '%s'" % (format_insn(kernel, insn.id), ",".join(have-want))) + if not sched_state.can_schedule_insns: + if debug_mode: + print("can't schedule '%s' because not inside subkernel" % format_insn(kernel, insn.id)) + is_ready = False + + if insn_id in sched_state.prescheduled_insn_ids and not ( + isinstance(next_preschedule_item, RunInstruction) + and next_preschedule_item.insn_id == insn_id): + if debug_mode: + print("can't schedule '%s' because another preschedule instruction precedes it" % format_insn(kernel, insn.id)) + is_ready = False + # {{{ determine group-based readiness if insn.conflicts_with_groups & active_groups: @@ -702,6 +773,9 @@ def generate_loop_schedules_internal( unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set, schedule=( sched_state.schedule + (RunInstruction(insn_id=insn.id),)), + preschedule=( + sched_state.preschedule if insn_id not in sched_state.prescheduled_insn_ids + else sched_state.preschedule[1:]), active_group_counts=new_active_group_counts, uses_of_boostability=( sched_state.uses_of_boostability @@ -731,7 +805,17 @@ def generate_loop_schedules_internal( if last_entered_loop is not None: can_leave = True - if last_entered_loop not in sched_state.breakable_inames: + if ( + last_entered_loop in sched_state.prescheduled_inames + and not ( + isinstance(next_preschedule_item, LeaveLoop) + and next_preschedule_item.iname == last_entered_loop)): + # A prescheduled loop can only be left if the preschedule agrees. + if debug_mode: + print("cannot leave '%s' because of preschedule constraints" + % last_entered_loop) + can_leave = False + elif last_entered_loop not in sched_state.breakable_inames: # If the iname is not breakable, then check that we've # scheduled all the instructions that require it. @@ -798,12 +882,18 @@ def generate_loop_schedules_internal( break if can_leave and not debug_mode: + for sub_sched in generate_loop_schedules_internal( sched_state.copy( schedule=( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), - active_inames=sched_state.active_inames[:-1]), + active_inames=sched_state.active_inames[:-1], + preschedule=( + sched_state.preschedule + if last_entered_loop not in sched_state.prescheduled_inames + else sched_state.preschedule[1:]), + ), allow_boost=rec_allow_boost, debug=debug): yield sub_sched @@ -843,6 +933,15 @@ def generate_loop_schedules_internal( # {{{ check if scheduling this iname now is allowed/plausible + if ( + iname in sched_state.prescheduled_inames + and not ( + isinstance(next_preschedule_item, EnterLoop) + and next_preschedule_item.iname == iname)): + if debug_mode: + print("scheduling %s prohibited by preschedule constraints" % iname) + continue + currently_accessible_inames = ( active_inames_set | sched_state.parallel_inames) if ( @@ -996,6 +1095,10 @@ def generate_loop_schedules_internal( entered_inames=( sched_state.entered_inames | frozenset((iname,))), + preschedule=( + sched_state.preschedule + if iname not in sched_state.prescheduled_inames + else sched_state.preschedule[1:]), ), allow_boost=rec_allow_boost, debug=debug): @@ -1015,7 +1118,7 @@ def generate_loop_schedules_internal( if inp: raise ScheduleDebugInput(inp) - if not sched_state.active_inames and not sched_state.unscheduled_insn_ids: + if not sched_state.active_inames and not sched_state.unscheduled_insn_ids and not sched_state.preschedule: # if done, yield result debug.log_success(sched_state.schedule) @@ -1408,6 +1511,9 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): result.append(sched_item) candidates.add(sched_item.insn_id) + elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + pass + else: raise ValueError("unexpected schedule item type '%s'" % type(sched_item).__name__) @@ -1429,7 +1535,7 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): def generate_loop_schedules(kernel, debug_args={}): from loopy.kernel import kernel_state - if kernel.state != kernel_state.PREPROCESSED: + if kernel.state not in (kernel_state.PREPROCESSED, kernel_state.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") @@ -1440,6 +1546,17 @@ def generate_loop_schedules(kernel, debug_args={}): debug = ScheduleDebugger(**debug_args) + preschedule = kernel.schedule if kernel.state == kernel_state.SCHEDULED else () + + prescheduled_inames = set( + insn.iname + for insn in preschedule + if isinstance(insn, EnterLoop)) + prescheduled_insn_ids = set( + insn.insn_id + for insn in preschedule + if isinstance(insn, RunInstruction)) + from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag ilp_inames = set( iname @@ -1466,6 +1583,9 @@ def generate_loop_schedules(kernel, debug_args={}): ilp_inames=ilp_inames, vec_inames=vec_inames, + prescheduled_inames=prescheduled_inames, + prescheduled_insn_ids=prescheduled_insn_ids, + # time-varying part active_inames=(), entered_inames=frozenset(), @@ -1474,6 +1594,9 @@ def generate_loop_schedules(kernel, debug_args={}): unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), scheduled_insn_ids=frozenset(), + can_schedule_insns=kernel.state != kernel_state.SCHEDULED, + + preschedule=preschedule, # ilp and vec are not parallel for the purposes of the scheduler parallel_inames=parallel_inames - ilp_inames - vec_inames, @@ -1529,7 +1652,7 @@ def generate_loop_schedules(kernel, debug_args={}): gsize, lsize = kernel.get_grid_size_upper_bounds() - if gsize or lsize: + if (gsize or lsize): if not kernel.options.disable_global_barriers: logger.info("%s: barrier insertion: global" % kernel.name) @@ -1549,7 +1672,9 @@ def generate_loop_schedules(kernel, debug_args={}): from loopy.schedule.device_mapping import \ map_schedule_onto_host_or_device - new_kernel = map_schedule_onto_host_or_device(new_kernel) + if kernel.state != kernel_state.SCHEDULED: + # Device mapper only gets run once. + new_kernel = map_schedule_onto_host_or_device(new_kernel) yield new_kernel debug.start() diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index ca782a3d8..befc956af 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -23,11 +23,10 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError -from loopy.kernel.data import TemporaryVariable, temp_var_scope +from loopy.kernel.data import temp_var_scope from loopy.schedule import (Barrier, BeginBlockItem, CallKernel, EndBlockItem, EnterLoop, LeaveLoop, ReturnFromKernel, RunInstruction) -from pytools import Record, memoize_method def map_schedule_onto_host_or_device(kernel): @@ -53,8 +52,7 @@ def map_schedule_onto_host_or_device(kernel): kernel = map_schedule_onto_host_or_device_impl( kernel, device_prog_name_gen) - return restore_and_save_temporaries( - add_extra_args_to_schedule(kernel)) + return add_extra_args_to_schedule(kernel) # {{{ Schedule / instruction utilities @@ -76,42 +74,6 @@ def get_block_boundaries(schedule): block_bounds[idx] = start return block_bounds - -def get_hw_inames(kernel, insn): - """ - Return the inames that insn runs in and that are tagged as hardware - parallel. - """ - from loopy.kernel.data import HardwareParallelTag - return set(iname for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)) - - -def get_common_hw_inames(kernel, insn_ids): - """ - Return the common set of hardware parallel tagged inames among - the list of instructions. - """ - # Get the list of hardware inames in which the temporary is defined. - if len(insn_ids) == 0: - return set() - return set.intersection( - *(get_hw_inames(kernel, kernel.id_to_insn[id]) for id in insn_ids)) - - -def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel): - from loopy.kernel.data import HardwareParallelTag - new_schedule = [] - - for item in kernel.schedule: - if isinstance(item, (EnterLoop, LeaveLoop)): - tag = kernel.iname_to_tag.get(item.iname) - if isinstance(tag, HardwareParallelTag): - continue - new_schedule.append(item) - - return kernel.copy(schedule=new_schedule) - # }}} @@ -152,10 +114,6 @@ from functools import partial filter_temporaries = partial(filter_items_by_varname, lambda kernel, name: name in kernel.temporary_variables) -filter_scalar_temporaries = partial(filter_items_by_varname, - lambda kernel, name: name in kernel.temporary_variables and - len(kernel.temporary_variables[name].shape) == 0) - def get_use_set(insn, include_subscripts=True): """ @@ -177,35 +135,6 @@ def get_def_set(insn, include_subscripts=True): return result -def get_def_and_use_lists_for_all_temporaries(kernel): - """ - Return a pair `def_lists`, `use_lists` which map temporary variable - names to lists of instructions where they are defined or used. - """ - def_lists = dict((t, []) for t in kernel.temporary_variables) - use_lists = dict((t, []) for t in kernel.temporary_variables) - - for insn in kernel.instructions: - assignees = get_def_set(insn, include_subscripts=False) - dependencies = get_use_set(insn, include_subscripts=False) - - from pymbolic.primitives import Variable - - for assignee in assignees: - if isinstance(assignee, Variable): - assignee = assignee.name - if assignee in kernel.temporary_variables: - def_lists[assignee].append(insn.id) - - for dep in dependencies: - if isinstance(dep, Variable): - dep = dep.name - if dep in kernel.temporary_variables: - use_lists[dep].append(insn.id) - - return def_lists, use_lists - - def get_temporaries_defined_and_used_in_subrange( kernel, schedule, start_idx, end_idx): defs = set() @@ -227,456 +156,6 @@ def get_temporaries_defined_and_used_in_subrange( # }}} -# {{{ Liveness analysis - -def compute_live_temporaries(kernel, schedule): - """ - Compute live-in and live-out sets for temporary variables. - """ - live_in = [set() for i in range(len(schedule) + 1)] - live_out = [set() for i in range(len(schedule))] - - id_to_insn = kernel.id_to_insn - block_bounds = get_block_boundaries(schedule) - - # {{{ Liveness analysis implementation - - def compute_subrange_liveness(start_idx, end_idx): - idx = end_idx - while start_idx <= idx: - sched_item = schedule[idx] - if isinstance(sched_item, LeaveLoop): - start = block_bounds[idx] - live_in[idx] = live_out[idx] = live_in[idx + 1] - compute_subrange_liveness(start + 1, idx - 1) - prev_live_in = live_in[start].copy() - live_in[start] = live_out[start] = live_in[start + 1] - # Propagate live values through the loop. - if live_in[start] != prev_live_in: - live_out[idx] |= live_in[start] - live_in[idx] = live_out[idx] - compute_subrange_liveness(start + 1, idx - 1) - idx = start - 1 - - elif isinstance(sched_item, ReturnFromKernel): - start = block_bounds[idx] - live_in[idx] = live_out[idx] = live_in[idx + 1] - compute_subrange_liveness(start + 1, idx - 1) - live_in[start] = live_out[start] = live_in[start + 1] - idx = start - 1 - - elif isinstance(sched_item, RunInstruction): - live_out[idx] = live_in[idx + 1] - insn = id_to_insn[sched_item.insn_id] - defs = filter_scalar_temporaries(kernel, - get_def_set(insn, include_subscripts=False)) - uses = filter_temporaries(kernel, - get_use_set(insn, include_subscripts=False)) - live_in[idx] = (live_out[idx] - defs) | uses - idx -= 1 - - elif isinstance(sched_item, Barrier): - live_in[idx] = live_out[idx] = live_in[idx + 1] - idx -= 1 - else: - raise LoopyError("unexpected type of schedule item: %s" - % type(sched_item).__name__) - - # }}} - - # Compute live variables - compute_subrange_liveness(0, len(schedule) - 1) - live_in = live_in[:-1] - - if 0: - print(kernel) - print("Live-in values:") - for i, li in enumerate(live_in): - print("{}: {}".format(i, ", ".join(li))) - print("Live-out values:") - for i, lo in enumerate(live_out): - print("{}: {}".format(i, ", ".join(lo))) - - # Strip off subscripts. - live_in = [filter_out_subscripts(li) for li in live_in] - live_out = [filter_out_subscripts(lo) for lo in live_out] - - return live_in, live_out - -# }}} - - -# {{{ Temporary promotion - -class PromotedTemporary(Record): - """ - .. attribute:: name - - The name of the new temporary. - - .. attribute:: orig_temporary - - The original temporary variable object. - - .. attribute:: hw_inames - - The common list of hw axes that define the original object. - - .. attribute:: shape_prefix - - A list of expressions, to be added in front of the shape - of the promoted temporary value - """ - - @memoize_method - def as_variable(self): - temporary = self.orig_temporary - return TemporaryVariable( - name=self.name, - dtype=temporary.dtype, - scope=temp_var_scope.GLOBAL, - shape=self.new_shape) - - @property - def new_shape(self): - return self.shape_prefix + self.orig_temporary.shape - - -def determine_temporaries_to_promote(kernel, temporaries, name_gen): - """ - For each temporary in the passed list of temporaries, construct a - :class:`PromotedTemporary` which describes how the temporary should - get promoted into global storage. - - :returns: A :class:`dict` mapping temporary names from `temporaries` to - :class:`PromotedTemporary` objects - """ - new_temporaries = {} - - def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel) - - from loopy.kernel.data import LocalIndexTag - - for temporary in temporaries: - temporary = kernel.temporary_variables[temporary] - if temporary.scope == temp_var_scope.GLOBAL: - # Nothing to be done for global temporaries (I hope) - continue - - assert temporary.base_storage is None, \ - "Cannot promote temporaries with base_storage to global" - - # `hw_inames`: The set of hw-parallel tagged inames that this temporary - # is associated with. This is used for determining the shape of the - # global storage needed for saving and restoring the temporary across - # kernel calls. - # - # TODO: Make a policy decision about which dimensions to use. Currently, - # the code looks at each instruction that defines or uses the temporary, - # and takes the common set of hw-parallel tagged inames associated with - # these instructions. - # - # Furthermore, in the case of local temporaries, inames that are tagged - # hw-local do not contribute to the global storage shape. - hw_inames = get_common_hw_inames(kernel, - def_lists[temporary.name] + use_lists[temporary.name]) - - # This takes advantage of the fact that g < l in the alphabet :) - hw_inames = sorted(hw_inames, - key=lambda iname: str(kernel.iname_to_tag[iname])) - - # Calculate the sizes of the dimensions that get added in front for - # the global storage of the temporary. - shape_prefix = [] - - backing_hw_inames = [] - for iname in hw_inames: - tag = kernel.iname_to_tag[iname] - is_local_iname = isinstance(tag, LocalIndexTag) - if is_local_iname and temporary.scope == temp_var_scope.LOCAL: - # Restrict shape to that of group inames for locals. - continue - backing_hw_inames.append(iname) - from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import aff_to_expr - shape_prefix.append( - aff_to_expr( - static_max_of_pw_aff( - kernel.get_iname_bounds(iname).size, False))) - - backing_temporary = PromotedTemporary( - name=name_gen(temporary.name), - orig_temporary=temporary, - shape_prefix=tuple(shape_prefix), - hw_inames=backing_hw_inames) - new_temporaries[temporary.name] = backing_temporary - - return new_temporaries - -# }}} - - -# {{{ Domain augmentation - -def augment_domain_for_temporary_promotion( - kernel, domain, promoted_temporary, mode, name_gen): - """ - Add new axes to the domain corresponding to the dimensions of - `promoted_temporary`. - """ - import islpy as isl - - orig_temporary = promoted_temporary.orig_temporary - orig_dim = domain.dim(isl.dim_type.set) - dims_to_insert = len(orig_temporary.shape) - - iname_to_tag = {} - - # Add dimension-dependent inames. - dim_inames = [] - - domain = domain.add(isl.dim_type.set, dims_to_insert) - for t_idx in range(len(orig_temporary.shape)): - new_iname = name_gen("{name}_{mode}_dim_{dim}". - format(name=orig_temporary.name, - mode=mode, - dim=t_idx)) - domain = domain.set_dim_name( - isl.dim_type.set, orig_dim + t_idx, new_iname) - if orig_temporary.is_local: - # If the temporary is has local scope, then loads / stores can be - # done in parallel. - from loopy.kernel.data import AutoFitLocalIndexTag - iname_to_tag[new_iname] = AutoFitLocalIndexTag() - - dim_inames.append(new_iname) - - # Add size information. - aff = isl.affs_from_space(domain.space) - domain &= aff[0].le_set(aff[new_iname]) - size = orig_temporary.shape[t_idx] - from loopy.symbolic import aff_from_expr - domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size)) - - hw_inames = [] - - # Add hardware inames duplicates. - for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): - new_iname = name_gen("{name}_{mode}_hw_dim_{dim}". - format(name=orig_temporary.name, - mode=mode, - dim=t_idx)) - hw_inames.append(new_iname) - iname_to_tag[new_iname] = kernel.iname_to_tag[hw_iname] - - from loopy.isl_helpers import duplicate_axes - domain = duplicate_axes( - domain, promoted_temporary.hw_inames, hw_inames) - - # The operations on the domain above return a Set object, but the - # underlying domain should be expressible as a single BasicSet. - domain_list = domain.get_basic_set_list() - assert domain_list.n_basic_set() == 1 - domain = domain_list.get_basic_set(0) - return domain, hw_inames, dim_inames, iname_to_tag - -# }}} - - -def restore_and_save_temporaries(kernel): - """ - Add code that loads / spills the temporaries in the kernel which are - live across sub-kernel calls. - """ - # Compute live temporaries. - live_in, live_out = compute_live_temporaries(kernel, kernel.schedule) - - # Create kernel variables based on live temporaries. - inter_kernel_temporaries = set() - - call_count = 0 - for idx, sched_item in enumerate(kernel.schedule): - if isinstance(sched_item, CallKernel): - inter_kernel_temporaries |= filter_out_subscripts(live_in[idx]) - call_count += 1 - - if call_count == 1: - # Single call corresponds to a kernel which has not been split - - # no need for restores / spills of temporaries. - return kernel - - name_gen = kernel.get_var_name_generator() - new_temporaries = determine_temporaries_to_promote( - kernel, inter_kernel_temporaries, name_gen) - - # {{{ Insert loads and spills of new temporaries - - new_schedule = [] - new_instructions = [] - new_iname_to_tag = {} - - idx = 0 - schedule = kernel.schedule - while idx < len(schedule): - sched_item = schedule[idx] - - if not isinstance(sched_item, CallKernel): - new_schedule.append(sched_item) - idx += 1 - continue - - subkernel_prolog = [] - subkernel_epilog = [] - subkernel_schedule = [] - - start_idx = idx - idx += 1 - while not isinstance(schedule[idx], ReturnFromKernel): - subkernel_schedule.append(schedule[idx]) - idx += 1 - - subkernel_defs, subkernel_uses = \ - get_temporaries_defined_and_used_in_subrange( - kernel, schedule, start_idx + 1, idx - 1) - - from loopy.kernel.data import temp_var_scope - # Filter out temporaries that are global. - subkernel_globals = set( - tval for tval in subkernel_defs | subkernel_uses - if kernel.temporary_variables[tval].scope == temp_var_scope.GLOBAL) - - tvals_to_spill = (subkernel_defs - subkernel_globals) & live_out[idx] - # Need to load tvals_to_spill, to avoid overwriting entries that the - # code doesn't touch when doing the spill. - tvals_to_load = ((subkernel_uses - subkernel_globals) - | tvals_to_spill) & live_in[start_idx] - - # Add new arguments. - sched_item = sched_item.copy( - extra_args=sched_item.extra_args - + sorted(new_temporaries[tv].name - for tv in tvals_to_load | tvals_to_spill)) - - # {{{ Add all the loads and spills. - - def insert_loads_or_spills(tvals, mode): - assert mode in ["load", "spill"] - local_temporaries = set() - - code_block = \ - subkernel_prolog if mode == "load" else subkernel_epilog - - new_kernel = kernel - - for tval in tvals: - from loopy.kernel.tools import DomainChanger - tval_hw_inames = new_temporaries[tval].hw_inames - dchg = DomainChanger(new_kernel, - frozenset(sched_item.extra_inames + tval_hw_inames)) - domain = dchg.domain - - domain, hw_inames, dim_inames, itt = \ - augment_domain_for_temporary_promotion( - new_kernel, domain, new_temporaries[tval], mode, - name_gen) - new_iname_to_tag.update(itt) - - new_kernel = dchg.get_kernel_with(domain) - - # Add the load / spill instruction. - insn_id = name_gen("{name}.{mode}".format(name=tval, mode=mode)) - - def subscript_or_var(agg, subscript): - from pymbolic.primitives import Subscript, Variable - if len(subscript) == 0: - return Variable(agg) - else: - return Subscript( - Variable(agg), - tuple(map(Variable, subscript))) - - args = ( - subscript_or_var( - tval, dim_inames), - subscript_or_var( - new_temporaries[tval].name, hw_inames + dim_inames)) - - if mode == "spill": - args = reversed(args) - - from loopy.kernel.data import Assignment - new_insn = Assignment(*args, id=insn_id, - within_inames=frozenset(hw_inames + dim_inames), - within_inames_is_final=True) - - new_instructions.append(new_insn) - - loop_begin = [EnterLoop(iname=iname) for iname in dim_inames] - loop_end = list(reversed([ - LeaveLoop(iname=iname) for iname in dim_inames])) - code_block.extend( - loop_begin + - [RunInstruction(insn_id=insn_id)] + - loop_end) - if new_temporaries[tval].orig_temporary.is_local: - local_temporaries.add(new_temporaries[tval].name) - - # After loading / before spilling local temporaries, we need to - # insert a barrier. - if local_temporaries: - if mode == "load": - subkernel_prolog.append( - Barrier(kind="local", - comment="for loads of {0}".format( - ", ".join(sorted(local_temporaries))))) - else: - subkernel_epilog.insert(0, - Barrier(kind="local", - comment="for spills of {0}".format( - ", ".join(sorted(local_temporaries))))) - return new_kernel - - kernel = insert_loads_or_spills(tvals_to_load, "load") - kernel = insert_loads_or_spills(tvals_to_spill, "spill") - - # }}} - - new_schedule.extend( - [sched_item] + - subkernel_prolog + - subkernel_schedule + - subkernel_epilog + - # ReturnFromKernel - [schedule[idx]]) - - # ReturnFromKernel - idx += 1 - - # }}} - - new_iname_to_tag.update(kernel.iname_to_tag) - updated_temporary_variables = dict( - (t.name, t.as_variable()) for t in new_temporaries.values()) - updated_temporary_variables.update(kernel.temporary_variables) - - kernel = kernel.copy( - iname_to_tag=new_iname_to_tag, - temporary_variables=updated_temporary_variables, - instructions=kernel.instructions + new_instructions, - schedule=new_schedule - ) - - from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) - - # Once assign_automatic_axes() does its job, loops in the schedule - # for newly hardware-tagged inames are no longer necessary (and in - # fact illegal), so remove them. - kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel) - - return kernel - - def add_extra_args_to_schedule(kernel): """ Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel` diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 11dc00352..ac1c71d46 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 11dc00352423cddd71f09e809d0a22ab1c3ea7a5 +Subproject commit ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc diff --git a/loopy/transform/spill.py b/loopy/transform/spill.py new file mode 100644 index 000000000..3a348fb8d --- /dev/null +++ b/loopy/transform/spill.py @@ -0,0 +1,655 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2016 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import six + +from loopy.diagnostic import LoopyError +import loopy as lp + +from loopy.kernel.data import auto +from loopy.kernel.instruction import BarrierInstruction +from pytools import memoize_method, Record +from loopy.schedule import ( + EnterLoop, LeaveLoop, RunInstruction, + CallKernel, ReturnFromKernel, Barrier) + + +import logging +logger = logging.getLogger(__name__) + + +# {{{ instruction query utility + +class InstructionQuery(object): + + def __init__(self, kernel): + self.kernel = kernel + block_bounds = get_block_boundaries(kernel.schedule) + subkernel_slices = {} + from six import iteritems + for start, end in iteritems(block_bounds): + sched_item = kernel.schedule[start] + if isinstance(sched_item, CallKernel): + subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) + self.subkernel_slices = subkernel_slices + + @memoize_method + def subkernel_order(self): + pass + + @memoize_method + def insns_reading_or_writing(self, var): + return frozenset(insn.id for insn in self.kernel.instructions + if var in insn.read_dependency_names() + or var in insn.assignee_var_names()) + + @memoize_method + def insns_in_subkernel(self, subkernel): + return frozenset(sched_item.insn_id for sched_item + in self.kernel.schedule[self.subkernel_slices[subkernel]] + if isinstance(sched_item, RunInstruction)) + + @memoize_method + def inames_in_subkernel(self, subkernel): + return frozenset(self.kernel.schedule[self.subkernel_slices[subkernel].start].extra_inames) + + @memoize_method + def hw_inames(self, insn_id): + """ + Return the inames that insn runs in and that are tagged as hardware + parallel. + """ + from loopy.kernel.data import HardwareParallelTag + return set(iname for iname in self.kernel.insn_inames(insn_id) + if isinstance(self.kernel.iname_to_tag.get(iname), HardwareParallelTag)) + + @memoize_method + def common_hw_inames(self, insn_ids): + """ + Return the common set of hardware parallel tagged inames among + the list of instructions. + """ + # Get the list of hardware inames in which the temporary is defined. + if len(insn_ids) == 0: + return set() + return set.intersection(*(self.hw_inames(id) for id in insn_ids)) + +# }}} + + +def get_block_boundaries(schedule): + """ + Return a dictionary mapping indices of + :class:`loopy.schedule.BlockBeginItem`s to + :class:`loopy.schedule.BlockEndItem`s and vice versa. + """ + from loopy.schedule import (BeginBlockItem, EndBlockItem) + block_bounds = {} + active_blocks = [] + for idx, sched_item in enumerate(schedule): + if isinstance(sched_item, BeginBlockItem): + active_blocks.append(idx) + elif isinstance(sched_item, EndBlockItem): + start = active_blocks.pop() + block_bounds[start] = idx + block_bounds[idx] = start + return block_bounds + + +# {{{ liveness analysis + +class LivenessResult(dict): + + class InstructionResult(Record): + __slots__ = ["live_in", "live_out"] + + @classmethod + def make_empty(cls, nscheditems): + return cls((idx, cls.InstructionResult(live_in=set(), live_out=set())) + for idx in range(nscheditems)) + + +class LivenessAnalysis(object): + + def __init__(self, kernel): + self.kernel = kernel + self.schedule = self.kernel.schedule + + @memoize_method + def get_successor_relation(self): + successors = {} + block_bounds = get_block_boundaries(self.kernel.schedule) + + for idx, (item, next_item) in enumerate(zip( + reversed(self.schedule), + reversed(self.schedule + [None]))): + sched_idx = len(self.schedule) - idx - 1 + + # Look at next_item + if next_item is None: + after = set() + elif isinstance(next_item, EnterLoop): + # Account for empty loop + loop_end = block_bounds[sched_idx + 1] + after = successors[loop_end] | set([sched_idx + 1]) + elif isinstance(next_item, (LeaveLoop, RunInstruction, + CallKernel, ReturnFromKernel, Barrier)): + after = set([sched_idx + 1]) + else: + raise LoopyError("unexpected type of schedule item: {ty}" + .format(ty=type(next_item).__name__)) + + # Look at item + if isinstance(item, LeaveLoop): + # Account for loop + loop_begin = block_bounds[sched_idx] + after |= set([loop_begin]) + elif not isinstance(item, (EnterLoop, RunInstruction, + CallKernel, ReturnFromKernel, Barrier)): + raise LoopyError("unexpected type of schedule item: {ty}" + .format(ty=type(item).__name__)) + + successors[sched_idx] = after + + return successors + + def get_gen_and_kill_sets(self): + gen = dict((idx, set()) for idx in range(len(self.schedule))) + kill = dict((idx, set()) for idx in range(len(self.schedule))) + + for sched_idx, sched_item in enumerate(self.schedule): + if not isinstance(sched_item, RunInstruction): + continue + insn = self.kernel.id_to_insn[sched_item.insn_id] + for var in insn.assignee_var_names(): + if var not in self.kernel.temporary_variables: + continue + if not insn.predicates: + # Fully kills the liveness only when unconditional. + kill[sched_idx].add(var) + if len(self.kernel.temporary_variables[var].shape) > 0: + # For an array variable, all definitions generate a use as + # well, because the write could be a partial write, + # necessitating a reload of whatever is not written. + # + # We don't currently check if the write is a partial write + # or a full write. Instead, we analyze the access + # footprint later on to determine how much to reload/spill. + gen[sched_idx].add(var) + for var in insn.read_dependency_names(): + if var not in self.kernel.temporary_variables: + continue + gen[sched_idx].add(var) + + return gen, kill + + @memoize_method + def liveness(self): + logging.info("running liveness analysis") + successors = self.get_successor_relation() + gen, kill = self.get_gen_and_kill_sets() + + # Fixed point iteration for liveness analysis + lr = LivenessResult.make_empty(len(self.schedule)) + + prev_lr = None + + while prev_lr != lr: + from copy import deepcopy + prev_lr = deepcopy(lr) + for idx in range(len(self.schedule) - 1, -1, -1): + for succ in successors[idx]: + lr[idx].live_out.update(lr[succ].live_in) + lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx]) + + logging.info("done running liveness analysis") + + return lr + + def print_liveness(self): + print(75 * "-") + print("LIVE IN:") + for sched_idx, sched_item in enumerate(self.schedule): + print("{item}: {{{vars}}}".format( + item=sched_idx, + vars=", ".join(sorted(self[sched_idx].live_in)))) + print(75 * "-") + print("LIVE OUT:") + for sched_idx, sched_item in enumerate(self.schedule): + print("{item}: {{{vars}}}".format( + item=sched_idx, + vars=", ".join(sorted(self[sched_idx].live_out)))) + print(75 * "-") + + def __getitem__(self, sched_idx): + """ + :arg insn: An instruction name or instance of + :class:`loopy.instruction.InstructionBase` + + :returns: A :class:`LivenessResult` associated with `insn` + """ + return self.liveness()[sched_idx] + +# }}} + + +# {{{ spill and reload implementation + +class Spiller(object): + + class PromotedTemporary(Record): + """ + .. attribute:: name + + The name of the new temporary. + + .. attribute:: orig_temporary + + The original temporary variable object. + + .. attribute:: hw_inames + + The common list of hw axes that define the original object. + + .. attribute:: shape_prefix + + A list of expressions, to be added in front of the shape + of the promoted temporary value + """ + + @memoize_method + def as_variable(self): + temporary = self.orig_temporary + from loopy.kernel.data import TemporaryVariable, temp_var_scope + return TemporaryVariable( + name=self.name, + dtype=temporary.dtype, + scope=temp_var_scope.GLOBAL, + shape=self.new_shape) + + @property + def new_shape(self): + return self.shape_prefix + self.orig_temporary.shape + + def __init__(self, kernel): + self.kernel = kernel + self.insn_query = InstructionQuery(kernel) + self.var_name_gen = kernel.get_var_name_generator() + self.insn_name_gen = kernel.get_instruction_id_generator() + # These fields keep track of updates to the kernel. + self.insns_to_insert = [] + self.insns_to_update = {} + self.extra_args_to_add = {} + self.updated_iname_to_tag = {} + self.updated_temporary_variables = {} + # i.e. the "extra_args" field of CallKernel + self.updated_extra_args = {} + + @memoize_method + def auto_promote_temporary(self, temporary_name): + temporary = self.kernel.temporary_variables[temporary_name] + + from loopy.kernel.data import temp_var_scope + if temporary.scope == temp_var_scope.GLOBAL: + # Nothing to be done for global temporaries (I hope) + return None + + if temporary.base_storage is not None: + raise ValueError( + "Cannot promote temporaries with base_storage to global") + + # `hw_inames`: The set of hw-parallel tagged inames that this temporary + # is associated with. This is used for determining the shape of the + # global storage needed for saving and restoring the temporary across + # kernel calls. + # + # TODO: Make a policy decision about which dimensions to use. Currently, + # the code looks at each instruction that defines or uses the temporary, + # and takes the common set of hw-parallel tagged inames associated with + # these instructions. + # + # Furthermore, in the case of local temporaries, inames that are tagged + # hw-local do not contribute to the global storage shape. + hw_inames = self.insn_query.common_hw_inames( + self.insn_query.insns_reading_or_writing(temporary.name)) + + # We want hw_inames to be arranged according to the order: + # g.0 < g.1 < ... < l.0 < l.1 < ... + # Sorting lexicographically accomplishes this. + hw_inames = sorted(hw_inames, + key=lambda iname: str(self.kernel.iname_to_tag[iname])) + + # Calculate the sizes of the dimensions that get added in front for + # the global storage of the temporary. + shape_prefix = [] + + backing_hw_inames = [] + for iname in hw_inames: + tag = self.kernel.iname_to_tag[iname] + from loopy.kernel.data import LocalIndexTag + is_local_iname = isinstance(tag, LocalIndexTag) + if is_local_iname and temporary.scope == temp_var_scope.LOCAL: + # Restrict shape to that of group inames for locals. + continue + backing_hw_inames.append(iname) + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import aff_to_expr + shape_prefix.append( + aff_to_expr( + static_max_of_pw_aff( + self.kernel.get_iname_bounds(iname).size, False))) + + backing_temporary = self.PromotedTemporary( + name=self.var_name_gen(temporary.name + ".spill_slot"), + orig_temporary=temporary, + shape_prefix=tuple(shape_prefix), + hw_inames=backing_hw_inames) + + return backing_temporary + + def spill_or_reload_impl(self, temporary, subkernel, mode, + promoted_temporary=lp.auto): + assert mode in ("spill", "reload") + + if promoted_temporary is auto: + promoted_temporary = self.auto_promote_temporary(temporary) + + if promoted_temporary is None: + return + + from loopy.kernel.tools import DomainChanger + dchg = DomainChanger( + self.kernel, + frozenset( + self.insn_query.inames_in_subkernel(subkernel) | + set(promoted_temporary.hw_inames))) + + domain, hw_inames, dim_inames, iname_to_tag = \ + self.augment_domain_for_spill_or_reload( + dchg.domain, promoted_temporary, mode) + + self.kernel = dchg.get_kernel_with(domain) + + spill_or_load_insn_id = self.insn_name_gen( + "{name}.{mode}".format(name=temporary, mode=mode)) + + def subscript_or_var(agg, subscript=()): + from pymbolic.primitives import Subscript, Variable + if len(subscript) == 0: + return Variable(agg) + else: + return Subscript( + Variable(agg), + tuple(map(Variable, subscript))) + + args = ( + subscript_or_var( + temporary, dim_inames), + subscript_or_var( + promoted_temporary.name, hw_inames + dim_inames)) + + if subkernel in self.updated_extra_args: + self.updated_extra_args[subkernel].append(promoted_temporary.name) + else: + self.updated_extra_args[subkernel] = [promoted_temporary.name] + + if mode == "spill": + args = reversed(args) + + accessing_insns_in_subkernel = ( + self.insn_query.insns_reading_or_writing(temporary) & + self.insn_query.insns_in_subkernel(subkernel)) + + if mode == "spill": + depends_on = accessing_insns_in_subkernel + update_deps = frozenset() + elif mode == "reload": + depends_on = frozenset() + update_deps = accessing_insns_in_subkernel + + # Create the load / store instruction. + from loopy.kernel.data import Assignment + spill_or_load_insn = Assignment( + *args, + id=spill_or_load_insn_id, + within_inames=self.insn_query.inames_in_subkernel(subkernel) | + frozenset(hw_inames + dim_inames), + within_inames_is_final=True, + depends_on=depends_on, + boostable=False, + boostable_into=frozenset()) + + self.insns_to_insert.append(spill_or_load_insn) + + for insn_id in update_deps: + insn = self.insns_to_update.get(insn_id, self.kernel.id_to_insn[insn_id]) + self.insns_to_update[insn_id] = insn.copy( + depends_on=insn.depends_on | frozenset([spill_or_load_insn_id])) + + self.updated_temporary_variables[promoted_temporary.name] = \ + promoted_temporary.as_variable() + + self.updated_iname_to_tag.update(iname_to_tag) + + @memoize_method + def finish(self): + new_instructions = [] + + for orig_insn in self.kernel.instructions: + if orig_insn.id in self.insns_to_update: + new_instructions.append(self.insns_to_update[orig_insn.id]) + else: + new_instructions.append(orig_insn) + new_instructions.extend(self.insns_to_insert) + + new_schedule = [] + for sched_item in self.kernel.schedule: + if (isinstance(sched_item, CallKernel) and + sched_item.kernel_name in self.updated_extra_args): + new_schedule.append( + sched_item.copy(extra_args=( + sched_item.extra_args + + self.updated_extra_args[sched_item.kernel_name]))) + else: + new_schedule.append(sched_item) + + self.updated_iname_to_tag.update(self.kernel.iname_to_tag) + self.updated_temporary_variables.update(self.kernel.temporary_variables) + + return self.kernel.copy( + schedule=new_schedule, + instructions=new_instructions, + iname_to_tag=self.updated_iname_to_tag, + temporary_variables=self.updated_temporary_variables) + + def spill(self, temporary, subkernel): + self.spill_or_reload_impl(temporary, subkernel, "spill") + + def reload(self, temporary, subkernel): + self.spill_or_reload_impl(temporary, subkernel, "reload") + + def get_access_footprint_in_subkernel(self, temporary, subkernel, kind): + # FIXME: Return some sort of actual non-trivial access footprint. + assert kind in ("read", "write") + + def augment_domain_for_spill_or_reload(self, + domain, promoted_temporary, mode): + """ + Add new axes to the domain corresponding to the dimensions of + `promoted_temporary`. These axes will be used in the spill/ + reload stage. + """ + assert mode in ("spill", "reload") + import islpy as isl + + orig_temporary = promoted_temporary.orig_temporary + orig_dim = domain.dim(isl.dim_type.set) + dims_to_insert = len(orig_temporary.shape) + + # Tags for newly added inames + iname_to_tag = {} + + # Add dimension-dependent inames. + dim_inames = [] + domain = domain.add(isl.dim_type.set, dims_to_insert) + for t_idx in range(len(orig_temporary.shape)): + new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}". + format(name=orig_temporary.name, + mode=mode, + dim=t_idx)) + domain = domain.set_dim_name( + isl.dim_type.set, orig_dim + t_idx, new_iname) + if orig_temporary.is_local: + # If the temporary has local scope, then loads / stores can + # be done in parallel. + #from loopy.kernel.data import AutoFitLocalIndexTag + #iname_to_tag[new_iname] = AutoFitLocalIndexTag() + pass + + dim_inames.append(new_iname) + + # Add size information. + aff = isl.affs_from_space(domain.space) + domain &= aff[0].le_set(aff[new_iname]) + size = orig_temporary.shape[t_idx] + from loopy.symbolic import aff_from_expr + domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size)) + + hw_inames = [] + + # Add hardware inames duplicates. + for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): + new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}". + format(name=orig_temporary.name, + mode=mode, + dim=t_idx)) + hw_inames.append(new_iname) + iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname] + + from loopy.isl_helpers import duplicate_axes + domain = duplicate_axes( + domain, promoted_temporary.hw_inames, hw_inames) + + # The operations on the domain above return a Set object, but the + # underlying domain should be expressible as a single BasicSet. + domain_list = domain.get_basic_set_list() + assert domain_list.n_basic_set() == 1 + domain = domain_list.get_basic_set(0) + return domain, hw_inames, dim_inames, iname_to_tag + +# }}} + + +# {{{ auto spill and reload across kernel calls + +""" +TODO: +- flake8ify +- add TODO comments +- document +- assert kernel is scheduled etc +- write a bunch of tests +""" + +def spill_and_reload(knl, **kwargs): + """ + Add instructions to spill and reload temporary variables that are live + across kernel calls. + + The basic code transformation turns schedule segments: + + t = <...> + + <...> = t + + into this code: + + t = <...> + t.spill_slot = t + + t = t.spill_slot + <...> = t + + where `t.spill_slot` is a newly-created global temporary variable. + + :arg knl: + :arg barriers: + :returns: + """ + liveness = LivenessAnalysis(knl) + spiller = Spiller(knl) + + liveness.print_liveness() + + for sched_idx, sched_item in enumerate(knl.schedule): + # TODO: Rematerialization + if isinstance(sched_item, ReturnFromKernel): + for temporary in liveness[sched_idx].live_in: + logger.info("spilling {0} before return of {1}" + .format(temporary, sched_item.kernel_name)) + spiller.spill(temporary, sched_item.kernel_name) + + elif isinstance(sched_item, CallKernel): + for temporary in liveness[sched_idx].live_out: + logger.info("reloading {0} at entry of {1}" + .format(temporary, sched_item.kernel_name)) + spiller.reload(temporary, sched_item.kernel_name) + + return spiller.finish() + +# }}} + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + import loopy as lp + knl = lp.make_kernel( + "{ [i,j]: 0<=i,j<10 }", + """ + for i + <> t_private[i] = 1 {id=define_t_private} + <> j_ = 1 {id=definej} + for j + ... gbarrier {id=bar} + out[j] = j_ {id=setout,dep=bar} + ... gbarrier {id=barx,dep=define_t_private,dep=setout} + j_ = 10 {id=j1,dep=barx} + end + ... gbarrier {id=meow,dep=barx} + out[i] = t_private[i] {dep=meow} + end + """) + + #knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + + print("SCHEDULED INITIALLY", knl) + + knl = spill_and_reload(knl) + + knl = lp.get_one_scheduled_kernel(knl) + print(knl) + +# vim: foldmethod=marker diff --git a/test/test_loopy.py b/test/test_loopy.py index cf97ead38..5f0d4c670 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1101,8 +1101,6 @@ def test_kernel_splitting_with_loop(ctx_factory): def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory): ctx = ctx_factory() - pytest.xfail("spilling doesn't yet use local axes") - knl = lp.make_kernel( "{ [i,k]: 0<=i Date: Sat, 5 Nov 2016 01:50:21 -0500 Subject: [PATCH 02/22] Initial spill and reload implementation. --- loopy/codegen/control.py | 11 +- loopy/kernel/creation.py | 1 + loopy/kernel/instruction.py | 18 +++ loopy/schedule/__init__.py | 165 ++++++++++++++----- loopy/schedule/device_mapping.py | 138 +--------------- loopy/transform/spill.py | 267 +++++++++++-------------------- test/test_loopy.py | 140 ++++++++++------ 7 files changed, 342 insertions(+), 398 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 3378ed81e..f0d73bf4a 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -150,8 +150,15 @@ def generate_code_for_sched_index(codegen_state, sched_index): return func(codegen_state, sched_index) elif isinstance(sched_item, Barrier): - return codegen_state.ast_builder.emit_barrier( - sched_item.kind, sched_item.comment) + if codegen_state.is_generating_device_code: + return codegen_state.ast_builder.emit_barrier( + sched_item.kind, sched_item.comment) + from loopy.codegen.result import CodeGenerationResult + return CodeGenerationResult( + host_program=None, + device_programs=[], + implemented_domains={}, + implemented_data_info=codegen_state.implemented_data_info) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index ff3bf16bc..ac41531f9 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -152,6 +152,7 @@ def get_default_insn_options_dict(): "depends_on": None, "depends_on_is_final": False, "no_sync_with": None, + "no_global_sync_with": None, "groups": frozenset(), "conflicts_with_groups": frozenset(), "insn_id": None, diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index c54d1fc32..a7904f30e 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -84,6 +84,8 @@ class InstructionBase(Record): with which no barrier synchronization is necessary, even given the existence of a dependency chain and apparently conflicting access + .. attribute:: no_global_sync_with + .. rubric:: Conditionals .. attribute:: predicates @@ -126,6 +128,7 @@ class InstructionBase(Record): fields = set("id depends_on depends_on_is_final " "groups conflicts_with_groups " "no_sync_with " + "no_global_sync_with " "predicates " "within_inames_is_final within_inames " "priority boostable boostable_into".split()) @@ -133,6 +136,7 @@ class InstructionBase(Record): def __init__(self, id, depends_on, depends_on_is_final, groups, conflicts_with_groups, no_sync_with, + no_global_sync_with, within_inames_is_final, within_inames, priority, boostable, boostable_into, predicates, tags, @@ -194,6 +198,9 @@ class InstructionBase(Record): if no_sync_with is None: no_sync_with = frozenset() + if no_global_sync_with is None: + no_global_sync_with = frozenset() + if within_inames is None: within_inames = frozenset() @@ -238,6 +245,7 @@ class InstructionBase(Record): depends_on=depends_on, depends_on_is_final=depends_on_is_final, no_sync_with=no_sync_with, + no_global_sync_with=no_global_sync_with, groups=groups, conflicts_with_groups=conflicts_with_groups, within_inames_is_final=within_inames_is_final, within_inames=within_inames, @@ -722,6 +730,7 @@ class Assignment(MultiAssignmentBase): groups=None, conflicts_with_groups=None, no_sync_with=None, + no_global_sync_with=None, within_inames_is_final=None, within_inames=None, boostable=None, boostable_into=None, tags=None, @@ -737,6 +746,7 @@ class Assignment(MultiAssignmentBase): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, + no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, boostable=boostable, @@ -874,6 +884,7 @@ class CallInstruction(MultiAssignmentBase): groups=None, conflicts_with_groups=None, no_sync_with=None, + no_global_sync_with=None, within_inames_is_final=None, within_inames=None, boostable=None, boostable_into=None, tags=None, @@ -890,6 +901,7 @@ class CallInstruction(MultiAssignmentBase): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, + no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, boostable=boostable, @@ -1060,6 +1072,7 @@ class CInstruction(InstructionBase): id=None, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, + no_global_sync_with=None, within_inames_is_final=None, within_inames=None, priority=0, boostable=None, boostable_into=None, predicates=frozenset(), tags=None, @@ -1080,6 +1093,7 @@ class CInstruction(InstructionBase): depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, + no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, boostable=boostable, @@ -1240,6 +1254,7 @@ class NoOpInstruction(_DataObliviousInstruction): def __init__(self, id=None, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, + no_global_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, boostable=None, boostable_into=None, @@ -1251,6 +1266,7 @@ class NoOpInstruction(_DataObliviousInstruction): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, + no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, @@ -1291,6 +1307,7 @@ class BarrierInstruction(_DataObliviousInstruction): def __init__(self, id, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, + no_global_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, boostable=None, boostable_into=None, @@ -1307,6 +1324,7 @@ class BarrierInstruction(_DataObliviousInstruction): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, + no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index fe238e13e..0a1db247a 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -399,6 +399,17 @@ def get_priority_tiers(wanted, priorities): for tier in get_priority_tiers(wanted, priorities): yield tier + +def sched_item_to_insn_id(sched_item): + # Helper for use in generator expressions, i.e. + # (... for insn_id in sched_item_to_insn_id(item) ...) + if isinstance(sched_item, RunInstruction): + yield sched_item.insn_id + elif isinstance(sched_item, Barrier): + if (hasattr(sched_item, "originating_insn_id") + and sched_item.originating_insn_id is not None): + yield sched_item.originating_insn_id + # }}} @@ -572,6 +583,10 @@ class SchedulerState(Record): A :class:`frozenset` of all inames ever entered. + .. attribute:: enclosing_subkernel_inames + + The inames of the last entered subkernel + .. attribute:: schedule .. attribute:: scheduled_insn_ids @@ -591,9 +606,13 @@ class SchedulerState(Record): A :class:`frozenset` of any iname that started prescheduled - .. attribute:: can_schedule_insns + .. attribute:: may_schedule_global_barriers + + Whether global barrier scheduling is allowed - Whether an instruction can be the next item scheduled + .. attribute:: within_subkernel + + Whether the scheduler is inside a subkernel .. attribute:: group_insn_counts @@ -638,8 +657,8 @@ def generate_loop_schedules_internal( next_preschedule_item = ( sched_state.preschedule[0] - if len(sched_state.preschedule) > 0 - else None) + if len(sched_state.preschedule) > 0 + else None) # {{{ decide about debug mode @@ -681,26 +700,31 @@ def generate_loop_schedules_internal( # {{{ see if we have reached the start/end of kernel in the preschedule if isinstance(next_preschedule_item, CallKernel): - assert sched_state.can_schedule_insns == False + assert sched_state.within_subkernel is False for result in generate_loop_schedules_internal( sched_state.copy( schedule=sched_state.schedule + (next_preschedule_item,), preschedule=sched_state.preschedule[1:], - can_schedule_insns=True), + within_subkernel=True, + may_schedule_global_barriers=False, + enclosing_subkernel_inames=sched_state.active_inames), allow_boost=rec_allow_boost, debug=debug): yield result if isinstance(next_preschedule_item, ReturnFromKernel): - assert sched_state.can_schedule_insns == True - for result in generate_loop_schedules_internal( - sched_state.copy( - schedule=sched_state.schedule + (next_preschedule_item,), - preschedule=sched_state.preschedule[1:], - can_schedule_insns=False), - allow_boost=rec_allow_boost, - debug=debug): - yield result + assert sched_state.within_subkernel is True + # Make sure all subkernel inames have finished. + if sched_state.active_inames == sched_state.enclosing_subkernel_inames: + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:], + within_subkernel=False, + may_schedule_global_barriers=True), + allow_boost=rec_allow_boost, + debug=debug): + yield result # }}} @@ -725,9 +749,9 @@ def generate_loop_schedules_internal( key=insn_sort_key, reverse=True) insn_ids_to_try.extend( - item.insn_id + insn_id for item in sched_state.preschedule - if isinstance(item, RunInstruction)) + for insn_id in sched_item_to_insn_id(item)) for insn_id in insn_ids_to_try: insn = kernel.id_to_insn[insn_id] @@ -764,17 +788,40 @@ def generate_loop_schedules_internal( print("instruction '%s' won't work under inames '%s'" % (format_insn(kernel, insn.id), ",".join(have-want))) - if not sched_state.can_schedule_insns: - if debug_mode: - print("can't schedule '%s' because not inside subkernel" % format_insn(kernel, insn.id)) - is_ready = False + # {{{ check if scheduling this insn is compatible with preschedule - if insn_id in sched_state.prescheduled_insn_ids and not ( - isinstance(next_preschedule_item, RunInstruction) - and next_preschedule_item.insn_id == insn_id): - if debug_mode: - print("can't schedule '%s' because another preschedule instruction precedes it" % format_insn(kernel, insn.id)) - is_ready = False + if insn_id in sched_state.prescheduled_insn_ids: + try: + next_preschedule_insn_id = next( + sched_item_to_insn_id(next_preschedule_item)) + except StopIteration: + next_preschedule_insn_id = None + + if next_preschedule_insn_id != insn_id: + if debug_mode: + print("can't schedule '%s' because another preschedule " + "instruction precedes it" % format_insn(kernel, insn.id)) + is_ready = False + + # }}} + + # {{{ check if scheduler state allows insn scheduling + + from loopy.kernel.instruction import BarrierInstruction + if isinstance(insn, BarrierInstruction) and insn.kind == "global": + if not sched_state.may_schedule_global_barriers: + if debug_mode: + print("can't schedule '%s' because global barriers are " + "not currently allowed" % format_insn(kernel, insn.id)) + is_ready = False + else: + if not sched_state.within_subkernel: + if debug_mode: + print("can't schedule '%s' because not within subkernel" + % format_insn(kernel, insn.id)) + is_ready = False + + # }}} # {{{ determine group-based readiness @@ -833,7 +880,8 @@ def generate_loop_schedules_internal( schedule=( sched_state.schedule + (RunInstruction(insn_id=insn.id),)), preschedule=( - sched_state.preschedule if insn_id not in sched_state.prescheduled_insn_ids + sched_state.preschedule + if insn_id not in sched_state.prescheduled_insn_ids else sched_state.preschedule[1:]), active_group_counts=new_active_group_counts, uses_of_boostability=( @@ -950,7 +998,8 @@ def generate_loop_schedules_internal( active_inames=sched_state.active_inames[:-1], preschedule=( sched_state.preschedule - if last_entered_loop not in sched_state.prescheduled_inames + if last_entered_loop + not in sched_state.prescheduled_inames else sched_state.preschedule[1:]), ), allow_boost=rec_allow_boost, debug=debug): @@ -998,7 +1047,8 @@ def generate_loop_schedules_internal( isinstance(next_preschedule_item, EnterLoop) and next_preschedule_item.iname == iname)): if debug_mode: - print("scheduling %s prohibited by preschedule constraints" % iname) + print("scheduling %s prohibited by preschedule constraints" + % iname) continue currently_accessible_inames = ( @@ -1185,7 +1235,10 @@ def generate_loop_schedules_internal( if inp: raise ScheduleDebugInput(inp) - if not sched_state.active_inames and not sched_state.unscheduled_insn_ids and not sched_state.preschedule: + if ( + not sched_state.active_inames + and not sched_state.unscheduled_insn_ids + and not sched_state.preschedule): # if done, yield result debug.log_success(sched_state.schedule) @@ -1240,6 +1293,7 @@ def convert_barrier_instructions_to_barriers(kernel, schedule): insn = kernel.id_to_insn[sched_item.insn_id] if isinstance(insn, BarrierInstruction): result.append(Barrier( + comment="from instruction '{0}'".format(insn.id), kind=insn.kind, originating_insn_id=insn.id)) continue @@ -1315,6 +1369,9 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): if source.id in target.no_sync_with: return None + if var_kind == "global" and source.id in target.no_global_sync_with: + return None + # {{{ check that a dependency exists dep_descr = None @@ -1412,6 +1469,9 @@ def get_tail_starting_at_last_barrier(schedule, kind): elif isinstance(sched_item, (EnterLoop, LeaveLoop)): pass + elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + pass + else: raise ValueError("unexpected schedule item type '%s'" % type(sched_item).__name__) @@ -1425,7 +1485,8 @@ def insn_ids_from_schedule(schedule): if isinstance(sched_item, RunInstruction): result.append(sched_item.insn_id) - elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier)): + elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier, CallKernel, + ReturnFromKernel)): pass else: @@ -1499,6 +1560,7 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): candidates.clear() def issue_barrier(dep): + print("issuing barrier") seen_barrier() comment = None @@ -1558,8 +1620,22 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): source=dep_src_insn_id, reverse=reverse, var_kind=kind) if dep: - issue_barrier(dep=dep) - break + if verify_only: + from loopy.diagnostic import MissingBarrierError + raise MissingBarrierError( + "Dependency '%s' (for variable '%s') " + "requires synchronization " + "by a %s barrier (add a 'no_sync_with' " + "instruction option to state that no" + "synchronization is needed)" + % ( + dep.dep_descr.format( + tgt=dep.target.id, src=dep.source.id), + dep.variable, + kind)) + else: + issue_barrier(dep=dep) + break # }}} @@ -1612,6 +1688,7 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): kind)) else: + print("HIIII") issue_barrier(dep=dep) break @@ -1619,7 +1696,8 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): candidates.add(sched_item.insn_id) elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): - pass + result.append(sched_item) + i += 1 else: raise ValueError("unexpected schedule item type '%s'" @@ -1659,10 +1737,11 @@ def generate_loop_schedules(kernel, debug_args={}): insn.iname for insn in preschedule if isinstance(insn, EnterLoop)) + prescheduled_insn_ids = set( - insn.insn_id - for insn in preschedule - if isinstance(insn, RunInstruction)) + insn_id + for item in preschedule + for insn_id in sched_item_to_insn_id(item)) from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag ilp_inames = set( @@ -1696,12 +1775,14 @@ def generate_loop_schedules(kernel, debug_args={}): # time-varying part active_inames=(), entered_inames=frozenset(), + enclosing_subkernel_inames=(), schedule=(), unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), scheduled_insn_ids=frozenset(), - can_schedule_insns=kernel.state != kernel_state.SCHEDULED, + within_subkernel=kernel.state != kernel_state.SCHEDULED, + may_schedule_global_barriers=True, preschedule=preschedule, @@ -1764,15 +1845,12 @@ def generate_loop_schedules(kernel, debug_args={}): if (gsize or lsize): if not kernel.options.disable_global_barriers: logger.info("%s: barrier insertion: global" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, reverse=False, kind="global", verify_only=True) logger.info("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, reverse=False, kind="local", verify_only=False) - logger.info("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( @@ -1784,6 +1862,9 @@ def generate_loop_schedules(kernel, debug_args={}): if kernel.state != kernel_state.SCHEDULED: # Device mapper only gets run once. new_kernel = map_schedule_onto_host_or_device(new_kernel) + + from loopy.schedule.tools import add_extra_args_to_schedule + new_kernel = add_extra_args_to_schedule(new_kernel) yield new_kernel debug.start() diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index befc956af..1a0789c2f 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -23,13 +23,13 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError -from loopy.kernel.data import temp_var_scope -from loopy.schedule import (Barrier, BeginBlockItem, CallKernel, EndBlockItem, - EnterLoop, LeaveLoop, ReturnFromKernel, - RunInstruction) +from loopy.schedule import (Barrier, CallKernel, EnterLoop, LeaveLoop, + ReturnFromKernel, RunInstruction) +from loopy.schedule.tools import get_block_boundaries def map_schedule_onto_host_or_device(kernel): + # FIXME: Should be idempotent. from loopy.kernel import kernel_state assert kernel.state == kernel_state.SCHEDULED @@ -52,139 +52,14 @@ def map_schedule_onto_host_or_device(kernel): kernel = map_schedule_onto_host_or_device_impl( kernel, device_prog_name_gen) - return add_extra_args_to_schedule(kernel) - - -# {{{ Schedule / instruction utilities - -def get_block_boundaries(schedule): - """ - Return a dictionary mapping indices of - :class:`loopy.schedule.BlockBeginItem`s to - :class:`loopy.schedule.BlockEndItem`s and vice versa. - """ - block_bounds = {} - active_blocks = [] - for idx, sched_item in enumerate(schedule): - if isinstance(sched_item, BeginBlockItem): - active_blocks.append(idx) - elif isinstance(sched_item, EndBlockItem): - start = active_blocks.pop() - block_bounds[start] = idx - block_bounds[idx] = start - return block_bounds - -# }}} - - -# {{{ Use / def utilities - -def filter_out_subscripts(exprs): - """ - Remove subscripts from expressions in `exprs`. - """ - result = set() - from pymbolic.primitives import Subscript - for expr in exprs: - if isinstance(expr, Subscript): - expr = expr.aggregate - result.add(expr) - return result - - -def filter_items_by_varname(pred, kernel, items): - """ - Keep only the values in `items` whose variable names satisfy `pred`. - """ - from pymbolic.primitives import Subscript, Variable - result = set() - for item in items: - base = item - if isinstance(base, Subscript): - base = base.aggregate - if isinstance(base, Variable): - base = base.name - if pred(kernel, base): - result.add(item) - return result - - -from functools import partial - -filter_temporaries = partial(filter_items_by_varname, - lambda kernel, name: name in kernel.temporary_variables) - - -def get_use_set(insn, include_subscripts=True): - """ - Return the use-set of the instruction, for liveness analysis. - """ - result = insn.read_dependency_names() - if not include_subscripts: - result = filter_out_subscripts(result) - return result - - -def get_def_set(insn, include_subscripts=True): - """ - Return the def-set of the instruction, for liveness analysis. - """ - result = insn.write_dependency_names() - if not include_subscripts: - result = filter_out_subscripts(result) - return result - - -def get_temporaries_defined_and_used_in_subrange( - kernel, schedule, start_idx, end_idx): - defs = set() - uses = set() - - for idx in range(start_idx, end_idx + 1): - sched_item = schedule[idx] - if isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] - defs.update( - filter_temporaries( - kernel, get_def_set(insn))) - uses.update( - filter_temporaries( - kernel, get_use_set(insn))) - - return defs, uses - -# }}} - - -def add_extra_args_to_schedule(kernel): - """ - Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel` - instructions in the schedule with global temporaries. - """ - new_schedule = [] - - block_bounds = get_block_boundaries(kernel.schedule) - for idx, sched_item in enumerate(kernel.schedule): - if isinstance(sched_item, CallKernel): - defs, uses = get_temporaries_defined_and_used_in_subrange( - kernel, kernel.schedule, idx + 1, block_bounds[idx] - 1) - # Filter out temporaries that are global. - extra_args = (tv for tv in defs | uses if - kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL - and - kernel.temporary_variables[tv].initializer is None) - new_schedule.append(sched_item.copy(extra_args=sorted(extra_args))) - else: - new_schedule.append(sched_item) - - return kernel.copy(schedule=new_schedule) + return kernel def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): schedule = kernel.schedule loop_bounds = get_block_boundaries(schedule) - # {{{ Inner mapper function + # {{{ inner mapper function dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[]) dummy_return = ReturnFromKernel(kernel_name="") @@ -239,6 +114,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): [dummy_call.copy()] + current_chunk + [dummy_return.copy()]) + new_schedule.append(sched_item) current_chunk = [] else: current_chunk.append(sched_item) diff --git a/loopy/transform/spill.py b/loopy/transform/spill.py index 3a348fb8d..cecf24ddf 100644 --- a/loopy/transform/spill.py +++ b/loopy/transform/spill.py @@ -23,101 +23,22 @@ THE SOFTWARE. """ -import six - from loopy.diagnostic import LoopyError import loopy as lp from loopy.kernel.data import auto -from loopy.kernel.instruction import BarrierInstruction from pytools import memoize_method, Record from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier) +from loopy.schedule.tools import (get_block_boundaries, InstructionQuery) + import logging logger = logging.getLogger(__name__) -# {{{ instruction query utility - -class InstructionQuery(object): - - def __init__(self, kernel): - self.kernel = kernel - block_bounds = get_block_boundaries(kernel.schedule) - subkernel_slices = {} - from six import iteritems - for start, end in iteritems(block_bounds): - sched_item = kernel.schedule[start] - if isinstance(sched_item, CallKernel): - subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) - self.subkernel_slices = subkernel_slices - - @memoize_method - def subkernel_order(self): - pass - - @memoize_method - def insns_reading_or_writing(self, var): - return frozenset(insn.id for insn in self.kernel.instructions - if var in insn.read_dependency_names() - or var in insn.assignee_var_names()) - - @memoize_method - def insns_in_subkernel(self, subkernel): - return frozenset(sched_item.insn_id for sched_item - in self.kernel.schedule[self.subkernel_slices[subkernel]] - if isinstance(sched_item, RunInstruction)) - - @memoize_method - def inames_in_subkernel(self, subkernel): - return frozenset(self.kernel.schedule[self.subkernel_slices[subkernel].start].extra_inames) - - @memoize_method - def hw_inames(self, insn_id): - """ - Return the inames that insn runs in and that are tagged as hardware - parallel. - """ - from loopy.kernel.data import HardwareParallelTag - return set(iname for iname in self.kernel.insn_inames(insn_id) - if isinstance(self.kernel.iname_to_tag.get(iname), HardwareParallelTag)) - - @memoize_method - def common_hw_inames(self, insn_ids): - """ - Return the common set of hardware parallel tagged inames among - the list of instructions. - """ - # Get the list of hardware inames in which the temporary is defined. - if len(insn_ids) == 0: - return set() - return set.intersection(*(self.hw_inames(id) for id in insn_ids)) - -# }}} - - -def get_block_boundaries(schedule): - """ - Return a dictionary mapping indices of - :class:`loopy.schedule.BlockBeginItem`s to - :class:`loopy.schedule.BlockEndItem`s and vice versa. - """ - from loopy.schedule import (BeginBlockItem, EndBlockItem) - block_bounds = {} - active_blocks = [] - for idx, sched_item in enumerate(schedule): - if isinstance(sched_item, BeginBlockItem): - active_blocks.append(idx) - elif isinstance(sched_item, EndBlockItem): - start = active_blocks.pop() - block_bounds[start] = idx - block_bounds[idx] = start - return block_bounds - - # {{{ liveness analysis class LivenessResult(dict): @@ -273,10 +194,17 @@ class Spiller(object): The common list of hw axes that define the original object. - .. attribute:: shape_prefix + .. attribute:: hw_dims + + A list of expressions, to be added in front of the shape + of the promoted temporary value, corresponding to + hardware dimensions + + .. attribute:: non_hw_dims A list of expressions, to be added in front of the shape - of the promoted temporary value + of the promoted temporary value, corresponding to + non-hardware dimensions """ @memoize_method @@ -291,7 +219,7 @@ class Spiller(object): @property def new_shape(self): - return self.shape_prefix + self.orig_temporary.shape + return self.hw_dims + self.non_hw_dims def __init__(self, kernel): self.kernel = kernel @@ -304,8 +232,7 @@ class Spiller(object): self.extra_args_to_add = {} self.updated_iname_to_tag = {} self.updated_temporary_variables = {} - # i.e. the "extra_args" field of CallKernel - self.updated_extra_args = {} + self.spills_or_reloads_added = {} @memoize_method def auto_promote_temporary(self, temporary_name): @@ -343,9 +270,10 @@ class Spiller(object): # Calculate the sizes of the dimensions that get added in front for # the global storage of the temporary. - shape_prefix = [] + hw_dims = [] backing_hw_inames = [] + for iname in hw_inames: tag = self.kernel.iname_to_tag[iname] from loopy.kernel.data import LocalIndexTag @@ -356,15 +284,22 @@ class Spiller(object): backing_hw_inames.append(iname) from loopy.isl_helpers import static_max_of_pw_aff from loopy.symbolic import aff_to_expr - shape_prefix.append( + hw_dims.append( aff_to_expr( static_max_of_pw_aff( self.kernel.get_iname_bounds(iname).size, False))) + non_hw_dims = temporary.shape + + if len(non_hw_dims) == 0 and len(hw_dims) == 0: + # Scalar not in hardware: ensure at least one dimension. + non_hw_dims = (1,) + backing_temporary = self.PromotedTemporary( - name=self.var_name_gen(temporary.name + ".spill_slot"), + name=self.var_name_gen(temporary.name + "_spill_slot"), orig_temporary=temporary, - shape_prefix=tuple(shape_prefix), + hw_dims=tuple(hw_dims), + non_hw_dims=non_hw_dims, hw_inames=backing_hw_inames) return backing_temporary @@ -404,17 +339,14 @@ class Spiller(object): Variable(agg), tuple(map(Variable, subscript))) + dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] + args = ( subscript_or_var( - temporary, dim_inames), + temporary, dim_inames_trunc), subscript_or_var( promoted_temporary.name, hw_inames + dim_inames)) - if subkernel in self.updated_extra_args: - self.updated_extra_args[subkernel].append(promoted_temporary.name) - else: - self.updated_extra_args[subkernel] = [promoted_temporary.name] - if mode == "spill": args = reversed(args) @@ -429,18 +361,31 @@ class Spiller(object): depends_on = frozenset() update_deps = accessing_insns_in_subkernel + pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel) + + if pre_barrier is not None: + depends_on |= set([pre_barrier]) + + if post_barrier is not None: + update_deps |= set([post_barrier]) + # Create the load / store instruction. from loopy.kernel.data import Assignment spill_or_load_insn = Assignment( *args, id=spill_or_load_insn_id, - within_inames=self.insn_query.inames_in_subkernel(subkernel) | - frozenset(hw_inames + dim_inames), + within_inames=( + self.insn_query.inames_in_subkernel(subkernel) | + frozenset(hw_inames + dim_inames)), within_inames_is_final=True, depends_on=depends_on, boostable=False, boostable_into=frozenset()) + if temporary not in self.spills_or_reloads_added: + self.spills_or_reloads_added[temporary] = set() + self.spills_or_reloads_added[temporary].add(spill_or_load_insn_id) + self.insns_to_insert.append(spill_or_load_insn) for insn_id in update_deps: @@ -457,33 +402,35 @@ class Spiller(object): def finish(self): new_instructions = [] + insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) + + # Add no_global_sync_with between any added reloads and spills + from six import iteritems + for temporary, added_insns in iteritems(self.spills_or_reloads_added): + for insn_id in added_insns: + insn = insns_to_insert[insn_id] + insns_to_insert[insn_id] = insn.copy( + no_global_sync_with=added_insns) + for orig_insn in self.kernel.instructions: if orig_insn.id in self.insns_to_update: new_instructions.append(self.insns_to_update[orig_insn.id]) else: new_instructions.append(orig_insn) - new_instructions.extend(self.insns_to_insert) - - new_schedule = [] - for sched_item in self.kernel.schedule: - if (isinstance(sched_item, CallKernel) and - sched_item.kernel_name in self.updated_extra_args): - new_schedule.append( - sched_item.copy(extra_args=( - sched_item.extra_args - + self.updated_extra_args[sched_item.kernel_name]))) - else: - new_schedule.append(sched_item) + new_instructions.extend( + sorted(insns_to_insert.values(), key=lambda insn: insn.id)) self.updated_iname_to_tag.update(self.kernel.iname_to_tag) self.updated_temporary_variables.update(self.kernel.temporary_variables) - return self.kernel.copy( - schedule=new_schedule, + kernel = self.kernel.copy( instructions=new_instructions, iname_to_tag=self.updated_iname_to_tag, temporary_variables=self.updated_temporary_variables) + from loopy.kernel.tools import assign_automatic_axes + return assign_automatic_axes(kernel) + def spill(self, temporary, subkernel): self.spill_or_reload_impl(temporary, subkernel, "spill") @@ -506,37 +453,39 @@ class Spiller(object): orig_temporary = promoted_temporary.orig_temporary orig_dim = domain.dim(isl.dim_type.set) - dims_to_insert = len(orig_temporary.shape) # Tags for newly added inames iname_to_tag = {} + # FIXME: Restrict size of new inames to access footprint. + # Add dimension-dependent inames. dim_inames = [] - domain = domain.add(isl.dim_type.set, dims_to_insert) - for t_idx in range(len(orig_temporary.shape)): + domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims)) + + for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims): new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}". format(name=orig_temporary.name, mode=mode, - dim=t_idx)) + dim=dim_idx)) domain = domain.set_dim_name( - isl.dim_type.set, orig_dim + t_idx, new_iname) + isl.dim_type.set, orig_dim + dim_idx, new_iname) + if orig_temporary.is_local: # If the temporary has local scope, then loads / stores can # be done in parallel. - #from loopy.kernel.data import AutoFitLocalIndexTag - #iname_to_tag[new_iname] = AutoFitLocalIndexTag() - pass + from loopy.kernel.data import AutoFitLocalIndexTag + iname_to_tag[new_iname] = AutoFitLocalIndexTag() dim_inames.append(new_iname) # Add size information. aff = isl.affs_from_space(domain.space) domain &= aff[0].le_set(aff[new_iname]) - size = orig_temporary.shape[t_idx] from loopy.symbolic import aff_from_expr - domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size)) + domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size)) + # FIXME: Use promoted_temporary.hw_inames hw_inames = [] # Add hardware inames duplicates. @@ -564,15 +513,6 @@ class Spiller(object): # {{{ auto spill and reload across kernel calls -""" -TODO: -- flake8ify -- add TODO comments -- document -- assert kernel is scheduled etc -- write a bunch of tests -""" - def spill_and_reload(knl, **kwargs): """ Add instructions to spill and reload temporary variables that are live @@ -587,12 +527,12 @@ def spill_and_reload(knl, **kwargs): into this code: t = <...> - t.spill_slot = t + t_spill_slot = t - t = t.spill_slot + t = t_spill_slot <...> = t - where `t.spill_slot` is a newly-created global temporary variable. + where `t_spill_slot` is a newly-created global temporary variable. :arg knl: :arg barriers: @@ -601,55 +541,36 @@ def spill_and_reload(knl, **kwargs): liveness = LivenessAnalysis(knl) spiller = Spiller(knl) - liveness.print_liveness() + #liveness.print_liveness() + + insn_query = InstructionQuery(knl) for sched_idx, sched_item in enumerate(knl.schedule): - # TODO: Rematerialization - if isinstance(sched_item, ReturnFromKernel): - for temporary in liveness[sched_idx].live_in: - logger.info("spilling {0} before return of {1}" - .format(temporary, sched_item.kernel_name)) - spiller.spill(temporary, sched_item.kernel_name) - elif isinstance(sched_item, CallKernel): - for temporary in liveness[sched_idx].live_out: + if isinstance(sched_item, CallKernel): + # Any written temporary that is live-out needs to be read into + # memory because of the potential for partial writes. + interesting_temporaries = ( + insn_query.temporaries_read_or_written_in_subkernel( + sched_item.kernel_name)) + + for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {0} at entry of {1}" .format(temporary, sched_item.kernel_name)) spiller.reload(temporary, sched_item.kernel_name) + elif isinstance(sched_item, ReturnFromKernel): + interesting_temporaries = ( + insn_query.temporaries_written_in_subkernel( + sched_item.kernel_name)) + for temporary in liveness[sched_idx].live_in & interesting_temporaries: + logger.info("spilling {0} before return of {1}" + .format(temporary, sched_item.kernel_name)) + spiller.spill(temporary, sched_item.kernel_name) + return spiller.finish() # }}} -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - import loopy as lp - knl = lp.make_kernel( - "{ [i,j]: 0<=i,j<10 }", - """ - for i - <> t_private[i] = 1 {id=define_t_private} - <> j_ = 1 {id=definej} - for j - ... gbarrier {id=bar} - out[j] = j_ {id=setout,dep=bar} - ... gbarrier {id=barx,dep=define_t_private,dep=setout} - j_ = 10 {id=j1,dep=barx} - end - ... gbarrier {id=meow,dep=barx} - out[i] = t_private[i] {dep=meow} - end - """) - - #knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) - - print("SCHEDULED INITIALLY", knl) - - knl = spill_and_reload(knl) - - knl = lp.get_one_scheduled_kernel(knl) - print(knl) - # vim: foldmethod=marker diff --git a/test/test_loopy.py b/test/test_loopy.py index dba6685c5..9f8b8d15a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1056,7 +1056,7 @@ def test_kernel_splitting(ctx_factory): cgr = lp.generate_code_v2(knl) - assert len(cgr.device_programs) == 2 + assert len(cgr.device_programs) == 3 print(cgr.device_code()) print(cgr.host_code()) @@ -1105,95 +1105,135 @@ def test_kernel_splitting_with_loop(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) -def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory): +def test_spill_and_reload_of_private_temporary(ctx_factory): ctx = ctx_factory() + queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{ [i,k]: 0<=i t_private_scalar = a[k,i+1] - <> t_private_array[i % 2] = a[k,i+1] - c[k,i] = a[k,i+1] - ... gbarrier - out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2] + for i + for k + <> t_arr[k] = k + end + <> t_scalar = 1 + for j + ... gbarrier {id=bar} + out[j] = t_scalar {dep=bar} + ... gbarrier {id=bar2,dep=bar} + t_scalar = 10 {dep=bar2} + end + ... gbarrier {id=bar3,dep=bar2} + <> flag = i == 9 + out[i] = t_arr[i] {dep=bar3,if=flag} end - """, seq_dependencies=True) - - knl = lp.add_and_infer_dtypes(knl, - {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) - knl = lp.set_temporary_scope(knl, "t_private_scalar", "private") - knl = lp.set_temporary_scope(knl, "t_private_array", "private") - - ref_knl = knl + """) - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.set_temporary_scope(knl, "t_arr", "private") + knl = lp.add_and_infer_dtypes(knl, {"out": np.float32}) + knl = lp.tag_inames(knl, dict(i="g.0")) - # schedule from loopy.preprocess import preprocess_kernel + from loopy.schedule import get_one_scheduled_kernel + knl = preprocess_kernel(knl) + knl = get_one_scheduled_kernel(knl) + + from loopy.transform.spill import spill_and_reload + knl = spill_and_reload(knl) + knl = get_one_scheduled_kernel(knl) + + cgr = lp.generate_code_v2(knl) + + assert len(cgr.device_programs) == 4 + + _, (out,) = knl(queue) + assert (out.get() + == np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9], + dtype=np.float32)).all() + +def test_spill_and_reload_of_private_temporary_no_hw_loop(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i,j,k]: 0<=i,j,k<10 }", + """ + for i + for k + <> t_arr[k] = k + end + <> t_scalar = 1 + for j + ... gbarrier {id=bar} + out[j] = t_scalar {dep=bar} + ... gbarrier {id=bar2,dep=bar} + t_scalar = 10 {dep=bar2} + end + ... gbarrier {id=bar3,dep=bar2} + out[i] = t_arr[i] {dep=bar3} + end + """) + + knl = lp.set_temporary_scope(knl, "t_arr", "private") + knl = lp.add_and_infer_dtypes(knl, {"out": np.float32}) + + from loopy.preprocess import preprocess_kernel from loopy.schedule import get_one_scheduled_kernel + + knl = preprocess_kernel(knl) knl = get_one_scheduled_kernel(knl) - from loop.transform import spill_and_reload + from loopy.transform.spill import spill_and_reload knl = spill_and_reload(knl) - - print(knl) + knl = get_one_scheduled_kernel(knl) cgr = lp.generate_code_v2(knl) - assert len(cgr.device_programs) == 2 + assert len(cgr.device_programs) == 4 - print(cgr.device_code()) - print(cgr.host_code()) + _, (out,) = knl(queue) + assert (out.get() + == np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9], + dtype=np.float32)).all() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) - -def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory): +def test_spill_and_reload_of_local_temporary(ctx_factory): ctx = ctx_factory() + queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{ [i,k]: 0<=i t_local[i % 8,k] = i % 8 - c[k,i] = a[k,i+1] + <> t_local[k,j] = 1 ... gbarrier - out[k,i] = c[k,i] + t_local[i % 8,k] + out[k,i*2+j] = t_local[k,j] end """, seq_dependencies=True) - knl = lp.add_and_infer_dtypes(knl, - {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) - + knl = lp.add_and_infer_dtypes(knl, {"out": np.int}) knl = lp.set_temporary_scope(knl, "t_local", "local") + knl = lp.tag_inames(knl, dict(j="l.0", i="g.0")) - ref_knl = knl - - knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0") - - # schedule from loopy.preprocess import preprocess_kernel - knl = preprocess_kernel(knl) - from loopy.schedule import get_one_scheduled_kernel + + knl = preprocess_kernel(knl) knl = get_one_scheduled_kernel(knl) - # map schedule onto host or device - print(knl) + from loopy.transform.spill import spill_and_reload + knl = spill_and_reload(knl) + knl = get_one_scheduled_kernel(knl) cgr = lp.generate_code_v2(knl) - assert len(cgr.device_programs) == 2 - print(cgr.device_code()) - print(cgr.host_code()) + _, (out,) = knl(queue) - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8)) + assert (out.get() == 1).all() def test_global_temporary(ctx_factory): -- GitLab From 3ae18dfc6ae16fd57192ead04c30a9e754291f15 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 5 Nov 2016 01:54:43 -0500 Subject: [PATCH 03/22] fix test; add missing tools module --- loopy/schedule/tools.py | 191 ++++++++++++++++++++++++++++++++++++++++ test/test_loopy.py | 2 +- 2 files changed, 192 insertions(+), 1 deletion(-) create mode 100644 loopy/schedule/tools.py diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py new file mode 100644 index 000000000..5de677e72 --- /dev/null +++ b/loopy/schedule/tools.py @@ -0,0 +1,191 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2016 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.kernel.data import temp_var_scope +from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem, + RunInstruction, Barrier) + +from pytools import memoize_method + + +# {{{ block boundary finder + +def get_block_boundaries(schedule): + """ + Return a dictionary mapping indices of + :class:`loopy.schedule.BlockBeginItem`s to + :class:`loopy.schedule.BlockEndItem`s and vice versa. + """ + block_bounds = {} + active_blocks = [] + for idx, sched_item in enumerate(schedule): + if isinstance(sched_item, BeginBlockItem): + active_blocks.append(idx) + elif isinstance(sched_item, EndBlockItem): + start = active_blocks.pop() + block_bounds[start] = idx + block_bounds[idx] = start + return block_bounds + +# }}} + + +# {{{ instruction query utility + +class InstructionQuery(object): + + def __init__(self, kernel): + self.kernel = kernel + block_bounds = get_block_boundaries(kernel.schedule) + subkernel_slices = {} + from six import iteritems + for start, end in iteritems(block_bounds): + sched_item = kernel.schedule[start] + if isinstance(sched_item, CallKernel): + subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) + self.subkernel_slices = subkernel_slices + + @memoize_method + def subkernels(self): + return frozenset(self.subkernel_slices.keys()) + + @memoize_method + def insns_reading_or_writing(self, var): + return frozenset(insn.id for insn in self.kernel.instructions + if var in insn.read_dependency_names() + or var in insn.assignee_var_names()) + + @memoize_method + def insns_in_subkernel(self, subkernel): + return frozenset(sched_item.insn_id for sched_item + in self.kernel.schedule[self.subkernel_slices[subkernel]] + if isinstance(sched_item, RunInstruction)) + + @memoize_method + def temporaries_read_in_subkernel(self, subkernel): + return frozenset( + var + for insn in self.insns_in_subkernel(subkernel) + for var in self.kernel.id_to_insn[insn].read_dependency_names() + if var in self.kernel.temporary_variables) + + @memoize_method + def temporaries_written_in_subkernel(self, subkernel): + return frozenset( + var + for insn in self.insns_in_subkernel(subkernel) + for var in self.kernel.id_to_insn[insn].assignee_var_names() + if var in self.kernel.temporary_variables) + + @memoize_method + def temporaries_read_or_written_in_subkernel(self, subkernel): + return ( + self.temporaries_read_in_subkernel(subkernel) | + self.temporaries_written_in_subkernel(subkernel)) + + @memoize_method + def inames_in_subkernel(self, subkernel): + subkernel_start = self.subkernel_slices[subkernel].start + return frozenset(self.kernel.schedule[subkernel_start].extra_inames) + + @memoize_method + def pre_and_post_barriers(self, subkernel): + subkernel_start = self.subkernel_slices[subkernel].start + subkernel_end = self.subkernel_slices[subkernel].stop + + def is_global_barrier(item): + return isinstance(item, Barrier) and item.kind == "global" + + try: + pre_barrier = next(item for item in + self.kernel.schedule[subkernel_start::-1] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + pre_barrier = None + + try: + post_barrier = next(item for item in + self.kernel.schedule[subkernel_end:] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + post_barrier = None + + return (pre_barrier, post_barrier) + + @memoize_method + def hw_inames(self, insn_id): + """ + Return the inames that insn runs in and that are tagged as hardware + parallel. + """ + from loopy.kernel.data import HardwareParallelTag + return set(iname for iname in self.kernel.insn_inames(insn_id) + if isinstance(self.kernel.iname_to_tag.get(iname), + HardwareParallelTag)) + + @memoize_method + def common_hw_inames(self, insn_ids): + """ + Return the common set of hardware parallel tagged inames among + the list of instructions. + """ + # Get the list of hardware inames in which the temporary is defined. + if len(insn_ids) == 0: + return set() + return set.intersection(*(self.hw_inames(id) for id in insn_ids)) + +# }}} + + +# {{{ add extra args to schedule + +def add_extra_args_to_schedule(kernel): + """ + Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel` + instructions in the schedule with global temporaries. + """ + new_schedule = [] + + insn_query = InstructionQuery(kernel) + + for sched_item in kernel.schedule: + if isinstance(sched_item, CallKernel): + subrange_temporaries = (insn_query + .temporaries_read_or_written_in_subkernel(sched_item.kernel_name)) + more_args = set(tv + for tv in subrange_temporaries + if + kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL + and + kernel.temporary_variables[tv].initializer is None + and + tv not in sched_item.extra_args) + new_schedule.append(sched_item.copy( + extra_args=sched_item.extra_args + sorted(more_args))) + else: + new_schedule.append(sched_item) + + return kernel.copy(schedule=new_schedule) + +# }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 9f8b8d15a..4334ae577 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1056,7 +1056,7 @@ def test_kernel_splitting(ctx_factory): cgr = lp.generate_code_v2(knl) - assert len(cgr.device_programs) == 3 + assert len(cgr.device_programs) == 2 print(cgr.device_code()) print(cgr.host_code()) -- GitLab From 638c7c3e016227d2b8411a28f71c0b88ded9e229 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 5 Nov 2016 02:06:10 -0500 Subject: [PATCH 04/22] Remove stowaway print statement --- loopy/schedule/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 0a1db247a..e50f148f5 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1560,7 +1560,6 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): candidates.clear() def issue_barrier(dep): - print("issuing barrier") seen_barrier() comment = None -- GitLab From a01be876679ee3d0cfa27ead9b9d5f7a26355998 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 5 Nov 2016 02:14:57 -0500 Subject: [PATCH 05/22] Remove friendly print statement. --- loopy/schedule/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index e50f148f5..36eed0d63 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1687,7 +1687,6 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): kind)) else: - print("HIIII") issue_barrier(dep=dep) break -- GitLab From 9af6f6b8ac676cb0f416fee600adb1c3049aede6 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 5 Nov 2016 15:32:56 -0500 Subject: [PATCH 06/22] spill and restore: handle entry/exit kernels --- loopy/transform/spill.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/loopy/transform/spill.py b/loopy/transform/spill.py index cecf24ddf..61c68b80b 100644 --- a/loopy/transform/spill.py +++ b/loopy/transform/spill.py @@ -550,9 +550,13 @@ def spill_and_reload(knl, **kwargs): if isinstance(sched_item, CallKernel): # Any written temporary that is live-out needs to be read into # memory because of the potential for partial writes. - interesting_temporaries = ( - insn_query.temporaries_read_or_written_in_subkernel( - sched_item.kernel_name)) + if sched_idx == 0: + # Kernel entry: nothing live + interesting_temporaries = set() + else: + interesting_temporaries = ( + insn_query.temporaries_read_or_written_in_subkernel( + sched_item.kernel_name)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {0} at entry of {1}" @@ -560,9 +564,14 @@ def spill_and_reload(knl, **kwargs): spiller.reload(temporary, sched_item.kernel_name) elif isinstance(sched_item, ReturnFromKernel): - interesting_temporaries = ( - insn_query.temporaries_written_in_subkernel( - sched_item.kernel_name)) + if sched_idx == len(knl.schedule) - 1: + # Kernel exit: nothing live + interesting_temporaries = set() + else: + interesting_temporaries = ( + insn_query.temporaries_written_in_subkernel( + sched_item.kernel_name)) + for temporary in liveness[sched_idx].live_in & interesting_temporaries: logger.info("spilling {0} before return of {1}" .format(temporary, sched_item.kernel_name)) -- GitLab From b761d61e8c2f5861f8a4c3daf694c95902b2fca2 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 5 Nov 2016 16:33:33 -0500 Subject: [PATCH 07/22] Schedule barrier handling fixes. --- loopy/schedule/__init__.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 36eed0d63..c7dbd71e8 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -728,6 +728,23 @@ def generate_loop_schedules_internal( # }}} + # {{{ see if there are pending local barriers in the preschedule + + # Local barriers do not have associated instructions, so they need to + # be handled separately from instructions. + if ( + isinstance(next_preschedule_item, Barrier) + and next_preschedule_item.kind == "local"): + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:]), + allow_boost=rec_allow_boost, + debug=debug): + yield result + + # }}} + # {{{ see if any insns are ready to be scheduled now # Also take note of insns that have a chance of being schedulable inside @@ -791,10 +808,15 @@ def generate_loop_schedules_internal( # {{{ check if scheduling this insn is compatible with preschedule if insn_id in sched_state.prescheduled_insn_ids: - try: - next_preschedule_insn_id = next( - sched_item_to_insn_id(next_preschedule_item)) - except StopIteration: + if isinstance(next_preschedule_item, RunInstruction): + next_preschedule_insn_id = next_preschedule_item.insn_id + elif ( + isinstance(next_preschedule_item, Barrier) + and next_preschedule_item.kind == "global"): + assert hasattr(next_preschedule_item, "originating_insn_id") + assert next_preschedule_item.originating_insn_id is not None + next_preschedule_insn_id = next_preschedule_item.originating_insn_id + else: next_preschedule_insn_id = None if next_preschedule_insn_id != insn_id: -- GitLab From 3f6d0ce4a2c346d97d6bbcc6e6b8aa3e1d3c033a Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 5 Nov 2016 16:33:46 -0500 Subject: [PATCH 08/22] add more spill and reload tests. --- test/test_loopy.py | 225 ++++++++++++++++++++++++++++++--------------- 1 file changed, 152 insertions(+), 73 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 4334ae577..ba97e5f76 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1105,55 +1105,142 @@ def test_kernel_splitting_with_loop(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) -def test_spill_and_reload_of_private_temporary(ctx_factory): +def spill_and_reload_test(queue, knl, out_expect, debug=False): + from loopy.preprocess import preprocess_kernel + from loopy.schedule import get_one_scheduled_kernel + + knl = preprocess_kernel(knl) + knl = get_one_scheduled_kernel(knl) + + from loopy.transform.spill import spill_and_reload + knl = spill_and_reload(knl) + knl = get_one_scheduled_kernel(knl) + + if debug: + print(knl) + cgr = lp.generate_code_v2(knl) + print(cgr.device_code()) + print(cgr.host_code()) + 1/0 + + _, (out,) = knl(queue) + assert (out.get() == out_expect).all() + + +@pytest.mark.parametrize("hw_loop", [True, False]) +def test_spill_of_private_scalar(ctx_factory, hw_loop, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{ [i,j,k]: 0<=i,j,k<10 }", - """ - for i - for k - <> t_arr[k] = k - end - <> t_scalar = 1 - for j - ... gbarrier {id=bar} - out[j] = t_scalar {dep=bar} - ... gbarrier {id=bar2,dep=bar} - t_scalar = 10 {dep=bar2} - end - ... gbarrier {id=bar3,dep=bar2} - <> flag = i == 9 - out[i] = t_arr[i] {dep=bar3,if=flag} + "{ [i]: 0<=i<8 }", + """ + for i + <>t = i + ... gbarrier + out[i] = t + end + """, seq_dependencies=True) + + if hw_loop: + knl = lp.tag_inames(knl, dict(i="g.0")) + + spill_and_reload_test(queue, knl, np.arange(8), debug) + + +def test_spill_of_private_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i]: 0<=i<8 }", + """ + for i + <>t[i] = i + ... gbarrier + out[i] = t[i] + end + """, seq_dependencies=True) + + knl = lp.set_temporary_scope(knl, "t", "private") + spill_and_reload_test(queue, knl, np.arange(8), debug) + + +def test_spill_of_private_array_in_hw_loop(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i,j,k]: 0<=i,j,k<8 }", + """ + for i + for j + <>t[j] = j end - """) + ... gbarrier + for k + out[i,k] = t[k] + end + end + """, seq_dependencies=True) - knl = lp.set_temporary_scope(knl, "t_arr", "private") - knl = lp.add_and_infer_dtypes(knl, {"out": np.float32}) knl = lp.tag_inames(knl, dict(i="g.0")) + knl = lp.set_temporary_scope(knl, "t", "private") - from loopy.preprocess import preprocess_kernel - from loopy.schedule import get_one_scheduled_kernel + spill_and_reload_test(queue, knl, np.vstack((8 * (np.arange(8),))), debug) - knl = preprocess_kernel(knl) - knl = get_one_scheduled_kernel(knl) - from loopy.transform.spill import spill_and_reload - knl = spill_and_reload(knl) - knl = get_one_scheduled_kernel(knl) +def test_spill_of_private_multidim_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - cgr = lp.generate_code_v2(knl) + knl = lp.make_kernel( + "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }", + """ + for i + for j, k + <>t[j,k] = k + end + ... gbarrier + for l, m + out[i,l,m] = t[l,m] + end + end + """, seq_dependencies=True) - assert len(cgr.device_programs) == 4 + knl = lp.set_temporary_scope(knl, "t", "private") - _, (out,) = knl(queue) - assert (out.get() - == np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9], - dtype=np.float32)).all() + result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) + spill_and_reload_test(queue, knl, result, debug) + + +def test_spill_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }", + """ + for i + for j, k + <>t[j,k] = k + end + ... gbarrier + for l, m + out[i,l,m] = t[l,m] + end + end + """, seq_dependencies=True) + knl = lp.set_temporary_scope(knl, "t", "private") + knl = lp.tag_inames(knl, dict(i="g.0")) -def test_spill_and_reload_of_private_temporary_no_hw_loop(ctx_factory): + result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) + spill_and_reload_test(queue, knl, result, debug) + + +@pytest.mark.parametrize("hw_loop", [True, False]) +def test_spill_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1166,40 +1253,48 @@ def test_spill_and_reload_of_private_temporary_no_hw_loop(ctx_factory): end <> t_scalar = 1 for j - ... gbarrier {id=bar} - out[j] = t_scalar {dep=bar} - ... gbarrier {id=bar2,dep=bar} - t_scalar = 10 {dep=bar2} + ... gbarrier + out[j] = t_scalar + ... gbarrier + t_scalar = 10 end - ... gbarrier {id=bar3,dep=bar2} - out[i] = t_arr[i] {dep=bar3} + ... gbarrier + <> flag = i == 9 + out[i] = t_arr[i] {if=flag} end - """) + """, seq_dependencies=True) knl = lp.set_temporary_scope(knl, "t_arr", "private") - knl = lp.add_and_infer_dtypes(knl, {"out": np.float32}) + if hw_loop: + knl = lp.tag_inames(knl, dict(i="g.0")) - from loopy.preprocess import preprocess_kernel - from loopy.schedule import get_one_scheduled_kernel + result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9]) - knl = preprocess_kernel(knl) - knl = get_one_scheduled_kernel(knl) + spill_and_reload_test(queue, knl, result, debug) - from loopy.transform.spill import spill_and_reload - knl = spill_and_reload(knl) - knl = get_one_scheduled_kernel(knl) - cgr = lp.generate_code_v2(knl) +def test_spill_of_local_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - assert len(cgr.device_programs) == 4 + knl = lp.make_kernel( + "{ [i,j]: 0<=i,j<8 }", + """ + for i, j + <>t[2*j] = j + t[2*j+1] = j + ... gbarrier + out[i] = t[2*i] + end + """, seq_dependencies=True) - _, (out,) = knl(queue) - assert (out.get() - == np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9], - dtype=np.float32)).all() + knl = lp.set_temporary_scope(knl, "t", "local") + knl = lp.tag_inames(knl, dict(i="g.0", j="l.0")) + spill_and_reload_test(queue, knl, np.arange(8), debug) -def test_spill_and_reload_of_local_temporary(ctx_factory): + +def test_spill_local_multidim_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1214,26 +1309,10 @@ def test_spill_and_reload_of_local_temporary(ctx_factory): end """, seq_dependencies=True) - knl = lp.add_and_infer_dtypes(knl, {"out": np.int}) knl = lp.set_temporary_scope(knl, "t_local", "local") knl = lp.tag_inames(knl, dict(j="l.0", i="g.0")) - from loopy.preprocess import preprocess_kernel - from loopy.schedule import get_one_scheduled_kernel - - knl = preprocess_kernel(knl) - knl = get_one_scheduled_kernel(knl) - - from loopy.transform.spill import spill_and_reload - knl = spill_and_reload(knl) - knl = get_one_scheduled_kernel(knl) - - cgr = lp.generate_code_v2(knl) - assert len(cgr.device_programs) == 2 - - _, (out,) = knl(queue) - - assert (out.get() == 1).all() + spill_and_reload_test(queue, knl, 1, debug) def test_global_temporary(ctx_factory): -- GitLab From 8b85b2cb42df62637a9afd87de222714900d60a8 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 5 Nov 2016 19:08:05 -0500 Subject: [PATCH 09/22] Fix compyte version. --- loopy/target/c/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index ac1c71d46..11dc00352 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc +Subproject commit 11dc00352423cddd71f09e809d0a22ab1c3ea7a5 -- GitLab From 79f2e7d8bfa907db927baca5d8900a9f1b78c1d4 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 5 Nov 2016 22:33:11 -0500 Subject: [PATCH 10/22] Scheduler: Add a heuristic to avoid scheduling: EnterLoop(temporary.reload) CallKernel instead of CallKernel EnterLoop(temporary.reload). --- loopy/schedule/__init__.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index c7dbd71e8..a97a31bfa 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1073,6 +1073,28 @@ def generate_loop_schedules_internal( % iname) continue + if ( + not sched_state.within_subkernel + and iname not in sched_state.prescheduled_inames): + # Avoid messing up some orderings such as picking: + # + # EnterLoop(temporary.reload) + # CallKernel + # ... + # + # instead of + # + # CallKernel + # EnterLoop(temporary.reload) + # ... + # + # This serves a heuristic to catch some bad decisions early, the + # scheduler will not allow the first variant regardless. + if debug_mode: + print("scheduling '%s' prohibited because we are outside " + "a subkernel" % iname) + continue + currently_accessible_inames = ( active_inames_set | sched_state.parallel_inames) if ( -- GitLab From 48bb35bab12ff742abcfcf30de589984078174c2 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 6 Nov 2016 16:10:38 -0600 Subject: [PATCH 11/22] Bump data model for no_global_sync_with --- loopy/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/version.py b/loopy/version.py index aa3e7abee..91348ac56 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v44-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v45-islpy%s" % _islpy_version -- GitLab From 0a3e36da0238985de3114a6b53dfe28d3edd78bf Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 6 Nov 2016 23:35:00 -0600 Subject: [PATCH 12/22] Remove no_global_sync_with in favor of using no_sync_with. --- loopy/kernel/__init__.py | 13 +++++++++++++ loopy/kernel/creation.py | 7 ++++--- loopy/kernel/instruction.py | 35 ++++++++++++++--------------------- loopy/schedule/__init__.py | 7 ++----- loopy/target/c/compyte | 2 +- loopy/transform/spill.py | 5 +++-- 6 files changed, 37 insertions(+), 32 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 83119d770..b31a6e185 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1056,6 +1056,19 @@ class LoopKernel(RecordWithoutPickling): # }}} + # {{{ nosync sets + + @memoize_method + def get_nosync_set(self, insn_id, scope): + assert scope in ("local", "global") + + return frozenset( + insn_id + for insn_id, nosync_scope in self.id_to_insn[insn_id].no_sync_with + if nosync_scope == scope or nosync_scope == "any") + + # }}} + # {{{ pretty-printing def stringify(self, what=None, with_dependencies=False): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index ac41531f9..1efaca6a6 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -152,7 +152,6 @@ def get_default_insn_options_dict(): "depends_on": None, "depends_on_is_final": False, "no_sync_with": None, - "no_global_sync_with": None, "groups": frozenset(), "conflicts_with_groups": frozenset(), "insn_id": None, @@ -231,9 +230,11 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): raise LoopyError("'nosync' option may not be specified " "in a 'with' block") + # TODO: Come up with a syntax that allows the user to express + # different synchronization scopes. result["no_sync_with"] = frozenset( - intern(dep.strip()) for dep in opt_value.split(":") - if dep.strip()) + (intern(dep.strip()), "any") + for dep in opt_value.split(":") if dep.strip()) elif opt_key == "groups" and opt_value is not None: result["groups"] = frozenset( diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index a7904f30e..d6b09ee64 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -80,11 +80,17 @@ class InstructionBase(Record): .. attribute:: no_sync_with - a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances - with which no barrier synchronization is necessary, even given the existence - of a dependency chain and apparently conflicting access + a :class:`frozenset` of tuples of the form `(insn_id, scope)`, where + `insn_id` refers to :attr:`id` of :class:`Instruction` instances + and `scope` is one of the following strings: + + - `"local"` + - `"global"` + - `"any"`. - .. attribute:: no_global_sync_with + This indicates no barrier synchronization is necessary with the given + instruction using barriers of type `scope`, even given the existence of + a dependency chain and apparently conflicting access. .. rubric:: Conditionals @@ -128,7 +134,6 @@ class InstructionBase(Record): fields = set("id depends_on depends_on_is_final " "groups conflicts_with_groups " "no_sync_with " - "no_global_sync_with " "predicates " "within_inames_is_final within_inames " "priority boostable boostable_into".split()) @@ -136,7 +141,6 @@ class InstructionBase(Record): def __init__(self, id, depends_on, depends_on_is_final, groups, conflicts_with_groups, no_sync_with, - no_global_sync_with, within_inames_is_final, within_inames, priority, boostable, boostable_into, predicates, tags, @@ -198,9 +202,6 @@ class InstructionBase(Record): if no_sync_with is None: no_sync_with = frozenset() - if no_global_sync_with is None: - no_global_sync_with = frozenset() - if within_inames is None: within_inames = frozenset() @@ -245,7 +246,6 @@ class InstructionBase(Record): depends_on=depends_on, depends_on_is_final=depends_on_is_final, no_sync_with=no_sync_with, - no_global_sync_with=no_global_sync_with, groups=groups, conflicts_with_groups=conflicts_with_groups, within_inames_is_final=within_inames_is_final, within_inames=within_inames, @@ -374,7 +374,10 @@ class InstructionBase(Record): if self.depends_on: result.append("dep="+":".join(self.depends_on)) if self.no_sync_with: - result.append("nosync="+":".join(self.no_sync_with)) + # TODO: Come up with a syntax to express different kinds of + # synchronization scopes. + result.append("nosync="+":".join( + insn_id for insn_id, _ in self.no_sync_with)) if self.groups: result.append("groups=%s" % ":".join(self.groups)) if self.conflicts_with_groups: @@ -730,7 +733,6 @@ class Assignment(MultiAssignmentBase): groups=None, conflicts_with_groups=None, no_sync_with=None, - no_global_sync_with=None, within_inames_is_final=None, within_inames=None, boostable=None, boostable_into=None, tags=None, @@ -746,7 +748,6 @@ class Assignment(MultiAssignmentBase): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, - no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, boostable=boostable, @@ -884,7 +885,6 @@ class CallInstruction(MultiAssignmentBase): groups=None, conflicts_with_groups=None, no_sync_with=None, - no_global_sync_with=None, within_inames_is_final=None, within_inames=None, boostable=None, boostable_into=None, tags=None, @@ -901,7 +901,6 @@ class CallInstruction(MultiAssignmentBase): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, - no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, boostable=boostable, @@ -1072,7 +1071,6 @@ class CInstruction(InstructionBase): id=None, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, - no_global_sync_with=None, within_inames_is_final=None, within_inames=None, priority=0, boostable=None, boostable_into=None, predicates=frozenset(), tags=None, @@ -1093,7 +1091,6 @@ class CInstruction(InstructionBase): depends_on_is_final=depends_on_is_final, groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, - no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, boostable=boostable, @@ -1254,7 +1251,6 @@ class NoOpInstruction(_DataObliviousInstruction): def __init__(self, id=None, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, - no_global_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, boostable=None, boostable_into=None, @@ -1266,7 +1262,6 @@ class NoOpInstruction(_DataObliviousInstruction): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, - no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, @@ -1307,7 +1302,6 @@ class BarrierInstruction(_DataObliviousInstruction): def __init__(self, id, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, - no_global_sync_with=None, within_inames_is_final=None, within_inames=None, priority=None, boostable=None, boostable_into=None, @@ -1324,7 +1318,6 @@ class BarrierInstruction(_DataObliviousInstruction): groups=groups, conflicts_with_groups=conflicts_with_groups, no_sync_with=no_sync_with, - no_global_sync_with=no_global_sync_with, within_inames_is_final=within_inames_is_final, within_inames=within_inames, priority=priority, diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index a97a31bfa..1a3f5b256 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1410,11 +1410,8 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): if reverse: source, target = target, source - if source.id in target.no_sync_with: - return None - - if var_kind == "global" and source.id in target.no_global_sync_with: - return None + if source.id in kernel.get_nosync_set(target.id, var_kind): + return # {{{ check that a dependency exists diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 11dc00352..ac1c71d46 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 11dc00352423cddd71f09e809d0a22ab1c3ea7a5 +Subproject commit ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc diff --git a/loopy/transform/spill.py b/loopy/transform/spill.py index 61c68b80b..b4caaa5f0 100644 --- a/loopy/transform/spill.py +++ b/loopy/transform/spill.py @@ -404,13 +404,14 @@ class Spiller(object): insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) - # Add no_global_sync_with between any added reloads and spills + # Add global no_sync_with between any added reloads and spills from six import iteritems for temporary, added_insns in iteritems(self.spills_or_reloads_added): for insn_id in added_insns: insn = insns_to_insert[insn_id] insns_to_insert[insn_id] = insn.copy( - no_global_sync_with=added_insns) + no_sync_with=frozenset( + (added_insn, "global") for added_insn in added_insns)) for orig_insn in self.kernel.instructions: if orig_insn.id in self.insns_to_update: -- GitLab From 81775912e23e2bfabe359cd2a6d161627e733e96 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 7 Nov 2016 00:08:18 -0600 Subject: [PATCH 13/22] Teach the rest of loopy about no_sync_with changes. --- loopy/kernel/creation.py | 7 +++++-- loopy/preprocess.py | 4 ++-- loopy/transform/buffer.py | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1efaca6a6..1da092f47 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1446,8 +1446,11 @@ def resolve_wildcard_deps(knl): if insn.depends_on is not None: insn = insn.copy( depends_on=resove_wildcard_insn_ids(knl, insn.depends_on), - no_sync_with=resove_wildcard_insn_ids( - knl, insn.no_sync_with), + no_sync_with=frozenset( + (resolved_insn_id, nosync_scope) + for insn_id, nosync_scope in insn.no_sync_with + for resolved_insn_id in + resove_wildcard_insn_ids(knl, [insn_id])), ) new_insns.append(insn) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c0f42e55a..713603c5c 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -626,7 +626,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): | frozenset([red_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([init_id]) | insn.depends_on, - no_sync_with=frozenset([init_id])) + no_sync_with=frozenset([(init_id, "any")])) generated_insns.append(transfer_insn) def _strip_if_scalar(c): @@ -684,7 +684,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): istage += 1 new_insn_add_depends_on.add(prev_id) - new_insn_add_no_sync_with.add(prev_id) + new_insn_add_no_sync_with.add((prev_id, "any")) new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname) if nresults == 1: diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index b2c86c084..92cff7a50 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -500,7 +500,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, store_instruction = Assignment( id=kernel.make_unique_instruction_id(based_on="store_"+var_name), depends_on=frozenset(aar.modified_insn_ids), - no_sync_with=frozenset([init_insn_id]), + no_sync_with=frozenset([(init_insn_id, "any")]), assignee=store_target, expression=store_expression, within_inames=( -- GitLab From ec400fe9be2afa6cd1cf6b4336509ab5772573fe Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sat, 19 Nov 2016 18:47:56 -0600 Subject: [PATCH 14/22] spill and reload test: use out_host=True --- test/test_loopy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index ba97e5f76..2a7a30850 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1123,8 +1123,8 @@ def spill_and_reload_test(queue, knl, out_expect, debug=False): print(cgr.host_code()) 1/0 - _, (out,) = knl(queue) - assert (out.get() == out_expect).all() + _, (out,) = knl(queue, out_host=True) + assert (out == out_expect).all() @pytest.mark.parametrize("hw_loop", [True, False]) -- GitLab From 839963eadf78ca0086385013ef2cb267f48d1170 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 27 Nov 2016 00:02:26 -0600 Subject: [PATCH 15/22] spill => save --- loopy/target/c/compyte | 2 +- loopy/transform/{spill.py => save.py} | 94 +++++++++++++-------------- test/test_loopy.py | 38 +++++------ 3 files changed, 65 insertions(+), 69 deletions(-) rename loopy/transform/{spill.py => save.py} (89%) diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index ac1c71d46..11dc00352 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit ac1c71d46428c14aa1bd1c09d7da19cd0298d5cc +Subproject commit 11dc00352423cddd71f09e809d0a22ab1c3ea7a5 diff --git a/loopy/transform/spill.py b/loopy/transform/save.py similarity index 89% rename from loopy/transform/spill.py rename to loopy/transform/save.py index b4caaa5f0..5ae366905 100644 --- a/loopy/transform/spill.py +++ b/loopy/transform/save.py @@ -117,7 +117,7 @@ class LivenessAnalysis(object): # # We don't currently check if the write is a partial write # or a full write. Instead, we analyze the access - # footprint later on to determine how much to reload/spill. + # footprint later on to determine how much to reload/save. gen[sched_idx].add(var) for var in insn.read_dependency_names(): if var not in self.kernel.temporary_variables: @@ -176,9 +176,9 @@ class LivenessAnalysis(object): # }}} -# {{{ spill and reload implementation +# {{{ save and reload implementation -class Spiller(object): +class TemporarySaver(object): class PromotedTemporary(Record): """ @@ -232,7 +232,7 @@ class Spiller(object): self.extra_args_to_add = {} self.updated_iname_to_tag = {} self.updated_temporary_variables = {} - self.spills_or_reloads_added = {} + self.saves_or_reloads_added = {} @memoize_method def auto_promote_temporary(self, temporary_name): @@ -296,7 +296,7 @@ class Spiller(object): non_hw_dims = (1,) backing_temporary = self.PromotedTemporary( - name=self.var_name_gen(temporary.name + "_spill_slot"), + name=self.var_name_gen(temporary.name + "_save_slot"), orig_temporary=temporary, hw_dims=tuple(hw_dims), non_hw_dims=non_hw_dims, @@ -304,9 +304,9 @@ class Spiller(object): return backing_temporary - def spill_or_reload_impl(self, temporary, subkernel, mode, + def save_or_reload_impl(self, temporary, subkernel, mode, promoted_temporary=lp.auto): - assert mode in ("spill", "reload") + assert mode in ("save", "reload") if promoted_temporary is auto: promoted_temporary = self.auto_promote_temporary(temporary) @@ -322,12 +322,12 @@ class Spiller(object): set(promoted_temporary.hw_inames))) domain, hw_inames, dim_inames, iname_to_tag = \ - self.augment_domain_for_spill_or_reload( - dchg.domain, promoted_temporary, mode) + self.augment_domain_for_save_or_reload( + dchg.domain, promoted_temporary, mode, subkernel) self.kernel = dchg.get_kernel_with(domain) - spill_or_load_insn_id = self.insn_name_gen( + save_or_load_insn_id = self.insn_name_gen( "{name}.{mode}".format(name=temporary, mode=mode)) def subscript_or_var(agg, subscript=()): @@ -347,14 +347,14 @@ class Spiller(object): subscript_or_var( promoted_temporary.name, hw_inames + dim_inames)) - if mode == "spill": + if mode == "save": args = reversed(args) accessing_insns_in_subkernel = ( self.insn_query.insns_reading_or_writing(temporary) & self.insn_query.insns_in_subkernel(subkernel)) - if mode == "spill": + if mode == "save": depends_on = accessing_insns_in_subkernel update_deps = frozenset() elif mode == "reload": @@ -371,9 +371,9 @@ class Spiller(object): # Create the load / store instruction. from loopy.kernel.data import Assignment - spill_or_load_insn = Assignment( + save_or_load_insn = Assignment( *args, - id=spill_or_load_insn_id, + id=save_or_load_insn_id, within_inames=( self.insn_query.inames_in_subkernel(subkernel) | frozenset(hw_inames + dim_inames)), @@ -382,16 +382,16 @@ class Spiller(object): boostable=False, boostable_into=frozenset()) - if temporary not in self.spills_or_reloads_added: - self.spills_or_reloads_added[temporary] = set() - self.spills_or_reloads_added[temporary].add(spill_or_load_insn_id) + if temporary not in self.saves_or_reloads_added: + self.saves_or_reloads_added[temporary] = set() + self.saves_or_reloads_added[temporary].add(save_or_load_insn_id) - self.insns_to_insert.append(spill_or_load_insn) + self.insns_to_insert.append(save_or_load_insn) for insn_id in update_deps: insn = self.insns_to_update.get(insn_id, self.kernel.id_to_insn[insn_id]) self.insns_to_update[insn_id] = insn.copy( - depends_on=insn.depends_on | frozenset([spill_or_load_insn_id])) + depends_on=insn.depends_on | frozenset([save_or_load_insn_id])) self.updated_temporary_variables[promoted_temporary.name] = \ promoted_temporary.as_variable() @@ -404,9 +404,9 @@ class Spiller(object): insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) - # Add global no_sync_with between any added reloads and spills + # Add global no_sync_with between any added reloads and saves from six import iteritems - for temporary, added_insns in iteritems(self.spills_or_reloads_added): + for temporary, added_insns in iteritems(self.saves_or_reloads_added): for insn_id in added_insns: insn = insns_to_insert[insn_id] insns_to_insert[insn_id] = insn.copy( @@ -432,24 +432,20 @@ class Spiller(object): from loopy.kernel.tools import assign_automatic_axes return assign_automatic_axes(kernel) - def spill(self, temporary, subkernel): - self.spill_or_reload_impl(temporary, subkernel, "spill") + def save(self, temporary, subkernel): + self.save_or_reload_impl(temporary, subkernel, "save") def reload(self, temporary, subkernel): - self.spill_or_reload_impl(temporary, subkernel, "reload") + self.save_or_reload_impl(temporary, subkernel, "reload") - def get_access_footprint_in_subkernel(self, temporary, subkernel, kind): - # FIXME: Return some sort of actual non-trivial access footprint. - assert kind in ("read", "write") - - def augment_domain_for_spill_or_reload(self, - domain, promoted_temporary, mode): + def augment_domain_for_save_or_reload(self, + domain, promoted_temporary, mode, subkernel): """ Add new axes to the domain corresponding to the dimensions of - `promoted_temporary`. These axes will be used in the spill/ + `promoted_temporary`. These axes will be used in the save/ reload stage. """ - assert mode in ("spill", "reload") + assert mode in ("save", "reload") import islpy as isl orig_temporary = promoted_temporary.orig_temporary @@ -465,10 +461,11 @@ class Spiller(object): domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims)) for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims): - new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}". + new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}". format(name=orig_temporary.name, mode=mode, - dim=dim_idx)) + dim=dim_idx, + sk=subkernel)) domain = domain.set_dim_name( isl.dim_type.set, orig_dim + dim_idx, new_iname) @@ -491,10 +488,11 @@ class Spiller(object): # Add hardware inames duplicates. for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): - new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}". + new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}". format(name=orig_temporary.name, mode=mode, - dim=t_idx)) + dim=t_idx, + sk=subkernel)) hw_inames.append(new_iname) iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname] @@ -512,11 +510,11 @@ class Spiller(object): # }}} -# {{{ auto spill and reload across kernel calls +# {{{ auto save and reload across kernel calls -def spill_and_reload(knl, **kwargs): +def save_and_reload(knl, **kwargs): """ - Add instructions to spill and reload temporary variables that are live + Add instructions to save and reload temporary variables that are live across kernel calls. The basic code transformation turns schedule segments: @@ -528,21 +526,19 @@ def spill_and_reload(knl, **kwargs): into this code: t = <...> - t_spill_slot = t + t_save_slot = t - t = t_spill_slot + t = t_save_slot <...> = t - where `t_spill_slot` is a newly-created global temporary variable. + where `t_save_slot` is a newly-created global temporary variable. :arg knl: :arg barriers: :returns: """ liveness = LivenessAnalysis(knl) - spiller = Spiller(knl) - - #liveness.print_liveness() + saver = TemporarySaver(knl) insn_query = InstructionQuery(knl) @@ -562,7 +558,7 @@ def spill_and_reload(knl, **kwargs): for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {0} at entry of {1}" .format(temporary, sched_item.kernel_name)) - spiller.reload(temporary, sched_item.kernel_name) + saver.reload(temporary, sched_item.kernel_name) elif isinstance(sched_item, ReturnFromKernel): if sched_idx == len(knl.schedule) - 1: @@ -574,11 +570,11 @@ def spill_and_reload(knl, **kwargs): sched_item.kernel_name)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: - logger.info("spilling {0} before return of {1}" + logger.info("saving {0} before return of {1}" .format(temporary, sched_item.kernel_name)) - spiller.spill(temporary, sched_item.kernel_name) + saver.save(temporary, sched_item.kernel_name) - return spiller.finish() + return saver.finish() # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 29fac21b8..0f280892d 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1105,15 +1105,15 @@ def test_kernel_splitting_with_loop(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) -def spill_and_reload_test(queue, knl, out_expect, debug=False): +def save_and_reload_test(queue, knl, out_expect, debug=False): from loopy.preprocess import preprocess_kernel from loopy.schedule import get_one_scheduled_kernel knl = preprocess_kernel(knl) knl = get_one_scheduled_kernel(knl) - from loopy.transform.spill import spill_and_reload - knl = spill_and_reload(knl) + from loopy.transform.save import save_and_reload + knl = save_and_reload(knl) knl = get_one_scheduled_kernel(knl) if debug: @@ -1128,7 +1128,7 @@ def spill_and_reload_test(queue, knl, out_expect, debug=False): @pytest.mark.parametrize("hw_loop", [True, False]) -def test_spill_of_private_scalar(ctx_factory, hw_loop, debug=False): +def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1145,10 +1145,10 @@ def test_spill_of_private_scalar(ctx_factory, hw_loop, debug=False): if hw_loop: knl = lp.tag_inames(knl, dict(i="g.0")) - spill_and_reload_test(queue, knl, np.arange(8), debug) + save_and_reload_test(queue, knl, np.arange(8), debug) -def test_spill_of_private_array(ctx_factory, debug=False): +def test_save_of_private_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1163,10 +1163,10 @@ def test_spill_of_private_array(ctx_factory, debug=False): """, seq_dependencies=True) knl = lp.set_temporary_scope(knl, "t", "private") - spill_and_reload_test(queue, knl, np.arange(8), debug) + save_and_reload_test(queue, knl, np.arange(8), debug) -def test_spill_of_private_array_in_hw_loop(ctx_factory, debug=False): +def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1187,10 +1187,10 @@ def test_spill_of_private_array_in_hw_loop(ctx_factory, debug=False): knl = lp.tag_inames(knl, dict(i="g.0")) knl = lp.set_temporary_scope(knl, "t", "private") - spill_and_reload_test(queue, knl, np.vstack((8 * (np.arange(8),))), debug) + save_and_reload_test(queue, knl, np.vstack((8 * (np.arange(8),))), debug) -def test_spill_of_private_multidim_array(ctx_factory, debug=False): +def test_save_of_private_multidim_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1211,10 +1211,10 @@ def test_spill_of_private_multidim_array(ctx_factory, debug=False): knl = lp.set_temporary_scope(knl, "t", "private") result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) - spill_and_reload_test(queue, knl, result, debug) + save_and_reload_test(queue, knl, result, debug) -def test_spill_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False): +def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1236,11 +1236,11 @@ def test_spill_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False): knl = lp.tag_inames(knl, dict(i="g.0")) result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) - spill_and_reload_test(queue, knl, result, debug) + save_and_reload_test(queue, knl, result, debug) @pytest.mark.parametrize("hw_loop", [True, False]) -def test_spill_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False): +def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1270,10 +1270,10 @@ def test_spill_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9]) - spill_and_reload_test(queue, knl, result, debug) + save_and_reload_test(queue, knl, result, debug) -def test_spill_of_local_array(ctx_factory, debug=False): +def test_save_of_local_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1291,10 +1291,10 @@ def test_spill_of_local_array(ctx_factory, debug=False): knl = lp.set_temporary_scope(knl, "t", "local") knl = lp.tag_inames(knl, dict(i="g.0", j="l.0")) - spill_and_reload_test(queue, knl, np.arange(8), debug) + save_and_reload_test(queue, knl, np.arange(8), debug) -def test_spill_local_multidim_array(ctx_factory, debug=False): +def test_save_local_multidim_array(ctx_factory, debug=False): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -1312,7 +1312,7 @@ def test_spill_local_multidim_array(ctx_factory, debug=False): knl = lp.set_temporary_scope(knl, "t_local", "local") knl = lp.tag_inames(knl, dict(j="l.0", i="g.0")) - spill_and_reload_test(queue, knl, 1, debug) + save_and_reload_test(queue, knl, 1, debug) def test_global_temporary(ctx_factory): -- GitLab From 50899209f6944b805dba44569f644872a68d60eb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 27 Nov 2016 01:04:12 -0600 Subject: [PATCH 16/22] Fix signature --- doc/ref_transform.rst | 2 ++ loopy/transform/save.py | 13 ++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 5609fc253..0ce59b630 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -106,6 +106,8 @@ Creating Batches of Operations Finishing up ------------ +.. automodule:: loopy.transform.save + .. currentmodule:: loopy .. autofunction:: preprocess_kernel diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 5ae366905..603f6c4c7 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -39,6 +39,11 @@ import logging logger = logging.getLogger(__name__) +__doc__ = """ +.. autofunction:: save_and_reload +""" + + # {{{ liveness analysis class LivenessResult(dict): @@ -512,18 +517,18 @@ class TemporarySaver(object): # {{{ auto save and reload across kernel calls -def save_and_reload(knl, **kwargs): +def save_and_reload(knl): """ Add instructions to save and reload temporary variables that are live across kernel calls. - The basic code transformation turns schedule segments: + The basic code transformation turns schedule segments:: t = <...> <...> = t - into this code: + into this code:: t = <...> t_save_slot = t @@ -533,8 +538,6 @@ def save_and_reload(knl, **kwargs): where `t_save_slot` is a newly-created global temporary variable. - :arg knl: - :arg barriers: :returns: """ liveness = LivenessAnalysis(knl) -- GitLab From 2333333536e5bbc8e9e5eb26f39815f73e2140ef Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 27 Nov 2016 01:12:28 -0600 Subject: [PATCH 17/22] Bump data model for no_sync_with. --- loopy/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/version.py b/loopy/version.py index 0b56284bb..f7d157f65 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v48-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v49-islpy%s" % _islpy_version -- GitLab From 972d40d8068047c0b62fe884ade8ad00d5a94006 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 27 Nov 2016 21:16:49 -0600 Subject: [PATCH 18/22] Fix no_sync_with printing. --- loopy/kernel/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index c08f90029..e71a88886 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1226,7 +1226,9 @@ class LoopKernel(RecordWithoutPickling): options.append( "conflicts=%s" % ":".join(insn.conflicts_with_groups)) if insn.no_sync_with: - options.append("no_sync_with=%s" % ":".join(insn.no_sync_with)) + # FIXME: Find a syntax to express scopes. + options.append("no_sync_with=%s" % ":".join(id for id, _ in + insn.no_sync_with)) if lhs: core = "%s <- %s" % ( -- GitLab From 92fc167a0bc71f841fb855f2a87f6c1917bfa082 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 27 Nov 2016 21:19:37 -0600 Subject: [PATCH 19/22] rename save_and_reload -> save_and_reload_temporaries Also make lp.save_and_reload_temporaries. --- doc/ref_transform.rst | 4 ++-- loopy/__init__.py | 4 +++- loopy/transform/save.py | 8 +++++--- test/test_loopy.py | 23 ++++++++++++----------- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 0ce59b630..4a07b6333 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -106,8 +106,6 @@ Creating Batches of Operations Finishing up ------------ -.. automodule:: loopy.transform.save - .. currentmodule:: loopy .. autofunction:: preprocess_kernel @@ -116,6 +114,8 @@ Finishing up .. autofunction:: get_one_scheduled_kernel +.. autofunction:: save_and_reload_temporaries + .. autoclass:: GeneratedProgram .. autoclass:: CodeGenerationResult diff --git a/loopy/__init__.py b/loopy/__init__.py index 110652cf7..6bd764f8d 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -106,6 +106,7 @@ from loopy.transform.padding import ( from loopy.transform.ilp import realize_ilp from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters +from loopy.transform.save import save_and_reload_temporaries # }}} @@ -206,6 +207,8 @@ __all__ = [ "assume", "fix_parameters", + "save_and_reload_temporaries", + # }}} "get_dot_dependency_graph", @@ -258,7 +261,6 @@ __all__ = [ # }}} ] - # }}} diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 603f6c4c7..8706bc4da 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -40,7 +40,9 @@ logger = logging.getLogger(__name__) __doc__ = """ -.. autofunction:: save_and_reload +.. currentmodule:: loopy + +.. autofunction:: save_and_reload_temporaries """ @@ -517,7 +519,7 @@ class TemporarySaver(object): # {{{ auto save and reload across kernel calls -def save_and_reload(knl): +def save_and_reload_temporaries(knl): """ Add instructions to save and reload temporary variables that are live across kernel calls. @@ -538,7 +540,7 @@ def save_and_reload(knl): where `t_save_slot` is a newly-created global temporary variable. - :returns: + :returns: The resulting kernel """ liveness = LivenessAnalysis(knl) saver = TemporarySaver(knl) diff --git a/test/test_loopy.py b/test/test_loopy.py index 0f280892d..af4269047 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1105,15 +1105,15 @@ def test_kernel_splitting_with_loop(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) -def save_and_reload_test(queue, knl, out_expect, debug=False): +def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False): from loopy.preprocess import preprocess_kernel from loopy.schedule import get_one_scheduled_kernel knl = preprocess_kernel(knl) knl = get_one_scheduled_kernel(knl) - from loopy.transform.save import save_and_reload - knl = save_and_reload(knl) + from loopy.transform.save import save_and_reload_temporaries + knl = save_and_reload_temporaries(knl) knl = get_one_scheduled_kernel(knl) if debug: @@ -1145,7 +1145,7 @@ def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False): if hw_loop: knl = lp.tag_inames(knl, dict(i="g.0")) - save_and_reload_test(queue, knl, np.arange(8), debug) + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) def test_save_of_private_array(ctx_factory, debug=False): @@ -1163,7 +1163,7 @@ def test_save_of_private_array(ctx_factory, debug=False): """, seq_dependencies=True) knl = lp.set_temporary_scope(knl, "t", "private") - save_and_reload_test(queue, knl, np.arange(8), debug) + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False): @@ -1187,7 +1187,8 @@ def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False): knl = lp.tag_inames(knl, dict(i="g.0")) knl = lp.set_temporary_scope(knl, "t", "private") - save_and_reload_test(queue, knl, np.vstack((8 * (np.arange(8),))), debug) + save_and_reload_temporaries_test( + queue, knl, np.vstack((8 * (np.arange(8),))), debug) def test_save_of_private_multidim_array(ctx_factory, debug=False): @@ -1211,7 +1212,7 @@ def test_save_of_private_multidim_array(ctx_factory, debug=False): knl = lp.set_temporary_scope(knl, "t", "private") result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) - save_and_reload_test(queue, knl, result, debug) + save_and_reload_temporaries_test(queue, knl, result, debug) def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False): @@ -1236,7 +1237,7 @@ def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False): knl = lp.tag_inames(knl, dict(i="g.0")) result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) - save_and_reload_test(queue, knl, result, debug) + save_and_reload_temporaries_test(queue, knl, result, debug) @pytest.mark.parametrize("hw_loop", [True, False]) @@ -1270,7 +1271,7 @@ def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False) result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9]) - save_and_reload_test(queue, knl, result, debug) + save_and_reload_temporaries_test(queue, knl, result, debug) def test_save_of_local_array(ctx_factory, debug=False): @@ -1291,7 +1292,7 @@ def test_save_of_local_array(ctx_factory, debug=False): knl = lp.set_temporary_scope(knl, "t", "local") knl = lp.tag_inames(knl, dict(i="g.0", j="l.0")) - save_and_reload_test(queue, knl, np.arange(8), debug) + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) def test_save_local_multidim_array(ctx_factory, debug=False): @@ -1312,7 +1313,7 @@ def test_save_local_multidim_array(ctx_factory, debug=False): knl = lp.set_temporary_scope(knl, "t_local", "local") knl = lp.tag_inames(knl, dict(j="l.0", i="g.0")) - save_and_reload_test(queue, knl, 1, debug) + save_and_reload_temporaries_test(queue, knl, 1, debug) def test_global_temporary(ctx_factory): -- GitLab From 8da51e67c257ba3a89ba159d180d1aae68f4cac2 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 27 Nov 2016 21:20:16 -0600 Subject: [PATCH 20/22] Add section on synchronization to tutorial. --- doc/tutorial.rst | 186 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 167 insertions(+), 19 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index ec9a09ae6..09fa1f00a 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -922,6 +922,8 @@ expression being assigned. ... """) >>> evt, (out1, out2) = knl(queue, a=x_vec_dev) +.. _local_temporaries: + Temporaries in local memory ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1064,6 +1066,170 @@ Generic Precomputation .. }}} + +.. _synchronization: + +Synchronization +--------------- + +.. {{{ + +In OpenCL, memory operations are not generally guaranteed to be immediately +visible to other work items. In order to ensure that memory is consistent across +work items, some sort of synchronization operation is used. + +:mod:`loopy` supports synchronization in the form of *barriers* or *atomic +operations*. + +Barriers +~~~~~~~~ + +Prior to code generation, :mod:`loopy` performs a check to see that every memory +access is free of dependencies requiring a barrier. A memory access dependency +that exists across multiple work items requires a barrier if it involves at +least one write operation. + +:mod:`loopy` supports two kinds of barriers: + +* *Local barriers* ensure consistency of local memory accesses to items within + *the same* work group. As in OpenCL, all work items in the group are required + to wait until everyone has reached the barrier instruction before continuing. + +* *Global barriers* ensure consistency of *global* memory accesses across *all* + work groups. Note that there is no exact equivalent in OpenCL. All work items + across all work groups are required to wait until everyone has reached the + barrier instruction before continuing. + +By default, :mod:`loopy` inserts local barriers between two instructions when it +detects that a dependency involving local memory may occur across work items. To +see this in action, take a look at the section on :ref:`local_temporaries`. + +In contrast, :mod:`loopy` will *not* insert global barriers +automatically. Consider the following kernel, which attempts to rotate its input +to the right by 1: + +.. doctest:: + + >>> knl = lp.make_kernel( + ... "[n] -> {[i] : 0<=itmp = arr[i] {id=maketmp,dep=*} + ... arr[(i + 1) % n] = tmp {id=rotate,dep=*maketmp} + ... end + ... """, + ... [ + ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), + ... "...", + ... ], + ... name="rotate_v1", + ... assumptions="n mod 16 = 0") + >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> cgr = lp.generate_code_v2(knl) + Traceback (most recent call last): + ... + loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that nosynchronization is needed) + +Because of the write-after-read dependency in global memory, a global barrier +needs to be inserted. This can be accomplished with a ``... gbarrier`` +instruction. Note that :mod:`loopy` implements global barriers by splitting the +kernel into multiple device-side kernels, so that the resulting code will +contain more than one kernel. + +.. doctest:: + + >>> knl = lp.make_kernel( + ... "[n] -> {[i] : 0<=itmp = arr[i] {id=maketmp,dep=*} + ... ... gbarrier {id=bar,dep=*maketmp} + ... arr[(i + 1) % n] = tmp {id=rotate,dep=*bar} + ... end + ... """, + ... [ + ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), + ... "...", + ... ], + ... name="rotate_v2", + ... assumptions="n mod 16 = 0") + >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> cgr = lp.generate_code_v2(knl) + >>> print(cgr.device_code()) + #define lid(N) ((int) get_local_id(N)) + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n) + { + int tmp; + + tmp = arr[16 * gid(0) + lid(0)]; + } + + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n) + { + int tmp; + + arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; + } + +Note that we are not done yet. The problem is that while `tmp` is assigned in +the first kernel, the assignment of `tmp` is not saved for the second +kernel. :mod:`loopy` provides a function called +:func:`loopy.save_and_reload_temporaries` for the purpose of handling the +situation of saving and restoring temporary values across global barriers. In +order to use this function the kernel must be preprocessed and scheduled first, +the latter of which is handled by :func:`loopy.get_one_scheduled_kernel`. + +.. doctest:: + + >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + >>> knl = lp.save_and_reload_temporaries(knl) + >>> knl = lp.get_one_scheduled_kernel(knl) + >>> cgr = lp.generate_code_v2(knl) + >>> print(cgr.device_code()) + #define lid(N) ((int) get_local_id(N)) + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + { + int tmp; + + tmp = arr[16 * gid(0) + lid(0)]; + tmp_save_slot[16 * gid(0) + lid(0)] = tmp; + } + + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + { + int tmp; + + tmp = tmp_save_slot[16 * gid(0) + lid(0)]; + arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; + } + >>> evt, (out,) = knl(queue, arr=cl.array.arange(queue, 16, dtype=np.int32), out_host=True) + >>> print(out) + [15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] + +Atomic operations +~~~~~~~~~~~~~~~~~ + +:mod:`loopy` supports atomic operations. To use them, both the data on which the +atomic operations work as well as the operations themselves must be suitably +tagged, as in the following example:: + + + knl = lp.make_kernel( + "{ [i]: 0<=i>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict) >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict) >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict) - >>> print("%i\n%i\n%i\n%i\n%i\n%i" % + >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 524288 -- GitLab From 387e782d536e036279aafdf5ddd56c232d240482 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 27 Nov 2016 21:25:53 -0600 Subject: [PATCH 21/22] memory operations => writes --- doc/tutorial.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 09fa1f00a..38635ee5e 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1074,9 +1074,9 @@ Synchronization .. {{{ -In OpenCL, memory operations are not generally guaranteed to be immediately -visible to other work items. In order to ensure that memory is consistent across -work items, some sort of synchronization operation is used. +In OpenCL, writes are not generally guaranteed to be immediately visible to +other work items. In order to ensure that memory is consistent across work +items, some sort of synchronization operation is used. :mod:`loopy` supports synchronization in the form of *barriers* or *atomic operations*. -- GitLab From 06c74acefeb129abbead1ae4f1621732a84aef4c Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 27 Nov 2016 21:36:55 -0600 Subject: [PATCH 22/22] Doctest fix. --- doc/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 38635ee5e..0024b1915 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1128,7 +1128,7 @@ to the right by 1: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that nosynchronization is needed) + MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that nosynchronization is needed) Because of the write-after-read dependency in global memory, a global barrier needs to be inserted. This can be accomplished with a ``... gbarrier`` -- GitLab