diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 5609fc25328ed5b0a243eaa8d33c4d1aa6b3eddc..4a07b63330747aa69d7ed498e004d60b7c312a7b 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -114,6 +114,8 @@ Finishing up .. autofunction:: get_one_scheduled_kernel +.. autofunction:: save_and_reload_temporaries + .. autoclass:: GeneratedProgram .. autoclass:: CodeGenerationResult diff --git a/doc/tutorial.rst b/doc/tutorial.rst index ec9a09ae6a0766081db51587f83e95a7a2d992fd..0024b19153e27ca4331edd742554441107d175d9 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -922,6 +922,8 @@ expression being assigned. ... """) >>> evt, (out1, out2) = knl(queue, a=x_vec_dev) +.. _local_temporaries: + Temporaries in local memory ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1064,6 +1066,170 @@ Generic Precomputation .. }}} + +.. _synchronization: + +Synchronization +--------------- + +.. {{{ + +In OpenCL, writes are not generally guaranteed to be immediately visible to +other work items. In order to ensure that memory is consistent across work +items, some sort of synchronization operation is used. + +:mod:`loopy` supports synchronization in the form of *barriers* or *atomic +operations*. + +Barriers +~~~~~~~~ + +Prior to code generation, :mod:`loopy` performs a check to see that every memory +access is free of dependencies requiring a barrier. A memory access dependency +that exists across multiple work items requires a barrier if it involves at +least one write operation. + +:mod:`loopy` supports two kinds of barriers: + +* *Local barriers* ensure consistency of local memory accesses to items within + *the same* work group. As in OpenCL, all work items in the group are required + to wait until everyone has reached the barrier instruction before continuing. + +* *Global barriers* ensure consistency of *global* memory accesses across *all* + work groups. Note that there is no exact equivalent in OpenCL. All work items + across all work groups are required to wait until everyone has reached the + barrier instruction before continuing. + +By default, :mod:`loopy` inserts local barriers between two instructions when it +detects that a dependency involving local memory may occur across work items. To +see this in action, take a look at the section on :ref:`local_temporaries`. + +In contrast, :mod:`loopy` will *not* insert global barriers +automatically. Consider the following kernel, which attempts to rotate its input +to the right by 1: + +.. doctest:: + + >>> knl = lp.make_kernel( + ... "[n] -> {[i] : 0<=itmp = arr[i] {id=maketmp,dep=*} + ... arr[(i + 1) % n] = tmp {id=rotate,dep=*maketmp} + ... end + ... """, + ... [ + ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), + ... "...", + ... ], + ... name="rotate_v1", + ... assumptions="n mod 16 = 0") + >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> cgr = lp.generate_code_v2(knl) + Traceback (most recent call last): + ... + MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that nosynchronization is needed) + +Because of the write-after-read dependency in global memory, a global barrier +needs to be inserted. This can be accomplished with a ``... gbarrier`` +instruction. Note that :mod:`loopy` implements global barriers by splitting the +kernel into multiple device-side kernels, so that the resulting code will +contain more than one kernel. + +.. doctest:: + + >>> knl = lp.make_kernel( + ... "[n] -> {[i] : 0<=itmp = arr[i] {id=maketmp,dep=*} + ... ... gbarrier {id=bar,dep=*maketmp} + ... arr[(i + 1) % n] = tmp {id=rotate,dep=*bar} + ... end + ... """, + ... [ + ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), + ... "...", + ... ], + ... name="rotate_v2", + ... assumptions="n mod 16 = 0") + >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> cgr = lp.generate_code_v2(knl) + >>> print(cgr.device_code()) + #define lid(N) ((int) get_local_id(N)) + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n) + { + int tmp; + + tmp = arr[16 * gid(0) + lid(0)]; + } + + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n) + { + int tmp; + + arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; + } + +Note that we are not done yet. The problem is that while `tmp` is assigned in +the first kernel, the assignment of `tmp` is not saved for the second +kernel. :mod:`loopy` provides a function called +:func:`loopy.save_and_reload_temporaries` for the purpose of handling the +situation of saving and restoring temporary values across global barriers. In +order to use this function the kernel must be preprocessed and scheduled first, +the latter of which is handled by :func:`loopy.get_one_scheduled_kernel`. + +.. doctest:: + + >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + >>> knl = lp.save_and_reload_temporaries(knl) + >>> knl = lp.get_one_scheduled_kernel(knl) + >>> cgr = lp.generate_code_v2(knl) + >>> print(cgr.device_code()) + #define lid(N) ((int) get_local_id(N)) + #define gid(N) ((int) get_group_id(N)) + + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + { + int tmp; + + tmp = arr[16 * gid(0) + lid(0)]; + tmp_save_slot[16 * gid(0) + lid(0)] = tmp; + } + + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + { + int tmp; + + tmp = tmp_save_slot[16 * gid(0) + lid(0)]; + arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; + } + >>> evt, (out,) = knl(queue, arr=cl.array.arange(queue, 16, dtype=np.int32), out_host=True) + >>> print(out) + [15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] + +Atomic operations +~~~~~~~~~~~~~~~~~ + +:mod:`loopy` supports atomic operations. To use them, both the data on which the +atomic operations work as well as the operations themselves must be suitably +tagged, as in the following example:: + + + knl = lp.make_kernel( + "{ [i]: 0<=i>> f64add = op_map[lp.Op(np.float64, 'add')].eval_with_dict(param_dict) >>> f64mul = op_map[lp.Op(np.float64, 'mul')].eval_with_dict(param_dict) >>> i32add = op_map[lp.Op(np.int32, 'add')].eval_with_dict(param_dict) - >>> print("%i\n%i\n%i\n%i\n%i\n%i" % + >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 524288 diff --git a/loopy/__init__.py b/loopy/__init__.py index 110652cf75d467ceb473d4997142f4dabe3e763b..6bd764f8df93f1b4b2ae5755c1c90ccddc654fe6 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -106,6 +106,7 @@ from loopy.transform.padding import ( from loopy.transform.ilp import realize_ilp from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters +from loopy.transform.save import save_and_reload_temporaries # }}} @@ -206,6 +207,8 @@ __all__ = [ "assume", "fix_parameters", + "save_and_reload_temporaries", + # }}} "get_dot_dependency_graph", @@ -258,7 +261,6 @@ __all__ = [ # }}} ] - # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e75a95dec781317389b06aeaf2fbdbcbcc7f7bf6..55d4560be0849511b0e60ae7382053ce7a97912d 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -150,8 +150,15 @@ def generate_code_for_sched_index(codegen_state, sched_index): return func(codegen_state, sched_index) elif isinstance(sched_item, Barrier): - return codegen_state.ast_builder.emit_barrier( - sched_item.kind, sched_item.comment) + if codegen_state.is_generating_device_code: + return codegen_state.ast_builder.emit_barrier( + sched_item.kind, sched_item.comment) + from loopy.codegen.result import CodeGenerationResult + return CodeGenerationResult( + host_program=None, + device_programs=[], + implemented_domains={}, + implemented_data_info=codegen_state.implemented_data_info) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 5701bf77254277501e86e32cdef0e01edaa90b29..e71a88886cd2c244211f8f39dca3745c9c6ebc80 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1056,6 +1056,19 @@ class LoopKernel(RecordWithoutPickling): # }}} + # {{{ nosync sets + + @memoize_method + def get_nosync_set(self, insn_id, scope): + assert scope in ("local", "global") + + return frozenset( + insn_id + for insn_id, nosync_scope in self.id_to_insn[insn_id].no_sync_with + if nosync_scope == scope or nosync_scope == "any") + + # }}} + # {{{ pretty-printing def stringify(self, what=None, with_dependencies=False): @@ -1213,7 +1226,9 @@ class LoopKernel(RecordWithoutPickling): options.append( "conflicts=%s" % ":".join(insn.conflicts_with_groups)) if insn.no_sync_with: - options.append("no_sync_with=%s" % ":".join(insn.no_sync_with)) + # FIXME: Find a syntax to express scopes. + options.append("no_sync_with=%s" % ":".join(id for id, _ in + insn.no_sync_with)) if lhs: core = "%s <- %s" % ( diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index ab3035be0388877af12d71dd15b7dbb522c7b84e..6c5491384d4fc37dc48604aa52753d11ac10fc55 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -235,9 +235,11 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): raise LoopyError("'nosync' option may not be specified " "in a 'with' block") + # TODO: Come up with a syntax that allows the user to express + # different synchronization scopes. result["no_sync_with"] = result["no_sync_with"].union(frozenset( - intern(dep.strip()) for dep in opt_value.split(":") - if dep.strip())) + (intern(dep.strip()), "any") + for dep in opt_value.split(":") if dep.strip())) elif opt_key == "nosync_query" and opt_value is not None: if is_with_block: @@ -246,8 +248,10 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): from loopy.match import parse_match match = parse_match(opt_value) + # TODO: Come up with a syntax that allows the user to express + # different synchronization scopes. result["no_sync_with"] = result["no_sync_with"].union( - frozenset([match])) + frozenset([(match, "any")])) elif opt_key == "groups" and opt_value is not None: result["groups"] = frozenset( @@ -1462,8 +1466,11 @@ def resolve_dependencies(knl): for insn in knl.instructions: new_insns.append(insn.copy( depends_on=_resolve_dependencies(knl, insn, insn.depends_on), - no_sync_with=_resolve_dependencies( - knl, insn, insn.no_sync_with), + no_sync_with=frozenset( + (resolved_insn_id, nosync_scope) + for nosync_dep, nosync_scope in insn.no_sync_with + for resolved_insn_id in + _resolve_dependencies(knl, insn, nosync_dep)), )) return knl.copy(instructions=new_insns) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index b306e6e6e25fbd3f661a4f271345129a82ea93f1..417ff9dd37c682e599e0586ec61f02169e9fe7aa 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -91,9 +91,17 @@ class InstructionBase(Record): .. attribute:: no_sync_with - a :class:`frozenset` of :attr:`id` values of :class:`Instruction` instances - with which no barrier synchronization is necessary, even given the existence - of a dependency chain and apparently conflicting access. + a :class:`frozenset` of tuples of the form `(insn_id, scope)`, where + `insn_id` refers to :attr:`id` of :class:`Instruction` instances + and `scope` is one of the following strings: + + - `"local"` + - `"global"` + - `"any"`. + + This indicates no barrier synchronization is necessary with the given + instruction using barriers of type `scope`, even given the existence of + a dependency chain and apparently conflicting access. Note, that :attr:`no_sync_with` allows instruction matching through wildcards and match expression, just like :attr:`depends_on`. @@ -380,7 +388,10 @@ class InstructionBase(Record): if self.depends_on: result.append("dep="+":".join(self.depends_on)) if self.no_sync_with: - result.append("nosync="+":".join(self.no_sync_with)) + # TODO: Come up with a syntax to express different kinds of + # synchronization scopes. + result.append("nosync="+":".join( + insn_id for insn_id, _ in self.no_sync_with)) if self.groups: result.append("groups=%s" % ":".join(self.groups)) if self.conflicts_with_groups: diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f93ea891fd7f1aa420c8c109c31b50fd8305a9e3..6b5488a20bc9d714fb5fde908b559ddebf4b9591 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -459,7 +459,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): | frozenset([red_iname])), within_inames_is_final=insn.within_inames_is_final, depends_on=frozenset([init_id]) | insn.depends_on, - no_sync_with=frozenset([init_id])) + no_sync_with=frozenset([(init_id, "any")])) generated_insns.append(transfer_insn) def _strip_if_scalar(c): @@ -517,7 +517,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): istage += 1 new_insn_add_depends_on.add(prev_id) - new_insn_add_no_sync_with.add(prev_id) + new_insn_add_no_sync_with.add((prev_id, "any")) new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname) if nresults == 1: diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 9110c4ac079c614ce6a078bbe986888db3c21422..694312df9768f8a6cf2f91b7d5d85dddbac71063 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -399,6 +399,17 @@ def get_priority_tiers(wanted, priorities): for tier in get_priority_tiers(wanted, priorities): yield tier + +def sched_item_to_insn_id(sched_item): + # Helper for use in generator expressions, i.e. + # (... for insn_id in sched_item_to_insn_id(item) ...) + if isinstance(sched_item, RunInstruction): + yield sched_item.insn_id + elif isinstance(sched_item, Barrier): + if (hasattr(sched_item, "originating_insn_id") + and sched_item.originating_insn_id is not None): + yield sched_item.originating_insn_id + # }}} @@ -572,12 +583,37 @@ class SchedulerState(Record): A :class:`frozenset` of all inames ever entered. + .. attribute:: enclosing_subkernel_inames + + The inames of the last entered subkernel + .. attribute:: schedule .. attribute:: scheduled_insn_ids .. attribute:: unscheduled_insn_ids + .. attribute:: preschedule + + A sequence of schedule items that must be inserted into the + schedule, maintaining the same ordering + + .. attribute:: prescheduled_insn_ids + + A :class:`frozenset` of any instruction that started prescheduled + + .. attribute:: prescheduled_inames + + A :class:`frozenset` of any iname that started prescheduled + + .. attribute:: may_schedule_global_barriers + + Whether global barrier scheduling is allowed + + .. attribute:: within_subkernel + + Whether the scheduler is inside a subkernel + .. attribute:: group_insn_counts A mapping from instruction group names to the number of instructions @@ -619,6 +655,11 @@ def generate_loop_schedules_internal( active_inames_set = frozenset(sched_state.active_inames) + next_preschedule_item = ( + sched_state.preschedule[0] + if len(sched_state.preschedule) > 0 + else None) + # {{{ decide about debug mode debug_mode = False @@ -637,6 +678,10 @@ def generate_loop_schedules_internal( print(75*"=") print("CURRENT SCHEDULE:") print(dump_schedule(sched_state.kernel, sched_state.schedule)) + if sched_state.preschedule: + print(75*"=") + print("PRESCHEDULED ITEMS AWAITING SCHEDULING:") + print(dump_schedule(sched_state.kernel, sched_state.preschedule)) #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP (inner: outer):") @@ -652,6 +697,54 @@ def generate_loop_schedules_internal( # }}} + # {{{ see if we have reached the start/end of kernel in the preschedule + + if isinstance(next_preschedule_item, CallKernel): + assert sched_state.within_subkernel is False + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:], + within_subkernel=True, + may_schedule_global_barriers=False, + enclosing_subkernel_inames=sched_state.active_inames), + allow_boost=rec_allow_boost, + debug=debug): + yield result + + if isinstance(next_preschedule_item, ReturnFromKernel): + assert sched_state.within_subkernel is True + # Make sure all subkernel inames have finished. + if sched_state.active_inames == sched_state.enclosing_subkernel_inames: + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:], + within_subkernel=False, + may_schedule_global_barriers=True), + allow_boost=rec_allow_boost, + debug=debug): + yield result + + # }}} + + # {{{ see if there are pending local barriers in the preschedule + + # Local barriers do not have associated instructions, so they need to + # be handled separately from instructions. + if ( + isinstance(next_preschedule_item, Barrier) + and next_preschedule_item.kind == "local"): + for result in generate_loop_schedules_internal( + sched_state.copy( + schedule=sched_state.schedule + (next_preschedule_item,), + preschedule=sched_state.preschedule[1:]), + allow_boost=rec_allow_boost, + debug=debug): + yield result + + # }}} + # {{{ see if any insns are ready to be scheduled now # Also take note of insns that have a chance of being schedulable inside @@ -667,9 +760,16 @@ def generate_loop_schedules_internal( # schedule generation order. return (insn.priority, len(active_groups & insn.groups), insn.id) - insn_ids_to_try = sorted(sched_state.unscheduled_insn_ids, + insn_ids_to_try = sorted( + # Non-prescheduled instructions go first. + sched_state.unscheduled_insn_ids - sched_state.prescheduled_insn_ids, key=insn_sort_key, reverse=True) + insn_ids_to_try.extend( + insn_id + for item in sched_state.preschedule + for insn_id in sched_item_to_insn_id(item)) + for insn_id in insn_ids_to_try: insn = kernel.id_to_insn[insn_id] @@ -705,6 +805,46 @@ def generate_loop_schedules_internal( print("instruction '%s' won't work under inames '%s'" % (format_insn(kernel, insn.id), ",".join(have-want))) + # {{{ check if scheduling this insn is compatible with preschedule + + if insn_id in sched_state.prescheduled_insn_ids: + if isinstance(next_preschedule_item, RunInstruction): + next_preschedule_insn_id = next_preschedule_item.insn_id + elif ( + isinstance(next_preschedule_item, Barrier) + and next_preschedule_item.kind == "global"): + assert hasattr(next_preschedule_item, "originating_insn_id") + assert next_preschedule_item.originating_insn_id is not None + next_preschedule_insn_id = next_preschedule_item.originating_insn_id + else: + next_preschedule_insn_id = None + + if next_preschedule_insn_id != insn_id: + if debug_mode: + print("can't schedule '%s' because another preschedule " + "instruction precedes it" % format_insn(kernel, insn.id)) + is_ready = False + + # }}} + + # {{{ check if scheduler state allows insn scheduling + + from loopy.kernel.instruction import BarrierInstruction + if isinstance(insn, BarrierInstruction) and insn.kind == "global": + if not sched_state.may_schedule_global_barriers: + if debug_mode: + print("can't schedule '%s' because global barriers are " + "not currently allowed" % format_insn(kernel, insn.id)) + is_ready = False + else: + if not sched_state.within_subkernel: + if debug_mode: + print("can't schedule '%s' because not within subkernel" + % format_insn(kernel, insn.id)) + is_ready = False + + # }}} + # {{{ determine group-based readiness if insn.conflicts_with_groups & active_groups: @@ -761,6 +901,10 @@ def generate_loop_schedules_internal( unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set, schedule=( sched_state.schedule + (RunInstruction(insn_id=insn.id),)), + preschedule=( + sched_state.preschedule + if insn_id not in sched_state.prescheduled_insn_ids + else sched_state.preschedule[1:]), active_group_counts=new_active_group_counts, uses_of_boostability=( sched_state.uses_of_boostability @@ -790,7 +934,17 @@ def generate_loop_schedules_internal( if last_entered_loop is not None: can_leave = True - if last_entered_loop not in sched_state.breakable_inames: + if ( + last_entered_loop in sched_state.prescheduled_inames + and not ( + isinstance(next_preschedule_item, LeaveLoop) + and next_preschedule_item.iname == last_entered_loop)): + # A prescheduled loop can only be left if the preschedule agrees. + if debug_mode: + print("cannot leave '%s' because of preschedule constraints" + % last_entered_loop) + can_leave = False + elif last_entered_loop not in sched_state.breakable_inames: # If the iname is not breakable, then check that we've # scheduled all the instructions that require it. @@ -857,12 +1011,19 @@ def generate_loop_schedules_internal( break if can_leave and not debug_mode: + for sub_sched in generate_loop_schedules_internal( sched_state.copy( schedule=( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), - active_inames=sched_state.active_inames[:-1]), + active_inames=sched_state.active_inames[:-1], + preschedule=( + sched_state.preschedule + if last_entered_loop + not in sched_state.prescheduled_inames + else sched_state.preschedule[1:]), + ), allow_boost=rec_allow_boost, debug=debug): yield sub_sched @@ -902,6 +1063,38 @@ def generate_loop_schedules_internal( # {{{ check if scheduling this iname now is allowed/plausible + if ( + iname in sched_state.prescheduled_inames + and not ( + isinstance(next_preschedule_item, EnterLoop) + and next_preschedule_item.iname == iname)): + if debug_mode: + print("scheduling %s prohibited by preschedule constraints" + % iname) + continue + + if ( + not sched_state.within_subkernel + and iname not in sched_state.prescheduled_inames): + # Avoid messing up some orderings such as picking: + # + # EnterLoop(temporary.reload) + # CallKernel + # ... + # + # instead of + # + # CallKernel + # EnterLoop(temporary.reload) + # ... + # + # This serves a heuristic to catch some bad decisions early, the + # scheduler will not allow the first variant regardless. + if debug_mode: + print("scheduling '%s' prohibited because we are outside " + "a subkernel" % iname) + continue + currently_accessible_inames = ( active_inames_set | sched_state.parallel_inames) if ( @@ -1063,6 +1256,10 @@ def generate_loop_schedules_internal( entered_inames=( sched_state.entered_inames | frozenset((iname,))), + preschedule=( + sched_state.preschedule + if iname not in sched_state.prescheduled_inames + else sched_state.preschedule[1:]), ), allow_boost=rec_allow_boost, debug=debug): @@ -1082,7 +1279,10 @@ def generate_loop_schedules_internal( if inp: raise ScheduleDebugInput(inp) - if not sched_state.active_inames and not sched_state.unscheduled_insn_ids: + if ( + not sched_state.active_inames + and not sched_state.unscheduled_insn_ids + and not sched_state.preschedule): # if done, yield result debug.log_success(sched_state.schedule) @@ -1210,8 +1410,8 @@ def get_barrier_needing_dependency(kernel, target, source, reverse, var_kind): if reverse: source, target = target, source - if source.id in target.no_sync_with: - return None + if source.id in kernel.get_nosync_set(target.id, var_kind): + return # {{{ check that a dependency exists @@ -1310,6 +1510,9 @@ def get_tail_starting_at_last_barrier(schedule, kind): elif isinstance(sched_item, (EnterLoop, LeaveLoop)): pass + elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + pass + else: raise ValueError("unexpected schedule item type '%s'" % type(sched_item).__name__) @@ -1323,7 +1526,8 @@ def insn_ids_from_schedule(schedule): if isinstance(sched_item, RunInstruction): result.append(sched_item.insn_id) - elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier)): + elif isinstance(sched_item, (EnterLoop, LeaveLoop, Barrier, CallKernel, + ReturnFromKernel)): pass else: @@ -1456,8 +1660,22 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): source=dep_src_insn_id, reverse=reverse, var_kind=kind) if dep: - issue_barrier(dep=dep) - break + if verify_only: + from loopy.diagnostic import MissingBarrierError + raise MissingBarrierError( + "Dependency '%s' (for variable '%s') " + "requires synchronization " + "by a %s barrier (add a 'no_sync_with' " + "instruction option to state that no" + "synchronization is needed)" + % ( + dep.dep_descr.format( + tgt=dep.target.id, src=dep.source.id), + dep.variable, + kind)) + else: + issue_barrier(dep=dep) + break # }}} @@ -1516,6 +1734,10 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): result.append(sched_item) candidates.add(sched_item.insn_id) + elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): + result.append(sched_item) + i += 1 + else: raise ValueError("unexpected schedule item type '%s'" % type(sched_item).__name__) @@ -1537,7 +1759,7 @@ def insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): def generate_loop_schedules(kernel, debug_args={}): from loopy.kernel import kernel_state - if kernel.state != kernel_state.PREPROCESSED: + if kernel.state not in (kernel_state.PREPROCESSED, kernel_state.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") @@ -1548,6 +1770,18 @@ def generate_loop_schedules(kernel, debug_args={}): debug = ScheduleDebugger(**debug_args) + preschedule = kernel.schedule if kernel.state == kernel_state.SCHEDULED else () + + prescheduled_inames = set( + insn.iname + for insn in preschedule + if isinstance(insn, EnterLoop)) + + prescheduled_insn_ids = set( + insn_id + for item in preschedule + for insn_id in sched_item_to_insn_id(item)) + from loopy.kernel.data import IlpBaseTag, ParallelTag, VectorizeTag ilp_inames = set( iname @@ -1574,14 +1808,22 @@ def generate_loop_schedules(kernel, debug_args={}): ilp_inames=ilp_inames, vec_inames=vec_inames, + prescheduled_inames=prescheduled_inames, + prescheduled_insn_ids=prescheduled_insn_ids, + # time-varying part active_inames=(), entered_inames=frozenset(), + enclosing_subkernel_inames=(), schedule=(), unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), scheduled_insn_ids=frozenset(), + within_subkernel=kernel.state != kernel_state.SCHEDULED, + may_schedule_global_barriers=True, + + preschedule=preschedule, # ilp and vec are not parallel for the purposes of the scheduler parallel_inames=parallel_inames - ilp_inames - vec_inames, @@ -1639,18 +1881,15 @@ def generate_loop_schedules(kernel, debug_args={}): gsize, lsize = kernel.get_grid_size_upper_bounds() - if gsize or lsize: + if (gsize or lsize): if not kernel.options.disable_global_barriers: logger.info("%s: barrier insertion: global" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, reverse=False, kind="global", verify_only=True) logger.info("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, reverse=False, kind="local", verify_only=False) - logger.info("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( @@ -1659,7 +1898,12 @@ def generate_loop_schedules(kernel, debug_args={}): from loopy.schedule.device_mapping import \ map_schedule_onto_host_or_device - new_kernel = map_schedule_onto_host_or_device(new_kernel) + if kernel.state != kernel_state.SCHEDULED: + # Device mapper only gets run once. + new_kernel = map_schedule_onto_host_or_device(new_kernel) + + from loopy.schedule.tools import add_extra_args_to_schedule + new_kernel = add_extra_args_to_schedule(new_kernel) yield new_kernel debug.start() diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index ca782a3d8ca85ea6250f7c9317ca0947db28d5e8..1a0789c2f61e21e4a0371e2a73195c9771245527 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -23,14 +23,13 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError -from loopy.kernel.data import TemporaryVariable, temp_var_scope -from loopy.schedule import (Barrier, BeginBlockItem, CallKernel, EndBlockItem, - EnterLoop, LeaveLoop, ReturnFromKernel, - RunInstruction) -from pytools import Record, memoize_method +from loopy.schedule import (Barrier, CallKernel, EnterLoop, LeaveLoop, + ReturnFromKernel, RunInstruction) +from loopy.schedule.tools import get_block_boundaries def map_schedule_onto_host_or_device(kernel): + # FIXME: Should be idempotent. from loopy.kernel import kernel_state assert kernel.state == kernel_state.SCHEDULED @@ -53,659 +52,14 @@ def map_schedule_onto_host_or_device(kernel): kernel = map_schedule_onto_host_or_device_impl( kernel, device_prog_name_gen) - return restore_and_save_temporaries( - add_extra_args_to_schedule(kernel)) - - -# {{{ Schedule / instruction utilities - -def get_block_boundaries(schedule): - """ - Return a dictionary mapping indices of - :class:`loopy.schedule.BlockBeginItem`s to - :class:`loopy.schedule.BlockEndItem`s and vice versa. - """ - block_bounds = {} - active_blocks = [] - for idx, sched_item in enumerate(schedule): - if isinstance(sched_item, BeginBlockItem): - active_blocks.append(idx) - elif isinstance(sched_item, EndBlockItem): - start = active_blocks.pop() - block_bounds[start] = idx - block_bounds[idx] = start - return block_bounds - - -def get_hw_inames(kernel, insn): - """ - Return the inames that insn runs in and that are tagged as hardware - parallel. - """ - from loopy.kernel.data import HardwareParallelTag - return set(iname for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), HardwareParallelTag)) - - -def get_common_hw_inames(kernel, insn_ids): - """ - Return the common set of hardware parallel tagged inames among - the list of instructions. - """ - # Get the list of hardware inames in which the temporary is defined. - if len(insn_ids) == 0: - return set() - return set.intersection( - *(get_hw_inames(kernel, kernel.id_to_insn[id]) for id in insn_ids)) - - -def remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel): - from loopy.kernel.data import HardwareParallelTag - new_schedule = [] - - for item in kernel.schedule: - if isinstance(item, (EnterLoop, LeaveLoop)): - tag = kernel.iname_to_tag.get(item.iname) - if isinstance(tag, HardwareParallelTag): - continue - new_schedule.append(item) - - return kernel.copy(schedule=new_schedule) - -# }}} - - -# {{{ Use / def utilities - -def filter_out_subscripts(exprs): - """ - Remove subscripts from expressions in `exprs`. - """ - result = set() - from pymbolic.primitives import Subscript - for expr in exprs: - if isinstance(expr, Subscript): - expr = expr.aggregate - result.add(expr) - return result - - -def filter_items_by_varname(pred, kernel, items): - """ - Keep only the values in `items` whose variable names satisfy `pred`. - """ - from pymbolic.primitives import Subscript, Variable - result = set() - for item in items: - base = item - if isinstance(base, Subscript): - base = base.aggregate - if isinstance(base, Variable): - base = base.name - if pred(kernel, base): - result.add(item) - return result - - -from functools import partial - -filter_temporaries = partial(filter_items_by_varname, - lambda kernel, name: name in kernel.temporary_variables) - -filter_scalar_temporaries = partial(filter_items_by_varname, - lambda kernel, name: name in kernel.temporary_variables and - len(kernel.temporary_variables[name].shape) == 0) - - -def get_use_set(insn, include_subscripts=True): - """ - Return the use-set of the instruction, for liveness analysis. - """ - result = insn.read_dependency_names() - if not include_subscripts: - result = filter_out_subscripts(result) - return result - - -def get_def_set(insn, include_subscripts=True): - """ - Return the def-set of the instruction, for liveness analysis. - """ - result = insn.write_dependency_names() - if not include_subscripts: - result = filter_out_subscripts(result) - return result - - -def get_def_and_use_lists_for_all_temporaries(kernel): - """ - Return a pair `def_lists`, `use_lists` which map temporary variable - names to lists of instructions where they are defined or used. - """ - def_lists = dict((t, []) for t in kernel.temporary_variables) - use_lists = dict((t, []) for t in kernel.temporary_variables) - - for insn in kernel.instructions: - assignees = get_def_set(insn, include_subscripts=False) - dependencies = get_use_set(insn, include_subscripts=False) - - from pymbolic.primitives import Variable - - for assignee in assignees: - if isinstance(assignee, Variable): - assignee = assignee.name - if assignee in kernel.temporary_variables: - def_lists[assignee].append(insn.id) - - for dep in dependencies: - if isinstance(dep, Variable): - dep = dep.name - if dep in kernel.temporary_variables: - use_lists[dep].append(insn.id) - - return def_lists, use_lists - - -def get_temporaries_defined_and_used_in_subrange( - kernel, schedule, start_idx, end_idx): - defs = set() - uses = set() - - for idx in range(start_idx, end_idx + 1): - sched_item = schedule[idx] - if isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] - defs.update( - filter_temporaries( - kernel, get_def_set(insn))) - uses.update( - filter_temporaries( - kernel, get_use_set(insn))) - - return defs, uses - -# }}} - - -# {{{ Liveness analysis - -def compute_live_temporaries(kernel, schedule): - """ - Compute live-in and live-out sets for temporary variables. - """ - live_in = [set() for i in range(len(schedule) + 1)] - live_out = [set() for i in range(len(schedule))] - - id_to_insn = kernel.id_to_insn - block_bounds = get_block_boundaries(schedule) - - # {{{ Liveness analysis implementation - - def compute_subrange_liveness(start_idx, end_idx): - idx = end_idx - while start_idx <= idx: - sched_item = schedule[idx] - if isinstance(sched_item, LeaveLoop): - start = block_bounds[idx] - live_in[idx] = live_out[idx] = live_in[idx + 1] - compute_subrange_liveness(start + 1, idx - 1) - prev_live_in = live_in[start].copy() - live_in[start] = live_out[start] = live_in[start + 1] - # Propagate live values through the loop. - if live_in[start] != prev_live_in: - live_out[idx] |= live_in[start] - live_in[idx] = live_out[idx] - compute_subrange_liveness(start + 1, idx - 1) - idx = start - 1 - - elif isinstance(sched_item, ReturnFromKernel): - start = block_bounds[idx] - live_in[idx] = live_out[idx] = live_in[idx + 1] - compute_subrange_liveness(start + 1, idx - 1) - live_in[start] = live_out[start] = live_in[start + 1] - idx = start - 1 - - elif isinstance(sched_item, RunInstruction): - live_out[idx] = live_in[idx + 1] - insn = id_to_insn[sched_item.insn_id] - defs = filter_scalar_temporaries(kernel, - get_def_set(insn, include_subscripts=False)) - uses = filter_temporaries(kernel, - get_use_set(insn, include_subscripts=False)) - live_in[idx] = (live_out[idx] - defs) | uses - idx -= 1 - - elif isinstance(sched_item, Barrier): - live_in[idx] = live_out[idx] = live_in[idx + 1] - idx -= 1 - else: - raise LoopyError("unexpected type of schedule item: %s" - % type(sched_item).__name__) - - # }}} - - # Compute live variables - compute_subrange_liveness(0, len(schedule) - 1) - live_in = live_in[:-1] - - if 0: - print(kernel) - print("Live-in values:") - for i, li in enumerate(live_in): - print("{}: {}".format(i, ", ".join(li))) - print("Live-out values:") - for i, lo in enumerate(live_out): - print("{}: {}".format(i, ", ".join(lo))) - - # Strip off subscripts. - live_in = [filter_out_subscripts(li) for li in live_in] - live_out = [filter_out_subscripts(lo) for lo in live_out] - - return live_in, live_out - -# }}} - - -# {{{ Temporary promotion - -class PromotedTemporary(Record): - """ - .. attribute:: name - - The name of the new temporary. - - .. attribute:: orig_temporary - - The original temporary variable object. - - .. attribute:: hw_inames - - The common list of hw axes that define the original object. - - .. attribute:: shape_prefix - - A list of expressions, to be added in front of the shape - of the promoted temporary value - """ - - @memoize_method - def as_variable(self): - temporary = self.orig_temporary - return TemporaryVariable( - name=self.name, - dtype=temporary.dtype, - scope=temp_var_scope.GLOBAL, - shape=self.new_shape) - - @property - def new_shape(self): - return self.shape_prefix + self.orig_temporary.shape - - -def determine_temporaries_to_promote(kernel, temporaries, name_gen): - """ - For each temporary in the passed list of temporaries, construct a - :class:`PromotedTemporary` which describes how the temporary should - get promoted into global storage. - - :returns: A :class:`dict` mapping temporary names from `temporaries` to - :class:`PromotedTemporary` objects - """ - new_temporaries = {} - - def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel) - - from loopy.kernel.data import LocalIndexTag - - for temporary in temporaries: - temporary = kernel.temporary_variables[temporary] - if temporary.scope == temp_var_scope.GLOBAL: - # Nothing to be done for global temporaries (I hope) - continue - - assert temporary.base_storage is None, \ - "Cannot promote temporaries with base_storage to global" - - # `hw_inames`: The set of hw-parallel tagged inames that this temporary - # is associated with. This is used for determining the shape of the - # global storage needed for saving and restoring the temporary across - # kernel calls. - # - # TODO: Make a policy decision about which dimensions to use. Currently, - # the code looks at each instruction that defines or uses the temporary, - # and takes the common set of hw-parallel tagged inames associated with - # these instructions. - # - # Furthermore, in the case of local temporaries, inames that are tagged - # hw-local do not contribute to the global storage shape. - hw_inames = get_common_hw_inames(kernel, - def_lists[temporary.name] + use_lists[temporary.name]) - - # This takes advantage of the fact that g < l in the alphabet :) - hw_inames = sorted(hw_inames, - key=lambda iname: str(kernel.iname_to_tag[iname])) - - # Calculate the sizes of the dimensions that get added in front for - # the global storage of the temporary. - shape_prefix = [] - - backing_hw_inames = [] - for iname in hw_inames: - tag = kernel.iname_to_tag[iname] - is_local_iname = isinstance(tag, LocalIndexTag) - if is_local_iname and temporary.scope == temp_var_scope.LOCAL: - # Restrict shape to that of group inames for locals. - continue - backing_hw_inames.append(iname) - from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import aff_to_expr - shape_prefix.append( - aff_to_expr( - static_max_of_pw_aff( - kernel.get_iname_bounds(iname).size, False))) - - backing_temporary = PromotedTemporary( - name=name_gen(temporary.name), - orig_temporary=temporary, - shape_prefix=tuple(shape_prefix), - hw_inames=backing_hw_inames) - new_temporaries[temporary.name] = backing_temporary - - return new_temporaries - -# }}} - - -# {{{ Domain augmentation - -def augment_domain_for_temporary_promotion( - kernel, domain, promoted_temporary, mode, name_gen): - """ - Add new axes to the domain corresponding to the dimensions of - `promoted_temporary`. - """ - import islpy as isl - - orig_temporary = promoted_temporary.orig_temporary - orig_dim = domain.dim(isl.dim_type.set) - dims_to_insert = len(orig_temporary.shape) - - iname_to_tag = {} - - # Add dimension-dependent inames. - dim_inames = [] - - domain = domain.add(isl.dim_type.set, dims_to_insert) - for t_idx in range(len(orig_temporary.shape)): - new_iname = name_gen("{name}_{mode}_dim_{dim}". - format(name=orig_temporary.name, - mode=mode, - dim=t_idx)) - domain = domain.set_dim_name( - isl.dim_type.set, orig_dim + t_idx, new_iname) - if orig_temporary.is_local: - # If the temporary is has local scope, then loads / stores can be - # done in parallel. - from loopy.kernel.data import AutoFitLocalIndexTag - iname_to_tag[new_iname] = AutoFitLocalIndexTag() - - dim_inames.append(new_iname) - - # Add size information. - aff = isl.affs_from_space(domain.space) - domain &= aff[0].le_set(aff[new_iname]) - size = orig_temporary.shape[t_idx] - from loopy.symbolic import aff_from_expr - domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, size)) - - hw_inames = [] - - # Add hardware inames duplicates. - for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): - new_iname = name_gen("{name}_{mode}_hw_dim_{dim}". - format(name=orig_temporary.name, - mode=mode, - dim=t_idx)) - hw_inames.append(new_iname) - iname_to_tag[new_iname] = kernel.iname_to_tag[hw_iname] - - from loopy.isl_helpers import duplicate_axes - domain = duplicate_axes( - domain, promoted_temporary.hw_inames, hw_inames) - - # The operations on the domain above return a Set object, but the - # underlying domain should be expressible as a single BasicSet. - domain_list = domain.get_basic_set_list() - assert domain_list.n_basic_set() == 1 - domain = domain_list.get_basic_set(0) - return domain, hw_inames, dim_inames, iname_to_tag - -# }}} - - -def restore_and_save_temporaries(kernel): - """ - Add code that loads / spills the temporaries in the kernel which are - live across sub-kernel calls. - """ - # Compute live temporaries. - live_in, live_out = compute_live_temporaries(kernel, kernel.schedule) - - # Create kernel variables based on live temporaries. - inter_kernel_temporaries = set() - - call_count = 0 - for idx, sched_item in enumerate(kernel.schedule): - if isinstance(sched_item, CallKernel): - inter_kernel_temporaries |= filter_out_subscripts(live_in[idx]) - call_count += 1 - - if call_count == 1: - # Single call corresponds to a kernel which has not been split - - # no need for restores / spills of temporaries. - return kernel - - name_gen = kernel.get_var_name_generator() - new_temporaries = determine_temporaries_to_promote( - kernel, inter_kernel_temporaries, name_gen) - - # {{{ Insert loads and spills of new temporaries - - new_schedule = [] - new_instructions = [] - new_iname_to_tag = {} - - idx = 0 - schedule = kernel.schedule - while idx < len(schedule): - sched_item = schedule[idx] - - if not isinstance(sched_item, CallKernel): - new_schedule.append(sched_item) - idx += 1 - continue - - subkernel_prolog = [] - subkernel_epilog = [] - subkernel_schedule = [] - - start_idx = idx - idx += 1 - while not isinstance(schedule[idx], ReturnFromKernel): - subkernel_schedule.append(schedule[idx]) - idx += 1 - - subkernel_defs, subkernel_uses = \ - get_temporaries_defined_and_used_in_subrange( - kernel, schedule, start_idx + 1, idx - 1) - - from loopy.kernel.data import temp_var_scope - # Filter out temporaries that are global. - subkernel_globals = set( - tval for tval in subkernel_defs | subkernel_uses - if kernel.temporary_variables[tval].scope == temp_var_scope.GLOBAL) - - tvals_to_spill = (subkernel_defs - subkernel_globals) & live_out[idx] - # Need to load tvals_to_spill, to avoid overwriting entries that the - # code doesn't touch when doing the spill. - tvals_to_load = ((subkernel_uses - subkernel_globals) - | tvals_to_spill) & live_in[start_idx] - - # Add new arguments. - sched_item = sched_item.copy( - extra_args=sched_item.extra_args - + sorted(new_temporaries[tv].name - for tv in tvals_to_load | tvals_to_spill)) - - # {{{ Add all the loads and spills. - - def insert_loads_or_spills(tvals, mode): - assert mode in ["load", "spill"] - local_temporaries = set() - - code_block = \ - subkernel_prolog if mode == "load" else subkernel_epilog - - new_kernel = kernel - - for tval in tvals: - from loopy.kernel.tools import DomainChanger - tval_hw_inames = new_temporaries[tval].hw_inames - dchg = DomainChanger(new_kernel, - frozenset(sched_item.extra_inames + tval_hw_inames)) - domain = dchg.domain - - domain, hw_inames, dim_inames, itt = \ - augment_domain_for_temporary_promotion( - new_kernel, domain, new_temporaries[tval], mode, - name_gen) - new_iname_to_tag.update(itt) - - new_kernel = dchg.get_kernel_with(domain) - - # Add the load / spill instruction. - insn_id = name_gen("{name}.{mode}".format(name=tval, mode=mode)) - - def subscript_or_var(agg, subscript): - from pymbolic.primitives import Subscript, Variable - if len(subscript) == 0: - return Variable(agg) - else: - return Subscript( - Variable(agg), - tuple(map(Variable, subscript))) - - args = ( - subscript_or_var( - tval, dim_inames), - subscript_or_var( - new_temporaries[tval].name, hw_inames + dim_inames)) - - if mode == "spill": - args = reversed(args) - - from loopy.kernel.data import Assignment - new_insn = Assignment(*args, id=insn_id, - within_inames=frozenset(hw_inames + dim_inames), - within_inames_is_final=True) - - new_instructions.append(new_insn) - - loop_begin = [EnterLoop(iname=iname) for iname in dim_inames] - loop_end = list(reversed([ - LeaveLoop(iname=iname) for iname in dim_inames])) - code_block.extend( - loop_begin + - [RunInstruction(insn_id=insn_id)] + - loop_end) - if new_temporaries[tval].orig_temporary.is_local: - local_temporaries.add(new_temporaries[tval].name) - - # After loading / before spilling local temporaries, we need to - # insert a barrier. - if local_temporaries: - if mode == "load": - subkernel_prolog.append( - Barrier(kind="local", - comment="for loads of {0}".format( - ", ".join(sorted(local_temporaries))))) - else: - subkernel_epilog.insert(0, - Barrier(kind="local", - comment="for spills of {0}".format( - ", ".join(sorted(local_temporaries))))) - return new_kernel - - kernel = insert_loads_or_spills(tvals_to_load, "load") - kernel = insert_loads_or_spills(tvals_to_spill, "spill") - - # }}} - - new_schedule.extend( - [sched_item] + - subkernel_prolog + - subkernel_schedule + - subkernel_epilog + - # ReturnFromKernel - [schedule[idx]]) - - # ReturnFromKernel - idx += 1 - - # }}} - - new_iname_to_tag.update(kernel.iname_to_tag) - updated_temporary_variables = dict( - (t.name, t.as_variable()) for t in new_temporaries.values()) - updated_temporary_variables.update(kernel.temporary_variables) - - kernel = kernel.copy( - iname_to_tag=new_iname_to_tag, - temporary_variables=updated_temporary_variables, - instructions=kernel.instructions + new_instructions, - schedule=new_schedule - ) - - from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) - - # Once assign_automatic_axes() does its job, loops in the schedule - # for newly hardware-tagged inames are no longer necessary (and in - # fact illegal), so remove them. - kernel = remove_illegal_loops_for_hw_tagged_inames_in_schedule(kernel) - return kernel -def add_extra_args_to_schedule(kernel): - """ - Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel` - instructions in the schedule with global temporaries. - """ - new_schedule = [] - - block_bounds = get_block_boundaries(kernel.schedule) - for idx, sched_item in enumerate(kernel.schedule): - if isinstance(sched_item, CallKernel): - defs, uses = get_temporaries_defined_and_used_in_subrange( - kernel, kernel.schedule, idx + 1, block_bounds[idx] - 1) - # Filter out temporaries that are global. - extra_args = (tv for tv in defs | uses if - kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL - and - kernel.temporary_variables[tv].initializer is None) - new_schedule.append(sched_item.copy(extra_args=sorted(extra_args))) - else: - new_schedule.append(sched_item) - - return kernel.copy(schedule=new_schedule) - - def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): schedule = kernel.schedule loop_bounds = get_block_boundaries(schedule) - # {{{ Inner mapper function + # {{{ inner mapper function dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[]) dummy_return = ReturnFromKernel(kernel_name="") @@ -760,6 +114,7 @@ def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): [dummy_call.copy()] + current_chunk + [dummy_return.copy()]) + new_schedule.append(sched_item) current_chunk = [] else: current_chunk.append(sched_item) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..5de677e72708be844a5276b3d40ace8b1dad9da0 --- /dev/null +++ b/loopy/schedule/tools.py @@ -0,0 +1,191 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2016 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.kernel.data import temp_var_scope +from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem, + RunInstruction, Barrier) + +from pytools import memoize_method + + +# {{{ block boundary finder + +def get_block_boundaries(schedule): + """ + Return a dictionary mapping indices of + :class:`loopy.schedule.BlockBeginItem`s to + :class:`loopy.schedule.BlockEndItem`s and vice versa. + """ + block_bounds = {} + active_blocks = [] + for idx, sched_item in enumerate(schedule): + if isinstance(sched_item, BeginBlockItem): + active_blocks.append(idx) + elif isinstance(sched_item, EndBlockItem): + start = active_blocks.pop() + block_bounds[start] = idx + block_bounds[idx] = start + return block_bounds + +# }}} + + +# {{{ instruction query utility + +class InstructionQuery(object): + + def __init__(self, kernel): + self.kernel = kernel + block_bounds = get_block_boundaries(kernel.schedule) + subkernel_slices = {} + from six import iteritems + for start, end in iteritems(block_bounds): + sched_item = kernel.schedule[start] + if isinstance(sched_item, CallKernel): + subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) + self.subkernel_slices = subkernel_slices + + @memoize_method + def subkernels(self): + return frozenset(self.subkernel_slices.keys()) + + @memoize_method + def insns_reading_or_writing(self, var): + return frozenset(insn.id for insn in self.kernel.instructions + if var in insn.read_dependency_names() + or var in insn.assignee_var_names()) + + @memoize_method + def insns_in_subkernel(self, subkernel): + return frozenset(sched_item.insn_id for sched_item + in self.kernel.schedule[self.subkernel_slices[subkernel]] + if isinstance(sched_item, RunInstruction)) + + @memoize_method + def temporaries_read_in_subkernel(self, subkernel): + return frozenset( + var + for insn in self.insns_in_subkernel(subkernel) + for var in self.kernel.id_to_insn[insn].read_dependency_names() + if var in self.kernel.temporary_variables) + + @memoize_method + def temporaries_written_in_subkernel(self, subkernel): + return frozenset( + var + for insn in self.insns_in_subkernel(subkernel) + for var in self.kernel.id_to_insn[insn].assignee_var_names() + if var in self.kernel.temporary_variables) + + @memoize_method + def temporaries_read_or_written_in_subkernel(self, subkernel): + return ( + self.temporaries_read_in_subkernel(subkernel) | + self.temporaries_written_in_subkernel(subkernel)) + + @memoize_method + def inames_in_subkernel(self, subkernel): + subkernel_start = self.subkernel_slices[subkernel].start + return frozenset(self.kernel.schedule[subkernel_start].extra_inames) + + @memoize_method + def pre_and_post_barriers(self, subkernel): + subkernel_start = self.subkernel_slices[subkernel].start + subkernel_end = self.subkernel_slices[subkernel].stop + + def is_global_barrier(item): + return isinstance(item, Barrier) and item.kind == "global" + + try: + pre_barrier = next(item for item in + self.kernel.schedule[subkernel_start::-1] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + pre_barrier = None + + try: + post_barrier = next(item for item in + self.kernel.schedule[subkernel_end:] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + post_barrier = None + + return (pre_barrier, post_barrier) + + @memoize_method + def hw_inames(self, insn_id): + """ + Return the inames that insn runs in and that are tagged as hardware + parallel. + """ + from loopy.kernel.data import HardwareParallelTag + return set(iname for iname in self.kernel.insn_inames(insn_id) + if isinstance(self.kernel.iname_to_tag.get(iname), + HardwareParallelTag)) + + @memoize_method + def common_hw_inames(self, insn_ids): + """ + Return the common set of hardware parallel tagged inames among + the list of instructions. + """ + # Get the list of hardware inames in which the temporary is defined. + if len(insn_ids) == 0: + return set() + return set.intersection(*(self.hw_inames(id) for id in insn_ids)) + +# }}} + + +# {{{ add extra args to schedule + +def add_extra_args_to_schedule(kernel): + """ + Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel` + instructions in the schedule with global temporaries. + """ + new_schedule = [] + + insn_query = InstructionQuery(kernel) + + for sched_item in kernel.schedule: + if isinstance(sched_item, CallKernel): + subrange_temporaries = (insn_query + .temporaries_read_or_written_in_subkernel(sched_item.kernel_name)) + more_args = set(tv + for tv in subrange_temporaries + if + kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL + and + kernel.temporary_variables[tv].initializer is None + and + tv not in sched_item.extra_args) + new_schedule.append(sched_item.copy( + extra_args=sched_item.extra_args + sorted(more_args))) + else: + new_schedule.append(sched_item) + + return kernel.copy(schedule=new_schedule) + +# }}} diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 8cc1950b8ea7358134e0861a7664ba3d74fb9417..68cc32e56be077c7e45d11b9e2aade86b04494cc 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -220,8 +220,9 @@ class ExpressionToCExpressionMapper(IdentityMapper): elif isinstance(ary, (GlobalArg, TemporaryVariable, ConstantArg)): if len(access_info.subscripts) == 0: - if (isinstance(ary, (ConstantArg, GlobalArg)) or - (isinstance(ary, TemporaryVariable) and ary.base_storage)): + if ( + (isinstance(ary, (ConstantArg, GlobalArg)) or + (isinstance(ary, TemporaryVariable) and ary.base_storage))): # unsubscripted global args are pointers result = var(access_info.array_name)[0] diff --git a/loopy/transform/__init__.py b/loopy/transform/__init__.py index 570b5efffb29e0ebb56b99444db19766127be596..f42fd3c8d2943bb37b75e9ef0003b88985950926 100644 --- a/loopy/transform/__init__.py +++ b/loopy/transform/__init__.py @@ -21,6 +21,3 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - - - diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index b2c86c084f0c56ebfb6ec8ebe4f6f5e65c5fd37d..92cff7a507d672a3acc51a8abed572a04cb7e86a 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -500,7 +500,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, store_instruction = Assignment( id=kernel.make_unique_instruction_id(based_on="store_"+var_name), depends_on=frozenset(aar.modified_insn_ids), - no_sync_with=frozenset([init_insn_id]), + no_sync_with=frozenset([(init_insn_id, "any")]), assignee=store_target, expression=store_expression, within_inames=( diff --git a/loopy/transform/save.py b/loopy/transform/save.py new file mode 100644 index 0000000000000000000000000000000000000000..8706bc4da70b94ad678f07158e0a0f648fdd0030 --- /dev/null +++ b/loopy/transform/save.py @@ -0,0 +1,587 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2016 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from loopy.diagnostic import LoopyError +import loopy as lp + +from loopy.kernel.data import auto +from pytools import memoize_method, Record +from loopy.schedule import ( + EnterLoop, LeaveLoop, RunInstruction, + CallKernel, ReturnFromKernel, Barrier) + +from loopy.schedule.tools import (get_block_boundaries, InstructionQuery) + + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: save_and_reload_temporaries +""" + + +# {{{ liveness analysis + +class LivenessResult(dict): + + class InstructionResult(Record): + __slots__ = ["live_in", "live_out"] + + @classmethod + def make_empty(cls, nscheditems): + return cls((idx, cls.InstructionResult(live_in=set(), live_out=set())) + for idx in range(nscheditems)) + + +class LivenessAnalysis(object): + + def __init__(self, kernel): + self.kernel = kernel + self.schedule = self.kernel.schedule + + @memoize_method + def get_successor_relation(self): + successors = {} + block_bounds = get_block_boundaries(self.kernel.schedule) + + for idx, (item, next_item) in enumerate(zip( + reversed(self.schedule), + reversed(self.schedule + [None]))): + sched_idx = len(self.schedule) - idx - 1 + + # Look at next_item + if next_item is None: + after = set() + elif isinstance(next_item, EnterLoop): + # Account for empty loop + loop_end = block_bounds[sched_idx + 1] + after = successors[loop_end] | set([sched_idx + 1]) + elif isinstance(next_item, (LeaveLoop, RunInstruction, + CallKernel, ReturnFromKernel, Barrier)): + after = set([sched_idx + 1]) + else: + raise LoopyError("unexpected type of schedule item: {ty}" + .format(ty=type(next_item).__name__)) + + # Look at item + if isinstance(item, LeaveLoop): + # Account for loop + loop_begin = block_bounds[sched_idx] + after |= set([loop_begin]) + elif not isinstance(item, (EnterLoop, RunInstruction, + CallKernel, ReturnFromKernel, Barrier)): + raise LoopyError("unexpected type of schedule item: {ty}" + .format(ty=type(item).__name__)) + + successors[sched_idx] = after + + return successors + + def get_gen_and_kill_sets(self): + gen = dict((idx, set()) for idx in range(len(self.schedule))) + kill = dict((idx, set()) for idx in range(len(self.schedule))) + + for sched_idx, sched_item in enumerate(self.schedule): + if not isinstance(sched_item, RunInstruction): + continue + insn = self.kernel.id_to_insn[sched_item.insn_id] + for var in insn.assignee_var_names(): + if var not in self.kernel.temporary_variables: + continue + if not insn.predicates: + # Fully kills the liveness only when unconditional. + kill[sched_idx].add(var) + if len(self.kernel.temporary_variables[var].shape) > 0: + # For an array variable, all definitions generate a use as + # well, because the write could be a partial write, + # necessitating a reload of whatever is not written. + # + # We don't currently check if the write is a partial write + # or a full write. Instead, we analyze the access + # footprint later on to determine how much to reload/save. + gen[sched_idx].add(var) + for var in insn.read_dependency_names(): + if var not in self.kernel.temporary_variables: + continue + gen[sched_idx].add(var) + + return gen, kill + + @memoize_method + def liveness(self): + logging.info("running liveness analysis") + successors = self.get_successor_relation() + gen, kill = self.get_gen_and_kill_sets() + + # Fixed point iteration for liveness analysis + lr = LivenessResult.make_empty(len(self.schedule)) + + prev_lr = None + + while prev_lr != lr: + from copy import deepcopy + prev_lr = deepcopy(lr) + for idx in range(len(self.schedule) - 1, -1, -1): + for succ in successors[idx]: + lr[idx].live_out.update(lr[succ].live_in) + lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx]) + + logging.info("done running liveness analysis") + + return lr + + def print_liveness(self): + print(75 * "-") + print("LIVE IN:") + for sched_idx, sched_item in enumerate(self.schedule): + print("{item}: {{{vars}}}".format( + item=sched_idx, + vars=", ".join(sorted(self[sched_idx].live_in)))) + print(75 * "-") + print("LIVE OUT:") + for sched_idx, sched_item in enumerate(self.schedule): + print("{item}: {{{vars}}}".format( + item=sched_idx, + vars=", ".join(sorted(self[sched_idx].live_out)))) + print(75 * "-") + + def __getitem__(self, sched_idx): + """ + :arg insn: An instruction name or instance of + :class:`loopy.instruction.InstructionBase` + + :returns: A :class:`LivenessResult` associated with `insn` + """ + return self.liveness()[sched_idx] + +# }}} + + +# {{{ save and reload implementation + +class TemporarySaver(object): + + class PromotedTemporary(Record): + """ + .. attribute:: name + + The name of the new temporary. + + .. attribute:: orig_temporary + + The original temporary variable object. + + .. attribute:: hw_inames + + The common list of hw axes that define the original object. + + .. attribute:: hw_dims + + A list of expressions, to be added in front of the shape + of the promoted temporary value, corresponding to + hardware dimensions + + .. attribute:: non_hw_dims + + A list of expressions, to be added in front of the shape + of the promoted temporary value, corresponding to + non-hardware dimensions + """ + + @memoize_method + def as_variable(self): + temporary = self.orig_temporary + from loopy.kernel.data import TemporaryVariable, temp_var_scope + return TemporaryVariable( + name=self.name, + dtype=temporary.dtype, + scope=temp_var_scope.GLOBAL, + shape=self.new_shape) + + @property + def new_shape(self): + return self.hw_dims + self.non_hw_dims + + def __init__(self, kernel): + self.kernel = kernel + self.insn_query = InstructionQuery(kernel) + self.var_name_gen = kernel.get_var_name_generator() + self.insn_name_gen = kernel.get_instruction_id_generator() + # These fields keep track of updates to the kernel. + self.insns_to_insert = [] + self.insns_to_update = {} + self.extra_args_to_add = {} + self.updated_iname_to_tag = {} + self.updated_temporary_variables = {} + self.saves_or_reloads_added = {} + + @memoize_method + def auto_promote_temporary(self, temporary_name): + temporary = self.kernel.temporary_variables[temporary_name] + + from loopy.kernel.data import temp_var_scope + if temporary.scope == temp_var_scope.GLOBAL: + # Nothing to be done for global temporaries (I hope) + return None + + if temporary.base_storage is not None: + raise ValueError( + "Cannot promote temporaries with base_storage to global") + + # `hw_inames`: The set of hw-parallel tagged inames that this temporary + # is associated with. This is used for determining the shape of the + # global storage needed for saving and restoring the temporary across + # kernel calls. + # + # TODO: Make a policy decision about which dimensions to use. Currently, + # the code looks at each instruction that defines or uses the temporary, + # and takes the common set of hw-parallel tagged inames associated with + # these instructions. + # + # Furthermore, in the case of local temporaries, inames that are tagged + # hw-local do not contribute to the global storage shape. + hw_inames = self.insn_query.common_hw_inames( + self.insn_query.insns_reading_or_writing(temporary.name)) + + # We want hw_inames to be arranged according to the order: + # g.0 < g.1 < ... < l.0 < l.1 < ... + # Sorting lexicographically accomplishes this. + hw_inames = sorted(hw_inames, + key=lambda iname: str(self.kernel.iname_to_tag[iname])) + + # Calculate the sizes of the dimensions that get added in front for + # the global storage of the temporary. + hw_dims = [] + + backing_hw_inames = [] + + for iname in hw_inames: + tag = self.kernel.iname_to_tag[iname] + from loopy.kernel.data import LocalIndexTag + is_local_iname = isinstance(tag, LocalIndexTag) + if is_local_iname and temporary.scope == temp_var_scope.LOCAL: + # Restrict shape to that of group inames for locals. + continue + backing_hw_inames.append(iname) + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import aff_to_expr + hw_dims.append( + aff_to_expr( + static_max_of_pw_aff( + self.kernel.get_iname_bounds(iname).size, False))) + + non_hw_dims = temporary.shape + + if len(non_hw_dims) == 0 and len(hw_dims) == 0: + # Scalar not in hardware: ensure at least one dimension. + non_hw_dims = (1,) + + backing_temporary = self.PromotedTemporary( + name=self.var_name_gen(temporary.name + "_save_slot"), + orig_temporary=temporary, + hw_dims=tuple(hw_dims), + non_hw_dims=non_hw_dims, + hw_inames=backing_hw_inames) + + return backing_temporary + + def save_or_reload_impl(self, temporary, subkernel, mode, + promoted_temporary=lp.auto): + assert mode in ("save", "reload") + + if promoted_temporary is auto: + promoted_temporary = self.auto_promote_temporary(temporary) + + if promoted_temporary is None: + return + + from loopy.kernel.tools import DomainChanger + dchg = DomainChanger( + self.kernel, + frozenset( + self.insn_query.inames_in_subkernel(subkernel) | + set(promoted_temporary.hw_inames))) + + domain, hw_inames, dim_inames, iname_to_tag = \ + self.augment_domain_for_save_or_reload( + dchg.domain, promoted_temporary, mode, subkernel) + + self.kernel = dchg.get_kernel_with(domain) + + save_or_load_insn_id = self.insn_name_gen( + "{name}.{mode}".format(name=temporary, mode=mode)) + + def subscript_or_var(agg, subscript=()): + from pymbolic.primitives import Subscript, Variable + if len(subscript) == 0: + return Variable(agg) + else: + return Subscript( + Variable(agg), + tuple(map(Variable, subscript))) + + dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] + + args = ( + subscript_or_var( + temporary, dim_inames_trunc), + subscript_or_var( + promoted_temporary.name, hw_inames + dim_inames)) + + if mode == "save": + args = reversed(args) + + accessing_insns_in_subkernel = ( + self.insn_query.insns_reading_or_writing(temporary) & + self.insn_query.insns_in_subkernel(subkernel)) + + if mode == "save": + depends_on = accessing_insns_in_subkernel + update_deps = frozenset() + elif mode == "reload": + depends_on = frozenset() + update_deps = accessing_insns_in_subkernel + + pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel) + + if pre_barrier is not None: + depends_on |= set([pre_barrier]) + + if post_barrier is not None: + update_deps |= set([post_barrier]) + + # Create the load / store instruction. + from loopy.kernel.data import Assignment + save_or_load_insn = Assignment( + *args, + id=save_or_load_insn_id, + within_inames=( + self.insn_query.inames_in_subkernel(subkernel) | + frozenset(hw_inames + dim_inames)), + within_inames_is_final=True, + depends_on=depends_on, + boostable=False, + boostable_into=frozenset()) + + if temporary not in self.saves_or_reloads_added: + self.saves_or_reloads_added[temporary] = set() + self.saves_or_reloads_added[temporary].add(save_or_load_insn_id) + + self.insns_to_insert.append(save_or_load_insn) + + for insn_id in update_deps: + insn = self.insns_to_update.get(insn_id, self.kernel.id_to_insn[insn_id]) + self.insns_to_update[insn_id] = insn.copy( + depends_on=insn.depends_on | frozenset([save_or_load_insn_id])) + + self.updated_temporary_variables[promoted_temporary.name] = \ + promoted_temporary.as_variable() + + self.updated_iname_to_tag.update(iname_to_tag) + + @memoize_method + def finish(self): + new_instructions = [] + + insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) + + # Add global no_sync_with between any added reloads and saves + from six import iteritems + for temporary, added_insns in iteritems(self.saves_or_reloads_added): + for insn_id in added_insns: + insn = insns_to_insert[insn_id] + insns_to_insert[insn_id] = insn.copy( + no_sync_with=frozenset( + (added_insn, "global") for added_insn in added_insns)) + + for orig_insn in self.kernel.instructions: + if orig_insn.id in self.insns_to_update: + new_instructions.append(self.insns_to_update[orig_insn.id]) + else: + new_instructions.append(orig_insn) + new_instructions.extend( + sorted(insns_to_insert.values(), key=lambda insn: insn.id)) + + self.updated_iname_to_tag.update(self.kernel.iname_to_tag) + self.updated_temporary_variables.update(self.kernel.temporary_variables) + + kernel = self.kernel.copy( + instructions=new_instructions, + iname_to_tag=self.updated_iname_to_tag, + temporary_variables=self.updated_temporary_variables) + + from loopy.kernel.tools import assign_automatic_axes + return assign_automatic_axes(kernel) + + def save(self, temporary, subkernel): + self.save_or_reload_impl(temporary, subkernel, "save") + + def reload(self, temporary, subkernel): + self.save_or_reload_impl(temporary, subkernel, "reload") + + def augment_domain_for_save_or_reload(self, + domain, promoted_temporary, mode, subkernel): + """ + Add new axes to the domain corresponding to the dimensions of + `promoted_temporary`. These axes will be used in the save/ + reload stage. + """ + assert mode in ("save", "reload") + import islpy as isl + + orig_temporary = promoted_temporary.orig_temporary + orig_dim = domain.dim(isl.dim_type.set) + + # Tags for newly added inames + iname_to_tag = {} + + # FIXME: Restrict size of new inames to access footprint. + + # Add dimension-dependent inames. + dim_inames = [] + domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims)) + + for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims): + new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}". + format(name=orig_temporary.name, + mode=mode, + dim=dim_idx, + sk=subkernel)) + domain = domain.set_dim_name( + isl.dim_type.set, orig_dim + dim_idx, new_iname) + + if orig_temporary.is_local: + # If the temporary has local scope, then loads / stores can + # be done in parallel. + from loopy.kernel.data import AutoFitLocalIndexTag + iname_to_tag[new_iname] = AutoFitLocalIndexTag() + + dim_inames.append(new_iname) + + # Add size information. + aff = isl.affs_from_space(domain.space) + domain &= aff[0].le_set(aff[new_iname]) + from loopy.symbolic import aff_from_expr + domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size)) + + # FIXME: Use promoted_temporary.hw_inames + hw_inames = [] + + # Add hardware inames duplicates. + for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): + new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}". + format(name=orig_temporary.name, + mode=mode, + dim=t_idx, + sk=subkernel)) + hw_inames.append(new_iname) + iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname] + + from loopy.isl_helpers import duplicate_axes + domain = duplicate_axes( + domain, promoted_temporary.hw_inames, hw_inames) + + # The operations on the domain above return a Set object, but the + # underlying domain should be expressible as a single BasicSet. + domain_list = domain.get_basic_set_list() + assert domain_list.n_basic_set() == 1 + domain = domain_list.get_basic_set(0) + return domain, hw_inames, dim_inames, iname_to_tag + +# }}} + + +# {{{ auto save and reload across kernel calls + +def save_and_reload_temporaries(knl): + """ + Add instructions to save and reload temporary variables that are live + across kernel calls. + + The basic code transformation turns schedule segments:: + + t = <...> + + <...> = t + + into this code:: + + t = <...> + t_save_slot = t + + t = t_save_slot + <...> = t + + where `t_save_slot` is a newly-created global temporary variable. + + :returns: The resulting kernel + """ + liveness = LivenessAnalysis(knl) + saver = TemporarySaver(knl) + + insn_query = InstructionQuery(knl) + + for sched_idx, sched_item in enumerate(knl.schedule): + + if isinstance(sched_item, CallKernel): + # Any written temporary that is live-out needs to be read into + # memory because of the potential for partial writes. + if sched_idx == 0: + # Kernel entry: nothing live + interesting_temporaries = set() + else: + interesting_temporaries = ( + insn_query.temporaries_read_or_written_in_subkernel( + sched_item.kernel_name)) + + for temporary in liveness[sched_idx].live_out & interesting_temporaries: + logger.info("reloading {0} at entry of {1}" + .format(temporary, sched_item.kernel_name)) + saver.reload(temporary, sched_item.kernel_name) + + elif isinstance(sched_item, ReturnFromKernel): + if sched_idx == len(knl.schedule) - 1: + # Kernel exit: nothing live + interesting_temporaries = set() + else: + interesting_temporaries = ( + insn_query.temporaries_written_in_subkernel( + sched_item.kernel_name)) + + for temporary in liveness[sched_idx].live_in & interesting_temporaries: + logger.info("saving {0} before return of {1}" + .format(temporary, sched_item.kernel_name)) + saver.save(temporary, sched_item.kernel_name) + + return saver.finish() + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/version.py b/loopy/version.py index 0b56284bbbf68b92bbab368de3c30a997724b29e..f7d157f650304a83164e11763279d3c5eabbc4c0 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v48-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v49-islpy%s" % _islpy_version diff --git a/test/test_loopy.py b/test/test_loopy.py index 69e0ea325a328d03bbd98cec2163f5c7981a4a78..af4269047539b800a5fd389f9293f11551c9a291 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1105,95 +1105,215 @@ def test_kernel_splitting_with_loop(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) -def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory): +def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False): + from loopy.preprocess import preprocess_kernel + from loopy.schedule import get_one_scheduled_kernel + + knl = preprocess_kernel(knl) + knl = get_one_scheduled_kernel(knl) + + from loopy.transform.save import save_and_reload_temporaries + knl = save_and_reload_temporaries(knl) + knl = get_one_scheduled_kernel(knl) + + if debug: + print(knl) + cgr = lp.generate_code_v2(knl) + print(cgr.device_code()) + print(cgr.host_code()) + 1/0 + + _, (out,) = knl(queue, out_host=True) + assert (out == out_expect).all() + + +@pytest.mark.parametrize("hw_loop", [True, False]) +def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False): ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - pytest.xfail("spilling doesn't yet use local axes") + knl = lp.make_kernel( + "{ [i]: 0<=i<8 }", + """ + for i + <>t = i + ... gbarrier + out[i] = t + end + """, seq_dependencies=True) + + if hw_loop: + knl = lp.tag_inames(knl, dict(i="g.0")) + + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) + + +def test_save_of_private_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{ [i,k]: 0<=i t_private_scalar = a[k,i+1] - <> t_private_array[i % 2] = a[k,i+1] - c[k,i] = a[k,i+1] - ... gbarrier - out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2] + "{ [i]: 0<=i<8 }", + """ + for i + <>t[i] = i + ... gbarrier + out[i] = t[i] + end + """, seq_dependencies=True) + + knl = lp.set_temporary_scope(knl, "t", "private") + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) + + +def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i,j,k]: 0<=i,j,k<8 }", + """ + for i + for j + <>t[j] = j end - """, seq_dependencies=True) + ... gbarrier + for k + out[i,k] = t[k] + end + end + """, seq_dependencies=True) - knl = lp.add_and_infer_dtypes(knl, - {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) - knl = lp.set_temporary_scope(knl, "t_private_scalar", "private") - knl = lp.set_temporary_scope(knl, "t_private_array", "private") + knl = lp.tag_inames(knl, dict(i="g.0")) + knl = lp.set_temporary_scope(knl, "t", "private") - ref_knl = knl + save_and_reload_temporaries_test( + queue, knl, np.vstack((8 * (np.arange(8),))), debug) - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - # schedule - from loopy.preprocess import preprocess_kernel - knl = preprocess_kernel(knl) +def test_save_of_private_multidim_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - from loopy.schedule import get_one_scheduled_kernel - knl = get_one_scheduled_kernel(knl) + knl = lp.make_kernel( + "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }", + """ + for i + for j, k + <>t[j,k] = k + end + ... gbarrier + for l, m + out[i,l,m] = t[l,m] + end + end + """, seq_dependencies=True) - # map schedule onto host or device - print(knl) + knl = lp.set_temporary_scope(knl, "t", "private") - cgr = lp.generate_code_v2(knl) + result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) + save_and_reload_temporaries_test(queue, knl, result, debug) - assert len(cgr.device_programs) == 2 - print(cgr.device_code()) - print(cgr.host_code()) +def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5)) + knl = lp.make_kernel( + "{ [i,j,k,l,m]: 0<=i,j,k,l,m<8 }", + """ + for i + for j, k + <>t[j,k] = k + end + ... gbarrier + for l, m + out[i,l,m] = t[l,m] + end + end + """, seq_dependencies=True) + + knl = lp.set_temporary_scope(knl, "t", "private") + knl = lp.tag_inames(knl, dict(i="g.0")) + + result = np.array([np.vstack((8 * (np.arange(8),))) for i in range(8)]) + save_and_reload_temporaries_test(queue, knl, result, debug) -def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory): +@pytest.mark.parametrize("hw_loop", [True, False]) +def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False): ctx = ctx_factory() + queue = cl.CommandQueue(ctx) knl = lp.make_kernel( - "{ [i,k]: 0<=i t_local[i % 8,k] = i % 8 - c[k,i] = a[k,i+1] + for i + for k + <> t_arr[k] = k + end + <> t_scalar = 1 + for j + ... gbarrier + out[j] = t_scalar + ... gbarrier + t_scalar = 10 + end ... gbarrier - out[k,i] = c[k,i] + t_local[i % 8,k] + <> flag = i == 9 + out[i] = t_arr[i] {if=flag} end """, seq_dependencies=True) - knl = lp.add_and_infer_dtypes(knl, - {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) + knl = lp.set_temporary_scope(knl, "t_arr", "private") + if hw_loop: + knl = lp.tag_inames(knl, dict(i="g.0")) - knl = lp.set_temporary_scope(knl, "t_local", "local") + result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9]) - ref_knl = knl + save_and_reload_temporaries_test(queue, knl, result, debug) - knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0") - # schedule - from loopy.preprocess import preprocess_kernel - knl = preprocess_kernel(knl) +def test_save_of_local_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) - from loopy.schedule import get_one_scheduled_kernel - knl = get_one_scheduled_kernel(knl) + knl = lp.make_kernel( + "{ [i,j]: 0<=i,j<8 }", + """ + for i, j + <>t[2*j] = j + t[2*j+1] = j + ... gbarrier + out[i] = t[2*i] + end + """, seq_dependencies=True) - # map schedule onto host or device - print(knl) + knl = lp.set_temporary_scope(knl, "t", "local") + knl = lp.tag_inames(knl, dict(i="g.0", j="l.0")) - cgr = lp.generate_code_v2(knl) + save_and_reload_temporaries_test(queue, knl, np.arange(8), debug) - assert len(cgr.device_programs) == 2 - print(cgr.device_code()) - print(cgr.host_code()) +def test_save_local_multidim_array(ctx_factory, debug=False): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{ [i,j,k]: 0<=i<2 and 0<=k<3 and 0<=j<2}", + """ + for i, j, k + ... gbarrier + <> t_local[k,j] = 1 + ... gbarrier + out[k,i*2+j] = t_local[k,j] + end + """, seq_dependencies=True) + + knl = lp.set_temporary_scope(knl, "t_local", "local") + knl = lp.tag_inames(knl, dict(j="l.0", i="g.0")) - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=8)) + save_and_reload_temporaries_test(queue, knl, 1, debug) def test_global_temporary(ctx_factory):