diff --git a/bin/loopy b/bin/loopy index 0ac8ee78a5da228bb7628cd0c8866bf5f29abeef..57dac8038ca638269249783151e4b195b05db223 100644 --- a/bin/loopy +++ b/bin/loopy @@ -197,7 +197,10 @@ def main(): import os edit_kernel_env = os.environ.get("LOOPY_EDIT_KERNEL") if (args.edit_code - or any(edit_kernel_env.lower() in k.name.lower() for k in kernels)): + or ( + edit_kernel_env is not None + and + any(edit_kernel_env.lower() in k.name.lower() for k in kernels))): from pytools import invoke_editor code = invoke_editor(code, filename="edit.cl") diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 0df707d0db82fcf42b60e8405d1f0d189fb31354..25cb764f3cd38c2b10b59edb71d850e28d6da2d6 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -997,9 +997,10 @@ transformation exists in :func:`loopy.add_prefetch`: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - a_fetch_0 = a[16 * gid(0) + lid(0)]; + a_fetch = a[16 * gid(0) + lid(0)]; + acc_k = 0.0f; for (int k = 0; k <= 15; ++k) - acc_k = acc_k + a_fetch_0; + acc_k = acc_k + a_fetch; out[16 * gid(0) + lid(0)] = acc_k; ... @@ -1021,12 +1022,14 @@ earlier: #define lid(N) ((int) get_local_id(N)) ... if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) - a_fetch_0[lid(0)] = a[lid(0) + 16 * gid(0)]; - barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch_0 (insn_k_update depends on a_fetch) */; + a_fetch[lid(0)] = a[lid(0) + 16 * gid(0)]; + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) + acc_k = 0.0f; + barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { for (int k = 0; k <= 15; ++k) - acc_k = acc_k + a_fetch_0[lid(0)]; + acc_k = acc_k + a_fetch[lid(0)]; out[16 * gid(0) + lid(0)] = acc_k; } ... @@ -1151,7 +1154,7 @@ sign that something is amiss: >>> evt, (out,) = knl(queue, a=a_mat_dev) Traceback (most recent call last): ... - WriteRaceConditionWarning: instruction 'a_fetch' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch_0' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch)' to silenced_warnings kernel argument to disable) + WriteRaceConditionWarning: instruction 'a_fetch_rule' looks invalid: it assigns to indices based on local IDs, but its temporary 'a_fetch' cannot be made local because a write race across the iname(s) 'j_inner' would emerge. (Do you need to add an extra iname to your prefetch?) (add 'write_race_local(a_fetch_rule)' to silenced_warnings kernel argument to disable) When we ask to see the code, the issue becomes apparent: @@ -1167,17 +1170,17 @@ When we ask to see the code, the issue becomes apparent: <BLANKLINE> __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *restrict a, int const n, __global float *restrict out) { - float a_fetch_0[16]; + float a_fetch[16]; <BLANKLINE> ... - a_fetch_0[lid(0)] = a[n * (lid(0) + 16 * gid(1)) + lid(1) + 16 * gid(0)]; + a_fetch[lid(0)] = a[n * (lid(0) + 16 * gid(1)) + lid(1) + 16 * gid(0)]; ... - out[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = a_fetch_0[lid(0)]; + out[n * (lid(1) + gid(0) * 16) + lid(0) + gid(1) * 16] = a_fetch[lid(0)]; ... } Loopy has a 2D workgroup to use for prefetching of a 1D array. When it -considers making *a_fetch_0* ``local`` (in the OpenCL memory sense of the word) +considers making *a_fetch* ``local`` (in the OpenCL memory sense of the word) to make use of parallelism in prefetching, it discovers that a write race across the remaining axis of the workgroup would emerge. diff --git a/loopy/__init__.py b/loopy/__init__.py index d417648b7272aeed5b4a594604ea5eb78024be55..ff3a004d98ec4e2f4894c719264c9d05b617fa90 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -944,6 +944,18 @@ def _add_kernel_axis(kernel, axis_name, start, stop, base_inames): .insert_dims(dim_type.set, new_dim_idx, 1) .set_dim_name(dim_type.set, new_dim_idx, axis_name)) + from loopy.symbolic import get_dependencies + deps = get_dependencies(start) | get_dependencies(stop) + assert deps <= kernel.all_params() + + param_names = domain.get_var_names(dim_type.param) + for dep in deps: + if dep not in param_names: + new_dim_idx = domain.dim(dim_type.param) + domain = (domain + .insert_dims(dim_type.param, new_dim_idx, 1) + .set_dim_name(dim_type.param, new_dim_idx, dep)) + from loopy.isl_helpers import make_slab slab = make_slab(domain.get_space(), axis_name, start, stop) @@ -1023,7 +1035,9 @@ def _process_footprint_subscripts(kernel, rule_name, sweep_inames, def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, - default_tag="l.auto", rule_name=None, footprint_subscripts=None, + default_tag="l.auto", rule_name=None, + temporary_name=None, temporary_is_local=None, + footprint_subscripts=None, fetch_bounding_box=False): """Prefetch all accesses to the variable *var_name*, with all accesses being swept through *sweep_inames*. @@ -1082,7 +1096,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, var_name_gen = kernel.get_var_name_generator() if rule_name is None: - rule_name = var_name_gen("%s_fetch" % c_name) + rule_name = var_name_gen("%s_fetch_rule" % c_name) + if temporary_name is None: + temporary_name = var_name_gen("%s_fetch" % c_name) arg = kernel.arg_dict[var_name] @@ -1119,7 +1135,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, new_kernel = precompute(kernel, subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, - fetch_bounding_box=fetch_bounding_box) + fetch_bounding_box=fetch_bounding_box, + temporary_name=temporary_name, + temporary_is_local=temporary_is_local) # {{{ remove inames that were temporarily added by slice sweeps diff --git a/loopy/check.py b/loopy/check.py index 01a6e52c268a5c4e76f94653490d05c345e1a7d0..2673e1bdf9e3557a95b3504aac7f184281403214 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1,7 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range +from __future__ import absolute_import, division, print_function __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,12 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six +from six.moves import range from islpy import dim_type import islpy as isl from loopy.symbolic import WalkMapper from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn -from loopy.tools import is_integer import logging logger = logging.getLogger(__name__) @@ -38,16 +36,6 @@ logger = logging.getLogger(__name__) # {{{ sanity checks run pre-scheduling -def check_temp_variable_shapes_are_constant(kernel): - for tv in six.itervalues(kernel.temporary_variables): - if any(not is_integer(s_i) for s_i in tv.shape): - raise LoopyError("shape of temporary variable '%s' is not " - "constant (but has to be since the size of " - "the temporary needs to be known at build time). " - "Use loopy.fix_parameters to set variables to " - "constant values." % tv.name) - - def check_insn_attributes(kernel): all_insn_ids = set(insn.id for insn in kernel.instructions) @@ -360,7 +348,6 @@ def pre_schedule_checks(kernel): try: logger.info("pre-schedule check %s: start" % kernel.name) - check_temp_variable_shapes_are_constant(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel) check_insn_attributes(kernel) @@ -484,10 +471,14 @@ def check_implemented_domains(kernel, implemented_domains, code=None): .project_out_except(insn_inames, [dim_type.set])) insn_domain = kernel.get_inames_domain(insn_inames) + insn_parameters = frozenset(insn_domain.get_var_names(dim_type.param)) assumptions, insn_domain = align_two(assumption_non_param, insn_domain) desired_domain = ((insn_domain & assumptions) - .project_out_except(insn_inames, [dim_type.set])) + .project_out_except(insn_inames, [dim_type.set]) + .project_out_except(insn_parameters, [dim_type.param])) + insn_impl_domain = (insn_impl_domain + .project_out_except(insn_parameters, [dim_type.param])) insn_impl_domain, desired_domain = align_two( insn_impl_domain, desired_domain) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index c5c0baec6d07fc3daf3adfa5a8e130d712575963..d2d178bc459cb0c231a78a5a1b2c3b8092d07536 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1074,7 +1074,7 @@ class LoopKernel(RecordWithoutPickling): if kernel.schedule is not None: lines.append("SCHEDULE:") from loopy.schedule import dump_schedule - lines.append(dump_schedule(kernel.schedule)) + lines.append(dump_schedule(kernel, kernel.schedule)) lines.append(sep) return "\n".join(lines) diff --git a/loopy/schedule.py b/loopy/schedule.py index 1d0dc1221a8280d73cdd03858bb72404c7c79afb..bba42381a5a4f67e957c137a2c866dd816c523c4 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -171,7 +171,33 @@ def find_used_inames_within(kernel, sched_index): return result -def find_loop_nest_map(kernel): +def find_loop_nest_with_map(kernel): + """Returns a dictionary mapping inames to other inames that are + always nested with them. + """ + result = {} + + from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag + + all_nonpar_inames = set([ + iname + for iname in kernel.all_inames() + if not isinstance(kernel.iname_to_tag.get(iname), + (ParallelTag, IlpBaseTag, VectorizeTag))]) + + iname_to_insns = kernel.iname_to_insns() + + for iname in all_nonpar_inames: + result[iname] = set([ + other_iname + for insn in iname_to_insns[iname] + for other_iname in kernel.insn_inames(insn) & all_nonpar_inames + ]) + + return result + + +def find_loop_nest_around_map(kernel): """Returns a dictionary mapping inames to other inames that are always nested around them. """ @@ -212,20 +238,26 @@ def find_loop_nest_map(kernel): return result -def find_loop_insn_dep_map(kernel, loop_nest_map): +def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): """Returns a dictionary mapping inames to other instruction ids that need to be scheduled before the iname should be eligible for scheduling. """ result = {} - from loopy.kernel.data import ParallelTag + from loopy.kernel.data import ParallelTag, IlpBaseTag, VectorizeTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): if isinstance(kernel.iname_to_tag.get(iname), ParallelTag): continue + iname_dep = result.setdefault(iname, set()) + for dep_insn_id in insn.insn_deps: + if dep_insn_id in iname_dep: + # already depending, nothing to check + continue + dep_insn = kernel.id_to_insn[dep_insn_id] dep_insn_inames = kernel.insn_inames(dep_insn) @@ -235,22 +267,43 @@ def find_loop_insn_dep_map(kernel, loop_nest_map): continue # To make sure dep_insn belongs outside of iname, we must prove - # (via loop_nest_map) that all inames that dep_insn will be - # executed inside are nested *around* iname. - if not dep_insn_inames <= loop_nest_map[iname]: + # that all inames that dep_insn will be executed in nest + # outside of the loop over *iname*. (i.e. nested around, or + # before). + + may_add_to_loop_dep_map = True + for dep_insn_iname in dep_insn_inames: + if dep_insn_iname in loop_nest_around_map[iname]: + # dep_insn_iname is guaranteed to nest outside of iname + # -> safe. + continue + + tag = kernel.iname_to_tag.get(dep_insn_iname) + if isinstance(tag, (ParallelTag, IlpBaseTag, VectorizeTag)): + # Parallel tags don't really nest, so we'll disregard + # them here. + continue + + if dep_insn_iname not in loop_nest_with_map.get(iname, []): + # dep_insn_iname does not nest with iname, so its nest + # must occur outside. + continue + + may_add_to_loop_dep_map = False + break + + if not may_add_to_loop_dep_map: continue - iname_dep = result.setdefault(iname, set()) - if dep_insn_id not in iname_dep: - logger.debug("{knl}: loop dependency map: iname '{iname}' " - "depends on '{dep_insn}' via '{insn}'" - .format( - knl=kernel.name, - iname=iname, - dep_insn=dep_insn_id, - insn=insn.id)) + logger.debug("{knl}: loop dependency map: iname '{iname}' " + "depends on '{dep_insn}' via '{insn}'" + .format( + knl=kernel.name, + iname=iname, + dep_insn=dep_insn_id, + insn=insn.id)) - iname_dep.add(dep_insn_id) + iname_dep.add(dep_insn_id) return result @@ -269,21 +322,34 @@ def group_insn_counts(kernel): # {{{ debug help -def dump_schedule(schedule): - entries = [] +def dump_schedule(kernel, schedule): + lines = [] + indent = "" + + from loopy.kernel.data import ExpressionInstruction for sched_item in schedule: if isinstance(sched_item, EnterLoop): - entries.append("<%s>" % sched_item.iname) + lines.append(indent + "LOOP %s" % sched_item.iname) + indent += " " elif isinstance(sched_item, LeaveLoop): - entries.append("</%s>" % sched_item.iname) + indent = indent[:-4] + lines.append(indent + "ENDLOOP %s" % sched_item.iname) elif isinstance(sched_item, RunInstruction): - entries.append(sched_item.insn_id) + insn = kernel.id_to_insn[sched_item.insn_id] + if isinstance(insn, ExpressionInstruction): + insn_str = "[%s] %s <- %s" % ( + insn.id, str(insn.assignee), str(insn.expression)) + else: + insn_str = sched_item.insn_id + lines.append(indent + insn_str) elif isinstance(sched_item, Barrier): - entries.append("|") + lines.append(indent + "---BARRIER---") else: assert False - return " ".join(entries) + return "\n".join( + "% 4d: %s" % (i, line) + for i, line in enumerate(lines)) class ScheduleDebugger: @@ -343,6 +409,11 @@ class ScheduleDebugger: def start(self): from time import time self.start_time = time() + + +class ScheduleDebugInput(Exception): + pass + # }}} @@ -352,11 +423,11 @@ class SchedulerState(Record): """ .. attribute:: kernel - .. attribute:: loop_nest_map + .. attribute:: loop_nest_around_map .. attribute:: loop_priority - See :func:`loop_nest_map`. + See :func:`loop_nest_around_map`. .. attribute:: breakable_inames @@ -409,7 +480,7 @@ class SchedulerState(Record): def generate_loop_schedules_internal( - sched_state, allow_boost=False, allow_insn=False, debug=None): + sched_state, allow_boost=False, debug=None): # allow_insn is set to False initially and after entering each loop # to give loops containing high-priority instructions a chance. @@ -439,18 +510,16 @@ def generate_loop_schedules_internal( print(kernel) print(75*"=") print("CURRENT SCHEDULE:") - print("%s (length: %d)" % ( - dump_schedule(sched_state.schedule), - len(sched_state.schedule))) - print("(LEGEND: entry into loop: <iname>, exit from loop: </iname>, " - "instructions w/ no delimiters)") + print(dump_schedule(sched_state.kernel, sched_state.schedule)) #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP:") - for iname, val in six.iteritems(sched_state.loop_nest_map): + for iname, val in six.iteritems(sched_state.loop_nest_around_map): print("%s : %s" % (iname, ", ".join(val))) print(75*"=") - print("WHY IS THIS A DEAD-END SCHEDULE?") + + if debug.debug_length == len(debug.longest_rejected_schedule): + print("WHY IS THIS A DEAD-END SCHEDULE?") #if len(schedule) == 2: #from pudb import set_trace; set_trace() @@ -525,10 +594,10 @@ def generate_loop_schedules_internal( # }}} - if is_ready and allow_insn: - if debug_mode: - print("scheduling '%s'" % insn.id) + if is_ready and debug_mode: + print("ready to schedule '%s'" % insn.id) + if is_ready and not debug_mode: iid_set = frozenset([insn.id]) # {{{ update active group counts for added instruction @@ -565,8 +634,7 @@ def generate_loop_schedules_internal( for sub_sched in generate_loop_schedules_internal( new_sched_state, - allow_boost=rec_allow_boost, debug=debug, - allow_insn=True): + allow_boost=rec_allow_boost, debug=debug): yield sub_sched if not sched_state.group_insn_counts: @@ -618,15 +686,14 @@ def generate_loop_schedules_internal( can_leave = True break - if can_leave: + if can_leave and not debug_mode: for sub_sched in generate_loop_schedules_internal( sched_state.copy( schedule=( sched_state.schedule + (LeaveLoop(iname=last_entered_loop),)), active_inames=sched_state.active_inames[:-1]), - allow_boost=rec_allow_boost, debug=debug, - allow_insn=allow_insn): + allow_boost=rec_allow_boost, debug=debug): yield sub_sched return @@ -667,9 +734,11 @@ def generate_loop_schedules_internal( currently_accessible_inames = ( active_inames_set | sched_state.parallel_inames) - if not sched_state.loop_nest_map[iname] <= currently_accessible_inames: + if ( + not sched_state.loop_nest_around_map[iname] + <= currently_accessible_inames): if debug_mode: - print("scheduling %s prohibited by loop nest map" % iname) + print("scheduling %s prohibited by loop nest-around map" % iname) continue if ( @@ -694,7 +763,7 @@ def generate_loop_schedules_internal( iname_home_domain.get_var_names(dim_type.param)) # The previous check should have ensured this is true, because - # the loop_nest_map takes the domain dependency graph into + # the loop_nest_around_map takes the domain dependency graph into # consideration. assert (iname_home_domain_params & kernel.all_inames() <= currently_accessible_inames) @@ -790,37 +859,42 @@ def generate_loop_schedules_internal( if debug_mode: print("useful inames: %s" % ",".join(useful_loops_set)) - - for tier in priority_tiers: - found_viable_schedule = False - - for iname in sorted(tier, - key=lambda iname: iname_to_usefulness.get(iname, 0), - reverse=True): - - for sub_sched in generate_loop_schedules_internal( - sched_state.copy( - schedule=( - sched_state.schedule - + (EnterLoop(iname=iname),)), - active_inames=( - sched_state.active_inames + (iname,)), - entered_inames=( - sched_state.entered_inames | frozenset((iname,))), - ), - allow_boost=rec_allow_boost, - debug=debug): - found_viable_schedule = True - yield sub_sched - - if found_viable_schedule: - return + else: + for tier in priority_tiers: + found_viable_schedule = False + + for iname in sorted(tier, + key=lambda iname: iname_to_usefulness.get(iname, 0), + reverse=True): + + for sub_sched in generate_loop_schedules_internal( + sched_state.copy( + schedule=( + sched_state.schedule + + (EnterLoop(iname=iname),)), + active_inames=( + sched_state.active_inames + (iname,)), + entered_inames=( + sched_state.entered_inames + | frozenset((iname,))), + ), + allow_boost=rec_allow_boost, + debug=debug): + found_viable_schedule = True + yield sub_sched + + if found_viable_schedule: + return # }}} if debug_mode: print(75*"=") - six.moves.input("Hit Enter for next schedule:") + inp = six.moves.input("Hit Enter for next schedule, " + "or enter a number to examine schedules of a " + "different length:") + if inp: + raise ScheduleDebugInput(inp) if not sched_state.active_inames and not sched_state.unscheduled_insn_ids: # if done, yield result @@ -829,20 +903,11 @@ def generate_loop_schedules_internal( yield sched_state.schedule else: - if not allow_insn: - # try again with boosting allowed - for sub_sched in generate_loop_schedules_internal( - sched_state, - allow_boost=allow_boost, debug=debug, - allow_insn=True): - yield sub_sched - if not allow_boost and allow_boost is not None: # try again with boosting allowed for sub_sched in generate_loop_schedules_internal( sched_state, - allow_boost=True, debug=debug, - allow_insn=allow_insn): + allow_boost=True, debug=debug): yield sub_sched else: # dead end @@ -1243,11 +1308,15 @@ def generate_loop_schedules(kernel, debug_args={}): iname for iname in kernel.all_inames() if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) - loop_nest_map = find_loop_nest_map(kernel) + loop_nest_with_map = find_loop_nest_with_map(kernel) + loop_nest_around_map = find_loop_nest_around_map(kernel) sched_state = SchedulerState( kernel=kernel, - loop_nest_map=loop_nest_map, - loop_insn_dep_map=find_loop_insn_dep_map(kernel, loop_nest_map), + loop_nest_around_map=loop_nest_around_map, + loop_insn_dep_map=find_loop_insn_dep_map( + kernel, + loop_nest_with_map=loop_nest_with_map, + loop_nest_around_map=loop_nest_around_map), breakable_inames=ilp_inames, ilp_inames=ilp_inames, vec_inames=vec_inames, @@ -1289,9 +1358,17 @@ def generate_loop_schedules(kernel, debug_args={}): print() debug.debug_length = len(debug.longest_rejected_schedule) - for _ in generate_loop_schedules_internal(sched_state, - debug=debug): - pass + while True: + try: + for _ in generate_loop_schedules_internal(sched_state, + debug=debug): + pass + + except ScheduleDebugInput as e: + debug.debug_length = int(str(e)) + continue + + break try: for gen in generators: @@ -1327,6 +1404,7 @@ def generate_loop_schedules(kernel, debug_args={}): break except KeyboardInterrupt: + print() print(75*"-") print("Interrupted during scheduling") print(75*"-") diff --git a/setup.cfg b/setup.cfg index d3f13a0e64b79c00a957cb1369e335e0b8a00d76..2dc94705f792ddca37ca7bf1f8ddec67be7ed3b9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,3 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503 max-line-length=85 diff --git a/test/test_loopy.py b/test/test_loopy.py index 7cad3504859d199c0581c8d3248ebafe50a34c4a..d7d7dc5768f163a84306ef234af08cf620c4066c 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2174,6 +2174,23 @@ def test_to_batched(ctx_factory): bknl(queue, a=a, x=x) +def test_variable_size_temporary(): + knl = lp.make_kernel( + ''' { [i,j]: 0<=i,j<n } ''', + ''' out[i] = sum(j, a[i,j])''') + + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + + knl = lp.add_prefetch( + knl, "a[:,:]", default_tag=None) + + # Make sure that code generation succeeds even if + # there are variable-length arrays. + knl = lp.preprocess_kernel(knl) + for k in lp.generate_loop_schedules(knl): + lp.generate_code(k) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])