diff --git a/loopy/__init__.py b/loopy/__init__.py index 7cadc18d8191f1f6860a273cebbb002f18857d82..246dc295670fec625369bac8239ade9c91250ddf 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -66,7 +66,11 @@ def split_dimension(kernel, split_iname, inner_length, outer_tag=None, inner_tag=None, slabs=(0, 0), do_tagged_check=True): - if do_tagged_check and kernel.iname_to_tag.get(split_iname) is not None: + existing_tag = kernel.iname_to_tag.get(split_iname) + from loopy.kernel import ForceSequentialTag + if do_tagged_check and ( + existing_tag is not None + and not isinstance(existing_tag, ForceSequentialTag)): raise RuntimeError("cannot split already tagged iname '%s'" % split_iname) if split_iname not in kernel.all_inames(): @@ -158,6 +162,10 @@ def split_dimension(kernel, split_iname, inner_length, applied_iname_rewrites=applied_iname_rewrites, )) + if existing_tag is not None: + result = tag_dimensions(result, + {outer_iname: existing_tag, inner_iname: existing_tag}) + return tag_dimensions(result, {outer_iname: outer_tag, inner_iname: inner_tag}) # }}} @@ -263,7 +271,8 @@ def tag_dimensions(kernel, iname_to_tag, force=False): iname_to_tag = dict((iname, parse_tag(tag)) for iname, tag in iname_to_tag.iteritems()) - from loopy.kernel import (ParallelTag, AutoLocalIndexTagBase) + from loopy.kernel import (ParallelTag, AutoLocalIndexTagBase, + ForceSequentialTag) new_iname_to_tag = kernel.iname_to_tag.copy() for iname, new_tag in iname_to_tag.iteritems(): @@ -274,7 +283,7 @@ def tag_dimensions(kernel, iname_to_tag, force=False): retag_ok = False - if isinstance(old_tag, AutoLocalIndexTagBase): + if isinstance(old_tag, (AutoLocalIndexTagBase, ForceSequentialTag)): retag_ok = True if not retag_ok and old_tag is not None and new_tag is None: @@ -283,10 +292,16 @@ def tag_dimensions(kernel, iname_to_tag, force=False): if iname not in kernel.all_inames(): raise ValueError("cannot tag '%s'--not known" % iname) - if isinstance(new_tag, ParallelTag) and iname in kernel.sequential_inames: + if isinstance(new_tag, ParallelTag) and isinstance(old_tag, ForceSequentialTag): raise ValueError("cannot tag '%s' as parallel--" "iname requires sequential execution" % iname) + if isinstance(new_tag, ForceSequentialTag) and isinstance(old_tag, ParallelTag): + raise ValueError("'%s' is already tagged as parallel, " + "but is now prohibited from being parallel " + "(likely because of participation in a precompute or " + "a reduction)" % iname) + if (not retag_ok) and (not force) and old_tag is not None and (old_tag != new_tag): raise RuntimeError("'%s' is already tagged '%s'--cannot retag" % (iname, old_tag)) diff --git a/loopy/check.py b/loopy/check.py index f1f178000a0ec7a53abca7f094d6305ab56a5fb2..51b949aa95c4abbd8cb4dcd7dddc6f6a15e41c14 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -86,11 +86,15 @@ def check_for_inactive_iname_access(kernel): +class WriteRaceConditionError(RuntimeError): + pass + def check_for_write_races(kernel): from loopy.symbolic import DependencyMapper - from loopy.kernel import ParallelTag, GroupIndexTag + from loopy.kernel import ParallelTag, GroupIndexTag, LocalIndexTagBase depmap = DependencyMapper() + iname_to_tag = kernel.iname_to_tag.get for insn in kernel.instructions: assignee_name = insn.get_assignee_var_name() assignee_indices = depmap(insn.get_assignee_indices()) @@ -109,42 +113,33 @@ def check_for_write_races(kernel): "iname that the instruction does not depend on" % insn.id) - inames_without_write_dep = None - if assignee_name in kernel.arg_dict: # Any parallel tags that are not depended upon by the assignee # will cause write races. - parallel_insn_inames = set( + raceable_parallel_insn_inames = set( iname for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) - - inames_without_write_dep = parallel_insn_inames - ( - assignee_inames & parallel_insn_inames) + if isinstance(iname_to_tag(iname), ParallelTag)) elif assignee_name in kernel.temporary_variables: temp_var = kernel.temporary_variables[assignee_name] if temp_var.is_local == True: - local_parallel_insn_inames = set( + raceable_parallel_insn_inames = set( iname for iname in kernel.insn_inames(insn) - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag) - and not isinstance(kernel.iname_to_tag.get(iname), GroupIndexTag)) - - inames_without_write_dep = local_parallel_insn_inames - ( - assignee_inames & local_parallel_insn_inames) + if isinstance(iname_to_tag(iname), ParallelTag) + and not isinstance(iname_to_tag(iname), GroupIndexTag)) elif temp_var.is_local == False: - #from loopy.kernel import IlpBaseTag - #ilp_inames = set( - #iname - #for iname in kernel.insn_inames(insn) - #if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag)) - - #inames_without_write_dep = ilp_inames - assignee_inames - - inames_without_write_dep = set() + raceable_parallel_insn_inames = set( + iname + for iname in kernel.insn_inames(insn) + if isinstance(iname_to_tag(iname), ParallelTag) + and not isinstance(iname_to_tag(iname), + GroupIndexTag) + and not isinstance(iname_to_tag(iname), + LocalIndexTagBase)) else: raise RuntimeError("temp var '%s' hasn't decided on " @@ -154,14 +149,15 @@ def check_for_write_races(kernel): raise RuntimeError("invalid assignee name in instruction '%s'" % insn.id) - assert inames_without_write_dep is not None + race_inames = \ + raceable_parallel_insn_inames - assignee_inames - if inames_without_write_dep: - raise RuntimeError( + if race_inames: + raise WriteRaceConditionError( "instruction '%s' contains a write race: " "instruction will be run across parallel iname(s) '%s', which " "is/are not referenced in the lhs index" - % (insn.id, ",".join(inames_without_write_dep))) + % (insn.id, ",".join(race_inames))) def check_for_orphaned_user_hardware_axes(kernel): from loopy.kernel import LocalIndexTag diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 79c86a18114cb61782559a098883f27762e7baaf..5dfea65cca1abc4639ef35e2da54e2ef62ce5f11 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -41,10 +41,11 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): generate_unroll_loop, generate_sequential_loop_dim_code) - from loopy.kernel import UnrollTag - if isinstance(tag, UnrollTag): + from loopy.kernel import (UnrolledIlpTag, UnrollTag, ForceSequentialTag, + LoopedIlpTag) + if isinstance(tag, (UnrollTag, UnrolledIlpTag)): func = generate_unroll_loop - elif tag is None: + elif tag is None or isinstance(tag, (LoopedIlpTag, ForceSequentialTag)): func = generate_sequential_loop_dim_code else: raise RuntimeError("encountered (invalid) EnterLoop for '%s', tagged '%s'" diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index db3ff940c1c56e43026b73c79517b61c16fcd154..bbb9c166651647e2be95fdfde495cdcad405b8d2 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -104,7 +104,6 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): def generate_unroll_loop(kernel, sched_index, codegen_state): iname = kernel.schedule[sched_index].iname - tag = kernel.iname_to_tag.get(iname) bounds = kernel.get_iname_bounds(iname) @@ -118,20 +117,15 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): bounds.lower_bound_pw_aff.coalesce(), constants_only=False) - from loopy.kernel import UnrollTag - if isinstance(tag, UnrollTag): - result = [] - - for i in range(length): - idx_aff = lower_bound_aff + i - new_codegen_state = codegen_state.fix(iname, idx_aff) - result.append( - build_loop_nest(kernel, sched_index+1, new_codegen_state)) + result = [] - return gen_code_block(result) + for i in range(length): + idx_aff = lower_bound_aff + i + new_codegen_state = codegen_state.fix(iname, idx_aff) + result.append( + build_loop_nest(kernel, sched_index+1, new_codegen_state)) - else: - raise RuntimeError("unexpected tag") + return gen_code_block(result) # }}} diff --git a/loopy/creation.py b/loopy/creation.py index 6061d14a3339ce4085ec567dd65f4d494f0c308f..99951635defce433b38148ecf66c8a30ce046507 100644 --- a/loopy/creation.py +++ b/loopy/creation.py @@ -3,6 +3,33 @@ import numpy as np from loopy.symbolic import IdentityMapper +def tag_reduction_inames_as_sequential(knl): + result = set() + + def map_reduction(red_expr, rec): + rec(red_expr.expr) + result.update(red_expr.inames) + + from loopy.symbolic import ReductionCallbackMapper + for insn in knl.instructions: + ReductionCallbackMapper(map_reduction)(insn.expression) + + from loopy.kernel import ParallelTag, ForceSequentialTag + + new_iname_to_tag = {} + for iname in result: + tag = knl.iname_to_tag.get(iname) + if tag is not None and isinstance(tag, ParallelTag): + raise RuntimeError("inconsistency detected: " + "reduction iname '%s' has " + "a parallel tag" % iname) + + if tag is None: + new_iname_to_tag[iname] = ForceSequentialTag() + + from loopy import tag_dimensions + return tag_dimensions(knl, new_iname_to_tag) + # {{{ sanity checking def check_for_duplicate_names(knl): @@ -378,6 +405,8 @@ def make_kernel(*args, **kwargs): check_for_nonexistent_iname_deps(knl) + knl = tag_reduction_inames_as_sequential(knl) + knl = create_temporaries(knl) knl = duplicate_reduction_inames(knl) knl = duplicate_inames(knl) diff --git a/loopy/cse.py b/loopy/cse.py index 3a7e6e8b81255c849122c321942b9a32e80fb760..65dcf22d92cb42462086d437b0abc9073be81368 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -536,6 +536,7 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], InvocationDescriptor(expr=expr, args=args, expands_footprint=footprint_generators is None, from_subst_rule=current_subst_rule)) + return expr from loopy.symbolic import SubstitutionCallbackMapper @@ -550,15 +551,17 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # }}} sweep_inames = list(sweep_inames) + sweep_inames_set = frozenset(sweep_inames) - # {{{ find inames used in argument dependencies + # {{{ find inames used in arguments expanding_usage_arg_deps = set() for invdesc in invocation_descriptors: if invdesc.expands_footprint: for arg in invdesc.args: - expanding_usage_arg_deps.update(get_dependencies(arg)) + expanding_usage_arg_deps.update( + get_dependencies(arg) & kernel.all_inames()) # }}} @@ -566,18 +569,28 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # {{{ use given / find new storage_axes - extra_storage_axes = list(set(sweep_inames) - expanding_usage_arg_deps) + # extra axes made necessary because they don't occur in the arguments + extra_storage_axes = sweep_inames_set - expanding_usage_arg_deps + + from loopy.symbolic import ParametrizedSubstitutor + submap = ParametrizedSubstitutor(kernel.substitutions) + + value_inames = get_dependencies(submap(subst.expression)) & kernel.all_inames() + if value_inames - expanding_usage_arg_deps < extra_storage_axes: + raise RuntimeError("unreferenced sweep inames specified: " + + ", ".join(extra_storage_axes - value_inames - expanding_usage_arg_deps)) + + new_iname_to_tag = {} if storage_axes is None: storage_axes = ( - extra_storage_axes + list(extra_storage_axes) + list(xrange(len(arg_names)))) expr_subst_dict = {} storage_axis_names = [] storage_axis_sources = [] # number for arg#, or iname - storage_axis_name_to_tag = {} for i, saxis in enumerate(storage_axes): tag_lookup_saxis = saxis @@ -606,7 +619,7 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], based_on=name, extra_used_vars=newly_created_var_names) storage_axis_names.append(name) - storage_axis_name_to_tag[name] = storage_axis_to_tag.get( + new_iname_to_tag[name] = storage_axis_to_tag.get( tag_lookup_saxis, default_tag) newly_created_var_names.add(name) @@ -624,7 +637,7 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # }}} - expanding_inames = frozenset(sweep_inames) | frozenset(expanding_usage_arg_deps) + expanding_inames = sweep_inames_set | frozenset(expanding_usage_arg_deps) assert expanding_inames <= kernel.all_inames() # {{{ find domain to be changed @@ -637,7 +650,7 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # fetches with loops over copies of these parent inames that will end # up being scheduled *within* loops over these parents. - for iname in sweep_inames: + for iname in sweep_inames_set: if kernel.get_home_domain_index(iname) != domch.leaf_domain_index: raise RuntimeError("sweep iname '%s' is not 'at home' in the " "sweep's leaf domain" % iname) @@ -653,6 +666,10 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], from loopy.isl_helpers import convexify new_domain = convexify(new_domain) + for saxis in storage_axis_names: + if saxis not in non1_storage_axis_names: + del new_iname_to_tag[saxis] + # {{{ set up compute insn target_var_name = kernel.make_unique_var_name(based_on=c_subst_name, @@ -804,14 +821,6 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # }}} - # {{{ fill out new_iname_to_tag - - new_iname_to_tag = kernel.iname_to_tag.copy() - for arg_name in non1_storage_axis_names: - new_iname_to_tag[arg_name] = storage_axis_name_to_tag[arg_name] - - # }}} - # {{{ set up temp variable from loopy.kernel import TemporaryVariable @@ -828,12 +837,14 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # }}} - return kernel.copy( + result = kernel.copy( domains=domch.get_domains_with(new_domain), instructions=new_insns, substitutions=new_substs, - temporary_variables=new_temporary_variables, - iname_to_tag=new_iname_to_tag) + temporary_variables=new_temporary_variables) + + from loopy import tag_dimensions + return tag_dimensions(result, new_iname_to_tag) diff --git a/loopy/kernel.py b/loopy/kernel.py index 7c83bd1ddd27768a9c4c97126a348001ff88f8e1..30abc25614acbd971b1610100c0d42fbc90e769a 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -81,6 +81,10 @@ class UnrollTag(IndexTag): def __str__(self): return "unr" +class ForceSequentialTag(IndexTag): + def __str__(self): + return "forceseq" + def parse_tag(tag): if tag is None: return tag @@ -683,8 +687,6 @@ class LoopKernel(Record): were applied to the kernel. These are stored so that they may be repeated on expressions the user specifies later. :ivar cache_manager: - :ivar lowest_priority_inames: (used internally to realize ILP) - :ivar breakable_inames: these inames' loops may be broken up by the scheduler :ivar isl_context: The following instance variables are only used until :func:`loopy.make_kernel` is @@ -717,7 +719,6 @@ class LoopKernel(Record): applied_iname_rewrites=[], cache_manager=None, iname_to_tag_requests=None, - lowest_priority_inames=[], breakable_inames=set(), index_dtype=np.int32, isl_context=None): """ @@ -1006,8 +1007,6 @@ class LoopKernel(Record): iname_to_tag_requests=iname_to_tag_requests, substitutions=substitutions, cache_manager=cache_manager, - lowest_priority_inames=lowest_priority_inames, - breakable_inames=breakable_inames, applied_iname_rewrites=applied_iname_rewrites, function_manglers=function_manglers, symbol_manglers=symbol_manglers, @@ -1329,51 +1328,6 @@ class LoopKernel(Record): return result - @property - @memoize_method - def sequential_inames(self): - result = set() - - def map_reduction(red_expr, rec): - rec(red_expr.expr) - result.update(red_expr.inames) - - from loopy.symbolic import ReductionCallbackMapper - for insn in self.instructions: - ReductionCallbackMapper(map_reduction)(insn.expression) - - for iname in result: - tag = self.iname_to_tag.get(iname) - if tag is not None and isinstance(tag, ParallelTag): - raise RuntimeError("inconsistency detected: " - "sequential/reduction iname '%s' has " - "a parallel tag" % iname) - - return result - - @memoize_method - def loop_nest_map(self): - """Returns a dictionary mapping inames to other inames that are - always nested around them. - """ - result = {} - - all_inames = self.all_inames() - - # {{{ examine instructions - - iname_to_insns = self.iname_to_insns() - - # examine pairs of all inames--O(n**2), I know. - for inner_iname in all_inames: - result[inner_iname] = set() - for outer_iname in self.all_inames(): - if outer_iname in self.breakable_inames: - continue - - if iname_to_insns[inner_iname] < iname_to_insns[outer_iname]: - result[inner_iname].add(outer_iname) - # }}} # {{{ examine domains @@ -1637,7 +1591,8 @@ class LoopKernel(Record): lines.append(sep) lines.append("INAME-TO-TAG MAP:") for iname in sorted(self.all_inames()): - lines.append("%s: %s" % (iname, self.iname_to_tag.get(iname))) + line = "%s: %s" % (iname, self.iname_to_tag.get(iname)) + lines.append(line) lines.append(sep) lines.append("DOMAINS:") @@ -1677,6 +1632,12 @@ class LoopKernel(Record): lines.append("%s : %s" % (insn.id, ",".join(insn.insn_deps))) lines.append(sep) + if self.schedule is not None: + lines.append("SCHEDULE:") + from loopy.schedule import dump_schedule + lines.append(dump_schedule(self.schedule)) + lines.append(sep) + return "\n".join(lines) # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 9f87ba9be3ca1f34e0aedaf2721a34526db4f48c..2e4b98ba66ab73787b7fa1bcb1b421823ccec1dc 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -108,40 +108,6 @@ def infer_types_of_temporaries(kernel): # }}} -# {{{ transform ilp into lower-level constructs - -def realize_ilp(kernel): - from loopy.kernel import ( - - UnrolledIlpTag, UnrollTag, LoopedIlpTag) - ILP_TO_BASE_TAG = { - UnrolledIlpTag: UnrollTag, - LoopedIlpTag: None, - } - - lpi = kernel.lowest_priority_inames[:] - breakable_inames = kernel.breakable_inames.copy() - - new_iname_to_tag = kernel.iname_to_tag.copy() - for iname in kernel.all_inames(): - tag = kernel.iname_to_tag.get(iname) - if type(tag) in ILP_TO_BASE_TAG: - new_tag_cls = ILP_TO_BASE_TAG[type(tag)] - if new_tag_cls is None: - new_iname_to_tag[iname] = None - else: - new_iname_to_tag[iname] = new_tag_cls() - - lpi.append(iname) - breakable_inames.add(iname) - - return kernel.copy( - iname_to_tag=new_iname_to_tag, - lowest_priority_inames=lpi, - breakable_inames=breakable_inames) - -# }}} - # {{{ decide which temporaries are local def mark_local_temporaries(kernel): @@ -818,13 +784,6 @@ def preprocess_kernel(kernel): kernel = realize_reduction(kernel) - # Ordering restriction: - # Must realize reductions before realizing ILP, because realize_ilp() - # gets rid of ILP tags, but realize_reduction() needs them to do - # reduction variable duplication. - - kernel = realize_ilp(kernel) - kernel = mark_local_temporaries(kernel) kernel = assign_automatic_axes(kernel) kernel = add_boostability_and_automatic_dependencies(kernel) diff --git a/loopy/schedule.py b/loopy/schedule.py index 085b269455caf7a87f4b5893e3ad20c8224741d1..14d3756eb8c3c00a962617e4fabcdf3d2d06ff51 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -154,6 +154,39 @@ def find_used_inames_within(kernel, sched_index): return result + + + +def loop_nest_map(kernel): + """Returns a dictionary mapping inames to other inames that are + always nested around them. + """ + result = {} + + all_inames = kernel.all_inames() + + iname_to_insns = kernel.iname_to_insns() + + # examine pairs of all inames--O(n**2), I know. + from loopy.kernel import IlpBaseTag + for inner_iname in all_inames: + result[inner_iname] = set() + for outer_iname in kernel.all_inames(): + tag = kernel.iname_to_tag.get(outer_iname) + if isinstance(tag, IlpBaseTag): + # ILP tags are special because they are parallel tags + # and therefore 'in principle' nest around everything. + # But they're realized by the scheduler as a loop + # (and the scheduler is the only custom + # at the innermost level, so we'll cut them some + # slack here. + continue + + if iname_to_insns[inner_iname] < iname_to_insns[outer_iname]: + result[inner_iname].add(outer_iname) + + return result + # }}} # {{{ debug help @@ -236,11 +269,15 @@ class ScheduleDebugger: # {{{ scheduling algorithm -def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], +class SchedulerState(Record): + pass + +def generate_loop_schedules_internal(sched_state, loop_priority, schedule=[], allow_boost=False, allow_insn=False, debug=None): # allow_insn is set to False initially and after entering each loop # to give loops containing high-priority instructions a chance. + kernel = sched_state.kernel all_insn_ids = set(insn.id for insn in kernel.instructions) scheduled_insn_ids = set(sched_item.insn_id for sched_item in schedule @@ -271,10 +308,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], last_entered_loop = None active_inames_set = set(active_inames) - from loopy.kernel import ParallelTag - parallel_inames = set( - iname for iname in kernel.all_inames() - if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) # }}} @@ -325,8 +358,8 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], insn.id, ",".join(set(insn.insn_deps) - scheduled_insn_ids)) continue - want = kernel.insn_inames(insn) - parallel_inames - have = active_inames_set - parallel_inames + want = kernel.insn_inames(insn) - sched_state.parallel_inames + have = active_inames_set - sched_state.parallel_inames # If insn is boostable, it may be placed inside a more deeply # nested loop without harm. @@ -365,7 +398,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], # made. for sub_sched in generate_loop_schedules_internal( - kernel, loop_priority, schedule, + sched_state, loop_priority, schedule, allow_boost=rec_allow_boost, debug=debug, allow_insn=True): yield sub_sched @@ -379,7 +412,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], if last_entered_loop is not None: can_leave = True - if last_entered_loop not in kernel.breakable_inames: + if last_entered_loop not in sched_state.breakable_inames: # If the iname is not breakable, then check that we've # scheduled all the instructions that require it. @@ -418,7 +451,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], schedule = schedule + [LeaveLoop(iname=last_entered_loop)] for sub_sched in generate_loop_schedules_internal( - kernel, loop_priority, schedule, + sched_state, loop_priority, schedule, allow_boost=rec_allow_boost, debug=debug, allow_insn=allow_insn): yield sub_sched @@ -436,7 +469,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], needed_inames = (needed_inames # There's no notion of 'entering' a parallel loop - - parallel_inames + - sched_state.parallel_inames # Don't reenter a loop we're already in. - active_inames_set) @@ -456,11 +489,11 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], # {{{ check if scheduling this iname now is allowed/plausible - currently_accessible_inames = active_inames_set | parallel_inames - if not kernel.loop_nest_map()[iname] <= currently_accessible_inames: + currently_accessible_inames = active_inames_set | sched_state.parallel_inames + if not sched_state.loop_nest_map[iname] <= currently_accessible_inames: if debug_mode: print "scheduling %s prohibited by loop nest map" % iname - print kernel.loop_nest_map() + print sched_state.loop_nest_map continue iname_home_domain = kernel.domains[kernel.get_home_domain_index(iname)] @@ -468,7 +501,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], iname_home_domain_params = set(iname_home_domain.get_var_names(dim_type.param)) # The previous check should have ensured this is true, because - # Kernel.loop_nest_map takes the domain dependency graph into + # the loop_nest_map takes the domain dependency graph into # consideration. assert (iname_home_domain_params & kernel.all_inames() <= currently_accessible_inames) @@ -522,7 +555,6 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], # loops in the second are not even tried (and so on). loop_priority_set = set(loop_priority) - lowest_priority_set = set(kernel.lowest_priority_inames) useful_loops_set = set(iname_to_usefulness.iterkeys()) useful_and_desired = useful_loops_set & loop_priority_set @@ -535,13 +567,13 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], priority_tiers.append( useful_loops_set - loop_priority_set - - lowest_priority_set) + - sched_state.lowest_priority_inames) else: - priority_tiers = [useful_loops_set - lowest_priority_set] + priority_tiers = [useful_loops_set - sched_state.lowest_priority_inames] priority_tiers.extend([ [iname] - for iname in kernel.lowest_priority_inames + for iname in sched_state.lowest_priority_inames if iname in useful_loops_set ]) @@ -559,7 +591,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], new_schedule = schedule + [EnterLoop(iname=iname)] for sub_sched in generate_loop_schedules_internal( - kernel, loop_priority, new_schedule, + sched_state, loop_priority, new_schedule, allow_boost=rec_allow_boost, debug=debug): found_viable_schedule = True @@ -584,7 +616,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], if not allow_insn: # try again with boosting allowed for sub_sched in generate_loop_schedules_internal( - kernel, loop_priority, schedule=schedule, + sched_state, loop_priority, schedule=schedule, allow_boost=allow_boost, debug=debug, allow_insn=True): yield sub_sched @@ -592,7 +624,7 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], if not allow_boost and allow_boost is not None: # try again with boosting allowed for sub_sched in generate_loop_schedules_internal( - kernel, loop_priority, schedule=schedule, + sched_state, loop_priority, schedule=schedule, allow_boost=True, debug=debug, allow_insn=allow_insn): yield sub_sched @@ -729,10 +761,27 @@ def generate_loop_schedules(kernel, loop_priority=[], debug_args={}): debug = ScheduleDebugger(**debug_args) + from loopy.kernel import IlpBaseTag, ParallelTag + ilp_inames = set( + iname + for iname in kernel.all_inames() + if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag)) + parallel_inames = set( + iname for iname in kernel.all_inames() + if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + + sched_state = SchedulerState( + kernel=kernel, + loop_nest_map=loop_nest_map(kernel), + breakable_inames=ilp_inames, + lowest_priority_inames=ilp_inames, + # ILP is not parallel for the purposes of the scheduler + parallel_inames=parallel_inames - ilp_inames) + generators = [ - generate_loop_schedules_internal(kernel, loop_priority, + generate_loop_schedules_internal(sched_state, loop_priority, debug=debug, allow_boost=None), - generate_loop_schedules_internal(kernel, loop_priority, + generate_loop_schedules_internal(sched_state, loop_priority, debug=debug)] for gen in generators: for gen_sched in gen: @@ -784,8 +833,4 @@ def generate_loop_schedules(kernel, loop_priority=[], debug_args={}): # }}} - - - - # vim: foldmethod=marker diff --git a/test/test_linalg.py b/test/test_linalg.py index da16cdbda200945da3c5cc4aabab02969c18a112..aee835112134b09efc984a269cd74a021eeec0da 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -532,7 +532,7 @@ def test_image_matrix_mul_ilp(ctx_factory): ctx = ctx_factory() order = "C" - n = get_suitable_size(ctx) + n = 9 #get_suitable_size(ctx) knl = lp.make_kernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, @@ -569,6 +569,38 @@ def test_image_matrix_mul_ilp(ctx_factory): +def test_ilp_race_matmul(ctx_factory): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + + n = 9 + + knl = lp.make_kernel(ctx.devices[0], + "{[i,j,k]: 0<=i,j,k<%d}" % n, + [ + "c[i, j] = sum(k, a[i, k]*b[k, j])" + ], + [ + lp.ImageArg("a", dtype, shape=(n, n)), + lp.ImageArg("b", dtype, shape=(n, n)), + lp.GlobalArg("c", dtype, shape=(n, n), order=order), + ], + name="matmul") + + knl = lp.split_dimension(knl, "j", 2, outer_tag="ilp", inner_tag="l.0") + knl = lp.split_dimension(knl, "k", 2) + knl = lp.add_prefetch(knl, 'b', ["k_inner"]) + + from loopy.check import WriteRaceConditionError + import pytest + with pytest.raises(WriteRaceConditionError): + list(lp.generate_loop_schedules(knl)) + + + + + def test_fancy_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() diff --git a/test/test_loopy.py b/test/test_loopy.py index 2dd8cb42e0c10c92230d98033a4159c53199e039..fdcd58af1361cda34d4f7a55bc87f6119dd885e2 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -558,6 +558,69 @@ def test_equality_constraints(ctx_factory): +# {{{ test race detection + +def test_ilp_write_race_detection_global(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel(ctx.devices[0], [ + "[n] -> {[i,j]: 0<=i,j a[i] = 5+i+j", + ], + []) + + from loopy.check import WriteRaceConditionError + import pytest + with pytest.raises(WriteRaceConditionError): + list(lp.generate_loop_schedules(knl)) + + + + +def test_ilp_write_race_detection_private(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel(ctx.devices[0], + "{[j]: 0<=j<16 }", + [ + "[j:ilp] <> a = 5+j", + ], + []) + + from loopy.check import WriteRaceConditionError + import pytest + with pytest.raises(WriteRaceConditionError): + list(lp.generate_loop_schedules(knl)) + +# }}} + + + + if __name__ == "__main__": import sys if len(sys.argv) > 1: @@ -565,3 +628,5 @@ if __name__ == "__main__": else: from py.test.cmdline import main main([__file__]) + +# vim: foldmethod=marker