diff --git a/loopy/check.py b/loopy/check.py index cc87ad9872668bf5323aefd79944e3bbd71b1153..0d2bbff7cf8d6f9e63a33dc2c8814f29afae70f0 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -220,12 +220,12 @@ def check_for_write_races(kernel): assignee_inames = assignee_indices & kernel.all_inames() if not assignee_inames <= kernel.insn_inames(insn): raise LoopyError( - "assignee of instructiosn '%s' references " + "assignee of instructions '%s' references " "iname that the instruction does not depend on" % insn.id) if assignee_name in kernel.arg_dict: - # Any parallel tags that are not depended upon by the assignee + # Any concurrent tags that are not depended upon by the assignee # will cause write races. raceable_parallel_insn_inames = set( diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index c946e09a086e574a2593d60f652a81773d95a1fe..b736191ec1dadb842e12453fbec3b68e831338f6 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -59,6 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): from loopy.schedule import ( find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, + VectorizeTag, IlpBaseTag) result = find_active_inames_at(kernel, sched_index) @@ -67,7 +68,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): # Find our containing subkernel. Grab inames for all insns from there. within_subkernel = False - for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index+1]): + for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]): from loopy.schedule import CallKernel, ReturnFromKernel if isinstance(sched_item, CallKernel): within_subkernel = True @@ -92,11 +93,12 @@ def get_usable_inames_for_conditional(kernel, sched_index): # # - local indices may not be used in conditionals that cross barriers. # - # - ILP indices are not available in loop bounds, they only get defined - # at the innermost level of nesting. + # - ILP indices and vector lane indices are not available in loop + # bounds, they only get defined at the innermost level of nesting. if ( kernel.iname_tags_of_type(iname, ConcurrentTag) + and not kernel.iname_tags_of_type(iname, VectorizeTag) and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) and crosses_barrier) and not kernel.iname_tags_of_type(iname, IlpBaseTag) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e1520a82ed69fa2aed729d9b1d849a78d658c4e1..e9de52eb68bd47aec09b0a19de0a5d5433aa9843 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -24,7 +24,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl from loopy.schedule import ( @@ -33,30 +32,6 @@ from loopy.schedule import ( from loopy.diagnostic import LoopyError -def get_admissible_conditional_inames_for(codegen_state, sched_index): - """This function disallows conditionals on local-idx tagged - inames if there is a barrier nested somewhere within. - """ - - kernel = codegen_state.kernel - - from loopy.kernel.data import (LocalIndexTag, HardwareConcurrentTag, - filter_iname_tags_by_type) - - from loopy.schedule import find_active_inames_at, has_barrier_within - result = find_active_inames_at(kernel, sched_index) - - has_barrier = has_barrier_within(kernel, sched_index) - - for iname, tags in six.iteritems(kernel.iname_to_tags): - if (filter_iname_tags_by_type(tags, HardwareConcurrentTag) - and codegen_state.is_generating_device_code): - if not has_barrier or not filter_iname_tags_by_type(tags, LocalIndexTag): - result.add(iname) - - return frozenset(result) - - def synthesize_idis_for_extra_args(kernel, schedule_index): """ :returns: A list of :class:`loopy.codegen.ImplementedDataInfo` @@ -302,11 +277,13 @@ def build_loop_nest(codegen_state, schedule_index): """ from loopy.schedule import find_used_inames_within + from loopy.codegen.bounds import get_usable_inames_for_conditional + sched_index_info_entries = [ ScheduleIndexInfo( schedule_indices=[i], admissible_cond_inames=( - get_admissible_conditional_inames_for(codegen_state, i)), + get_usable_inames_for_conditional(kernel, i)), required_predicates=get_required_predicates(kernel, i), used_inames_within=find_used_inames_within(kernel, i) ) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 128e4fbc85a2a03e25da3f88b200e67eb41756d3..b3a87798840bb1624d350c79830f29142e54ab6c 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -231,7 +231,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, kernel = codegen_state.kernel from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag, - LocalIndexTag, GroupIndexTag) + LocalIndexTag, GroupIndexTag, VectorizeTag) from loopy.schedule import get_insn_ids_for_block_at insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) @@ -242,7 +242,8 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, all_inames_by_insns |= kernel.insn_inames(insn_id) hw_inames_left = [iname for iname in all_inames_by_insns - if kernel.iname_tags_of_type(iname, HardwareConcurrentTag)] + if kernel.iname_tags_of_type(iname, HardwareConcurrentTag) + and not kernel.iname_tags_of_type(iname, VectorizeTag)] if not hw_inames_left: return next_func(codegen_state) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 975d7b3efe4bcc419a7ca004e1df3b0fbd39d5d9..9e6e8db666bab61e85981dc697c2d40cea6a18a6 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -186,7 +186,7 @@ class LoopedIlpTag(IlpBaseTag): # }}} -class VectorizeTag(UniqueTag): +class VectorizeTag(UniqueTag, HardwareConcurrentTag): def __str__(self): return "vec" diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c0eb91ea60317ef8cad1c594571d46bba2d1a671..23c4b7fbd9e55006dd17ed9b127e598a14ee17a2 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -289,7 +289,7 @@ def _classify_reduction_inames(kernel, inames): nonlocal_par = [] from loopy.kernel.data import ( - LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, + LocalIndexTagBase, UnrolledIlpTag, UnrollTag, ConcurrentTag, filter_iname_tags_by_type) for iname in inames: @@ -303,7 +303,7 @@ def _classify_reduction_inames(kernel, inames): elif filter_iname_tags_by_type(iname_tags, LocalIndexTagBase): local_par.append(iname) - elif filter_iname_tags_by_type(iname_tags, (ConcurrentTag, VectorizeTag)): + elif filter_iname_tags_by_type(iname_tags, ConcurrentTag): nonlocal_par.append(iname) else: diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index fb0d0e2c17005ecf051d7034fd7903ed5262bdfc..f145c7122b9fd6e9e516d0becf3d4461fc0cce8c 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -212,12 +212,12 @@ def find_loop_nest_with_map(kernel): """ result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, IlpBaseTag all_nonpar_inames = set( iname for iname in kernel.all_inames() if not kernel.iname_tags_of_type(iname, - (ConcurrentTag, IlpBaseTag, VectorizeTag))) + (ConcurrentTag, IlpBaseTag))) iname_to_insns = kernel.iname_to_insns() @@ -276,7 +276,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, IlpBaseTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): if kernel.iname_tags_of_type(iname, ConcurrentTag): @@ -310,7 +310,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): continue if kernel.iname_tags_of_type(dep_insn_iname, - (ConcurrentTag, IlpBaseTag, VectorizeTag)): + (ConcurrentTag, IlpBaseTag)): # Parallel tags don't really nest, so we'll disregard # them here. continue