From 3bacc7a69760b7ed5ee90b4c19beede5e9a19a7b Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 27 Jul 2019 15:11:46 -0500 Subject: [PATCH 1/6] first pass at removing special ILP handling from schedule creation --- loopy/schedule/__init__.py | 61 ++++++++++++++------------------------ 1 file changed, 22 insertions(+), 39 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 378a1c0bf..6f72e6334 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -212,12 +212,12 @@ def find_loop_nest_with_map(kernel): """ result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, VectorizeTag all_nonpar_inames = set( iname for iname in kernel.all_inames() if not kernel.iname_tags_of_type(iname, - (ConcurrentTag, IlpBaseTag, VectorizeTag))) + (ConcurrentTag, VectorizeTag))) # TODO redundant? iname_to_insns = kernel.iname_to_insns() @@ -239,20 +239,17 @@ def find_loop_nest_around_map(kernel): iname_to_insns = kernel.iname_to_insns() + from loopy.kernel.data import ConcurrentTag # examine pairs of all inames--O(n**2), I know. - from loopy.kernel.data import IlpBaseTag for inner_iname in all_inames: + if kernel.iname_tags_of_type(inner_iname, ConcurrentTag): + continue # TODO should this come after result update on next line? result[inner_iname] = set() for outer_iname in all_inames: if inner_iname == outer_iname: continue - if kernel.iname_tags_of_type(outer_iname, IlpBaseTag): - # ILP tags are special because they are parallel tags - # and therefore 'in principle' nest around everything. - # But they're realized by the scheduler as a loop - # at the innermost level, so we'll cut them some - # slack here. + if kernel.iname_tags_of_type(outer_iname, ConcurrentTag): continue if iname_to_insns[inner_iname] < iname_to_insns[outer_iname]: @@ -276,7 +273,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag, VectorizeTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): if kernel.iname_tags_of_type(iname, ConcurrentTag): @@ -310,7 +307,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): continue if kernel.iname_tags_of_type(dep_insn_iname, - (ConcurrentTag, IlpBaseTag, VectorizeTag)): + (ConcurrentTag, VectorizeTag)): # Parallel tags don't really nest, so we'll disregard # them here. continue @@ -573,16 +570,12 @@ class SchedulerState(ImmutableRecord): See :func:`loop_nest_around_map`. - .. attribute:: breakable_inames - - .. attribute:: ilp_inames - .. attribute:: vec_inames .. attribute:: parallel_inames - *Note:* ``ilp`` and ``vec`` are not 'parallel' for the purposes of the - scheduler. See :attr:`ilp_inames`, :attr:`vec_inames`. + *Note:* ``vec`` are not 'parallel' for the purposes of the + scheduler. See :attr:`vec_inames`. .. rubric:: Time-varying scheduler state @@ -974,9 +967,11 @@ def generate_loop_schedules_internal( print("cannot leave '%s' because of preschedule constraints" % last_entered_loop) can_leave = False - elif last_entered_loop not in sched_state.breakable_inames: - # If the iname is not breakable, then check that we've - # scheduled all the instructions that require it. + #elif last_entered_loop not in sched_state.breakable_inames: + # # If the iname is not breakable, then check that we've + # # scheduled all the instructions that require it. + else: + # Check that we've scheduled all the instructions that require the iname. for insn_id in sched_state.unscheduled_insn_ids: insn = kernel.id_to_insn[insn_id] @@ -1199,7 +1194,6 @@ def generate_loop_schedules_internal( if useful_and_desired: wanted = ( useful_and_desired - - sched_state.ilp_inames - sched_state.vec_inames ) priority_tiers = [t for t in @@ -1215,23 +1209,15 @@ def generate_loop_schedules_internal( priority_tiers.append( useful_loops_set - loop_priority_set - - sched_state.ilp_inames - sched_state.vec_inames ) else: priority_tiers = [ useful_loops_set - - sched_state.ilp_inames - sched_state.vec_inames ] # vectorization must be the absolute innermost loop - priority_tiers.extend([ - [iname] - for iname in sched_state.ilp_inames - if iname in useful_loops_set - ]) - priority_tiers.extend([ [iname] for iname in sched_state.vec_inames @@ -1878,12 +1864,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): for item in preschedule for insn_id in sched_item_to_insn_id(item)) - from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag, + from loopy.kernel.data import (ConcurrentTag, VectorizeTag, filter_iname_tags_by_type) - ilp_inames = set( - iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, IlpBaseTag)) vec_inames = set( iname for iname, tags in six.iteritems(kernel.iname_to_tags) @@ -1902,9 +1884,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): kernel, loop_nest_with_map=loop_nest_with_map, loop_nest_around_map=loop_nest_around_map), - breakable_inames=ilp_inames, - ilp_inames=ilp_inames, - vec_inames=vec_inames, + vec_inames=vec_inames, # TODO go away prescheduled_inames=prescheduled_inames, prescheduled_insn_ids=prescheduled_insn_ids, @@ -1924,8 +1904,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}): preschedule=preschedule, insn_ids_to_try=None, - # ilp and vec are not parallel for the purposes of the scheduler - parallel_inames=parallel_inames - ilp_inames - vec_inames, + # vec is not parallel for the purposes of the scheduler + # TODO don't do this + # concurrent vs nonconcurrent is only distinction that should need + # TODO rename parallel->concurrent + parallel_inames=parallel_inames - vec_inames, group_insn_counts=group_insn_counts(kernel), active_group_counts={}, -- GitLab From 82d352f600aa20491a313fbfae62ffaef504f190 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 27 Jul 2019 17:22:43 -0500 Subject: [PATCH 2/6] making VectorizeTag inherit from ConcurrentTag --- loopy/kernel/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 975d7b3ef..f9e4a096d 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -186,7 +186,7 @@ class LoopedIlpTag(IlpBaseTag): # }}} -class VectorizeTag(UniqueTag): +class VectorizeTag(UniqueTag, ConcurrentTag): def __str__(self): return "vec" -- GitLab From 3163f6ba9ba7bfe00c9ad056699c2b81d7d135de Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 27 Jul 2019 17:34:11 -0500 Subject: [PATCH 3/6] removing special handling for inames with vectorize tag --- loopy/schedule/__init__.py | 52 +++++++------------------------------- 1 file changed, 9 insertions(+), 43 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index a735d4786..3519757fa 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -212,12 +212,11 @@ def find_loop_nest_with_map(kernel): """ result = {} - from loopy.kernel.data import ConcurrentTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag all_nonpar_inames = set( iname for iname in kernel.all_inames() - if not kernel.iname_tags_of_type(iname, - (ConcurrentTag, VectorizeTag))) # TODO redundant? + if not kernel.iname_tags_of_type(iname, ConcurrentTag)) iname_to_insns = kernel.iname_to_insns() @@ -273,7 +272,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): result = {} - from loopy.kernel.data import ConcurrentTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): if kernel.iname_tags_of_type(iname, ConcurrentTag): @@ -306,8 +305,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): # -> safe. continue - if kernel.iname_tags_of_type(dep_insn_iname, - (ConcurrentTag, VectorizeTag)): + if kernel.iname_tags_of_type(dep_insn_iname, ConcurrentTag): # Parallel tags don't really nest, so we'll disregard # them here. continue @@ -570,13 +568,8 @@ class SchedulerState(ImmutableRecord): See :func:`loop_nest_around_map`. - .. attribute:: vec_inames - .. attribute:: parallel_inames - *Note:* ``vec`` are not 'parallel' for the purposes of the - scheduler. See :attr:`vec_inames`. - .. rubric:: Time-varying scheduler state .. attribute:: active_inames @@ -1192,12 +1185,8 @@ def generate_loop_schedules_internal( useful_and_desired = useful_loops_set & loop_priority_set if useful_and_desired: - wanted = ( - useful_and_desired - - sched_state.vec_inames - ) priority_tiers = [t for t in - get_priority_tiers(wanted, + get_priority_tiers(useful_and_desired, sched_state.kernel.loop_priority ) ] @@ -1206,23 +1195,9 @@ def generate_loop_schedules_internal( # have been contradictary. loop_priority_set = set().union(*[set(t) for t in priority_tiers]) - priority_tiers.append( - useful_loops_set - - loop_priority_set - - sched_state.vec_inames - ) + priority_tiers.append(useful_loops_set - loop_priority_set) else: - priority_tiers = [ - useful_loops_set - - sched_state.vec_inames - ] - - # vectorization must be the absolute innermost loop - priority_tiers.extend([ - [iname] - for iname in sched_state.vec_inames - if iname in useful_loops_set - ]) + priority_tiers = [useful_loops_set] # }}} @@ -1850,12 +1825,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): for item in preschedule for insn_id in sched_item_to_insn_id(item)) - from loopy.kernel.data import (ConcurrentTag, VectorizeTag, + from loopy.kernel.data import (ConcurrentTag, filter_iname_tags_by_type) - vec_inames = set( - iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, VectorizeTag)) parallel_inames = set( iname for iname, tags in six.iteritems(kernel.iname_to_tags) @@ -1870,8 +1841,6 @@ def generate_loop_schedules_inner(kernel, debug_args={}): kernel, loop_nest_with_map=loop_nest_with_map, loop_nest_around_map=loop_nest_around_map), - vec_inames=vec_inames, # TODO go away - prescheduled_inames=prescheduled_inames, prescheduled_insn_ids=prescheduled_insn_ids, @@ -1890,11 +1859,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): preschedule=preschedule, insn_ids_to_try=None, - # vec is not parallel for the purposes of the scheduler - # TODO don't do this - # concurrent vs nonconcurrent is only distinction that should need # TODO rename parallel->concurrent - parallel_inames=parallel_inames - vec_inames, + parallel_inames=parallel_inames, group_insn_counts=group_insn_counts(kernel), active_group_counts={}, -- GitLab From d0dbe7f456c0e0da2b91c0211c4b0b4bf1d55011 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sat, 27 Jul 2019 17:39:11 -0500 Subject: [PATCH 4/6] renamed SchedulerState.parallel_inames->concurrent_inames --- loopy/schedule/__init__.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 3519757fa..6a74702ae 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -568,7 +568,7 @@ class SchedulerState(ImmutableRecord): See :func:`loop_nest_around_map`. - .. attribute:: parallel_inames + .. attribute:: concurrent_inames .. rubric:: Time-varying scheduler state @@ -789,8 +789,8 @@ def generate_loop_schedules_internal( pass continue - want = kernel.insn_inames(insn) - sched_state.parallel_inames - have = active_inames_set - sched_state.parallel_inames + want = kernel.insn_inames(insn) - sched_state.concurrent_inames + have = active_inames_set - sched_state.concurrent_inames # If insn is boostable, it may be placed inside a more deeply # nested loop without harm. @@ -979,7 +979,7 @@ def generate_loop_schedules_internal( sched_state.scheduled_insn_ids): subdep = kernel.id_to_insn[insn_id] want = (kernel.insn_inames(subdep_id) - - sched_state.parallel_inames) + - sched_state.concurrent_inames) if ( last_entered_loop not in want and last_entered_loop not in subdep.boostable_into): @@ -1057,8 +1057,8 @@ def generate_loop_schedules_internal( needed_inames.update(kernel.insn_inames(insn_id)) needed_inames = (needed_inames - # There's no notion of 'entering' a parallel loop - - sched_state.parallel_inames + # There's no notion of 'entering' a concurrent loop + - sched_state.concurrent_inames # Don't reenter a loop we're already in. - active_inames_set) @@ -1092,7 +1092,7 @@ def generate_loop_schedules_internal( continue currently_accessible_inames = ( - active_inames_set | sched_state.parallel_inames) + active_inames_set | sched_state.concurrent_inames) if ( not sched_state.loop_nest_around_map[iname] <= currently_accessible_inames): @@ -1827,7 +1827,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): from loopy.kernel.data import (ConcurrentTag, filter_iname_tags_by_type) - parallel_inames = set( + concurrent_inames = set( iname for iname, tags in six.iteritems(kernel.iname_to_tags) if filter_iname_tags_by_type(tags, ConcurrentTag)) @@ -1859,8 +1859,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): preschedule=preschedule, insn_ids_to_try=None, - # TODO rename parallel->concurrent - parallel_inames=parallel_inames, + concurrent_inames=concurrent_inames, group_insn_counts=group_insn_counts(kernel), active_group_counts={}, -- GitLab From cce4c56a640f876d73aa1242659d005aa9bc6dfb Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 5 Aug 2019 16:24:08 -0500 Subject: [PATCH 5/6] Generate ilp/vec inames in the absence of schedule entries for them --- loopy/codegen/__init__.py | 36 ++++++++++++++++ loopy/codegen/control.py | 87 ++++++++++++++++++++++++++++++++------- loopy/codegen/loop.py | 59 +++++++++++--------------- loopy/tools.py | 43 +++++++++++++++++++ test/test_loopy.py | 19 +++++++++ test/test_misc.py | 31 ++++++++++++++ 6 files changed, 225 insertions(+), 50 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1b..d6d146385 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -146,6 +146,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState(object): """ .. attribute:: kernel + .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -187,12 +188,19 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. attribute:: priority_aware_iname_order_embedding + + A :class:`dict` mapping inames to a totally ordered set, which + guarantees that *embedding[iname1] < embedding[iname2]* + if *iname1* should be nested outside *iname2* according to loop priorities """ def __init__(self, kernel, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + priority_aware_iname_order_embedding, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, @@ -211,6 +219,8 @@ class CodeGenerationState(object): self.is_generating_device_code = is_generating_device_code self.gen_program_name = gen_program_name self.schedule_index_end = schedule_index_end + self.priority_aware_iname_order_embedding = \ + priority_aware_iname_order_embedding # {{{ copy helpers @@ -253,6 +263,8 @@ class CodeGenerationState(object): seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + priority_aware_iname_order_embedding=( + self.priority_aware_iname_order_embedding), vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -454,6 +466,28 @@ def generate_code_v2(kernel): seen_atomic_dtypes = set() initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) + + # {{{ build priority_aware_iname_order_embedding + + from loopy.tools import compute_topological_order, CycleError + + priority_graph = dict((iname, []) for iname in sorted(kernel.all_inames())) + for priority_tuple in kernel.loop_priority: + for iname, inner_iname in zip(priority_tuple, priority_tuple[1:]): + priority_graph[iname].append(inner_iname) + + try: + priorities_sorted = compute_topological_order(priority_graph) + except CycleError: + # Input loop priorities may contain a cycle, so don't fail if we + # encounter one. + priorities_sorted = sorted(kernel.all_inames()) + + priority_aware_iname_order_embedding = dict( + (iname, i) for (i, iname) in enumerate(priorities_sorted)) + + # }}} + codegen_state = CodeGenerationState( kernel=kernel, implemented_data_info=implemented_data_info, @@ -464,6 +498,8 @@ def generate_code_v2(kernel): seen_atomic_dtypes=seen_atomic_dtypes, var_subst_map={}, allow_complex=allow_complex, + priority_aware_iname_order_embedding=( + priority_aware_iname_order_embedding), var_name_generator=kernel.get_var_name_generator(), is_generating_device_code=False, gen_program_name=( diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e1520a82e..90ca5719d 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -130,28 +130,37 @@ def generate_code_for_sched_index(codegen_state, sched_index): elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) tags = tuple(tag for tag in tags if tag) + iname = sched_item.iname from loopy.codegen.loop import ( generate_unroll_loop, - generate_vectorize_loop, generate_sequential_loop_dim_code) - from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, - ForceSequentialTag, LoopedIlpTag, VectorizeTag, - InOrderSequentialSequentialTag, filter_iname_tags_by_type) - if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)): - func = generate_unroll_loop - elif filter_iname_tags_by_type(tags, VectorizeTag): - func = generate_vectorize_loop - elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag, + from loopy.kernel.data import (UnrollTag, + ForceSequentialTag, + InOrderSequentialSequentialTag, + filter_iname_tags_by_type) + + from functools import partial + if filter_iname_tags_by_type(tags, (UnrollTag,)): + func = partial(generate_unroll_loop, codegen_state, iname) + elif not tags or filter_iname_tags_by_type(tags, ( ForceSequentialTag, InOrderSequentialSequentialTag)): - func = generate_sequential_loop_dim_code + from loopy.codegen.bounds import get_usable_inames_for_conditional + usable_inames_for_conditional = ( + get_usable_inames_for_conditional(kernel, sched_index)) + func = partial( + generate_sequential_loop_dim_code, + codegen_state, iname, usable_inames_for_conditional) else: raise RuntimeError("encountered (invalid) EnterLoop " "for '%s', tagged '%s'" % (sched_item.iname, ", ".join(str(tag) for tag in tags))) - return func(codegen_state, sched_index) + def inner_codegen(state): + return build_loop_nest(state, 1 + sched_index) + + return func(inner_codegen) elif isinstance(sched_item, Barrier): # {{{ emit barrier code @@ -190,10 +199,58 @@ def generate_code_for_sched_index(codegen_state, sched_index): elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] - from loopy.codegen.instruction import generate_instruction_code - return codegen_state.try_vectorized( - "instruction %s" % insn.id, - lambda inner_cgs: generate_instruction_code(inner_cgs, insn)) + from loopy.kernel.data import (IlpBaseTag, + VectorizeTag, UnrolledIlpTag, LoopedIlpTag, + filter_iname_tags_by_type) + + from loopy.codegen.loop import ( + generate_unroll_loop, + generate_sequential_loop_dim_code, + generate_vectorize_loop) + + all_inames_to_codegen = tuple(sorted( + ( + iname for iname in insn.within_inames + if filter_iname_tags_by_type( + kernel.iname_to_tags.get(iname, ()), + (IlpBaseTag, VectorizeTag))), + key=( + lambda iname: + codegen_state.priority_aware_iname_order_embedding[iname]))) + + # Parallel inames that do not have a hardware axis (ilp, vec) are + # handled here. This function (indirectly) calls itself once per loop to + # be generated. + def inner_codegen(inames_to_codegen, state): + if inames_to_codegen: + iname = inames_to_codegen[0] + else: + # Base case: no inames left + from loopy.codegen.instruction import generate_instruction_code + return state.try_vectorized( + "instruction %s" % insn.id, + lambda inner_state: ( + generate_instruction_code(inner_state, insn))) + + tags = kernel.iname_to_tags[iname] + from functools import partial + if filter_iname_tags_by_type(tags, UnrolledIlpTag): + func = partial(generate_unroll_loop, state, iname) + elif filter_iname_tags_by_type(tags, LoopedIlpTag): + from loopy.codegen.bounds import get_usable_inames_for_conditional + usable_inames_for_conditional = ( + get_usable_inames_for_conditional(kernel, sched_index)) + func = partial( + generate_sequential_loop_dim_code, + state, iname, usable_inames_for_conditional) + elif filter_iname_tags_by_type(tags, VectorizeTag): + func = partial(generate_vectorize_loop, state, iname) + else: + raise ValueError("do not know how to generate code for '%s'", iname) + + return func(partial(inner_codegen, inames_to_codegen[1:])) + + return inner_codegen(all_inames_to_codegen, codegen_state) else: raise RuntimeError("unexpected schedule item type: %s" diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 58f055b7b..722338f8f 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -28,7 +28,6 @@ from loopy.diagnostic import warn, LoopyError from loopy.codegen.result import merge_codegen_results import islpy as isl from islpy import dim_type -from loopy.codegen.control import build_loop_nest from pymbolic.mapper.stringifier import PREC_NONE @@ -116,11 +115,9 @@ def get_slab_decomposition(kernel, iname): # {{{ unrolled loops -def generate_unroll_loop(codegen_state, sched_index): +def generate_unroll_loop(codegen_state, iname, inner_codegen): kernel = codegen_state.kernel - iname = kernel.schedule[sched_index].iname - bounds = kernel.get_iname_bounds(iname, constants_only=True) from loopy.isl_helpers import ( @@ -148,8 +145,7 @@ def generate_unroll_loop(codegen_state, sched_index): for i in range(length): idx_aff = lower_bound_aff + i new_codegen_state = codegen_state.fix(iname, idx_aff) - result.append( - build_loop_nest(new_codegen_state, sched_index+1)) + result.append(inner_codegen(new_codegen_state)) return merge_codegen_results(codegen_state, result) @@ -158,11 +154,8 @@ def generate_unroll_loop(codegen_state, sched_index): # {{{ vectorized loops -def generate_vectorize_loop(codegen_state, sched_index): +def generate_vectorize_loop(codegen_state, iname, inner_codegen): kernel = codegen_state.kernel - - iname = kernel.schedule[sched_index].iname - bounds = kernel.get_iname_bounds(iname, constants_only=True) from loopy.isl_helpers import ( @@ -175,7 +168,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_upper_not_const", "upper bound for vectorized loop '%s' is not a constant, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return generate_unroll_loop(codegen_state, iname, inner_codegen) length = int(pw_aff_to_expr(length_aff)) @@ -190,7 +183,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_lower_not_0", "lower bound for vectorized loop '%s' is not zero, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return generate_unroll_loop(codegen_state, iname, inner_codegen) # {{{ 'implement' vectorization bounds @@ -210,7 +203,7 @@ def generate_vectorize_loop(codegen_state, sched_index): length=length, space=length_aff.space)) - return build_loop_nest(new_codegen_state, sched_index+1) + return inner_codegen(new_codegen_state) # }}} @@ -341,20 +334,15 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, # {{{ sequential loop -def generate_sequential_loop_dim_code(codegen_state, sched_index): - kernel = codegen_state.kernel +def generate_sequential_loop_dim_code( + codegen_state, loop_iname, usable_inames_for_conditional, inner_codegen): + # Should not include loop_iname itself + assert loop_iname not in usable_inames_for_conditional + kernel = codegen_state.kernel ecm = codegen_state.expression_to_code_mapper - loop_iname = kernel.schedule[sched_index].iname - slabs = get_slab_decomposition(kernel, loop_iname) - - from loopy.codegen.bounds import get_usable_inames_for_conditional - - # Note: this does not include loop_iname itself! - usable_inames = get_usable_inames_for_conditional(kernel, sched_index) domain = kernel.get_inames_domain(loop_iname) - result = [] for slab_name, slab in slabs: @@ -377,7 +365,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): # move inames that are usable into parameters moved_inames = [] for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)): - if das_iname in usable_inames: + if das_iname in usable_inames_for_conditional: moved_inames.append(das_iname) dt, idx = dom_and_slab.get_var_dict()[das_iname] dom_and_slab = dom_and_slab.move_dims( @@ -387,11 +375,11 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname] impl_domain = isl.align_spaces( - codegen_state.implemented_domain, - dom_and_slab, - obj_bigger_ok=True, - across_dim_types=True - ).params() + codegen_state.implemented_domain, + dom_and_slab, + obj_bigger_ok=True, + across_dim_types=True + ).params() lbound = ( kernel.cache_manager.dim_min( @@ -399,12 +387,13 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): .gist(kernel.assumptions) .gist(impl_domain) .coalesce()) + ubound = ( - kernel.cache_manager.dim_max( - dom_and_slab, loop_iname_idx) - .gist(kernel.assumptions) - .gist(impl_domain) - .coalesce()) + kernel.cache_manager.dim_max( + dom_and_slab, loop_iname_idx) + .gist(kernel.assumptions) + .gist(impl_domain) + .coalesce()) # }}} @@ -435,7 +424,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): .copy(kernel=intersect_kernel_with_slab( kernel, slab, loop_iname))) - inner = build_loop_nest(new_codegen_state, sched_index+1) + inner = inner_codegen(new_codegen_state) # }}} diff --git a/loopy/tools.py b/loopy/tools.py index 0fc6d1bf9..2e24cb743 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -412,6 +412,49 @@ def compute_sccs(graph): # }}} +# {{{ compute topological order + +class CycleError(Exception): + """Raised when a topological ordering cannot be computed due to a cycle.""" + pass + + +def compute_topological_order(graph): + reverse_order = [] + visited = set() + visiting = set() + + for root in graph: + if root in visited: + continue + + stack = [(root, iter(graph[root]))] + visiting.add(root) + + while stack: + node, children = stack.pop() + + for child in children: + if child in visiting: + raise CycleError() + + if child in visited: + continue + + visiting.add(child) + stack.append((node, children)) + stack.append((child, iter(graph.get(child, ())))) + break + else: + visiting.remove(node) + visited.add(node) + reverse_order.append(node) + + return list(reversed(reverse_order)) + +# }}} + + # {{{ pickled container value class _PickledObject(object): diff --git a/test/test_loopy.py b/test/test_loopy.py index 119d57adf..67b6cb401 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2889,6 +2889,25 @@ def test_half_complex_conditional(ctx_factory): knl(queue) +def test_nested_ilp_iname_codegen(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i,j]: 0 <= i < 10 and 0 <= j < 10}", + """ + out[i,j] = i + j + """, + [ + lp.GlobalArg("out", dtype=np.float32, shape=lp.auto), + ], + ) + + knl = lp.tag_inames(knl, {"i": "ilp", "j": "ilp"}) + + knl(queue) + + def test_dep_cycle_printing_and_error(): # https://gitlab.tiker.net/inducer/loopy/issues/140 # This kernel has two dep cycles. diff --git a/test/test_misc.py b/test/test_misc.py index 7a834a6f5..7f867b149 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -79,6 +79,37 @@ def test_compute_sccs(): verify_sccs(graph, compute_sccs(graph)) +def test_compute_topological_order(): + from loopy.tools import compute_topological_order, CycleError + + empty = {} + assert compute_topological_order(empty) == [] + + disconnected = {1: [], 2: [], 3: []} + assert len(compute_topological_order(disconnected)) == 3 + + line = list(zip(range(10), ([i] for i in range(1, 11)))) + import random + random.seed(0) + random.shuffle(line) + expected = list(range(11)) + assert compute_topological_order(dict(line)) == expected + + claw = {1: [2, 3], 0: [1]} + assert compute_topological_order(claw)[:2] == [0, 1] + + repeated_edges = {1: [2, 2], 2: [0]} + assert compute_topological_order(repeated_edges) == [1, 2, 0] + + self_cycle = {1: [1]} + with pytest.raises(CycleError): + compute_topological_order(self_cycle) + + cycle = {0: [2], 1: [2], 2: [3], 3: [4, 1]} + with pytest.raises(CycleError): + compute_topological_order(cycle) + + def test_SetTrie(): from loopy.kernel.tools import SetTrie -- GitLab From cf40a7fa5f41873d3ad034b5809917bf485b67c7 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Tue, 6 Aug 2019 13:57:19 -0500 Subject: [PATCH 6/6] Some fixes for vectorization --- loopy/codegen/bounds.py | 6 +++--- loopy/codegen/instruction.py | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index c946e09a0..86eef0671 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -59,7 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): from loopy.schedule import ( find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, - IlpBaseTag) + IlpBaseTag, VectorizeTag) result = find_active_inames_at(kernel, sched_index) crosses_barrier = has_barrier_within(kernel, sched_index) @@ -92,14 +92,14 @@ def get_usable_inames_for_conditional(kernel, sched_index): # # - local indices may not be used in conditionals that cross barriers. # - # - ILP indices are not available in loop bounds, they only get defined + # - ilp/vec indices are not available in loop bounds, they only get defined # at the innermost level of nesting. if ( kernel.iname_tags_of_type(iname, ConcurrentTag) and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) and crosses_barrier) - and not kernel.iname_tags_of_type(iname, IlpBaseTag) + and not kernel.iname_tags_of_type(iname, (IlpBaseTag, VectorizeTag)) ): result.add(iname) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 5e0747246..2a5ed503c 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -52,6 +52,14 @@ def to_codegen_result( if chk_domain.is_empty(): return None + if codegen_state.vectorization_info is not None: + iname = codegen_state.vectorization_info.iname + from loopy.isl_helpers import obj_involves_variable + for basicset in chk_domain.get_basic_sets(): + for constr in basicset.get_constraints(): + if obj_involves_variable(constr, iname): + raise Unvectorizable("control flow depends on vector iname") + condition_exprs = [] if not chk_domain.plain_is_universe(): from loopy.symbolic import set_to_cond_expr -- GitLab