diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..d6d146385271323c63ed327a1560c8559dd1097a 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -146,6 +146,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState(object): """ .. attribute:: kernel + .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -187,12 +188,19 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. attribute:: priority_aware_iname_order_embedding + + A :class:`dict` mapping inames to a totally ordered set, which + guarantees that *embedding[iname1] < embedding[iname2]* + if *iname1* should be nested outside *iname2* according to loop priorities """ def __init__(self, kernel, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + priority_aware_iname_order_embedding, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, @@ -211,6 +219,8 @@ class CodeGenerationState(object): self.is_generating_device_code = is_generating_device_code self.gen_program_name = gen_program_name self.schedule_index_end = schedule_index_end + self.priority_aware_iname_order_embedding = \ + priority_aware_iname_order_embedding # {{{ copy helpers @@ -253,6 +263,8 @@ class CodeGenerationState(object): seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + priority_aware_iname_order_embedding=( + self.priority_aware_iname_order_embedding), vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -454,6 +466,28 @@ def generate_code_v2(kernel): seen_atomic_dtypes = set() initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) + + # {{{ build 
priority_aware_iname_order_embedding + + from loopy.tools import compute_topological_order, CycleError + + priority_graph = dict((iname, []) for iname in sorted(kernel.all_inames())) + for priority_tuple in kernel.loop_priority: + for iname, inner_iname in zip(priority_tuple, priority_tuple[1:]): + priority_graph[iname].append(inner_iname) + + try: + priorities_sorted = compute_topological_order(priority_graph) + except CycleError: + # Input loop priorities may contain a cycle, so don't fail if we + # encounter one. + priorities_sorted = sorted(kernel.all_inames()) + + priority_aware_iname_order_embedding = dict( + (iname, i) for (i, iname) in enumerate(priorities_sorted)) + + # }}} + codegen_state = CodeGenerationState( kernel=kernel, implemented_data_info=implemented_data_info, @@ -464,6 +498,8 @@ def generate_code_v2(kernel): seen_atomic_dtypes=seen_atomic_dtypes, var_subst_map={}, allow_complex=allow_complex, + priority_aware_iname_order_embedding=( + priority_aware_iname_order_embedding), var_name_generator=kernel.get_var_name_generator(), is_generating_device_code=False, gen_program_name=( diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index c946e09a086e574a2593d60f652a81773d95a1fe..86eef0671981fb285e24e0e959727d49395a82ad 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -59,7 +59,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): from loopy.schedule import ( find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, - IlpBaseTag) + IlpBaseTag, VectorizeTag) result = find_active_inames_at(kernel, sched_index) crosses_barrier = has_barrier_within(kernel, sched_index) @@ -92,14 +92,14 @@ def get_usable_inames_for_conditional(kernel, sched_index): # # - local indices may not be used in conditionals that cross barriers. 
# - # - ILP indices are not available in loop bounds, they only get defined + # - ilp/vec indices are not available in loop bounds, they only get defined # at the innermost level of nesting. if ( kernel.iname_tags_of_type(iname, ConcurrentTag) and not (kernel.iname_tags_of_type(iname, LocalIndexTagBase) and crosses_barrier) - and not kernel.iname_tags_of_type(iname, IlpBaseTag) + and not kernel.iname_tags_of_type(iname, (IlpBaseTag, VectorizeTag)) ): result.add(iname) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e1520a82ed69fa2aed729d9b1d849a78d658c4e1..90ca5719d121a8782f5a74a25e079d334e02d2e5 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -130,28 +130,37 @@ def generate_code_for_sched_index(codegen_state, sched_index): elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) tags = tuple(tag for tag in tags if tag) + iname = sched_item.iname from loopy.codegen.loop import ( generate_unroll_loop, - generate_vectorize_loop, generate_sequential_loop_dim_code) - from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, - ForceSequentialTag, LoopedIlpTag, VectorizeTag, - InOrderSequentialSequentialTag, filter_iname_tags_by_type) - if filter_iname_tags_by_type(tags, (UnrollTag, UnrolledIlpTag)): - func = generate_unroll_loop - elif filter_iname_tags_by_type(tags, VectorizeTag): - func = generate_vectorize_loop - elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag, + from loopy.kernel.data import (UnrollTag, + ForceSequentialTag, + InOrderSequentialSequentialTag, + filter_iname_tags_by_type) + + from functools import partial + if filter_iname_tags_by_type(tags, (UnrollTag,)): + func = partial(generate_unroll_loop, codegen_state, iname) + elif not tags or filter_iname_tags_by_type(tags, ( ForceSequentialTag, InOrderSequentialSequentialTag)): - func = generate_sequential_loop_dim_code + from loopy.codegen.bounds import get_usable_inames_for_conditional + usable_inames_for_conditional = 
( + get_usable_inames_for_conditional(kernel, sched_index)) + func = partial( + generate_sequential_loop_dim_code, + codegen_state, iname, usable_inames_for_conditional) else: raise RuntimeError("encountered (invalid) EnterLoop " "for '%s', tagged '%s'" % (sched_item.iname, ", ".join(str(tag) for tag in tags))) - return func(codegen_state, sched_index) + def inner_codegen(state): + return build_loop_nest(state, 1 + sched_index) + + return func(inner_codegen) elif isinstance(sched_item, Barrier): # {{{ emit barrier code @@ -190,10 +199,58 @@ def generate_code_for_sched_index(codegen_state, sched_index): elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] - from loopy.codegen.instruction import generate_instruction_code - return codegen_state.try_vectorized( - "instruction %s" % insn.id, - lambda inner_cgs: generate_instruction_code(inner_cgs, insn)) + from loopy.kernel.data import (IlpBaseTag, + VectorizeTag, UnrolledIlpTag, LoopedIlpTag, + filter_iname_tags_by_type) + + from loopy.codegen.loop import ( + generate_unroll_loop, + generate_sequential_loop_dim_code, + generate_vectorize_loop) + + all_inames_to_codegen = tuple(sorted( + ( + iname for iname in insn.within_inames + if filter_iname_tags_by_type( + kernel.iname_to_tags.get(iname, ()), + (IlpBaseTag, VectorizeTag))), + key=( + lambda iname: + codegen_state.priority_aware_iname_order_embedding[iname]))) + + # Parallel inames that do not have a hardware axis (ilp, vec) are + # handled here. This function (indirectly) calls itself once per loop to + # be generated. 
def inner_codegen(inames_to_codegen, state):
    """Generate code for *insn* nested inside loops over *inames_to_codegen*.

    One loop is handled per call: the first entry of *inames_to_codegen* is
    turned into a loop here, and the remaining entries are handed (through a
    partially applied reference to this function) to the loop generator as
    its ``inner_codegen`` callback. Once no inames are left, the instruction
    itself is generated.

    ``kernel``, ``insn``, ``sched_index``, the loop generators and the tag
    classes are free variables bound by the enclosing scope.

    :arg inames_to_codegen: a sequence of inames for which loops still have
        to be generated; the first entry becomes the outermost of them.
    :arg state: the code generation state in which to generate code.
    :raises ValueError: if an iname carries none of the tags handled here.
    """
    if not inames_to_codegen:
        # Base case: no inames left, emit the instruction itself.
        from loopy.codegen.instruction import generate_instruction_code
        return state.try_vectorized(
                "instruction %s" % insn.id,
                lambda inner_state: (
                    generate_instruction_code(inner_state, insn)))

    iname = inames_to_codegen[0]
    tags = kernel.iname_to_tags[iname]

    from functools import partial
    if filter_iname_tags_by_type(tags, UnrolledIlpTag):
        func = partial(generate_unroll_loop, state, iname)
    elif filter_iname_tags_by_type(tags, LoopedIlpTag):
        from loopy.codegen.bounds import get_usable_inames_for_conditional
        usable_inames_for_conditional = (
                get_usable_inames_for_conditional(kernel, sched_index))
        func = partial(
                generate_sequential_loop_dim_code,
                state, iname, usable_inames_for_conditional)
    elif filter_iname_tags_by_type(tags, VectorizeTag):
        func = partial(generate_vectorize_loop, state, iname)
    else:
        # The iname must be interpolated with '%': passing it as a second
        # argument to ValueError leaves the '%s' placeholder unformatted.
        raise ValueError(
                "do not know how to generate code for '%s'" % iname)

    # Each loop generator takes the callback producing its body as its
    # final argument.
    return func(partial(inner_codegen, inames_to_codegen[1:]))
a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 58f055b7b5042ff28f7bf9674b0e7dc5ff1b6269..722338f8fb2ec955cdbd07d1b3ce1cddc9d69629 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -28,7 +28,6 @@ from loopy.diagnostic import warn, LoopyError from loopy.codegen.result import merge_codegen_results import islpy as isl from islpy import dim_type -from loopy.codegen.control import build_loop_nest from pymbolic.mapper.stringifier import PREC_NONE @@ -116,11 +115,9 @@ def get_slab_decomposition(kernel, iname): # {{{ unrolled loops -def generate_unroll_loop(codegen_state, sched_index): +def generate_unroll_loop(codegen_state, iname, inner_codegen): kernel = codegen_state.kernel - iname = kernel.schedule[sched_index].iname - bounds = kernel.get_iname_bounds(iname, constants_only=True) from loopy.isl_helpers import ( @@ -148,8 +145,7 @@ def generate_unroll_loop(codegen_state, sched_index): for i in range(length): idx_aff = lower_bound_aff + i new_codegen_state = codegen_state.fix(iname, idx_aff) - result.append( - build_loop_nest(new_codegen_state, sched_index+1)) + result.append(inner_codegen(new_codegen_state)) return merge_codegen_results(codegen_state, result) @@ -158,11 +154,8 @@ def generate_unroll_loop(codegen_state, sched_index): # {{{ vectorized loops -def generate_vectorize_loop(codegen_state, sched_index): +def generate_vectorize_loop(codegen_state, iname, inner_codegen): kernel = codegen_state.kernel - - iname = kernel.schedule[sched_index].iname - bounds = kernel.get_iname_bounds(iname, constants_only=True) from loopy.isl_helpers import ( @@ -175,7 +168,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_upper_not_const", "upper bound for vectorized loop '%s' is not a constant, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return generate_unroll_loop(codegen_state, iname, inner_codegen) length = int(pw_aff_to_expr(length_aff)) @@ -190,7 +183,7 @@ def 
generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_lower_not_0", "lower bound for vectorized loop '%s' is not zero, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return generate_unroll_loop(codegen_state, iname, inner_codegen) # {{{ 'implement' vectorization bounds @@ -210,7 +203,7 @@ def generate_vectorize_loop(codegen_state, sched_index): length=length, space=length_aff.space)) - return build_loop_nest(new_codegen_state, sched_index+1) + return inner_codegen(new_codegen_state) # }}} @@ -341,20 +334,15 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, # {{{ sequential loop -def generate_sequential_loop_dim_code(codegen_state, sched_index): - kernel = codegen_state.kernel +def generate_sequential_loop_dim_code( + codegen_state, loop_iname, usable_inames_for_conditional, inner_codegen): + # Should not include loop_iname itself + assert loop_iname not in usable_inames_for_conditional + kernel = codegen_state.kernel ecm = codegen_state.expression_to_code_mapper - loop_iname = kernel.schedule[sched_index].iname - slabs = get_slab_decomposition(kernel, loop_iname) - - from loopy.codegen.bounds import get_usable_inames_for_conditional - - # Note: this does not include loop_iname itself! 
- usable_inames = get_usable_inames_for_conditional(kernel, sched_index) domain = kernel.get_inames_domain(loop_iname) - result = [] for slab_name, slab in slabs: @@ -377,7 +365,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): # move inames that are usable into parameters moved_inames = [] for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)): - if das_iname in usable_inames: + if das_iname in usable_inames_for_conditional: moved_inames.append(das_iname) dt, idx = dom_and_slab.get_var_dict()[das_iname] dom_and_slab = dom_and_slab.move_dims( @@ -387,11 +375,11 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname] impl_domain = isl.align_spaces( - codegen_state.implemented_domain, - dom_and_slab, - obj_bigger_ok=True, - across_dim_types=True - ).params() + codegen_state.implemented_domain, + dom_and_slab, + obj_bigger_ok=True, + across_dim_types=True + ).params() lbound = ( kernel.cache_manager.dim_min( @@ -399,12 +387,13 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): .gist(kernel.assumptions) .gist(impl_domain) .coalesce()) + ubound = ( - kernel.cache_manager.dim_max( - dom_and_slab, loop_iname_idx) - .gist(kernel.assumptions) - .gist(impl_domain) - .coalesce()) + kernel.cache_manager.dim_max( + dom_and_slab, loop_iname_idx) + .gist(kernel.assumptions) + .gist(impl_domain) + .coalesce()) # }}} @@ -435,7 +424,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): .copy(kernel=intersect_kernel_with_slab( kernel, slab, loop_iname))) - inner = build_loop_nest(new_codegen_state, sched_index+1) + inner = inner_codegen(new_codegen_state) # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 975d7b3efe4bcc419a7ca004e1df3b0fbd39d5d9..f9e4a096dc8544efe9673f95b67c14473ee9199d 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -186,7 +186,7 @@ class LoopedIlpTag(IlpBaseTag): # }}} -class 
VectorizeTag(UniqueTag): +class VectorizeTag(UniqueTag, ConcurrentTag): def __str__(self): return "vec" diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index fb0d0e2c17005ecf051d7034fd7903ed5262bdfc..6a74702ae290d550cebe25ea465b1685625ab24b 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -212,12 +212,11 @@ def find_loop_nest_with_map(kernel): """ result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag all_nonpar_inames = set( iname for iname in kernel.all_inames() - if not kernel.iname_tags_of_type(iname, - (ConcurrentTag, IlpBaseTag, VectorizeTag))) + if not kernel.iname_tags_of_type(iname, ConcurrentTag)) iname_to_insns = kernel.iname_to_insns() @@ -239,20 +238,17 @@ def find_loop_nest_around_map(kernel): iname_to_insns = kernel.iname_to_insns() + from loopy.kernel.data import ConcurrentTag # examine pairs of all inames--O(n**2), I know. - from loopy.kernel.data import IlpBaseTag for inner_iname in all_inames: + if kernel.iname_tags_of_type(inner_iname, ConcurrentTag): + continue # TODO should this come after result update on next line? result[inner_iname] = set() for outer_iname in all_inames: if inner_iname == outer_iname: continue - if kernel.iname_tags_of_type(outer_iname, IlpBaseTag): - # ILP tags are special because they are parallel tags - # and therefore 'in principle' nest around everything. - # But they're realized by the scheduler as a loop - # at the innermost level, so we'll cut them some - # slack here. 
+ if kernel.iname_tags_of_type(outer_iname, ConcurrentTag): continue if iname_to_insns[inner_iname] < iname_to_insns[outer_iname]: @@ -276,7 +272,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): result = {} - from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + from loopy.kernel.data import ConcurrentTag for insn in kernel.instructions: for iname in kernel.insn_inames(insn): if kernel.iname_tags_of_type(iname, ConcurrentTag): @@ -309,8 +305,7 @@ def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): # -> safe. continue - if kernel.iname_tags_of_type(dep_insn_iname, - (ConcurrentTag, IlpBaseTag, VectorizeTag)): + if kernel.iname_tags_of_type(dep_insn_iname, ConcurrentTag): # Parallel tags don't really nest, so we'll disregard # them here. continue @@ -573,16 +568,7 @@ class SchedulerState(ImmutableRecord): See :func:`loop_nest_around_map`. - .. attribute:: breakable_inames - - .. attribute:: ilp_inames - - .. attribute:: vec_inames - - .. attribute:: parallel_inames - - *Note:* ``ilp`` and ``vec`` are not 'parallel' for the purposes of the - scheduler. See :attr:`ilp_inames`, :attr:`vec_inames`. + .. attribute:: concurrent_inames .. rubric:: Time-varying scheduler state @@ -803,8 +789,8 @@ def generate_loop_schedules_internal( pass continue - want = kernel.insn_inames(insn) - sched_state.parallel_inames - have = active_inames_set - sched_state.parallel_inames + want = kernel.insn_inames(insn) - sched_state.concurrent_inames + have = active_inames_set - sched_state.concurrent_inames # If insn is boostable, it may be placed inside a more deeply # nested loop without harm. 
@@ -974,9 +960,11 @@ def generate_loop_schedules_internal( print("cannot leave '%s' because of preschedule constraints" % last_entered_loop) can_leave = False - elif last_entered_loop not in sched_state.breakable_inames: - # If the iname is not breakable, then check that we've - # scheduled all the instructions that require it. + #elif last_entered_loop not in sched_state.breakable_inames: + # # If the iname is not breakable, then check that we've + # # scheduled all the instructions that require it. + else: + # Check that we've scheduled all the instructions that require the iname. for insn_id in sched_state.unscheduled_insn_ids: insn = kernel.id_to_insn[insn_id] @@ -991,7 +979,7 @@ def generate_loop_schedules_internal( sched_state.scheduled_insn_ids): subdep = kernel.id_to_insn[insn_id] want = (kernel.insn_inames(subdep_id) - - sched_state.parallel_inames) + - sched_state.concurrent_inames) if ( last_entered_loop not in want and last_entered_loop not in subdep.boostable_into): @@ -1069,8 +1057,8 @@ def generate_loop_schedules_internal( needed_inames.update(kernel.insn_inames(insn_id)) needed_inames = (needed_inames - # There's no notion of 'entering' a parallel loop - - sched_state.parallel_inames + # There's no notion of 'entering' a concurrent loop + - sched_state.concurrent_inames # Don't reenter a loop we're already in. 
- active_inames_set) @@ -1104,7 +1092,7 @@ def generate_loop_schedules_internal( continue currently_accessible_inames = ( - active_inames_set | sched_state.parallel_inames) + active_inames_set | sched_state.concurrent_inames) if ( not sched_state.loop_nest_around_map[iname] <= currently_accessible_inames): @@ -1197,13 +1185,8 @@ def generate_loop_schedules_internal( useful_and_desired = useful_loops_set & loop_priority_set if useful_and_desired: - wanted = ( - useful_and_desired - - sched_state.ilp_inames - - sched_state.vec_inames - ) priority_tiers = [t for t in - get_priority_tiers(wanted, + get_priority_tiers(useful_and_desired, sched_state.kernel.loop_priority ) ] @@ -1212,31 +1195,9 @@ def generate_loop_schedules_internal( # have been contradictary. loop_priority_set = set().union(*[set(t) for t in priority_tiers]) - priority_tiers.append( - useful_loops_set - - loop_priority_set - - sched_state.ilp_inames - - sched_state.vec_inames - ) + priority_tiers.append(useful_loops_set - loop_priority_set) else: - priority_tiers = [ - useful_loops_set - - sched_state.ilp_inames - - sched_state.vec_inames - ] - - # vectorization must be the absolute innermost loop - priority_tiers.extend([ - [iname] - for iname in sched_state.ilp_inames - if iname in useful_loops_set - ]) - - priority_tiers.extend([ - [iname] - for iname in sched_state.vec_inames - if iname in useful_loops_set - ]) + priority_tiers = [useful_loops_set] # }}} @@ -1864,17 +1825,9 @@ def generate_loop_schedules_inner(kernel, debug_args={}): for item in preschedule for insn_id in sched_item_to_insn_id(item)) - from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag, + from loopy.kernel.data import (ConcurrentTag, filter_iname_tags_by_type) - ilp_inames = set( - iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, IlpBaseTag)) - vec_inames = set( - iname - for iname, tags in six.iteritems(kernel.iname_to_tags) - if filter_iname_tags_by_type(tags, 
# {{{ compute topological order

class CycleError(Exception):
    """Raised when a topological ordering cannot be computed due to a cycle.

    .. attribute:: node

        A node that lies on the detected cycle, or *None* if no node was
        supplied when the exception was created.
    """

    def __init__(self, node=None):
        # Keep the node in *args* so that the exception round-trips through
        # copy/pickle, which re-create it from *args*.
        Exception.__init__(self, node)
        self.node = node

    def __str__(self):
        if self.node is None:
            return "graph contains a cycle"
        return "graph contains a cycle through node %r" % (self.node,)


def compute_topological_order(graph):
    """Return the nodes of a directed graph in a topological order.

    :arg graph: a :class:`dict` mapping each node to an iterable of its
        successors. A successor that does not occur as a key is treated as
        a node without outgoing edges.
    :returns: a :class:`list` containing every key and every successor
        exactly once, in which each node comes before all of its successors.
    :raises CycleError: if the graph contains a cycle (including a
        self-loop).

    The traversal is an iterative depth-first search, so the recursion limit
    does not restrict the depth of the graph.
    """
    reverse_order = []

    # Nodes that have been completely processed.
    visited = set()
    # Nodes on the current depth-first search path.
    visiting = set()

    for root in graph:
        if root in visited:
            continue

        stack = [(root, iter(graph[root]))]
        visiting.add(root)

        while stack:
            node, children = stack.pop()

            for child in children:
                if child in visiting:
                    # *child* is on the current path and reachable from
                    # *node*, so it lies on a cycle.
                    raise CycleError(child)

                if child in visited:
                    continue

                # Descend into *child*; *node* is pushed back first so that
                # its partially consumed iterator is resumed afterwards.
                visiting.add(child)
                stack.append((node, children))
                stack.append((child, iter(graph.get(child, ()))))
                break
            else:
                # All successors of *node* are done: emit it in post-order.
                visiting.remove(node)
                visited.add(node)
                reverse_order.append(node)

    reverse_order.reverse()
    return reverse_order

# }}}
def test_nested_ilp_iname_codegen(ctx_factory):
    """Run a kernel in which both nested inames are tagged ``ilp``."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    out_arg = lp.GlobalArg("out", dtype=np.float32, shape=lp.auto)

    knl = lp.make_kernel(
            "{[i,j]: 0 <= i < 10 and 0 <= j < 10}",
            """
            out[i,j] = i + j
            """,
            [out_arg],
            )

    ilp_tags = {"i": "ilp", "j": "ilp"}
    knl = lp.tag_inames(knl, ilp_tags)

    knl(queue)


def test_compute_topological_order():
    """Exercise compute_topological_order on small acyclic and cyclic graphs."""
    from loopy.tools import compute_topological_order, CycleError

    # Graphs without any edges.
    assert compute_topological_order({}) == []
    assert len(compute_topological_order({1: [], 2: [], 3: []})) == 3

    # A path 0 -> 1 -> ... -> 10, with the edges inserted into the dict in
    # a shuffled (but reproducible) order.
    import random
    random.seed(0)
    path_edges = [(src, [src + 1]) for src in range(10)]
    random.shuffle(path_edges)
    assert compute_topological_order(dict(path_edges)) == list(range(11))

    # 0 -> 1 -> {2, 3}: only the order of the first two nodes is fixed.
    claw_order = compute_topological_order({1: [2, 3], 0: [1]})
    assert claw_order[:2] == [0, 1]

    # Listing an edge twice must not affect the result.
    assert compute_topological_order({1: [2, 2], 2: [0]}) == [1, 2, 0]

    # A self-loop and a longer cycle must both be rejected.
    cyclic_graphs = (
            {1: [1]},
            {0: [2], 1: [2], 2: [3], 3: [4, 1]},
            )
    for cyclic_graph in cyclic_graphs:
        with pytest.raises(CycleError):
            compute_topological_order(cyclic_graph)