diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 0b9608493d3239a44c6eeec0c85e23500c4aa427..afd8ed762e4ee5252f73105080403542732f2104 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -142,27 +142,38 @@ def add_comment(cmt, code): # {{{ code generation state class CodeGenerationState(object): - def __init__(self, implemented_domain, c_code_mapper): - """ - :param implemented_domain: The entire implemented domain, - i.e. all constraints that have been enforced so far. - :param c_code_mapper: A C code mapper that does not take per-ILP - assignments into account. - """ - self.implemented_domain = implemented_domain + """ + .. attribute:: implemented_domain + + The entire implemented domain (as an :class:`islpy.Set`) + i.e. all constraints that have been enforced so far. + + .. attribute:: implemented_predicates + + A :class:`frozenset` of predicates for which checks have been + implemented. + .. attribute:: c_code_mapper + + A :class:`loopy.codegen.expression.CCodeMapper` that does not take + per-ILP assignments into account. 
+ """ + def __init__(self, implemented_domain, implemented_predicates, c_code_mapper): + self.implemented_domain = implemented_domain + self.implemented_predicates = implemented_predicates self.c_code_mapper = c_code_mapper - def copy(self, implemented_domain=None, c_code_mapper=None): + def copy(self, implemented_domain=None, implemented_predicates=frozenset(), + c_code_mapper=None): return CodeGenerationState( implemented_domain=implemented_domain or self.implemented_domain, + implemented_predicates= + implemented_predicates or self.implemented_predicates, c_code_mapper=c_code_mapper or self.c_code_mapper) def intersect(self, other): new_impl, new_other = isl.align_two(self.implemented_domain, other) - return CodeGenerationState( - new_impl & new_other, - self.c_code_mapper) + return self.copy(implemented_domain=new_impl & new_other) def fix(self, iname, aff): new_impl_domain = self.implemented_domain @@ -185,9 +196,9 @@ class CodeGenerationState(object): expr = pw_aff_to_expr(aff) new_impl_domain = new_impl_domain.add_constraint(cns) - return CodeGenerationState( - new_impl_domain, - self.c_code_mapper.copy_and_assign(iname, expr)) + return self.copy( + implemented_domain=new_impl_domain, + c_code_mapper=self.c_code_mapper.copy_and_assign(iname, expr)) # }}} @@ -362,7 +373,9 @@ def generate_code(kernel): initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) codegen_state = CodeGenerationState( - initial_implemented_domain, c_code_mapper=ccm) + implemented_domain=initial_implemented_domain, + implemented_predicates=frozenset(), + c_code_mapper=ccm) from loopy.codegen.loop import set_up_hw_parallel_loops gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 013ec5d099b892799ad1d7b83dcf91daf4db9a16..4636f07087257bd9b17904a2922485b8c662ce0d 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -25,8 +25,10 @@ THE SOFTWARE. 
""" -from loopy.codegen import CodeGenerationState, gen_code_block +from loopy.codegen import gen_code_block import islpy as isl +from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier, + gather_schedule_subloop, generate_sub_sched_items) def get_admissible_conditional_inames_for(kernel, sched_index): @@ -50,8 +52,6 @@ def get_admissible_conditional_inames_for(kernel, sched_index): def generate_code_for_sched_index(kernel, sched_index, codegen_state): - from loopy.schedule import (EnterLoop, RunInstruction, Barrier) - sched_item = kernel.schedule[sched_index] if isinstance(sched_item, EnterLoop): @@ -128,15 +128,29 @@ def remove_inames_for_shared_hw_axes(kernel, cond_inames): return frozenset(cond_inames - multi_use_inames) +def get_required_predicates(kernel, sched_index): + result = None + for _, sched_item in generate_sub_sched_items(kernel.schedule, sched_index): + if isinstance(sched_item, Barrier): + my_preds = frozenset() + elif isinstance(sched_item, RunInstruction): + my_preds = kernel.id_to_insn[sched_item.insn_id].predicates + else: + raise RuntimeError("unexpected schedule item type: %s" + % type(sched_item)) + + if result is None: + result = my_preds + else: + result = result & my_preds + + return result + + def build_loop_nest(kernel, sched_index, codegen_state): # Most of the complexity of this function goes towards finding groups of # instructions that can be nested inside a shared conditional. - assert isinstance(codegen_state, CodeGenerationState) - - from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier, - gather_schedule_subloop) - # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices # i.e. go up to the next LeaveLoop, and skip over inner loops. 
@@ -167,9 +181,23 @@ def build_loop_nest(kernel, sched_index, codegen_state): # {{{ pass 2: find admissible conditional inames for each sibling schedule item - admissible_cond_inames = [ - get_admissible_conditional_inames_for(kernel, i) - for i in my_sched_indices] + from pytools import Record + + class ScheduleIndexInfo(Record): + """ + .. attribute:: schedule_index + .. attribute:: admissible_cond_inames + .. attribute:: required_predicates + """ + + sched_index_info_entries = [ + ScheduleIndexInfo( + schedule_index=i, + admissible_cond_inames= + get_admissible_conditional_inames_for(kernel, i), + required_predicates=get_required_predicates(kernel, i) + ) + for i in my_sched_indices] # }}} @@ -198,15 +226,37 @@ def build_loop_nest(kernel, sched_index, codegen_state): # so we can safely overapproximate here. overapproximate=True) - def build_insn_group(sched_indices_and_cond_inames, codegen_state, + def build_insn_group(sched_index_info_entries, codegen_state, done_group_lengths=set()): - # done_group_lengths serves to prevent infinite recursion by imposing a - # bigger and bigger minimum size on the group of shared inames found. - - if not sched_indices_and_cond_inames: + """ + :arg done_group_lengths: A set of group lengths (integers) that grows from + empty to include 1 and upwards with every recursive call. + It serves to prevent infinite recursion by preventing recursive + calls from doing anything about groups that are too small. + """ + + # The rough plan here is that build_insn_group starts out with the + # entirety of the current schedule item's downward siblings (i.e. all + # the ones up to the next LeaveLoop). It will then iterate upward to + # find the largest usable conditional hoist group. + # + # It will then call itself recursively, telling its recursive instances + # to ignore the hoist group it just found by adding that group length + # to done_group_length. 
(It'll also chop the set of schedule indices + # considered down so that a callee cannot find a *longer* hoist group.) + # + # Upon return the hoist is wrapped around the returned code and + # build_insn_group calls itself for the remainder of schedule indices + # that were not in the hoist group. + + if not sched_index_info_entries: return [] - sched_index, cond_inames = sched_indices_and_cond_inames[0] + si_entry = sched_index_info_entries[0] + sched_index = si_entry.schedule_index + current_iname_set = si_entry.admissible_cond_inames + current_pred_set = (si_entry.required_predicates + - codegen_state.implemented_predicates) # {{{ grow schedule item group @@ -216,28 +266,32 @@ def build_loop_nest(kernel, sched_index, codegen_state): bounds_check_cache = BoundsCheckCache( kernel, codegen_state.implemented_domain) - current_iname_set = cond_inames - found_hoists = [] candidate_group_length = 1 - while candidate_group_length <= len(sched_indices_and_cond_inames): + while candidate_group_length <= len(sched_index_info_entries): if candidate_group_length in done_group_lengths: candidate_group_length += 1 continue - other_sched_index, other_cond_inames = \ - sched_indices_and_cond_inames[candidate_group_length-1] - current_iname_set = current_iname_set & other_cond_inames + current_iname_set = ( + current_iname_set + & sched_index_info_entries[candidate_group_length-1] + .admissible_cond_inames) + current_pred_set = ( + current_pred_set + & sched_index_info_entries[candidate_group_length-1] + .required_predicates) # {{{ see which inames are actually used in group # And only generate conditionals for those. 
from loopy.schedule import find_used_inames_within used_inames = set() - for subsched_index, _ in \ - sched_indices_and_cond_inames[0:candidate_group_length]: - used_inames |= find_used_inames_within(kernel, subsched_index) + for sched_index_info_entry in \ + sched_index_info_entries[0:candidate_group_length]: + used_inames |= find_used_inames_within(kernel, + sched_index_info_entry.schedule_index) # }}} @@ -246,17 +300,21 @@ def build_loop_nest(kernel, sched_index, codegen_state): bounds_checks = bounds_check_cache(only_unshared_inames) - if bounds_checks or bounds_checks is None or candidate_group_length == 1: + if (bounds_checks # found a bounds check + or bounds_checks is None # found impossible bounds check + or current_pred_set + or candidate_group_length == 1): # length-1 must always be an option to reach the recursion base # case below - found_hoists.append((candidate_group_length, bounds_checks)) + found_hoists.append((candidate_group_length, + bounds_checks, current_pred_set)) candidate_group_length += 1 # }}} # pick largest such group - group_length, bounds_checks = max(found_hoists) + group_length, bounds_checks, pred_checks = max(found_hoists) check_set = None for cns in bounds_checks: @@ -276,6 +334,11 @@ def build_loop_nest(kernel, sched_index, codegen_state): is_empty = check_set.is_empty() new_codegen_state = codegen_state.intersect(check_set) + if pred_checks: + new_codegen_state = new_codegen_state.copy( + implemented_predicates=new_codegen_state.implemented_predicates + | pred_checks) + if is_empty: result = [] else: @@ -288,27 +351,27 @@ def build_loop_nest(kernel, sched_index, codegen_state): else: # recurse with a bigger done_group_lengths result = build_insn_group( - sched_indices_and_cond_inames[0:group_length], + sched_index_info_entries[0:group_length], new_codegen_state, done_group_lengths=done_group_lengths | set([group_length])) - if bounds_checks: + if bounds_checks or pred_checks: from loopy.codegen import wrap_in_if from 
loopy.codegen.bounds import constraint_to_code - result = [ - wrap_in_if( - [constraint_to_code(codegen_state.c_code_mapper, cns) - for cns in bounds_checks], - gen_code_block(result))] + + conditionals = [ + constraint_to_code(codegen_state.c_code_mapper, cns) + for cns in bounds_checks] + list(pred_checks) + + result = [wrap_in_if(conditionals, gen_code_block(result))] return result + build_insn_group( - sched_indices_and_cond_inames[group_length:], codegen_state) + sched_index_info_entries[group_length:], codegen_state) # }}} return gen_code_block( - build_insn_group(zip( - my_sched_indices, admissible_cond_inames), codegen_state)) + build_insn_group(sched_index_info_entries, codegen_state)) diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index e209f8ad07a3e0132cdfe76702eac6c22719d550..019e77262f4a21ff21e9b40146cd819a13740738 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -28,22 +28,26 @@ import islpy as isl from loopy.codegen import GeneratedInstruction -def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt): +def wrap_in_conditionals(codegen_state, domain, check_inames, required_preds, stmt): from loopy.codegen.bounds import get_bounds_checks, constraint_to_code bounds_checks = get_bounds_checks( domain, check_inames, - implemented_domain, overapproximate=False) + codegen_state.implemented_domain, overapproximate=False) bounds_check_set = isl.Set.universe(domain.get_space()) \ .add_constraints(bounds_checks) bounds_check_set, new_implemented_domain = isl.align_two( - bounds_check_set, implemented_domain) + bounds_check_set, codegen_state.implemented_domain) new_implemented_domain = new_implemented_domain & bounds_check_set if bounds_check_set.is_empty(): return None, None - condition_codelets = [constraint_to_code(ccm, cns) for cns in bounds_checks] + condition_codelets = [constraint_to_code(codegen_state.c_code_mapper, cns) + for cns in bounds_checks] + + 
condition_codelets.extend( + required_preds - codegen_state.implemented_predicates) if condition_codelets: from cgen import If @@ -63,10 +67,10 @@ def generate_instruction_code(kernel, insn, codegen_state): raise RuntimeError("unexpected instruction type") insn_inames = kernel.insn_inames(insn) - insn_code, impl_domain = wrap_in_bounds_checks( - codegen_state.c_code_mapper, + insn_code, impl_domain = wrap_in_conditionals( + codegen_state, kernel.get_inames_domain(insn_inames), insn_inames, - codegen_state.implemented_domain, + insn.predicates, result) if insn_code is None: @@ -95,7 +99,6 @@ def generate_expr_instruction_code(kernel, insn, codegen_state): needed_dtype=target_dtype)) if kernel.flags.trace_assignments or kernel.flags.trace_assignment_values: - from loopy.codegen import gen_code_block from cgen import Statement as S gs, ls = kernel.get_grid_sizes() diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 4b81951d0d65998e2e443cf0f38a6eb2a2daccdd..a3a35b082cdb4df1f2ac7023dfd2ca5e141cc506 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -181,6 +181,7 @@ def parse_insn(insn): insn_id = None priority = 0 forced_iname_deps = frozenset() + predicates = frozenset() if groups["options"] is not None: for option in groups["options"].split(","): @@ -206,6 +207,8 @@ def parse_insn(insn): insn_deps = set(opt_value.split(":")) elif opt_key == "inames": forced_iname_deps = frozenset(opt_value.split(":")) + elif opt_key == "if": + predicates = frozenset(opt_value.split(":")) else: raise ValueError("unrecognized instruction option '%s'" % opt_key) @@ -230,7 +233,8 @@ def parse_insn(insn): forced_iname_deps=forced_iname_deps, assignee=lhs, expression=rhs, temp_var_type=temp_var_type, - priority=priority) + priority=priority, + predicates=predicates) elif subst_match is not None: from pymbolic.primitives import Variable, Call diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 
e9c125fd17c2c2938f5b3f16917732cfcada97ae..72e774c5ec62c8c59dbcd2322e248f948ab7f29d 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -349,6 +349,11 @@ class InstructionBase(Record): *must* be executed before this one. Note that loop scheduling augments this by adding dependencies on any writes to temporaries read by this instruction. + .. attribute:: predicates + + a :class:`frozenset` of variable names whose truth values (as defined + by C) determine whether this instruction should be run + .. attribute:: forced_iname_deps A :class:`frozenset` of inames that are added to the list of iname @@ -372,11 +377,11 @@ class InstructionBase(Record): Also allowed to be *None*. """ - fields = set("id insn_deps forced_iname_deps " + fields = set("id insn_deps predicates forced_iname_deps " "priority boostable boostable_into".split()) def __init__(self, id, insn_deps, forced_iname_deps, priority, - boostable, boostable_into): + boostable, boostable_into, predicates): assert isinstance(forced_iname_deps, frozenset) assert isinstance(insn_deps, set) @@ -387,7 +392,8 @@ class InstructionBase(Record): forced_iname_deps=forced_iname_deps, priority=priority, boostable=boostable, - boostable_into=boostable_into) + boostable_into=boostable_into, + predicates=predicates) # {{{ abstract interface @@ -496,14 +502,15 @@ class ExpressionInstruction(InstructionBase): assignee, expression, id=None, forced_iname_deps=frozenset(), insn_deps=set(), boostable=None, boostable_into=None, - temp_var_type=None, priority=0): + temp_var_type=None, priority=0, predicates=frozenset()): InstructionBase.__init__(self, id=id, forced_iname_deps=forced_iname_deps, insn_deps=insn_deps, boostable=boostable, boostable_into=boostable_into, - priority=priority) + priority=priority, + predicates=predicates) from loopy.symbolic import parse if isinstance(assignee, str): @@ -523,6 +530,9 @@ class ExpressionInstruction(InstructionBase): result = get_dependencies(self.expression) for _, subscript in 
self.assignees_and_indices(): result = result | get_dependencies(subscript) + + result = result | self.predicates + return result @memoize_method @@ -631,7 +641,7 @@ class CInstruction(InstructionBase): iname_exprs, code, read_variables=frozenset(), assignees=frozenset(), id=None, insn_deps=set(), forced_iname_deps=frozenset(), priority=0, - boostable=None, boostable_into=None): + boostable=None, boostable_into=None, predicates=frozenset()): """ :arg iname_exprs: Like :attr:`iname_exprs`, but instead of tuples, simple strings pepresenting inames are also allowed. A single @@ -647,7 +657,7 @@ class CInstruction(InstructionBase): forced_iname_deps=forced_iname_deps, insn_deps=insn_deps, boostable=boostable, boostable_into=boostable_into, - priority=priority) + priority=priority, predicates=predicates) # {{{ normalize iname_exprs @@ -697,7 +707,7 @@ class CInstruction(InstructionBase): for _, subscript in self.assignees_and_indices(): result.update(get_dependencies(subscript)) - return frozenset(result) + return frozenset(result) | self.predicates def reduction_inames(self): return set() diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 8826ed0396db9fa88a8fe517f34a1043cd6f92a7..507cc0e34e8afc980387bb90dc8f6c7f3e118281 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -108,8 +108,11 @@ def find_all_insn_inames(kernel): assert isinstance(write_deps, frozenset), type(insn) assert isinstance(iname_deps, frozenset), type(insn) - logger.debug("%s: find_all_insn_inames: %s (init): %s" % ( - kernel.name, insn.id, ", ".join(sorted(iname_deps)))) + logger.debug("%s: find_all_insn_inames: %s (init): %s - " + "read deps: %s - write deps: %s" % ( + kernel.name, insn.id, ", ".join(sorted(iname_deps)), + ", ".join(sorted(read_deps)), ", ".join(sorted(write_deps)), + )) insn_id_to_inames[insn.id] = iname_deps insn_assignee_inames[insn.id] = write_deps & kernel.all_inames() diff --git a/loopy/schedule.py b/loopy/schedule.py index 
93565cde1fe176136e8faec31b808ce4fcac4d69..56bba2d4144b4694a6ad29af1da5bf00958083c5 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -76,6 +76,38 @@ def gather_schedule_subloop(schedule, start_idx): assert False +def generate_sub_sched_items(schedule, start_idx): + """Yield ``(index, item)`` pairs for the schedule items at *start_idx*. + + If ``schedule[start_idx]`` is an :class:`EnterLoop`, yield every item + that is neither :class:`EnterLoop` nor :class:`LeaveLoop` up to the + matching :class:`LeaveLoop`. Otherwise, yield just that one item. + """ + if not isinstance(schedule[start_idx], EnterLoop): + yield start_idx, schedule[start_idx] + # Stop here: the scan below would yield this item a second time. + return + + level = 0 + i = start_idx + while i < len(schedule): + sched_item = schedule[i] + if isinstance(sched_item, EnterLoop): + level += 1 + elif isinstance(sched_item, LeaveLoop): + level -= 1 + + else: + yield i, sched_item + + if level == 0: + return + + i += 1 + + assert False + + def get_barrier_needing_dependency(kernel, target, source, unordered=False): from loopy.kernel.data import InstructionBase if not isinstance(source, InstructionBase): diff --git a/test/test_loopy.py b/test/test_loopy.py index 2729622914ec87ca39e810662db8384302e1c935..7e48e2727985135e342ce987d22859c8dead7267 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1469,6 +1469,34 @@ def test_vector_types(ctx_factory, vec_len): fills_entire_output=False) +def test_conditional(ctx_factory): + #logging.basicConfig(level=logging.DEBUG) + ctx = cl.create_some_context() + + knl = lp.make_kernel( + ctx.devices[0], + "{ [i,j]: 0<=i,j<n }", + """ + <> my_a = a[i,j] {id=read_a} + <> a_less_than_zero = my_a < 0 {dep=read_a,inames=i:j} + my_a = 2*my_a {id=twice_a,dep=read_a,if=a_less_than_zero} + my_a = my_a+1 {id=aplus,dep=twice_a,if=a_less_than_zero} + out[i,j] = 2*my_a {dep=aplus} + """, + [ + lp.GlobalArg("a", np.float32, shape=lp.auto), + lp.GlobalArg("out", np.float32, shape=lp.auto), + "..." + ]) + + ref_knl = knl + + lp.auto_test_vs_ref(ref_knl, ctx, knl, + parameters=dict( + n=200 + )) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])