diff --git a/MEMO b/MEMO index f4e5c34e48e62d5c951d01fcb212a9117e361def..ce361fae97bca21f744d62d7c6384de9b7ef0978 100644 --- a/MEMO +++ b/MEMO @@ -78,8 +78,8 @@ Fixes: old inames may still be around, so the rewrite may or may not have to be applied. -- Group instructions by dependency/inames for scheduling, to - increase sched. scalability +- Group instructions by dependency/inames for outlining, to + increase outline scalability - What if no universally valid precompute base index expression is found? (test_intel_matrix_mul with n = 6*16, e.g.?) @@ -200,7 +200,7 @@ Dealt with - Make sure that variables that enter into loop bounds are only written exactly once. [DONE] - - Make sure that loop bound writes are scheduled before the relevant + - Make sure that loop bound writes are outlined before the relevant loops. [DONE] - add_prefetch tagging @@ -218,7 +218,7 @@ Dealt with - Allow complex-valued arithmetic, despite CL's best efforts. -- "No schedule found" debug help: +- "No outline found" debug help: - Find longest dead-end - Automatically report on what hinders progress there @@ -231,11 +231,11 @@ Dealt with - dim_{min,max} caching - Exhaust the search for a no-boost solution first, before looking - for a schedule with boosts. + for an outline with boosts. - Pick not just axis 0, but all axes by lowest available stride -- Scheduler tries too many boostability-related options +- Outliner tries too many boostability-related options - Automatically generate testing code vs. sequential. @@ -286,7 +286,7 @@ Dealt with - implemented_domain may end up being smaller than requested in cse evaluations--check that! -- Allow prioritization of loops in scheduling. +- Allow prioritization of loops in outlining. - Make axpy better. @@ -296,7 +296,7 @@ Dealt with - Flag, exploit idempotence -- Some things involving CSEs might be impossible to schedule +- Some things involving CSEs might be impossible to outline a[i,j] = cse(b[i]) * cse(c[j]) - Be smarter about automatic local axis choice diff --git a/doc/misc.rst b/doc/misc.rst index 62e5a1fa20f2709c4933e21f43175fc1f870c348..d6347ae0d40335ac2a8f2b43ae3cd80bac053b9b 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -401,7 +401,7 @@ potentially valuable guarantee to keep existing code working unchanged for a while. Instead, it might be wiser to just grab the version of the language current at the time of writing the code. -Uh-oh. I got a scheduling error. Any hints? +Uh-oh. I got an outlining error. Any hints? ------------------------------------------- * Make sure that dependencies between instructions are as @@ -412,7 +412,7 @@ Uh-oh. I got a scheduling error. Any hints? There's a heuristic that tries to help find dependencies. If there's only a single write to a variable, then it adds dependencies from all readers to the writer. In your case, that's actually counterproductive, - because it creates a circular dependency, hence the scheduling issue. + because it creates a circular dependency, hence the outlining issue. So you'll have to turn that off, like so:: knl = lp.make_kernel( @@ -435,7 +435,7 @@ Uh-oh. I got a scheduling error. Any hints? * Make sure that your loops are correctly nested. - The scheduler will try to be as helpful as it can in telling + The outliner will try to be as helpful as it can in telling you where it got stuck. Citing Loopy diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 409cbef576d654be973dd6d1424ac40d3ea60982..d2f4dac4d04bea4645c16f1d1378d5a7a36a84b9 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -297,8 +297,8 @@ These are usually key-value pairs. The following attributes are recognized: accepts an optional `@scope` suffix. * ``priority=integer`` sets the instructions priority to the value - ``integer``. Instructions with higher priority will be scheduled sooner, - if possible. Note that the scheduler may still schedule a lower-priority + ``integer``. Instructions with higher priority will be outlined sooner, + if possible. Note that the outliner may still outline a lower-priority instruction ahead of a higher-priority one if loop orders or dependencies require it. diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 740c5cb5848dbb7c6f657011bfc23fa88ca173ec..7df4fb3bd0d1fa0c4895f8ae51d8def5adc5bc67 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -116,9 +116,9 @@ Finishing up .. autofunction:: preprocess_kernel -.. autofunction:: generate_loop_schedules +.. autofunction:: generate_loop_outlines -.. autofunction:: get_one_scheduled_kernel +.. autofunction:: get_one_outlined_kernel .. autofunction:: save_and_reload_temporaries diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 753b09b5da42835b88a000bc0400fa18a254d80f..11e248dd39e71de331954f5f999eeb14fe6a0177 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1203,17 +1203,17 @@ Here is what happens when we try to generate code for the kernel: loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) This happens due to the kernel splitting done by :mod:`loopy`. The splitting -happens when the instruction schedule is generated. To see the schedule, we -should call :func:`loopy.get_one_scheduled_kernel`: +happens when the instruction outline is generated. To see the outline, we +should call :func:`loopy.get_one_outlined_kernel`: - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + >>> knl = lp.get_one_outlined_kernel(lp.preprocess_kernel(knl)) >>> print(knl) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- ... --------------------------------------------------------------------------- - SCHEDULE: + OUTLINE: 0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[]) 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} 2: RETURN FROM KERNEL rotate_v2 @@ -1224,7 +1224,7 @@ should call :func:`loopy.get_one_scheduled_kernel`: --------------------------------------------------------------------------- As the error message suggests, taking a look at the generated instruction -schedule will show that while ``tmp`` is assigned in the first kernel, the +outline will show that while ``tmp`` is assigned in the first kernel, the assignment to ``tmp`` is not seen by the second kernel. Because the temporary is in private memory, it does not persist across calls to device kernels (the same goes for local temporaries). @@ -1232,13 +1232,13 @@ goes for local temporaries). :mod:`loopy` provides a function called :func:`loopy.save_and_reload_temporaries` for the purpose of handling the task of saving and restoring temporary values across global barriers. This -function adds instructions to the kernel without scheduling them. That means -that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to -put those instructions into the schedule. +function adds instructions to the kernel without outlining them. That means +that :func:`loopy.get_one_outlined_kernel` needs to be called one more time to +put those instructions into the outline. - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + >>> knl = lp.get_one_outlined_kernel(lp.preprocess_kernel(knl)) >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions + >>> knl = lp.get_one_outlined_kernel(knl) # Outline added instructions >>> print(knl) --------------------------------------------------------------------------- KERNEL: rotate_v2 @@ -1251,7 +1251,7 @@ put those instructions into the schedule. --------------------------------------------------------------------------- ... --------------------------------------------------------------------------- - SCHEDULE: + OUTLINE: 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) 1: tmp = arr[i_inner + i_outer*16] {id=maketmp} 2: tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] = tmp {id=tmp.save} diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index 7ab049cd1906f703b0efc39808ff68a63b91ff37..dad6cb882bcab8a1de1083993dd9faa7b03550d0 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -19,14 +19,14 @@ knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) -# schedule +# outline from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) -from loopy.schedule import get_one_scheduled_kernel -knl = get_one_scheduled_kernel(knl) +from loopy.outline import get_one_outlined_kernel +knl = get_one_outlined_kernel(knl) -# map schedule onto host or device +# map outline onto host or device print(knl) cgr = lp.generate_code_v2(knl) diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index fa581d4262e2f06addf81aeaecca5ed2f8f8c8f1..b25784726240763945ad56597ddc6f2d0f87009c 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -30,7 +30,7 @@ def transform(knl, vars, stream_dtype): def gen_code(knl): knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) + knl = lp.get_one_outlined_kernel(knl) codegen_result = lp.generate_code_v2(knl) return codegen_result.device_code() + "\n" + codegen_result.host_code() diff --git a/loopy/__init__.py b/loopy/__init__.py index b60de6e2dcd35c1c167bf5e303401f2c6242ebec..de3ff7bd51ba01e51a3cd430e1c894e19d0ab3eb 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -77,7 +77,7 @@ from loopy.transform.iname import ( split_reduction_inward, split_reduction_outward, affine_map_inames, find_unused_axis_tag, make_reduction_inames_unique, - has_schedulable_iname_nesting, get_iname_duplication_options, + has_outlinable_iname_nesting, get_iname_duplication_options, add_inames_to_insn) from loopy.transform.instruction import ( @@ -123,7 +123,7 @@ from loopy.transform.add_barrier import add_barrier from loopy.type_inference import infer_unknown_types from loopy.preprocess import preprocess_kernel, realize_reduction -from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel +from loopy.outline import generate_loop_outlines, get_one_outlined_kernel from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map, @@ -194,7 +194,7 @@ __all__ = [ "split_reduction_inward", "split_reduction_outward", "affine_map_inames", "find_unused_axis_tag", "make_reduction_inames_unique", - "has_schedulable_iname_nesting", "get_iname_duplication_options", + "has_outlinable_iname_nesting", "get_iname_duplication_options", "add_inames_to_insn", "add_prefetch", "change_arg_to_image", @@ -248,7 +248,7 @@ __all__ = [ "infer_unknown_types", "preprocess_kernel", "realize_reduction", - "generate_loop_schedules", "get_one_scheduled_kernel", + "generate_loop_outlines", "get_one_outlined_kernel", "GeneratedProgram", "CodeGenerationResult", "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 6837b99a026debf32b12aceef00ed3863c620639..62f3ee6878d1140273c38fae6ec69924a1819439 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -384,7 +384,7 @@ def auto_test_vs_ref( max_test_kernel_count=1, quiet=False, blacklist_ref_vendors=[]): """Compare results of `ref_knl` to the kernels generated by - scheduling *test_knl*. + outlining *test_knl*. :arg check_result: a callable with :class:`numpy.ndarray` arguments *(result, reference_result)* returning a a tuple (class:`bool`, @@ -450,14 +450,14 @@ def auto_test_vs_ref( pp_ref_knl = lp.preprocess_kernel(ref_knl) - for knl in lp.generate_loop_schedules(pp_ref_knl): - ref_sched_kernel = knl + for knl in lp.generate_loop_outlines(pp_ref_knl): + ref_outline_kernel = knl break logger.info("%s (ref): trying %s for the reference calculation" % ( ref_knl.name, dev)) - ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel) + ref_compiled = CompiledKernel(ref_ctx, ref_outline_kernel) if not quiet and print_ref_code: print(75*"-") print("Reference Code:") @@ -469,7 +469,7 @@ def auto_test_vs_ref( try: ref_args, ref_arg_data = \ - make_ref_args(ref_sched_kernel, + make_ref_args(ref_outline_kernel, ref_kernel_info.implemented_data_info, ref_queue, parameters) ref_args["out_host"] = False @@ -534,14 +534,14 @@ def auto_test_vs_ref( from loopy.target.pyopencl import PyOpenCLTarget if test_knl.state not in [ KernelState.PREPROCESSED, - KernelState.SCHEDULED]: + KernelState.OUTLINED]: if isinstance(test_knl.target, PyOpenCLTarget): test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) test_knl = lp.preprocess_kernel(test_knl) - if not test_knl.schedule: - test_kernels = lp.generate_loop_schedules(test_knl) + if not test_knl.outline: + test_kernels = lp.generate_loop_outlines(test_knl) else: test_kernels = [test_knl] diff --git a/loopy/check.py b/loopy/check.py index 0d2bbff7cf8d6f9e63a33dc2c8814f29afae70f0..4957f348faa9a39361a110b982dd5ab71e8da33e 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -62,7 +62,7 @@ def check_identifiers_in_subst_rules(knl): # }}} -# {{{ sanity checks run pre-scheduling +# {{{ sanity checks run pre-outlining # FIXME: Replace with an enum. See # https://gitlab.tiker.net/inducer/loopy/issues/85 @@ -414,17 +414,17 @@ def check_write_destinations(kernel): # }}} -# {{{ check_has_schedulable_iname_nesting +# {{{ check_has_outlinable_iname_nesting -def check_has_schedulable_iname_nesting(kernel): - from loopy.transform.iname import (has_schedulable_iname_nesting, +def check_has_outlinable_iname_nesting(kernel): + from loopy.transform.iname import (has_outlinable_iname_nesting, get_iname_duplication_options) - if not has_schedulable_iname_nesting(kernel): + if not has_outlinable_iname_nesting(kernel): import itertools as it opt = get_iname_duplication_options(kernel) opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w) for i, w in it.islice(opt, 3)) - raise LoopyError("Kernel does not have a schedulable iname nesting. " + raise LoopyError("Kernel does not have an outlinable iname nesting. " "In order for there to exist a feasible loop nesting, you " "may need to duplicate an iname. To do so, call " "loopy.duplicate_iname. Use loopy.get_iname_duplication_options " @@ -646,9 +646,9 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel): +def pre_outline_checks(kernel): try: - logger.debug("%s: pre-schedule check: start" % kernel.name) + logger.debug("%s: pre-outline check: start" % kernel.name) check_for_integer_subscript_indices(kernel) check_for_duplicate_insn_ids(kernel) @@ -662,47 +662,47 @@ def pre_schedule_checks(kernel): check_for_data_dependent_parallel_bounds(kernel) check_bounds(kernel) check_write_destinations(kernel) - check_has_schedulable_iname_nesting(kernel) + check_has_outlinable_iname_nesting(kernel) check_variable_access_ordered(kernel) - logger.debug("%s: pre-schedule check: done" % kernel.name) + logger.debug("%s: pre-outline check: done" % kernel.name) except KeyboardInterrupt: raise except Exception: print(75*"=") - print("failing kernel during pre-schedule check:") + print("failing kernel during pre-outline check:") print(75*"=") print(kernel) print(75*"=") raise -# {{{ post-schedule / pre-code-generation checks +# {{{ post-outline / pre-code-generation checks # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): - from loopy.schedule import (CallKernel, RunInstruction, +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, outline_index=None): + from loopy.outline import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, - get_insn_ids_for_block_at, gather_schedule_block) + get_insn_ids_for_block_at, gather_outline_block) - if sched_index is None: + if outline_index is None: group_axes = set() local_axes = set() i = 0 - loop_end_i = past_end_i = len(kernel.schedule) + loop_end_i = past_end_i = len(kernel.outline) else: - assert isinstance(kernel.schedule[sched_index], CallKernel) - _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) + assert isinstance(kernel.outline[outline_index], CallKernel) + _, past_end_i = gather_outline_block(kernel.outline, outline_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.outline, outline_index)) group_axes = set(ax for ax, length in enumerate(group_size)) local_axes = set(ax for ax, length in enumerate(local_size)) - i = sched_index + 1 - assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel) + i = outline_index + 1 + assert isinstance(kernel.outline[past_end_i - 1], ReturnFromKernel) loop_end_i = past_end_i - 1 # alternative: just disregard length-1 dimensions? @@ -711,12 +711,12 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): GroupIndexTag) while i < loop_end_i: - sched_item = kernel.schedule[i] - if isinstance(sched_item, CallKernel): + outline_item = kernel.outline[i] + if isinstance(outline_item, CallKernel): i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i) - elif isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] + elif isinstance(outline_item, RunInstruction): + insn = kernel.id_to_insn[outline_item.insn_id] i += 1 if insn.boostable: @@ -753,19 +753,19 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): ",".join(str(i) for i in local_axes), ",".join(str(i) for i in local_axes_used))) - elif isinstance(sched_item, (Barrier, EnterLoop, LeaveLoop)): + elif isinstance(outline_item, (Barrier, EnterLoop, LeaveLoop)): i += 1 continue else: raise TypeError( - "schedule item not understood: %s" % type(sched_item).__name__) + "outline item not understood: %s" % type(outline_item).__name__) return past_end_i def check_for_unused_hw_axes_in_insns(kernel): - if kernel.schedule: + if kernel.outline: _check_for_unused_hw_axes_in_kernel_chunk(kernel) # }}} @@ -815,7 +815,7 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): for subkernel in get_subkernels(kernel): defined_base_storage = set() - from loopy.schedule.tools import ( + from loopy.outline.tools import ( temporaries_written_in_subkernel, temporaries_read_in_subkernel) for temporary in temporaries_written_in_subkernel(kernel, subkernel): @@ -850,24 +850,24 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): # }}} -# {{{ check that all instructions are scheduled +# {{{ check that all instructions are outlined -def check_that_all_insns_are_scheduled(kernel): +def check_that_all_insns_are_outlined(kernel): - all_schedulable_insns = set(insn.id for insn in kernel.instructions) - from loopy.schedule import sched_item_to_insn_id - scheduled_insns = set( + all_outlinable_insns = set(insn.id for insn in kernel.instructions) + from loopy.outline import outline_item_to_insn_id + outlined_insns = set( insn_id - for sched_item in kernel.schedule - for insn_id in sched_item_to_insn_id(sched_item)) + for outline_item in kernel.outline + for insn_id in outline_item_to_insn_id(outline_item)) - assert scheduled_insns <= all_schedulable_insns + assert outlined_insns <= all_outlinable_insns - if scheduled_insns < all_schedulable_insns: - from loopy.diagnostic import UnscheduledInstructionError - raise UnscheduledInstructionError( - "unscheduled instructions: '%s'" - % ', '.join(all_schedulable_insns - scheduled_insns)) + if outlined_insns < all_outlinable_insns: + from loopy.diagnostic import UnoutlinedInstructionError + raise UnoutlinedInstructionError( + "unoutlined instructions: '%s'" + % ', '.join(all_outlinable_insns - outlined_insns)) # }}} @@ -923,14 +923,14 @@ def pre_codegen_checks(kernel): check_for_unused_hw_axes_in_insns(kernel) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) - check_that_all_insns_are_scheduled(kernel) + check_that_all_insns_are_outlined(kernel) kernel.target.pre_codegen_check(kernel) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) except Exception: print(75*"=") - print("failing kernel during pre-schedule check:") + print("failing kernel during pre-outline check:") print(75*"=") print(kernel) print(75*"=") diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1bd90bcfc1fe4595345c1b1efb2e6a35f..a39793c11642a910c74877d8e79736bfda929bac 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -186,7 +186,7 @@ class CodeGenerationState(object): or the name of the device program currently being generated. - .. attribute:: schedule_index_end + .. attribute:: outline_index_end """ def __init__(self, kernel, @@ -196,7 +196,7 @@ class CodeGenerationState(object): vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + outline_index_end=None): self.kernel = kernel self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain @@ -210,7 +210,7 @@ class CodeGenerationState(object): self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code self.gen_program_name = gen_program_name - self.schedule_index_end = schedule_index_end + self.outline_index_end = outline_index_end # {{{ copy helpers @@ -219,7 +219,7 @@ class CodeGenerationState(object): var_subst_map=None, vectorization_info=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + outline_index_end=None): if kernel is None: kernel = self.kernel @@ -239,8 +239,8 @@ class CodeGenerationState(object): if gen_program_name is None: gen_program_name = self.gen_program_name - if schedule_index_end is None: - schedule_index_end = self.schedule_index_end + if outline_index_end is None: + outline_index_end = self.outline_index_end return CodeGenerationState( kernel=kernel, @@ -257,7 +257,7 @@ class CodeGenerationState(object): var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, gen_program_name=gen_program_name, - schedule_index_end=schedule_index_end) + outline_index_end=outline_index_end) def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" @@ -384,13 +384,13 @@ def generate_code_v2(kernel): from loopy.preprocess import preprocess_kernel kernel = preprocess_kernel(kernel) - if kernel.schedule is None: - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + if kernel.outline is None: + from loopy.outline import get_one_outlined_kernel + kernel = get_one_outlined_kernel(kernel) - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.OUTLINED: raise LoopyError("cannot generate code for a kernel that has not been " - "scheduled") + "outlined") # {{{ cache retrieval @@ -470,12 +470,12 @@ def generate_code_v2(kernel): kernel.target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule)) + outline_index_end=len(kernel.outline)) from loopy.codegen.result import generate_host_or_device_program codegen_result = generate_host_or_device_program( codegen_state, - schedule_index=0) + outline_index=0) device_code_str = codegen_result.device_code() diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index b736191ec1dadb842e12453fbec3b68e831338f6..c525a94e3411f2bddcf4bb0e9e44f0c5f8568eb3 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -55,25 +55,26 @@ def get_approximate_convex_bounds_checks(domain, check_inames, implemented_domai # {{{ on which inames may a conditional depend? -def get_usable_inames_for_conditional(kernel, sched_index): - from loopy.schedule import ( +def get_usable_inames_for_conditional(kernel, outline_index): + from loopy.outline import ( find_active_inames_at, get_insn_ids_for_block_at, has_barrier_within) from loopy.kernel.data import (ConcurrentTag, LocalIndexTagBase, VectorizeTag, IlpBaseTag) - result = find_active_inames_at(kernel, sched_index) - crosses_barrier = has_barrier_within(kernel, sched_index) + result = find_active_inames_at(kernel, outline_index) + crosses_barrier = has_barrier_within(kernel, outline_index) # Find our containing subkernel. Grab inames for all insns from there. within_subkernel = False - for sched_item_index, sched_item in enumerate(kernel.schedule[:sched_index]): - from loopy.schedule import CallKernel, ReturnFromKernel - if isinstance(sched_item, CallKernel): + for outline_item_index, outline_item in enumerate( + kernel.outline[:outline_index]): + from loopy.outline import CallKernel, ReturnFromKernel + if isinstance(outline_item, CallKernel): within_subkernel = True - subkernel_index = sched_item_index - elif isinstance(sched_item, ReturnFromKernel): + subkernel_index = outline_item_index + elif isinstance(outline_item, ReturnFromKernel): within_subkernel = False if not within_subkernel: @@ -81,7 +82,7 @@ def get_usable_inames_for_conditional(kernel, sched_index): return frozenset(result) insn_ids_for_subkernel = get_insn_ids_for_block_at( - kernel.schedule, subkernel_index) + kernel.outline, subkernel_index) inames_for_subkernel = ( iname diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e9de52eb68bd47aec09b0a19de0a5d5433aa9843..04ceab09a80d01e2b29a2c9a51986d587874dd5d 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -26,26 +26,26 @@ THE SOFTWARE. from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl -from loopy.schedule import ( +from loopy.outline import ( EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel, - gather_schedule_block, generate_sub_sched_items) + gather_outline_block, generate_sub_outline_items) from loopy.diagnostic import LoopyError -def synthesize_idis_for_extra_args(kernel, schedule_index): +def synthesize_idis_for_extra_args(kernel, outline_index): """ :returns: A list of :class:`loopy.codegen.ImplementedDataInfo` """ - sched_item = kernel.schedule[schedule_index] + outline_item = kernel.outline[outline_index] from loopy.codegen import ImplementedDataInfo from loopy.kernel.data import InameArg, AddressSpace - assert isinstance(sched_item, CallKernel) + assert isinstance(outline_item, CallKernel) idis = [] - for arg in sched_item.extra_args: + for arg in outline_item.extra_args: temporary = kernel.temporary_variables[arg] assert temporary.address_space == AddressSpace.GLOBAL idis.extend( @@ -53,7 +53,7 @@ def synthesize_idis_for_extra_args(kernel, schedule_index): kernel.target, index_dtype=kernel.index_dtype)) - for iname in sched_item.extra_inames: + for iname in outline_item.extra_inames: idis.append( ImplementedDataInfo( target=kernel.target, @@ -65,45 +65,45 @@ def synthesize_idis_for_extra_args(kernel, schedule_index): return idis -def generate_code_for_sched_index(codegen_state, sched_index): +def generate_code_for_outline_index(codegen_state, outline_index): kernel = codegen_state.kernel - sched_item = kernel.schedule[sched_index] + outline_item = kernel.outline[outline_index] - if isinstance(sched_item, CallKernel): + if isinstance(outline_item, CallKernel): assert not codegen_state.is_generating_device_code - from loopy.schedule import (gather_schedule_block, get_insn_ids_for_block_at) - _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) - assert past_end_i <= codegen_state.schedule_index_end + from loopy.outline import (gather_outline_block, get_insn_ids_for_block_at) + _, past_end_i = gather_outline_block(kernel.outline, outline_index) + assert past_end_i <= codegen_state.outline_index_end - extra_args = synthesize_idis_for_extra_args(kernel, sched_index) + extra_args = synthesize_idis_for_extra_args(kernel, outline_index) new_codegen_state = codegen_state.copy( is_generating_device_code=True, - gen_program_name=sched_item.kernel_name, - schedule_index_end=past_end_i-1, + gen_program_name=outline_item.kernel_name, + outline_index_end=past_end_i-1, implemented_data_info=(codegen_state.implemented_data_info + extra_args)) from loopy.codegen.result import generate_host_or_device_program codegen_result = generate_host_or_device_program( - new_codegen_state, sched_index) + new_codegen_state, outline_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.outline, outline_index)) return merge_codegen_results(codegen_state, [ codegen_result, codegen_state.ast_builder.get_kernel_call( codegen_state, - sched_item.kernel_name, + outline_item.kernel_name, glob_grid, loc_grid, extra_args), ]) - elif isinstance(sched_item, EnterLoop): - tags = kernel.iname_tags(sched_item.iname) + elif isinstance(outline_item, EnterLoop): + tags = kernel.iname_tags(outline_item.iname) tags = tuple(tag for tag in tags if tag) from loopy.codegen.loop import ( @@ -124,30 +124,30 @@ def generate_code_for_sched_index(codegen_state, sched_index): else: raise RuntimeError("encountered (invalid) EnterLoop " "for '%s', tagged '%s'" - % (sched_item.iname, ", ".join(str(tag) for tag in tags))) + % (outline_item.iname, ", ".join(str(tag) for tag in tags))) - return func(codegen_state, sched_index) + return func(codegen_state, outline_index) - elif isinstance(sched_item, Barrier): + elif isinstance(outline_item, Barrier): # {{{ emit barrier code from loopy.codegen.result import CodeGenerationResult if codegen_state.is_generating_device_code: barrier_ast = codegen_state.ast_builder.emit_barrier( - sched_item.synchronization_kind, sched_item.mem_kind, - sched_item.comment) - if sched_item.originating_insn_id: + outline_item.synchronization_kind, outline_item.mem_kind, + outline_item.comment) + if outline_item.originating_insn_id: return CodeGenerationResult.new( codegen_state, - sched_item.originating_insn_id, + outline_item.originating_insn_id, barrier_ast, codegen_state.implemented_domain) else: return barrier_ast else: # host code - if sched_item.synchronization_kind in ["global", "local"]: + if outline_item.synchronization_kind in ["global", "local"]: # host code is assumed globally and locally synchronous return CodeGenerationResult( host_program=None, @@ -158,12 +158,12 @@ def generate_code_for_sched_index(codegen_state, sched_index): else: raise LoopyError("do not know how to emit code for barrier " "synchronization kind '%s'" "in host code" - % sched_item.synchronization_kind) + % outline_item.synchronization_kind) # }}} - elif isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] + elif isinstance(outline_item, RunInstruction): + insn = kernel.id_to_insn[outline_item.insn_id] from loopy.codegen.instruction import generate_instruction_code return codegen_state.try_vectorized( @@ -171,20 +171,21 @@ def generate_code_for_sched_index(codegen_state, sched_index): lambda inner_cgs: generate_instruction_code(inner_cgs, insn)) else: - raise RuntimeError("unexpected schedule item type: %s" - % type(sched_item)) + raise RuntimeError("unexpected outline item type: %s" + % type(outline_item)) -def get_required_predicates(kernel, sched_index): +def get_required_predicates(kernel, outline_index): result = None - for _, sched_item in generate_sub_sched_items(kernel.schedule, sched_index): - if isinstance(sched_item, Barrier): + for _, outline_item in generate_sub_outline_items( + kernel.outline, outline_index): + if isinstance(outline_item, Barrier): my_preds = frozenset() - elif isinstance(sched_item, RunInstruction): - my_preds = kernel.id_to_insn[sched_item.insn_id].predicates + elif isinstance(outline_item, RunInstruction): + my_preds = kernel.id_to_insn[outline_item.insn_id].predicates else: - raise RuntimeError("unexpected schedule item type: %s" - % type(sched_item)) + raise RuntimeError("unexpected outline item type: %s" + % type(outline_item)) if result is None: result = my_preds @@ -216,7 +217,7 @@ def group_by(l, key, merge): return result -def build_loop_nest(codegen_state, schedule_index): +def build_loop_nest(codegen_state, outline_index): # Most of the complexity of this function goes towards finding groups of # instructions that can be nested inside a shared conditional. @@ -226,85 +227,85 @@ def build_loop_nest(codegen_state, schedule_index): # some work about hoisting conditionals and directly go into recursion. if not codegen_state.ast_builder.can_implement_conditionals: result = [] - inner = generate_code_for_sched_index(codegen_state, schedule_index) + inner = generate_code_for_outline_index(codegen_state, outline_index) if inner is not None: result.append(inner) return merge_codegen_results(codegen_state, result) - # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices + # {{{ pass 1: pre-scan outline for my outline item's siblings' indices # i.e. go up to the next LeaveLoop, and skip over inner loops. - my_sched_indices = [] + my_outline_indices = [] - i = schedule_index - while i < codegen_state.schedule_index_end: - sched_item = kernel.schedule[i] + i = outline_index + while i < codegen_state.outline_index_end: + outline_item = kernel.outline[i] - if isinstance(sched_item, LeaveLoop): + if isinstance(outline_item, LeaveLoop): break - my_sched_indices.append(i) + my_outline_indices.append(i) - if isinstance(sched_item, (EnterLoop, CallKernel)): - _, i = gather_schedule_block(kernel.schedule, i) - assert i <= codegen_state.schedule_index_end, \ - "schedule block extends beyond schedule_index_end" + if isinstance(outline_item, (EnterLoop, CallKernel)): + _, i = gather_outline_block(kernel.outline, i) + assert i <= codegen_state.outline_index_end, \ + "outline block extends beyond outline_index_end" - elif isinstance(sched_item, Barrier): + elif isinstance(outline_item, Barrier): i += 1 - elif isinstance(sched_item, RunInstruction): + elif isinstance(outline_item, RunInstruction): i += 1 else: - raise RuntimeError("unexpected schedule item type: %s" - % type(sched_item)) + raise RuntimeError("unexpected outline item type: %s" + % type(outline_item)) del i # }}} - # {{{ pass 2: find admissible conditional inames for each sibling schedule item + # {{{ pass 2: find admissible conditional inames for each sibling outline item from pytools import ImmutableRecord - class ScheduleIndexInfo(ImmutableRecord): + class OutlineIndexInfo(ImmutableRecord): """ - .. attribute:: schedule_index + .. attribute:: outline_index .. attribute:: admissible_cond_inames .. attribute:: required_predicates .. attribute:: used_inames_within """ - from loopy.schedule import find_used_inames_within + from loopy.outline import find_used_inames_within from loopy.codegen.bounds import get_usable_inames_for_conditional - sched_index_info_entries = [ - ScheduleIndexInfo( - schedule_indices=[i], + outline_index_info_entries = [ + OutlineIndexInfo( + outline_indices=[i], admissible_cond_inames=( get_usable_inames_for_conditional(kernel, i)), required_predicates=get_required_predicates(kernel, i), used_inames_within=find_used_inames_within(kernel, i) ) - for i in my_sched_indices + for i in my_outline_indices ] - sched_index_info_entries = group_by( - sched_index_info_entries, + outline_index_info_entries = group_by( + outline_index_info_entries, key=lambda sii: ( sii.admissible_cond_inames, sii.required_predicates, sii.used_inames_within), merge=lambda sii1, sii2: sii1.copy( - schedule_indices=( - sii1.schedule_indices + outline_indices=( + sii1.outline_indices + - sii2.schedule_indices))) + sii2.outline_indices))) # }}} - # {{{ pass 3: greedily group schedule items that share admissible inames + # {{{ pass 3: greedily group outline items that share admissible inames from pytools import memoize_method @@ -327,7 +328,7 @@ def build_loop_nest(codegen_state, schedule_index): return get_approximate_convex_bounds_checks(domain, check_inames, self.impl_domain) - def build_insn_group(sched_index_info_entries, codegen_state, + def build_insn_group(outline_index_info_entries, codegen_state, done_group_lengths=set()): """ :arg done_group_lengths: A set of group lengths (integers) that grows @@ -339,30 +340,30 @@ def build_loop_nest(codegen_state, schedule_index): from loopy.symbolic import get_dependencies # The rough plan here is that build_insn_group starts out with the - # entirety of the current schedule item's downward siblings (i.e. all + # entirety of the current outline item's downward siblings (i.e. all # the ones up to the next LeaveLoop). It will then iterate upward to # find the largest usable conditional hoist group. # # It will then call itself recursively, telling its recursive instances # to ignore the hoist group it just found by adding that group length - # to done_group_length. (It'll also chop the set of schedule indices + # to done_group_length. (It'll also chop the set of outline indices # considered down so that a callee cannot find a *longer* hoist group.) # # Upon return the hoist is wrapped around the returned code and - # build_insn_group calls itself for the remainder of schedule indices + # build_insn_group calls itself for the remainder of outline indices # that were not in the hoist group. - if not sched_index_info_entries: + if not outline_index_info_entries: return [] - origin_si_entry = sched_index_info_entries[0] + origin_si_entry = outline_index_info_entries[0] current_iname_set = origin_si_entry.admissible_cond_inames current_pred_set = (origin_si_entry.required_predicates - codegen_state.implemented_predicates) - # {{{ grow schedule item group + # {{{ grow outline item group - # Keep growing schedule item group as long as group fulfills minimum + # Keep growing outline item group as long as group fulfills minimum # size requirement. bounds_check_cache = BoundsCheckCache( @@ -371,18 +372,18 @@ def build_loop_nest(codegen_state, schedule_index): found_hoists = [] candidate_group_length = 1 - while candidate_group_length <= len(sched_index_info_entries): + while candidate_group_length <= len(outline_index_info_entries): if candidate_group_length in done_group_lengths: candidate_group_length += 1 continue current_iname_set = ( current_iname_set - & sched_index_info_entries[candidate_group_length-1] + & outline_index_info_entries[candidate_group_length-1] .admissible_cond_inames) current_pred_set = ( current_pred_set - & sched_index_info_entries[candidate_group_length-1] + & outline_index_info_entries[candidate_group_length-1] .required_predicates) current_pred_set = frozenset( @@ -394,9 +395,9 @@ def build_loop_nest(codegen_state, schedule_index): # And only generate conditionals for those. used_inames = set() - for sched_index_info_entry in \ - sched_index_info_entries[0:candidate_group_length]: - used_inames |= sched_index_info_entry.used_inames_within + for outline_index_info_entry in \ + outline_index_info_entries[0:candidate_group_length]: + used_inames |= outline_index_info_entry.used_inames_within # }}} @@ -452,11 +453,11 @@ def build_loop_nest(codegen_state, schedule_index): result = [] else: if group_length == 1: - # group only contains starting schedule item + # group only contains starting outline item def gen_code(inner_codegen_state): result = [] - for i in origin_si_entry.schedule_indices: - inner = generate_code_for_sched_index( + for i in origin_si_entry.outline_indices: + inner = generate_code_for_outline_index( inner_codegen_state, i) if inner is not None: @@ -468,7 +469,7 @@ def build_loop_nest(codegen_state, schedule_index): # recurse with a bigger done_group_lengths def gen_code(inner_codegen_state): return build_insn_group( - sched_index_info_entries[0:group_length], + outline_index_info_entries[0:group_length], inner_codegen_state, done_group_lengths=( done_group_lengths | set([group_length]))) @@ -518,11 +519,11 @@ def build_loop_nest(codegen_state, schedule_index): result = gen_code(new_codegen_state) return result + build_insn_group( - sched_index_info_entries[group_length:], codegen_state) + outline_index_info_entries[group_length:], codegen_state) # }}} - insn_group = build_insn_group(sched_index_info_entries, codegen_state) + insn_group = build_insn_group(outline_index_info_entries, codegen_state) return merge_codegen_results( codegen_state, insn_group) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index b3a87798840bb1624d350c79830f29142e54ab6c..67111659f5b31d2478e9cfd4d1b5b47a42dd443e 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -116,10 +116,10 @@ def get_slab_decomposition(kernel, iname): # {{{ unrolled loops -def generate_unroll_loop(codegen_state, sched_index): +def generate_unroll_loop(codegen_state, outline_index): kernel = codegen_state.kernel - iname = kernel.schedule[sched_index].iname + iname = kernel.outline[outline_index].iname bounds = kernel.get_iname_bounds(iname, constants_only=True) @@ -149,7 +149,7 @@ def generate_unroll_loop(codegen_state, sched_index): idx_aff = lower_bound_aff + i new_codegen_state = codegen_state.fix(iname, idx_aff) result.append( - build_loop_nest(new_codegen_state, sched_index+1)) + build_loop_nest(new_codegen_state, outline_index+1)) return merge_codegen_results(codegen_state, result) @@ -158,10 +158,10 @@ def generate_unroll_loop(codegen_state, sched_index): # {{{ vectorized loops -def generate_vectorize_loop(codegen_state, sched_index): +def generate_vectorize_loop(codegen_state, outline_index): kernel = codegen_state.kernel - iname = kernel.schedule[sched_index].iname + iname = kernel.outline[outline_index].iname bounds = kernel.get_iname_bounds(iname, constants_only=True) @@ -175,7 +175,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_upper_not_const", "upper bound for vectorized loop '%s' is not a constant, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return generate_unroll_loop(codegen_state, outline_index) length = int(pw_aff_to_expr(length_aff)) @@ -190,7 +190,7 @@ def generate_vectorize_loop(codegen_state, sched_index): warn(kernel, "vec_lower_not_0", "lower bound for vectorized loop '%s' is not zero, " "cannot vectorize--unrolling instead") - return generate_unroll_loop(codegen_state, sched_index) + return generate_unroll_loop(codegen_state, outline_index) # {{{ 'implement' vectorization bounds @@ -210,7 +210,7 @@ def generate_vectorize_loop(codegen_state, sched_index): length=length, space=length_aff.space)) - return build_loop_nest(new_codegen_state, sched_index+1) + return build_loop_nest(new_codegen_state, outline_index+1) # }}} @@ -226,15 +226,16 @@ def intersect_kernel_with_slab(kernel, slab, iname): # {{{ hw-parallel loop -def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, +def set_up_hw_parallel_loops(codegen_state, outline_index, next_func, hw_inames_left=None): kernel = codegen_state.kernel from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag, LocalIndexTag, GroupIndexTag, VectorizeTag) - from loopy.schedule import get_insn_ids_for_block_at - insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule, schedule_index) + from loopy.outline import get_insn_ids_for_block_at + insn_ids_for_block = get_insn_ids_for_block_at( + kernel.outline, outline_index) if hw_inames_left is None: all_inames_by_insns = set() @@ -330,7 +331,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, .copy(kernel=slabbed_kernel)) inner = set_up_hw_parallel_loops( - new_codegen_state, schedule_index, next_func, + new_codegen_state, outline_index, next_func, hw_inames_left) result.append(inner) @@ -342,18 +343,18 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, # {{{ sequential loop -def generate_sequential_loop_dim_code(codegen_state, sched_index): +def generate_sequential_loop_dim_code(codegen_state, outline_index): kernel = codegen_state.kernel ecm = codegen_state.expression_to_code_mapper - loop_iname = kernel.schedule[sched_index].iname + loop_iname = kernel.outline[outline_index].iname slabs = get_slab_decomposition(kernel, loop_iname) from loopy.codegen.bounds import get_usable_inames_for_conditional # Note: this does not include loop_iname itself! - usable_inames = get_usable_inames_for_conditional(kernel, sched_index) + usable_inames = get_usable_inames_for_conditional(kernel, outline_index) domain = kernel.get_inames_domain(loop_iname) result = [] @@ -434,7 +435,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): .copy(kernel=intersect_kernel_with_slab( kernel, slab, loop_iname))) - inner = build_loop_nest(new_codegen_state, sched_index+1) + inner = build_loop_nest(new_codegen_state, outline_index+1) # }}} diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 4318ad71c1b16deeaac98f8408d5ca82f2de1714..b59dd0563fa9e44392b3b677ca044e591fd10f84 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -273,24 +273,25 @@ def wrap_in_if(codegen_state, condition_exprs, inner): # {{{ program generation top-level -def generate_host_or_device_program(codegen_state, schedule_index): +def generate_host_or_device_program(codegen_state, outline_index): ast_builder = codegen_state.ast_builder - temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index) + temp_decls = ast_builder.get_temporary_decls(codegen_state, outline_index) from functools import partial from loopy.codegen.control import build_loop_nest if codegen_state.is_generating_device_code: - from loopy.schedule import CallKernel - assert isinstance(codegen_state.kernel.schedule[schedule_index], CallKernel) + from loopy.outline import CallKernel + assert isinstance( + codegen_state.kernel.outline[outline_index], CallKernel) from loopy.codegen.loop import set_up_hw_parallel_loops codegen_result = set_up_hw_parallel_loops( - codegen_state, schedule_index, + codegen_state, outline_index, next_func=partial(build_loop_nest, - schedule_index=schedule_index + 1)) + outline_index=outline_index + 1)) else: - codegen_result = build_loop_nest(codegen_state, schedule_index) + codegen_result = build_loop_nest(codegen_state, outline_index) codegen_result = merge_codegen_results( codegen_state, @@ -302,11 +303,11 @@ def generate_host_or_device_program(codegen_state, schedule_index): cur_prog = codegen_result.current_program(codegen_state) body_ast = cur_prog.ast fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) + codegen_state, codegen_result, outline_index) fdef_ast = ast_builder.get_function_definition( codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) + outline_index, fdecl_ast, body_ast) codegen_result = codegen_result.with_new_program( codegen_state, diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 561bbc7cc56a8338593a80b7d5890553af89c79b..eb09c9dda249e91bb0bb6dcb114e17e68eee26de 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -100,7 +100,7 @@ class MissingDefinitionError(LoopyError): pass -class UnscheduledInstructionError(LoopyError): +class UnoutlinedInstructionError(LoopyError): pass diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 9096edcc0d7eb0111c393eb4cc5ed78405dba408..be778e06b79cb5f3af4c324e20f7571685f7c9ca 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -101,7 +101,7 @@ class _UniqueVarNameGenerator(UniqueNameGenerator): class KernelState: # noqa INITIAL = 0 PREPROCESSED = 1 - SCHEDULED = 2 + OUTLINED = 2 # {{{ kernel_state, KernelState compataibility @@ -128,8 +128,8 @@ class kernel_state(object): # noqa return KernelState.PREPROCESSED @_deperecated_kernel_state_class_method - def SCHEDULED(): # pylint:disable=no-method-argument - return KernelState.SCHEDULED + def OUTLINED(): # pylint:disable=no-method-argument + return KernelState.OUTLINED # }}} @@ -158,9 +158,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): A list of :class:`loopy.KernelArgument` - .. attribute:: schedule + .. attribute:: outline - *None* or a list of :class:`loopy.schedule.ScheduleItem` + *None* or a list of :class:`loopy.outline.OutlineItem` .. attribute:: name .. attribute:: preambles @@ -199,9 +199,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: loop_priority A frozenset of priority constraints to the kernel. Each such constraint - is a tuple of inames. Inames occuring in such a tuple will be scheduled - earlier than any iname following in the tuple. This applies only to inames - with non-parallel implementation tags. + is a tuple of inames. Inames occuring in such a tuple will be added to + outline earlier than any iname following in the tuple. This applies only + to inames with non-parallel implementation tags. .. attribute:: silenced_warnings @@ -227,7 +227,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ constructor - def __init__(self, domains, instructions, args=None, schedule=None, + def __init__(self, domains, instructions, args=None, outline=None, name="loopy_kernel", preambles=None, preamble_generators=None, @@ -332,7 +332,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if state not in [ KernelState.INITIAL, KernelState.PREPROCESSED, - KernelState.SCHEDULED, + KernelState.OUTLINED, ]: raise ValueError("invalid value for 'state'") @@ -351,7 +351,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): domains=domains, instructions=instructions, args=args, - schedule=schedule, + outline=outline, name=name, preambles=preambles, preamble_generators=preamble_generators, @@ -1222,7 +1222,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "iname-order", "get_visual_iname_order_embedding() could not determine a " "consistent iname nesting order. This is a possible indication " - "that the kernel may not schedule successfully, but for now " + "that the kernel may not outline successfully, but for now " "it only impacts printing of the kernel.") embedding = dict((iname, iname) for iname in self.all_inames()) @@ -1239,7 +1239,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "rules", "instructions", "Dependencies", - "schedule", + "outline", ]) first_letter_to_what = dict( @@ -1341,12 +1341,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): "(use loopy.show_dependency_graph to visualize)") lines.extend(dep_lines) - if "schedule" in what and kernel.schedule is not None: + if "outline" in what and kernel.outline is not None: lines.extend(sep) if show_labels: - lines.append("SCHEDULE:") - from loopy.schedule import dump_schedule - lines.append(dump_schedule(kernel, kernel.schedule)) + lines.append("OUTLINE:") + from loopy.outline import dump_outline + lines.append(dump_outline(kernel, kernel.outline)) lines.extend(sep) @@ -1473,7 +1473,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "domains", "instructions", "args", - "schedule", + "outline", "name", "preambles", "assumptions", diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 8213c9584b54917050c586e1b83b6d66d0473798..e81c6889e7f1b7a7f29aeb6acf410dd05b9d748b 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -81,11 +81,11 @@ class InstructionBase(ImmutableRecord): A :class:`frozenset` of strings indicating which instruction groups (see :class:`InstructionBase.groups`) may not be active when this - instruction is scheduled. + instruction is outlined. .. attribute:: priority - Scheduling priority, an integer. Higher means 'execute sooner'. + Outlining priority, an integer. Higher means 'execute sooner'. Default 0. .. rubric :: Synchronization diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index bb6ae44c9bf8daefef5f6564fccbec58ba72a708..3a9d1f6bfa62ca5e5a7e501892aa4d4210f04211 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -463,14 +463,14 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=False) - if iname_cluster and not kernel.schedule: + if iname_cluster and not kernel.outline: try: - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + from loopy.outline import get_one_outlined_kernel + kernel = get_one_outlined_kernel(kernel) except RuntimeError as e: iname_cluster = False from warnings import warn - warn("error encountered during scheduling for dep graph -- " + warn("error encountered during outlining for dep graph -- " "cannot perform iname clustering: %s(%s)" % (type(e).__name__, e)) @@ -536,22 +536,22 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): lines.append("%s -> %s" % (insn_2, insn_1)) if iname_cluster: - from loopy.schedule import ( + from loopy.outline import ( EnterLoop, LeaveLoop, RunInstruction, Barrier, CallKernel, ReturnFromKernel) - for sched_item in kernel.schedule: - if isinstance(sched_item, EnterLoop): + for outline_item in kernel.outline: + if isinstance(outline_item, EnterLoop): lines.append("subgraph cluster_%s { label=\"%s\"" - % (sched_item.iname, sched_item.iname)) - elif isinstance(sched_item, LeaveLoop): + % (outline_item.iname, outline_item.iname)) + elif isinstance(outline_item, LeaveLoop): lines.append("}") - elif isinstance(sched_item, RunInstruction): - lines.append(sched_item.insn_id) - elif isinstance(sched_item, (CallKernel, ReturnFromKernel, Barrier)): + elif isinstance(outline_item, RunInstruction): + lines.append(outline_item.insn_id) + elif isinstance(outline_item, (CallKernel, ReturnFromKernel, Barrier)): pass else: - raise LoopyError("schedule item not unterstood: %r" % sched_item) + raise LoopyError("outline item not unterstood: %r" % outline_item) return "digraph %s {\n%s\n}" % ( kernel.name, @@ -1721,47 +1721,47 @@ def find_most_recent_global_barrier(kernel, insn_id): @memoize_on_first_arg def get_subkernels(kernel): """Return a :class:`tuple` of the names of the subkernels in the kernel. The - kernel must be scheduled. + kernel must be outlined. - See also :class:`loopy.schedule.CallKernel`. + See also :class:`loopy.outline.CallKernel`. """ from loopy.kernel import KernelState - if kernel.state != KernelState.SCHEDULED: - raise LoopyError("Kernel must be scheduled") + if kernel.state != KernelState.OUTLINED: + raise LoopyError("Kernel must be outlined") - from loopy.schedule import CallKernel + from loopy.outline import CallKernel - return tuple(sched_item.kernel_name - for sched_item in kernel.schedule - if isinstance(sched_item, CallKernel)) + return tuple(outline_item.kernel_name + for outline_item in kernel.outline + if isinstance(outline_item, CallKernel)) @memoize_on_first_arg def get_subkernel_to_insn_id_map(kernel): """Return a :class:`dict` mapping subkernel names to a :class:`frozenset` - consisting of the instruction ids scheduled within the subkernel. The - kernel must be scheduled. + consisting of the instruction ids outlined within the subkernel. The + kernel must be outlined. """ from loopy.kernel import KernelState - if kernel.state != KernelState.SCHEDULED: - raise LoopyError("Kernel must be scheduled") + if kernel.state != KernelState.OUTLINED: + raise LoopyError("Kernel must be outlined") - from loopy.schedule import ( - sched_item_to_insn_id, CallKernel, ReturnFromKernel) + from loopy.outline import ( + outline_item_to_insn_id, CallKernel, ReturnFromKernel) subkernel = None result = {} - for sched_item in kernel.schedule: - if isinstance(sched_item, CallKernel): - subkernel = sched_item.kernel_name + for outline_item in kernel.outline: + if isinstance(outline_item, CallKernel): + subkernel = outline_item.kernel_name result[subkernel] = set() - if isinstance(sched_item, ReturnFromKernel): + if isinstance(outline_item, ReturnFromKernel): subkernel = None if subkernel is not None: - for insn_id in sched_item_to_insn_id(sched_item): + for insn_id in outline_item_to_insn_id(outline_item): result[subkernel].add(insn_id) for subkernel in result: diff --git a/loopy/loop.py b/loopy/loop.py index 4592463822a2321745aaf48a316d16c98d4efca3..b940aba25dc943ae7c9c7c9d4222aa7fc79dbf4c 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -31,7 +31,7 @@ def potential_loop_nest_map(kernel): """Returns a dictionary mapping inames to other inames that *could* be nested around them. - :seealso: :func:`loopy.schedule.loop_nest_map` + :seealso: :func:`loopy.outline.loop_nest_map` """ result = {} diff --git a/loopy/options.py b/loopy/options.py index 63089d94d3487e77a1def39a98fe24631c508398..e3f3dacf493fa299cfaad392301d2abad1b52777 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -93,7 +93,7 @@ class Options(ImmutableRecord): Ignore the boostable_into field of the kernel, when determining whether an iname duplication is necessary - for the kernel to be schedulable. + for the kernel to be outlinable. .. attribute:: check_dep_resolution diff --git a/loopy/schedule/__init__.py b/loopy/outline/__init__.py similarity index 69% rename from loopy/schedule/__init__.py rename to loopy/outline/__init__.py index f145c7122b9fd6e9e516d0becf3d4461fc0cce8c..9b9bd04b5d72b459e1c45b0fb8cd30682a66635e 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/outline/__init__.py @@ -39,9 +39,9 @@ import logging logger = logging.getLogger(__name__) -# {{{ schedule items +# {{{ outline items -class ScheduleItem(ImmutableRecord): +class OutlineItem(ImmutableRecord): __slots__ = [] def update_persistent_hash(self, key_hash, key_builder): @@ -52,11 +52,11 @@ class ScheduleItem(ImmutableRecord): key_builder.rec(key_hash, getattr(self, field_name)) -class BeginBlockItem(ScheduleItem): +class BeginBlockItem(OutlineItem): pass -class EndBlockItem(ScheduleItem): +class EndBlockItem(OutlineItem): pass @@ -68,7 +68,7 @@ class LeaveLoop(EndBlockItem): hash_fields = __slots__ = ["iname"] -class RunInstruction(ScheduleItem): +class RunInstruction(OutlineItem): hash_fields = __slots__ = ["insn_id"] @@ -80,7 +80,7 @@ class ReturnFromKernel(EndBlockItem): hash_fields = __slots__ = ["kernel_name"] -class Barrier(ScheduleItem): +class Barrier(OutlineItem): """ .. attribute:: comment @@ -103,43 +103,43 @@ class Barrier(ScheduleItem): # }}} -# {{{ schedule utilities +# {{{ outline utilities -def gather_schedule_block(schedule, start_idx): - assert isinstance(schedule[start_idx], BeginBlockItem) +def gather_outline_block(outline, start_idx): + assert isinstance(outline[start_idx], BeginBlockItem) level = 0 i = start_idx - while i < len(schedule): - if isinstance(schedule[i], BeginBlockItem): + while i < len(outline): + if isinstance(outline[i], BeginBlockItem): level += 1 - elif isinstance(schedule[i], EndBlockItem): + elif isinstance(outline[i], EndBlockItem): level -= 1 if level == 0: - return schedule[start_idx:i+1], i+1 + return outline[start_idx:i+1], i+1 i += 1 assert False -def generate_sub_sched_items(schedule, start_idx): - if not isinstance(schedule[start_idx], BeginBlockItem): - yield start_idx, schedule[start_idx] +def generate_sub_outline_items(outline, start_idx): + if not isinstance(outline[start_idx], BeginBlockItem): + yield start_idx, outline[start_idx] level = 0 i = start_idx - while i < len(schedule): - sched_item = schedule[i] - if isinstance(sched_item, BeginBlockItem): + while i < len(outline): + outline_item = outline[i] + if isinstance(outline_item, BeginBlockItem): level += 1 - elif isinstance(sched_item, EndBlockItem): + elif isinstance(outline_item, EndBlockItem): level -= 1 else: - yield i, sched_item + yield i, outline_item if level == 0: return @@ -149,59 +149,59 @@ def generate_sub_sched_items(schedule, start_idx): assert False -def get_insn_ids_for_block_at(schedule, start_idx): +def get_insn_ids_for_block_at(outline, start_idx): return frozenset( - sub_sched_item.insn_id - for i, sub_sched_item in generate_sub_sched_items( - schedule, start_idx) - if isinstance(sub_sched_item, RunInstruction)) + sub_outline_item.insn_id + for i, sub_outline_item in generate_sub_outline_items( + outline, start_idx) + if isinstance(sub_outline_item, RunInstruction)) -def find_active_inames_at(kernel, sched_index): +def find_active_inames_at(kernel, outline_index): active_inames = [] - from loopy.schedule import EnterLoop, LeaveLoop - for sched_item in kernel.schedule[:sched_index]: - if isinstance(sched_item, EnterLoop): - active_inames.append(sched_item.iname) - if isinstance(sched_item, LeaveLoop): + from loopy.outline import EnterLoop, LeaveLoop + for outline_item in kernel.outline[:outline_index]: + if isinstance(outline_item, EnterLoop): + active_inames.append(outline_item.iname) + if isinstance(outline_item, LeaveLoop): active_inames.pop() return set(active_inames) -def has_barrier_within(kernel, sched_index): - sched_item = kernel.schedule[sched_index] +def has_barrier_within(kernel, outline_index): + outline_item = kernel.outline[outline_index] - if isinstance(sched_item, BeginBlockItem): - loop_contents, _ = gather_schedule_block( - kernel.schedule, sched_index) + if isinstance(outline_item, BeginBlockItem): + loop_contents, _ = gather_outline_block( + kernel.outline, outline_index) from pytools import any - return any(isinstance(subsched_item, Barrier) - for subsched_item in loop_contents) - elif isinstance(sched_item, Barrier): + return any(isinstance(suboutline_item, Barrier) + for suboutline_item in loop_contents) + elif isinstance(outline_item, Barrier): return True else: return False -def find_used_inames_within(kernel, sched_index): - sched_item = kernel.schedule[sched_index] +def find_used_inames_within(kernel, outline_index): + outline_item = kernel.outline[outline_index] - if isinstance(sched_item, BeginBlockItem): - loop_contents, _ = gather_schedule_block( - kernel.schedule, sched_index) - run_insns = [subsched_item - for subsched_item in loop_contents - if isinstance(subsched_item, RunInstruction)] - elif isinstance(sched_item, RunInstruction): - run_insns = [sched_item] + if isinstance(outline_item, BeginBlockItem): + loop_contents, _ = gather_outline_block( + kernel.outline, outline_index) + run_insns = [suboutline_item + for suboutline_item in loop_contents + if isinstance(suboutline_item, RunInstruction)] + elif isinstance(outline_item, RunInstruction): + run_insns = [outline_item] else: return set() result = set() - for sched_item in run_insns: - result.update(kernel.insn_inames(sched_item.insn_id)) + for outline_item in run_insns: + result.update(kernel.insn_inames(outline_item.insn_id)) return result @@ -250,7 +250,7 @@ def find_loop_nest_around_map(kernel): if kernel.iname_tags_of_type(outer_iname, IlpBaseTag): # ILP tags are special because they are parallel tags # and therefore 'in principle' nest around everything. - # But they're realized by the scheduler as a loop + # But they're realized by the outliner as a loop # at the innermost level, so we'll cut them some # slack here. continue @@ -271,7 +271,7 @@ def find_loop_nest_around_map(kernel): def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): """Returns a dictionary mapping inames to other instruction ids that need to - be scheduled before the iname should be eligible for scheduling. + be outlined before the iname should be eligible for outlining. """ result = {} @@ -405,15 +405,15 @@ def get_priority_tiers(wanted, priorities): yield tier -def sched_item_to_insn_id(sched_item): +def outline_item_to_insn_id(outline_item): # Helper for use in generator expressions, i.e. - # (... for insn_id in sched_item_to_insn_id(item) ...) - if isinstance(sched_item, RunInstruction): - yield sched_item.insn_id - elif isinstance(sched_item, Barrier): - if (hasattr(sched_item, "originating_insn_id") - and sched_item.originating_insn_id is not None): - yield sched_item.originating_insn_id + # (... for insn_id in outline_item_to_insn_id(item) ...) + if isinstance(outline_item, RunInstruction): + yield outline_item.insn_id + elif isinstance(outline_item, Barrier): + if (hasattr(outline_item, "originating_insn_id") + and outline_item.originating_insn_id is not None): + yield outline_item.originating_insn_id # }}} @@ -456,38 +456,38 @@ def format_insn(kernel, insn_id): Fore.CYAN, str(insn), Style.RESET_ALL) -def dump_schedule(kernel, schedule): +def dump_outline(kernel, outline): lines = [] indent = "" from loopy.kernel.data import MultiAssignmentBase - for sched_item in schedule: - if isinstance(sched_item, EnterLoop): - lines.append(indent + "for %s" % sched_item.iname) + for outline_item in outline: + if isinstance(outline_item, EnterLoop): + lines.append(indent + "for %s" % outline_item.iname) indent += " " - elif isinstance(sched_item, LeaveLoop): + elif isinstance(outline_item, LeaveLoop): indent = indent[:-4] - lines.append(indent + "end %s" % sched_item.iname) - elif isinstance(sched_item, CallKernel): + lines.append(indent + "end %s" % outline_item.iname) + elif isinstance(outline_item, CallKernel): lines.append(indent + "CALL KERNEL %s(extra_args=%s, extra_inames=%s)" % ( - sched_item.kernel_name, - sched_item.extra_args, - sched_item.extra_inames)) + outline_item.kernel_name, + outline_item.extra_args, + outline_item.extra_inames)) indent += " " - elif isinstance(sched_item, ReturnFromKernel): + elif isinstance(outline_item, ReturnFromKernel): indent = indent[:-4] - lines.append(indent + "RETURN FROM KERNEL %s" % sched_item.kernel_name) - elif isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] + lines.append(indent + "RETURN FROM KERNEL %s" % outline_item.kernel_name) + elif isinstance(outline_item, RunInstruction): + insn = kernel.id_to_insn[outline_item.insn_id] if isinstance(insn, MultiAssignmentBase): - insn_str = format_insn(kernel, sched_item.insn_id) + insn_str = format_insn(kernel, outline_item.insn_id) else: - insn_str = sched_item.insn_id + insn_str = outline_item.insn_id lines.append(indent + insn_str) - elif isinstance(sched_item, Barrier): + elif isinstance(outline_item, Barrier): lines.append(indent + "... %sbarrier" % - sched_item.synchronization_kind[0]) + outline_item.synchronization_kind[0]) else: assert False @@ -496,9 +496,9 @@ def dump_schedule(kernel, schedule): for i, line in enumerate(lines)) -class ScheduleDebugger: +class OutlineDebugger: def __init__(self, debug_length=None, interactive=True): - self.longest_rejected_schedule = [] + self.longest_rejected_outline = [] self.success_counter = 0 self.dead_end_counter = 0 self.debug_length = debug_length @@ -515,27 +515,27 @@ class ScheduleDebugger: (self.success_counter + self.dead_end_counter) % 50 == 0 and self.elapsed_time() > 10 ): - sys.stdout.write("\rscheduling... %d successes, " + sys.stdout.write("\routlining... %d successes, " "%d dead ends (longest %d)" % ( self.success_counter, self.dead_end_counter, - len(self.longest_rejected_schedule))) + len(self.longest_rejected_outline))) sys.stdout.flush() self.wrote_status = 2 - def log_success(self, schedule): + def log_success(self, outline): self.success_counter += 1 self.update() - def log_dead_end(self, schedule): - if len(schedule) > len(self.longest_rejected_schedule): - self.longest_rejected_schedule = schedule + def log_dead_end(self, outline): + if len(outline) > len(self.longest_rejected_outline): + self.longest_rejected_outline = outline self.dead_end_counter += 1 self.update() - def done_scheduling(self): + def done_outlining(self): if self.wrote_status: - sys.stdout.write("\rscheduler finished"+40*" "+"\n") + sys.stdout.write("\routliner finished"+40*" "+"\n") sys.stdout.flush() def elapsed_time(self): @@ -555,15 +555,15 @@ class ScheduleDebugger: self.start_time = time() -class ScheduleDebugInput(Exception): +class OutlineDebugInput(Exception): pass # }}} -# {{{ scheduling algorithm +# {{{ outlining algorithm -class SchedulerState(ImmutableRecord): +class OutlinerState(ImmutableRecord): """ .. attribute:: kernel @@ -582,9 +582,9 @@ class SchedulerState(ImmutableRecord): .. attribute:: parallel_inames *Note:* ``ilp`` and ``vec`` are not 'parallel' for the purposes of the - scheduler. See :attr:`ilp_inames`, :attr:`vec_inames`. + outliner. See :attr:`ilp_inames`, :attr:`vec_inames`. - .. rubric:: Time-varying scheduler state + .. rubric:: Time-varying outliner state .. attribute:: active_inames @@ -598,33 +598,33 @@ class SchedulerState(ImmutableRecord): The inames of the last entered subkernel - .. attribute:: schedule + .. attribute:: outline - .. attribute:: scheduled_insn_ids + .. attribute:: outlined_insn_ids - .. attribute:: unscheduled_insn_ids + .. attribute:: unoutlined_insn_ids - .. attribute:: preschedule + .. attribute:: preoutline - A sequence of schedule items that must be inserted into the - schedule, maintaining the same relative ordering. Newly scheduled + A sequence of outline items that must be inserted into the + outline, maintaining the same relative ordering. Newly outlined items may interleave this sequence. - .. attribute:: prescheduled_insn_ids + .. attribute:: preoutlined_insn_ids - A :class:`frozenset` of any instruction that started prescheduled + A :class:`frozenset` of any instruction that started preoutlined - .. attribute:: prescheduled_inames + .. attribute:: preoutlined_inames - A :class:`frozenset` of any iname that started prescheduled + A :class:`frozenset` of any iname that started preoutlined - .. attribute:: may_schedule_global_barriers + .. attribute:: may_outline_global_barriers - Whether global barrier scheduling is allowed + Whether global barrier outlining is allowed .. attribute:: within_subkernel - Whether the scheduler is inside a subkernel + Whether the outliner is inside a subkernel .. attribute:: group_insn_counts @@ -634,7 +634,7 @@ class SchedulerState(ImmutableRecord): .. attribute:: active_group_counts A mapping from instruction group names to the number of instructions - in them that are left to schedule. If a group name occurs in this + in them that are left to outline. If a group name occurs in this mapping, that group is considered active. .. attribute:: uses_of_boostability @@ -651,11 +651,11 @@ class SchedulerState(ImmutableRecord): return None -def generate_loop_schedules_internal( - sched_state, allow_boost=False, debug=None): +def generate_loop_outlines_internal( + outline_state, allow_boost=False, debug=None): # allow_insn is set to False initially and after entering each loop # to give loops containing high-priority instructions a chance. - kernel = sched_state.kernel + kernel = outline_state.kernel Fore = kernel.options._fore # noqa Style = kernel.options._style # noqa @@ -664,11 +664,11 @@ def generate_loop_schedules_internal( else: rec_allow_boost = False - active_inames_set = frozenset(sched_state.active_inames) + active_inames_set = frozenset(outline_state.active_inames) - next_preschedule_item = ( - sched_state.preschedule[0] - if len(sched_state.preschedule) > 0 + next_preoutline_item = ( + outline_state.preoutline[0] + if len(outline_state.preoutline) > 0 else None) # {{{ decide about debug mode @@ -677,7 +677,7 @@ def generate_loop_schedules_internal( if debug is not None: if (debug.debug_length is not None - and len(sched_state.schedule) >= debug.debug_length): + and len(outline_state.outline) >= debug.debug_length): debug_mode = True if debug_mode: @@ -687,124 +687,127 @@ def generate_loop_schedules_internal( print("KERNEL:") print(kernel.stringify(with_dependencies=True)) print(75*"=") - print("CURRENT SCHEDULE:") - print(dump_schedule(sched_state.kernel, sched_state.schedule)) - if sched_state.preschedule: + print("CURRENT OUTLINE:") + print(dump_outline(outline_state.kernel, outline_state.outline)) + if outline_state.preoutline: print(75*"=") - print("PRESCHEDULED ITEMS AWAITING SCHEDULING:") - print(dump_schedule(sched_state.kernel, sched_state.preschedule)) + print("PREOUTLINED ITEMS AWAITING OUTLINING:") + print(dump_outline(outline_state.kernel, outline_state.preoutline)) #print("boost allowed:", allow_boost) print(75*"=") print("LOOP NEST MAP (inner: outer):") - for iname, val in six.iteritems(sched_state.loop_nest_around_map): + for iname, val in six.iteritems(outline_state.loop_nest_around_map): print("%s : %s" % (iname, ", ".join(val))) print(75*"=") - if debug.debug_length == len(debug.longest_rejected_schedule): - print("WHY IS THIS A DEAD-END SCHEDULE?") + if debug.debug_length == len(debug.longest_rejected_outline): + print("WHY IS THIS A DEAD-END OUTLINE?") - #if len(schedule) == 2: + #if len(outline) == 2: #from pudb import set_trace; set_trace() # }}} - # {{{ see if we have reached the start/end of kernel in the preschedule + # {{{ see if we have reached the start/end of kernel in the preoutline - if isinstance(next_preschedule_item, CallKernel): - assert sched_state.within_subkernel is False - for result in generate_loop_schedules_internal( - sched_state.copy( - schedule=sched_state.schedule + (next_preschedule_item,), - preschedule=sched_state.preschedule[1:], + if isinstance(next_preoutline_item, CallKernel): + assert outline_state.within_subkernel is False + for result in generate_loop_outlines_internal( + outline_state.copy( + outline=outline_state.outline + ( + next_preoutline_item,), + preoutline=outline_state.preoutline[1:], within_subkernel=True, - may_schedule_global_barriers=False, - enclosing_subkernel_inames=sched_state.active_inames), + may_outline_global_barriers=False, + enclosing_subkernel_inames=outline_state.active_inames), allow_boost=rec_allow_boost, debug=debug): yield result - if isinstance(next_preschedule_item, ReturnFromKernel): - assert sched_state.within_subkernel is True + if isinstance(next_preoutline_item, ReturnFromKernel): + assert outline_state.within_subkernel is True # Make sure all subkernel inames have finished. - if sched_state.active_inames == sched_state.enclosing_subkernel_inames: - for result in generate_loop_schedules_internal( - sched_state.copy( - schedule=sched_state.schedule + (next_preschedule_item,), - preschedule=sched_state.preschedule[1:], + if outline_state.active_inames == outline_state.enclosing_subkernel_inames: + for result in generate_loop_outlines_internal( + outline_state.copy( + outline=outline_state.outline + ( + next_preoutline_item,), + preoutline=outline_state.preoutline[1:], within_subkernel=False, - may_schedule_global_barriers=True), + may_outline_global_barriers=True), allow_boost=rec_allow_boost, debug=debug): yield result # }}} - # {{{ see if there are pending barriers in the preschedule + # {{{ see if there are pending barriers in the preoutline # Barriers that do not have an originating instruction are handled here. # (These are automatically inserted by insert_barriers().) Barriers with # originating instructions are handled as part of normal instruction - # scheduling below. + # outlining below. if ( - isinstance(next_preschedule_item, Barrier) - and next_preschedule_item.originating_insn_id is None): - for result in generate_loop_schedules_internal( - sched_state.copy( - schedule=sched_state.schedule + (next_preschedule_item,), - preschedule=sched_state.preschedule[1:]), + isinstance(next_preoutline_item, Barrier) + and next_preoutline_item.originating_insn_id is None): + for result in generate_loop_outlines_internal( + outline_state.copy( + outline=outline_state.outline + ( + next_preoutline_item,), + preoutline=outline_state.preoutline[1:]), allow_boost=rec_allow_boost, debug=debug): yield result # }}} - # {{{ see if any insns are ready to be scheduled now + # {{{ see if any insns are ready to be outlined now - # Also take note of insns that have a chance of being schedulable inside + # Also take note of insns that have a chance of being outlinable inside # the current loop nest, in this set: reachable_insn_ids = set() - active_groups = frozenset(sched_state.active_group_counts) + active_groups = frozenset(outline_state.active_group_counts) def insn_sort_key(insn_id): insn = kernel.id_to_insn[insn_id] # Sort by insn.id as a last criterion to achieve deterministic - # schedule generation order. + # outline generation order. return (insn.priority, len(active_groups & insn.groups), insn.id) # Use previous instruction sorting result if it is available - if sched_state.insn_ids_to_try is None: + if outline_state.insn_ids_to_try is None: insn_ids_to_try = sorted( - # Non-prescheduled instructions go first. - sched_state.unscheduled_insn_ids - sched_state.prescheduled_insn_ids, - key=insn_sort_key, reverse=True) + # Non-preoutlined instructions go first. + outline_state.unoutlined_insn_ids - outline_state.preoutlined_insn_ids, + key=insn_sort_key, reverse=True) else: - insn_ids_to_try = sched_state.insn_ids_to_try + insn_ids_to_try = outline_state.insn_ids_to_try insn_ids_to_try.extend( insn_id - for item in sched_state.preschedule - for insn_id in sched_item_to_insn_id(item)) + for item in outline_state.preoutline + for insn_id in outline_item_to_insn_id(item)) for insn_id in insn_ids_to_try: insn = kernel.id_to_insn[insn_id] - is_ready = insn.depends_on <= sched_state.scheduled_insn_ids + is_ready = insn.depends_on <= outline_state.outlined_insn_ids if not is_ready: if debug_mode: - # These are not that interesting when understanding scheduler + # These are not that interesting when understanding outliner # failures. # print("instruction '%s' is missing insn depedencies '%s'" % ( # format_insn(kernel, insn.id), ",".join( - # insn.depends_on - sched_state.scheduled_insn_ids))) + # insn.depends_on - outline_state.outlined_insn_ids))) pass continue - want = kernel.insn_inames(insn) - sched_state.parallel_inames - have = active_inames_set - sched_state.parallel_inames + want = kernel.insn_inames(insn) - outline_state.parallel_inames + have = active_inames_set - outline_state.parallel_inames # If insn is boostable, it may be placed inside a more deeply # nested loop without harm. @@ -826,39 +829,39 @@ def generate_loop_schedules_internal( print("instruction '%s' won't work under inames '%s'" % (format_insn(kernel, insn.id), ",".join(have-want))) - # {{{ check if scheduling this insn is compatible with preschedule + # {{{ check if outlining this insn is compatible with preoutline - if insn_id in sched_state.prescheduled_insn_ids: - if isinstance(next_preschedule_item, RunInstruction): - next_preschedule_insn_id = next_preschedule_item.insn_id - elif isinstance(next_preschedule_item, Barrier): - assert next_preschedule_item.originating_insn_id is not None - next_preschedule_insn_id = next_preschedule_item.originating_insn_id + if insn_id in outline_state.preoutlined_insn_ids: + if isinstance(next_preoutline_item, RunInstruction): + next_preoutline_insn_id = next_preoutline_item.insn_id + elif isinstance(next_preoutline_item, Barrier): + assert next_preoutline_item.originating_insn_id is not None + next_preoutline_insn_id = next_preoutline_item.originating_insn_id else: - next_preschedule_insn_id = None + next_preoutline_insn_id = None - if next_preschedule_insn_id != insn_id: + if next_preoutline_insn_id != insn_id: if debug_mode: - print("can't schedule '%s' because another preschedule " + print("can't outline '%s' because another preoutline " "instruction precedes it" % format_insn(kernel, insn.id)) is_ready = False # }}} - # {{{ check if scheduler state allows insn scheduling + # {{{ check if outliner state allows insn outlining from loopy.kernel.instruction import BarrierInstruction if isinstance(insn, BarrierInstruction) and \ insn.synchronization_kind == "global": - if not sched_state.may_schedule_global_barriers: + if not outline_state.may_outline_global_barriers: if debug_mode: - print("can't schedule '%s' because global barriers are " + print("can't outline '%s' because global barriers are " "not currently allowed" % format_insn(kernel, insn.id)) is_ready = False else: - if not sched_state.within_subkernel: + if not outline_state.within_subkernel: if debug_mode: - print("can't schedule '%s' because not within subkernel" + print("can't outline '%s' because not within subkernel" % format_insn(kernel, insn.id)) is_ready = False @@ -884,7 +887,7 @@ def generate_loop_schedules_internal( # }}} if is_ready and debug_mode: - print("ready to schedule '%s'" % format_insn(kernel, insn.id)) + print("ready to outline '%s'" % format_insn(kernel, insn.id)) if is_ready and not debug_mode: iid_set = frozenset([insn.id]) @@ -892,7 +895,7 @@ def generate_loop_schedules_internal( # {{{ update active group counts for added instruction if insn.groups: - new_active_group_counts = sched_state.active_group_counts.copy() + new_active_group_counts = outline_state.active_group_counts.copy() for grp in insn.groups: if grp in new_active_group_counts: @@ -902,9 +905,9 @@ def generate_loop_schedules_internal( else: new_active_group_counts[grp] = ( - sched_state.group_insn_counts[grp] - 1) + outline_state.group_insn_counts[grp] - 1) else: - new_active_group_counts = sched_state.active_group_counts + new_active_group_counts = outline_state.active_group_counts # }}} @@ -915,7 +918,7 @@ def generate_loop_schedules_internal( # invalidate instruction_ids_to_try when active group changes if set(new_active_group_counts.keys()) != set( - sched_state.active_group_counts.keys()): + outline_state.active_group_counts.keys()): new_insn_ids_to_try = None # }}} @@ -926,32 +929,33 @@ def generate_loop_schedules_internal( new_uses_of_boostability.append( (insn.id, orig_have & insn.boostable_into)) - new_sched_state = sched_state.copy( - scheduled_insn_ids=sched_state.scheduled_insn_ids | iid_set, - unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set, + new_outline_state = outline_state.copy( + outlined_insn_ids=outline_state.outlined_insn_ids | iid_set, + unoutlined_insn_ids=outline_state.unoutlined_insn_ids - iid_set, insn_ids_to_try=new_insn_ids_to_try, - schedule=( - sched_state.schedule + (RunInstruction(insn_id=insn.id),)), - preschedule=( - sched_state.preschedule - if insn_id not in sched_state.prescheduled_insn_ids - else sched_state.preschedule[1:]), + outline=( + outline_state.outline + ( + RunInstruction(insn_id=insn.id),)), + preoutline=( + outline_state.preoutline + if insn_id not in outline_state.preoutlined_insn_ids + else outline_state.preoutline[1:]), active_group_counts=new_active_group_counts, uses_of_boostability=( - sched_state.uses_of_boostability + outline_state.uses_of_boostability + new_uses_of_boostability) ) # Don't be eager about entering/leaving loops--if progress has been - # made, revert to top of scheduler and see if more progress can be + # made, revert to top of outliner and see if more progress can be # made. - for sub_sched in generate_loop_schedules_internal( - new_sched_state, + for sub_outline in generate_loop_outlines_internal( + new_outline_state, allow_boost=rec_allow_boost, debug=debug): - yield sub_sched + yield sub_outline - if not sched_state.group_insn_counts: - # No groups: We won't need to backtrack on scheduling + if not outline_state.group_insn_counts: + # No groups: We won't need to backtrack on outlining # instructions. return @@ -959,26 +963,26 @@ def generate_loop_schedules_internal( # {{{ see if we're ready to leave the innermost loop - last_entered_loop = sched_state.last_entered_loop + last_entered_loop = outline_state.last_entered_loop if last_entered_loop is not None: can_leave = True if ( - last_entered_loop in sched_state.prescheduled_inames + last_entered_loop in outline_state.preoutlined_inames and not ( - isinstance(next_preschedule_item, LeaveLoop) - and next_preschedule_item.iname == last_entered_loop)): - # A prescheduled loop can only be left if the preschedule agrees. + isinstance(next_preoutline_item, LeaveLoop) + and next_preoutline_item.iname == last_entered_loop)): + # A preoutlined loop can only be left if the preoutline agrees. if debug_mode: - print("cannot leave '%s' because of preschedule constraints" + print("cannot leave '%s' because of preoutline constraints" % last_entered_loop) can_leave = False - elif last_entered_loop not in sched_state.breakable_inames: + elif last_entered_loop not in outline_state.breakable_inames: # If the iname is not breakable, then check that we've - # scheduled all the instructions that require it. + # outlined all the instructions that require it. - for insn_id in sched_state.unscheduled_insn_ids: + for insn_id in outline_state.unoutlined_insn_ids: insn = kernel.id_to_insn[insn_id] if last_entered_loop in kernel.insn_inames(insn): if debug_mode: @@ -988,16 +992,16 @@ def generate_loop_schedules_internal( # check if there's a dependency of insn that needs to be # outside of last_entered_loop. for subdep_id in gen_dependencies_except(kernel, insn_id, - sched_state.scheduled_insn_ids): + outline_state.outlined_insn_ids): subdep = kernel.id_to_insn[insn_id] want = (kernel.insn_inames(subdep_id) - - sched_state.parallel_inames) + - outline_state.parallel_inames) if ( last_entered_loop not in want and last_entered_loop not in subdep.boostable_into): print( "%(warn)swarning:%(reset_all)s '%(iname)s', " - "which the schedule is " + "which the outline is " "currently stuck inside of, seems mis-nested. " "'%(subdep)s' must occur " "before '%(dep)s', " "but '%(subdep)s must be outside " @@ -1021,41 +1025,41 @@ def generate_loop_schedules_internal( if can_leave: can_leave = False - # We may only leave this loop if we've scheduled an instruction + # We may only leave this loop if we've outlined an instruction # since entering it. seen_an_insn = False ignore_count = 0 - for sched_item in sched_state.schedule[::-1]: - if isinstance(sched_item, RunInstruction): + for outline_item in outline_state.outline[::-1]: + if isinstance(outline_item, RunInstruction): seen_an_insn = True - elif isinstance(sched_item, LeaveLoop): + elif isinstance(outline_item, LeaveLoop): ignore_count += 1 - elif isinstance(sched_item, EnterLoop): + elif isinstance(outline_item, EnterLoop): if ignore_count: ignore_count -= 1 else: - assert sched_item.iname == last_entered_loop + assert outline_item.iname == last_entered_loop if seen_an_insn: can_leave = True break if can_leave and not debug_mode: - for sub_sched in generate_loop_schedules_internal( - sched_state.copy( - schedule=( - sched_state.schedule + for sub_outline in generate_loop_outlines_internal( + outline_state.copy( + outline=( + outline_state.outline + (LeaveLoop(iname=last_entered_loop),)), - active_inames=sched_state.active_inames[:-1], - preschedule=( - sched_state.preschedule + active_inames=outline_state.active_inames[:-1], + preoutline=( + outline_state.preoutline if last_entered_loop - not in sched_state.prescheduled_inames - else sched_state.preschedule[1:]), + not in outline_state.preoutlined_inames + else outline_state.preoutline[1:]), ), allow_boost=rec_allow_boost, debug=debug): - yield sub_sched + yield sub_outline return @@ -1063,14 +1067,14 @@ def generate_loop_schedules_internal( # {{{ see if any loop can be entered now - # Find inames that are being referenced by as yet unscheduled instructions. + # Find inames that are being referenced by as yet unoutlined instructions. needed_inames = set() - for insn_id in sched_state.unscheduled_insn_ids: + for insn_id in outline_state.unoutlined_insn_ids: needed_inames.update(kernel.insn_inames(insn_id)) needed_inames = (needed_inames # There's no notion of 'entering' a parallel loop - - sched_state.parallel_inames + - outline_state.parallel_inames # Don't reenter a loop we're already in. - active_inames_set) @@ -1078,12 +1082,12 @@ def generate_loop_schedules_internal( if debug_mode: print(75*"-") print("inames still needed :", ",".join(needed_inames)) - print("active inames :", ",".join(sched_state.active_inames)) - print("inames entered so far :", ",".join(sched_state.entered_inames)) + print("active inames :", ",".join(outline_state.active_inames)) + print("inames entered so far :", ",".join(outline_state.entered_inames)) print("reachable insns:", ",".join(reachable_insn_ids)) print("active groups (with insn counts):", ",".join( "%s: %d" % (grp, c) - for grp, c in six.iteritems(sched_state.active_group_counts))) + for grp, c in six.iteritems(outline_state.active_group_counts))) print(75*"-") if needed_inames: @@ -1091,40 +1095,40 @@ def generate_loop_schedules_internal( for iname in needed_inames: - # {{{ check if scheduling this iname now is allowed/plausible + # {{{ check if outlining this iname now is allowed/plausible if ( - iname in sched_state.prescheduled_inames + iname in outline_state.preoutlined_inames and not ( - isinstance(next_preschedule_item, EnterLoop) - and next_preschedule_item.iname == iname)): + isinstance(next_preoutline_item, EnterLoop) + and next_preoutline_item.iname == iname)): if debug_mode: - print("scheduling %s prohibited by preschedule constraints" + print("outlining %s prohibited by preoutline constraints" % iname) continue currently_accessible_inames = ( - active_inames_set | sched_state.parallel_inames) + active_inames_set | outline_state.parallel_inames) if ( - not sched_state.loop_nest_around_map[iname] + not outline_state.loop_nest_around_map[iname] <= currently_accessible_inames): if debug_mode: - print("scheduling %s prohibited by loop nest-around map" % iname) + print("outlining %s prohibited by loop nest-around map" % iname) continue if ( - not sched_state.loop_insn_dep_map.get(iname, set()) - <= sched_state.scheduled_insn_ids): + not outline_state.loop_insn_dep_map.get(iname, set()) + <= outline_state.outlined_insn_ids): if debug_mode: print( - "scheduling {iname} prohibited by loop dependency map " + "outlining {iname} prohibited by loop dependency map " "(needs '{needed_insns})'" .format( iname=iname, needed_insns=", ".join( - sched_state.loop_insn_dep_map.get(iname, set()) + outline_state.loop_insn_dep_map.get(iname, set()) - - sched_state.scheduled_insn_ids))) + outline_state.outlined_insn_ids))) continue @@ -1140,7 +1144,7 @@ def generate_loop_schedules_internal( <= currently_accessible_inames) # Check if any parameters are temporary variables, and if so, if their - # writes have already been scheduled. + # writes have already been outlined. data_dep_written = True for domain_par in ( @@ -1148,10 +1152,10 @@ def generate_loop_schedules_internal( & set(kernel.temporary_variables)): writer_insn, = kernel.writer_map()[domain_par] - if writer_insn not in sched_state.scheduled_insn_ids: + if writer_insn not in outline_state.outlined_insn_ids: data_dep_written = False if debug_mode: - print("iname '%s' not scheduled because domain " + print("iname '%s' not outlined because domain " "parameter '%s' is not yet available" % (iname, domain_par)) break @@ -1161,7 +1165,8 @@ def generate_loop_schedules_internal( # }}} - # {{{ determine if that gets us closer to being able to schedule an insn + # {{{ determine if that gets us closer to being able to add an insn to + # outline usefulness = None # highest insn priority enabled by iname @@ -1188,23 +1193,23 @@ def generate_loop_schedules_internal( # {{{ tier building - # Build priority tiers. If a schedule is found in the first tier, then + # Build priority tiers. If an outline is found in the first tier, then # loops in the second are not even tried (and so on). loop_priority_set = set().union(*[set(prio) for prio in - sched_state.kernel.loop_priority]) + outline_state.kernel.loop_priority]) useful_loops_set = set(six.iterkeys(iname_to_usefulness)) useful_and_desired = useful_loops_set & loop_priority_set if useful_and_desired: wanted = ( useful_and_desired - - sched_state.ilp_inames - - sched_state.vec_inames + - outline_state.ilp_inames + - outline_state.vec_inames ) priority_tiers = [t for t in get_priority_tiers(wanted, - sched_state.kernel.loop_priority + outline_state.kernel.loop_priority ) ] @@ -1215,26 +1220,26 @@ def generate_loop_schedules_internal( priority_tiers.append( useful_loops_set - loop_priority_set - - sched_state.ilp_inames - - sched_state.vec_inames + - outline_state.ilp_inames + - outline_state.vec_inames ) else: priority_tiers = [ useful_loops_set - - sched_state.ilp_inames - - sched_state.vec_inames + - outline_state.ilp_inames + - outline_state.vec_inames ] # vectorization must be the absolute innermost loop priority_tiers.extend([ [iname] - for iname in sched_state.ilp_inames + for iname in outline_state.ilp_inames if iname in useful_loops_set ]) priority_tiers.extend([ [iname] - for iname in sched_state.vec_inames + for iname in outline_state.vec_inames if iname in useful_loops_set ]) @@ -1244,57 +1249,57 @@ def generate_loop_schedules_internal( print("useful inames: %s" % ",".join(useful_loops_set)) else: for tier in priority_tiers: - found_viable_schedule = False + found_viable_outline = False for iname in sorted(tier, key=lambda iname: ( iname_to_usefulness.get(iname, 0), # Sort by iname to achieve deterministic - # ordering of generated schedules. + # ordering of generated outlines. iname), reverse=True): - for sub_sched in generate_loop_schedules_internal( - sched_state.copy( - schedule=( - sched_state.schedule + for sub_outline in generate_loop_outlines_internal( + outline_state.copy( + outline=( + outline_state.outline + (EnterLoop(iname=iname),)), active_inames=( - sched_state.active_inames + (iname,)), + outline_state.active_inames + (iname,)), entered_inames=( - sched_state.entered_inames + outline_state.entered_inames | frozenset((iname,))), - preschedule=( - sched_state.preschedule - if iname not in sched_state.prescheduled_inames - else sched_state.preschedule[1:]), + preoutline=( + outline_state.preoutline + if iname not in outline_state.preoutlined_inames + else outline_state.preoutline[1:]), ), allow_boost=rec_allow_boost, debug=debug): - found_viable_schedule = True - yield sub_sched + found_viable_outline = True + yield sub_outline - if found_viable_schedule: + if found_viable_outline: return # }}} if debug_mode: print(75*"=") - inp = six.moves.input("Hit Enter for next schedule, " - "or enter a number to examine schedules of a " + inp = six.moves.input("Hit Enter for next outline, " + "or enter a number to examine outlines of a " "different length:") if inp: - raise ScheduleDebugInput(inp) + raise OutlineDebugInput(inp) if ( - not sched_state.active_inames - and not sched_state.unscheduled_insn_ids - and not sched_state.preschedule): + not outline_state.active_inames + and not outline_state.unoutlined_insn_ids + and not outline_state.preoutline): # if done, yield result - debug.log_success(sched_state.schedule) + debug.log_success(outline_state.outline) - for boost_insn_id, boost_inames in sched_state.uses_of_boostability: + for boost_insn_id, boost_inames in outline_state.uses_of_boostability: warn_with_kernel( kernel, "used_boostability", "instruction '%s' was implicitly nested inside " @@ -1303,32 +1308,32 @@ def generate_loop_schedules_internal( % (boost_insn_id, ", ".join(boost_inames)), DeprecationWarning) - yield sched_state.schedule + yield outline_state.outline else: if not allow_boost and allow_boost is not None: # try again with boosting allowed - for sub_sched in generate_loop_schedules_internal( - sched_state, + for sub_outline in generate_loop_outlines_internal( + outline_state, allow_boost=True, debug=debug): - yield sub_sched + yield sub_outline else: # dead end if debug is not None: - debug.log_dead_end(sched_state.schedule) + debug.log_dead_end(outline_state.outline) # }}} # {{{ convert barrier instructions to proper barriers -def convert_barrier_instructions_to_barriers(kernel, schedule): +def convert_barrier_instructions_to_barriers(kernel, outline): from loopy.kernel.instruction import BarrierInstruction result = [] - for sched_item in schedule: - if isinstance(sched_item, RunInstruction): - insn = kernel.id_to_insn[sched_item.insn_id] + for outline_item in outline: + if isinstance(outline_item, RunInstruction): + insn = kernel.id_to_insn[outline_item.insn_id] if isinstance(insn, BarrierInstruction): result.append(Barrier( synchronization_kind=insn.synchronization_kind, @@ -1337,7 +1342,7 @@ def convert_barrier_instructions_to_barriers(kernel, schedule): comment="Barrier inserted due to %s" % insn.id)) continue - result.append(sched_item) + result.append(outline_item) return result @@ -1585,17 +1590,17 @@ def barrier_kind_more_or_equally_global(kind1, kind2): return (kind1 == kind2) or (kind1 == "global" and kind2 == "local") -def insn_ids_reaching_end_without_intervening_barrier(schedule, kind): - return _insn_ids_reaching_end(schedule, kind, reverse=False) +def insn_ids_reaching_end_without_intervening_barrier(outline, kind): + return _insn_ids_reaching_end(outline, kind, reverse=False) -def insn_ids_reachable_from_start_without_intervening_barrier(schedule, kind): - return _insn_ids_reaching_end(schedule, kind, reverse=True) +def insn_ids_reachable_from_start_without_intervening_barrier(outline, kind): + return _insn_ids_reaching_end(outline, kind, reverse=True) -def _insn_ids_reaching_end(schedule, kind, reverse): +def _insn_ids_reaching_end(outline, kind, reverse): if reverse: - schedule = reversed(schedule) + outline = reversed(outline) enter_scope_item_kind = LeaveLoop leave_scope_item_kind = EnterLoop else: @@ -1604,10 +1609,10 @@ def _insn_ids_reaching_end(schedule, kind, reverse): insn_ids_alive_at_scope = [set()] - for sched_item in schedule: - if isinstance(sched_item, enter_scope_item_kind): + for outline_item in outline: + if isinstance(outline_item, enter_scope_item_kind): insn_ids_alive_at_scope.append(set()) - elif isinstance(sched_item, leave_scope_item_kind): + elif isinstance(outline_item, leave_scope_item_kind): innermost_scope = insn_ids_alive_at_scope.pop() # Instructions in deeper scopes are alive but could be killed by # barriers at a shallower level, e.g.: @@ -1619,7 +1624,7 @@ def _insn_ids_reaching_end(schedule, kind, reverse): # # Hence we merge this scope into the parent scope. insn_ids_alive_at_scope[-1].update(innermost_scope) - elif isinstance(sched_item, Barrier): + elif isinstance(outline_item, Barrier): # This barrier kills only the instruction ids that are alive at # the current scope (or deeper). Without further analysis, we # can't assume that instructions at shallower scope can be @@ -1634,17 +1639,17 @@ def _insn_ids_reaching_end(schedule, kind, reverse): # barrier() # end if barrier_kind_more_or_equally_global( - sched_item.synchronization_kind, kind): + outline_item.synchronization_kind, kind): insn_ids_alive_at_scope[-1].clear() else: insn_ids_alive_at_scope[-1] |= set( - insn_id for insn_id in sched_item_to_insn_id(sched_item)) + insn_id for insn_id in outline_item_to_insn_id(outline_item)) assert len(insn_ids_alive_at_scope) == 1 return insn_ids_alive_at_scope[-1] -def append_barrier_or_raise_error(schedule, dep, verify_only): +def append_barrier_or_raise_error(outline, dep, verify_only): if verify_only: from loopy.diagnostic import MissingBarrierError raise MissingBarrierError( @@ -1662,14 +1667,15 @@ def append_barrier_or_raise_error(schedule, dep, verify_only): comment = "for %s (%s)" % ( dep.variable, dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id)) - schedule.append(Barrier( + outline.append(Barrier( comment=comment, synchronization_kind=dep.var_kind, mem_kind=dep.var_kind, originating_insn_id=None)) -def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0): +def insert_barriers( + kernel, outline, synchronization_kind, verify_only, level=0): """ :arg synchronization_kind: "local" or "global". The :attr:`Barrier.synchronization_kind` to be inserted. Generally, this @@ -1680,28 +1686,28 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 :arg level: the current level of loop nesting, 0 for outermost. """ - # {{{ insert barriers at outermost scheduling level + # {{{ insert barriers at outermost outlining level - def insert_barriers_at_outer_level(schedule, reverse=False): + def insert_barriers_at_outer_level(outline, reverse=False): dep_tracker = DependencyTracker(kernel, var_kind=synchronization_kind, reverse=reverse) if reverse: # Populate the dependency tracker with sources from the tail end of - # the schedule block. + # the outline block. for insn_id in ( insn_ids_reaching_end_without_intervening_barrier( - schedule, synchronization_kind)): + outline, synchronization_kind)): dep_tracker.add_source(insn_id) result = [] i = 0 - while i < len(schedule): - sched_item = schedule[i] + while i < len(outline): + outline_item = outline[i] - if isinstance(sched_item, EnterLoop): - subloop, new_i = gather_schedule_block(schedule, i) + if isinstance(outline_item, EnterLoop): + subloop, new_i = gather_outline_block(outline, i) loop_head = ( insn_ids_reachable_from_start_without_intervening_barrier( @@ -1746,30 +1752,30 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 i = new_i - elif isinstance(sched_item, Barrier): - result.append(sched_item) + elif isinstance(outline_item, Barrier): + result.append(outline_item) if barrier_kind_more_or_equally_global( - sched_item.synchronization_kind, synchronization_kind): + outline_item.synchronization_kind, synchronization_kind): dep_tracker.discard_all_sources() i += 1 - elif isinstance(sched_item, RunInstruction): + elif isinstance(outline_item, RunInstruction): for dep in dep_tracker.gen_dependencies_with_target_at( - sched_item.insn_id): + outline_item.insn_id): append_barrier_or_raise_error(result, dep, verify_only) dep_tracker.discard_all_sources() break - result.append(sched_item) - dep_tracker.add_source(sched_item.insn_id) + result.append(outline_item) + dep_tracker.add_source(outline_item.insn_id) i += 1 - elif isinstance(sched_item, (CallKernel, ReturnFromKernel)): - result.append(sched_item) + elif isinstance(outline_item, (CallKernel, ReturnFromKernel)): + result.append(outline_item) i += 1 else: - raise ValueError("unexpected schedule item type '%s'" - % type(sched_item).__name__) + raise ValueError("unexpected outline item type '%s'" + % type(outline_item).__name__) return result @@ -1779,11 +1785,11 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 result = [] i = 0 - while i < len(schedule): - sched_item = schedule[i] + while i < len(outline): + outline_item = outline[i] - if isinstance(sched_item, EnterLoop): - subloop, new_i = gather_schedule_block(schedule, i) + if isinstance(outline_item, EnterLoop): + subloop, new_i = gather_outline_block(outline, i) new_subloop = insert_barriers( kernel, subloop[1:-1], synchronization_kind, verify_only, level + 1) @@ -1792,14 +1798,14 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 result.append(subloop[-1]) i = new_i - elif isinstance(sched_item, + elif isinstance(outline_item, (Barrier, RunInstruction, CallKernel, ReturnFromKernel)): - result.append(sched_item) + result.append(outline_item) i += 1 else: - raise ValueError("unexpected schedule item type '%s'" - % type(sched_item).__name__) + raise ValueError("unexpected outline item type '%s'" + % type(outline_item).__name__) # }}} @@ -1814,55 +1820,56 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 # }}} -class MinRecursionLimitForScheduling(MinRecursionLimit): +class MinRecursionLimitForOutlining(MinRecursionLimit): def __init__(self, kernel): MinRecursionLimit.__init__(self, len(kernel.instructions) * 2 + len(kernel.all_inames()) * 4) -# {{{ main scheduling entrypoint +# {{{ main outlining entrypoint -def generate_loop_schedules(kernel, debug_args={}): +def generate_loop_outlines(kernel, debug_args={}): """ .. warning:: This function needs to be called inside (another layer) of a - :class:`MinRecursionLimitForScheduling` context manager, and the + :class:`MinRecursionLimitForOutlining` context manager, and the context manager needs to end *after* the last reference to the generators has gone out of scope. Otherwise, the high-recursion-limit generator chain may not be successfully garbage-collected and cause an internal error in the Python runtime. """ - with MinRecursionLimitForScheduling(kernel): - for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args): - yield sched + with MinRecursionLimitForOutlining(kernel): + for outline in generate_loop_outlines_inner(kernel, debug_args=debug_args): + yield outline -def generate_loop_schedules_inner(kernel, debug_args={}): +def generate_loop_outlines_inner(kernel, debug_args={}): from loopy.kernel import KernelState - if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): - raise LoopyError("cannot schedule a kernel that has not been " + if kernel.state not in (KernelState.PREPROCESSED, KernelState.OUTLINED): + raise LoopyError("cannot outline a kernel that has not been " "preprocessed") - from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel) + from loopy.check import pre_outline_checks + pre_outline_checks(kernel) - schedule_count = 0 + outline_count = 0 - debug = ScheduleDebugger(**debug_args) + debug = OutlineDebugger(**debug_args) - preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else () + preoutline = kernel.outline if ( + kernel.state == KernelState.OUTLINED) else () - prescheduled_inames = set( + preoutlined_inames = set( insn.iname - for insn in preschedule + for insn in preoutline if isinstance(insn, EnterLoop)) - prescheduled_insn_ids = set( + preoutlined_insn_ids = set( insn_id - for item in preschedule - for insn_id in sched_item_to_insn_id(item)) + for item in preoutline + for insn_id in outline_item_to_insn_id(item)) from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag, filter_iname_tags_by_type) @@ -1881,7 +1888,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): loop_nest_with_map = find_loop_nest_with_map(kernel) loop_nest_around_map = find_loop_nest_around_map(kernel) - sched_state = SchedulerState( + outline_state = OutlinerState( kernel=kernel, loop_nest_around_map=loop_nest_around_map, loop_insn_dep_map=find_loop_insn_dep_map( @@ -1892,25 +1899,25 @@ def generate_loop_schedules_inner(kernel, debug_args={}): ilp_inames=ilp_inames, vec_inames=vec_inames, - prescheduled_inames=prescheduled_inames, - prescheduled_insn_ids=prescheduled_insn_ids, + preoutlined_inames=preoutlined_inames, + preoutlined_insn_ids=preoutlined_insn_ids, # time-varying part active_inames=(), entered_inames=frozenset(), enclosing_subkernel_inames=(), - schedule=(), + outline=(), - unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), - scheduled_insn_ids=frozenset(), - within_subkernel=kernel.state != KernelState.SCHEDULED, - may_schedule_global_barriers=True, + unoutlined_insn_ids=set(insn.id for insn in kernel.instructions), + outlined_insn_ids=frozenset(), + within_subkernel=kernel.state != KernelState.OUTLINED, + may_outline_global_barriers=True, - preschedule=preschedule, + preoutline=preoutline, insn_ids_to_try=None, - # ilp and vec are not parallel for the purposes of the scheduler + # ilp and vec are not parallel for the purposes of the outliner parallel_inames=parallel_inames - ilp_inames - vec_inames, group_insn_counts=group_insn_counts(kernel), @@ -1918,138 +1925,147 @@ def generate_loop_schedules_inner(kernel, debug_args={}): uses_of_boostability=[]) - schedule_gen_kwargs = {} + outline_gen_kwargs = {} if kernel.options.ignore_boostable_into: - schedule_gen_kwargs["allow_boost"] = None + outline_gen_kwargs["allow_boost"] = None def print_longest_dead_end(): if debug.interactive: - print("Loo.py will now show you the scheduler state at the point") - print("where the longest (dead-end) schedule was generated, in the") + print("Loo.py will now show you the outliner state at the point") + print("where the longest (dead-end) outline was generated, in the") print("the hope that some of this makes sense and helps you find") print("the issue.") print() print("To disable this interactive behavior, pass") print(" debug_args=dict(interactive=False)") - print("to generate_loop_schedules().") + print("to generate_loop_outlines().") print(75*"-") six.moves.input("Enter:") print() print() - debug.debug_length = len(debug.longest_rejected_schedule) + debug.debug_length = len(debug.longest_rejected_outline) while True: try: - for _ in generate_loop_schedules_internal( - sched_state, debug=debug, **schedule_gen_kwargs): + for _ in generate_loop_outlines_internal( + outline_state, debug=debug, **outline_gen_kwargs): pass - except ScheduleDebugInput as e: + except OutlineDebugInput as e: debug.debug_length = int(str(e)) continue break try: - for gen_sched in generate_loop_schedules_internal( - sched_state, debug=debug, **schedule_gen_kwargs): + for gen_outline in generate_loop_outlines_internal( + outline_state, debug=debug, **outline_gen_kwargs): debug.stop() - gen_sched = convert_barrier_instructions_to_barriers( - kernel, gen_sched) + gen_outline = convert_barrier_instructions_to_barriers( + kernel, gen_outline) gsize, lsize = kernel.get_grid_size_upper_bounds() if (gsize or lsize): if not kernel.options.disable_global_barriers: logger.debug("%s: barrier insertion: global" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, + gen_outline = insert_barriers(kernel, gen_outline, synchronization_kind="global", verify_only=True) logger.debug("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, + gen_outline = insert_barriers(kernel, gen_outline, synchronization_kind="local", verify_only=False) logger.debug("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( - schedule=gen_sched, - state=KernelState.SCHEDULED) + outline=gen_outline, + state=KernelState.OUTLINED) - from loopy.schedule.device_mapping import \ - map_schedule_onto_host_or_device - if kernel.state != KernelState.SCHEDULED: + from loopy.outline.device_mapping import \ + map_outline_onto_host_or_device + if kernel.state != KernelState.OUTLINED: # Device mapper only gets run once. - new_kernel = map_schedule_onto_host_or_device(new_kernel) + new_kernel = map_outline_onto_host_or_device(new_kernel) - from loopy.schedule.tools import add_extra_args_to_schedule - new_kernel = add_extra_args_to_schedule(new_kernel) + from loopy.outline.tools import add_extra_args_to_outline + new_kernel = add_extra_args_to_outline(new_kernel) yield new_kernel debug.start() - schedule_count += 1 + outline_count += 1 except KeyboardInterrupt: print() print(75*"-") - print("Interrupted during scheduling") + print("Interrupted during outlining") print(75*"-") print_longest_dead_end() raise - debug.done_scheduling() - if not schedule_count: + debug.done_outlining() + if not outline_count: print(75*"-") - print("ERROR: Sorry--loo.py did not find a schedule for your kernel.") + print("ERROR: Sorry--loo.py did not find an outline for your kernel.") print(75*"-") print_longest_dead_end() - raise RuntimeError("no valid schedules found") + raise RuntimeError("no valid outlines found") - logger.info("%s: schedule done" % kernel.name) + logger.info("%s: outline done" % kernel.name) # }}} -schedule_cache = WriteOncePersistentDict( - "loopy-schedule-cache-v4-"+DATA_MODEL_VERSION, +outline_cache = WriteOncePersistentDict( + "loopy-outline-cache-v4-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel): +def _get_one_outlined_kernel_inner(kernel): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the - # MinRecursionLimitForScheduling context manager in the surrounding + # MinRecursionLimitForOutlining context manager in the surrounding # function, because it possilby cannot be safely collected with a lower # recursion limit without crashing the Python runtime. # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. - return next(iter(generate_loop_schedules(kernel))) + return next(iter(generate_loop_outlines(kernel))) def get_one_scheduled_kernel(kernel): + warn_with_kernel( + kernel, "get_one_scheduled_kernel_deprecated", + "get_one_scheduled_kernel is deprecated. " + "Use get_one_outlined_kernel instead.", + DeprecationWarning) + return get_one_outlined_kernel(kernel) + + +def get_one_outlined_kernel(kernel): from loopy import CACHING_ENABLED - sched_cache_key = kernel + outline_cache_key = kernel from_cache = False if CACHING_ENABLED: try: - result = schedule_cache[sched_cache_key] + result = outline_cache[outline_cache_key] - logger.debug("%s: schedule cache hit" % kernel.name) + logger.debug("%s: outline cache hit" % kernel.name) from_cache = True except KeyError: pass if not from_cache: - with ProcessLogger(logger, "%s: schedule" % kernel.name): - with MinRecursionLimitForScheduling(kernel): - result = _get_one_scheduled_kernel_inner(kernel) + with ProcessLogger(logger, "%s: outline" % kernel.name): + with MinRecursionLimitForOutlining(kernel): + result = _get_one_outlined_kernel_inner(kernel) if CACHING_ENABLED and not from_cache: - schedule_cache.store_if_not_present(sched_cache_key, result) + outline_cache.store_if_not_present(outline_cache_key, result) return result diff --git a/loopy/schedule/device_mapping.py b/loopy/outline/device_mapping.py similarity index 60% rename from loopy/schedule/device_mapping.py rename to loopy/outline/device_mapping.py index 59afb07d2e9b7713dbe86c2c5aef7356decbbcff..2f58a2b1414afc0a0637e64f7ddf238de1b09691 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/outline/device_mapping.py @@ -23,15 +23,15 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError -from loopy.schedule import (Barrier, CallKernel, EnterLoop, LeaveLoop, +from loopy.outline import (Barrier, CallKernel, EnterLoop, LeaveLoop, ReturnFromKernel, RunInstruction) -from loopy.schedule.tools import get_block_boundaries +from loopy.outline.tools import get_block_boundaries -def map_schedule_onto_host_or_device(kernel): +def map_outline_onto_host_or_device(kernel): # FIXME: Should be idempotent. from loopy.kernel import KernelState - assert kernel.state == KernelState.SCHEDULED + assert kernel.state == KernelState.OUTLINED from functools import partial device_prog_name_gen = partial( @@ -41,127 +41,127 @@ def map_schedule_onto_host_or_device(kernel): + kernel.target.device_program_name_suffix) if not kernel.target.split_kernel_at_global_barriers(): - new_schedule = ( + new_outline = ( [CallKernel(kernel_name=device_prog_name_gen(), extra_args=[], extra_inames=[])] + - list(kernel.schedule) + + list(kernel.outline) + [ReturnFromKernel(kernel_name=kernel.name)]) - kernel = kernel.copy(schedule=new_schedule) + kernel = kernel.copy(outline=new_outline) else: - kernel = map_schedule_onto_host_or_device_impl( + kernel = map_outline_onto_host_or_device_impl( kernel, device_prog_name_gen) return kernel -def map_schedule_onto_host_or_device_impl(kernel, device_prog_name_gen): - schedule = kernel.schedule - loop_bounds = get_block_boundaries(schedule) +def map_outline_onto_host_or_device_impl(kernel, device_prog_name_gen): + outline = kernel.outline + loop_bounds = get_block_boundaries(outline) # {{{ inner mapper function dummy_call = CallKernel(kernel_name="", extra_args=[], extra_inames=[]) dummy_return = ReturnFromKernel(kernel_name="") - def inner_mapper(start_idx, end_idx, new_schedule): - schedule_required_splitting = False + def inner_mapper(start_idx, end_idx, new_outline): + outline_required_splitting = False i = start_idx current_chunk = [] while i <= end_idx: - sched_item = schedule[i] + outline_item = outline[i] - if isinstance(sched_item, RunInstruction): - current_chunk.append(sched_item) + if isinstance(outline_item, RunInstruction): + current_chunk.append(outline_item) i += 1 - elif isinstance(sched_item, EnterLoop): + elif isinstance(outline_item, EnterLoop): loop_end = loop_bounds[i] - inner_schedule = [] + inner_outline = [] loop_required_splitting = inner_mapper( - i + 1, loop_end - 1, inner_schedule) + i + 1, loop_end - 1, inner_outline) - start_item = schedule[i] - end_item = schedule[loop_end] + start_item = outline[i] + end_item = outline[loop_end] i = loop_end + 1 if loop_required_splitting: - schedule_required_splitting = True + outline_required_splitting = True if current_chunk: - new_schedule.extend( + new_outline.extend( [dummy_call.copy()] + current_chunk + [dummy_return.copy()]) - new_schedule.extend( + new_outline.extend( [start_item] + - inner_schedule + + inner_outline + [end_item]) current_chunk = [] else: current_chunk.extend( [start_item] + - inner_schedule + + inner_outline + [end_item]) - elif isinstance(sched_item, Barrier): - if sched_item.synchronization_kind == "global": + elif isinstance(outline_item, Barrier): + if outline_item.synchronization_kind == "global": # Wrap the current chunk into a kernel call. - schedule_required_splitting = True + outline_required_splitting = True if current_chunk: - new_schedule.extend( + new_outline.extend( [dummy_call.copy()] + current_chunk + [dummy_return.copy()]) - new_schedule.append(sched_item) + new_outline.append(outline_item) current_chunk = [] else: - current_chunk.append(sched_item) + current_chunk.append(outline_item) i += 1 else: - raise LoopyError("unexpected type of schedule item: %s" - % type(sched_item).__name__) + raise LoopyError("unexpected type of outline item: %s" + % type(outline_item).__name__) - if current_chunk and schedule_required_splitting: - # Wrap remainder of schedule into a kernel call. - new_schedule.extend( + if current_chunk and outline_required_splitting: + # Wrap remainder of outline into a kernel call. + new_outline.extend( [dummy_call.copy()] + current_chunk + [dummy_return.copy()]) else: - new_schedule.extend(current_chunk) + new_outline.extend(current_chunk) - return schedule_required_splitting + return outline_required_splitting # }}} - new_schedule = [] - split_kernel = inner_mapper(0, len(schedule) - 1, new_schedule) + new_outline = [] + split_kernel = inner_mapper(0, len(outline) - 1, new_outline) if not split_kernel: # Wrap everything into a kernel call. - new_schedule = ( + new_outline = ( [dummy_call.copy()] + - new_schedule + + new_outline + [dummy_return.copy()]) # Assign names, extra_inames to CallKernel / ReturnFromKernel instructions inames = [] - for idx, sched_item in enumerate(new_schedule): - if isinstance(sched_item, CallKernel): + for idx, outline_item in enumerate(new_outline): + if isinstance(outline_item, CallKernel): last_kernel_name = device_prog_name_gen() - new_schedule[idx] = sched_item.copy( + new_outline[idx] = outline_item.copy( kernel_name=last_kernel_name, extra_inames=list(inames)) - elif isinstance(sched_item, ReturnFromKernel): - new_schedule[idx] = sched_item.copy( + elif isinstance(outline_item, ReturnFromKernel): + new_outline[idx] = outline_item.copy( kernel_name=last_kernel_name) - elif isinstance(sched_item, EnterLoop): - inames.append(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): + elif isinstance(outline_item, EnterLoop): + inames.append(outline_item.iname) + elif isinstance(outline_item, LeaveLoop): inames.pop() - new_kernel = kernel.copy(schedule=new_schedule) + new_kernel = kernel.copy(outline=new_outline) return new_kernel diff --git a/loopy/schedule/tools.py b/loopy/outline/tools.py similarity index 73% rename from loopy/schedule/tools.py rename to loopy/outline/tools.py index e0129fd98417f26a501138a92de4a67614f1a139..8946f6ebbe99f2f609b4dbaa0951505ce2789451 100644 --- a/loopy/schedule/tools.py +++ b/loopy/outline/tools.py @@ -27,19 +27,19 @@ from loopy.kernel.data import AddressSpace # {{{ block boundary finder -def get_block_boundaries(schedule): +def get_block_boundaries(outline): """ Return a dictionary mapping indices of - :class:`loopy.schedule.BlockBeginItem`s to - :class:`loopy.schedule.BlockEndItem`s and vice versa. + :class:`loopy.outline.BlockBeginItem`s to + :class:`loopy.outline.BlockEndItem`s and vice versa. """ - from loopy.schedule import (BeginBlockItem, EndBlockItem) + from loopy.outline import (BeginBlockItem, EndBlockItem) block_bounds = {} active_blocks = [] - for idx, sched_item in enumerate(schedule): - if isinstance(sched_item, BeginBlockItem): + for idx, outline_item in enumerate(outline): + if isinstance(outline_item, BeginBlockItem): active_blocks.append(idx) - elif isinstance(sched_item, EndBlockItem): + elif isinstance(outline_item, EndBlockItem): start = active_blocks.pop() block_bounds[start] = idx block_bounds[idx] = start @@ -70,19 +70,19 @@ def temporaries_written_in_subkernel(kernel, subkernel): # }}} -# {{{ add extra args to schedule +# {{{ add extra args to outline -def add_extra_args_to_schedule(kernel): +def add_extra_args_to_outline(kernel): """ - Fill the `extra_args` fields in all the :class:`loopy.schedule.CallKernel` - instructions in the schedule with global temporaries. + Fill the `extra_args` fields in all the :class:`loopy.outline.CallKernel` + instructions in the outline with global temporaries. """ - new_schedule = [] - from loopy.schedule import CallKernel + new_outline = [] + from loopy.outline import CallKernel - for sched_item in kernel.schedule: - if isinstance(sched_item, CallKernel): - subkernel = sched_item.kernel_name + for outline_item in kernel.outline: + if isinstance(outline_item, CallKernel): + subkernel = outline_item.kernel_name used_temporaries = ( temporaries_read_in_subkernel(kernel, subkernel) @@ -96,13 +96,13 @@ def add_extra_args_to_schedule(kernel): and kernel.temporary_variables[tv].initializer is None and - tv not in sched_item.extra_args) + tv not in outline_item.extra_args) - new_schedule.append(sched_item.copy( - extra_args=sched_item.extra_args + sorted(more_args))) + new_outline.append(outline_item.copy( + extra_args=outline_item.extra_args + sorted(more_args))) else: - new_schedule.append(sched_item) + new_outline.append(outline_item) - return kernel.copy(schedule=new_schedule) + return kernel.copy(outline=new_outline) # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 23c4b7fbd9e55006dd17ed9b127e598a14ee17a2..7e24685feb7a9b6051cf8c242e71bebb21280ffd 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -122,7 +122,7 @@ def check_reduction_iname_uniqueness(kernel): if nonsimul_count and count > 1: raise LoopyError("iname '%s' used in more than one reduction. " "(%d of them, to be precise.) " - "Since this usage can easily cause loop scheduling " + "Since this usage can easily cause loop outlining " "problems, this is prohibited by default. " "Use loopy.make_reduction_inames_unique() to fix this. " "If you are sure that this is OK, write the reduction " diff --git a/loopy/statistics.py b/loopy/statistics.py index 10d29daad062744ca3fbe2dc2261be4cd2c4ca99..0d646a27ff31b05fc072d5bc75748e9154ad67b8 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1718,12 +1718,12 @@ def get_synchronization_map(knl, subgroup_size=None): "ignore_boostable_into to be set." % knl.name) from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, + from loopy.outline import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) from operator import mul knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) + knl = lp.get_one_outlined_kernel(knl) iname_list = [] result = ToCountMap() @@ -1740,29 +1740,29 @@ def get_synchronization_map(knl, subgroup_size=None): else: return one - for sched_item in knl.schedule: - if isinstance(sched_item, EnterLoop): - if sched_item.iname: # (if not empty) - iname_list.append(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - if sched_item.iname: # (if not empty) + for outline_item in knl.outline: + if isinstance(outline_item, EnterLoop): + if outline_item.iname: # (if not empty) + iname_list.append(outline_item.iname) + elif isinstance(outline_item, LeaveLoop): + if outline_item.iname: # (if not empty) iname_list.pop() - elif isinstance(sched_item, Barrier): + elif isinstance(outline_item, Barrier): result = result + ToCountMap({"barrier_%s" % - sched_item.synchronization_kind: + outline_item.synchronization_kind: get_count_poly(iname_list)}) - elif isinstance(sched_item, CallKernel): + elif isinstance(outline_item, CallKernel): result = result + ToCountMap( {"kernel_launch": get_count_poly(iname_list)}) - elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)): + elif isinstance(outline_item, (ReturnFromKernel, RunInstruction)): pass else: - raise LoopyError("unexpected schedule item: %s" - % type(sched_item).__name__) + raise LoopyError("unexpected outline item: %s" + % type(outline_item).__name__) return result diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 73d2a6328af87cb51fb90d43efcde34d39aa8299..952326e75c42fe7fbef2f6787fe413b2d9c76c61 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -165,17 +165,17 @@ class ASTBuilderBase(object): # {{{ code generation guts def get_function_definition(self, codegen_state, codegen_result, - schedule_index, function_decl, function_body): + outline_index, function_decl, function_body): raise NotImplementedError def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + outline_index): raise NotImplementedError def generate_top_of_body(self, codegen_state): return [] - def get_temporary_decls(self, codegen_state, schedule_index): + def get_temporary_decls(self, codegen_state, outline_index): raise NotImplementedError def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): @@ -261,14 +261,14 @@ class _DummyASTBlock(object): class DummyHostASTBuilder(ASTBuilderBase): def get_function_definition(self, codegen_state, codegen_result, - schedule_index, function_decl, function_body): + outline_index, function_decl, function_body): return function_body def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + outline_index): return None - def get_temporary_decls(self, codegen_state, schedule_index): + def get_temporary_decls(self, codegen_state, outline_index): return [] def get_expression_to_code_mapper(self, codegen_state): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 01d26dd822e46973ad7ebb99f18ec4519e0b4585..81ca9688e3d4dd512a9445f1641f6f1b1ea83ae2 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -494,7 +494,7 @@ class CFamilyASTBuilder(ASTBuilderBase): # {{{ code generation def get_function_definition(self, codegen_state, codegen_result, - schedule_index, + outline_index, function_decl, function_body): kernel = codegen_state.kernel @@ -509,13 +509,13 @@ class CFamilyASTBuilder(ASTBuilderBase): result = [] from loopy.kernel.data import AddressSpace - from loopy.schedule import CallKernel + from loopy.outline import CallKernel # We only need to write declarations for global variables with # the first device program. `is_first_dev_prog` determines - # whether this is the first device program in the schedule. + # whether this is the first device program in the outline. is_first_dev_prog = codegen_state.is_generating_device_code - for i in range(schedule_index): - if isinstance(kernel.schedule[i], CallKernel): + for i in range(outline_index): + if isinstance(kernel.outline[i], CallKernel): is_first_dev_prog = False break if is_first_dev_prog: @@ -531,7 +531,7 @@ class CFamilyASTBuilder(ASTBuilderBase): index_dtype=kernel.index_dtype) decl = self.wrap_global_constant( self.get_temporary_decl( - codegen_state, schedule_index, tv, + codegen_state, outline_index, tv, decl_info)) if tv.initializer is not None: @@ -568,7 +568,7 @@ class CFamilyASTBuilder(ASTBuilderBase): return var_descr.get_arg_decl(self) def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + outline_index): from cgen import FunctionDeclaration, Value name = codegen_result.current_program(codegen_state).name @@ -584,7 +584,7 @@ class CFamilyASTBuilder(ASTBuilderBase): def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): return None - def get_temporary_decls(self, codegen_state, schedule_index): + def get_temporary_decls(self, codegen_state, outline_index): from loopy.kernel.data import AddressSpace kernel = codegen_state.kernel @@ -601,10 +601,10 @@ class CFamilyASTBuilder(ASTBuilderBase): from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line # Getting the temporary variables that are needed for the current # sub-kernel. - from loopy.schedule.tools import ( + from loopy.outline.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) - subkernel = kernel.schedule[schedule_index].kernel_name + subkernel = kernel.outline[outline_index].kernel_name sub_knl_temps = ( temporaries_read_in_subkernel(kernel, subkernel) | temporaries_written_in_subkernel(kernel, subkernel)) @@ -621,7 +621,7 @@ class CFamilyASTBuilder(ASTBuilderBase): tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( - codegen_state, schedule_index, tv, idi), + codegen_state, outline_index, tv, idi), tv.address_space) if tv.initializer is not None: @@ -740,7 +740,7 @@ class CFamilyASTBuilder(ASTBuilderBase): from loopy.target.c.codegen.expression import CExpressionToCodeMapper return CExpressionToCodeMapper() - def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info): + def get_temporary_decl(self, codegen_state, outline_index, temp_var, decl_info): temp_var_decl = POD(self, decl_info.dtype, decl_info.name) if temp_var.read_only: diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 698507978f7c20d6d594fd3e03626e7b12012a94..050b30451d01f8e5c2f278ed3a4d4e4d6b8b0c79 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -383,7 +383,7 @@ class CKernelExecutor(KernelExecutorBase): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the - kernel has not yet been loop-scheduled, that is done, too, with no + kernel has not yet been loop-outlined, that is done, too, with no specific arguments. """ @@ -396,7 +396,7 @@ class CKernelExecutor(KernelExecutorBase): @memoize_method def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + kernel = self.get_typed_and_outlined_kernel(arg_to_dtype_set) from loopy.codegen import generate_code_v2 codegen_result = generate_code_v2(kernel) diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 50fd1026f7bd15ce72915d0d5d5e60f6da4e264c..bbd7edf24a8426bd4b4d216179db8d0443328bc4 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -230,9 +230,9 @@ class CUDACASTBuilder(CFamilyASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + outline_index): fdecl = super(CUDACASTBuilder, self).get_function_declaration( - codegen_state, codegen_result, schedule_index) + codegen_state, codegen_result, outline_index) from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) @@ -245,11 +245,11 @@ class CUDACASTBuilder(CFamilyASTBuilder): from cgen import Extern fdecl = Extern("C", fdecl) - from loopy.schedule import get_insn_ids_for_block_at + from loopy.outline import get_insn_ids_for_block_at _, local_grid_size = \ codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.outline, outline_index)) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): diff --git a/loopy/target/execution.py b/loopy/target/execution.py index c8f0d40903b1638e853caf459c0c0393d754c993..4444af374042f3c9eefd671ce8545ef91e40585a 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -693,8 +693,8 @@ class _Kernels(object): pass -typed_and_scheduled_cache = WriteOncePersistentDict( - "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION, +typed_and_outlined_cache = WriteOncePersistentDict( + "loopy-typed-and-outlined-cache-v1-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -729,7 +729,7 @@ class KernelExecutorBase(object): arg.dtype is None for arg in kernel.args) - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + def get_typed_and_outlined_kernel_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes kernel = self.kernel @@ -754,16 +754,16 @@ class KernelExecutorBase(object): from loopy.type_inference import infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) - if kernel.schedule is None: + if kernel.outline is None: from loopy.preprocess import preprocess_kernel kernel = preprocess_kernel(kernel) - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + from loopy.outline import get_one_outlined_kernel + kernel = get_one_outlined_kernel(kernel) return kernel - def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + def get_typed_and_outlined_kernel(self, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching @@ -774,16 +774,16 @@ class KernelExecutorBase(object): if CACHING_ENABLED: try: - return typed_and_scheduled_cache[cache_key] + return typed_and_outlined_cache[cache_key] except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + logger.debug("%s: typed-and-outlined cache miss" % self.kernel.name) - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_outlined_kernel_uncached(arg_to_dtype_set) if CACHING_ENABLED: - typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) + typed_and_outlined_cache.store_if_not_present(cache_key, kernel) return kernel @@ -831,7 +831,7 @@ class KernelExecutorBase(object): arg_to_dtype = frozenset( (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype)) - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + kernel = self.get_typed_and_outlined_kernel(arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index eb0157bf86d478901fb5a07bbac28aa7a11bcec9..3b6ccef52e3a5144a93726437ed44c630c149495 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -231,7 +231,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + outline_index): name = codegen_result.current_program(codegen_state).name from cgen import (FunctionDeclaration, Value) @@ -302,7 +302,7 @@ class ISPCASTBuilder(CFamilyASTBuilder): else: raise LoopyError("unknown barrier kind") - def get_temporary_decl(self, codegen_state, sched_index, temp_var, decl_info): + def get_temporary_decl(self, codegen_state, outline_index, temp_var, decl_info): from loopy.target.c import POD # uses the correct complex type temp_var_decl = POD(self, decl_info.dtype, decl_info.name) diff --git a/loopy/target/numba.py b/loopy/target/numba.py index 6946063ee04f52a4890344b4cbff9446bacb6923..9ec194f32f9ed1581d4b6602057a2a352e0dba0b 100644 --- a/loopy/target/numba.py +++ b/loopy/target/numba.py @@ -49,7 +49,7 @@ class NumbaBaseASTBuilder(PythonASTBuilderBase): ]) def get_function_definition(self, codegen_state, codegen_result, - schedule_index, + outline_index, function_decl, function_body): assert function_decl is None diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 04d436043daed74362ebabd96e18bf1d4d6d4a6c..f13f4dea9e6c01012300f94f02277464a874214d 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -393,9 +393,9 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + outline_index): fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( - codegen_state, codegen_result, schedule_index) + codegen_state, codegen_result, outline_index) from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) @@ -404,10 +404,10 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from cgen.opencl import CLKernel, CLRequiredWorkGroupSize fdecl = CLKernel(fdecl) - from loopy.schedule import get_insn_ids_for_block_at + from loopy.outline import get_insn_ids_for_block_at _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.outline, outline_index)) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 826ba2a8f09b8a19d19200ef6d936a8276cf3688..6b69e3f13229a1e2f4874e4f9bf46b01ec668a76 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -630,7 +630,7 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): # {{{ code generation guts def get_function_definition(self, codegen_state, codegen_result, - schedule_index, function_decl, function_body): + outline_index, function_decl, function_body): from loopy.kernel.data import TemporaryVariable args = ( ["_lpy_cl_kernels", "queue"] @@ -667,11 +667,11 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): ])) def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + outline_index): # no such thing in Python return None - def get_temporary_decls(self, codegen_state, schedule_state): + def get_temporary_decls(self, codegen_state, outline_state): from genpy import Assign, Comment, Line def alloc_nbytes(tv): diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 05fdd21f084412997ed1f0b258f6ba70b435bd4e..b4a887a693cef0651c31a9f1e6ed7a3228949f6f 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -257,7 +257,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the - kernel has not yet been loop-scheduled, that is done, too, with no + kernel has not yet been loop-outlined, that is done, too, with no specific arguments. """ @@ -276,7 +276,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): @memoize_method def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + kernel = self.get_typed_and_outlined_kernel(arg_to_dtype_set) from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code diff --git a/loopy/target/python.py b/loopy/target/python.py index ce04986d3d2a39dcf7126339055d32fa16ffcc25..64860c43c705df5a9793457b6eaa15ad01c32bd6 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -202,11 +202,11 @@ class PythonASTBuilderBase(ASTBuilderBase): ]) def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + outline_index): return None def get_function_definition(self, codegen_state, codegen_result, - schedule_index, + outline_index, function_decl, function_body): assert function_decl is None @@ -217,7 +217,7 @@ class PythonASTBuilderBase(ASTBuilderBase): [idi.name for idi in codegen_state.implemented_data_info], function_body) - def get_temporary_decls(self, codegen_state, schedule_index): + def get_temporary_decls(self, codegen_state, outline_index): kernel = codegen_state.kernel ecm = codegen_state.expression_to_code_mapper diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 7f4779cc7c0af3fa228ca51a3f8d45944ec21bff..b9fc1b3a86b4d0c3667ca7b4a96ee11a88e78e50 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -328,7 +328,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, if domch.leaf_domain_index is not None: # If the sweep inames are at home in parent domains, then we'll add # fetches with loops over copies of these parent inames that will end - # up being scheduled *within* loops over these parents. + # up being outlined *within* loops over these parents. for iname in buffer_inames_set: if kernel.get_home_domain_index(iname) != domch.leaf_domain_index: diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 96c8252ef7e6622250e9006b2275ef7816700b5c..64dfdd8f85b9392690514540692e065766ae097c 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -52,7 +52,7 @@ __doc__ = """ .. autofunction:: get_iname_duplication_options -.. autofunction:: has_schedulable_iname_nesting +.. autofunction:: has_outlinable_iname_nesting .. autofunction:: prioritize_loops @@ -98,7 +98,7 @@ def prioritize_loops(kernel, loop_priority): Priority is only considered if loop nesting is ambiguous. prioritize_loops can be used multiple times. If you do so, each given - *loop_priority* specifies a scheduling constraint. The constraints from + *loop_priority* specifies an outlining constraint. The constraints from all calls to prioritize_loops together establish a partial order on the inames (see https://en.wikipedia.org/wiki/Partially_ordered_set). @@ -912,7 +912,7 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, # }}} -# {{{ iname duplication for schedulability +# {{{ iname duplication for outlinability def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset([])): # Remove common inames of the current insn_iname_sets, as they are not relevant @@ -987,17 +987,17 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( def get_iname_duplication_options(knl, use_boostable_into=False): - """List options for duplication of inames, if necessary for schedulability + """List options for duplication of inames, if necessary for outlinability :returns: a generator listing all options to duplicate inames, if duplication - of an iname is necessary to ensure the schedulability of the kernel. + of an iname is necessary to ensure the outlinability of the kernel. Duplication options are returned as tuples (iname, within) as understood by :func:`duplicate_inames`. There is no guarantee, that the - transformed kernel will be schedulable, because multiple duplications + transformed kernel will be outlinable, because multiple duplications of iname may be necessary. - Some kernels require the duplication of inames in order to be schedulable, as the - forced iname dependencies define an over-determined problem to the scheduler. + Some kernels require the duplication of inames in order to be outlinable, as the + forced iname dependencies define an over-determined problem to the outliner. Consider the following minimal example: knl = lp.make_kernel(["{[i,j]:0<=i,j 0: # For an array variable, all definitions generate a use as # well, because the write could be a partial write, @@ -126,11 +126,11 @@ class LivenessAnalysis(object): # We don't currently check if the write is a partial write # or a full write. Instead, we analyze the access # footprint later on to determine how much to reload/save. - gen[sched_idx].add(var) + gen[outline_idx].add(var) for var in insn.read_dependency_names(): if var not in self.kernel.temporary_variables: continue - gen[sched_idx].add(var) + gen[outline_idx].add(var) return gen, kill @@ -141,14 +141,14 @@ class LivenessAnalysis(object): gen, kill = self.get_gen_and_kill_sets() # Fixed point iteration for liveness analysis - lr = LivenessResult.make_empty(len(self.schedule)) + lr = LivenessResult.make_empty(len(self.outline)) prev_lr = None while prev_lr != lr: from copy import deepcopy prev_lr = deepcopy(lr) - for idx in range(len(self.schedule) - 1, -1, -1): + for idx in range(len(self.outline) - 1, -1, -1): for succ in successors[idx]: lr[idx].live_out.update(lr[succ].live_in) lr[idx].live_in = gen[idx] | (lr[idx].live_out - kill[idx]) @@ -160,26 +160,26 @@ class LivenessAnalysis(object): def print_liveness(self): print(75 * "-") print("LIVE IN:") - for sched_idx, sched_item in enumerate(self.schedule): + for outline_idx, outline_item in enumerate(self.outline): print("{item}: {{{vars}}}".format( - item=sched_idx, - vars=", ".join(sorted(self[sched_idx].live_in)))) + item=outline_idx, + vars=", ".join(sorted(self[outline_idx].live_in)))) print(75 * "-") print("LIVE OUT:") - for sched_idx, sched_item in enumerate(self.schedule): + for outline_idx, outline_item in enumerate(self.outline): print("{item}: {{{vars}}}".format( - item=sched_idx, - vars=", ".join(sorted(self[sched_idx].live_out)))) + item=outline_idx, + vars=", ".join(sorted(self[outline_idx].live_out)))) print(75 * "-") - def __getitem__(self, sched_idx): + def __getitem__(self, outline_idx): """ :arg insn: An instruction name or instance of :class:`loopy.instruction.InstructionBase` :returns: A :class:`LivenessResult` associated with `insn` """ - return self.liveness()[sched_idx] + return self.liveness()[outline_idx] # }}} @@ -316,11 +316,11 @@ class TemporarySaver(object): def subkernel_to_slice_indices(self): result = {} - for sched_item_idx, sched_item in enumerate(self.kernel.schedule): - if isinstance(sched_item, CallKernel): - start_idx = sched_item_idx - elif isinstance(sched_item, ReturnFromKernel): - result[sched_item.kernel_name] = (start_idx, 1 + sched_item_idx) + for outline_item_idx, outline_item in enumerate(self.kernel.outline): + if isinstance(outline_item, CallKernel): + start_idx = outline_item_idx + elif isinstance(outline_item, ReturnFromKernel): + result[outline_item.kernel_name] = (start_idx, 1 + outline_item_idx) return result @@ -331,17 +331,17 @@ class TemporarySaver(object): within_subkernel = False result = {} - for sched_item_idx, sched_item in enumerate(self.kernel.schedule): - if isinstance(sched_item, CallKernel): + for outline_item_idx, outline_item in enumerate(self.kernel.outline): + if isinstance(outline_item, CallKernel): within_subkernel = True - result[sched_item.kernel_name] = frozenset(current_outer_inames) - elif isinstance(sched_item, ReturnFromKernel): + result[outline_item.kernel_name] = frozenset(current_outer_inames) + elif isinstance(outline_item, ReturnFromKernel): within_subkernel = False - elif isinstance(sched_item, EnterLoop): + elif isinstance(outline_item, EnterLoop): if not within_subkernel: - current_outer_inames.add(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - current_outer_inames.discard(sched_item.iname) + current_outer_inames.add(outline_item.iname) + elif isinstance(outline_item, LeaveLoop): + current_outer_inames.discard(outline_item.iname) return result @@ -356,14 +356,14 @@ class TemporarySaver(object): try: pre_barrier = next(item for item in - self.kernel.schedule[subkernel_start::-1] + self.kernel.outline[subkernel_start::-1] if is_global_barrier(item)).originating_insn_id except StopIteration: pre_barrier = None try: post_barrier = next(item for item in - self.kernel.schedule[subkernel_end:] + self.kernel.outline[subkernel_end:] if is_global_barrier(item)).originating_insn_id except StopIteration: post_barrier = None @@ -727,7 +727,7 @@ def save_and_reload_temporaries(knl): Add instructions to save and reload temporary variables that are live across kernel calls. - The basic code transformation turns schedule segments:: + The basic code transformation turns outline segments:: t = <...> @@ -748,41 +748,42 @@ def save_and_reload_temporaries(knl): liveness = LivenessAnalysis(knl) saver = TemporarySaver(knl) - from loopy.schedule.tools import ( + from loopy.outline.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) - for sched_idx, sched_item in enumerate(knl.schedule): + for outline_idx, outline_item in enumerate(knl.outline): - if isinstance(sched_item, CallKernel): + if isinstance(outline_item, CallKernel): # Any written temporary that is live-out needs to be read into # memory because of the potential for partial writes. - if sched_idx == 0: + if outline_idx == 0: # Kernel entry: nothing live interesting_temporaries = set() else: - subkernel = sched_item.kernel_name + subkernel = outline_item.kernel_name interesting_temporaries = ( temporaries_read_in_subkernel(knl, subkernel) | temporaries_written_in_subkernel(knl, subkernel)) - for temporary in liveness[sched_idx].live_out & interesting_temporaries: + for temporary in ( + liveness[outline_idx].live_out & interesting_temporaries): logger.info("reloading {0} at entry of {1}" - .format(temporary, sched_item.kernel_name)) - saver.reload(temporary, sched_item.kernel_name) + .format(temporary, outline_item.kernel_name)) + saver.reload(temporary, outline_item.kernel_name) - elif isinstance(sched_item, ReturnFromKernel): - if sched_idx == len(knl.schedule) - 1: + elif isinstance(outline_item, ReturnFromKernel): + if outline_idx == len(knl.outline) - 1: # Kernel exit: nothing live interesting_temporaries = set() else: - subkernel = sched_item.kernel_name + subkernel = outline_item.kernel_name interesting_temporaries = ( temporaries_written_in_subkernel(knl, subkernel)) - for temporary in liveness[sched_idx].live_in & interesting_temporaries: + for temporary in liveness[outline_idx].live_in & interesting_temporaries: logger.info("saving {0} before return of {1}" - .format(temporary, sched_item.kernel_name)) - saver.save(temporary, sched_item.kernel_name) + .format(temporary, outline_item.kernel_name)) + saver.save(temporary, outline_item.kernel_name) return saver.finish() diff --git a/proto-tests/test_fem_assembly.py b/proto-tests/test_fem_assembly.py index 18f2a5bfabdd52abad9d78aacf4f1d5be53b5ac1..55abe2135788b3b6de375a91971f1707b1c6d36c 100644 --- a/proto-tests/test_fem_assembly.py +++ b/proto-tests/test_fem_assembly.py @@ -123,7 +123,7 @@ def test_laplacian_stiffness(ctx_factory): # v for variant in [variant_fig33]: var_knl, loop_prio = variant(knl) - kernel_gen = lp.generate_loop_schedules(var_knl, + kernel_gen = lp.generate_loop_outlines(var_knl, loop_priority=loop_prio) kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc)) diff --git a/proto-tests/test_sem.py b/proto-tests/test_sem.py index 4613b74ae787fe086ead935ddec61ff1a5438521..4774a868593e61e21630197d9fb7c2729c71bbc2 100644 --- a/proto-tests/test_sem.py +++ b/proto-tests/test_sem.py @@ -99,7 +99,7 @@ def test_laplacian(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1")) - kernel_gen = lp.generate_loop_schedules(knl, + kernel_gen = lp.generate_loop_outlines(knl, loop_priority=["m_fetch_G", "i_fetch_u"]) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) @@ -179,7 +179,7 @@ def test_laplacian_lmem(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1")) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) K = 1000 @@ -256,7 +256,7 @@ def test_laplacian_lmem_ilp(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1")) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) for knl in kernel_gen: @@ -347,7 +347,7 @@ def test_advect(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1")) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) @@ -467,7 +467,7 @@ def test_advect_dealias(ctx_factory): print(knl) #1/0 - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) @@ -531,7 +531,7 @@ def test_interp_diff(ctx_factory): print(knl) #1/0 - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, diff --git a/proto-tests/test_sem_tim.py b/proto-tests/test_sem_tim.py index 1bfb437fb6de1cb5511d108eb35a8ad32326122e..0fbba945b1d455521498f7190326baf63e395382 100644 --- a/proto-tests/test_sem_tim.py +++ b/proto-tests/test_sem_tim.py @@ -101,7 +101,7 @@ def test_laplacian(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1")) - kernel_gen = lp.generate_loop_schedules(knl, + kernel_gen = lp.generate_loop_outlines(knl, loop_priority=["m_fetch_G", "i_fetch_u"]) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) @@ -191,7 +191,7 @@ def test_laplacian_lmem(ctx_factory): # ValueError: cannot tag 'i_and_j'--not known # knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1")) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) K = 1000 @@ -262,7 +262,7 @@ def test_laplacian_lmem_ilp(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1")) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) for knl in kernel_gen: @@ -353,7 +353,7 @@ def test_advect(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0", j="l.1")) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) @@ -473,7 +473,7 @@ def test_advect_dealias(ctx_factory): print(knl) #1/0 - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) @@ -537,7 +537,7 @@ def test_interp_diff(ctx_factory): print(knl) #1/0 - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, diff --git a/proto-tests/test_tim.py b/proto-tests/test_tim.py index d7061933e5667a623b4157ea6900a4b13c55e6c4..7c681fa77dc726b58a656134a2fd699cddbaed44 100644 --- a/proto-tests/test_tim.py +++ b/proto-tests/test_tim.py @@ -63,7 +63,7 @@ def test_tim2d(ctx_factory): # knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None) # axis/argument indices on G knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto") # axis/argument indices on G - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) K = 1000 @@ -129,7 +129,7 @@ def test_red2d(ctx_factory): knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto") # axis/argument indices on G - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) K = 1000 @@ -201,7 +201,7 @@ def test_tim3d(ctx_factory): knl = lp.add_prefetch(knl, "G", [2,3,4], default_tag="l.auto") # axis/argument indices on G - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000)) K = 4000 diff --git a/test/test_domain.py b/test/test_domain.py index ebfde850907d68bebf06076fbf1c87d8bb093f71..9f07a64e32ca65755e69a9e5d010078df15da59f 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -68,7 +68,7 @@ def test_assume(ctx_factory): knl = lp.assume(knl, "n mod 16 = 0") knl = lp.assume(knl, "n > 10") knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) for gen_knl in kernel_gen: print(gen_knl) @@ -97,7 +97,7 @@ def test_divisibility_assumption(ctx_factory): knl = lp.split_iname(knl, "i", 16) knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): + for k in lp.generate_loop_outlines(knl): code = lp.generate_code(k) assert "if" not in code @@ -124,7 +124,7 @@ def test_eq_constraint(ctx_factory): knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) for knl in kernel_gen: print(lp.generate_code(knl)) @@ -229,7 +229,7 @@ def test_dependent_loop_bounds_3(ctx_factory): knl = lp.preprocess_kernel(knl, ctx.devices[0]) with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) + list(lp.generate_loop_outlines(knl_bad)) def test_dependent_loop_bounds_4(): diff --git a/test/test_loopy.py b/test/test_loopy.py index 6b78ac26b78d8c85dab3cd41af0ce1d99d52ec07..7742dd807a6ae5d599d456192363de0e2d988bb9 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -123,7 +123,7 @@ def test_type_inference_no_artificial_doubles(): assumptions="n>=1") knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): + for k in lp.generate_loop_outlines(knl): code = lp.generate_code(k) assert "double" not in code @@ -185,7 +185,7 @@ def test_simple_side_effect(ctx_factory): ) knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) for gen_knl in kernel_gen: print(gen_knl) @@ -207,7 +207,7 @@ def test_owed_barriers(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) for gen_knl in kernel_gen: compiled = lp.CompiledKernel(ctx, gen_knl) @@ -228,7 +228,7 @@ def test_wg_too_small(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) import pytest for gen_knl in kernel_gen: @@ -251,7 +251,7 @@ def test_multi_cse(ctx_factory): knl = lp.add_prefetch(knl, "a", []) knl = lp.preprocess_kernel(knl) - kernel_gen = lp.generate_loop_schedules(knl) + kernel_gen = lp.generate_loop_outlines(knl) for gen_knl in kernel_gen: compiled = lp.CompiledKernel(ctx, gen_knl) @@ -305,7 +305,7 @@ def test_ilp_write_race_detection_global(): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: - list(lp.generate_loop_schedules(knl)) + list(lp.generate_loop_outlines(knl)) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) @@ -322,7 +322,7 @@ def test_ilp_write_race_avoidance_local(): knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): + for k in lp.generate_loop_outlines(knl): assert k.temporary_variables["a"].shape == (16, 17) @@ -337,7 +337,7 @@ def test_ilp_write_race_avoidance_private(): knl = lp.tag_inames(knl, dict(j="ilp")) knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): + for k in lp.generate_loop_outlines(knl): assert k.temporary_variables["a"].shape == (16,) # }}} @@ -777,7 +777,7 @@ def test_multiple_writes_to_local_temporary(): knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): + for k in lp.generate_loop_outlines(knl): code, _ = lp.generate_code(k) print(code) @@ -860,7 +860,7 @@ def test_variable_size_temporary(): # Make sure that code generation succeeds even if # there are variable-length arrays. knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): + for k in lp.generate_loop_outlines(knl): lp.generate_code(k) @@ -1061,14 +1061,14 @@ def test_kernel_splitting(ctx_factory): knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - # schedule + # outline from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) - from loopy.schedule import get_one_scheduled_kernel - knl = get_one_scheduled_kernel(knl) + from loopy.outline import get_one_outlined_kernel + knl = get_one_outlined_kernel(knl) - # map schedule onto host or device + # map outline onto host or device print(knl) cgr = lp.generate_code_v2(knl) @@ -1102,14 +1102,14 @@ def test_kernel_splitting_with_loop(ctx_factory): knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - # schedule + # outline from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) - from loopy.schedule import get_one_scheduled_kernel - knl = get_one_scheduled_kernel(knl) + from loopy.outline import get_one_outlined_kernel + knl = get_one_outlined_kernel(knl) - # map schedule onto host or device + # map outline onto host or device print(knl) cgr = lp.generate_code_v2(knl) @@ -1124,14 +1124,14 @@ def test_kernel_splitting_with_loop(ctx_factory): def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False): from loopy.preprocess import preprocess_kernel - from loopy.schedule import get_one_scheduled_kernel + from loopy.outline import get_one_outlined_kernel knl = preprocess_kernel(knl) - knl = get_one_scheduled_kernel(knl) + knl = get_one_outlined_kernel(knl) from loopy.transform.save import save_and_reload_temporaries knl = save_and_reload_temporaries(knl) - knl = get_one_scheduled_kernel(knl) + knl = get_one_outlined_kernel(knl) if debug: print(knl) @@ -1395,7 +1395,7 @@ def test_save_ambiguous_storage_requirements(): knl = lp.set_temporary_scope(knl, "a", "local") knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) + knl = lp.get_one_outlined_kernel(knl) from loopy.diagnostic import LoopyError with pytest.raises(LoopyError): @@ -1598,7 +1598,7 @@ def test_call_with_options(): # }}} -def test_unschedulable_kernel_detection(): +def test_unoutlinable_kernel_detection(): knl = lp.make_kernel(["{[i,j]:0<=i,j