diff --git a/loopy/check.py b/loopy/check.py index 727b02a85acf16c4a8ec4b5793ecc850c294fd14..f50ee5cfaa3c6a12ed542adf683beb660616dffc 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -486,11 +486,12 @@ def check_write_destinations(kernel): # {{{ check_has_schedulable_iname_nesting def check_has_schedulable_iname_nesting(kernel): - from loopy.transform.iname import (has_schedulable_iname_nesting, - get_iname_duplication_options) - if not has_schedulable_iname_nesting(kernel): + from loopy.transform.iname import ( + has_schedulable_iname_nesting_for_single_kernel, + get_iname_duplication_options_for_single_kernel) + if not has_schedulable_iname_nesting_for_single_kernel(kernel): import itertools as it - opt = get_iname_duplication_options(kernel) + opt = get_iname_duplication_options_for_single_kernel(kernel) opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w) for i, w in it.islice(opt, 3)) raise LoopyError("Kernel does not have a schedulable iname nesting. " diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index ed1e7a5bc8da0ee79154e9053eaeb6a624545a65..e9e7c9a447afb559e3536ab3cb1219111a3a2e0d 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -154,6 +154,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState(object): """ .. attribute:: kernel + .. attribute:: target .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -199,7 +200,7 @@ class CodeGenerationState(object): .. attribute:: program_callables_info """ - def __init__(self, kernel, + def __init__(self, kernel, target, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, @@ -209,6 +210,7 @@ class CodeGenerationState(object): gen_program_name=None, schedule_index_end=None): self.kernel = kernel + self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -226,7 +228,7 @@ class CodeGenerationState(object): # {{{ copy helpers - def copy(self, kernel=None, implemented_data_info=None, + def copy(self, kernel=None, target=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), var_subst_map=None, vectorization_info=None, is_generating_device_code=None, @@ -236,6 +238,9 @@ class CodeGenerationState(object): if kernel is None: kernel = self.kernel + if target is None: + target = self.target + if implemented_data_info is None: implemented_data_info = self.implemented_data_info @@ -256,6 +261,7 @@ class CodeGenerationState(object): return CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( @@ -413,7 +419,7 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_for_a_single_kernel(kernel, program_callables_info): +def generate_code_for_a_single_kernel(kernel, program_callables_info, target): """ :returns: a :class:`CodeGenerationResult` """ @@ -459,13 +465,13 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): if isinstance(arg, ArrayBase): implemented_data_info.extend( arg.decl_info( - kernel.target, + target, is_written=is_written, index_dtype=kernel.index_dtype)) elif isinstance(arg, ValueArg): implemented_data_info.append(ImplementedDataInfo( - target=kernel.target, + target=target, name=arg.name, dtype=arg.dtype, arg_class=ValueArg, @@ -488,6 +494,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) codegen_state = CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), @@ -499,9 +506,9 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): var_name_generator=kernel.get_var_name_generator(), is_generating_device_code=False, gen_program_name=( - kernel.target.host_program_name_prefix + target.host_program_name_prefix + kernel.name - + kernel.target.host_program_name_suffix), + + target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), program_callables_info=program_callables_info) @@ -536,7 +543,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): ) preamble_generators = (kernel.preamble_generators - + kernel.target.get_device_ast_builder().preamble_generators()) + + target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -579,7 +586,7 @@ def generate_code_v2(program): if isinstance(in_knl_callable, CallableKernel): codegen_results[func_id] = ( generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.program_callables_info)) + program.program_callables_info, program.target)) device_preambles = set() for cgr in codegen_results.values(): diff --git a/loopy/program.py b/loopy/program.py index bb5b9b1aca54137ded259ccc812f8ba7430ee13b..df7bd1bdd2fd04ca2a2061f6f700608590b5d773 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -192,6 +192,28 @@ class Program(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash + def copy(self, **kwargs): + if 'target' in kwargs: + target = kwargs['target'] + new_self = super(Program, self).copy(**kwargs) + new_resolved_functions = {} + for func_id, in_knl_callable in ( + new_self.program_callables_info.items()): + if isinstance(in_knl_callable, CallableKernel): + subkernel = in_knl_callable.subkernel + new_resolved_functions[func_id] = in_knl_callable.copy( + subkernel=subkernel.copy(target=target)) + else: + new_resolved_functions[func_id] = in_knl_callable + + program_callables_info = new_self.program_callables_info.copy( + resolved_functions=new_resolved_functions) + + return new_self.copy( + program_callables_info=program_callables_info) + else: + return super(Program, self).copy(**kwargs) + def get_grid_size_upper_bounds(self, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index caa02c17afaa75180328cadbc3ed307d1f49823f..75aa62467eef7f58591e640e9b4f3c80f97e37dc 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -36,6 +36,7 @@ from loopy.diagnostic import LoopyError from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable __doc__ = """ @@ -982,7 +983,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(knl, use_boostable_into=False): +def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1048,7 +1049,7 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # If we find a duplication option and to not use boostable_into # information, we restart this generator with use_boostable_into=True if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options(knl, True): + for option in get_iname_duplication_options_for_single_kernel(knl, True): yield option # Emit a warning that we needed boostable_into @@ -1076,12 +1077,34 @@ def get_iname_duplication_options(knl, use_boostable_into=False): yield iname, within -def has_schedulable_iname_nesting(knl): +def get_iname_duplication_options(program, use_boostable_into=False): + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + yield from get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of in kernel callable %s." + % (type(in_knl_callable))) + + return + + +def has_schedulable_iname_nesting_for_single_kernel(knl): """ :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. """ - return not bool(next(get_iname_duplication_options(knl), False)) + return not bool(next(get_iname_duplication_options_for_single_kernel(knl), + False)) + + +def has_schedulable_iname_nesting(program): + return all(has_schedulable_iname_nesting_for_single_kernel( + in_knl_callable.subkernel) for in_knl_callable in + program.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 13d9c722ed1fdb15f405d19c8e21389b974dcc9f..65c91871ad276d5e99c295971ca4ab2522176742 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -36,7 +36,7 @@ from loopy.diagnostic import ( from loopy.kernel.instruction import _DataObliviousInstruction from loopy.program import ProgramCallablesInfo -from loopy.symbolic import SubArrayRef +from loopy.symbolic import SubArrayRef, LinearSubscript from pymbolic.primitives import Variable, Subscript import logging @@ -819,7 +819,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, if kernel.temporary_variables[assignee.name].dtype is None: return False - elif isinstance(assignee, Subscript): + elif isinstance(assignee, (Subscript, LinearSubscript)): if assignee.aggregate.name in kernel.arg_dict: if kernel.arg_dict[assignee.aggregate.name].dtype is None: return False @@ -828,7 +828,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, if kernel.temporary_variables[ assignee.aggregate.name].dtype is None: return False - else: assert isinstance(assignee, SubArrayRef) if assignee.subscript.aggregate.name in kernel.arg_dict: