diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e83515d31f1c61e52569d8d0754ce79e7a7f602f..57bf4c6a8aa31c3c5aad3be3b34ccbab7caa9b37 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -32,6 +32,13 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from cgen import Collection + +from loopy.kernel.instruction import ( + Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, + CInstruction, _DataObliviousInstruction) + + import logging logger = logging.getLogger(__name__) @@ -187,6 +194,12 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. attribute:: is_generating_master_kernel + + Either `True` or `False`, indicating whether the code is being + generated for a master kernel or an auxiliary kernel. + """ def __init__(self, kernel, @@ -196,7 +209,8 @@ class CodeGenerationState(object): vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + schedule_index_end=None, + is_generating_master_kernel=None): self.kernel = kernel self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain @@ -211,6 +225,7 @@ class CodeGenerationState(object): self.is_generating_device_code = is_generating_device_code self.gen_program_name = gen_program_name self.schedule_index_end = schedule_index_end + self.is_generating_master_kernel = is_generating_master_kernel # {{{ copy helpers @@ -219,7 +234,8 @@ class CodeGenerationState(object): var_subst_map=None, vectorization_info=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + schedule_index_end=None, + is_generating_master_kernel=None): if kernel is None: kernel = self.kernel @@ -242,6 +258,9 @@ class CodeGenerationState(object): if schedule_index_end is None: schedule_index_end = 
self.schedule_index_end + if is_generating_master_kernel is None: + is_generating_master_kernel = self.is_generating_master_kernel + return CodeGenerationState( kernel=kernel, implemented_data_info=implemented_data_info, @@ -257,7 +276,8 @@ class CodeGenerationState(object): var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, gen_program_name=gen_program_name, - schedule_index_end=schedule_index_end) + schedule_index_end=schedule_index_end, + is_generating_master_kernel=is_generating_master_kernel) def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" @@ -470,13 +490,49 @@ def generate_code_v2(kernel): kernel.target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule)) + schedule_index_end=len(kernel.schedule), + is_generating_master_kernel=True) from loopy.codegen.result import generate_host_or_device_program + + # {{{ collecting ASTs of auxiliary kernels + + auxiliary_dev_progs = [] + + from loopy.codegen.auxiliary_kernels import generate_auxiliary_kernel_device_code + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + in_knl_callable = kernel.scoped_functions[insn.expression.function.name] + if in_knl_callable.subkernel is not None: + auxiliary_dev_prog = generate_auxiliary_kernel_device_code( + in_knl_callable.subkernel, + kernel.target).device_programs[0].ast + auxiliary_dev_progs.append(auxiliary_dev_prog) + elif isinstance(insn, (Assignment, NoOpInstruction, + BarrierInstruction, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("register_knl not made for %s type of " + "instruction" % (str(type(insn)))) + + # }}} + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) + # {{{ pasting the auxiliary functions code to the first device program + + new_dev_prog = codegen_result.device_programs[0] + 
for auxiliary_dev_prog in auxiliary_dev_progs: + new_dev_prog = new_dev_prog.copy( + ast=Collection([auxiliary_dev_prog, new_dev_prog.ast])) + new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:] + codegen_result = codegen_result.copy(device_programs=new_device_programs) + + # }}} + device_code_str = codegen_result.device_code() from loopy.check import check_implemented_domains diff --git a/loopy/codegen/auxiliary_kernels.py b/loopy/codegen/auxiliary_kernels.py new file mode 100644 index 0000000000000000000000000000000000000000..799ab59bf5e78fcac58bb71e6e9b61ffc7aa4b22 --- /dev/null +++ b/loopy/codegen/auxiliary_kernels.py @@ -0,0 +1,188 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import six +import islpy as isl + +from loopy.codegen import ( + ImplementedDataInfo, + CodeGenerationState) +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import ( + Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, + CInstruction, _DataObliviousInstruction) +from cgen import Collection + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: generate_auxiliary_kernel_device_code + +""" + + +# {{{ code generation for the auxiliary kernel + +def generate_auxiliary_kernel_device_code(kernel, target): + """Generate device programs for the auxiliary kernel *kernel* after + retargeting it to *target*, the target of the calling kernel. + + :returns: a :class:`CodeGenerationResult` + """ + kernel = kernel.copy(target=target) + + from loopy.kernel import kernel_state + if kernel.state == kernel_state.INITIAL: + from loopy.preprocess import preprocess_kernel + kernel = preprocess_kernel(kernel) + + if kernel.schedule is None: + from loopy.schedule import get_one_scheduled_kernel + kernel = get_one_scheduled_kernel(kernel) + + if kernel.state != kernel_state.SCHEDULED: + raise LoopyError( + "cannot generate code for a kernel that has not been " + "scheduled") + + from loopy.type_inference import infer_unknown_types + kernel = infer_unknown_types(kernel, expect_completion=True) + + from loopy.check import pre_codegen_checks + pre_codegen_checks(kernel) + + logger.info("%s: generate auxiliary kernel code: start" % kernel.name) + + # {{{ examine arg list + + from loopy.kernel.data import ValueArg + from loopy.kernel.array import ArrayBase + + implemented_data_info = [] + + for arg in kernel.args: + is_written = arg.name in kernel.get_written_variables() + if isinstance(arg, ArrayBase): + implemented_data_info.extend( + arg.decl_info( + kernel.target, + is_written=is_written, + index_dtype=kernel.index_dtype)) + + elif isinstance(arg, ValueArg): + 
implemented_data_info.append(ImplementedDataInfo( + target=kernel.target, + name=arg.name, + dtype=arg.dtype, + arg_class=ValueArg, + is_written=is_written)) + + else: + raise ValueError("argument type not understood: '%s'" % type(arg)) + + allow_complex = False + for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): + if var.dtype.involves_complex(): + allow_complex = True + + # }}} + + seen_dtypes = set() + seen_functions = set() + seen_atomic_dtypes = set() + + initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) + codegen_state = CodeGenerationState( + kernel=kernel, + implemented_data_info=implemented_data_info, + implemented_domain=initial_implemented_domain, + implemented_predicates=frozenset(), + seen_dtypes=seen_dtypes, + seen_functions=seen_functions, + seen_atomic_dtypes=seen_atomic_dtypes, + var_subst_map={}, + allow_complex=allow_complex, + var_name_generator=kernel.get_var_name_generator(), + is_generating_device_code=False, + gen_program_name=kernel.name, + schedule_index_end=len(kernel.schedule), + is_generating_master_kernel=False) + + from loopy.codegen.result import generate_host_or_device_program + + # {{{ collecting ASTs of auxiliary kernels + + auxiliary_dev_progs = [] + + # nested auxiliary kernels are handled by recursing into this function + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + in_knl_callable = kernel.scoped_functions[insn.expression.function.name] + if in_knl_callable.subkernel is not None: + auxiliary_dev_prog = generate_auxiliary_kernel_device_code( + in_knl_callable.subkernel, + kernel.target).device_programs[0].ast + auxiliary_dev_progs.append(auxiliary_dev_prog) + elif isinstance(insn, (Assignment, NoOpInstruction, + BarrierInstruction, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("register_knl not made for %s type of " + "instruction" % (str(type(insn)))) + + # }}} + + codegen_result = 
generate_host_or_device_program( + codegen_state, + schedule_index=0) + + # {{{ pasting the auxiliary functions code to the first device program + + new_dev_prog = codegen_result.device_programs[0] + for auxiliary_dev_prog in auxiliary_dev_progs: + new_dev_prog = new_dev_prog.copy( + ast=Collection([auxiliary_dev_prog, new_dev_prog.ast])) + new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:] + codegen_result = codegen_result.copy(device_programs=new_device_programs) + + # }}} + + # For faster unpickling in the common case when implemented_domains isn't needed. + from loopy.tools import LazilyUnpicklingDict + codegen_result = codegen_result.copy( + implemented_domains=LazilyUnpicklingDict( + codegen_result.implemented_domains)) + + logger.info("%s: generate auxiliary kernel code: done" % kernel.name) + + return codegen_result + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index bb88cc0916de1264ede05360554dfc1be1e7dbf0..ee44d5ea412318f2fe49be3bc5f5556546b04aa4 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -61,7 +61,7 @@ class ArrayArgDescriptor(ArgDescriptor): # }}} - super(ArgDescriptor, self).__init__(shape=None, + super(ArgDescriptor, self).__init__(shape=shape, mem_scope=mem_scope, dim_tags=dim_tags) @@ -412,6 +412,7 @@ class InKernelCallable(ImmutableRecord): new_args[id] = new_args[id].copy(shape=descr.shape, dim_tags=descr.dim_tags) + descriptor_specialized_knl = self.subkernel.copy(args=new_args) return self.copy(subkernel=descriptor_specialized_knl, diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 068953a52709f9cf869a88dad425168fa6c67cb2..eedfca6f91ad890d1defb189778080279ebb6613 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2202,9 +2202,8 @@ class ArgDescriptionInferer(CombineMapper): combined_arg_id_to_dtype)) # collecting the descriptors for args, kwargs, assignees - a = frozenset(((expr, 
new_scoped_function), )) - b = self.combine((self.rec(child) for child in expr.parameters)) - return (a | b) + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) def map_call_with_kwargs(self, expr, **kwargs): from loopy.kernel.function_intergace import ValueArgDescriptor @@ -2267,8 +2266,9 @@ def infer_arg_descr(kernel): pymbolic_calls_to_functions.update( arg_description_modifier(insn.expression, assignees=insn.assignees)) - if isinstance(insn, (MultiAssignmentBase, CInstruction)): - pymbolic_calls_to_functions.update(arg_description_modifier(insn.expression)) + elif isinstance(insn, (MultiAssignmentBase, CInstruction)): + pymbolic_calls_to_functions.update(arg_description_modifier( + insn.expression)) elif isinstance(insn, _DataObliviousInstruction): pass else: @@ -2386,20 +2386,10 @@ def preprocess_kernel(kernel, device=None): # have been established kernel = check_atomic_loads(kernel) + # Infer the shape and dim_tags of the arguments involved in function calls + # before handing the kernel to the target's preprocessing. 
kernel = infer_arg_descr(kernel) - print(75*'-') - print("This is after Type Inference") - for insn in kernel.instructions: - print(insn) - print(75*'-') - print('Linked Functions:') - for name, func in kernel.scoped_functions.items(): - print(name, "=>", (func.name, func.arg_id_to_dtype, - func.arg_id_to_descr, func.subkernel.args)) - print() - print(75*'-') - kernel = kernel.target.preprocess(kernel) logger.info("%s: preprocess done" % kernel.name) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 13460387226d79dcbc055f0eb245d11090145748..b1b1446db123fbfb0296f2bccbe14df1b9d4fb13 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -459,34 +459,8 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): expr = subst_expander(writer_insn.expression) debug(" via expr %s", expr) - if isinstance(writer_insn, lp.Assignment): - result = type_inf_mapper(expr, return_dtype_set=True) - elif isinstance(writer_insn, lp.CallInstruction): - result = type_inf_mapper(expr, return_dtype_set=True) - """ - # Maybe we need to alter this so that the type_inf_mapper returns a - # :class:`dict`? - # ask about this to Andreas Sir. - return_dtype_set = type_inf_mapper(expr, return_tuple=False, - return_dtype_set=True) - - print(return_dtype_set) - print(writer_insn.assignee_var_names()) - result = [] - for return_dtype_set in return_dtype_set: - result_i = None - found = False - for assignee, comp_dtype_set in zip( - writer_insn.assignee_var_names(), return_dtype_set): - if assignee == var_name: - found = True - result_i = comp_dtype_set - break - assert found - if result_i is not None: - result.append(result_i) - """ + result = type_inf_mapper(expr, return_dtype_set=True) debug(" result: %s", result)