diff --git a/.gitignore b/.gitignore index 5c9e73c7b6ffc90a15059b96c59c461ca0157baf..b0668bd73d410542a2ed669ac0d1c429773d1b38 100644 --- a/.gitignore +++ b/.gitignore @@ -19,5 +19,6 @@ htmlcov .ipynb_checkpoints lextab.py yacctab.py +.pytest_cache/* .cache diff --git a/loopy/__init__.py b/loopy/__init__.py index 5e8a3fb06b733183fb03c09eb6126a3eee98b916..f5d7f20e2e368d65df2cadc51ba8809c9f024131 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -33,21 +33,21 @@ from loopy.diagnostic import LoopyError, LoopyWarning # {{{ imported user interface -from loopy.library.function import ( - default_function_mangler, single_arg_function_mangler) - from loopy.kernel.instruction import ( memory_ordering, memory_scope, VarAtomicity, AtomicInit, AtomicUpdate, InstructionBase, MultiAssignmentBase, Assignment, ExpressionInstruction, CallInstruction, CInstruction, NoOpInstruction, BarrierInstruction) + from loopy.kernel.data import ( auto, KernelArgument, ValueArg, GlobalArg, ConstantArg, ImageArg, temp_var_scope, TemporaryVariable, - SubstitutionRule, - CallMangleInfo) + SubstitutionRule) + +from loopy.kernel.function_interface import (InKernelCallable, + CommonReturnTypeCallable, SpecificReturnTypeCallable) from loopy.kernel import LoopKernel, kernel_state from loopy.kernel.tools import ( @@ -114,6 +114,7 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier +from loopy.transform.register_knl import register_callable_kernel # }}} from loopy.type_inference import infer_unknown_types @@ -159,13 +160,12 @@ __all__ = [ "CallInstruction", "CInstruction", "NoOpInstruction", "BarrierInstruction", + "InKernelCallable", "SpecificReturnTypeCallable", "CommonReturnTypeCallable", + "KernelArgument", "ValueArg", "GlobalArg", "ConstantArg", "ImageArg", "temp_var_scope", "TemporaryVariable", - "SubstitutionRule", - 
"CallMangleInfo", - - "default_function_mangler", "single_arg_function_mangler", + "SubstitutionRule", "make_kernel", "UniqueName", @@ -218,6 +218,8 @@ __all__ = [ "add_barrier", + "register_callable_kernel", + # }}} "get_dot_dependency_graph", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e83515d31f1c61e52569d8d0754ce79e7a7f602f..9e6db010d55812a16b8dbe0b10094ae8e1f4b990 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -32,6 +32,12 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from cgen import Collection + +from loopy.kernel.instruction import ( + Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, + _DataObliviousInstruction) + import logging logger = logging.getLogger(__name__) @@ -187,6 +193,11 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. attribute:: is_generating_master_kernel + + True or False, indicating if the code generation is happening for a + master kernel or auxiliary kernels respectively. 
""" def __init__(self, kernel, @@ -196,7 +207,8 @@ class CodeGenerationState(object): vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + schedule_index_end=None, + is_generating_master_kernel=None): self.kernel = kernel self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain @@ -211,6 +223,7 @@ class CodeGenerationState(object): self.is_generating_device_code = is_generating_device_code self.gen_program_name = gen_program_name self.schedule_index_end = schedule_index_end + self.is_generating_master_kernel = is_generating_master_kernel # {{{ copy helpers @@ -219,7 +232,8 @@ class CodeGenerationState(object): var_subst_map=None, vectorization_info=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + schedule_index_end=None, + is_generating_master_kernel=None): if kernel is None: kernel = self.kernel @@ -242,6 +256,9 @@ class CodeGenerationState(object): if schedule_index_end is None: schedule_index_end = self.schedule_index_end + if is_generating_master_kernel is None: + is_generating_master_kernel = self.is_generating_master_kernel + return CodeGenerationState( kernel=kernel, implemented_data_info=implemented_data_info, @@ -257,7 +274,8 @@ class CodeGenerationState(object): var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, gen_program_name=gen_program_name, - schedule_index_end=schedule_index_end) + schedule_index_end=schedule_index_end, + is_generating_master_kernel=is_generating_master_kernel) def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" @@ -371,9 +389,9 @@ class PreambleInfo(ImmutableRecord): .. 
attribute:: codegen_state """ - # {{{ main code generation entrypoint + def generate_code_v2(kernel): """ :returns: a :class:`CodeGenerationResult` @@ -470,9 +488,36 @@ def generate_code_v2(kernel): kernel.target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule)) + schedule_index_end=len(kernel.schedule), + is_generating_master_kernel=True) from loopy.codegen.result import generate_host_or_device_program + # {{{ handling auxiliary kernels + + auxiliary_functions = [] + + for func, aux_knl in kernel.auxiliary_kernels.items(): + from loopy.codegen.auxiliary_kernels import ( + get_instruction_specific_kernel, + generate_auxiliary_kernel_device_code) + for insn in kernel.instructions: + if isinstance(insn, CallInstruction) and insn.is_array_call: + if insn.expression.function.name == func: + compliant_knl = get_instruction_specific_kernel( + insn, kernel, aux_knl) + # TODO: Also need to take input such as allow_complex, + # and preambles from the aux kernels + aux_func = generate_auxiliary_kernel_device_code(compliant_knl, + kernel.target).device_programs[0].ast + auxiliary_functions.append(aux_func) + elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, + BarrierInstruction, CallInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("register_knl not made for %s type of" + "instruciton" % (str(type(insn)))) + # }}} codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) @@ -510,6 +555,17 @@ def generate_code_v2(kernel): # }}} + # {{{ Pasting the auxiliary functions code to the first device program + # TODO: Currently Sticks all the functions only in the first dev_prog, + # need to identify which function goes with which kernel + new_dev_prog = codegen_result.device_programs[0] + for func in auxiliary_functions: + new_dev_prog = new_dev_prog.copy( + ast=Collection([func, new_dev_prog.ast])) + new_device_programs = 
[new_dev_prog] + codegen_result.device_programs[1:] + codegen_result = codegen_result.copy(device_programs=new_device_programs) + # }}} + # For faster unpickling in the common case when implemented_domains isn't needed. from loopy.tools import LazilyUnpicklingDict codegen_result = codegen_result.copy( diff --git a/loopy/codegen/auxiliary_kernels.py b/loopy/codegen/auxiliary_kernels.py new file mode 100644 index 0000000000000000000000000000000000000000..b1ecbc3ff7a9a5c59d3f96b220620504630300f6 --- /dev/null +++ b/loopy/codegen/auxiliary_kernels.py @@ -0,0 +1,308 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import six +import islpy as isl + +from loopy.codegen import ( + ImplementedDataInfo, + PreambleInfo, + CodeGenerationState) +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import ( + Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, + _DataObliviousInstruction) +from cgen import Collection + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: generate_auxiliary_kernel_device_code + +.. autofunction:: get_instruction_specific_kernel +""" + +# {{{ generating compliant kernel for the given instruction + + +def check_compliance(insn, parent_knl, child_knl, child_to_parent): + + # {{{ getting the parent to child mapping + + parent_to_child = {} + for child, parent in child_to_parent.items(): + parent_to_child[parent] = child + + # }}} + + # {{{ dtype compliance + for arg in child_knl.args: + name_in_parent = child_to_parent[arg.name] + parent_arg = parent_knl.arg_dict[name_in_parent] + if arg.dtype is not None: + assert arg.dtype == parent_arg.dtype, ("While registering kernel the" + "dtypes of variables don't match") + # }}} + + """ + # Disabling for now, till I have a function for finding the swept region + # {{{ axes used by the swept_inames + + parent_parameters = insn.expression.parameters + parent_assignees = insn.exression.assignees + for par in parent_parameters + parent_assignees: + inames = par.swept_inames + child_arg = child_knl.arg_dict[parent_to_child[par.name]] + + # check to ensure the equality of number of axes around both the + # kernels + assert len(child_arg.shape) == len(inames), ("regsiter_knl: The ") + + parent_swept_region = par.swept_region() + child_swept_region = child_arg.shape + + for parent_swept, child_swept in zip(parent_swept_region, + child_swept_region): + assert parent_swept == child_swept, ("regsiter_kernel: send only the" + "part of the array you intend to write to the child kernel") + """ + + # }}} + + +def 
get_instruction_specific_kernel(insn, parent_knl, child_knl): + """ Generates the kernel with the arguments strided so that it is compliant + with the given instruction. Returns the new compliant kernel. + """ + child_knl = child_knl.copy( + name=insn.expression.function.name + "_" + insn.id) + dim_tags_dict = insn.get_parameters_dim_tag_dict(parent_knl.arg_dict) + + # {{{ creating the parent to child parameter association dictionary + + child_arg_to_parent = {} + for child_par, parent_par in zip(child_knl.args, + insn.expression.parameters + insn.assignees): + child_arg_to_parent[child_par.name] = ( + parent_par.subscript.aggregate.name) + + # }}} + + check_compliance(insn, parent_knl, child_knl, child_arg_to_parent) + + new_args = [] + for arg in child_knl.args: + name_in_parent = child_arg_to_parent[arg.name] + parent_arg = parent_knl.arg_dict[name_in_parent] + child_dim_tag = dim_tags_dict[name_in_parent] + + new_args.append(arg.copy(dim_tags=child_dim_tag, dtype=parent_arg.dtype)) + + child_knl = child_knl.copy(args=new_args) + + return child_knl + +# }}} + +# {{{ code generation for the auxiliary kernel + + +def generate_auxiliary_kernel_device_code(kernel, target): + """ + Generates device programs for the given auxiliary kernel, with the target + specified by the parent kernel + :returns: a :class:`CodeGenerationResult` + """ + kernel = kernel.copy(target=target) + + from loopy.kernel import kernel_state + if kernel.state == kernel_state.INITIAL: + from loopy.preprocess import preprocess_kernel + kernel = preprocess_kernel(kernel) + + if kernel.schedule is None: + from loopy.schedule import get_one_scheduled_kernel + kernel = get_one_scheduled_kernel(kernel) + + if kernel.state != kernel_state.SCHEDULED: + raise LoopyError( + "cannot generate code for a kernel that has not been " + "scheduled") + + from loopy.type_inference import infer_unknown_types + kernel = infer_unknown_types(kernel, expect_completion=True) + + from loopy.check import 
pre_codegen_checks + pre_codegen_checks(kernel) + + logger.info("%s: generate Auxillary Kernel code: start" % kernel.name) + + # {{{ examine arg list + + from loopy.kernel.data import ValueArg + from loopy.kernel.array import ArrayBase + + implemented_data_info = [] + + for arg in kernel.args: + is_written = arg.name in kernel.get_written_variables() + if isinstance(arg, ArrayBase): + implemented_data_info.extend( + arg.decl_info( + kernel.target, + is_written=is_written, + index_dtype=kernel.index_dtype)) + + elif isinstance(arg, ValueArg): + implemented_data_info.append(ImplementedDataInfo( + target=kernel.target, + name=arg.name, + dtype=arg.dtype, + arg_class=ValueArg, + is_written=is_written)) + + else: + raise ValueError("argument type not understood: '%s'" % type(arg)) + + allow_complex = False + for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): + if var.dtype.involves_complex(): + allow_complex = True + + # }}} + + seen_dtypes = set() + seen_functions = set() + seen_atomic_dtypes = set() + + initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) + codegen_state = CodeGenerationState( + kernel=kernel, + implemented_data_info=implemented_data_info, + implemented_domain=initial_implemented_domain, + implemented_predicates=frozenset(), + seen_dtypes=seen_dtypes, + seen_functions=seen_functions, + seen_atomic_dtypes=seen_atomic_dtypes, + var_subst_map={}, + allow_complex=allow_complex, + var_name_generator=kernel.get_var_name_generator(), + is_generating_device_code=False, + gen_program_name=kernel.name, + schedule_index_end=len(kernel.schedule), + is_generating_master_kernel=False) + + from loopy.codegen.result import generate_host_or_device_program + + # {{{ handling auxiliary kernels + + auxiliary_functions = [] + + for func, aux_knl in kernel.auxiliary_kernels.items(): + from loopy.codegen.auxiliary_kernels import ( + get_instruction_specific_kernel, + generate_auxiliary_kernel_device_code) + for insn in 
kernel.instructions: + if isinstance(insn, CallInstruction): + if insn.expression.function.name == func: + compliant_knl = get_instruction_specific_kernel( + insn, kernel, aux_knl) + # TODO: Also need to take input such as allow_complex, + # and preambles from the aux kernels + aux_func = generate_auxiliary_kernel_device_code( + compliant_knl, + kernel.target).device_programs[0].ast # noqa + auxiliary_functions.append(aux_func) + elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, + BarrierInstruction, CallInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "register_knl not made for %s type of" + "instruciton" % (str(type(insn)))) + # }}} + codegen_result = generate_host_or_device_program( + codegen_state, + schedule_index=0) + + device_code_str = codegen_result.device_code() + + from loopy.check import check_implemented_domains + assert check_implemented_domains( + kernel, codegen_result.implemented_domains, device_code_str) + + # {{{ handle preambles + + for arg in kernel.args: + seen_dtypes.add(arg.dtype) + for tv in six.itervalues(kernel.temporary_variables): + seen_dtypes.add(tv.dtype) + + preambles = kernel.preambles[:] + + preamble_info = PreambleInfo( + kernel=kernel, + seen_dtypes=seen_dtypes, + seen_functions=seen_functions, + # a set of LoopyTypes (!) 
+ seen_atomic_dtypes=seen_atomic_dtypes, + codegen_state=codegen_state + ) + + preamble_generators = kernel.preamble_generators + for prea_gen in preamble_generators: + preambles.extend(prea_gen(preamble_info)) + + codegen_result = codegen_result.copy(device_preambles=preambles) + + # }}} + + # {{{ Pasting the auxiliary functions code to the first device program + # TODO: Currently Sticks all the functions only in the first dev_prog, + # need to identify which function goes with which kernel + new_dev_prog = codegen_result.device_programs[0] + for func in auxiliary_functions: + new_dev_prog = new_dev_prog.copy( + ast=Collection([func, new_dev_prog.ast])) + new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:] + codegen_result = codegen_result.copy(device_programs=new_device_programs) + # }}} + # For faster unpickling in the common case when implemented_domains isn't needed. + from loopy.tools import LazilyUnpicklingDict + codegen_result = codegen_result.copy( + implemented_domains=LazilyUnpicklingDict( + codegen_result.implemented_domains)) + + logger.info("%s: generate code: done" % kernel.name) + + return codegen_result + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index e590502fb5813af0a820d45228de8e11c35a46c8..1419f4a67d9025d61caddddf6b557fb808ff0809 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -75,7 +75,8 @@ def to_codegen_result( def generate_instruction_code(codegen_state, insn): kernel = codegen_state.kernel - from loopy.kernel.instruction import Assignment, CallInstruction, CInstruction + from loopy.kernel.instruction import (Assignment, CallInstruction, + CInstruction) if isinstance(insn, Assignment): ast = generate_assignment_instruction_code(codegen_state, insn) @@ -230,7 +231,7 @@ def generate_call_code(codegen_state, insn): # }}} - result = codegen_state.ast_builder.emit_multiple_assignment( + result = 
codegen_state.ast_builder.emit_call( codegen_state, insn) # {{{ tracing diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 038ef23ac08ce3bbc71a1fd1fce40181c6f8d9bb..be43fafdd5a25c1a3c0b2743114939da81aca0a6 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -35,14 +35,57 @@ import re from pytools import UniqueNameGenerator, generate_unique_names -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted from loopy.diagnostic import StaticValueFindingError +from loopy.kernel.function_interface import InKernelCallable + +# {{{ maybe need to remove it, but putting it over here for the moment + + +def default_callables(): + from loopy.library.reduction import reduction_callables + + tuple_callable = {"make_tuple": MakeTupleCallable()} + # TODO: the reduction_callables is empty for now. + # Will change it according to the current system + default_callables = {**reduction_callables(), **tuple_callable} + + return default_callables + + +class MakeTupleCallable(InKernelCallable): + def __init__(self, arg_id_to_dtype=None): + super(MakeTupleCallable, self).__init__(name="loopy_make_tuple") + self.arg_id_to_dtype = arg_id_to_dtype + + def copy(self, arg_id_to_dtype=None): + if arg_id_to_dtype is None: + arg_id_to_dtype = self.arg_id_to_dtype + + return MakeTupleCallable(arg_id_to_dtype) + + def with_types(self, arg_id_to_dtype): + # there's nothing to check over here, since no other class inherits it this + # will be safe just for `make_tuple` + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for id, dtype in arg_id_to_dtype.items(): + if id >= 0: + # subtracting one because the input 0 maps to the output -1 and so + # on. 
+ new_arg_id_to_dtype[-id-1] = dtype + + return self.copy(new_arg_id_to_dtype), new_arg_id_to_dtype + + def get_target_specific_name(self, target): + return self.name + + def get_preamble(self): + return "" + +# }}} + # {{{ unique var names @@ -150,6 +193,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): a mapping from substitution names to :class:`SubstitutionRule` objects + .. attribute:: auxiliary_kernels + + A dictionary of kernels that are to be mapped from their registered + function names + .. attribute:: iname_slab_increments a dictionary mapping inames to (lower_incr, @@ -196,10 +244,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): temporary_variables={}, iname_to_tag={}, substitutions={}, - function_manglers=[ - default_function_mangler, - single_arg_function_mangler, - ], + auxiliary_kernels={}, + callables=default_callables(), symbol_manglers=[], iname_slab_increments={}, @@ -281,9 +327,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): local_sizes=local_sizes, iname_to_tag=iname_to_tag, substitutions=substitutions, + auxiliary_kernels=auxiliary_kernels, cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, - function_manglers=function_manglers, + callables=callables, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, @@ -296,46 +343,25 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling + # {{{ specializing a call - def mangle_function(self, identifier, arg_dtypes, ast_builder=None): + def get_specialized_callable(self, identifier, arg_id_to_dtype, + ast_builder=None): if ast_builder is None: ast_builder = self.target.get_device_ast_builder() - - manglers = ast_builder.function_manglers() + self.function_manglers - - for mangler in manglers: - mangle_result = mangler(self, identifier, arg_dtypes) - if mangle_result is not None: - from loopy.kernel.data import CallMangleInfo - if isinstance(mangle_result, CallMangleInfo): - assert 
len(mangle_result.arg_dtypes) == len(arg_dtypes) - return mangle_result - - assert isinstance(mangle_result, tuple) - - from warnings import warn - warn("'%s' returned a tuple instead of a CallMangleInfo instance. " - "This is deprecated." % mangler.__name__, - DeprecationWarning) - - if len(mangle_result) == 2: - result_dtype, target_name = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=None) - - elif len(mangle_result) == 3: - result_dtype, target_name, actual_arg_dtypes = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=actual_arg_dtypes) - - else: - raise ValueError("unexpected size of tuple returned by '%s'" - % mangler.__name__) + # TODO: This is bad.. everytime this wants to specializze a call it tries to + # rebuild this dictionary. Not Happening in my watch ;) Will replace this! + # Maybe we need to make an attribute which would store it. Let see + callable_dict = {**self.callables, **ast_builder.callables(self)} + + if identifier in callable_dict: + guess_callable = callable_dict[identifier] + specialized_callable = guess_callable.with_types(arg_id_to_dtype) + + if specialized_callable is not None: + # the specialized callable should be a tuple + specialized_callable, new_arg_id_to_dtype = specialized_callable + return specialized_callable, new_arg_id_to_dtype return None diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 4a08c28bd8091425293892384e01d20447413cd5..92138526eb800ceb9574c84b064243946cf903f0 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -497,14 +497,16 @@ def parse_insn(groups, insn_options): if isinstance(inner_lhs_i, Lookup): inner_lhs_i = inner_lhs_i.aggregate - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef if isinstance(inner_lhs_i, Variable): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, 
(Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) + elif isinstance(inner_lhs_i, (SubArrayRef)): + assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable or subscript" % (lhs_i,)) + "be variable, subscript or SubArrayRef" % (lhs_i,)) new_lhs.append(lhs_i) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index c90e8a64b6f47a87e87c5e64d2ef930232d34894..fd2b5c7638f5589e954cdc9b03453b6724b3fcf4 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -604,34 +604,4 @@ class SubstitutionRule(ImmutableRecord): # }}} -# {{{ function call mangling - -class CallMangleInfo(ImmutableRecord): - """ - .. attribute:: target_name - - A string. The name of the function to be called in the - generated target code. - - .. attribute:: result_dtypes - - A tuple of :class:`LoopyType` instances indicating what - types of values the function returns. - - .. attribute:: arg_dtypes - - A tuple of :class:`LoopyType` instances indicating what - types of arguments the function actually receives. 
- """ - - def __init__(self, target_name, result_dtypes, arg_dtypes): - assert isinstance(result_dtypes, tuple) - - super(CallMangleInfo, self).__init__( - target_name=target_name, - result_dtypes=result_dtypes, - arg_dtypes=arg_dtypes) - -# }}} - # vim: foldmethod=marker diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..833cf57a7313d7f24fb4e23ae940274807e775d2 --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,506 @@ +from __future__ import division, absolute_import + +import numpy as np + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from loopy.types import NumpyType + + +# {{{ argument descriptors + +class ArgDescriptor(ImmutableRecord): + """Base type of argument description about the variable type that is supposed to + be encountered in a function signature. + .. attribute:: dtype + .. attribute:: mem_scope + .. attribute:: shape + .. attribute:: dim_tags + """ + + def __init__(self, + mem_scope=None, + shape=None, + dim_tags=None): + super(ArgDescriptor, self).__init__( + mem_scope=mem_scope, + shape=shape, + dim_tags=dim_tags) + + +class ValueArgDescriptor(ArgDescriptor): + """ + """ + def __init__(self): + super(ValueArgDescriptor, self).__init__() + + +class ArrayArgDescriptor(ArgDescriptor): + """ + .. attribute:: mem_scope + .. attribute:: dim_tags + """ + + def __init__(self, + mem_scope=None, + dim_tags=None): + super(ArrayArgDescriptor, self).__init__( + mem_scope=mem_scope, + dim_tags=dim_tags) + + def copy(self, dtype=None, mem_scope=None, shape=None, dim_tags=None): + # *dtype* and *shape* are accepted for call compatibility only: this + # descriptor stores neither, so they are deliberately ignored. + + if mem_scope is None: + mem_scope = self.mem_scope + + if dim_tags is None: + dim_tags = self.dim_tags + + return ArrayArgDescriptor( + mem_scope=mem_scope, + dim_tags=dim_tags) + + +# }}} + + +# {{{ in kernel callable + +class InKernelCallable(ImmutableRecord): + """ + + .. 
attribute:: name + + The name of the callable which can be encountered within a kernel. + + .. note:: + + Negative ids in the mapping attributes indicate the result arguments + + """ + + def __init__(self, name=None): + + # {{{ sanity checks + + if not isinstance(name, str): + raise LoopyError("name of a InKernelCallable should be a string") + + # }}} + + self.name = name + + super(InKernelCallable, self).__init__(name=name) + + def copy(self, name=None): + if name is None: + name = self.name + + return InKernelCallable(name=name) + + def with_types(self, arg_id_to_dtype): + """ + :arg arg_id_to_type: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.types.LoopyType` instances. + Unspecified/unknown types are not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types, + and *arg_id_to_type* is a mapping of the same form as the + argument above, however it may have more information present. + Any argument information exists both by its positional and + its keyword identifier. + """ + + raise NotImplementedError() + + def with_descrs(self, arg_id_to_descr): + """ + :arg arg_id_to_descr: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.ArrayArgDescriptor` instances. + Unspecified/unknown types are not represented in *arg_id_to_descr*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types, + and *arg_id_to_descr* is a mapping of the same form as the + argument above, however it may have more information present. 
+ Any argument information exists both by its positional and + its keyword identifier. + """ + + raise NotImplementedError() + + def with_iname_tag_usage(self, unusable, concurrent_shape): + """ + :arg unusable: a set of iname tags that may not be used in the callee. + :arg concurrent_shape: a list of tuples ``(iname_tag, bound)`` for + concurrent inames that are used in the caller but also available + for mapping by the callee. *bound* is given as a + :class:`islpy.PwAff`. + + :returns: a list of the same type as *concurrent_shape*, potentially modified + by increasing bounds or adding further iname tag entries. + + All iname tags not explicitly listed in *concurrent_shape* or *unusable* are + available for mapping by the callee. + """ + + raise NotImplementedError() + + def is_arg_written(self, arg_id): + """ + :arg arg_id: (keyword) name or position + """ + + raise NotImplementedError() + + def is_ready_for_code_gen(self): + + raise NotImplementedError() + + # {{{ code generation + + def generate_preambles(self, target): + """ This would generate the target specific preamble. + """ + raise NotImplementedError() + + def get_target_specific_name(self, target): + + raise NotImplementedError() + + def emit_call(self, target): + + raise NotImplementedError() + + # }}} + + def __eq__(self, other): + attrs = ("name", "arg_id_to_descr", "arg_id_to_keyword") + return all(getattr(self, attr, None) == getattr(other, attr, None) + for attr in attrs) + + def __hash__(self): + return hash((self.name, )) + +# }}} + + +# {{{ generic callable class + + +class CommonReturnTypeCallable(InKernelCallable): + """ A class of generic functions which have the following properties: + - Single return value + - Return type of the callable is a common dtype to all the input arguments + to the callable + + .. attribute:: name + + The name of the function as would be encountered in loopy. + + .. attribute:: specialized_dtype + + The dtype for which the function has been setup to generate code and + preambles. 
For example, the function `sin` can be specialized to either one + of the following `float sin(float x)` or `double sin(double x)`. This is not + usually expected to be an input as this removes the generality of the + callable. + + .. attribute:: kinds_allowed + + The extent up to which the function can be generalized. For example + `sin(x)` cannot have complex types as its specialized type. + + .. attribute:: arity + + The number of inputs that are to be given to the function + + """ + + def __init__(self, name=None, specialized_dtype=None, kinds_allowed=None, + arity=None): + + super(CommonReturnTypeCallable, self).__init__(name=name) + + self.specialized_dtype = specialized_dtype + self.kinds_allowed = kinds_allowed + self.arity = arity + + def copy(self, specialized_dtype=None): + if specialized_dtype is None: + specialized_dtype = self.specialized_dtype + + return type(self)(self.name, specialized_dtype, + self.kinds_allowed, self.arity) + + def with_types(self, arg_id_to_dtype): + + specialized_dtype = np.find_common_type([], [dtype.numpy_dtype + for id, dtype in arg_id_to_dtype.items() if id >= 0]) + + if self.specialized_dtype is not None and (specialized_dtype != + self.specialized_dtype): + from warnings import warn + warn("Trying to change the type of the already set function " + "-- maybe use a different class instance?") + + new_arg_id_to_dtype = arg_id_to_dtype.copy() + # checking the compliance of the arg_id_to_dtype + + if -1 not in arg_id_to_dtype: + # return type was not known earlier, now setting it to the common type + new_arg_id_to_dtype[-1] = NumpyType(specialized_dtype) + + if self.arity+1 == len(new_arg_id_to_dtype) and (specialized_dtype.kind in + self.kinds_allowed): + # the function signature matched with the current instance. 
+ # returning the function and the new_arg_id_to_dtype + for i in range(self.arity): + new_arg_id_to_dtype[i] = NumpyType(specialized_dtype) + + return (self.copy(specialized_dtype=specialized_dtype), + new_arg_id_to_dtype) + + return None + + def is_ready_for_code_gen(self): + return self.specilized_dtype is not None + + def get_target_specific_name(self, target): + raise NotImplementedError() + + def get_preamble(self, target): + raise NotImplementedError() + +# }}} + +# {{{ specific type callable class + + +class SpecificReturnTypeCallable(InKernelCallable): + """ A super class for the funcitons which cannot be listed as generic + functions. These types of Callables support explicity mentioning of the + arguments and result dtypes. + + .. attribute:: name + + The name of the function as would be encountered in loopy. + + .. attribute:: arg_id_to_dtype + + The dtype pattern of the arguments which is supposed to be used for checking + the applicability of this function in a given scenario. 
+ """ + + def __init__(self, name=None, arg_id_to_dtype=None): + + super(SpecificReturnTypeCallable, self).__init__(name=name) + + if arg_id_to_dtype is None: + LoopyError("The function signature is incomplete without the" + "`arg_id_to_dtype`") + self.arg_id_to_dtype = arg_id_to_dtype + + def with_types(self, arg_id_to_dtype): + + # Checking the number of inputs + if len([id for id in arg_id_to_dtype if id >= 0]) != len( + [id for id in self.arg_id_to_dtype if id >= 0]): + # the number of input arguments do not match + return None + + # Checking the input dtypes + for id, dtype in arg_id_to_dtype.items(): + if id in self.arg_id_to_dtype and self.arg_id_to_dtype[id] == dtype: + # dtype matched with the one given in the input + pass + else: + # did not match with the function signature and hence returning + # None + return None + + # Setting the output if not present + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for id, dtype in self.arg_id_to_dtype: + if id < 0: + # outputs + if id in new_arg_id_to_dtype and new_arg_id_to_dtype[id] != dtype: + # the output dtype had been supplied but did not match with the + # one in the function signature + return None + + new_arg_id_to_dtype[id] = dtype + + # Finally returning the types + return self.copy(), new_arg_id_to_dtype + + def is_ready_for_code_gen(self): + # everything about the function is determined at the constructor itself, + # hence always redy for codegen + return True + + def get_target_specific_name(self, target): + # defaults to the name of the function in Loopy. May change this specific to + # a target by inheriting this class and overriding this function. + return self.name + + def get_preamble(self, target): + return "" + +# }}} + +# {{{ callable kernel + + +class CallableKernel(InKernelCallable): + """ + + ..attribute:: name + + This would be the name by which the function would be called in the loopy + kernel. + + .. attribute:: subkernel + + The subkernel associated with the call. 
+ + """ + + # {{{ constructor + + def __init__(self, name=None, subkernel=None): + + super(CallableKernel, self).__init__(name=name) + + if not name == subkernel.name: + subkernel = subkernel.copy(name=name) + + self.subkernel = subkernel + + # }}} + + # {{{ copy + + def copy(self, name=None, subkernel=None): + if name is None: + name = self.name + + if subkernel is None: + subkernel = self.subkernel + + return self.__class__(name=name, + subkernel=subkernel) + + # }}} + + # {{{ with_types + + def with_types(self, arg_id_to_dtype): + + # {{{ sanity checks for arg_id_to_dtype + + for id in arg_id_to_dtype: + if not isinstance(id, str): + raise LoopyError("For Callable kernels the input should be all given" + "as KWargs") + + # }}} + + # Checking the input dtypes + for id, arg in self.subkernel.arg_dict.items(): + if id in self.subkernel.read_varibles(): + + # because we need the type of the parameters from the main kernel. It + # is necessary that we know the types from there. Hence asserting + # this condition + assert id in arg_id_to_dtype + + new_arg_dict = {} + for id, dtype in arg_id_to_dtype.items(): + # Making the type of the new arg according to the arg which has been + # called in the function. + new_arg_dict[id] = self.subkernel.arg_dict[id].copy(dtype=dtype) + + # Merging the 2 dictionaries so that to even incorporate the variables that + # were not mentioned in arg_id_to_dtype. 
+ new_arg_dict = {**self.subkernel.arg_dict, **new_arg_dict} + + # Preprocessing the kernel so that we can get the types of the other + # variables that are involved in the args + from loopy.type_inference import infer_unknown_types + pre_specialized_subkernel = self.subkernel.copy( + args=list(new_arg_dict.values)) + + # inferring the types of the written variables based on the knowledge of the + # types of the arguments supplied + specialized_kernel = infer_unknown_types(pre_specialized_subkernel, + expect_completion=True) + + new_arg_id_to_dtype = {} + for id, arg in specialized_kernel.arg_dict: + new_arg_id_to_dtype[id] = arg.dtype + + # Returning the kernel call with specialized subkernel and the corresponding + # new arg_id_to_dtype + return self.copy(subkernel=specialized_kernel), specialized_kernel.arg_dict + + # }}} + + # {{{ with_descriptors + + def with_descriptors(self, arg_id_to_descr): + for id, arg_descr in arg_id_to_descr.items(): + # The dimensions don't match => reject it + if len(arg_descr.dim_tags) != len(self.subkernel.arg_dict[id].shape): + raise LoopyError("The number of dimensions do not match between the" + "caller kernel and callee kernel for the variable name %s in" + "the callee kernel" % id) + + new_args = [] + for arg in self.subkernel.args: + if arg.name in arg_id_to_descr: + new_args.copy(arg.copy(dim_tags=arg_id_to_descr[arg.name])) + pass + else: + new_args.append(arg.copy()) + + specialized_kernel = self.subkernel.copy(args=new_args) + + new_arg_id_to_descr = {} + + for id, arg in specialized_kernel.arg_dict.items(): + new_arg_id_to_descr[id] = ArrayArgDescriptor(arg.dim_tags, "GLOBAL") + + return self.copy(subkernel=specialized_kernel), new_arg_id_to_descr + + # }}} + + # {{{ get_target_specific_name + + def get_target_specific_name(self, target): + return self.subkernel.name + + # }}} + + # {{{ get preamble + + def get_preamble(self, target): + return "" + + # }}} + +# }}} + +# vim: foldmethod=marker diff --git 
a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 9d95408acc7f1a53f1f1a7616f7d6611249c796b..c75a804aec39791576a800e22a3c4a10e11ed6f5 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -480,7 +480,7 @@ class InstructionBase(ImmutableRecord): def _get_assignee_var_name(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -499,13 +499,20 @@ def _get_assignee_var_name(expr): assert isinstance(agg, Variable) return agg.name + + elif isinstance(expr, SubArrayRef): + agg = expr.subscript.aggregate + assert isinstance(agg, Variable) + + return agg.name + else: raise RuntimeError("invalid lvalue '%s'" % expr) def _get_assignee_subscript_deps(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, get_dependencies + from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -516,6 +523,8 @@ def _get_assignee_subscript_deps(expr): return get_dependencies(expr.index) elif isinstance(expr, LinearSubscript): return get_dependencies(expr.index) + elif isinstance(expr, SubArrayRef): + return get_dependencies(expr.get_begin_subscript().index) else: raise RuntimeError("invalid lvalue '%s'" % expr) @@ -955,8 +964,8 @@ class CallInstruction(MultiAssignmentBase): forced_iname_deps_is_final=forced_iname_deps_is_final) from pymbolic.primitives import Call - from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)) and expression is not None: + from loopy.symbolic import Reduction, ArrayCall + if not isinstance(expression, (ArrayCall, Call, Reduction)) and expression is not None: raise LoopyError("'expression' argument to CallInstruction " "must be a function call") @@ -972,9 +981,10 @@ class 
CallInstruction(MultiAssignmentBase): expression = parse(expression) from pymbolic.primitives import Variable, Subscript - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef for assignee in assignees: - if not isinstance(assignee, (Variable, Subscript, LinearSubscript)): + if not isinstance(assignee, (Variable, Subscript, LinearSubscript, + SubArrayRef)): raise LoopyError("invalid lvalue '%s'" % assignee) self.assignees = assignees @@ -1018,6 +1028,28 @@ class CallInstruction(MultiAssignmentBase): result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) return result + def get_parameters_dim_tag_dict(self, arg_dict): + + from loopy.symbolic import SubArrayRef + + dim_tags_dict = {} + for par in (self.assignees + self.expression.parameters): + if isinstance(par, SubArrayRef): + arg_name = par.subscript.aggregate.name + dim_tags_dict[arg_name] = par.get_inner_dim_tags( + arg_dict[arg_name].dim_tags) + + return dim_tags_dict + + @property + def is_array_call(self): + from loopy.symbolic import SubArrayRef + for arg in self.assignees + self.expression.parameters: + if isinstance(arg, SubArrayRef): + return True + + return False + @property def atomicity(self): # Function calls can impossibly be atomic, and even the result assignment @@ -1029,33 +1061,33 @@ class CallInstruction(MultiAssignmentBase): def make_assignment(assignees, expression, temp_var_types=None, **kwargs): - if len(assignees) > 1 or len(assignees) == 0: + from loopy.symbolic import ArrayCall + if len(assignees) > 1 or len(assignees) == 0 or (isinstance(expression, + ArrayCall)): atomicity = kwargs.pop("atomicity", ()) if atomicity: raise LoopyError("atomic operations with more than one " "left-hand side not supported") from pymbolic.primitives import Call - from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)): + from loopy.symbolic import Reduction, ArrayCall + if not isinstance(expression, (ArrayCall, Call, 
Reduction)): raise LoopyError("right-hand side in multiple assignment must be " "function call or reduction, got: '%s'" % expression) - return CallInstruction( assignees=assignees, expression=expression, temp_var_types=temp_var_types, **kwargs) - else: - return Assignment( - assignee=assignees[0], - expression=expression, - temp_var_type=( - temp_var_types[0] - if temp_var_types is not None - else None), - **kwargs) + return Assignment( + assignee=assignees[0], + expression=expression, + temp_var_type=( + temp_var_types[0] + if temp_var_types is not None + else None), + **kwargs) # {{{ c instruction diff --git a/loopy/library/function.py b/loopy/library/function.py index 9d557ac9fe5c4c040608dc181b96daa812405a65..6582ba56ff3eda46ed1f6a61ed8cd81dea0c9a15 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -23,37 +23,4 @@ THE SOFTWARE. """ -def default_function_mangler(kernel, name, arg_dtypes): - from loopy.library.reduction import reduction_function_mangler - - manglers = [reduction_function_mangler, tuple_function_mangler] - for mangler in manglers: - result = mangler(kernel, name, arg_dtypes) - if result is not None: - return result - - return None - - -def single_arg_function_mangler(kernel, name, arg_dtypes): - if len(arg_dtypes) == 1: - dtype, = arg_dtypes - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo(name, (dtype,), (dtype,)) - - return None - - -def tuple_function_mangler(kernel, name, arg_dtypes): - if name == "make_tuple": - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="loopy_make_tuple", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - - return None - - # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index b8633114ddeb9d48eb33a765755302917ca27f63..9d971c3769718ad585748da830ad4181d90295e2 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -180,7 +180,13 @@ def 
random123_preamble_generator(preamble_info): )) +def random123_callables(kernel): + # This is just to test whether the rest of the code is working + return {} + + def random123_function_mangler(kernel, name, arg_dtypes): + pass try: rng_variant = FUNC_NAMES_TO_RNG[name] except KeyError: diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 0e5a093b76b8d09d331edead7c69fcc2e3134601..11f3007f8542738ab9177700e03b82d05fb2af09 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -422,6 +422,14 @@ def parse_reduction_op(name): # }}} +def reduction_callables(): + return {} + # TODO: So what's the problem over here? + # I can generate the callables for everythin except max and min, + # A long time solution should be to have a type for the array dtypes + pass + + def reduction_function_mangler(kernel, func_id, arg_dtypes): if isinstance(func_id, ArgExtOp): from loopy.target.opencl import CTarget diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 9e16c3a598246aa71e125ce3d04f372d7c90f28e..c3a84fa43ed5a1bb6045e5de2fc7334fccf5a785 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -106,6 +106,13 @@ class IdentityMapperMixin(object): def map_type_annotation(self, expr, *args): return type(expr)(expr.type, self.rec(expr.child)) + def map_array_call(self, expr, *args): + return ArrayCall(expr.function, + expr.parameters, expr.kw_parameters) + + def map_sub_array_ref(self, expr, *args): + return SubArrayRef(expr.swept_inames, expr.subscript) + map_type_cast = map_type_annotation map_linear_subscript = IdentityMapperBase.map_subscript @@ -163,6 +170,19 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index + def map_sub_array_ref(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.swept_inames, *args) + self.rec(expr.subscript, *args) + + def map_array_call(self, expr, *args): + if not self.visit(expr): + return + + return self.map_call(expr, *args) + class 
class SubArrayRef(p.Expression):
    """Represents a generalized sliced notation of an array.

    .. attribute:: swept_inames

        A tuple of the inames that sweep over the array.

    .. attribute:: subscript

        The subscript whose address space is to be referenced.

    .. attribute:: keyword

        An optional :class:`str`, or *None*.
    """

    init_arg_names = ("swept_inames", "subscript", "keyword")

    def __init__(self, swept_inames=None, subscript=None, keyword=None):

        # {{{ sanity checks

        if not isinstance(swept_inames, tuple):
            # a single iname is accepted and wrapped into a tuple
            assert isinstance(swept_inames, p.Variable)
            swept_inames = (swept_inames,)

        assert isinstance(swept_inames, tuple)

        for iname in swept_inames:
            assert isinstance(iname, p.Variable)
        assert isinstance(subscript, p.Subscript)

        if keyword is not None:
            assert isinstance(keyword, str)

        # }}}

        self.swept_inames = swept_inames
        self.subscript = subscript
        self.keyword = keyword

    def get_begin_subscript(self):
        """Return :attr:`subscript` with every swept iname replaced by 0,
        i.e. the subscript of the first element that is referenced."""
        starting_indices = tuple(
                0 if index in self.swept_inames else index
                for index in self.subscript.index_tuple)
        return p.Subscript(self.subscript.aggregate, starting_indices)

    def get_inner_dim_tags(self, arg_dim_tags):
        """Give the dim tags for the inner (swept) inames.

        :arg arg_dim_tags: the dim tags of the subscripted array, one per
            index of :attr:`subscript`.

        This is used for stride calculation in the child kernel. It might
        need to go once the stride length is calculated using the upper and
        lower bounds of the involved inames.
        """
        from loopy.kernel.array import FixedStrideArrayDimTag as DimTag
        return [
                DimTag(dim_tag.stride)
                for dim_tag, index in zip(
                    arg_dim_tags, self.subscript.index_tuple)
                if index in self.swept_inames]

    def __getinitargs__(self):
        # must line up one-to-one with init_arg_names
        return (self.swept_inames, self.subscript, self.keyword)

    def get_hash(self):
        return hash((self.__class__, self.swept_inames, self.subscript,
            self.keyword))

    def is_equal(self, other):
        return (other.__class__ == self.__class__
                and other.subscript == self.subscript
                and other.swept_inames == self.swept_inames
                and other.keyword == self.keyword)

    def stringifier(self):
        return StringifyMapper

    mapper_method = intern("map_sub_array_ref")


class ArrayCall(p.CallWithKwargs):
    """Represents a function call whose positional parameters are all
    :class:`SubArrayRef` instances.
    """

    def __init__(self, function, parameters, kw_parameters=None):

        # {{{ sanity checks

        assert isinstance(function, p.Variable)
        assert isinstance(parameters, tuple)

        for par in parameters:
            assert isinstance(par, SubArrayRef)

        # }}}

        # Normalize a missing kwargs mapping so that hashing and any code
        # calling kw_parameters.values()/items() does not fail on None.
        if kw_parameters is None:
            kw_parameters = {}

        self.function = function
        self.parameters = parameters
        self.kw_parameters = kw_parameters

    def stringifier(self):
        return StringifyMapper

    def __hash__(self):
        # sorted by keyword so the hash does not depend on insertion order
        kw_items = tuple(sorted(
            self.kw_parameters.items(), key=lambda item: item[0]))
        return hash((self.function, self.parameters, kw_items))

    mapper_method = intern("map_array_call")
@@ -1040,6 +1181,18 @@ class FunctionToPrimitiveMapper(IdentityMapper): def map_call(self, expr): from loopy.library.reduction import parse_reduction_op + # {{{ handling array calls + + encountered_sub_array_ref = False + for par in expr.parameters: + if isinstance(par, SubArrayRef): + encountered_sub_array_ref = True + break + if encountered_sub_array_ref: + return ArrayCall(expr.function, expr.parameters) + + # }}} + if not isinstance(expr.function, p.Variable): return IdentityMapper.map_call(self, expr) @@ -1098,6 +1251,18 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: return IdentityMapper.map_call(self, expr) + def map_call_with_kwargs(self, expr): + + # {{{ handling array calls + + for par in expr.parameters + tuple(expr.kw_parameters.values()): + if isinstance(par, SubArrayRef): + return ArrayCall(expr.function, expr.parameters, expr.kw_parameters) + + # }}} + + raise NotImplementedError("CallWithKwargs is only supported for ArrayCalls") + # {{{ customization to pymbolic parser @@ -1128,7 +1293,8 @@ class LoopyParser(ParserBase): return float(val) # generic float def parse_prefix(self, pstate): - from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier + from pymbolic.parser import (_PREC_UNARY, _less, _greater, _identifier, + _openbracket, _closebracket, _colon) if pstate.is_next(_less): pstate.advance() if pstate.is_next(_greater): @@ -1144,6 +1310,17 @@ class LoopyParser(ParserBase): return TypeAnnotation( typename, self.parse_expression(pstate, _PREC_UNARY)) + elif pstate.is_next(_openbracket): + pstate.advance() + pstate.expect_not_end() + swept_inames = self.parse_expression(pstate) + pstate.expect(_closebracket) + pstate.advance() + pstate.expect(_colon) + pstate.advance() + subscript = self.parse_expression(pstate, _PREC_UNARY) + return SubArrayRef(swept_inames, subscript) + else: return super(LoopyParser, self).parse_prefix(pstate) @@ -1631,6 +1808,10 @@ class BatchedAccessRangeMapper(WalkMapper): def map_type_cast(self, 
expr, inames): return self.rec(expr.child, inames) + def map_sub_array_ref(self, expr, inames): + total_inames = inames | set([iname.name for iname in expr.swept_inames]) + return self.rec(expr.subscript, total_inames) + class AccessRangeMapper(object): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a08b406f53798b4f7f6852a4f424182a75b224e4..1885b63f82517b8fe4868190c919def6e07ded1d 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -150,8 +150,8 @@ class ASTBuilderBase(object): # {{{ library - def function_manglers(self): - return [] + def callables(self, kernel): + return {} def symbol_manglers(self): return [] @@ -206,7 +206,7 @@ class ASTBuilderBase(object): def emit_assignment(self, codegen_state, insn): raise NotImplementedError() - def emit_multiple_assignment(self, codegen_state, insn): + def emit_call(self, codegen_state, insn): raise NotImplementedError() def emit_sequential_loop(self, codegen_state, iname, iname_dtype, diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 177daa02948b9c07ef1d9856dc04019e69e24897..173b91a319d55afb090491e37d1af1988c0130e7 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,9 +27,10 @@ THE SOFTWARE. import six import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo +from loopy.kernel.function_interface import (CommonReturnTypeCallable, + SpecificReturnTypeCallable) from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder -from loopy.diagnostic import LoopyError, LoopyTypeError +from loopy.diagnostic import LoopyError from cgen import Pointer, NestedDeclarator, Block from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE @@ -78,6 +79,10 @@ class DTypeRegistryWrapper(object): # {{{ preamble generator def _preamble_generator(preamble_info): + # TODO: + # No need for this! 
# {{{ C math callables


class CMathCallable(CommonReturnTypeCallable):
    """A math function from the C standard library whose arguments and
    return value share one dtype."""

    def get_target_specific_name(self, target):
        """Return the name under which the function is emitted for *target*.

        ``abs``, ``max`` and ``min`` are emitted as ``fabs``, ``fmax`` and
        ``fmin``; every other function keeps its loopy name.

        :raises LoopyError: if the callable has not been specialized to a
            dtype yet.
        """
        if not self.is_ready_for_code_gen():
            raise LoopyError("Trying to generate code for the function '%s' "
                    "before its dtype has been specialized" % self.name)
        assert isinstance(target, CTarget)

        if self.name in ["abs", "max", "min"]:
            return "f" + self.name

        return self.name

    def get_preamble(self, target):
        assert isinstance(target, CTarget)

        return r'#include <math.h>'


def collect_c_generic_callables():
    """Return a dict mapping each supported C math function name to a
    :class:`CMathCallable` restricted to floating point kinds."""
    unary_functions = ["abs", "acos", "asin", "atan", "cos", "cosh", "sin",
            "sinh", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]

    binary_functions = ["max", "min"]

    function_dict = {}

    for arity, functions in ((1, unary_functions), (2, binary_functions)):
        for func in functions:
            function_dict[func] = CMathCallable(
                    name=func, kinds_allowed=['f'], arity=arity)

    return function_dict

# }}}
"{func}_{insn}".format(func=func_id, + insn=insn.id) + arg_id_to_dtype = {} + for id, dtype in enumerate(par_dtypes): + arg_id_to_dtype[id] = dtype + + mangle_result = SpecificReturnTypeCallable( + name=func_id, + arg_dtypes=par_dtypes, + result_dtype=par_dtypes[0]) + else: + mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) if mangle_result is None: raise RuntimeError("function '%s' unknown--" "maybe you need to register a function mangler?" @@ -861,8 +848,8 @@ class CASTBuilder(ASTBuilderBase): assert mangle_result.arg_dtypes is not None - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. + if mangle_result.name == "loopy_make_tuple": + # This shortcut avoids actually having to emit a 'make_tuple' function. return self.emit_tuple_assignment(codegen_state, insn) from loopy.expression import dtype_to_type_context @@ -871,14 +858,15 @@ class CASTBuilder(ASTBuilderBase): dtype_to_type_context(self.target, tgt_dtype), tgt_dtype).expr for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] + parameters+insn.assignees, par_dtypes, mangle_result.arg_dtypes)] from loopy.codegen import SeenFunction codegen_state.seen_functions.add( SeenFunction(func_id, - mangle_result.target_name, + mangle_result.name, mangle_result.arg_dtypes)) + """ from pymbolic import var for i, (a, tgt_dtype) in enumerate( zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): @@ -891,21 +879,21 @@ class CASTBuilder(ASTBuilderBase): ecm(a, PREC_NONE, dtype_to_type_context(self.target, tgt_dtype), tgt_dtype).expr)) + """ from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: - from cgen import ExpressionStatement - return ExpressionStatement( - 
CExpression(self.get_c_expression_to_code_mapper(), result)) + result = var(mangle_result.name)(*c_parameters) result = ecm.wrap_in_typecast( - mangle_result.result_dtypes[0], + mangle_result.result_dtype, assignee_var_descriptors[0].dtype, result) + if insn.is_array_call: + from cgen import ExpressionStatement + return ExpressionStatement( + CExpression(self.get_c_expression_to_code_mapper(), result)) + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) from cgen import Assign diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index c111a02b75243b10de90b2d18d62e3759c575fa8..f13f66158e61654d6d94e3005d66599aa5739297 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -40,6 +40,7 @@ from pymbolic import var from loopy.expression import dtype_to_type_context from loopy.type_inference import TypeInferenceMapper +from loopy.kernel.function_interface import InKernelCallable from loopy.diagnostic import LoopyError, LoopyWarning from loopy.tools import is_integer @@ -165,6 +166,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_tagged_variable(self, expr, type_context): return var(expr.name) + def map_sub_array_ref(self, expr, type_context): + return var("&")(self.rec(expr.get_begin_subscript(), + type_context)) + def map_subscript(self, expr, type_context): def base_impl(expr, type_context): return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')] @@ -431,30 +436,37 @@ class ExpressionToCExpressionMapper(IdentityMapper): if isinstance(identifier, Variable): identifier = identifier.name + arg_id_to_dtype = {} + for id, par in enumerate(expr.parameters): + arg_id_to_dtype[id] = self.infer_type(par) + par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) processed_parameters = None - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) + specialized_function, 
new_arg_id_to_dtype = ( + self.kernel.get_specialized_callable( + identifier, arg_id_to_dtype, + ast_builder=self.codegen_state.ast_builder)) - if mangle_result is None: + if specialized_function is None: raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" + "maybe you need to register a function?" % identifier) - if len(mangle_result.result_dtypes) != 1: + if not isinstance(specialized_function, InKernelCallable): raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - if mangle_result.arg_dtypes is not None: + new_par_dtypes = tuple(new_arg_id_to_dtype[id] for id in + sorted(new_arg_id_to_dtype) if id >= 0) + if new_arg_id_to_dtype is not None: processed_parameters = tuple( self.rec(par, dtype_to_type_context(self.kernel.target, tgt_dtype), tgt_dtype) for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) + expr.parameters, par_dtypes, new_par_dtypes)) else: # /!\ FIXME For some functions (e.g. 
# {{{ CUDA math callables

# Imported next to its only user: the module's import block is not part of
# this change.
from loopy.kernel.function_interface import CommonReturnTypeCallable


# Maps the name of a multi-argument function to its number of arguments.
_CUDA_SIMPLE_MULTI_ARG_FUNCTIONS = {
        "atan2": 2
        }


class CudaMathCallable(CommonReturnTypeCallable):
    """A CUDA math function whose arguments and return value share one
    dtype."""

    def get_target_specific_name(self, target):
        """Return the name under which the function is emitted for *target*.

        ``abs``, ``max`` and ``min`` are prefixed with ``f``; a suffix of
        ``f`` or ``l`` is appended for single and extended precision.

        :raises LoopyError: if the callable has not been specialized to a
            dtype yet.
        :raises LoopyTypeError: if the specialized dtype is not a supported
            floating point type.
        """
        if not self.is_ready_for_code_gen():
            raise LoopyError("Trying to generate code for the function '%s' "
                    "before its dtype has been specialized" % self.name)
        assert isinstance(target, CudaTarget)

        if self.name in ["abs", "max", "min"]:
            target_name = "f" + self.name
        else:
            target_name = self.name

        dtype = self.specialized_dtype

        if dtype == np.float64:
            pass  # fabs
        elif dtype == np.float32:
            target_name = target_name + "f"  # fabsf
        elif hasattr(np, "float128") and dtype == np.float128:
            # np.float128 is not available on every platform
            target_name = target_name + "l"  # fabsl
        else:
            from loopy.diagnostic import LoopyTypeError
            raise LoopyTypeError("%s does not support type %s"
                    % (self.name, dtype))

        return target_name

    def get_preamble(self, target):
        assert isinstance(target, CudaTarget)

        return ""

    # kept so that code using the earlier spelling keeps working
    get_preambles = get_preamble


def collect_cuda_generic_callables(collectible_dict):
    """Add a :class:`CudaMathCallable` for every supported CUDA math function
    to *collectible_dict* and return it.

    :arg collectible_dict: a dict mapping function names to callables. It is
        modified in place.

    :raises LoopyError: if one of the names is already in *collectible_dict*.
    """
    unary_functions = ["abs", "acos", "asin", "atan", "cos", "cosh", "sin",
            "sinh", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]

    binary_functions = ["max", "min"]

    name_and_arity = (
            [(func, 1) for func in unary_functions]
            + [(func, 2) for func in binary_functions]
            + list(_CUDA_SIMPLE_MULTI_ARG_FUNCTIONS.items()))

    for func, arity in name_and_arity:
        if func in collectible_dict:
            raise LoopyError("Cannot map the same name to different generic "
                    "function types")

        collectible_dict[func] = CudaMathCallable(
                name=func, kinds_allowed=['f'], arity=arity)

    # FIXME: dot is yet to be implemented

    return collectible_dict

# }}}
isinstance(name, str): - return None +class CLMathCallable(CommonReturnTypeCallable): + def get_target_specific_name(self, target): + if not self.is_ready_for_codegen(): + raise LoopyError("The function %s is not ready for codegen" % self.name) + assert isinstance(target, OpenCLTarget) + + if self.name in ["abs", "max", "min"]: + target_name = "f" + self.name + + return target_name + + def get_preamble(self, target): + return "" + + +class CLSpecificCallable(SpecificReturnTypeCallable): + pass + + +def collect_cl_generic_callables(): + unary_functions = ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] + + binary_functions = ["max", "min"] - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: + function_dict = {} + + for func in unary_functions: + function_dict[func] = CLMathCallable(name=func, 
kinds_allowed=['f'], + arity=1) + + for func in binary_functions: + function_dict[func] = CLMathCallable(name=func, kinds_allowed=['f', 'i', + 'u'], + arity=2) + + for func, num_args in _CL_SIMPLE_MULTI_ARG_FUNCTIONS.items(): + function_dict[func] = CLMathCallable(name=func, kinds_allowed=['f'], + arity=num_args) + + for name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + function_dict[name] = CLMathCallable(name, kinds_allowed=[dtype.kind], + arity=count) + + return function_dict + + +''' +Dont think this is necessary anymore +def collect_cl_specific_callables(kernel): + function_dict = {} + for name in VECTOR_LITERAL_FUNCS: base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - if count != len(arg_dtypes): - return None + arg_id_to_dtype = {} + for i in range(count): + arg_id_to_dtype[i] = NumpyType(dtype) + arg_id_to_dtype[-1] = kernel.target.vector_dtype(NumpyType(dtype), count) - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) + function_dict[name] = CLSpecificCallable(name, arg_id_to_dtype) + + return function_dict +''' - return None # }}} @@ -356,7 +367,6 @@ class OpenCLTarget(CTarget): vec.types[base.numpy_dtype, count], target=self) - # }}} # }}} @@ -366,13 +376,11 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_manglers(self): + def callables(self, kernel): return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super(OpenCLCASTBuilder, self).function_manglers()) + {**super(OpenCLCASTBuilder, self).callables(kernel), + **collect_cl_generic_callables(), + **collect_cl_specific_callables(kernel)}) def symbol_manglers(self): return ( @@ -400,6 +408,8 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) 
+ if not codegen_state.is_generating_master_kernel: + return fdecl fdecl = fdecl.subdecl from cgen.opencl import CLKernel, CLRequiredWorkGroupSize diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 744c03d8ed091bc0f05e4fc41aa14e88ec89276a..18fbf45fcca8970d52082336bc5db28fa4b7bb0c 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,7 +31,7 @@ from six.moves import range import numpy as np -from loopy.kernel.data import CallMangleInfo +from loopy.kernel.function_interface import CommonReturnTypeCallable from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType @@ -199,36 +199,34 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +class PyOpenCLMathCallable(CommonReturnTypeCallable): + def get_target_specific_name(self, target): + assert isinstance(target, PyOpenCLTarget) - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" + dtype = self.specialized_dtype + target_name = self.name + + if self.name in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj"] and dtype.kind == 'c': + if dtype.numpy_dtype == np.complex64: + target_name = target_name + "cfloat" + elif dtype.numpy_dtype == np.complex128: + target_name = target_name + "cdouble" else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) + raise RuntimeError("unexpected complex type '%s'" % dtype) + + return target_name - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - 
target_name="%s_%s" % (tpname, name), - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) +def collect_pyopencl_generic_callables(kernel): + function_dict = {} + for name in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", "tanh", + "conj"]: + function_dict[name] = PyOpenCLMathCallable(name=name, kinds_allowed=['f', + 'c'], arity=1) - return None + # TODO: Need to add real, imag, abs for complex numbers + return function_dict # {{{ preamble generator @@ -739,13 +737,12 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler + def callables(self, kernel): + from loopy.library.random123 import random123_callables return ( - super(PyOpenCLCASTBuilder, self).function_manglers() + [ - pyopencl_function_mangler, - random123_function_mangler - ]) + {**super(PyOpenCLCASTBuilder, self).callables(kernel), + **collect_pyopencl_generic_callables(kernel), + **random123_callables(kernel)}) def preamble_generators(self): from loopy.library.random123 import random123_preamble_generator diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py new file mode 100644 index 0000000000000000000000000000000000000000..8743610049b8e8ea4a41d80cbd858a6612620175 --- /dev/null +++ b/loopy/transform/register_knl.py @@ -0,0 +1,92 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above 
copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.kernel import LoopKernel +from loopy.diagnostic import LoopyError + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_callable_kernel +""" + +# {{{ main entrypoint + + +def register_callable_kernel(parent, function_name, child): + """ + The purpose of this transformation is so that one can invoke the child + kernel in the parent kernel. + + :arg parent + + This is the "main" kernel which will mostly remain unaltered and one + can interpret it as stitching up the child kernel in the parent kernel. + + :arg function_name + + The name of the function call with which the child kernel must be + associated in the parent kernel + + :arg child + + This is like a function in every other language and this might be + invoked in one of the instructions of the parent kernel. + + .. note:: + + One should note that the kernels would go under stringent compatibility + tests so that both of them can be confirmed to be made for each other. + """ + + # {{{ Sanity Checks + + assert isinstance(parent, LoopKernel) + assert isinstance(child, LoopKernel) + assert isinstance(function_name, str) + assert function_name not in parent.auxiliary_kernels, ( + "%s has already been used with some other kernel.
One" + "function can only be associated with a single kernel" % ( + function_name)) + + # }}} + + from loopy.kernel.function_interface import CallableKernel + callable_kernel = CallableKernel(name=function_name, subkernel=child) + new_auxiliary_kernels = parent.auxiliary_kernels + new_auxiliary_kernels[function_name] = child + + # somehow need to add a new element to the dictionary of the parent_knl + new_callable_dict = parent.callable_dict + if function_name in new_callable_dict: + raise LoopyError("Cant assign the name of a kernel function with a default" + "function's name.") + + new_callable_dict[function_name] = callable_kernel + + return parent.copy(callable_dict=new_callable_dict) + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/type_inference.py b/loopy/type_inference.py index fcf8f965b68fd258b0c0f1eae94ec84a39a5b7ee..e4334c328353f0214969cb35d900a1de5b04804f 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -35,6 +35,8 @@ from loopy.diagnostic import ( TypeInferenceFailure, DependencyTypeInferenceFailure) import logging + + logger = logging.getLogger(__name__) @@ -270,17 +272,23 @@ class TypeInferenceMapper(CombineMapper): if None in arg_dtypes: return [] - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) + arg_id_to_dtype = {} + for id, dtype in enumerate(arg_dtypes): + arg_id_to_dtype[id] = dtype + + specialized_callable = self.kernel.get_specialized_callable(identifier, + arg_id_to_dtype) + if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] + if specialized_callable is not None: + _, new_arg_id_to_dtype = specialized_callable + result_dtypes = (dtype for id, dtype in new_arg_id_to_dtype + if id < 0) + return [result_dtypes] else: - if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct assignments") - - return 
[mangle_result.result_dtypes[0]] + if specialized_callable is not None: + _, new_arg_id_to_dtype = specialized_callable + return [new_arg_id_to_dtype[-1]] raise RuntimeError("unable to resolve " "function '%s' with %d given arguments" @@ -399,6 +407,11 @@ class TypeInferenceMapper(CombineMapper): return [expr.operation.result_dtypes(self.kernel, rec_result)[0] for rec_result in rec_results] + def map_sub_array_ref(self, expr): + return self.rec(expr.subscript) + + map_array_call = map_call + # }}} diff --git a/test/test_transform.py b/test/test_transform.py index 0e10db362f36b7fc258059c2ec7ed1a344b97212..0f3adc41bb30aa27d4b70f1513cfd3df2afb1400 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -178,6 +178,67 @@ def test_add_barrier(ctx_factory): assert (np.linalg.norm(out-2*a.T) < 1e-16) +def test_register_knl(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 4 + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + z_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + child_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """, + kernel_data=[ + lp.GlobalArg( + name='a', + dtype=np.float64, + shape=lp.auto), + lp.GlobalArg( + name='b', + dtype=np.float64, + shape=lp.auto), + lp.GlobalArg( + name='c', + dtype=np.float64, + shape=lp.auto), '...'] + ) + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]:y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='z', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel(parent_knl, 'linear_combo', child_knl) + + evt, (out, ) = knl(queue, 
x=x_dev, y=y_dev, z=z_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert (np.linalg.norm(2*x_host+3*y_host-out.get())/( + np.linalg.norm(2*x_host+3*y_host))) < 1e-15 + + def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx)