From 2f430adffb1d2eb4933f2c6ec93eb951f3927c19 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <kgk2@illinois.edu> Date: Mon, 2 Jul 2018 20:24:57 -0500 Subject: [PATCH] Hunk edits to isolate the new function interface --- doc/index.rst | 1 + loopy/__init__.py | 8 + loopy/check.py | 102 +++++++- loopy/codegen/__init__.py | 54 ++++ loopy/kernel/__init__.py | 49 ++-- loopy/kernel/creation.py | 156 +++++++++++- loopy/kernel/tools.py | 8 + loopy/library/function.py | 39 +++ loopy/library/random123.py | 104 ++++---- loopy/library/reduction.py | 216 +++++++--------- loopy/preprocess.py | 359 +++++++++++++++++++++++++++ loopy/statistics.py | 9 +- loopy/symbolic.py | 86 ++++++- loopy/target/__init__.py | 7 +- loopy/target/c/__init__.py | 233 ++++++++--------- loopy/target/c/codegen/expression.py | 84 ++----- loopy/target/cuda.py | 84 +++++-- loopy/target/opencl.py | 182 +++++++++----- loopy/target/pyopencl.py | 110 +++++--- loopy/target/python.py | 52 ++-- loopy/transform/diff.py | 9 +- loopy/type_inference.py | 183 ++++++++++++-- test/testlib.py | 40 +++ 23 files changed, 1616 insertions(+), 559 deletions(-) diff --git a/doc/index.rst b/doc/index.rst index d862a8acd..0644b34c4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -68,6 +68,7 @@ Please check :ref:`installation` to get started. ref_creation ref_kernel ref_transform + ref_call ref_other misc diff --git a/loopy/__init__.py b/loopy/__init__.py index f50ce237c..d541f1dae 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -51,6 +51,8 @@ from loopy.kernel.data import ( TemporaryVariable, SubstitutionRule, CallMangleInfo) +from loopy.kernel.function_interface import ( + ScalarCallable) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -119,6 +121,8 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier +from loopy.transform.callable import register_function_lookup + # }}} from loopy.type_inference import infer_unknown_types @@ -168,6 +172,8 @@ __all__ = [ "CallInstruction", "CInstruction", "NoOpInstruction", "BarrierInstruction", + "ScalarCallable", + "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", "AddressSpace", "temp_var_scope", # temp_var_scope is deprecated @@ -230,6 +236,8 @@ __all__ = [ "add_barrier", + "register_function_lookup", + # }}} "get_dot_dependency_graph", diff --git a/loopy/check.py b/loopy/check.py index 84f3b04e0..dd96c1ba6 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -27,9 +27,13 @@ from six.moves import range from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper +from loopy.symbolic import WalkMapper, CombineMapper, ScopedFunction from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -55,6 +59,74 @@ def check_identifiers_in_subst_rules(knl): "kernel-global identifiers" % (knl.name, ", ".join(deps-rule_allowed_identifiers))) + +class UnscopedCallCollector(CombineMapper): + """ + Collects all the unscoped calls within a kernel. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline. + """ + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from loopy.library.reduction import ArgExtOp + if not isinstance(expr.function, (ScopedFunction, ArgExtOp)): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if not isinstance(expr.function, ScopedFunction): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_scoped(kernel): + """ Checks if all the calls in the instruction expression have been scoped, + otherwise indicates to what all calls we await signature. Refer + :class:`loopy.symbolic.ScopedFunction` for a detailed explanation of a + scoped function. + """ + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnscopedCallCollector()(subst_expander( + insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' obtained -- register a " + "function or a kernel corresponding to it." % + set(unscoped_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown type of instruction %s" % type(insn).__name__) + # }}} @@ -113,6 +185,18 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def _get_all_unique_iname_tags(kernel): + """Returns a set of all the iname tags used in *kernel* that + inherit from :class:`loopy.kernel.data.UniqueTag`. + """ + from loopy.kernel.data import UniqueTag + iname_tags = [kernel.iname_to_tag.get(iname) for iname in + kernel.all_inames()] + return set( + tag for tag in iname_tags if + isinstance(tag, UniqueTag)) + + def check_multiple_tags_allowed(kernel): from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag, UnrollTag, ForceSequentialTag, IlpBaseTag, filter_iname_tags_by_type) @@ -129,6 +213,7 @@ def check_multiple_tags_allowed(kernel): def check_for_double_use_of_hw_axes(kernel): from loopy.kernel.data import UniqueTag + from loopy.kernel.instruction import CallInstruction for insn in kernel.instructions: insn_tag_keys = set() @@ -141,6 +226,21 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) + # check usage of iname tags in the callee kernel + if isinstance(insn, CallInstruction): + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + # check for collision in iname_tag keys in the instruction + # due to the callee kernel + common_iname_tags = [tag for tag in + _get_all_unique_iname_tags(in_knl_callable.subkernel) + if tag.key in insn_tag_keys] + if common_iname_tags: + raise LoopyError("instruction '%s' has multiple " + "inames tagged '%s'" % (insn.id, + common_iname_tags.pop())) + def check_for_inactive_iname_access(kernel): for insn in kernel.instructions: diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1b..16fef45b5 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -32,6 +32,16 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from cgen import Collection +from loopy.symbolic import CombineMapper + +from loopy.kernel.instruction import ( + Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, + CInstruction, _DataObliviousInstruction, MultiAssignmentBase) + +from functools import reduce + + import logging logger = logging.getLogger(__name__) @@ -362,6 +372,32 @@ code_gen_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) +class InKernelCallablesCollector(CombineMapper): + """ + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. + """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_scoped_function(self, expr): + return frozenset([self.kernel.scoped_functions[ + expr.name]]) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + class PreambleInfo(ImmutableRecord): """ .. attribute:: kernel @@ -506,6 +542,24 @@ def generate_code_v2(kernel): for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) + # {{{ collect preambles from all the in kernel callables. + + in_knl_callable_collector = InKernelCallablesCollector(kernel) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + for in_knl_callable in in_knl_callable_collector(insn.expression): + preambles.extend(in_knl_callable.generate_preambles(kernel.target)) + + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown instruction type '%s'" + % type(insn).__name__) + + # }}} + codegen_result = codegen_result.copy(device_preambles=preambles) # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6b0033808..e89455d30 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -37,10 +37,6 @@ import re from pytools import UniqueNameGenerator, generate_unique_names -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted from loopy.diagnostic import StaticValueFindingError @@ -186,6 +182,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: function_manglers .. attribute:: symbol_manglers + .. attribute:: function_scopers + + A list of functions of signature ``(target, name)`` returning a + :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + .. attribute:: substitutions a mapping from substitution names to @@ -238,6 +239,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): iname_to_tags=None, substitutions=None, function_manglers=None, + function_scopers=None, + scoped_functions={}, symbol_manglers=[], iname_slab_increments=None, @@ -277,15 +280,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if substitutions is None: substitutions = {} if function_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] - if symbol_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] + function_manglers = [] if iname_slab_increments is None: iname_slab_increments = {} @@ -348,6 +343,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT + if function_scopers is None: + # populate the function scopers from the target and the loopy + # specific callable scopers + + from loopy.library.function import loopy_specific_callable_scopers + function_scopers = [loopy_specific_callable_scopers] + ( + target.get_device_ast_builder().function_scopers()) + ImmutableRecordWithoutPickling.__init__(self, domains=domains, instructions=instructions, @@ -367,6 +370,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, function_manglers=function_manglers, + function_scopers=function_scopers, + scoped_functions=scoped_functions, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, @@ -380,7 +385,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling + # {{{ function mangling/scoping def mangle_function(self, identifier, arg_dtypes, ast_builder=None): if ast_builder is None: @@ -423,6 +428,20 @@ class LoopKernel(ImmutableRecordWithoutPickling): return None + def find_scoped_function_identifier(self, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if the + :arg:`identifier` is known to any kernel function scoper, otherwise returns + *None*. + """ + for scoper in self.function_scopers: + in_knl_callable = scoper(self.target, identifier) + if in_knl_callable: + return in_knl_callable + + return None + # }}} # {{{ symbol mangling @@ -1505,7 +1524,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): "preamble_generators", "function_manglers", + "function_scopers", "symbol_manglers", + "scoped_functions", ) def update_persistent_hash(self, key_hash, key_builder): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index c2b54cf8b..8b371b47d 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -24,16 +24,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - import numpy as np from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids -from loopy.symbolic import IdentityMapper, WalkMapper +from loopy.symbolic import ( + IdentityMapper, WalkMapper, SubArrayRef, + RuleAwareIdentityMapper) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace) +from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, + CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -1139,7 +1143,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() - from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace + from loopy.kernel.data import ValueArg, ArrayArg import loopy as lp if arg_name in self.all_params: @@ -1835,6 +1839,148 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # }}} +# {{{ scope functions + +class FunctionScoper(RuleAwareIdentityMapper): + """ + Mapper to convert the ``function`` attribute of a + :class:`pymbolic.primitives.Call` known in the kernel as instances of + :class:`loopy.symbolic.ScopedFunction`. A function is known in the + *kernel*, :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` + returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + + log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + + unknown_function(y) + ScopedFunction('log')(z)``. + + :arg rule_mapping_context: An instance of + :class:`loopy.symbolic.RuleMappingContext`. + :arg function_ids: A container with instances of :class:`str` indicating + the function identifiers to look for while scoping functions. + """ + def __init__(self, rule_mapping_context, kernel): + super(FunctionScoper, self).__init__(rule_mapping_context) + self.kernel = kernel + self.scoped_functions = {} + + def map_call(self, expr, expn_state): + from loopy.symbolic import ScopedFunction + if not isinstance(expr.function, ScopedFunction): + + # search the kernel for the function + in_knl_callable = self.kernel.find_scoped_function_identifier( + expr.function.name) + if in_knl_callable: + # associate the newly created ScopedFunction with the + # resolved in-kernel callable + self.scoped_functions[expr.function.name] = in_knl_callable + + return type(expr)( + ScopedFunction(expr.function.name), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + + # this is an unknown function as of yet, do not modify it + return super(FunctionScoper, self).map_call(expr, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + # FIXME duplicated logic with map_call + + from loopy.symbolic import ScopedFunction + if not isinstance(expr.function, ScopedFunction): + + # search the kernel for the function. + in_knl_callable = self.kernel.find_scoped_function_identifier( + expr.function.name) + + if in_knl_callable: + # associate the newly created ScopedFunction with the + # resolved in-kernel callable + self.scoped_functions[expr.function.name] = in_knl_callable + return type(expr)( + ScopedFunction(expr.function.name), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + + # this is an unknown function as of yet, do not modify it + return super(FunctionScoper, self).map_call_with_kwargs(expr, + expn_state) + + def map_reduction(self, expr, expn_state): + from loopy.library.reduction import (MaxReductionOperation, + MinReductionOperation, ArgMinReductionOperation, + ArgMaxReductionOperation, _SegmentedScalarReductionOperation, + SegmentedOp) + from loopy.library.reduction import ArgExtOp + + # note down the extra functions arising due to certain reductions + + # FIXME Discuss this. It cannot stay the way it is, because non-built-in + # reductions cannot add themselves to this list. We may need to change + # the reduction interface. Why don't reductions generate scoped functions + # in the first place? + if isinstance(expr.operation, MaxReductionOperation): + self.scoped_functions["max"] = ( + self.kernel.find_scoped_function_identifier("max")) + elif isinstance(expr.operation, MinReductionOperation): + self.scoped_functions["min"] = ( + self.kernel.find_scoped_function_identifier("min")) + elif isinstance(expr.operation, ArgMaxReductionOperation): + self.scoped_functions["max"] = ( + self.kernel.find_scoped_function_identifier("max")) + self.scoped_functions["make_tuple"] = ( + self.kernel.find_scoped_function_identifier("make_tuple")) + self.scoped_functions[ArgExtOp(expr.operation)] = ( + self.kernel.find_scoped_function_identifier(expr.operation)) + elif isinstance(expr.operation, ArgMinReductionOperation): + self.scoped_functions["min"] = ( + self.kernel.find_scoped_function_identifier("min")) + self.scoped_functions["make_tuple"] = ( + self.kernel.find_scoped_function_identifier("make_tuple")) + self.scoped_functions[ArgExtOp(expr.operation)] = ( + self.kernel.find_scoped_function_identifier(expr.operation)) + elif isinstance(expr.operation, _SegmentedScalarReductionOperation): + self.scoped_functions["make_tuple"] = ( + self.kernel.find_scoped_function_identifier("make_tuple")) + self.scoped_functions[SegmentedOp(expr.operation)] = ( + self.kernel.find_scoped_function_identifier(expr.operation)) + + return super(FunctionScoper, self).map_reduction(expr, expn_state) + + +def scope_functions(kernel): + """ + Returns a kernel with the pymbolic nodes involving known functions realized + as instances of :class:`loopy.symbolic.ScopedFunction`, along with the + resolved functions being added to the ``scoped_functions`` dictionary of + the kernel. + """ + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + function_scoper = FunctionScoper(rule_mapping_context, kernel) + + # scoping fucntions and collecting the scoped functions + kernel_with_scoped_functions = rule_mapping_context.finish_kernel( + function_scoper.map_kernel(kernel)) + + # updating the functions collected during the scoped functions + updated_scoped_functions = kernel.scoped_functions.copy() + updated_scoped_functions.update(function_scoper.scoped_functions) + + return kernel_with_scoped_functions.copy( + scoped_functions=updated_scoped_functions) + +# }}} + + # {{{ kernel creation top-level def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): @@ -2174,6 +2320,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + knl = scope_functions(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 95c3c336c..1d79a86d7 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1877,7 +1877,15 @@ def infer_arg_is_output_only(kernel): else: new_args.append(arg.copy(is_output_only=False)) elif isinstance(arg, ConstantArg): +<<<<<<< HEAD + if arg.is_output_only: + raise LoopyError("Constant Argument %s cannot have " + "is_output_only True" % arg.name) + else: + new_args.append(arg.copy(is_output_only=False)) +======= new_args.append(arg) +>>>>>>> master else: raise NotImplementedError("Unkonwn argument type %s." % type(arg)) diff --git a/loopy/library/function.py b/loopy/library/function.py index 9d557ac9f..4873eca91 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -22,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from loopy.kernel.function_interface import ScalarCallable + def default_function_mangler(kernel, name, arg_dtypes): from loopy.library.reduction import reduction_function_mangler @@ -56,4 +58,41 @@ def tuple_function_mangler(kernel, name, arg_dtypes): return None +class MakeTupleCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple") + + def with_descrs(self, arg_id_to_descr): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), + (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) + + return self.copy(arg_id_to_descr=new_arg_id_to_descr) + + +class IndexOfCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = kernel.index_dtype + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + + +def loopy_specific_callable_scopers(target, identifier): + if identifier == "make_tuple": + return MakeTupleCallable(name="make_tuple") + + if identifier in ["indexof", "indexof_vec"]: + return IndexOfCallable(name=identifier) + + from loopy.library.reduction import reduction_scoper + return reduction_scoper(target, identifier) + + # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index b8633114d..a2880bfb8 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -27,6 +27,7 @@ THE SOFTWARE. from pytools import ImmutableRecord from mako.template import Template +from loopy.kernel.function_interface import ScalarCallable import numpy as np @@ -163,60 +164,73 @@ double${ width } ${ name }_f64( # }}} -def random123_preamble_generator(preamble_info): - for f in preamble_info.seen_functions: - try: - rng_variant = FUNC_NAMES_TO_RNG[f.name] - except KeyError: - continue +class Random123Callable(ScalarCallable): + """ + Records information about for the random123 functions. + """ + + def with_types(self, arg_id_to_dtype, kernel): + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + name = self.name + target = kernel.target + + rng_variant = FUNC_NAMES_TO_RNG[name] + + from loopy.types import NumpyType + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + fn = rng_variant.full_name + if name == fn: + new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=fn+"_gen") + + elif name == fn + "_f32": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name) + + elif name == fn + "_f64": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name) + + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + def generate_preambles(self, target): + rng_variant = FUNC_NAMES_TO_RNG[self.name] from loopy.target.pyopencl import PyOpenCLTarget yield ("90-random123-"+rng_variant.full_name, PREAMBLE_TEMPLATE.render( is_pyopencl_target=isinstance( - preamble_info.kernel.target, + target, PyOpenCLTarget), rng_variant=rng_variant, )) + return -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - except KeyError: - return None - - from loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None + +def random123_function_scoper(target, identifier): + if identifier in FUNC_NAMES_TO_RNG: + return Random123Callable(name=identifier) + + return None # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 8ed5cbe56..ca2f02347 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -24,6 +24,8 @@ THE SOFTWARE. from pymbolic import var +from loopy.symbolic import ScopedFunction +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier @@ -180,7 +182,7 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("max")(operand1, operand2) + return ScopedFunction("max")(operand1, operand2) class MinReductionOperation(ScalarReductionOperation): @@ -188,7 +190,7 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("min")(operand1, operand2) + return ScopedFunction("min")(operand1, operand2) # {{{ base class for symbolic reduction ops @@ -237,7 +239,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype): scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ScopedFunction("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): @@ -254,7 +256,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return SegmentedOp(self)(*(operand1 + operand2)) + return ScopedFunction(SegmentedOp(self))(*(operand1 + operand2)) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -268,29 +270,6 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): op = "((%s) * (%s))" which = "product" - -def get_segmented_function_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) - # }}} @@ -313,7 +292,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ScopedFunction("make_tuple")(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)) def __str__(self): @@ -330,7 +309,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + return ScopedFunction(ArgExtOp(self))(*(operand1 + operand2)) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -344,38 +323,6 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): update_comparison = "<=" neutral_sign = +1 - -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { - *index_out = index2; - return op2; - } - else - { - *index_out = index1; - return op1; - } - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=kernel.target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) - # }}} @@ -429,70 +376,91 @@ def parse_reduction_op(name): # }}} -def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, index_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, index_dtype), - arg_dtypes=( - scalar_dtype, - index_dtype, - scalar_dtype, - index_dtype), - ) - - elif isinstance(func_id, SegmentedOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, segment_flag_dtype), - arg_dtypes=( - scalar_dtype, - segment_flag_dtype, - scalar_dtype, - segment_flag_dtype), - ) +# {{{ reduction specific callables + +class ReductionCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = result_dtypes[0] + new_arg_id_to_dtype[-2] = result_dtypes[1] + name_in_target = self.name.prefix(scalar_dtype, index_dtype) + "_op" + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name_in_target) + + def with_descr(self, arg_id_to_descr): + from loopy.library.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = arg_id_to_descr.copy() + new_arg_id_to_descr[-1] = ValueArgDescriptor() + return self.copy(arg_id_to_descr=arg_id_to_descr) + + def generate_preambles(self, target): + if isinstance(self.name, _ArgExtremumReductionOperation): + op = self.name + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, index_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(index_t)s index1, + %(scalar_t)s op2, %(index_t)s index2, + %(index_t)s *index_out) + { + if (op2 %(comp)s op1) + { + *index_out = index2; + return op2; + } + else + { + *index_out = index1; + return op1; + } + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + elif isinstance(self.name, _SegmentedScalarReductionOperation): + op = self.name + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? op2 : %(combined)s; + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return + + +def reduction_scoper(target, identifier): + if isinstance(identifier, (_ArgExtremumReductionOperation, + _SegmentedScalarReductionOperation)): + return ReductionCallable(name=identifier) return None - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +# }}} # vim: fdm=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fc950c78e..6beadb3de 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -27,6 +27,7 @@ import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) +from functools import reduce import islpy as isl @@ -37,6 +38,10 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.symbolic import CombineMapper + +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + CallInstruction, _DataObliviousInstruction) import logging logger = logging.getLogger(__name__) @@ -2108,6 +2113,350 @@ def check_atomic_loads(kernel): # }}} +# {{{ arg_descr_inference + +class ArgDescrInferenceMapper(CombineMapper): + """ + Returns a set of instances of :class:`tuple` (expr, + in_kernel_callable). The mapped `in_kernel_callable` of the + :class:`InKernelCallable` are descriptor specialized for the given + arguments. + """ + + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + # FIXME logic duplication between map_call and map_call_with_kwargs + def map_call(self, expr, **kwargs): + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import SubArrayRef, ScopedFunction + + # ignoring if the call is not to a ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) + + # descriptors for the args + arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel)) + if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) + for i, par in enumerate(expr.parameters)) + + assignee_id_to_descr = {} + + # assignee descriptor + if 'assignees' in kwargs: + # If supplied with assignees then this is a CallInstruction + assignees = kwargs['assignees'] + assert isinstance(assignees, tuple) + for i, par in enumerate(assignees): + if isinstance(par, SubArrayRef): + assignee_id_to_descr[-i-1] = ( + par.get_array_arg_descriptor(self.kernel)) + else: + assignee_id_to_descr[-i-1] = ValueArgDescriptor() + + # gathering all the descriptors + # TODO: I dont like in place updates. Change this to somthing else. + # Perhaps make a function? + combined_arg_id_to_descr = arg_id_to_descr.copy() + combined_arg_id_to_descr.update(assignee_id_to_descr) + + # specializing the function according to the parameter description + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_descrs( + combined_arg_id_to_descr)) + + # collecting the descriptors for args, kwargs, assignees + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) + + def map_call_with_kwargs(self, expr, **kwargs): + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import SubArrayRef + + # descriptors for the args and kwargs: + arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel)) + if isinstance(par, SubArrayRef) else ValueArgDescriptor() + for i, par in tuple(enumerate(expr.parameters)) + + tuple(expr.kw_parameters.items())) + + assignee_id_to_descr = {} + + if 'assignees' in kwargs: + # If supplied with assignees then this is a CallInstruction + assignees = kwargs['assignees'] + assert isinstance(assignees, tuple) + for i, par in enumerate(assignees): + if isinstance(par, SubArrayRef): + assignee_id_to_descr[-i-1] = ( + par.get_array_arg_descriptor(self.kernel)) + else: + assignee_id_to_descr[-i-1] = ValueArgDescriptor() + + # gathering all the descriptors + # TODO: I dont like in place updates. Change this to somthing else. + # Perhaps make a function? + combined_arg_id_to_descr = arg_id_to_descr.copy() + combined_arg_id_to_descr.update(assignee_id_to_descr) + + # specializing the function according to the parameter description + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_descrs( + combined_arg_id_to_descr)) + + # collecting the descriptors for args, kwargs, assignees + return ( + frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) + + def map_constant(self, expr, **kwargs): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def infer_arg_descr(kernel): + """ + Returns a copy of *kernel* with the argument shapes and strides matching for + scoped functions in the *kernel*. Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. + """ + + arg_description_modifier = ArgDescrInferenceMapper(kernel) + pymbolic_calls_to_functions = set() + + for insn in kernel.instructions: + + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important in + # determining the arg_id_to_dtype + pymbolic_calls_to_functions.update( + arg_description_modifier(insn.expression, + assignees=insn.assignees)) + elif isinstance(insn, MultiAssignmentBase): + pymbolic_calls_to_functions.update(arg_description_modifier( + insn.expression)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + # making it the set of tuples a dict + pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions) + + # Now do the similar treatment as done for type inference. + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + + return register_pymbolic_calls_to_knl_callables(kernel, + pymbolic_calls_to_functions) + +# }}} + + +# {{{ + +class HWAxesInferenceMapper(CombineMapper): + """ + Returns a set of instances of :class:`tuple` (expr, + in_kernel_callable). The mapped `in_kernel_callable` of the + :class:`InKernelCallable` are specialized for the the grid sizes of + :attr:`kernel`. + """ + + def __init__(self, kernel): + self.kernel = kernel + self.local_size, self.global_size = kernel.get_grid_size_upper_bounds() + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr, **kwargs): + # ignoring if the call is not to a ScopedFunction + from loopy.symbolic import ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) + + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes( + self.local_size, self.global_size)) + + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) + + def map_call_with_kwargs(self, expr, **kwargs): + from loopy.symbolic import ScopedFunction + # ignoring if the call is not to a ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) + + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes( + self.local_size, self.global_size)) + + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values())))) + + def map_constant(self, expr, **kwargs): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def infer_hw_axes_sizes(kernel): + """ + Returns a copy of *kernel* with the hardware axes matching for + scoped functions in the *kernel*. Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_hw_axes_sizes`. + """ + hw_axes_modifier = HWAxesInferenceMapper(kernel) + pymbolic_calls_to_functions = set() + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + pymbolic_calls_to_functions.update(hw_axes_modifier( + insn.expression)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("unknown type of instruction %s." % + type(insn)) + + # making it the set of tuples a dict + pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions) + + # Now do the similar treatment as done for type inference. + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + + return register_pymbolic_calls_to_knl_callables(kernel, + pymbolic_calls_to_functions) + +# }}} + + +# {{{ catching functions that are not ready for codegen + +class FunctionsNotReadyForCodegenCollector(CombineMapper): + """ + Returns all instances of function calls in an expression which are + not ready for code generation. + """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + return all(values) + + # FIXME logic duplication between map_call and map_call_with_kwargs + def map_call(self, expr, *args, **kwargs): + from loopy.library.reduction import ArgExtOp, SegmentedOp + from pymbolic.primitives import Variable + from loopy.symbolic import ScopedFunction + + if isinstance(expr.function, (ArgExtOp, SegmentedOp)): + return self.combine( + tuple( + self.rec(child, *args, **kwargs) for child in + expr.parameters)) + elif isinstance(expr.function, Variable): + # UnScopedFunction obtained and hence clearly not ready for + # codegen. + return False + + elif isinstance(expr.function, ScopedFunction): + is_ready_for_codegen = self.kernel.scoped_functions[ + expr.function.name].is_ready_for_codegen() + return self.combine( + (is_ready_for_codegen,) + + tuple( + self.rec(child, *args, **kwargs) + for child in expr.parameters)) + else: + raise LoopyError("Unexpected function type %s obtained in %s" + % (type(expr.function), expr)) + + def map_call_with_kwargs(self, expr, *args, **kwargs): + is_ready_for_codegen = self.kernel.scoped_functions[ + expr.function.name].is_ready_for_codegen() + return self.combine( + (is_ready_for_codegen,) + + tuple( + self.rec(child, *args, **kwargs) + for child in expr.parameters) + + tuple( + self.rec(child, *args, **kwargs) + for child in expr.kw_parameters.values()) + ) + + def map_constant(self, expr): + return True + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def make_functions_ready_for_codegen(kernel): + """ + Specializes the functions in the kernel that are missed during type + inference. + + .. code:: python + + knl = lp.make_kernel( + "{[i]: 0<=i<16}", + "a[i] = sin(b[i])", + [lp.ArrayArg('a', dtype=np.float64), + lp.ArrayArg('b', dtype=np.float64)]) + + In the above case, none of the instructions undergo type-specialization, as + all the arguments' types have been realized. But, this would be a problem + during the code generation phase as ``sin`` did not undergo type + specialization, and hence must be fixed through this function. + """ + from loopy.type_inference import TypeInferenceMapper + from loopy.symbolic import SubstitutionRuleExpander + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + + unready_functions_collector = FunctionsNotReadyForCodegenCollector(kernel) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + type_inf_mapper = TypeInferenceMapper(kernel) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + expr = subst_expander(insn.expression) + if not unready_functions_collector(expr): + # Infer the type of the functions that are not type specialized. + type_inf_mapper(expr, return_tuple=isinstance(insn, + CallInstruction), return_dtype_set=True) + + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + + else: + NotImplementedError("Unknown Instruction") + + return register_pymbolic_calls_to_knl_callables(kernel, + type_inf_mapper.specialized_functions) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2188,6 +2537,16 @@ def preprocess_kernel(kernel, device=None): kernel = find_temporary_address_space(kernel) + # inferring the shape and dim_tags of the arguments involved in a function + # call. + kernel = infer_arg_descr(kernel) + + # type specialize functions that were missed during the type inference. + kernel = make_functions_ready_for_codegen(kernel) + + # tuning the functions in the kernel to align with the grid sizes. + kernel = infer_hw_axes_sizes(kernel) + # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) diff --git a/loopy/statistics.py b/loopy/statistics.py index cee28b24f..6c012ca21 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -712,9 +712,16 @@ class ExpressionOpCounter(CounterBase): map_variable = map_constant def map_call(self, expr): + from loopy.symbolic import ScopedFunction + if isinstance(expr.function, ScopedFunction): + function_identifier = self.knl.scoped_functions[ + expr.function.name].name + else: + function_identifier = expr.function.name + return ToCountMap( {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function), + name='func:'+function_identifier, count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.parameters) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8927cd6fb..770e1128a 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -56,6 +56,7 @@ from pymbolic.mapper.constant_folder import \ ConstantFoldingMapper as ConstantFoldingMapperBase from pymbolic.parser import Parser as ParserBase +from loopy.diagnostic import LoopyError from loopy.diagnostic import ExpressionToAffineConversionError @@ -106,7 +107,10 @@ class IdentityMapperMixin(object): return expr def map_type_annotation(self, expr, *args): - return type(expr)(expr.type, self.rec(expr.child)) + return type(expr)(expr.type, self.rec(expr.child, *args)) + + def map_scoped_function(self, expr, *args): + return ScopedFunction(self.rec(expr.function, *args)) map_type_cast = map_type_annotation @@ -165,9 +169,16 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index + def map_scoped_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_scoped_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): @@ -232,13 +243,16 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_scoped_function(self, expr, prec): + return "ScopedFunction('%s')" % expr.name + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation) # noqa + or type(expr.function) != type(other.function) # noqa ): return [] @@ -274,6 +288,13 @@ class DependencyMapper(DependencyMapperBase): return self.combine( self.rec(child, *args) for child in expr.parameters) + def map_call_with_kwargs(self, expr, *args): + # Loopy does not have first-class functions. Do not descend + # into 'function' attribute of Call. + return self.combine( + self.rec(child, *args) for child in expr.parameters+tuple( + expr.kw_parameters.values())) + def map_reduction(self, expr): deps = self.rec(expr.expr) return deps - set(p.Variable(iname) for iname in expr.inames) @@ -289,6 +310,9 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) + def map_scoped_function(self, expr): + return self.rec(expr.function) + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -638,6 +662,51 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") + +class ScopedFunction(p.Expression): + """ + A function invocation whose definition is known in a :mod:`loopy` kernel. + Each instance of :class:`loopy.symbolic.ScopedFunction` in an expression + points to an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` through the + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer + :ref:`ref_scoped_function` for a slightly detailed explanation on scoped + functions. + + .. attribute:: function + + An instance of :class:`pymbolic.primitives.Variable`, + :class:`loopy.library.reduction.ArgExtOp` or + :class:`loopy.library.reduction.SegmentedOp`. + """ + init_arg_names = ("function", ) + + def __init__(self, function): + if isinstance(function, str): + function = p.Variable(function) + from loopy.library.reduction import ArgExtOp, SegmentedOp + assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp)) + self.function = function + + @property + def name(self): + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(self.function, p.Variable): + return self.function.name + elif isinstance(self.function, (ArgExtOp, SegmentedOp)): + return self.function + else: + raise LoopyError("Unexpected function type %s in ScopedFunction." % + type(self.function)) + + def __getinitargs__(self): + return (self.function, ) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_scoped_function") + # }}} @@ -650,9 +719,12 @@ def get_dependencies(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, p.Variable): + elif isinstance(expr, ScopedFunction): + return parse_tagged_name(expr.function) + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) @@ -1100,6 +1172,14 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: return IdentityMapper.map_call(self, expr) + def map_call_with_kwargs(self, expr): + for par in expr.kw_parameters.values(): + if not isinstance(par, SubArrayRef): + raise LoopyError("Keyword Arguments is only supported for" + " array arguments--use positional order to specify" + " the order of the arguments in the call.") + return IdentityMapper.map_call_with_kwargs(self, expr) + # {{{ customization to pymbolic parser diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a81354e2f..9733fa446 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -150,7 +150,12 @@ class ASTBuilderBase(object): # {{{ library - def function_manglers(self): + def function_scopers(self): + """ + Returns an instance of list of the functions of signature + ``(target, identifiers)`` returning either an instance of + :class:`InKernelCallable` if a match is found or *None*. + """ return [] def symbol_manglers(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 83efecf0e..eab1e6afc 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,7 +27,6 @@ THE SOFTWARE. import six import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -35,6 +34,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from pytools import memoize_method @@ -354,71 +354,105 @@ def c_symbol_mangler(kernel, name): # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. + """ - if name in ["abs", "min", "max"]: - name = "f" + name + def with_types(self, arg_id_to_dtype, kernel): + name = self.name - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): + if name in ["abs", "min", "max"]: + name = "f" + name - dtype = arg_dtypes[0].numpy_dtype + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: - if modify_name: - if dtype == np.float64: - pass # fabs - elif dtype == np.float32: - name = name + "f" # fabsf - elif dtype == np.float128: - name = name + "l" # fabsl - else: - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." % name) - return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - # binary functions - if (name in ["fmax", "fmin"] - and len(arg_dtypes) == 2): + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyTypeError("%s does not support complex numbers") + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == 'c': + raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - elif dtype.kind == "f": - if modify_name: + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified if dtype == np.float64: - pass # fmin + pass # fabs elif dtype == np.float32: - name = name + "f" # fminf + name = name + "f" # fabsf elif dtype == np.float128: - name = name + "l" # fminl + name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" - % (name, dtype)) + raise LoopyTypeError("%s does not support type %s" % (name, + dtype)) + + return self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + + # binary functions + if name in ["fmax", "fmin"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + elif dtype.kind == "f": + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + if dtype == np.float64: + pass # fmin + elif dtype == np.float32: + name = name + "f" # fminf + elif dtype == np.float128: + name = name + "l" # fminl + else: + raise LoopyTypeError("%s does not support type %s" + % (name, dtype)) + dtype = NumpyType(dtype) + return self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) + + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) +def scope_c_math_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function + represented by :arg:`identifier` is known in C, otherwise returns *None*. + """ + if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]: + return CMathCallable(name=identifier) return None # }}} @@ -427,12 +461,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CASTBuilder(ASTBuilderBase): # {{{ library - def function_manglers(self): - return ( - super(CASTBuilder, self).function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super(CASTBuilder, self).symbol_manglers() + [ @@ -445,6 +473,11 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) + def function_scopers(self): + return ( + super(CASTBuilder, self).function_scopers() + [ + scope_c_math_functions]) + # }}} # {{{ code generation @@ -846,82 +879,30 @@ class CASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None + ecm = codegen_state.expression_to_code_mapper + func_id = insn.expression.function.name + in_knl_callable = codegen_state.kernel.scoped_functions[func_id] - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. + if in_knl_callable.name_in_target == 'loopy_make_tuple': return self.emit_tuple_assignment(codegen_state, insn) - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes)) + in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) - var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) - - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast( - mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index dd2104d0c..ecb6ad7d9 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -41,7 +41,7 @@ from pymbolic import var from loopy.expression import dtype_to_type_context from loopy.type_inference import TypeInferenceMapper -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType @@ -383,19 +383,18 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function + from pymbolic.primitives import Subscript # {{{ implement indexof, indexof_vec - if identifier.name in ["indexof", "indexof_vec"]: + identifier_name = self.kernel.scoped_functions[expr.function.name].name + if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) + raise LoopyError("%s takes exactly one argument" % identifier_name) arg, = expr.parameters if not isinstance(arg, Subscript): raise LoopyError( - "argument to %s must be a subscript" % identifier.name) + "argument to %s must be a subscript" % identifier_name) ary = self.find_array(arg) @@ -407,11 +406,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.kernel.data import ImageArg if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) + raise LoopyError("%s does not support images" % identifier_name) - if identifier.name == "indexof": + if identifier_name == "indexof": return access_info.subscripts[0] - elif identifier.name == "indexof_vec": + elif identifier_name == "indexof_vec": from loopy.kernel.array import VectorArrayDimTag ivec = None for iaxis, dim_tag in enumerate(ary.dim_tags): @@ -430,56 +429,21 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. - processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + from loopy.kernel.function_interface import ManglerCallable + if isinstance(self.kernel.scoped_functions[expr.function.name], + ManglerCallable): + from loopy.codegen import SeenFunction + in_knl_callable = self.kernel.scoped_functions[expr.function.name] + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) + + return self.kernel.scoped_functions[expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target) # {{{ deal with complex-valued variables diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 673d3b284..b2e4118d2 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -34,6 +34,7 @@ from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import AddressSpace from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -111,29 +112,71 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper -def cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") +class CudaCallable(ScalarCallable): - if dtype.kind == "f": - name = "f" + name + def cuda_with_types(self, arg_id_to_dtype, kernel): - return dtype, name + name = self.name - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] + return self.copy(name_in_target=name, + arg_id_to_dtype={-1: NumpyType(scalar_dtype), + 0: dtype, 1: dtype}) + + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype) + + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + +def scope_cuda_functions(target, identifier): + if identifier in set(["dot"]) | set( + _CUDA_SPECIFIC_FUNCTIONS): + return CudaCallable(name=identifier) return None @@ -217,13 +260,12 @@ class CudaTarget(CTarget): # {{{ ast builder class CUDACASTBuilder(CASTBuilder): + # {{{ library - def function_manglers(self): - return ( - super(CUDACASTBuilder, self).function_manglers() + [ - cuda_function_mangler - ]) + def function_scopers(self): + return [scope_cuda_functions] + ( + super(CUDACASTBuilder, self).function_scopers()) # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 432c95ef3..de07adf97 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -31,11 +31,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler -from loopy.kernel.data import AddressSpace, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -166,59 +166,117 @@ VECTOR_LITERAL_FUNCS = dict( ) -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + :class:`loopy.target.c.CMathCallable`. + """ + + def with_types(self, arg_id_to_dtype, kernel): + name = self.name + + if name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if dtype.kind in ['u', 'i', 'f']: + if dtype.kind == 'f': + name = 'f'+name + dtype = NumpyType(dtype) + return self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) + else: + # Unsupported type. + raise LoopyError("%s function not supported for the types %s" % + (name, dtype)) + + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] + return self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), 0: dtype, 1: dtype}) + + if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype) + + if name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + + for id in arg_id_to_dtype: + if not -1 <= id < count: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(count): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in + range(count)) + updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( + NumpyType(dtype), count) + + return self.copy(name_in_target="(%s%d) " % (base_tp_name, count), + arg_id_to_dtype=updated_arg_id_to_dtype) + + # does not satisfy any of the conditions needed for specialization. + # hence just returning a copy of the callable. + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + +def scope_opencl_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function defined by + *identifier* is known in OpenCL. + """ + opencl_function_ids = set(["max", "min", "dot"]) | set( + _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - - if count != len(arg_dtypes): - return None - - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) + if identifier in opencl_function_ids: + return OpenCLCallable(name=identifier) return None @@ -365,13 +423,10 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_manglers(self): + def function_scopers(self): return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super(OpenCLCASTBuilder, self).function_manglers()) + [scope_opencl_functions] + super( + OpenCLCASTBuilder, self).function_scopers()) def symbol_manglers(self): return ( @@ -380,13 +435,10 @@ class OpenCLCASTBuilder(CASTBuilder): ]) def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator return ( super(OpenCLCASTBuilder, self).preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator, - ]) + opencl_preamble_generator]) # }}} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 73e8e0092..27c4f4ab4 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,12 +31,12 @@ from six.moves import range import numpy as np -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -199,37 +199,79 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +# {{{ pyopencl function scopers - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" +class PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered by + :class:`loopy.target.opencl.OpenCLCallable` + """ + def with_types(self, arg_id_to_dtype, kernel): + + name = self.name + + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." % name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = arg_id_to_dtype[0] + + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj", "abs"]: + if dtype.is_complex(): + # function parameters are complex. + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: dtype}) else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) - - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) + # function calls for floating parameters. + numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ('u', 'i'): + dtype = dtype.copy(numpy_dtype=np.float32) + if name == 'abs': + name = 'fabs' + return self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}) + + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + +def pyopencl_function_scoper(target, identifier): + if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"]: + return PyOpenCLCallable(name=identifier) return None +# }}} + # {{{ preamble generator @@ -739,19 +781,15 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler + def function_scopers(self): + from loopy.library.random123 import random123_function_scoper return ( - super(PyOpenCLCASTBuilder, self).function_manglers() + [ - pyopencl_function_mangler, - random123_function_mangler - ]) + [pyopencl_function_scoper, random123_function_scoper] + super( + PyOpenCLCASTBuilder, self).function_scopers()) def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index ce04986d3..2804b0fb9 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -82,47 +82,35 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.kernel.scoped_functions[expr.function.name].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + from loopy.kernel.function_interface import ManglerCallable + in_knl_callable = self.kernel.scoped_functions[expr.function.name] + if isinstance(in_knl_callable, ManglerCallable): + from loopy.codegen import SeenFunction + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) str_parameters = None + number_of_assignees = len([key for key in + in_knl_callable.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -189,11 +177,11 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_manglers(self): + def function_scopers(self): + from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_manglers() + [ - _numpy_single_arg_function_mangler, - ]) + super(PythonASTBuilderBase, self).function_scopers() + + [scope_c_math_functions]) def preamble_generators(self): return ( diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d4dcb3701..d0edcfd78 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -398,7 +398,14 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", # }}} - return diff_context.get_new_kernel(), result + # Differentiation lead to addition of new functions to the kernel. + # For example differentiating `sin(x)` -> `cos(x)`. Hence we would need to + # scope `cos(x)`. + from loopy.kernel.creation import scope_functions + differentiated_scoped_kernel = scope_functions( + diff_context.get_new_kernel()) + + return differentiated_scoped_kernel, result # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 010a0658f..a68520525 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -44,6 +44,19 @@ def _debug(kernel, s, *args): logger.debug("%s: %s" % (kernel.name, logstr)) +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :param arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. + """ + return_arg_id_to_dtype = dict((id, dtype) for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): @@ -60,6 +73,8 @@ class TypeInferenceMapper(CombineMapper): new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() + self.scoped_functions = kernel.scoped_functions + self.specialized_functions = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -250,15 +265,18 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable + from pymbolic.primitives import Variable, CallWithKwargs + from loopy.symbolic import ScopedFunction + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + kw_parameters = {} identifier = expr.function - if isinstance(identifier, Variable): + if isinstance(identifier, (Variable, ScopedFunction)): identifier = identifier.name - if identifier in ["indexof", "indexof_vec"]: - return [self.kernel.index_dtype] - def none_if_empty(d): if d: d, = d @@ -266,25 +284,121 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) - if None in arg_dtypes: - return [] + arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) + + # specializing the known function wrt type + if isinstance(expr.function, ScopedFunction): + in_knl_callable = self.scoped_functions[expr.function.name] + + # {{{ checking that there is no overwriting of types of in_knl_callable + + if in_knl_callable.arg_id_to_dtype is not None: + + # specializing an already specialized function. + for id, dtype in arg_id_to_dtype.items(): + if in_knl_callable.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + + # {{{ ignoring the the cases when there is a discrepancy + # between np.uint and np.int + + import numpy as np + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint32) and ( + arg_id_to_dtype[id].dtype.type == np.int32): + continue + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint64) and ( + arg_id_to_dtype[id].dtype.type == + np.int64): + continue + + # }}} + + raise LoopyError("Overwriting a specialized function " + "is illegal--maybe start with new instance of " + "InKernelCallable?") + + # }}} + + in_knl_callable = in_knl_callable.with_types( + arg_id_to_dtype, self.kernel) + + # storing the type specialized function so that it can be used for + # later use + self.specialized_functions[expr] = in_knl_callable.with_target( + self.kernel.target) + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if new_arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(new_arg_id_to_dtype)] + else: + return [new_arg_id_to_dtype[-1]] + + elif isinstance(expr.function, Variable): + # Since, the function is not "scoped", attempt to infer using + # kernel.function_manglers + + # {{{ trying to infer using function manglers + + arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in + expr.parameters) + + # finding the function_mangler which would be associated with the + # realized function. + mangle_result = None + for function_mangler in self.kernel.function_manglers: + mangle_result = function_mangler(self.kernel, identifier, + arg_dtypes) + if mangle_result: + # found a match. + break - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] - else: if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct assignments") + from loopy.kernel.function_interface import (ManglerCallable, + ValueArgDescriptor) + + # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes + arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) + for i, dt in enumerate(mangle_result.arg_dtypes)) + arg_id_to_dtype.update(dict((-i-1, + dtype.with_target(self.kernel.target)) for i, dtype in enumerate( + mangle_result.result_dtypes))) + arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.arg_dtypes)) + res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.result_dtypes)) + arg_id_to_descr = dict(arg_descrs+res_descrs) + + # creating the ManglerCallable object corresponding to the + # function. + self.specialized_functions[expr] = ManglerCallable( + identifier, function_mangler, arg_id_to_dtype, + arg_id_to_descr, mangle_result.target_name) + + # Returning the type. + if return_tuple: + if mangle_result is not None: + return [mangle_result.result_dtypes] + else: + if mangle_result is not None: + if len(mangle_result.result_dtypes) != 1 and not return_tuple: + raise LoopyError("functions with more or fewer than one " + "return value may only be used in direct " + "assignments") + + return [mangle_result.result_dtypes[0]] + # }}} - return [mangle_result.result_dtypes[0]] + return [] - raise RuntimeError("unable to resolve " - "function '%s' with %d given arguments" - % (identifier, len(arg_dtypes))) + map_call_with_kwargs = map_call def map_variable(self, expr): if expr.name in self.kernel.all_inames(): @@ -406,7 +520,7 @@ class TypeInferenceMapper(CombineMapper): def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): - return [kernel.index_dtype], [] + return [kernel.index_dtype], [], {} from functools import partial debug = partial(_debug, kernel) @@ -451,11 +565,12 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types + return None, type_inf_mapper.symbols_with_unknown_types, None result = type_inf_mapper.combine(dtype_sets) - return result, type_inf_mapper.symbols_with_unknown_types + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.specialized_functions) # }}} @@ -553,6 +668,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + specialized_functions = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -576,7 +693,7 @@ def infer_unknown_types(kernel, expect_completion=False): debug("inferring type for %s %s", type(item).__name__, item.name) - result, symbols_with_unavailable_types = ( + result, symbols_with_unavailable_types, new_specialized_functions = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) @@ -597,6 +714,10 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + # TODO: I dont like in-place updates. Change this to something + # else. Perhaps add a function for doing this, which does it + # using a bunch of copies? + specialized_functions.update(new_specialized_functions) else: debug(" failure") @@ -639,11 +760,23 @@ def infer_unknown_types(kernel, expect_completion=False): logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) - return unexpanded_kernel.copy( + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + type_specialized_kernel = register_pymbolic_calls_to_knl_callables( + pre_type_specialized_knl, specialized_functions) + if expect_completion: + # if completion is expected, then it is important that all the + # callables are scoped. + from loopy.check import check_functions_are_scoped + check_functions_are_scoped(type_specialized_kernel) + + return type_specialized_kernel + # }}} diff --git a/test/testlib.py b/test/testlib.py index ad290ee7c..a22988ec8 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -132,4 +133,43 @@ class SeparateTemporariesPreambleTestPreambleGenerator( # }}} + +# {{{ test_register_function_lookup + +class Log2Callable(lp.ScalarCallable): + + def with_types(self, arg_id_to_dtype, kernel): + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = arg_id_to_dtype[0].numpy_dtype + + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + + from loopy.target.opencl import OpenCLTarget + name_in_target = "log2" + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified + if dtype == np.float32: + name_in_target = "log2f" + elif dtype == np.float128: + name_in_target = "log2l" + + from loopy.types import NumpyType + return self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + + +def register_log2_lookup(target, identifier): + if identifier == 'log2': + return Log2Callable(name='log2') + return None + +# }}} + # vim: foldmethod=marker -- GitLab