From 21c0b651b704b5e03ab16ba78c5a1824773818e8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 13 Mar 2018 14:18:28 -0500 Subject: [PATCH 001/916] Added support for ScopedFunctions --- loopy/kernel/__init__.py | 11 +++++- loopy/kernel/creation.py | 77 +++++++++++++++++++++++++++++++++++++- loopy/library/function.py | 7 +++- loopy/library/random123.py | 50 +++---------------------- loopy/library/reduction.py | 7 ++++ loopy/symbolic.py | 24 ++++++++++++ loopy/target/__init__.py | 3 ++ loopy/target/c/__init__.py | 10 +++++ loopy/target/opencl.py | 34 +++++++++++------ loopy/target/pyopencl.py | 10 +++++ 10 files changed, 175 insertions(+), 58 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 32b233900..367214148 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -37,7 +37,8 @@ from pytools import UniqueNameGenerator, generate_unique_names from loopy.library.function import ( default_function_mangler, - single_arg_function_mangler) + single_arg_function_mangler, + default_function_identifiers) from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted @@ -143,6 +144,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): to instances of :class:`loopy.kernel.data.IndexTag`. .. attribute:: function_manglers + .. attribute:: function_identifiers .. attribute:: symbol_manglers .. attribute:: substitutions @@ -200,6 +202,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): default_function_mangler, single_arg_function_mangler, ], + function_identifiers=set(), symbol_manglers=[], iname_slab_increments={}, @@ -265,6 +268,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT + # Populating the function identifiers based on the target and the default + # function identifiers + function_identifiers = (default_function_identifiers() | + target.get_device_ast_builder().function_identifiers()) + ImmutableRecordWithoutPickling.__init__(self, domains=domains, instructions=instructions, @@ -284,6 +292,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, function_manglers=function_manglers, + function_identifiers=function_identifiers, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 0daf327f4..ee17bd1a7 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -27,7 +27,7 @@ THE SOFTWARE. import numpy as np -from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.mapper import CSECachingMapperMixin, Collector from loopy.tools import intern_frozenset_of_ids from loopy.symbolic import IdentityMapper, WalkMapper from loopy.kernel.data import ( @@ -1829,6 +1829,76 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # }}} +# {{{ lookup functions + + +class FunctionScoper(IdentityMapper): + def __init__(self, function_ids): + self.function_ids = function_ids + + def map_call(self, expr): + if expr.function.name in self.function_ids: + # 1. 
need to change the function to ScopedFunction instead of Variable + from pymbolic.primitives import Call + from loopy.symbolic import ScopedFunction + + return super(FunctionScoper, self).map_call( + Call(function=ScopedFunction(expr.function.name), + parameters=expr.parameters)) + + else: + return super(FunctionScoper, self).map_call(expr) + + def map_call_with_kwargs(self, expr): + if expr.function.name in self.function_ids: + from pymbolic.primitives import CallWithKwargs + from loopy.symbolic import ScopedFunction + return super(FunctionScoper, self).map_call_with_kwargs( + CallWithKwargs(function=ScopedFunction(expr.function.name), + parameters=expr.parameters, + kw_parameters=expr.kw_parameters)) + else: + return super(FunctionScoper, self).map_call_with_kwargs(expr) + + +class ScopedFunctionCollector(Collector): + + def map_scoped_function(self, expr): + return set([expr.name]) + + +def scope_functions(kernel): + func_ids = kernel.function_identifiers.copy() + + from loopy.kernel.instruction import CInstruction, _DataObliviousInstruction + function_scoper = FunctionScoper(func_ids) + scoped_function_collector = ScopedFunctionCollector() + scoped_functions = set() + + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, (MultiAssignmentBase, CInstruction)): + new_insn = insn.copy(expression=function_scoper(insn.expression)) + scoped_functions.update(scoped_function_collector(new_insn.expression)) + new_insns.append(new_insn) + elif isinstance(insn, _DataObliviousInstruction): + new_insns.append(insn) + else: + raise NotImplementedError("scope_function not implemented for %s" % + type(insn)) + + # Need to combine the scoped functions into a dict + """ + from loopy.function_interface import InKernelCallable + scoped_function_dict = ((func, InKernelCallable(func)) for func in + scoped_functions) + """ + return kernel.copy(instructions=new_insns) + +# }}} + + # {{{ kernel creation top-level def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): @@ -2163,6 +2233,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + # Function Lookup + # TODO: here I add my function for function_lookup. Lol. realize the UN-inteded + # pun + knl = scope_functions(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) diff --git a/loopy/library/function.py b/loopy/library/function.py index 9d557ac9f..e8e1e22fa 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -23,7 +23,13 @@ THE SOFTWARE. 
""" +def default_function_identifiers(): + from loopy.library.reduction import reduction_function_identifiers + return set("make_tuple") | reduction_function_identifiers() + + def default_function_mangler(kernel, name, arg_dtypes): + from loopy.library.reduction import reduction_function_mangler manglers = [reduction_function_mangler, tuple_function_mangler] @@ -55,5 +61,4 @@ def tuple_function_mangler(kernel, name, arg_dtypes): return None - # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index b8633114d..82e44b2d1 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -62,12 +62,8 @@ RNG_VARIANTS = [ _threefry_base_info.copy(width=4, bits=64), ] -FUNC_NAMES_TO_RNG = dict( - (v.full_name + suffix, v) - for v in RNG_VARIANTS - for suffix in [ - "", "_f32", "_f64", - ]) +FUNC_NAMES_TO_RNG = set(v.full_name + suffix for v in RNG_VARIANTS for suffix in + ["", "_f32", "_f64", ]) # }}} @@ -180,43 +176,9 @@ def random123_preamble_generator(preamble_info): )) -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - except KeyError: - return None - - from loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None +def random123_function_identifiers(): + return FUNC_NAMES_TO_RNG + +# Removed the random123_function_mangler # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 0e5a093b7..5daa1528a 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -422,6 +422,13 @@ def parse_reduction_op(name): # }}} +def reduction_function_identifiers(): + """ Return a :class:`set` of the type of the reduction identifiers that can be + encountered in a kernel. 
+ """ + return set(op for op in _REDUCTION_OPS) + + def reduction_function_mangler(kernel, func_id, arg_dtypes): if isinstance(func_id, ArgExtOp): from loopy.target.opencl import CTarget diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 0cc8f4ba6..16c9fd482 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -112,6 +112,8 @@ class IdentityMapperMixin(object): map_rule_argument = map_group_hw_index + map_scoped_function = IdentityMapperBase.map_variable + class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): pass @@ -125,6 +127,8 @@ class PartialEvaluationMapper( def map_common_subexpression_uncached(self, expr): return type(expr)(self.rec(expr.child), expr.prefix, expr.scope) + map_scoped_function = map_variable + class WalkMapper(WalkMapperBase): def map_literal(self, expr, *args): @@ -163,6 +167,8 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index + map_scoped_function = WalkMapperBase.map_variable + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant @@ -174,6 +180,8 @@ class CombineMapper(CombineMapperBase): map_linear_subscript = CombineMapperBase.map_subscript + map_scoped_function = CombineMapperBase.map_variable + class SubstitutionMapper( CSECachingMapperMixin, SubstitutionMapperBase, IdentityMapperMixin): @@ -230,6 +238,9 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_scoped_function(self, expr, prec): + return "ScopedFunction('%s')" % expr.name + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): @@ -287,6 +298,8 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) + map_scoped_function = DependencyMapperBase.map_variable + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -322,6 +335,8 @@ class SubstitutionRuleExpander(IdentityMapper): return self.rec(expr) + map_scoped_function = map_variable + # }}} @@ -636,6 +651,15 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") + +class ScopedFunction(p.Variable): + """ Connects a call to a callable available in a kernel. + """ + mapper_method = intern("map_scoped_function") + + def stringifier(self): + return StringifyMapper + # }}} diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a08b406f5..fe6daf12c 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -150,6 +150,9 @@ class ASTBuilderBase(object): # {{{ library + def function_identifiers(self): + return set() + def function_manglers(self): return [] diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 8e69793e8..2b5e394bb 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -356,6 +356,11 @@ def c_symbol_mangler(kernel, name): # {{{ function mangler +def c_math_identifiers(): + return set(["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "tanh", + "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]) + + def c_math_mangler(target, name, arg_dtypes, modify_name=True): # Function mangler for math functions defined in C standard # Convert abs, min, max to fabs, fmin, fmax. 
@@ -427,6 +432,11 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CASTBuilder(ASTBuilderBase): # {{{ library + def function_identifiers(self): + return ( + super(CASTBuilder, self).function_identifiers() | + c_math_identifiers()) + def function_manglers(self): return ( super(CASTBuilder, self).function_manglers() + [ diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 31e0569b9..94870907b 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -31,11 +31,10 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler +from loopy.target.c import DTypeRegistryWrapper, c_math_identifiers from loopy.kernel.data import temp_var_scope, CallMangleInfo from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -139,8 +138,27 @@ def _register_vector_types(dtype_registry): # }}} +# {{{ function identifiers + +_CL_SIMPLE_MULTI_ARG_FUNC_IDS = set(["clamp", "atan2"]) + + +VECTOR_LITERAL_FUNC_IDS = set("make_%s%d" % (name, count) + for name in ['char', 'uchar', 'short', 'ushort', 'int', 'uint', 'long', + 'ulong', 'float', 'double'] + for count in [2, 3, 4, 8, 16] + ) + + +def opencl_function_identifiers(): + return set(["max", "min", "dot"]) | (_CL_SIMPLE_MULTI_ARG_FUNC_IDS | + VECTOR_LITERAL_FUNC_IDS) + +# }}} + # {{{ function mangler + _CL_SIMPLE_MULTI_ARG_FUNCTIONS = { "clamp": 3, "atan2": 2, @@ -356,8 +374,6 @@ class OpenCLTarget(CTarget): vec.types[base.numpy_dtype, count], target=self) - # }}} - # }}} @@ -366,13 +382,9 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_manglers(self): - return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super(OpenCLCASTBuilder, self).function_manglers()) + def function_identifiers(self): + return (opencl_function_identifiers() | c_math_identifiers() | + super(OpenCLCASTBuilder, self).function_identifiers()) def symbol_manglers(self): return ( diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 744c03d8e..1451cf9e7 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -199,6 +199,11 @@ def check_sizes(kernel, device): # }}} +def pyopencl_function_identifiers(): + return set(["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", "tanh", + "conj", "real", "imag", "abs"]) + + def pyopencl_function_mangler(target, name, arg_dtypes): if len(arg_dtypes) == 1 and isinstance(name, str): arg_dtype, = arg_dtypes @@ -739,6 +744,11 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library + def function_identifiers(self): + from loopy.library.random123 import random123_function_identifiers + return (super(PyOpenCLCASTBuilder, self).function_identifiers() | + pyopencl_function_identifiers() | random123_function_identifiers()) + def function_manglers(self): from loopy.library.random123 import random123_function_mangler return ( -- GitLab From 47a73915d0b2b194a9c518fc9b159e69890dc07d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 13 Mar 2018 14:54:57 -0500 Subject: [PATCH 002/916] Added support for scoping functions at creation time. 
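
This patch records an InKernelCallable for every function identifier that
gets scoped at kernel creation time, in a new LoopKernel.scoped_functions
dict, and adds the loopy/kernel/function_interface.py module that defines
those callables.

Rough usage sketch (illustrative only; it assumes the in-progress API
introduced by patches 001 and 002, and "sin" is just an example identifier
known to the C-family targets):

    import loopy as lp

    knl = lp.make_kernel(
        "{[i]: 0 <= i < 16}",
        "out[i] = sin(x[i])")

    # make_kernel() now runs scope_functions(), so the parsed expression
    # contains ScopedFunction("sin") rather than a plain Variable, and the
    # kernel carries a matching callable:
    #   knl.scoped_functions["sin"]  ->  InKernelCallable(name="sin")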
--- loopy/kernel/__init__.py | 2 + loopy/kernel/creation.py | 9 +- loopy/kernel/function_interface.py | 505 +++++++++++++++++++++++++++++ 3 files changed, 511 insertions(+), 5 deletions(-) create mode 100644 loopy/kernel/function_interface.py diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 367214148..d33053dea 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -203,6 +203,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): single_arg_function_mangler, ], function_identifiers=set(), + scoped_functions={}, symbol_manglers=[], iname_slab_increments={}, @@ -293,6 +294,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): applied_iname_rewrites=applied_iname_rewrites, function_manglers=function_manglers, function_identifiers=function_identifiers, + scoped_functions=scoped_functions, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index ee17bd1a7..09b0ac180 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1889,12 +1889,11 @@ def scope_functions(kernel): type(insn)) # Need to combine the scoped functions into a dict - """ - from loopy.function_interface import InKernelCallable - scoped_function_dict = ((func, InKernelCallable(func)) for func in + from loopy.kernel.function_interface import InKernelCallable + scoped_function_dict = dict((func, InKernelCallable(func)) for func in scoped_functions) - """ - return kernel.copy(instructions=new_insns) + + return kernel.copy(instructions=new_insns, scoped_functions=scoped_function_dict) # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 000000000..d88841df7 --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,505 @@ +from __future__ import division, absolute_import + +import numpy as np + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from loopy.types import NumpyType + + +# {{{ argument descriptors + +class ArgDescriptor(ImmutableRecord): + """Base type of argument description about the variable type that is supposed to + be encountered in a function signature. + .. attribute:: mem_scope + .. attribute:: shape + .. attribute:: dim_tags + """ + + def __init__(self, + mem_scope=None, + shape=None, + dim_tags=None): + super(ArgDescriptor).__init__(self, + mem_scope=mem_scope, + shape=shape, + dim_tags=dim_tags) + + +class ValueArgDescriptor(ArgDescriptor): + """ + """ + def __init__(self): + super(ValueArgDescriptor, self).__init__(self) + + +class ArrayArgDescriptor(ArgDescriptor): + """ + .. attribute:: mem_scope + .. attribute:: dim_tags + """ + + def __init__(self, + mem_scope=None, + dim_tags=None): + super(ArgDescriptor, self).__init__(self, + mem_scope=mem_scope, + dim_tags=dim_tags) + + def copy(self, dtype=None, mem_scope=None, shape=None, dim_tags=None): + if dtype is None: + dtype = self.dtype + + if mem_scope is None: + mem_scope = self.mem_scope + + if dim_tags is None: + dim_tags = self.dim_tags + + return ArrayArgDescriptor( + mem_scope=mem_scope, + dim_tags=dim_tags) + + +# }}} + + +# {{{ in kernel callable + +class InKernelCallable(ImmutableRecord): + """ + + .. attribute:: name + + The name of the callable which can be encountered within a kernel. + + .. 
note:: + + Negative ids in the mapping attributes indicate the result arguments + + """ + + def __init__(self, name=None): + + # {{{ sanity checks + + if not isinstance(name, str): + raise LoopyError("name of a InKernelCallable should be a string") + + # }}} + + self.name = name + + super(InKernelCallable, self).__init__(name=name) + + def copy(self, name=None): + if name is None: + name = self.name + + return InKernelCallable(name=name) + + def with_types(self, arg_id_to_dtype): + """ + :arg arg_id_to_type: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.types.LoopyType` instances. + Unspecified/unknown types are not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types, + and *arg_id_to_type* is a mapping of the same form as the + argument above, however it may have more information present. + Any argument information exists both by its positional and + its keyword identifier. + """ + + raise NotImplementedError() + + def with_descrs(self, arg_id_to_descr): + """ + :arg arg_id_to_descr: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.ArrayArgDescriptor` instances. + Unspecified/unknown types are not represented in *arg_id_to_descr*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types, + and *arg_id_to_descr* is a mapping of the same form as the + argument above, however it may have more information present. + Any argument information exists both by its positional and + its keyword identifier. + """ + + raise NotImplementedError() + + def with_iname_tag_usage(self, unusable, concurrent_shape): + """ + :arg unusable: a set of iname tags that may not be used in the callee. + :arg concurrent_shape: an list of tuples ``(iname_tag, bound)`` for + concurrent inames that are used in the calller but also available + for mapping by the callee. *bound* is given as a + :class:`islpy.PwAff`. + + :returns: a list of the same type as *concurrent*, potentially modified + by increasing bounds or adding further iname tag entries. + + All iname tags not explicitly listed in *concurrent* or *unusable* are + available for mapping by the callee. + """ + + raise NotImplementedError() + + def is_arg_written(self, arg_id): + """ + :arg arg_id: (keyword) name or position + """ + + raise NotImplementedError() + + def is_ready_for_code_gen(self): + + raise NotImplementedError() + + # {{{ code generation + + def generate_preambles(self, target): + """ This would generate the target specific preamble. 
+ """ + raise NotImplementedError() + + def get_target_specific_name(self, target): + + raise NotImplementedError() + + def emit_call(self, target): + + raise NotImplementedError() + + # }}} + + def __eq__(self, other): + return (self.name == other.name + and self.arg_id_to_descr == other.arg_id_to_descr + and self.arg_id_to_keyword == other.arg_id_to_keyword) + + def __hash__(self): + return hash((self.name, )) + +# }}} + + +# {{{ generic callable class + + +class CommonReturnTypeCallable(InKernelCallable): + """ A class of generic functions which have the following properties: + - Single return value + - Return type of the callable is a common dtype to all the input arguments + to the callable + + .. attribute:: name + + The name of the function as would be encountered in loopy. + + ..attribute:: specialized_dtype + + The dtype for which the function has been setup to generate code and + premables. For example, the function `sin` can be specialized to either one + of the following `float sin(float x)` or `double sin(double x)`. This is not + usually expected to be an input as this removed the generality of the + callable. + + ..attribute:: kinds_allowed + + The extent upto which the function can be generalized upto. For example + `sin(x)` cannot have complex types as its specialized type. + + ..attribute:: arity + + The number of inputs that are to be given to the function + + """ + + def __init__(self, name=None, specialized_dtype=None, kinds_allowed=None, + arity=None): + + super(CommonReturnTypeCallable, self).__init__(name=name) + + self.specialized_dtype = specialized_dtype + self.kinds_allowed = kinds_allowed + self.arity = arity + + def copy(self, specialized_dtype=None): + if specialized_dtype is None: + specialized_dtype = self.specialized_dtype + + return type(self)(self.name, specialized_dtype, + self.kinds_allowed, self.arity) + + def with_types(self, arg_id_to_dtype): + + specialized_dtype = np.find_common_type([], [dtype.numpy_dtype + for id, dtype in arg_id_to_dtype.items() if id >= 0]) + + if self.specialized_dtype is not None and (specialized_dtype != + self.specialized_dtype): + from loopy.warnings import warn + warn("Trying to change the type of the already set function." + "-- maybe use a different class instance?") + + new_arg_id_to_dtype = arg_id_to_dtype.copy() + # checking the compliance of the arg_id_to_dtype + + if -1 not in arg_id_to_dtype: + # return type was not know earlier, now setting it to the common type + new_arg_id_to_dtype[-1] = NumpyType(specialized_dtype) + + if self.arity+1 == len(new_arg_id_to_dtype) and (specialized_dtype.kind in + self.kinds_allowed): + # the function signature matched with the current instance. + # returning the function and the new_arg_id_to_dtype + for i in range(self.arity): + new_arg_id_to_dtype[i] = NumpyType(specialized_dtype) + + return (self.copy(specialized_dtype=specialized_dtype), + new_arg_id_to_dtype) + + return None + + def is_ready_for_code_gen(self): + return self.specilized_dtype is not None + + def get_target_specific_name(self, target): + raise NotImplementedError() + + def get_preamble(self, target): + raise NotImplementedError() + +# }}} + +# {{{ specific type callable class + + +class SpecificReturnTypeCallable(InKernelCallable): + """ A super class for the funcitons which cannot be listed as generic + functions. These types of Callables support explicity mentioning of the + arguments and result dtypes. + + .. attribute:: name + + The name of the function as would be encountered in loopy. + + .. 
attribute:: arg_id_to_dtype + + The dtype pattern of the arguments which is supposed to be used for checking + the applicability of this function in a given scenario. + """ + + def __init__(self, name=None, arg_id_to_dtype=None): + + super(SpecificReturnTypeCallable, self).__init__(name=name) + + if arg_id_to_dtype is None: + LoopyError("The function signature is incomplete without the" + "`arg_id_to_dtype`") + self.arg_id_to_dtype = arg_id_to_dtype + + def with_types(self, arg_id_to_dtype): + + # Checking the number of inputs + if len([id for id in arg_id_to_dtype if id >= 0]) != len( + [id for id in self.arg_id_to_dtype if id >= 0]): + # the number of input arguments do not match + return None + + # Checking the input dtypes + for id, dtype in arg_id_to_dtype.items(): + if id in self.arg_id_to_dtype and self.arg_id_to_dtype[id] == dtype: + # dtype matched with the one given in the input + pass + else: + # did not match with the function signature and hence returning + # None + return None + + # Setting the output if not present + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for id, dtype in self.arg_id_to_dtype: + if id < 0: + # outputs + if id in new_arg_id_to_dtype and new_arg_id_to_dtype[id] != dtype: + # the output dtype had been supplied but did not match with the + # one in the function signature + return None + + new_arg_id_to_dtype[id] = dtype + + # Finally returning the types + return self.copy(), new_arg_id_to_dtype + + def is_ready_for_code_gen(self): + # everything about the function is determined at the constructor itself, + # hence always redy for codegen + return True + + def get_target_specific_name(self, target): + # defaults to the name of the function in Loopy. May change this specific to + # a target by inheriting this class and overriding this function. + return self.name + + def get_preamble(self, target): + return "" + +# }}} + +# {{{ callable kernel + + +class CallableKernel(InKernelCallable): + """ + + ..attribute:: name + + This would be the name by which the function would be called in the loopy + kernel. + + .. attribute:: subkernel + + The subkernel associated with the call. + + """ + + # {{{ constructor + + def __init__(self, name=None, subkernel=None): + + super(CallableKernel, self).__init__(name=name) + + if not name == subkernel.name: + subkernel = subkernel.copy(name=name) + + self.subkernel = subkernel + + # }}} + + # {{{ copy + + def copy(self, name=None, subkernel=None): + if name is None: + name = self.name + + if subkernel is None: + subkernel = self.subkernel + + return self.__class__(name=name, + subkernel=subkernel) + + # }}} + + # {{{ with_types + + def with_types(self, arg_id_to_dtype): + + # {{{ sanity checks for arg_id_to_dtype + + for id in arg_id_to_dtype: + if not isinstance(id, str): + raise LoopyError("For Callable kernels the input should be all given" + "as KWargs") + + # }}} + + # Checking the input dtypes + for id, arg in self.subkernel.arg_dict.items(): + if id in self.subkernel.read_varibles(): + + # because we need the type of the parameters from the main kernel. It + # is necessary that we know the types from there. Hence asserting + # this condition + assert id in arg_id_to_dtype + + new_arg_dict = {} + for id, dtype in arg_id_to_dtype.items(): + # Making the type of the new arg according to the arg which has been + # called in the function. 
+ new_arg_dict[id] = self.subkernel.arg_dict[id].copy(dtype=dtype) + + # Merging the 2 dictionaries so that to even incorporate the variables that + # were not mentioned in arg_id_to_dtype. + new_arg_dict = {**self.subkernel.arg_dict, **new_arg_dict} + + # Preprocessing the kernel so that we can get the types of the other + # variables that are involved in the args + from loopy.type_inference import infer_unknown_types + pre_specialized_subkernel = self.subkernel.copy( + args=list(new_arg_dict.values)) + + # inferring the types of the written variables based on the knowledge of the + # types of the arguments supplied + specialized_kernel = infer_unknown_types(pre_specialized_subkernel, + expect_completion=True) + + new_arg_id_to_dtype = {} + for id, arg in specialized_kernel.arg_dict: + new_arg_id_to_dtype[id] = arg.dtype + + # Returning the kernel call with specialized subkernel and the corresponding + # new arg_id_to_dtype + return self.copy(subkernel=specialized_kernel), specialized_kernel.arg_dict + + # }}} + + # {{{ with_descriptors + + def with_descriptors(self, arg_id_to_descr): + for id, arg_descr in arg_id_to_descr.items(): + # The dimensions don't match => reject it + if len(arg_descr.dim_tags) != len(self.subkernel.arg_dict[id].shape): + raise LoopyError("The number of dimensions do not match between the" + "caller kernel and callee kernel for the variable name %s in" + "the callee kernel" % id) + + new_args = [] + for arg in self.subkernel.args: + if arg.name in arg_id_to_descr: + new_args.copy(arg.copy(dim_tags=arg_id_to_descr[arg.name])) + pass + else: + new_args.append(arg.copy()) + + specialized_kernel = self.subkernel.copy(args=new_args) + + new_arg_id_to_descr = {} + + for id, arg in specialized_kernel.arg_dict.items(): + new_arg_id_to_descr[id] = ArrayArgDescriptor(arg.dim_tags, "GLOBAL") + + return self.copy(subkernel=specialized_kernel), new_arg_id_to_descr + + # }}} + + # {{{ get_target_specific_name + + def get_target_specific_name(self, target): + return self.subkernel.name + + # }}} + + # {{{ get preamble + + def get_preamble(self, target): + return "" + + # }}} + +# }}} + +# vim: foldmethod=marker -- GitLab From 0a7c42630de2ddf029e0caad347cf7b00311f76c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 13 Mar 2018 17:15:06 -0500 Subject: [PATCH 003/916] Checked that the functions are scoped. --- loopy/preprocess.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 5e36e51a1..30ce5b8ab 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -37,6 +37,8 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from pymbolic.primitives import Variable +from pymbolic.mapper import Collector import logging logger = logging.getLogger(__name__) @@ -2097,6 +2099,29 @@ def check_atomic_loads(kernel): # }}} +# {{{ check for unscoped calls + +class UnScopedCallCollector(Collector): + def map_call(self, expr): + if isinstance(expr.function, Variable): + return set([expr.function.name]) + else: + return set() + + +def check_function_are_scoped(kernel): + """ Checks if all the calls in the instruction expression have been scoped, + otherwise indicate to what all calls we await signature. 
+ """ + for insn in kernel.instructions: + unscoped_calls = UnScopedCallCollector()(insn.expression) + if unscoped_calls: + raise LoopyError("Unknown function obtained %s -- register a function" + " or a kernel corresponding to it." % unscoped_calls[0]) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2146,6 +2171,10 @@ def preprocess_kernel(kernel, device=None): from loopy.transform.subst import expand_subst kernel = expand_subst(kernel) + # Checking if all the functions being used in the kernel and scoped to a + # finite namespace + check_function_are_scoped(kernel) + # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. -- GitLab From 447680ed76436fde746864acd4694ac131991696 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 13 Mar 2018 17:35:36 -0500 Subject: [PATCH 004/916] Finished scoping of the function. --- loopy/preprocess.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 30ce5b8ab..b3e2496ad 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -37,7 +37,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from pymbolic.primitives import Variable +from loopy.symbolic import ScopedFunction from pymbolic.mapper import Collector import logging @@ -2103,21 +2103,21 @@ def check_atomic_loads(kernel): class UnScopedCallCollector(Collector): def map_call(self, expr): - if isinstance(expr.function, Variable): + if not isinstance(expr.function, ScopedFunction): return set([expr.function.name]) else: return set() -def check_function_are_scoped(kernel): +def check_functions_are_scoped(kernel): """ Checks if all the calls in the instruction expression have been scoped, otherwise indicate to what all calls we await signature. """ for insn in kernel.instructions: unscoped_calls = UnScopedCallCollector()(insn.expression) if unscoped_calls: - raise LoopyError("Unknown function obtained %s -- register a function" - " or a kernel corresponding to it." % unscoped_calls[0]) + raise LoopyError("Unknown function '%s' obtained -- register a function" + " or a kernel corresponding to it." % unscoped_calls.pop()) # }}} @@ -2173,7 +2173,7 @@ def preprocess_kernel(kernel, device=None): # Checking if all the functions being used in the kernel and scoped to a # finite namespace - check_function_are_scoped(kernel) + check_functions_are_scoped(kernel) # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. 
-- GitLab From de52149856e367247875c7601807257a4ffd6cb4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Mar 2018 02:57:55 -0500 Subject: [PATCH 005/916] Added the support for type inference --- loopy/kernel/function_interface.py | 458 ++++++++++++++++------------- loopy/library/random123.py | 52 +++- loopy/type_inference.py | 39 ++- 3 files changed, 331 insertions(+), 218 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index d88841df7..a34869320 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -5,8 +5,6 @@ import numpy as np from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.types import NumpyType - # {{{ argument descriptors @@ -66,7 +64,137 @@ class ArrayArgDescriptor(ArgDescriptor): # }}} -# {{{ in kernel callable +# {{{ c with types + +def c_with_types(name, arg_id_to_dtype): + + # Specializing the type of the math function once they agree upon the + # function signature. + + if name in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: + for id, dtype in arg_id_to_dtype.items(): + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." % name) + + dtype = arg_id_to_dtype[0].numpy_dtype + + if dtype.kind == 'f': + # generic type resolve we can go ahead and specialize + pass + elif dtype.kind in ['u', 'i']: + # int and unsigned are casted into float32 + dtype = np.float32 + else: + raise LoopyError("%s function cannot take arguments of the type %s" + % (name, dtype)) + + # Done specializing. Returning the intended arg_id_to_dtype + return {-1: dtype, 0: dtype} + + # binary functions + elif name in ["max", "min"]: + for id, dtype in arg_id_to_dtype.items(): + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." % name) + + # finding the common type for all the dtypes involved + dtype = np.find_common_type( + [], [dtype.numpy_dtype for dtype in arg_id_to_dtype]) + + if dtype.kind == 'f': + # generic type resolve we can go ahead and specialize + pass + elif dtype.kind in ['u', 'i']: + # int and unsigned are implicitly casted into float32 + dtype = np.float32 + else: + raise LoopyError("%s function cannot take arguments of the type %s" + % (name, dtype)) + + # Specialized into one of the known types + return {-1: dtype, 0: arg_id_to_dtype[0], 1: arg_id_to_dtype[1]} + + else: + # could not specialize the function within the C namespace + # this would help when checking for OpenCL/CUDA function which are not + # present in C + return None + +# }}} + + +# {{{ opencl with_types + +def opencl_with_types(name, arg_id_to_dtype): + new_arg_id_to_dtype = c_with_types(name, arg_id_to_dtype) + if new_arg_id_to_dtype is None: + # could not locate the function within C's namespace. Searching in + # OpenCL specific namespace + + # FIXME: Need to add these functions over here + new_arg_id_to_dtype = None + + return new_arg_id_to_dtype + +# }}} + + +# {{{ pyopencl with_types + +def pyopencl_with_types(name, arg_id_to_dtype): + new_arg_id_to_dtype = opencl_with_types(name, arg_id_to_dtype) + if new_arg_id_to_dtype is None: + # could not locate the function within C's namespace. 
Searching in + # PyOpenCL specific namespace + + # FIXME: Need to add these functions over here + new_arg_id_to_dtype = None + + return new_arg_id_to_dtype + +# }}} + + +# {{{ cuda with_types + +def cuda_with_types(name, arg_id_to_dtype): + new_arg_id_to_dtype = c_with_types(name, arg_id_to_dtype) + if new_arg_id_to_dtype is None: + # could not locate the function within C's namespace. Searching in + # CUDA specific namespace + + # FIXME: Need to add these extra functions over here + new_arg_id_to_dtype = None + + return new_arg_id_to_dtype + +# }}} + + +# {{{ kw_to_pos + +def get_kw_pos_association(kernel): + kw_to_pos = {} + pos_to_kw = {} + + read_count = 0 + write_count = -1 + + for arg in kernel.args: + if arg.name in kernel.written_variables: + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 + else: + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + + return kw_to_pos, pos_to_kw + +# }}} + class InKernelCallable(ImmutableRecord): """ @@ -75,13 +203,25 @@ class InKernelCallable(ImmutableRecord): The name of the callable which can be encountered within a kernel. + .. attribute:: arg_id_to_dtype + + A mapping which indicates the arguments types and result types it would + be handling. This would be set once the callable is type specialized. + + .. attribute:: arg_id_to_descr + + A mapping which gives indicates the argument shape and `dim_tags` it + would be responsible for generating code. These parameters would be set, + once it is shape and stride(`dim_tags`) specialized. + .. note:: Negative ids in the mapping attributes indicate the result arguments """ - def __init__(self, name=None): + def __init__(self, name, subkernel=None, arg_id_to_dtype=None, + arg_id_to_descr=None): # {{{ sanity checks @@ -91,8 +231,10 @@ class InKernelCallable(ImmutableRecord): # }}} self.name = name + self.subkernel = subkernel - super(InKernelCallable, self).__init__(name=name) + super(InKernelCallable, self).__init__(name=name, + subkernel=subkernel) def copy(self, name=None): if name is None: @@ -100,7 +242,7 @@ class InKernelCallable(ImmutableRecord): return InKernelCallable(name=name) - def with_types(self, arg_id_to_dtype): + def with_types(self, arg_id_to_dtype, target): """ :arg arg_id_to_type: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -118,7 +260,103 @@ class InKernelCallable(ImmutableRecord): its keyword identifier. """ - raise NotImplementedError() + if self.arg_id_to_dtype: + # trying to specialize an already specialized function. + + if self.arg_id_to_dtype == arg_id_to_dtype: + return self.copy() + else: + raise LoopyError("Overwriting a specialized function--maybe" + " start with new instance of InKernelCallable?") + + # {{{ attempt to specialize using scalar functions + + from loopy.library import default_function_identifiers + if self.name in default_function_identifiers(): + ... 
+ elif self.name in target.ast_builder().function_identifiers: + from loopy.target.c import CTarget + from loopy.target.opencl import OpenCLTarget + from loopy.target.pyopencl import PyOpenCLTarget + from loopy.target.cuda import CudaTarget + + if isinstance(target, CTarget): + new_arg_id_to_dtype = c_with_types(arg_id_to_dtype) + + elif isinstance(target, OpenCLTarget): + new_arg_id_to_dtype = opencl_with_types(arg_id_to_dtype) + + elif isinstance(target, PyOpenCLTarget): + new_arg_id_to_dtype = pyopencl_with_types(arg_id_to_dtype) + + elif isinstance(target, CudaTarget): + new_arg_id_to_dtype = cuda_with_types(arg_id_to_dtype) + + else: + raise NotImplementedError("InKernelCallable.with_types() for" + " %s target" % target) + + # }}} + + if new_arg_id_to_dtype is not None: + # got our speciliazed function + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + + if self.subkernel is None: + # did not find a scalar function and function prototype does not + # even have subkernel registered => no match found + raise LoopyError("Function %s not present within" + " the %s namespace" % (self.name, target)) + + # {{{ attempt to specialization with array functions + + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + new_args = [] + for arg in self.subkernel.args: + kw = arg.name + if kw in arg_id_to_dtype: + # id exists as kw + new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) + elif kw_to_pos[kw] in arg_id_to_dtype: + # id exists as positional argument + new_args.append(arg.copy( + dtype=arg_id_to_dtype[kw_to_pos[kw]])) + else: + if kw in self.subkernel.read_variables(): + # need to know the type of the input arguments for type + # inference + raise LoopyError("Type of %s variable not supplied to the" + " subkernel, which is needed for type" + " inference." % kw) + new_args.append(arg) + + from loopy.type_inference import infer_unknown_types + pre_specialized_subkernel = self.subkernel.copy( + args=new_args) + + # inferring the types of the written variables based on the knowledge + # of the types of the arguments supplied + specialized_kernel = infer_unknown_types(pre_specialized_subkernel, + expect_completion=True) + new_arg_id_to_dtype = {} + read_count = 0 + write_count = -1 + for arg in specialized_kernel.args: + new_arg_id_to_dtype[arg.name] = arg.dtype + if arg.name in specialized_kernel.written_variables(): + new_arg_id_to_dtype[write_count] = arg.dtype + write_count -= 1 + else: + new_arg_id_to_dtype[read_count] = arg.dtype + read_count += 1 + + # }}} + + # Returning the kernel call with specialized subkernel and the corresponding + # new arg_id_to_dtype + return self.copy(subkernel=specialized_kernel, + arg_id_to_dtype=new_arg_id_to_dtype) def with_descrs(self, arg_id_to_descr): """ @@ -188,178 +426,11 @@ class InKernelCallable(ImmutableRecord): def __eq__(self, other): return (self.name == other.name and self.arg_id_to_descr == other.arg_id_to_descr - and self.arg_id_to_keyword == other.arg_id_to_keyword) + and self.arg_id_to_dtype == other.arg_id_to_keyword) def __hash__(self): return hash((self.name, )) -# }}} - - -# {{{ generic callable class - - -class CommonReturnTypeCallable(InKernelCallable): - """ A class of generic functions which have the following properties: - - Single return value - - Return type of the callable is a common dtype to all the input arguments - to the callable - - .. attribute:: name - - The name of the function as would be encountered in loopy. 
- - ..attribute:: specialized_dtype - - The dtype for which the function has been setup to generate code and - premables. For example, the function `sin` can be specialized to either one - of the following `float sin(float x)` or `double sin(double x)`. This is not - usually expected to be an input as this removed the generality of the - callable. - - ..attribute:: kinds_allowed - - The extent upto which the function can be generalized upto. For example - `sin(x)` cannot have complex types as its specialized type. - - ..attribute:: arity - - The number of inputs that are to be given to the function - - """ - - def __init__(self, name=None, specialized_dtype=None, kinds_allowed=None, - arity=None): - - super(CommonReturnTypeCallable, self).__init__(name=name) - - self.specialized_dtype = specialized_dtype - self.kinds_allowed = kinds_allowed - self.arity = arity - - def copy(self, specialized_dtype=None): - if specialized_dtype is None: - specialized_dtype = self.specialized_dtype - - return type(self)(self.name, specialized_dtype, - self.kinds_allowed, self.arity) - - def with_types(self, arg_id_to_dtype): - - specialized_dtype = np.find_common_type([], [dtype.numpy_dtype - for id, dtype in arg_id_to_dtype.items() if id >= 0]) - - if self.specialized_dtype is not None and (specialized_dtype != - self.specialized_dtype): - from loopy.warnings import warn - warn("Trying to change the type of the already set function." - "-- maybe use a different class instance?") - - new_arg_id_to_dtype = arg_id_to_dtype.copy() - # checking the compliance of the arg_id_to_dtype - - if -1 not in arg_id_to_dtype: - # return type was not know earlier, now setting it to the common type - new_arg_id_to_dtype[-1] = NumpyType(specialized_dtype) - - if self.arity+1 == len(new_arg_id_to_dtype) and (specialized_dtype.kind in - self.kinds_allowed): - # the function signature matched with the current instance. - # returning the function and the new_arg_id_to_dtype - for i in range(self.arity): - new_arg_id_to_dtype[i] = NumpyType(specialized_dtype) - - return (self.copy(specialized_dtype=specialized_dtype), - new_arg_id_to_dtype) - - return None - - def is_ready_for_code_gen(self): - return self.specilized_dtype is not None - - def get_target_specific_name(self, target): - raise NotImplementedError() - - def get_preamble(self, target): - raise NotImplementedError() - -# }}} - -# {{{ specific type callable class - - -class SpecificReturnTypeCallable(InKernelCallable): - """ A super class for the funcitons which cannot be listed as generic - functions. These types of Callables support explicity mentioning of the - arguments and result dtypes. - - .. attribute:: name - - The name of the function as would be encountered in loopy. - - .. attribute:: arg_id_to_dtype - - The dtype pattern of the arguments which is supposed to be used for checking - the applicability of this function in a given scenario. 
- """ - - def __init__(self, name=None, arg_id_to_dtype=None): - - super(SpecificReturnTypeCallable, self).__init__(name=name) - - if arg_id_to_dtype is None: - LoopyError("The function signature is incomplete without the" - "`arg_id_to_dtype`") - self.arg_id_to_dtype = arg_id_to_dtype - - def with_types(self, arg_id_to_dtype): - - # Checking the number of inputs - if len([id for id in arg_id_to_dtype if id >= 0]) != len( - [id for id in self.arg_id_to_dtype if id >= 0]): - # the number of input arguments do not match - return None - - # Checking the input dtypes - for id, dtype in arg_id_to_dtype.items(): - if id in self.arg_id_to_dtype and self.arg_id_to_dtype[id] == dtype: - # dtype matched with the one given in the input - pass - else: - # did not match with the function signature and hence returning - # None - return None - - # Setting the output if not present - new_arg_id_to_dtype = arg_id_to_dtype.copy() - for id, dtype in self.arg_id_to_dtype: - if id < 0: - # outputs - if id in new_arg_id_to_dtype and new_arg_id_to_dtype[id] != dtype: - # the output dtype had been supplied but did not match with the - # one in the function signature - return None - - new_arg_id_to_dtype[id] = dtype - - # Finally returning the types - return self.copy(), new_arg_id_to_dtype - - def is_ready_for_code_gen(self): - # everything about the function is determined at the constructor itself, - # hence always redy for codegen - return True - - def get_target_specific_name(self, target): - # defaults to the name of the function in Loopy. May change this specific to - # a target by inheriting this class and overriding this function. - return self.name - - def get_preamble(self, target): - return "" - -# }}} - # {{{ callable kernel @@ -417,43 +488,6 @@ class CallableKernel(InKernelCallable): # }}} - # Checking the input dtypes - for id, arg in self.subkernel.arg_dict.items(): - if id in self.subkernel.read_varibles(): - - # because we need the type of the parameters from the main kernel. It - # is necessary that we know the types from there. Hence asserting - # this condition - assert id in arg_id_to_dtype - - new_arg_dict = {} - for id, dtype in arg_id_to_dtype.items(): - # Making the type of the new arg according to the arg which has been - # called in the function. - new_arg_dict[id] = self.subkernel.arg_dict[id].copy(dtype=dtype) - - # Merging the 2 dictionaries so that to even incorporate the variables that - # were not mentioned in arg_id_to_dtype. 
- new_arg_dict = {**self.subkernel.arg_dict, **new_arg_dict} - - # Preprocessing the kernel so that we can get the types of the other - # variables that are involved in the args - from loopy.type_inference import infer_unknown_types - pre_specialized_subkernel = self.subkernel.copy( - args=list(new_arg_dict.values)) - - # inferring the types of the written variables based on the knowledge of the - # types of the arguments supplied - specialized_kernel = infer_unknown_types(pre_specialized_subkernel, - expect_completion=True) - - new_arg_id_to_dtype = {} - for id, arg in specialized_kernel.arg_dict: - new_arg_id_to_dtype[id] = arg.dtype - - # Returning the kernel call with specialized subkernel and the corresponding - # new arg_id_to_dtype - return self.copy(subkernel=specialized_kernel), specialized_kernel.arg_dict # }}} diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 82e44b2d1..871dde0a6 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -62,8 +62,12 @@ RNG_VARIANTS = [ _threefry_base_info.copy(width=4, bits=64), ] -FUNC_NAMES_TO_RNG = set(v.full_name + suffix for v in RNG_VARIANTS for suffix in - ["", "_f32", "_f64", ]) +FUNC_NAMES_TO_RNG = dict( + (v.full_name + suffix, v) + for v in RNG_VARIANTS + for suffix in [ + "", "_f32", "_f64", + ]) # }}} @@ -177,8 +181,46 @@ def random123_preamble_generator(preamble_info): def random123_function_identifiers(): - return FUNC_NAMES_TO_RNG - -# Removed the random123_function_mangler + return set(FUNC_NAMES_TO_RNG) + + +def random123_function_mangler(kernel, name, arg_dtypes): + try: + rng_variant = FUNC_NAMES_TO_RNG[name] + except KeyError: + return None + + from loopy.types import NumpyType + target = kernel.target + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + from loopy.kernel.data import CallMangleInfo + fn = rng_variant.full_name + if name == fn: + return CallMangleInfo( + target_name=fn+"_gen", + result_dtypes=(ctr_dtype, ctr_dtype), + arg_dtypes=(ctr_dtype, key_dtype)) + + elif name == fn + "_f32": + return CallMangleInfo( + target_name=name, + result_dtypes=( + target.vector_dtype(NumpyType(np.float32), rng_variant.width), + ctr_dtype), + arg_dtypes=(ctr_dtype, key_dtype)) + + elif name == fn + "_f64": + return CallMangleInfo( + target_name=name, + result_dtypes=( + target.vector_dtype(NumpyType(np.float64), rng_variant.width), + ctr_dtype), + arg_dtypes=(ctr_dtype, key_dtype)) + + else: + return None # vim: foldmethod=marker diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 010a0658f..699c045ea 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -60,6 +60,7 @@ class TypeInferenceMapper(CombineMapper): new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() + self.scoped_functions = kernel.scoped_functions def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -250,7 +251,9 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable + from pymbolic.primitives import Variable, Expression + from loopy.symbolic import SubArrayRef + from loopy.kernel.function_interface import ValueArgDescriptor identifier = expr.function if isinstance(identifier, Variable): @@ -270,6 +273,39 @@ class 
TypeInferenceMapper(CombineMapper): if None in arg_dtypes: return [] + arg_id_to_dtype = dict((i, dtype) for (i, dtype) in + enumerate(arg_dtypes)) + + # specializing the known function wrt type + in_knl_callable = ( + self.scoped_functions[expr.function.name].with_types( + arg_id_to_dtype)) + + # need to colllect arg_id_to_descr from the Subarrayrefs + arg_id_to_descr = {} + for id, par in enumerate(expr.parameters): + if isinstance(par, SubArrayRef): + arg_id_to_descr[id] = par.get_arg_descr() + elif isinstance(par, Expression): + arg_id_to_descr[id] = ValueArgDescriptor() + else: + # should not come over here + raise LoopyError("Unexpected parameter given to call") + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + result_dtypes = [] + + # collecting result dtypes in order of the assignees + + for i in range(len(new_arg_id_to_dtype)): + if -i-1 in new_arg_id_to_dtype: + result_dtypes.appen(new_arg_id_to_dtype[-i-1]) + else: + return result_dtypes + + """ + # Letting this stay over here, as it maybe needed later for maintaining + # backward compatibility mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) if return_tuple: if mangle_result is not None: @@ -285,6 +321,7 @@ class TypeInferenceMapper(CombineMapper): raise RuntimeError("unable to resolve " "function '%s' with %d given arguments" % (identifier, len(arg_dtypes))) + """ def map_variable(self, expr): if expr.name in self.kernel.all_inames(): -- GitLab From 98681cc078cf9275aad206f7436e45333d95e48e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Mar 2018 03:19:30 -0500 Subject: [PATCH 006/916] Added SubArrayRef --- loopy/symbolic.py | 121 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 16c9fd482..23617c48b 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -56,6 +56,7 @@ from pymbolic.mapper.constant_folder import \ ConstantFoldingMapper as ConstantFoldingMapperBase from pymbolic.parser import Parser as ParserBase +from loopy.diagnostic import LoopyError import islpy as isl from islpy import dim_type @@ -106,6 +107,9 @@ class IdentityMapperMixin(object): def map_type_annotation(self, expr, *args): return type(expr)(expr.type, self.rec(expr.child)) + def map_sub_array_ref(self, expr, *args): + return SubArrayRef(expr.swept_inames, expr.subscript) + map_type_cast = map_type_annotation map_linear_subscript = IdentityMapperBase.map_subscript @@ -169,6 +173,13 @@ class WalkMapper(WalkMapperBase): map_scoped_function = WalkMapperBase.map_variable + def map_sub_array_ref(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.swept_inames, *args) + self.rec(expr.subscript, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant @@ -241,6 +252,11 @@ class StringifyMapper(StringifyMapperBase): def map_scoped_function(self, expr, prec): return "ScopedFunction('%s')" % expr.name + def map_sub_array_ref(self, expr, prec): + return "SubArrayRef({inames}, ({subscr}))".format( + inames=self.rec(expr.swept_inames, prec), + subscr=self.rec(expr.subscript, prec)) + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): @@ -293,6 +309,10 @@ class DependencyMapper(DependencyMapperBase): def map_loopy_function_identifier(self, expr): return set() + def map_sub_array_ref(self, expr, *args): + deps = self.rec(expr.subscript, *args) + return deps - set(iname for iname in 
expr.swept_inames) + map_linear_subscript = DependencyMapperBase.map_subscript def map_type_cast(self, expr): @@ -660,6 +680,79 @@ class ScopedFunction(p.Variable): def stringifier(self): return StringifyMapper + +class SubArrayRef(p.Expression): + """Represents a generalized sliced notation of an array. + + .. attribute:: swept_inames + + These are a tuple of sweeping inames over the array. + + .. attribute:: subscript + + The subscript whose adress space is to be referenced + """ + + init_arg_names = ("swept_inames", "subscript") + + def __init__(self, swept_inames=None, subscript=None): + + # {{{ sanity checks + + if not isinstance(swept_inames, tuple): + assert isinstance(swept_inames, p.Variable) + swept_inames = (swept_inames,) + + assert isinstance(swept_inames, tuple) + + for iname in swept_inames: + assert isinstance(iname, p.Variable) + assert isinstance(subscript, p.Subscript) + + # }}} + + self.swept_inames = swept_inames + self.subscript = subscript + + def get_begin_subscript(self): + starting_inames = [] + for iname in self.subscript.index_tuple: + if iname in self.swept_inames: + starting_inames.append(parse('0')) + else: + starting_inames.append(iname) + return p.Subscript(self.subscript.aggregate, tuple(starting_inames)) + + def get_inner_dim_tags(self, arg_dim_tags): + """ Gives the dim tags for the inner inames. + This would be used for stride calculation in the child kernel. + This might need to go, once we start calculating the stride length + using the upper and lower bounds of the involved inames. + """ + from loopy.kernel.array import FixedStrideArrayDimTag as DimTag + inner_dim_tags = [] + for dim_tag, iname in zip(arg_dim_tags, self.subscript.index_tuple): + if iname in self.swept_inames: + inner_dim_tags.append(DimTag(dim_tag.stride)) + + return inner_dim_tags + + def __getinitargs__(self): + return (self.swept_inames, self.subscript) + + def get_hash(self): + return hash((self.__class__, self.swept_inames, self.subscript)) + + def is_equal(self, other): + return (other.__class__ == self.__class__ + and other.subscript == self.subscript + and other.swept_inames == self.swept_inames) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_sub_array_ref") + # }}} @@ -1122,6 +1215,14 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: return IdentityMapper.map_call(self, expr) + def map_call_with_kwargs(self, expr): + for par in expr.kw_parameters.values(): + if not isinstance(par, SubArrayRef): + raise LoopyError("Keyword Arguments is only supported for" + " array arguments--use positional order to specify" + " the order of the arguments in the call.") + return IdentityMapper.map_call_with_kwargs(self, expr) + # {{{ customization to pymbolic parser @@ -1152,7 +1253,9 @@ class LoopyParser(ParserBase): return float(val) # generic float def parse_prefix(self, pstate): - from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier + from pymbolic.parser import (_PREC_UNARY, _less, _greater, _identifier, + _openbracket, _closebracket, _colon) + if pstate.is_next(_less): pstate.advance() if pstate.is_next(_greater): @@ -1168,6 +1271,18 @@ class LoopyParser(ParserBase): return TypeAnnotation( typename, self.parse_expression(pstate, _PREC_UNARY)) + + elif pstate.is_next(_openbracket): + pstate.advance() + pstate.expect_not_end() + swept_inames = self.parse_expression(pstate) + pstate.expect(_closebracket) + pstate.advance() + pstate.expect(_colon) + pstate.advance() + subscript = self.parse_expression(pstate, _PREC_UNARY) + 
return SubArrayRef(swept_inames, subscript) + else: return super(LoopyParser, self).parse_prefix(pstate) @@ -1767,6 +1882,10 @@ class BatchedAccessRangeMapper(WalkMapper): def map_type_cast(self, expr, inames): return self.rec(expr.child, inames) + def map_sub_array_ref(self, expr, inames): + total_inames = inames | set([iname.name for iname in expr.swept_inames]) + return self.rec(expr.subscript, total_inames) + class AccessRangeMapper(object): """**IMPORTANT** -- GitLab From eb60d374a9f2fde28c2e38fd2bf0c503524360ae Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Mar 2018 03:53:26 -0500 Subject: [PATCH 007/916] Added the todos in preprocess.py --- loopy/preprocess.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index b3e2496ad..622590c71 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2181,6 +2181,10 @@ def preprocess_kernel(kernel, device=None): kernel = infer_unknown_types(kernel, expect_completion=False) + # TODO: Specializng based on: + # 1. ArgDescriptors + # 2. InameTags + check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) -- GitLab From 3c2dd4ffdba851f8f94a677bd549d02ac10ee354 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Mar 2018 05:38:05 -0500 Subject: [PATCH 008/916] Implemented the scope changing phenomenon. All head to Debugging! --- loopy/type_inference.py | 118 ++++++++++++++++++++++++++++++++++------ 1 file changed, 101 insertions(+), 17 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 699c045ea..ad45cc172 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -25,7 +25,10 @@ THE SOFTWARE. import six from pymbolic.mapper import CombineMapper +from pymbolic.primitives import Call, CallWithKwargs +from loopy.symbolic import IdentityMapper, ScopedFunction import numpy as np +import re from loopy.tools import is_integer from loopy.types import NumpyType @@ -34,6 +37,9 @@ from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) + import logging logger = logging.getLogger(__name__) @@ -61,6 +67,7 @@ class TypeInferenceMapper(CombineMapper): self.new_assignments = new_assignments self.symbols_with_unknown_types = set() self.scoped_functions = kernel.scoped_functions + self.specialized_functions = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -251,9 +258,7 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable, Expression - from loopy.symbolic import SubArrayRef - from loopy.kernel.function_interface import ValueArgDescriptor + from pymbolic.primitives import Variable identifier = expr.function if isinstance(identifier, Variable): @@ -281,16 +286,9 @@ class TypeInferenceMapper(CombineMapper): self.scoped_functions[expr.function.name].with_types( arg_id_to_dtype)) - # need to colllect arg_id_to_descr from the Subarrayrefs - arg_id_to_descr = {} - for id, par in enumerate(expr.parameters): - if isinstance(par, SubArrayRef): - arg_id_to_descr[id] = par.get_arg_descr() - elif isinstance(par, Expression): - arg_id_to_descr[id] = ValueArgDescriptor() - else: - # should not come over here - raise LoopyError("Unexpected parameter given to call") + # storing the type specialized function so that it can be used for + # later use + 
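        # A sketch of the convention assumed here: arg_id_to_dtype keys
        # parameters by position (0, 1, ...) and results by negative
        # indices (-1, -2, ...), so a float64 call to sin(x) specializes to
        # roughly {0: float64, -1: float64}.  The expression-to-callable
        # mapping recorded below is what later lets the caller rename and
        # re-scope each call site.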
self.specialized_functions[expr] = in_knl_callable new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype result_dtypes = [] @@ -488,11 +486,12 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types + return None, type_inf_mapper.symbols_with_unknown_types, None result = type_inf_mapper.combine(dtype_sets) - return result, type_inf_mapper.symbols_with_unknown_types + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.specialized_functions) # }}} @@ -517,6 +516,46 @@ class _DictUnionView: raise KeyError(key) +# {{{ FunctionType Specializer + + +# }}} + +# {{{ duplicating the funciton name + +def next_indexed_name(name): + FUNC_NAME = re.compile(r"^(?P\S+?)_(?P\d+?)$") + + match = FUNC_NAME.match(name) + + if match is None: + if name[-1] == '_': + return "{old_name}0".format(old_name=name) + else: + return "{old_name}_0".format(old_name=name) + + return "{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1) + +# }}} + + +# {{{ FunctionScopeChanger + +class FunctionScopeChanger(IdentityMapper): + def __init__(self, new_names): + self.new_names = new_names + + def map_call(self, expr): + return Call(ScopedFunction(self.new_names[expr]), + expr.parameters) + + def map_call_with_kwargs(self, expr): + return CallWithKwargs(ScopedFunction(self.new_names[expr]), + expr.parameters, expr.kw_parameters) +# }}} + + # {{{ infer_unknown_types def infer_unknown_types(kernel, expect_completion=False): @@ -590,6 +629,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + specialized_functions = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -613,7 +654,7 @@ def infer_unknown_types(kernel, expect_completion=False): debug("inferring type for %s %s", type(item).__name__, item.name) - result, symbols_with_unavailable_types = ( + result, symbols_with_unavailable_types, new_specialized_functions = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) @@ -634,6 +675,8 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + specialized_functions = {**specialized_functions, + **new_specialized_functions} else: debug(" failure") @@ -676,11 +719,52 @@ def infer_unknown_types(kernel, expect_completion=False): logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) - return unexpanded_kernel.copy( + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) + # {{{ type specialization + + # TODO: These 2 dictionaries are inverse mapping of each other and help to keep + # track of which ...(need to explain better) + scoped_names_to_functions = {} + scoped_functions_to_names = {} + pymbolic_calls_to_new_names = {} + + for pymbolic_call, knl_callable in specialized_functions.items(): + if knl_callable not in scoped_functions_to_names: + # need to make a new name deerived from the old name such that new + # name in not present in new_scoped_name_to_function + old_name = pymbolic_call.function.name + new_name = next_indexed_name(old_name) + while new_name not in scoped_names_to_functions: + new_name = next_indexed_name(new_name) + + scoped_names_to_functions[new_name] = knl_callable + 
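            # The two dicts populated here are inverses of each other: the
            # name-to-callable map guards against name collisions, while the
            # callable-to-name map lets later call sites reuse an
            # already-registered specialization instead of minting a new name.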
scoped_functions_to_names[knl_callable] = new_name + + pymbolic_calls_to_new_names[pymbolic_call] = ( + scoped_functions_to_names[knl_callable]) + + # }}} + + new_insns = [] + scope_changer = FunctionScopeChanger(pymbolic_calls_to_new_names) + for insn in pre_type_specialized_knl.instructions: + if isinstance(insn, (MultiAssignmentBase, CInstruction)): + expr = scope_changer(insn.expression) + new_insns.append(insn.copy(expression=expr)) + pass + elif isinstance(insn, _DataObliviousInstruction): + new_insns.append(insn) + else: + raise NotImplementedError("Type Inference Specialization not" + "implemented for %s instruciton" % type(insn)) + + return pre_type_specialized_knl.copy(scope_functions=scoped_names_to_functions, + instructions=new_insns) + # }}} -- GitLab From b86e05b2ae76f09ce2fe087c24efd555bb34c74a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Mar 2018 11:55:25 -0500 Subject: [PATCH 009/916] ScopedFunctions do not disappear on calling infer_unknown_types multiple times --- loopy/type_inference.py | 43 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index ad45cc172..23aa379dd 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -25,7 +25,6 @@ THE SOFTWARE. import six from pymbolic.mapper import CombineMapper -from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import IdentityMapper, ScopedFunction import numpy as np import re @@ -284,7 +283,7 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type in_knl_callable = ( self.scoped_functions[expr.function.name].with_types( - arg_id_to_dtype)) + arg_id_to_dtype, self.kernel.target)) # storing the type specialized function so that it can be used for # later use @@ -297,7 +296,7 @@ class TypeInferenceMapper(CombineMapper): for i in range(len(new_arg_id_to_dtype)): if -i-1 in new_arg_id_to_dtype: - result_dtypes.appen(new_arg_id_to_dtype[-i-1]) + result_dtypes.append(new_arg_id_to_dtype[-i-1]) else: return result_dtypes @@ -516,11 +515,6 @@ class _DictUnionView: raise KeyError(key) -# {{{ FunctionType Specializer - - -# }}} - # {{{ duplicating the funciton name def next_indexed_name(name): @@ -542,17 +536,35 @@ def next_indexed_name(name): # {{{ FunctionScopeChanger +#TODO: Make it sophisticated + class FunctionScopeChanger(IdentityMapper): def __init__(self, new_names): self.new_names = new_names + self.new_names_set = frozenset(new_names.values()) def map_call(self, expr): - return Call(ScopedFunction(self.new_names[expr]), - expr.parameters) + if expr in self.new_names: + return type(expr)( + ScopedFunction(self.new_names[expr]), + tuple(self.rec(child) + for child in expr.parameters)) + else: + return IdentityMapper.map_call(self, expr) def map_call_with_kwargs(self, expr): - return CallWithKwargs(ScopedFunction(self.new_names[expr]), - expr.parameters, expr.kw_parameters) + if expr in self.new_names: + return type(expr)( + ScopedFunction(self.new_names[expr]), + tuple(self.rec(child) + for child in expr.parameters), + dict( + (key, self.rec(val)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return IdentityMapper.map_call_with_kwargs(self, expr) + # }}} @@ -728,7 +740,7 @@ def infer_unknown_types(kernel, expect_completion=False): # TODO: These 2 dictionaries are inverse mapping of each other and help to keep # track of which ...(need to explain better) - scoped_names_to_functions = {} + scoped_names_to_functions = 
pre_type_specialized_knl.scoped_functions scoped_functions_to_names = {} pymbolic_calls_to_new_names = {} @@ -738,7 +750,7 @@ def infer_unknown_types(kernel, expect_completion=False): # name in not present in new_scoped_name_to_function old_name = pymbolic_call.function.name new_name = next_indexed_name(old_name) - while new_name not in scoped_names_to_functions: + while new_name in scoped_names_to_functions: new_name = next_indexed_name(new_name) scoped_names_to_functions[new_name] = knl_callable @@ -755,14 +767,13 @@ def infer_unknown_types(kernel, expect_completion=False): if isinstance(insn, (MultiAssignmentBase, CInstruction)): expr = scope_changer(insn.expression) new_insns.append(insn.copy(expression=expr)) - pass elif isinstance(insn, _DataObliviousInstruction): new_insns.append(insn) else: raise NotImplementedError("Type Inference Specialization not" "implemented for %s instruciton" % type(insn)) - return pre_type_specialized_knl.copy(scope_functions=scoped_names_to_functions, + return pre_type_specialized_knl.copy(scoped_functions=scoped_names_to_functions, instructions=new_insns) # }}} -- GitLab From 5f8efc595582f385e5b896515ba4fabe4c4bb75e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Mar 2018 11:58:49 -0500 Subject: [PATCH 010/916] Type specialization working. Now heading to shape and dim tags specializations --- loopy/kernel/__init__.py | 1 + loopy/kernel/function_interface.py | 38 +++++++++++------------- loopy/preprocess.py | 46 +++++++++++++++++++++++++++++- 3 files changed, 62 insertions(+), 23 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index d33053dea..851626a8d 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1341,6 +1341,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "temporary_variables", "iname_to_tag", "substitutions", + "scoped_functions", "iname_slab_increments", "loop_priority", "silenced_warnings", diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index a34869320..4bc7f3d76 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -4,6 +4,7 @@ import numpy as np from pytools import ImmutableRecord from loopy.diagnostic import LoopyError +from loopy.types import NumpyType # {{{ argument descriptors @@ -72,7 +73,7 @@ def c_with_types(name, arg_id_to_dtype): # function signature. if name in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "tan"]: for id, dtype in arg_id_to_dtype.items(): if not -1 <= id <= 0: raise LoopyError("%s can take only one argument." % name) @@ -90,6 +91,7 @@ def c_with_types(name, arg_id_to_dtype): % (name, dtype)) # Done specializing. 
Returning the intended arg_id_to_dtype + dtype = NumpyType(dtype) return {-1: dtype, 0: dtype} # binary functions @@ -113,7 +115,7 @@ def c_with_types(name, arg_id_to_dtype): % (name, dtype)) # Specialized into one of the known types - return {-1: dtype, 0: arg_id_to_dtype[0], 1: arg_id_to_dtype[1]} + return {-1: NumpyType(dtype), 0: arg_id_to_dtype[0], 1: arg_id_to_dtype[1]} else: # could not specialize the function within the C namespace @@ -182,7 +184,7 @@ def get_kw_pos_association(kernel): write_count = -1 for arg in kernel.args: - if arg.name in kernel.written_variables: + if arg.name in kernel.get_written_variables(): kw_to_pos[arg.name] = write_count pos_to_kw[write_count] = arg.name write_count -= 1 @@ -230,17 +232,10 @@ class InKernelCallable(ImmutableRecord): # }}} - self.name = name - self.subkernel = subkernel - super(InKernelCallable, self).__init__(name=name, - subkernel=subkernel) - - def copy(self, name=None): - if name is None: - name = self.name - - return InKernelCallable(name=name) + subkernel=subkernel, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) def with_types(self, arg_id_to_dtype, target): """ @@ -271,26 +266,26 @@ class InKernelCallable(ImmutableRecord): # {{{ attempt to specialize using scalar functions - from loopy.library import default_function_identifiers + from loopy.library.function import default_function_identifiers if self.name in default_function_identifiers(): ... - elif self.name in target.ast_builder().function_identifiers: + elif self.name in target.get_device_ast_builder().function_identifiers(): from loopy.target.c import CTarget from loopy.target.opencl import OpenCLTarget from loopy.target.pyopencl import PyOpenCLTarget from loopy.target.cuda import CudaTarget if isinstance(target, CTarget): - new_arg_id_to_dtype = c_with_types(arg_id_to_dtype) + new_arg_id_to_dtype = c_with_types(self.name, arg_id_to_dtype) elif isinstance(target, OpenCLTarget): - new_arg_id_to_dtype = opencl_with_types(arg_id_to_dtype) + new_arg_id_to_dtype = opencl_with_types(self.name, arg_id_to_dtype) elif isinstance(target, PyOpenCLTarget): - new_arg_id_to_dtype = pyopencl_with_types(arg_id_to_dtype) + new_arg_id_to_dtype = pyopencl_with_types(self.name, arg_id_to_dtype) elif isinstance(target, CudaTarget): - new_arg_id_to_dtype = cuda_with_types(arg_id_to_dtype) + new_arg_id_to_dtype = cuda_with_types(self.name, arg_id_to_dtype) else: raise NotImplementedError("InKernelCallable.with_types() for" @@ -344,7 +339,7 @@ class InKernelCallable(ImmutableRecord): write_count = -1 for arg in specialized_kernel.args: new_arg_id_to_dtype[arg.name] = arg.dtype - if arg.name in specialized_kernel.written_variables(): + if arg.name in specialized_kernel.get_written_variables(): new_arg_id_to_dtype[write_count] = arg.dtype write_count -= 1 else: @@ -429,7 +424,7 @@ class InKernelCallable(ImmutableRecord): and self.arg_id_to_dtype == other.arg_id_to_keyword) def __hash__(self): - return hash((self.name, )) + return hash((self.name, self.subkernel)) # {{{ callable kernel @@ -488,7 +483,6 @@ class CallableKernel(InKernelCallable): # }}} - # }}} # {{{ with_descriptors diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 622590c71..d7d961d25 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -37,9 +37,12 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import ScopedFunction +from 
loopy.symbolic import ScopedFunction, IdentityMapper from pymbolic.mapper import Collector +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) + import logging logger = logging.getLogger(__name__) @@ -2122,6 +2125,44 @@ def check_functions_are_scoped(kernel): # }}} +# {{{ arg_descr_inference + +# take help from the work we did yesterday to populate this +class ArgDescriptionAdder(IdentityMapper): + + def __init__(self,): + ... + + def map_call(self, expr): + ... + + +def arg_descr_inference(kernel): + """ Specializes the kernel functions in way that the functions agree upon + shape and dimensions of the arguments too. + """ + + # The rest are to be hanfled by array calls. Which would need a mapper. + + new_insns = [] + for insn in kernel.instructions: + if isinstance(insn, (MultiAssignmentBase, CInstruction)): + expr = ArgDescriptionAdder(insn.expression) + new_insns.append(insn.copy(expression=expr)) + elif isinstance(insn, _DataObliviousInstruction): + new_insns.append() + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + # get the new scoped functions, in a similar fashion we did for type + # inference + + return kernel.copy(instructions=new_insns) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2180,6 +2221,9 @@ def preprocess_kernel(kernel, device=None): # Get them out of the way. kernel = infer_unknown_types(kernel, expect_completion=False) + print(kernel.instructions) + print(kernel.scoped_functions) + 1/0 # TODO: Specializng based on: # 1. ArgDescriptors -- GitLab From e57ee723d85233eb81c3fc5af1efe2d73b40aab3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Mar 2018 01:10:53 -0500 Subject: [PATCH 011/916] arg_id_to_descr is working --- loopy/kernel/__init__.py | 6 +- loopy/kernel/function_interface.py | 174 +++++++++++++++++++++++++---- loopy/library/function.py | 5 - loopy/preprocess.py | 168 ++++++++++++++++++++++++---- loopy/symbolic.py | 13 ++- loopy/type_inference.py | 100 +---------------- 6 files changed, 316 insertions(+), 150 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 851626a8d..d716f0b78 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -37,8 +37,7 @@ from pytools import UniqueNameGenerator, generate_unique_names from loopy.library.function import ( default_function_mangler, - single_arg_function_mangler, - default_function_identifiers) + single_arg_function_mangler) from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted @@ -271,8 +270,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # Populating the function identifiers based on the target and the default # function identifiers - function_identifiers = (default_function_identifiers() | - target.get_device_ast_builder().function_identifiers()) + function_identifiers = target.get_device_ast_builder().function_identifiers() ImmutableRecordWithoutPickling.__init__(self, domains=domains, diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 4bc7f3d76..7127d142b 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -1,11 +1,18 @@ from __future__ import division, absolute_import +import re +import six import numpy as np from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from 
loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) + +from loopy.symbolic import IdentityMapper, ScopedFunction + # {{{ argument descriptors @@ -21,17 +28,20 @@ class ArgDescriptor(ImmutableRecord): mem_scope=None, shape=None, dim_tags=None): - super(ArgDescriptor).__init__(self, - mem_scope=mem_scope, + super(ArgDescriptor, self).__init__(mem_scope=mem_scope, shape=shape, dim_tags=dim_tags) class ValueArgDescriptor(ArgDescriptor): - """ - """ def __init__(self): - super(ValueArgDescriptor, self).__init__(self) + super(ValueArgDescriptor, self).__init__() + + def __str__(self): + return "ValueArgDescriptor" + + def __repr__(self): + return "ValueArgDescriptor" class ArrayArgDescriptor(ArgDescriptor): @@ -41,9 +51,10 @@ class ArrayArgDescriptor(ArgDescriptor): """ def __init__(self, + shape=None, mem_scope=None, dim_tags=None): - super(ArgDescriptor, self).__init__(self, + super(ArgDescriptor, self).__init__(shape=None, mem_scope=mem_scope, dim_tags=dim_tags) @@ -266,10 +277,7 @@ class InKernelCallable(ImmutableRecord): # {{{ attempt to specialize using scalar functions - from loopy.library.function import default_function_identifiers - if self.name in default_function_identifiers(): - ... - elif self.name in target.get_device_ast_builder().function_identifiers(): + if self.name in target.get_device_ast_builder().function_identifiers(): from loopy.target.c import CTarget from loopy.target.opencl import OpenCLTarget from loopy.target.pyopencl import PyOpenCLTarget @@ -371,7 +379,36 @@ class InKernelCallable(ImmutableRecord): its keyword identifier. """ - raise NotImplementedError() + if self.subkernel is None: + # This is a scalar call + # need to assert that the name is in funtion indentifiers + arg_id_to_descr[-1] = ValueArgDescriptor() + return self.copy(arg_id_to_descr=arg_id_to_descr) + + else: + # Now this ia a kernel call + # tuning the subkernel so that we have the the matching shapes and + # dim_tags. + # FIXME: Although We receive input if the argument is + # local/global. We do not use it to set the subkernel function + # signature. Need to do it, so that we can handle teporary inputs + # in the array call. + + # Collecting the parameters + new_args = self.args.copy() + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + for id, descr in arg_id_to_descr.items(): + if isinstance(id, str): + id = kw_to_pos[id] + assert isinstance(id, int) + new_args[id] = new_args[id].copy(shape=descr.shape, + dim_tags=descr.dim_tags) + + descriptor_specialized_knl = self.subkernel.copy(args=new_args) + + return self.copy(subkernel=descriptor_specialized_knl, + arg_id_to_descr=arg_id_to_descr) def with_iname_tag_usage(self, unusable, concurrent_shape): """ @@ -390,16 +427,10 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - def is_arg_written(self, arg_id): - """ - :arg arg_id: (keyword) name or position - """ - - raise NotImplementedError() - def is_ready_for_code_gen(self): - raise NotImplementedError() + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) # {{{ code generation @@ -413,6 +444,8 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() def emit_call(self, target): + # two varieties of this call, when obtained in between a function and + # when obtained as a separate instruction statement. 
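        # A rough sketch of the intended behaviour, assuming a C-like
        # target: an expression-level call would render to something like
        #     sinf(x[i])
        # while a call forming its own CallInstruction would render to a
        # statement in which the assignees are passed as additional
        # arguments, e.g.
        #     twice(x + 16*i, y + 16*i);
        # (function and variable names here are only illustrative).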
raise NotImplementedError() @@ -421,7 +454,7 @@ class InKernelCallable(ImmutableRecord): def __eq__(self, other): return (self.name == other.name and self.arg_id_to_descr == other.arg_id_to_descr - and self.arg_id_to_dtype == other.arg_id_to_keyword) + and self.arg_id_to_dtype == other.arg_id_to_dtype) def __hash__(self): return hash((self.name, self.subkernel)) @@ -530,4 +563,105 @@ class CallableKernel(InKernelCallable): # }}} + +# {{{ new pymbolic calls to scoped functions + +def next_indexed_name(name): + func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") + + match = func_name.match(name) + + if match is None: + if name[-1] == '_': + return "{old_name}0".format(old_name=name) + else: + return "{old_name}_0".format(old_name=name) + + return "{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1) + + +class FunctionScopeChanger(IdentityMapper): + #TODO: Make it sophisticated as in I don't like the if-else systems. Needs + # something else. + def __init__(self, new_names): + self.new_names = new_names + self.new_names_set = frozenset(new_names.values()) + + def map_call(self, expr): + if expr in self.new_names: + return type(expr)( + ScopedFunction(self.new_names[expr]), + tuple(self.rec(child) + for child in expr.parameters)) + else: + return IdentityMapper.map_call(self, expr) + + def map_call_with_kwargs(self, expr): + if expr in self.new_names: + return type(expr)( + ScopedFunction(self.new_names[expr]), + tuple(self.rec(child) + for child in expr.parameters), + dict( + (key, self.rec(val)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return IdentityMapper.map_call_with_kwargs(self, expr) + + +def register_pymbolic_calls_to_knl_callables(kernel, + pymbolic_calls_to_knl_callables): + """ Takes in a mapping :arg:`pymbolic_calls_to_knl_callables` and returns a + new kernel which includes an association with the given pymbolic calls to + instances of :class:`InKernelCallable` + """ + + scoped_names_to_functions = kernel.scoped_functions.copy() + + # A dict containing the new scoped functions to the names which have been + # assigned to them + scoped_functions_to_names = {} + + # A dict containing the new name that need to be assigned to the + # corresponding pymbolic call + pymbolic_calls_to_new_names = {} + + for pymbolic_call, in_knl_callable in pymbolic_calls_to_knl_callables.items(): + # checking if such a in-kernel callable already exists. + if in_knl_callable not in scoped_functions_to_names: + # No matching in_knl_callable found => make a new one with a new + # name. + + unique_name = next_indexed_name(pymbolic_call.function.name) + while unique_name in scoped_names_to_functions: + # keep on finding new names till one a unique one is found. + unique_name = next_indexed_name(unique_name) + + # book-keeping of the functions and names mappings for later use + scoped_names_to_functions[unique_name] = in_knl_callable + scoped_functions_to_names[in_knl_callable] = unique_name + + pymbolic_calls_to_new_names[pymbolic_call] = ( + scoped_functions_to_names[in_knl_callable]) + + # Using the data populated in pymbolic_calls_to_new_names to change the + # names of the scoped functions of all the calls in the kernel. 
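    # The names recorded in pymbolic_calls_to_new_names come from
    # next_indexed_name, which appends or increments a numeric suffix, e.g.
    #     "sin" -> "sin_0" -> "sin_1" -> ...
    # so two differently specialized uses of the same function end up bound
    # to distinct entries in scoped_names_to_functions.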
+ new_insns = [] + scope_changer = FunctionScopeChanger(pymbolic_calls_to_new_names) + for insn in kernel.instructions: + if isinstance(insn, (MultiAssignmentBase, CInstruction)): + expr = scope_changer(insn.expression) + new_insns.append(insn.copy(expression=expr)) + elif isinstance(insn, _DataObliviousInstruction): + new_insns.append(insn) + else: + raise NotImplementedError("Type Inference Specialization not" + "implemented for %s instruciton" % type(insn)) + return kernel.copy(scoped_functions=scoped_names_to_functions, + instructions=new_insns) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/library/function.py b/loopy/library/function.py index e8e1e22fa..3573f1d54 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -23,11 +23,6 @@ THE SOFTWARE. """ -def default_function_identifiers(): - from loopy.library.reduction import reduction_function_identifiers - return set("make_tuple") | reduction_function_identifiers() - - def default_function_mangler(kernel, name, arg_dtypes): from loopy.library.reduction import reduction_function_mangler diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d7d961d25..741f828e2 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -27,6 +27,7 @@ import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) +from functools import reduce import islpy as isl @@ -37,11 +38,11 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import ScopedFunction, IdentityMapper +from loopy.symbolic import ScopedFunction, CombineMapper from pymbolic.mapper import Collector from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction) + CallInstruction, _DataObliviousInstruction) import logging logger = logging.getLogger(__name__) @@ -2127,38 +2128,155 @@ def check_functions_are_scoped(kernel): # {{{ arg_descr_inference -# take help from the work we did yesterday to populate this -class ArgDescriptionAdder(IdentityMapper): +def get_arg_description_from_sub_array_ref(sub_array, kernel): + """ Gets the dim_tags, memory scope, shape informations of a + :class:`SubArrayRef` argument in the caller kernel packed into + :class:`ArrayArgDescriptor`. + """ + from loopy.kernel.function_interface import ArrayArgDescriptor - def __init__(self,): - ... + name = sub_array.subscript.attribute.name - def map_call(self, expr): - ... + if name in kernel.temporary_variables: + mem_scope = "LOCAL" + arg = kernel.temporary_variables[name] + assert name not in kernel.arg_dict + else: + assert name in kernel.arg_dict + mem_scope = "GLOBAL" + arg = kernel.arg_dict[name] + + sub_dim_tags, sub_shape = sub_array.get_sub_array_dim_tags_and_shape( + arg.dim_tags, arg.shape) + return ArrayArgDescriptor(mem_scope=mem_scope, + dim_tags=sub_dim_tags, + shape=sub_shape) -def arg_descr_inference(kernel): + +class ArgDescriptionInferer(CombineMapper): + """ Returns a set with elements as instances of :class:`tuple` (expr, + in_kenrel_callable). The mapped `in_kenrel_callable` of the + :class:`InKernelCallable` are descriptor specialized for the given + arguments. 
+ """ + + def __init__(self, scoped_functions): + self.scoped_functions = scoped_functions + + def combine(self, values): + import operator + return reduce(operator.or_, values, set()) + + def map_call(self, expr, **kwargs): + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import SubArrayRef + + # descriptors for the args + arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par)) + if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) + for i, par in enumerate(expr.parameters)) + + assignee_id_to_descr = {} + + # assignee descriptor + if 'assignees' in kwargs: + # If supplied with assignees then this is a CallInstruction + assignees = kwargs['assignees'] + assert isinstance(assignees, tuple) + for i, par in enumerate(assignees): + if isinstance(par, SubArrayRef): + assignee_id_to_descr[-i-1] = ( + get_arg_description_from_sub_array_ref(par)) + else: + assignee_id_to_descr[-i-1] = ValueArgDescriptor() + + # gathering all the descriptors + combined_arg_id_to_dtype = {**arg_id_to_descr, **assignee_id_to_descr} + + # specializing the function according to the parameter description + new_scoped_function = ( + self.scoped_functions[expr.function.name].with_descrs( + combined_arg_id_to_dtype)) + + # collecting the descriptors for args, kwargs, assignees + return set(((expr, new_scoped_function),)) + + def map_call_with_kwargs(self, expr, **kwargs): + from loopy.kernel.function_intergace import ValueArgDescriptor + from loopy.symbolic import SubArrayRef + + # descriptors for the args and kwargs: + arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par)) + if isinstance(par, SubArrayRef) else ValueArgDescriptor() + for i, par in enumerate(expr.parameters) + + expr.kw_parameters.items()) + + assignee_id_to_descr = {} + + if 'assignees' in kwargs: + # If supplied with assignees then this is a CallInstruction + assignees = kwargs['assignees'] + assert isinstance(assignees, tuple) + for i, par in enumerate(assignees): + if isinstance(par, SubArrayRef): + assignee_id_to_descr[-i-1] = ( + get_arg_description_from_sub_array_ref(par)) + else: + assignee_id_to_descr[-i-1] = ValueArgDescriptor() + + # gathering all the descriptors + combined_arg_id_to_descr = {**arg_id_to_descr, **assignee_id_to_descr} + + # specializing the function according to the parameter description + new_scoped_function = ( + self.scoped_functions[expr.function.name].with_descr( + combined_arg_id_to_descr)) + + # collecting the descriptors for args, kwargs, assignees + return set(((expr, new_scoped_function),)) + + def map_constant(self, expr): + return set() + + map_variable = map_constant + map_function_symbol = map_constant + +def infer_arg_descr(kernel): """ Specializes the kernel functions in way that the functions agree upon shape and dimensions of the arguments too. """ - # The rest are to be hanfled by array calls. Which would need a mapper. 
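    # A sketch of the descriptors gathered here, assuming a call such as
    #     f([j]: x[i, j])
    # with x a global argument of shape (n, m): the SubArrayRef parameter
    # maps to roughly
    #     ArrayArgDescriptor(mem_scope="GLOBAL", shape=(m,),
    #                        dim_tags=(FixedStrideArrayDimTag(1),))
    # while plain scalar parameters map to ValueArgDescriptor().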
+ arg_description_modifier = ArgDescriptionInferer(kernel.scoped_functions) + pymbolic_calls_to_functions = set() - new_insns = [] for insn in kernel.instructions: + + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important in + # determining the arg_id_to_dtype + pymbolic_calls_to_functions.update( + arg_description_modifier(insn.expression, + assignees=insn.assignees)) if isinstance(insn, (MultiAssignmentBase, CInstruction)): - expr = ArgDescriptionAdder(insn.expression) - new_insns.append(insn.copy(expression=expr)) + pymbolic_calls_to_functions.update(arg_description_modifier( + insn.expression)) elif isinstance(insn, _DataObliviousInstruction): - new_insns.append() + pass else: raise NotImplementedError("arg_descr_inference for %s instruction" % type(insn)) - # get the new scoped functions, in a similar fashion we did for type - # inference + # making it the set of tuples a dict + pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions) + + # Now do the similar treatment as done for type inference. + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + + return register_pymbolic_calls_to_knl_callables(kernel, + pymbolic_calls_to_functions) - return kernel.copy(instructions=new_insns) # }}} @@ -2221,9 +2339,6 @@ def preprocess_kernel(kernel, device=None): # Get them out of the way. kernel = infer_unknown_types(kernel, expect_completion=False) - print(kernel.instructions) - print(kernel.scoped_functions) - 1/0 # TODO: Specializng based on: # 1. ArgDescriptors @@ -2263,6 +2378,19 @@ def preprocess_kernel(kernel, device=None): # have been established kernel = check_atomic_loads(kernel) + kernel = infer_arg_descr(kernel) + + print(75*'-') + print("This is after Type Inference") + for insn in kernel.instructions: + print(insn) + print(75*'-') + print('Linked Functions:') + for name, func in kernel.scoped_functions.items(): + print(name, "=>", func) + print(75*'-') + 1/0 + kernel = kernel.target.preprocess(kernel) logger.info("%s: preprocess done" % kernel.name) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 23617c48b..8abda0f2a 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -723,19 +723,22 @@ class SubArrayRef(p.Expression): starting_inames.append(iname) return p.Subscript(self.subscript.aggregate, tuple(starting_inames)) - def get_inner_dim_tags(self, arg_dim_tags): + def get_sub_array_dim_tags_and_shape(self, arg_dim_tags, arg_shape): """ Gives the dim tags for the inner inames. This would be used for stride calculation in the child kernel. This might need to go, once we start calculating the stride length using the upper and lower bounds of the involved inames. """ from loopy.kernel.array import FixedStrideArrayDimTag as DimTag - inner_dim_tags = [] - for dim_tag, iname in zip(arg_dim_tags, self.subscript.index_tuple): + sub_dim_tags = [] + sub_shape = [] + for dim_tag, axis_length, iname in zip( + arg_dim_tags, arg_shape, self.subscript.index_tuple): if iname in self.swept_inames: - inner_dim_tags.append(DimTag(dim_tag.stride)) + sub_dim_tags.append(DimTag(dim_tag.stride)) + sub_shape.append(axis_length) - return inner_dim_tags + return sub_dim_tags, sub_shape def __getinitargs__(self): return (self.swept_inames, self.subscript) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 23aa379dd..bc8669528 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -25,9 +25,7 @@ THE SOFTWARE. 
import six from pymbolic.mapper import CombineMapper -from loopy.symbolic import IdentityMapper, ScopedFunction import numpy as np -import re from loopy.tools import is_integer from loopy.types import NumpyType @@ -36,9 +34,6 @@ from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) -from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction) - import logging logger = logging.getLogger(__name__) @@ -515,59 +510,6 @@ class _DictUnionView: raise KeyError(key) -# {{{ duplicating the funciton name - -def next_indexed_name(name): - FUNC_NAME = re.compile(r"^(?P\S+?)_(?P\d+?)$") - - match = FUNC_NAME.match(name) - - if match is None: - if name[-1] == '_': - return "{old_name}0".format(old_name=name) - else: - return "{old_name}_0".format(old_name=name) - - return "{alpha}_{num}".format(alpha=match.group('alpha'), - num=int(match.group('num'))+1) - -# }}} - - -# {{{ FunctionScopeChanger - -#TODO: Make it sophisticated - -class FunctionScopeChanger(IdentityMapper): - def __init__(self, new_names): - self.new_names = new_names - self.new_names_set = frozenset(new_names.values()) - - def map_call(self, expr): - if expr in self.new_names: - return type(expr)( - ScopedFunction(self.new_names[expr]), - tuple(self.rec(child) - for child in expr.parameters)) - else: - return IdentityMapper.map_call(self, expr) - - def map_call_with_kwargs(self, expr): - if expr in self.new_names: - return type(expr)( - ScopedFunction(self.new_names[expr]), - tuple(self.rec(child) - for child in expr.parameters), - dict( - (key, self.rec(val)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - else: - return IdentityMapper.map_call_with_kwargs(self, expr) - -# }}} - - # {{{ infer_unknown_types def infer_unknown_types(kernel, expect_completion=False): @@ -736,45 +678,11 @@ def infer_unknown_types(kernel, expect_completion=False): args=[new_arg_dict[arg.name] for arg in kernel.args], ) - # {{{ type specialization - - # TODO: These 2 dictionaries are inverse mapping of each other and help to keep - # track of which ...(need to explain better) - scoped_names_to_functions = pre_type_specialized_knl.scoped_functions - scoped_functions_to_names = {} - pymbolic_calls_to_new_names = {} - - for pymbolic_call, knl_callable in specialized_functions.items(): - if knl_callable not in scoped_functions_to_names: - # need to make a new name deerived from the old name such that new - # name in not present in new_scoped_name_to_function - old_name = pymbolic_call.function.name - new_name = next_indexed_name(old_name) - while new_name in scoped_names_to_functions: - new_name = next_indexed_name(new_name) - - scoped_names_to_functions[new_name] = knl_callable - scoped_functions_to_names[knl_callable] = new_name - - pymbolic_calls_to_new_names[pymbolic_call] = ( - scoped_functions_to_names[knl_callable]) - - # }}} - - new_insns = [] - scope_changer = FunctionScopeChanger(pymbolic_calls_to_new_names) - for insn in pre_type_specialized_knl.instructions: - if isinstance(insn, (MultiAssignmentBase, CInstruction)): - expr = scope_changer(insn.expression) - new_insns.append(insn.copy(expression=expr)) - elif isinstance(insn, _DataObliviousInstruction): - new_insns.append(insn) - else: - raise NotImplementedError("Type Inference Specialization not" - "implemented for %s instruciton" % type(insn)) + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + return register_pymbolic_calls_to_knl_callables( + 
pre_type_specialized_knl, specialized_functions) - return pre_type_specialized_knl.copy(scoped_functions=scoped_names_to_functions, - instructions=new_insns) # }}} -- GitLab From b36f74a5b4ff41eef3abd34ce4d533a15c0a765f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Mar 2018 16:22:16 -0500 Subject: [PATCH 012/916] Can now include SubArrayRef into the LHS assignees --- loopy/kernel/creation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 09b0ac180..f47144f94 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -497,14 +497,16 @@ def parse_insn(groups, insn_options): if isinstance(inner_lhs_i, Lookup): inner_lhs_i = inner_lhs_i.aggregate - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef if isinstance(inner_lhs_i, Variable): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) + elif isinstance(inner_lhs_i, SubArrayRef): + assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable or subscript" % (lhs_i,)) + "be variable, subscript or a SubArrayRef" % (lhs_i,)) new_lhs.append(lhs_i) -- GitLab From 4cbb9da0f722440f19dfbbb2a3e796d3e03b5a37 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Mar 2018 16:42:51 -0500 Subject: [PATCH 013/916] Includes support to SubArrayRef --- loopy/kernel/instruction.py | 49 ++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 95001c78b..d9b6384c8 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -487,7 +487,7 @@ class InstructionBase(ImmutableRecord): def _get_assignee_var_name(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -506,13 +506,20 @@ def _get_assignee_var_name(expr): assert isinstance(agg, Variable) return agg.name + + elif isinstance(expr, SubArrayRef): + agg = expr.subscript.aggregate + assert isinstance(agg, Variable) + + return agg.name + else: raise RuntimeError("invalid lvalue '%s'" % expr) def _get_assignee_subscript_deps(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, get_dependencies + from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -523,6 +530,8 @@ def _get_assignee_subscript_deps(expr): return get_dependencies(expr.index) elif isinstance(expr, LinearSubscript): return get_dependencies(expr.index) + elif isinstance(expr, SubArrayRef): + return get_dependencies(expr.get_begin_subscript().index) else: raise RuntimeError("invalid lvalue '%s'" % expr) @@ -961,9 +970,10 @@ class CallInstruction(MultiAssignmentBase): forced_iname_deps=forced_iname_deps, forced_iname_deps_is_final=forced_iname_deps_is_final) - from pymbolic.primitives import Call + from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)) and expression is not None: + if not isinstance(expression, (Call, CallWithKwargs, Reduction)) and ( + expression is not None): raise LoopyError("'expression' 
argument to CallInstruction " "must be a function call") @@ -979,9 +989,10 @@ class CallInstruction(MultiAssignmentBase): expression = parse(expression) from pymbolic.primitives import Variable, Subscript - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef for assignee in assignees: - if not isinstance(assignee, (Variable, Subscript, LinearSubscript)): + if not isinstance(assignee, (Variable, Subscript, LinearSubscript, + SubArrayRef)): raise LoopyError("invalid lvalue '%s'" % assignee) self.assignees = assignees @@ -1035,16 +1046,36 @@ class CallInstruction(MultiAssignmentBase): # }}} +def is_array_call(assignees, expression): + from pymbolic.primitives import Call, CallWithKwargs + from loopy.symbolic import SubArrayRef + + if not isinstance(expression, (Call, CallWithKwargs)): + return False + + for assignee in assignees: + if isinstance(assignee, SubArrayRef): + return True + + for par in expression.parameters: + if isinstance(assignee, SubArrayRef): + return True + + # did not encounter SubArrayRef, hence must be a normal call + return False + + def make_assignment(assignees, expression, temp_var_types=None, **kwargs): - if len(assignees) > 1 or len(assignees) == 0: + if len(assignees) > 1 or len(assignees) == 0 or is_array_call(assignees, + expression): atomicity = kwargs.pop("atomicity", ()) if atomicity: raise LoopyError("atomic operations with more than one " "left-hand side not supported") - from pymbolic.primitives import Call + from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)): + if not isinstance(expression, (Call, CallWithKwargs, Reduction)): raise LoopyError("right-hand side in multiple assignment must be " "function call or reduction, got: '%s'" % expression) -- GitLab From 8bda75e1920ac1cbc8138b7895716d92f2f6288d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Mar 2018 19:41:34 -0500 Subject: [PATCH 014/916] made the function scoper recursive --- loopy/kernel/creation.py | 46 +++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f47144f94..190a80d3b 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1835,32 +1835,44 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): class FunctionScoper(IdentityMapper): + """ + Subclass of :class:`IdentityMapper` which converts functions known to + the kernel at to instances of :class:`ScopedFunction`. + + .. _example: + + If given an expression of the form `sin(x) + unknown_function(y) + + log(z)`, then the mapper would return `ScopedFunction('sin')(x) + + unknown_function(y) + ScopedFunction('log')(z)`. Since the + `unknown_function` is not known to the kernel it is not marked as a + `ScopedFunction`. + """ def __init__(self, function_ids): self.function_ids = function_ids def map_call(self, expr): + from loopy.symbolic import ScopedFunction if expr.function.name in self.function_ids: - # 1. need to change the function to ScopedFunction instead of Variable + # The function is one of the known function hence scoping it. 
from pymbolic.primitives import Call - from loopy.symbolic import ScopedFunction - return super(FunctionScoper, self).map_call( - Call(function=ScopedFunction(expr.function.name), - parameters=expr.parameters)) - - else: - return super(FunctionScoper, self).map_call(expr) + return Call( + ScopedFunction(expr.function.name), + tuple(self.rec(child) + for child in expr.parameters)) def map_call_with_kwargs(self, expr): if expr.function.name in self.function_ids: from pymbolic.primitives import CallWithKwargs from loopy.symbolic import ScopedFunction - return super(FunctionScoper, self).map_call_with_kwargs( - CallWithKwargs(function=ScopedFunction(expr.function.name), - parameters=expr.parameters, - kw_parameters=expr.kw_parameters)) - else: - return super(FunctionScoper, self).map_call_with_kwargs(expr) + return CallWithKwargs( + ScopedFunction(expr.function.name), + tuple(self.rec(child) + for child in expr.parameters), + dict( + (key, self.rec(val)) + for key, val in six.iteritems(expr.kw_parameters)) + ) class ScopedFunctionCollector(Collector): @@ -1868,6 +1880,8 @@ class ScopedFunctionCollector(Collector): def map_scoped_function(self, expr): return set([expr.name]) + map_sub_array_ref = Collector.map_constant + def scope_functions(kernel): func_ids = kernel.function_identifiers.copy() @@ -1887,7 +1901,7 @@ def scope_functions(kernel): elif isinstance(insn, _DataObliviousInstruction): new_insns.append(insn) else: - raise NotImplementedError("scope_function not implemented for %s" % + raise NotImplementedError("scope_functions not implemented for %s" % type(insn)) # Need to combine the scoped functions into a dict @@ -2235,8 +2249,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_written_variable_names(knl) # Function Lookup - # TODO: here I add my function for function_lookup. Lol. 
realize the UN-inteded - # pun knl = scope_functions(knl) from loopy.preprocess import prepare_for_caching -- GitLab From 19cc672990effff5a7e119a6582b2943e3dda6f7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Mar 2018 19:44:28 -0500 Subject: [PATCH 015/916] Removed the logic error in ArgDescriptorInferer --- loopy/preprocess.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 741f828e2..01eeb5130 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2166,7 +2166,7 @@ class ArgDescriptionInferer(CombineMapper): def combine(self, values): import operator - return reduce(operator.or_, values, set()) + return reduce(operator.or_, values, frozenset()) def map_call(self, expr, **kwargs): from loopy.kernel.function_interface import ValueArgDescriptor @@ -2200,7 +2200,9 @@ class ArgDescriptionInferer(CombineMapper): combined_arg_id_to_dtype)) # collecting the descriptors for args, kwargs, assignees - return set(((expr, new_scoped_function),)) + return ( + frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) def map_call_with_kwargs(self, expr, **kwargs): from loopy.kernel.function_intergace import ValueArgDescriptor @@ -2234,14 +2236,17 @@ class ArgDescriptionInferer(CombineMapper): combined_arg_id_to_descr)) # collecting the descriptors for args, kwargs, assignees - return set(((expr, new_scoped_function),)) + return ( + frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) def map_constant(self, expr): - return set() + return frozenset() map_variable = map_constant map_function_symbol = map_constant + def infer_arg_descr(kernel): """ Specializes the kernel functions in way that the functions agree upon shape and dimensions of the arguments too. @@ -2259,8 +2264,8 @@ def infer_arg_descr(kernel): arg_description_modifier(insn.expression, assignees=insn.assignees)) if isinstance(insn, (MultiAssignmentBase, CInstruction)): - pymbolic_calls_to_functions.update(arg_description_modifier( - insn.expression)) + a = arg_description_modifier(insn.expression) + pymbolic_calls_to_functions.update(a) elif isinstance(insn, _DataObliviousInstruction): pass else: -- GitLab From 442a45041e4c29edfb79fdbd35b58ed42d74f92f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Mar 2018 20:37:24 -0500 Subject: [PATCH 016/916] correctly handles unkonwn functions now. --- loopy/kernel/creation.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 190a80d3b..1343233bf 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1861,6 +1861,9 @@ class FunctionScoper(IdentityMapper): tuple(self.rec(child) for child in expr.parameters)) + # This is an unknown function as of yet, not modifying it. + return IdentityMapper.map_call(self, expr) + def map_call_with_kwargs(self, expr): if expr.function.name in self.function_ids: from pymbolic.primitives import CallWithKwargs @@ -1874,13 +1877,20 @@ class FunctionScoper(IdentityMapper): for key, val in six.iteritems(expr.kw_parameters)) ) + # This is an unknown function as of yet, not modifying it. + return IdentityMapper.map_call(self, expr) + class ScopedFunctionCollector(Collector): + """ This mapper would collect all the instances of :class:`ScopedFunction` + occurring in the expression and written all of them as a :class:`set`. 
+ """ def map_scoped_function(self, expr): return set([expr.name]) - map_sub_array_ref = Collector.map_constant + def map_sub_array_ref(self, expr): + return set() def scope_functions(kernel): -- GitLab From e2222bc17592423760c60358d63bd68c542f2efd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Mar 2018 21:33:37 -0500 Subject: [PATCH 017/916] changes the doctrings --- loopy/kernel/creation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1343233bf..cdad141a1 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1836,8 +1836,8 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): class FunctionScoper(IdentityMapper): """ - Subclass of :class:`IdentityMapper` which converts functions known to - the kernel at to instances of :class:`ScopedFunction`. + Converts functions known to the kernel as instances of + :class:`ScopedFunction`. .. _example: -- GitLab From e4f4949eb8e4c2563b005d0265538f2d70eafca8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Mar 2018 21:38:29 -0500 Subject: [PATCH 018/916] starts registering callee kernels inside the caller kernel --- loopy/transform/register_knl.py | 112 ++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 loopy/transform/register_knl.py diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py new file mode 100644 index 000000000..691c0c51a --- /dev/null +++ b/loopy/transform/register_knl.py @@ -0,0 +1,112 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.kernel import LoopKernel +from loopy.kernel.creation import FunctionScoper +from loopy.diagnostic import LoopyError +from loopy.function_interface import InKernelCallable + +from loopy.kenrel.instruction import (MultiAssignmentBase, CallInstruction, + CInstruction, _DataObliviousInstruction) + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_callable_kernel +""" + + +# {{{ main entrypoint + +def register_callable_kernel(parent, function_name, child): + """ + The purpose of this transformation is so that one can inoke the child + kernel in the parent kernel. + + :arg parent + + This is the "main" kernel which will mostly remain unaltered and one + can interpret it as stitching up the child kernel in the parent kernel. 
+ + :arg function_name + + The name of the function call with which the child kernel must be + associated in the parent kernel + + :arg child + + This is like a function in every other language and this might be + invoked in one of the instructions of the parent kernel. + + ..note:: + + One should note that the kernels would go under stringent compatibilty + tests so that both of them can be confirmed to be made for each other. + """ + + # {{{ Sanity Checks + + assert isinstance(parent, LoopKernel) + assert isinstance(child, LoopKernel) + assert isinstance(function_name, str) + assert function_name not in parent.auxiliary_kernels, ( + "%s has already been used with some other kernel. One" + "function can only be associated with a single kernel" % ( + function_name)) + + # }}} + + # scoping the function + function_scoper = FunctionScoper(set([function_name])) + new_insns = [] + + for insn in parent.instructions: + if isinstance(insn, CallInstruction): + new_insn = insn.copy(expression=function_scoper(insn.expression)) + new_insns.append(new_insn) + elif isinstance(insn, (_DataObliviousInstruction, MultiAssignmentBase, + CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("scope_functions not implemented for %s" % + type(insn)) + + # adding the scoped function to the scoped function dict of the parent + # kernel. + + scoped_functions = parent.scoped_functions.copy() + + if function_name in scoped_functions: + raise LoopyError("%s is already being used as a funciton name -- maybe" + "use a different name for registering the subkernel") + + scoped_functions[function_name] = InKernelCallable(name=function_name, + subkernel=child) + + # returning the parent kernel with the new scoped function dictionary + return parent.copy(scope_functions=scoped_functions) + +# }}} + +# vim: foldmethod=marker -- GitLab From 06c929056e84beae54dbea2c7ec53479c0536ba9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Mar 2018 21:39:39 -0500 Subject: [PATCH 019/916] removes extra empty line --- loopy/kernel/creation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index cdad141a1..c0c8e73be 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1833,7 +1833,6 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # {{{ lookup functions - class FunctionScoper(IdentityMapper): """ Converts functions known to the kernel as instances of -- GitLab From 0cf8b6051a9b2731021ce6412b25866cec979ff5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Mar 2018 22:32:50 -0500 Subject: [PATCH 020/916] Subkernel call, getting interpreted correctly. 
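A rough usage sketch of where this series is heading; the kernels, names and
call syntax below are illustrative and may not run unchanged at this point in
the branch:

    import loopy as lp

    # callee kernel: scales a fixed-size vector
    child = lp.make_kernel(
        "{[j]: 0 <= j < 16}",
        "out[j] = 2 * inp[j]",
        name="twice")

    # caller kernel: applies the callee to each row of a 2D array, using
    # the bracketed SubArrayRef syntax introduced earlier in the series
    parent = lp.make_kernel(
        "{[i, k]: 0 <= i < 32 and 0 <= k < 16}",
        "[k]: y[i, k] = twice([k]: x[i, k])")

    # associate the call site named "twice" with the callee kernel
    parent = lp.register_callable_kernel(parent, "twice", child)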
--- loopy/__init__.py | 4 ++ loopy/kernel/__init__.py | 2 +- loopy/kernel/data.py | 8 +++ loopy/kernel/function_interface.py | 75 ++++++++++++++++++++---- loopy/preprocess.py | 38 ++++++------ loopy/symbolic.py | 5 +- loopy/target/c/__init__.py | 87 +++++----------------------- loopy/target/c/codegen/expression.py | 4 ++ loopy/transform/register_knl.py | 13 ++--- loopy/type_inference.py | 31 +++++++++- 10 files changed, 154 insertions(+), 113 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 89683e0b4..4fa8c5fc5 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -116,6 +116,8 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier +from loopy.transform.register_knl import register_callable_kernel + # }}} from loopy.type_inference import infer_unknown_types @@ -222,6 +224,8 @@ __all__ = [ "add_barrier", + "register_callable_kernel", + # }}} "get_dot_dependency_graph", diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index d716f0b78..25737786c 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1339,7 +1339,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): "temporary_variables", "iname_to_tag", "substitutions", - "scoped_functions", "iname_slab_increments", "loop_priority", "silenced_warnings", @@ -1362,6 +1361,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "preamble_generators", "function_manglers", "symbol_manglers", + "scoped_functions", ) def update_persistent_hash(self, key_hash, key_builder): diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index c90e8a64b..59297e475 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -607,6 +607,13 @@ class SubstitutionRule(ImmutableRecord): # {{{ function call mangling class CallMangleInfo(ImmutableRecord): + def __init__(self): + raise NotImplementedError("New Mangler interface expected") + + +# FIXME: Uncomment it once everything is done. +# KK: Removed it for the duration the new mangler interface starts working. +''' """ .. 
attribute:: target_name @@ -631,6 +638,7 @@ class CallMangleInfo(ImmutableRecord): target_name=target_name, result_dtypes=result_dtypes, arg_dtypes=arg_dtypes) +''' # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 7127d142b..bb88cc091 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -54,6 +54,13 @@ class ArrayArgDescriptor(ArgDescriptor): shape=None, mem_scope=None, dim_tags=None): + + # {{{ sanity checks + + assert isinstance(shape, tuple) + + # }}} + super(ArgDescriptor, self).__init__(shape=None, mem_scope=mem_scope, dim_tags=dim_tags) @@ -299,11 +306,11 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError("InKernelCallable.with_types() for" " %s target" % target) - # }}} + if new_arg_id_to_dtype is not None: + # got our speciliazed function + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) - if new_arg_id_to_dtype is not None: - # got our speciliazed function - return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + # }}} if self.subkernel is None: # did not find a scalar function and function prototype does not @@ -326,7 +333,7 @@ class InKernelCallable(ImmutableRecord): new_args.append(arg.copy( dtype=arg_id_to_dtype[kw_to_pos[kw]])) else: - if kw in self.subkernel.read_variables(): + if kw in self.subkernel.get_read_variables(): # need to know the type of the input arguments for type # inference raise LoopyError("Type of %s variable not supplied to the" @@ -395,7 +402,7 @@ class InKernelCallable(ImmutableRecord): # in the array call. # Collecting the parameters - new_args = self.args.copy() + new_args = self.subkernel.args.copy() kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) for id, descr in arg_id_to_descr.items(): @@ -441,20 +448,59 @@ class InKernelCallable(ImmutableRecord): def get_target_specific_name(self, target): + if self.subkernel is None: + raise NotImplementedError() + else: + return self.subkernel.name + raise NotImplementedError() - def emit_call(self, target): - # two varieties of this call, when obtained in between a function and - # when obtained as a separate instruction statement. + def emit_call(self, insn, target, expression_to_code_mapper): - raise NotImplementedError() + from loopy.kernel.instruction import CallInstruction + from pymbolic.primitives import CallWithKwargs + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + kw_parameters = {} + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + + assignees = insn.assignees + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + for i in range(len(parameters), len(parameters)+len(kw_parameters)): + parameters.append(kw_parameters[pos_to_kw[i]]) + par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) + + # TODO: currently no suppport for insn keywords. + parameters = parameters + list(assignees) + par_dtypes = par_dtypes + [self.arg_id_to_dtype[-i-1] for i, _ in + enumerate(assignees)] + + # Note that we are not going to do any type casting in array calls. 
+ from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + + from pymbolic import var + return var(self.get_target_specific_name(target))(*c_parameters) # }}} def __eq__(self, other): return (self.name == other.name and self.arg_id_to_descr == other.arg_id_to_descr - and self.arg_id_to_dtype == other.arg_id_to_dtype) + and self.arg_id_to_dtype == other.arg_id_to_dtype + and self.subkernel == other.subkernel) def __hash__(self): return hash((self.name, self.subkernel)) @@ -640,6 +686,13 @@ def register_pymbolic_calls_to_knl_callables(kernel, unique_name = next_indexed_name(unique_name) # book-keeping of the functions and names mappings for later use + if in_knl_callable.subkernel is not None: + # changing the name of the subkenrel so that it emits a function + # with the name same as the name being used in the + # scoped_function. + new_subkernel = in_knl_callable.subkernel.copy( + name=unique_name) + in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) scoped_names_to_functions[unique_name] = in_knl_callable scoped_functions_to_names[in_knl_callable] = unique_name diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 01eeb5130..068953a52 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2135,7 +2135,7 @@ def get_arg_description_from_sub_array_ref(sub_array, kernel): """ from loopy.kernel.function_interface import ArrayArgDescriptor - name = sub_array.subscript.attribute.name + name = sub_array.subscript.aggregate.name if name in kernel.temporary_variables: mem_scope = "LOCAL" @@ -2161,8 +2161,8 @@ class ArgDescriptionInferer(CombineMapper): arguments. 
""" - def __init__(self, scoped_functions): - self.scoped_functions = scoped_functions + def __init__(self, kernel): + self.kernel = kernel def combine(self, values): import operator @@ -2173,7 +2173,8 @@ class ArgDescriptionInferer(CombineMapper): from loopy.symbolic import SubArrayRef # descriptors for the args - arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par)) + arg_id_to_descr = dict((i, + get_arg_description_from_sub_array_ref(par, self.kernel)) if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) for i, par in enumerate(expr.parameters)) @@ -2187,7 +2188,8 @@ class ArgDescriptionInferer(CombineMapper): for i, par in enumerate(assignees): if isinstance(par, SubArrayRef): assignee_id_to_descr[-i-1] = ( - get_arg_description_from_sub_array_ref(par)) + get_arg_description_from_sub_array_ref(par, + self.kernel)) else: assignee_id_to_descr[-i-1] = ValueArgDescriptor() @@ -2196,20 +2198,21 @@ class ArgDescriptionInferer(CombineMapper): # specializing the function according to the parameter description new_scoped_function = ( - self.scoped_functions[expr.function.name].with_descrs( + self.kernel.scoped_functions[expr.function.name].with_descrs( combined_arg_id_to_dtype)) # collecting the descriptors for args, kwargs, assignees - return ( - frozenset(((expr, new_scoped_function), )) | - self.combine((self.rec(child) for child in expr.parameters))) + a = frozenset(((expr, new_scoped_function), )) + b = self.combine((self.rec(child) for child in expr.parameters)) + return (a | b) def map_call_with_kwargs(self, expr, **kwargs): from loopy.kernel.function_intergace import ValueArgDescriptor from loopy.symbolic import SubArrayRef # descriptors for the args and kwargs: - arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par)) + arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par, + self.kernel)) if isinstance(par, SubArrayRef) else ValueArgDescriptor() for i, par in enumerate(expr.parameters) + expr.kw_parameters.items()) @@ -2223,7 +2226,8 @@ class ArgDescriptionInferer(CombineMapper): for i, par in enumerate(assignees): if isinstance(par, SubArrayRef): assignee_id_to_descr[-i-1] = ( - get_arg_description_from_sub_array_ref(par)) + get_arg_description_from_sub_array_ref(par, + self.kernel)) else: assignee_id_to_descr[-i-1] = ValueArgDescriptor() @@ -2232,7 +2236,7 @@ class ArgDescriptionInferer(CombineMapper): # specializing the function according to the parameter description new_scoped_function = ( - self.scoped_functions[expr.function.name].with_descr( + self.kernel.scoped_functions[expr.function.name].with_descr( combined_arg_id_to_descr)) # collecting the descriptors for args, kwargs, assignees @@ -2252,7 +2256,7 @@ def infer_arg_descr(kernel): shape and dimensions of the arguments too. 
""" - arg_description_modifier = ArgDescriptionInferer(kernel.scoped_functions) + arg_description_modifier = ArgDescriptionInferer(kernel) pymbolic_calls_to_functions = set() for insn in kernel.instructions: @@ -2264,8 +2268,7 @@ def infer_arg_descr(kernel): arg_description_modifier(insn.expression, assignees=insn.assignees)) if isinstance(insn, (MultiAssignmentBase, CInstruction)): - a = arg_description_modifier(insn.expression) - pymbolic_calls_to_functions.update(a) + pymbolic_calls_to_functions.update(arg_description_modifier(insn.expression)) elif isinstance(insn, _DataObliviousInstruction): pass else: @@ -2392,9 +2395,10 @@ def preprocess_kernel(kernel, device=None): print(75*'-') print('Linked Functions:') for name, func in kernel.scoped_functions.items(): - print(name, "=>", func) + print(name, "=>", (func.name, func.arg_id_to_dtype, + func.arg_id_to_descr, func.subkernel.args)) + print() print(75*'-') - 1/0 kernel = kernel.target.preprocess(kernel) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8abda0f2a..bdfe57982 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -189,6 +189,9 @@ class CombineMapper(CombineMapperBase): def map_reduction(self, expr): return self.rec(expr.expr) + def map_sub_array_ref(self, expr): + return self.rec(expr.get_begin_subscript()) + map_linear_subscript = CombineMapperBase.map_subscript map_scoped_function = CombineMapperBase.map_variable @@ -738,7 +741,7 @@ class SubArrayRef(p.Expression): sub_dim_tags.append(DimTag(dim_tag.stride)) sub_shape.append(axis_length) - return sub_dim_tags, sub_shape + return sub_dim_tags, tuple(sub_shape) def __getinitargs__(self): return (self.swept_inames, self.subscript) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 2b5e394bb..28c346dcc 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -822,6 +822,10 @@ class CASTBuilder(ASTBuilderBase): lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic updates in %s" % type(self).__name__) + # FIXME: With the new mangler interface this should not be present, + # Commenting this part so that this does not get used anywhere in the + # meantime + ''' def emit_tuple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper @@ -844,84 +848,23 @@ class CASTBuilder(ASTBuilderBase): assignments.append(Assign(lhs_code, rhs_code)) return block_if_necessary(assignments) + ''' def emit_multiple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None - - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. 
- return self.emit_tuple_assignment(codegen_state, insn) - - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes)) - - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) - var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) - - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: - from cgen import ExpressionStatement - return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast( - mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) + func_id = insn.expression.function.name - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + in_knl_callable = codegen_state.kernel.scoped_functions[func_id] + in_knl_callable_as_call = in_knl_callable.emit_call( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + from cgen import ExpressionStatement + return ExpressionStatement( + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 59ed77f9c..17e485555 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -165,6 +165,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_tagged_variable(self, expr, type_context): return var(expr.name) + def map_sub_array_ref(self, expr, type_context): + return var("&")(self.rec(expr.get_begin_subscript(), + type_context)) + def map_subscript(self, expr, type_context): def base_impl(expr, type_context): return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')] diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 691c0c51a..f43550b5b 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -25,9 +25,9 @@ THE SOFTWARE. from loopy.kernel import LoopKernel from loopy.kernel.creation import FunctionScoper from loopy.diagnostic import LoopyError -from loopy.function_interface import InKernelCallable +from loopy.kernel.function_interface import InKernelCallable -from loopy.kenrel.instruction import (MultiAssignmentBase, CallInstruction, +from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) __doc__ = """ @@ -65,15 +65,11 @@ def register_callable_kernel(parent, function_name, child): tests so that both of them can be confirmed to be made for each other. 
""" - # {{{ Sanity Checks + # {{{ sanity checks assert isinstance(parent, LoopKernel) assert isinstance(child, LoopKernel) assert isinstance(function_name, str) - assert function_name not in parent.auxiliary_kernels, ( - "%s has already been used with some other kernel. One" - "function can only be associated with a single kernel" % ( - function_name)) # }}} @@ -105,7 +101,8 @@ def register_callable_kernel(parent, function_name, child): subkernel=child) # returning the parent kernel with the new scoped function dictionary - return parent.copy(scope_functions=scoped_functions) + return parent.copy(scoped_functions=scoped_functions, + instructions=new_insns) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index bc8669528..134603872 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -253,9 +253,10 @@ class TypeInferenceMapper(CombineMapper): def map_call(self, expr, return_tuple=False): from pymbolic.primitives import Variable + from loopy.symbolic import ScopedFunction identifier = expr.function - if isinstance(identifier, Variable): + if isinstance(identifier, (Variable, ScopedFunction)): identifier = identifier.name if identifier in ["indexof", "indexof_vec"]: @@ -297,7 +298,7 @@ class TypeInferenceMapper(CombineMapper): """ # Letting this stay over here, as it maybe needed later for maintaining - # backward compatibility + # backward compatibility: ~KK mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) if return_tuple: if mangle_result is not None: @@ -428,6 +429,10 @@ class TypeInferenceMapper(CombineMapper): return [expr.operation.result_dtypes(self.kernel, rec_result)[0] for rec_result in rec_results] + def map_sub_array_ref(self, expr): + return self.rec(expr.get_begin_subscript()) + + # }}} @@ -457,9 +462,16 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if isinstance(writer_insn, lp.Assignment): result = type_inf_mapper(expr, return_dtype_set=True) elif isinstance(writer_insn, lp.CallInstruction): - return_dtype_set = type_inf_mapper(expr, return_tuple=True, + result = type_inf_mapper(expr, return_dtype_set=True) + """ + # Maybe we need to alter this so that the type_inf_mapper returns a + # :class:`dict`? + # ask about this to Andreas Sir. + return_dtype_set = type_inf_mapper(expr, return_tuple=False, return_dtype_set=True) + print(return_dtype_set) + print(writer_insn.assignee_var_names()) result = [] for return_dtype_set in return_dtype_set: result_i = None @@ -474,6 +486,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): assert found if result_i is not None: result.append(result_i) + """ debug(" result: %s", result) @@ -678,6 +691,18 @@ def infer_unknown_types(kernel, expect_completion=False): args=[new_arg_dict[arg.name] for arg in kernel.args], ) + #------------------------------------------------------------------------ + # KK: + # FIXME: more type scoped function type specialization but needed for the + # specialization of the in kernel callables + # for example if an instruction is : + # `[i]:z[i] = a_kernel_function([j]:x[j], [k]: y[k])` + # and if the user already provided the types of the args: x, y, z. + # Then the instruction would not go through the TypeInferenceMapper and hence + # the function: `a_kernel_function` would not undergo type specialization, + # which would create problems in the future. 
+ #------------------------------------------------------------------------ + from loopy.kernel.function_interface import ( register_pymbolic_calls_to_knl_callables) return register_pymbolic_calls_to_knl_callables( -- GitLab From 94aec43bcdfacdf8413a7cb83f0429e841494fdc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Mar 2018 00:26:27 -0500 Subject: [PATCH 021/916] Subkernels working again :) --- loopy/codegen/__init__.py | 64 +++++++++- loopy/codegen/auxiliary_kernels.py | 188 +++++++++++++++++++++++++++++ loopy/kernel/function_interface.py | 3 +- loopy/preprocess.py | 24 ++-- loopy/type_inference.py | 28 +---- 5 files changed, 258 insertions(+), 49 deletions(-) create mode 100644 loopy/codegen/auxiliary_kernels.py diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e83515d31..57bf4c6a8 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -32,6 +32,13 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from cgen import Collection + +from loopy.kernel.instruction import ( + Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, + CInstruction, _DataObliviousInstruction) + + import logging logger = logging.getLogger(__name__) @@ -187,6 +194,12 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. attribute:: is_generating_master_kernel + + Can be either `True` or `False`. Indicating whether the code is being + generated for a master kernel or an auxiliary kernel. + """ def __init__(self, kernel, @@ -196,7 +209,8 @@ class CodeGenerationState(object): vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + schedule_index_end=None, + is_generating_master_kernel=None): self.kernel = kernel self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain @@ -211,6 +225,7 @@ class CodeGenerationState(object): self.is_generating_device_code = is_generating_device_code self.gen_program_name = gen_program_name self.schedule_index_end = schedule_index_end + self.is_generating_master_kernel = is_generating_master_kernel # {{{ copy helpers @@ -219,7 +234,8 @@ class CodeGenerationState(object): var_subst_map=None, vectorization_info=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None): + schedule_index_end=None, + is_generating_master_kernel=None): if kernel is None: kernel = self.kernel @@ -242,6 +258,9 @@ class CodeGenerationState(object): if schedule_index_end is None: schedule_index_end = self.schedule_index_end + if is_generating_master_kernel is None: + is_generating_master_kernel = self.is_generating_master_kernel + return CodeGenerationState( kernel=kernel, implemented_data_info=implemented_data_info, @@ -257,7 +276,8 @@ class CodeGenerationState(object): var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, gen_program_name=gen_program_name, - schedule_index_end=schedule_index_end) + schedule_index_end=schedule_index_end, + is_generating_master_kernel=is_generating_master_kernel) def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" @@ -470,13 +490,49 @@ def generate_code_v2(kernel): kernel.target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule)) + schedule_index_end=len(kernel.schedule), + 
is_generating_master_kernel=True) from loopy.codegen.result import generate_host_or_device_program + + # {{{ collecting ASTs of auxiliary kernels + + auxiliary_dev_progs = [] + + from loopy.codegen.auxiliary_kernels import generate_auxiliary_kernel_device_code + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + in_knl_callable = kernel.scoped_functions[insn.expression.function.name] + if in_knl_callable.subkernel is not None: + auxiliary_dev_prog = generate_auxiliary_kernel_device_code( + in_knl_callable.subkernel, + kernel.target).device_programs[0].ast + auxiliary_dev_progs.append(auxiliary_dev_prog) + elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, + BarrierInstruction, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("register_knl not made for %s type of" + "instruciton" % (str(type(insn)))) + + # }}} + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) + # {{{ pasting the auxiliary functions code to the first device program + + new_dev_prog = codegen_result.device_programs[0] + for auxiliary_dev_prog in auxiliary_dev_progs: + new_dev_prog = new_dev_prog.copy( + ast=Collection([auxiliary_dev_prog, new_dev_prog.ast])) + new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:] + codegen_result = codegen_result.copy(device_programs=new_device_programs) + + # }}} + device_code_str = codegen_result.device_code() from loopy.check import check_implemented_domains diff --git a/loopy/codegen/auxiliary_kernels.py b/loopy/codegen/auxiliary_kernels.py new file mode 100644 index 000000000..799ab59bf --- /dev/null +++ b/loopy/codegen/auxiliary_kernels.py @@ -0,0 +1,188 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six +import islpy as isl + +from loopy.codegen import ( + ImplementedDataInfo, + CodeGenerationState) +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import ( + Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, + CInstruction, _DataObliviousInstruction) +from cgen import Collection + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. currentmodule:: loopy + +.. 
autofunction:: generate_auxiliary_kernel_device_code + +""" + + +# {{{ code generation for the auxiliary kernel + +def generate_auxiliary_kernel_device_code(kernel, target): + """ + Generates device programs for the given auxiliary kernel, with the target + specified by the parent kernel + :returns: a :class:`CodeGenerationResult` + """ + kernel = kernel.copy(target=target) + + from loopy.kernel import kernel_state + if kernel.state == kernel_state.INITIAL: + from loopy.preprocess import preprocess_kernel + kernel = preprocess_kernel(kernel) + + if kernel.schedule is None: + from loopy.schedule import get_one_scheduled_kernel + kernel = get_one_scheduled_kernel(kernel) + + if kernel.state != kernel_state.SCHEDULED: + raise LoopyError( + "cannot generate code for a kernel that has not been " + "scheduled") + + from loopy.type_inference import infer_unknown_types + kernel = infer_unknown_types(kernel, expect_completion=True) + + from loopy.check import pre_codegen_checks + pre_codegen_checks(kernel) + + logger.info("%s: generate Auxillary Kernel code: start" % kernel.name) + + # {{{ examine arg list + + from loopy.kernel.data import ValueArg + from loopy.kernel.array import ArrayBase + + implemented_data_info = [] + + for arg in kernel.args: + is_written = arg.name in kernel.get_written_variables() + if isinstance(arg, ArrayBase): + implemented_data_info.extend( + arg.decl_info( + kernel.target, + is_written=is_written, + index_dtype=kernel.index_dtype)) + + elif isinstance(arg, ValueArg): + implemented_data_info.append(ImplementedDataInfo( + target=kernel.target, + name=arg.name, + dtype=arg.dtype, + arg_class=ValueArg, + is_written=is_written)) + + else: + raise ValueError("argument type not understood: '%s'" % type(arg)) + + allow_complex = False + for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): + if var.dtype.involves_complex(): + allow_complex = True + + # }}} + + seen_dtypes = set() + seen_functions = set() + seen_atomic_dtypes = set() + + initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) + codegen_state = CodeGenerationState( + kernel=kernel, + implemented_data_info=implemented_data_info, + implemented_domain=initial_implemented_domain, + implemented_predicates=frozenset(), + seen_dtypes=seen_dtypes, + seen_functions=seen_functions, + seen_atomic_dtypes=seen_atomic_dtypes, + var_subst_map={}, + allow_complex=allow_complex, + var_name_generator=kernel.get_var_name_generator(), + is_generating_device_code=False, + gen_program_name=kernel.name, + schedule_index_end=len(kernel.schedule), + is_generating_master_kernel=False) + + from loopy.codegen.result import generate_host_or_device_program + + # {{{ collecting ASTs of auxiliary kernels + + auxiliary_dev_progs = [] + + from loopy.codegen.auxiliary_kernels import generate_auxiliary_kernel_device_code + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + in_knl_callable = kernel.scoped_functions[insn.expression.function.name] + if in_knl_callable.subkernel is not None: + auxiliary_dev_prog = generate_auxiliary_kernel_device_code( + in_knl_callable.subkernel, + kernel.target).device_programs[0].ast + auxiliary_dev_progs.append(auxiliary_dev_prog) + elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, + BarrierInstruction, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("register_knl not made for %s type of" + "instruciton" % (str(type(insn)))) + + # }}} + + codegen_result = generate_host_or_device_program( + 
codegen_state, + schedule_index=0) + + # {{{ pasting the auxiliary functions code to the first device program + + new_dev_prog = codegen_result.device_programs[0] + for auxiliary_dev_prog in auxiliary_dev_progs: + new_dev_prog = new_dev_prog.copy( + ast=Collection([auxiliary_dev_prog, new_dev_prog.ast])) + new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:] + codegen_result = codegen_result.copy(device_programs=new_device_programs) + + # }}} + + # For faster unpickling in the common case when implemented_domains isn't needed. + from loopy.tools import LazilyUnpicklingDict + codegen_result = codegen_result.copy( + implemented_domains=LazilyUnpicklingDict( + codegen_result.implemented_domains)) + + logger.info("%s: generate code: done" % kernel.name) + + return codegen_result + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index bb88cc091..ee44d5ea4 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -61,7 +61,7 @@ class ArrayArgDescriptor(ArgDescriptor): # }}} - super(ArgDescriptor, self).__init__(shape=None, + super(ArgDescriptor, self).__init__(shape=shape, mem_scope=mem_scope, dim_tags=dim_tags) @@ -412,6 +412,7 @@ class InKernelCallable(ImmutableRecord): new_args[id] = new_args[id].copy(shape=descr.shape, dim_tags=descr.dim_tags) + descriptor_specialized_knl = self.subkernel.copy(args=new_args) return self.copy(subkernel=descriptor_specialized_knl, diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 068953a52..eedfca6f9 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2202,9 +2202,8 @@ class ArgDescriptionInferer(CombineMapper): combined_arg_id_to_dtype)) # collecting the descriptors for args, kwargs, assignees - a = frozenset(((expr, new_scoped_function), )) - b = self.combine((self.rec(child) for child in expr.parameters)) - return (a | b) + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) def map_call_with_kwargs(self, expr, **kwargs): from loopy.kernel.function_intergace import ValueArgDescriptor @@ -2267,8 +2266,9 @@ def infer_arg_descr(kernel): pymbolic_calls_to_functions.update( arg_description_modifier(insn.expression, assignees=insn.assignees)) - if isinstance(insn, (MultiAssignmentBase, CInstruction)): - pymbolic_calls_to_functions.update(arg_description_modifier(insn.expression)) + elif isinstance(insn, (MultiAssignmentBase, CInstruction)): + pymbolic_calls_to_functions.update(arg_description_modifier( + insn.expression)) elif isinstance(insn, _DataObliviousInstruction): pass else: @@ -2386,20 +2386,10 @@ def preprocess_kernel(kernel, device=None): # have been established kernel = check_atomic_loads(kernel) + # inferring the shape and dim_tags of the arguments involved in a function + # call. 
kernel = infer_arg_descr(kernel) - print(75*'-') - print("This is after Type Inference") - for insn in kernel.instructions: - print(insn) - print(75*'-') - print('Linked Functions:') - for name, func in kernel.scoped_functions.items(): - print(name, "=>", (func.name, func.arg_id_to_dtype, - func.arg_id_to_descr, func.subkernel.args)) - print() - print(75*'-') - kernel = kernel.target.preprocess(kernel) logger.info("%s: preprocess done" % kernel.name) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 134603872..b1b1446db 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -459,34 +459,8 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): expr = subst_expander(writer_insn.expression) debug(" via expr %s", expr) - if isinstance(writer_insn, lp.Assignment): - result = type_inf_mapper(expr, return_dtype_set=True) - elif isinstance(writer_insn, lp.CallInstruction): - result = type_inf_mapper(expr, return_dtype_set=True) - """ - # Maybe we need to alter this so that the type_inf_mapper returns a - # :class:`dict`? - # ask about this to Andreas Sir. - return_dtype_set = type_inf_mapper(expr, return_tuple=False, - return_dtype_set=True) - - print(return_dtype_set) - print(writer_insn.assignee_var_names()) - result = [] - for return_dtype_set in return_dtype_set: - result_i = None - found = False - for assignee, comp_dtype_set in zip( - writer_insn.assignee_var_names(), return_dtype_set): - if assignee == var_name: - found = True - result_i = comp_dtype_set - break - assert found - if result_i is not None: - result.append(result_i) - """ + result = type_inf_mapper(expr, return_dtype_set=True) debug(" result: %s", result) -- GitLab From f5cb585a4ffa355b7dd2249a2323c68564236476 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Mar 2018 10:57:16 -0500 Subject: [PATCH 022/916] Able to handle scalar calls. Still needs a mechanism to get target_specific_name. --- loopy/kernel/function_interface.py | 51 +++++++++++++++++----- loopy/target/c/__init__.py | 2 +- loopy/target/c/codegen/expression.py | 63 ++++------------------------ 3 files changed, 49 insertions(+), 67 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index ee44d5ea4..17bd60ff2 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -4,6 +4,8 @@ import re import six import numpy as np +from six.moves import zip + from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.types import NumpyType @@ -274,13 +276,16 @@ class InKernelCallable(ImmutableRecord): """ if self.arg_id_to_dtype: - # trying to specialize an already specialized function. + # specializing an already specialized function. - if self.arg_id_to_dtype == arg_id_to_dtype: - return self.copy() - else: - raise LoopyError("Overwriting a specialized function--maybe" - " start with new instance of InKernelCallable?") + for id, dtype in arg_id_to_dtype.items(): + # only checking for the ones which have been provided + if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + raise LoopyError("Overwriting a specialized" + " function is illegal--maybe start with new instance of" + " InKernelCallable?") + # TODO: Check if the arguments match. 
If yes then just + # return self.copy() # {{{ attempt to specialize using scalar functions @@ -290,6 +295,7 @@ class InKernelCallable(ImmutableRecord): from loopy.target.pyopencl import PyOpenCLTarget from loopy.target.cuda import CudaTarget + # FIXME: Push this into the target if isinstance(target, CTarget): new_arg_id_to_dtype = c_with_types(self.name, arg_id_to_dtype) @@ -393,11 +399,11 @@ class InKernelCallable(ImmutableRecord): return self.copy(arg_id_to_descr=arg_id_to_descr) else: - # Now this ia a kernel call + # this ia a kernel call # tuning the subkernel so that we have the the matching shapes and # dim_tags. # FIXME: Although We receive input if the argument is - # local/global. We do not use it to set the subkernel function + # `local/global`. We do not use it to set the subkernel function # signature. Need to do it, so that we can handle teporary inputs # in the array call. @@ -412,7 +418,6 @@ class InKernelCallable(ImmutableRecord): new_args[id] = new_args[id].copy(shape=descr.shape, dim_tags=descr.dim_tags) - descriptor_specialized_knl = self.subkernel.copy(args=new_args) return self.copy(subkernel=descriptor_specialized_knl, @@ -450,13 +455,37 @@ class InKernelCallable(ImmutableRecord): def get_target_specific_name(self, target): if self.subkernel is None: - raise NotImplementedError() + return self.name else: return self.subkernel.name raise NotImplementedError() - def emit_call(self, insn, target, expression_to_code_mapper): + def emit_call(self, expression_to_code_mapper, expression, target): + if self.subkernel: + raise NotImplementedError() + + # must have single assignee + assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 + arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in + range(len(self.arg_id_to_dtype)-1)) + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + expression.parameters) + + from loopy.expression import dtype_to_type_context + # processing the parameters with the required dtypes + processed_parameters = tuple( + expression_to_code_mapper.rec(par, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expression.parameters, par_dtypes, arg_dtypes)) + + from pymbolic import var + return var(self.get_target_specific_name(target))(*processed_parameters) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): from loopy.kernel.instruction import CallInstruction from pymbolic.primitives import CallWithKwargs diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 28c346dcc..b79e6ca48 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -856,7 +856,7 @@ class CASTBuilder(ASTBuilderBase): func_id = insn.expression.function.name in_knl_callable = codegen_state.kernel.scoped_functions[func_id] - in_knl_callable_as_call = in_knl_callable.emit_call( + in_knl_callable_as_call = in_knl_callable.emit_call_insn( insn=insn, target=self.target, expression_to_code_mapper=ecm) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 17e485555..7d05f228f 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -23,7 +23,7 @@ THE SOFTWARE. 
""" -from six.moves import range, zip +from six.moves import range import numpy as np @@ -41,7 +41,7 @@ from pymbolic import var from loopy.expression import dtype_to_type_context from loopy.type_inference import TypeInferenceMapper -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType @@ -386,12 +386,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function + from pymbolic.primitives import Subscript # {{{ implement indexof, indexof_vec + identifier = expr.function if identifier.name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: raise LoopyError("%s takes exactly one argument" % identifier.name) @@ -433,56 +432,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. - processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + return self.kernel.scoped_functions[expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target) # {{{ deal with complex-valued variables -- GitLab From 6c901bf3bb58d7c4c494cd2a4883fbfa2f3ff2e5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Mar 2018 17:05:22 -0500 Subject: [PATCH 023/916] Scalar calls done --- loopy/kernel/function_interface.py | 3 ++- loopy/type_inference.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 17bd60ff2..f2c24b293 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -275,7 +275,8 @@ class InKernelCallable(ImmutableRecord): its keyword identifier. """ - if self.arg_id_to_dtype: + if self.arg_id_to_dtype is not None: + # specializing an already specialized function. 
for id, dtype in arg_id_to_dtype.items(): diff --git a/loopy/type_inference.py b/loopy/type_inference.py index b1b1446db..ee4bf38be 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -120,6 +120,11 @@ class TypeInferenceMapper(CombineMapper): 0 <= len(dtype_set) <= 1 for dtype_set in dtype_sets) + # Can't infer types if one of the dtypes is unknown + for dtype_set in dtype_sets: + if dtype_set == []: + return [] + from pytools import is_single_valued dtypes = [dtype @@ -667,8 +672,7 @@ def infer_unknown_types(kernel, expect_completion=False): #------------------------------------------------------------------------ # KK: - # FIXME: more type scoped function type specialization but needed for the - # specialization of the in kernel callables + # FIXME: # for example if an instruction is : # `[i]:z[i] = a_kernel_function([j]:x[j], [k]: y[k])` # and if the user already provided the types of the args: x, y, z. -- GitLab From 438fd1da29beb6f3ad900c14c39b00dcef609a33 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Mar 2018 05:06:14 -0500 Subject: [PATCH 024/916] Fixed with_types backed to the target --- loopy/kernel/function_interface.py | 182 ++++------------------------- loopy/library/random123.py | 42 +++++++ loopy/target/__init__.py | 9 ++ loopy/target/c/__init__.py | 91 +++++++++++++++ loopy/target/opencl.py | 119 ++++++++++++++++++- loopy/target/pyopencl.py | 49 ++++++++ loopy/type_inference.py | 14 +-- 7 files changed, 335 insertions(+), 171 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index f2c24b293..13955f928 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -2,13 +2,11 @@ from __future__ import division, absolute_import import re import six -import numpy as np from six.moves import zip from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.types import NumpyType from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, _DataObliviousInstruction) @@ -85,115 +83,6 @@ class ArrayArgDescriptor(ArgDescriptor): # }}} -# {{{ c with types - -def c_with_types(name, arg_id_to_dtype): - - # Specializing the type of the math function once they agree upon the - # function signature. - - if name in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "tan"]: - for id, dtype in arg_id_to_dtype.items(): - if not -1 <= id <= 0: - raise LoopyError("%s can take only one argument." % name) - - dtype = arg_id_to_dtype[0].numpy_dtype - - if dtype.kind == 'f': - # generic type resolve we can go ahead and specialize - pass - elif dtype.kind in ['u', 'i']: - # int and unsigned are casted into float32 - dtype = np.float32 - else: - raise LoopyError("%s function cannot take arguments of the type %s" - % (name, dtype)) - - # Done specializing. Returning the intended arg_id_to_dtype - dtype = NumpyType(dtype) - return {-1: dtype, 0: dtype} - - # binary functions - elif name in ["max", "min"]: - for id, dtype in arg_id_to_dtype.items(): - if not -1 <= id <= 1: - raise LoopyError("%s can take only two arguments." 
% name) - - # finding the common type for all the dtypes involved - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_id_to_dtype]) - - if dtype.kind == 'f': - # generic type resolve we can go ahead and specialize - pass - elif dtype.kind in ['u', 'i']: - # int and unsigned are implicitly casted into float32 - dtype = np.float32 - else: - raise LoopyError("%s function cannot take arguments of the type %s" - % (name, dtype)) - - # Specialized into one of the known types - return {-1: NumpyType(dtype), 0: arg_id_to_dtype[0], 1: arg_id_to_dtype[1]} - - else: - # could not specialize the function within the C namespace - # this would help when checking for OpenCL/CUDA function which are not - # present in C - return None - -# }}} - - -# {{{ opencl with_types - -def opencl_with_types(name, arg_id_to_dtype): - new_arg_id_to_dtype = c_with_types(name, arg_id_to_dtype) - if new_arg_id_to_dtype is None: - # could not locate the function within C's namespace. Searching in - # OpenCL specific namespace - - # FIXME: Need to add these functions over here - new_arg_id_to_dtype = None - - return new_arg_id_to_dtype - -# }}} - - -# {{{ pyopencl with_types - -def pyopencl_with_types(name, arg_id_to_dtype): - new_arg_id_to_dtype = opencl_with_types(name, arg_id_to_dtype) - if new_arg_id_to_dtype is None: - # could not locate the function within C's namespace. Searching in - # PyOpenCL specific namespace - - # FIXME: Need to add these functions over here - new_arg_id_to_dtype = None - - return new_arg_id_to_dtype - -# }}} - - -# {{{ cuda with_types - -def cuda_with_types(name, arg_id_to_dtype): - new_arg_id_to_dtype = c_with_types(name, arg_id_to_dtype) - if new_arg_id_to_dtype is None: - # could not locate the function within C's namespace. Searching in - # CUDA specific namespace - - # FIXME: Need to add these extra functions over here - new_arg_id_to_dtype = None - - return new_arg_id_to_dtype - -# }}} - - # {{{ kw_to_pos def get_kw_pos_association(kernel): @@ -243,7 +132,7 @@ class InKernelCallable(ImmutableRecord): """ def __init__(self, name, subkernel=None, arg_id_to_dtype=None, - arg_id_to_descr=None): + arg_id_to_descr=None, name_in_target=None): # {{{ sanity checks @@ -252,10 +141,14 @@ class InKernelCallable(ImmutableRecord): # }}} + if name_in_target is not None and subkernel is not None: + subkernel = subkernel.copy(name=name_in_target) + super(InKernelCallable, self).__init__(name=name, subkernel=subkernel, arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr) + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) def with_types(self, arg_id_to_dtype, target): """ @@ -285,37 +178,15 @@ class InKernelCallable(ImmutableRecord): raise LoopyError("Overwriting a specialized" " function is illegal--maybe start with new instance of" " InKernelCallable?") - # TODO: Check if the arguments match. 
If yes then just - # return self.copy() # {{{ attempt to specialize using scalar functions if self.name in target.get_device_ast_builder().function_identifiers(): - from loopy.target.c import CTarget - from loopy.target.opencl import OpenCLTarget - from loopy.target.pyopencl import PyOpenCLTarget - from loopy.target.cuda import CudaTarget - - # FIXME: Push this into the target - if isinstance(target, CTarget): - new_arg_id_to_dtype = c_with_types(self.name, arg_id_to_dtype) - - elif isinstance(target, OpenCLTarget): - new_arg_id_to_dtype = opencl_with_types(self.name, arg_id_to_dtype) - - elif isinstance(target, PyOpenCLTarget): - new_arg_id_to_dtype = pyopencl_with_types(self.name, arg_id_to_dtype) - - elif isinstance(target, CudaTarget): - new_arg_id_to_dtype = cuda_with_types(self.name, arg_id_to_dtype) - - else: - raise NotImplementedError("InKernelCallable.with_types() for" - " %s target" % target) - - if new_arg_id_to_dtype is not None: - # got our speciliazed function - return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + new_in_knl_callable = target.get_device_ast_builder().with_types( + self, arg_id_to_dtype) + if new_in_knl_callable is None: + new_in_knl_callable = self.copy() + return new_in_knl_callable # }}} @@ -444,7 +315,8 @@ class InKernelCallable(ImmutableRecord): def is_ready_for_code_gen(self): return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None) + self.arg_id_to_descr is not None and + self.name_in_target is not None) # {{{ code generation @@ -453,16 +325,10 @@ class InKernelCallable(ImmutableRecord): """ raise NotImplementedError() - def get_target_specific_name(self, target): - - if self.subkernel is None: - return self.name - else: - return self.subkernel.name + def emit_call(self, expression_to_code_mapper, expression, target): - raise NotImplementedError() + assert self.is_ready_for_code_gen() - def emit_call(self, expression_to_code_mapper, expression, target): if self.subkernel: raise NotImplementedError() @@ -484,10 +350,12 @@ class InKernelCallable(ImmutableRecord): expression.parameters, par_dtypes, arg_dtypes)) from pymbolic import var - return var(self.get_target_specific_name(target))(*processed_parameters) + return var(self.name_in_target)(*processed_parameters) def emit_call_insn(self, insn, target, expression_to_code_mapper): + assert self.is_ready_for_code_gen() + from loopy.kernel.instruction import CallInstruction from pymbolic.primitives import CallWithKwargs @@ -507,7 +375,7 @@ class InKernelCallable(ImmutableRecord): parameters.append(kw_parameters[pos_to_kw[i]]) par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) - # TODO: currently no suppport for insn keywords. + # TODO: currently no suppport for assignee keywords. parameters = parameters + list(assignees) par_dtypes = par_dtypes + [self.arg_id_to_dtype[-i-1] for i, _ in enumerate(assignees)] @@ -523,7 +391,7 @@ class InKernelCallable(ImmutableRecord): parameters, par_dtypes)] from pymbolic import var - return var(self.get_target_specific_name(target))(*c_parameters) + return var(self.name_in_target)(*c_parameters) # }}} @@ -718,12 +586,10 @@ def register_pymbolic_calls_to_knl_callables(kernel, # book-keeping of the functions and names mappings for later use if in_knl_callable.subkernel is not None: - # changing the name of the subkenrel so that it emits a function - # with the name same as the name being used in the - # scoped_function. 
- new_subkernel = in_knl_callable.subkernel.copy( - name=unique_name) - in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) + # for array calls the name in the target is the name of the + # scoped funciton + in_knl_callable = in_knl_callable.copy( + name_in_target=unique_name) scoped_names_to_functions[unique_name] = in_knl_callable scoped_functions_to_names[in_knl_callable] = unique_name diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 871dde0a6..b28d11ba6 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -223,4 +223,46 @@ def random123_function_mangler(kernel, name, arg_dtypes): else: return None + +def random123_with_types(in_knl_callable, arg_id_to_dtype, target): + name = in_knl_callable.name + + if name not in FUNC_NAMES_TO_RNG: + return None + + rng_variant = FUNC_NAMES_TO_RNG[name] + 1/0 + + from loopy.types import NumpyType + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + from loopy.kernel.data import CallMangleInfo + fn = rng_variant.full_name + if name == fn: + return CallMangleInfo( + target_name=fn+"_gen", + result_dtypes=(ctr_dtype, ctr_dtype), + arg_dtypes=(ctr_dtype, key_dtype)) + + elif name == fn + "_f32": + return CallMangleInfo( + target_name=name, + result_dtypes=( + target.vector_dtype(NumpyType(np.float32), rng_variant.width), + ctr_dtype), + arg_dtypes=(ctr_dtype, key_dtype)) + + elif name == fn + "_f64": + return CallMangleInfo( + target_name=name, + result_dtypes=( + target.vector_dtype(NumpyType(np.float64), rng_variant.width), + ctr_dtype), + arg_dtypes=(ctr_dtype, key_dtype)) + + else: + return None + # vim: foldmethod=marker diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index fe6daf12c..336985ede 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -162,6 +162,15 @@ class ASTBuilderBase(object): def preamble_generators(self): return [] + def with_types(self, in_knl_callable, arg_id_to_dtype): + """ + Checks the in-kernel callable with the target specific functions and then + returns either `None` when no match is found or returns a new type + specialized instance of :class:`InKernelCallable`. + + """ + return None + # }}} # {{{ code generation guts diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index b79e6ca48..5ebcd67e1 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -426,6 +426,90 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): return None + +def c_with_types(in_knl_callable, arg_id_to_dtype, modify_name=False): + # Function mangler for math functions defined in C standard + # Convert abs, min, max to fabs, fmin, fmax. + # If modify_name is set to True, function names are modified according to + # floating point types of the arguments (e.g. cos(double), cosf(float)) + # This should be set to True for C and Cuda, False for OpenCL + name = in_knl_callable.name + + if name in ["abs", "min", "max"]: + name = "f" + name + + # unitary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "tan"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." 
% name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return None + + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype + + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == 'c': + raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + + if modify_name: + if dtype == np.float64: + pass # fabs + elif dtype == np.float32: + name = name + "f" # fabsf + elif dtype == np.float128: + name = name + "l" # fabsl + else: + raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + + return in_knl_callable.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + + # binary functions + if name in ["fmax", "fmin"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return None + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + elif dtype.kind == "f": + if modify_name: + if dtype == np.float64: + pass # fmin + elif dtype == np.float32: + name = name + "f" # fminf + elif dtype == np.float128: + name = name + "l" # fminl + else: + raise LoopyTypeError("%s does not support type %s" + % (name, dtype)) + dtype = NumpyType(dtype) + return in_knl_callable.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) + + return None + # }}} @@ -455,6 +539,13 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) + def with_types(self, in_knl_callable, arg_id_to_dtype): + new_callable = c_with_types(in_knl_callable, arg_id_to_dtype) + if new_callable is not None: + return new_callable + return super(CASTBuilder, self).with_types(in_knl_callable, + arg_id_to_dtype) + # }}} # {{{ code generation diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 94870907b..7aec34a22 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -31,10 +31,12 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_identifiers +from loopy.target.c import (DTypeRegistryWrapper, c_math_identifiers, + c_math_mangler, c_with_types) from loopy.kernel.data import temp_var_scope, CallMangleInfo from pymbolic import var +from functools import partial # {{{ dtype registry wrappers @@ -156,8 +158,8 @@ def opencl_function_identifiers(): # }}} -# {{{ function mangler +# {{{ function mangler _CL_SIMPLE_MULTI_ARG_FUNCTIONS = { "clamp": 3, @@ -239,6 +241,95 @@ def opencl_function_mangler(kernel, name, arg_dtypes): return None + +def opencl_with_types(in_knl_callable, arg_id_to_dtype): + + name = in_knl_callable.name + + if name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return None + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.values() if id >= 0]) + + if dtype.kind == "i": + dtype = NumpyType(dtype) + return in_knl_callable.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) + + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return None + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] + return in_knl_callable.copy(name_in_target=name, + arg_id_to_dtype={-1: scalar_dtype, 0: dtype, 1: dtype}) + + if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is not None: + # the types provided aren't mature enough to specialize the + # callable + return None + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.values() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return in_knl_callable.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype) + + if name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + + for id in arg_id_to_dtype: + if not -1 <= id < count: + raise LoopyError("%s can take only %d arguments." 
% (name, + num_args)) + + for i in range(count): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is not None: + # the types provided aren't mature enough to specialize the + # callable + return None + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in + range(count)) + updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( + NumpyType(dtype), count) + + return in_knl_callable.copy(name_in_target="(%s%d) " % (base_tp_name, count), + arg_id_to_dtype=updated_arg_id_to_dtype) + + return None + + # }}} @@ -382,6 +473,14 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library + def function_manglers(self): + return ( + [ + opencl_function_mangler, + partial(c_math_mangler, modify_name=False) + ] + + super(OpenCLCASTBuilder, self).function_manglers()) + def function_identifiers(self): return (opencl_function_identifiers() | c_math_identifiers() | super(OpenCLCASTBuilder, self).function_identifiers()) @@ -401,6 +500,17 @@ class OpenCLCASTBuilder(CASTBuilder): reduction_preamble_generator, ]) + def with_types(self, in_knl_callable, arg_id_to_dtype): + new_callable = opencl_with_types(in_knl_callable, arg_id_to_dtype) + if new_callable is not None: + return new_callable + + new_callable = c_with_types(in_knl_callable, arg_id_to_dtype) + if new_callable is not None: + return new_callable + return super(OpenCLCASTBuilder, self).with_types(in_knl_callable, + arg_id_to_dtype) + # }}} # {{{ top-level codegen @@ -412,6 +522,11 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) + if not codegen_state.is_generating_master_kernel: + # auxiliary kernels need not mention opencl speicific qualifiers + # for a functions signature + return fdecl + fdecl = fdecl.subdecl from cgen.opencl import CLKernel, CLRequiredWorkGroupSize diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 1451cf9e7..4dace7ec2 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -236,6 +236,43 @@ def pyopencl_function_mangler(target, name, arg_dtypes): return None +def pyopencl_with_types(in_knl_callable, arg_id_to_dtype): + + name = in_knl_callable.name + + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." 
% name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return None + + dtype = arg_id_to_dtype[0] + + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise RuntimeError("unexpected complex type '%s'" % dtype) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj"]: + return in_knl_callable.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: dtype}) + + if name in ["real", "imag", "abs"]: + return in_knl_callable.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: dtype.numpy_dtype.type(0).real}) + + return None + + # {{{ preamble generator def pyopencl_preamble_generator(preamble_info): @@ -764,6 +801,18 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): random123_preamble_generator, ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) + def with_types(self, in_knl_callable, arg_id_to_dtype): + from loopy.library.random123 import random123_with_types + new_callable = super(PyOpenCLCASTBuilder, self).with_types(in_knl_callable, + arg_id_to_dtype) + if new_callable is not None: + return new_callable + + new_callable = pyopencl_with_types(in_knl_callable, arg_id_to_dtype) + if new_callable is not None: + return new_callable + return random123_with_types(in_knl_callable, arg_id_to_dtype) + # }}} # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index ee4bf38be..f974e3fab 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -120,11 +120,6 @@ class TypeInferenceMapper(CombineMapper): 0 <= len(dtype_set) <= 1 for dtype_set in dtype_sets) - # Can't infer types if one of the dtypes is unknown - for dtype_set in dtype_sets: - if dtype_set == []: - return [] - from pytools import is_single_valued dtypes = [dtype @@ -291,15 +286,12 @@ class TypeInferenceMapper(CombineMapper): self.specialized_functions[expr] = in_knl_callable new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype - result_dtypes = [] # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + return [new_arg_id_to_dtype[-1]] - for i in range(len(new_arg_id_to_dtype)): - if -i-1 in new_arg_id_to_dtype: - result_dtypes.append(new_arg_id_to_dtype[-i-1]) - else: - return result_dtypes + return [] """ # Letting this stay over here, as it maybe needed later for maintaining -- GitLab From 1229c5d640c0fe329ea188dcc28c1b96d29de760 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Mar 2018 13:14:45 -0500 Subject: [PATCH 025/916] Attempt to bifurcate the two callables --- loopy/kernel/function_interface.py | 400 +++++++++++++++-------------- 1 file changed, 201 insertions(+), 199 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 13955f928..e0c086eb8 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -107,6 +107,10 @@ def get_kw_pos_association(kernel): # }}} + +# {{{ template class + + class InKernelCallable(ImmutableRecord): """ @@ -137,13 +141,10 @@ class InKernelCallable(ImmutableRecord): # {{{ sanity checks if not isinstance(name, str): - raise LoopyError("name of a InKernelCallable should be a string") + raise LoopyError("name of a CallableOnScalar should be a string") # }}} - if name_in_target is not None and subkernel is not None: - 
subkernel = subkernel.copy(name=name_in_target) - super(InKernelCallable, self).__init__(name=name, subkernel=subkernel, arg_id_to_dtype=arg_id_to_dtype, @@ -168,6 +169,93 @@ class InKernelCallable(ImmutableRecord): its keyword identifier. """ + raise NotImplementedError() + + def with_descrs(self, arg_id_to_descr): + """ + :arg arg_id_to_descr: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.ArrayArgDescriptor` instances. + Unspecified/unknown types are not represented in *arg_id_to_descr*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types, + and *arg_id_to_descr* is a mapping of the same form as the + argument above, however it may have more information present. + Any argument information exists both by its positional and + its keyword identifier. + """ + + raise NotImplementedError() + + def with_iname_tag_usage(self, unusable, concurrent_shape): + """ + :arg unusable: a set of iname tags that may not be used in the callee. + :arg concurrent_shape: an list of tuples ``(iname_tag, bound)`` for + concurrent inames that are used in the calller but also available + for mapping by the callee. *bound* is given as a + :class:`islpy.PwAff`. + + :returns: a list of the same type as *concurrent*, potentially modified + by increasing bounds or adding further iname tag entries. + + All iname tags not explicitly listed in *concurrent* or *unusable* are + available for mapping by the callee. + """ + + raise NotImplementedError() + + def is_ready_for_code_gen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None and + self.name_in_target is not None) + + # {{{ code generation + + def generate_preambles(self, target): + """ This would generate the target specific preamble. + """ + raise NotImplementedError() + + def emit_call(self, expression_to_code_mapper, expression, target): + + raise NotImplementedError() + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + + raise NotImplementedError() + + # }}} + + def __eq__(self, other): + return (self.name == other.name + and self.arg_id_to_descr == other.arg_id_to_descr + and self.arg_id_to_dtype == other.arg_id_to_dtype + and self.subkernel == other.subkernel) + + def __hash__(self): + return hash((self.name, self.subkernel, self.name_in_target)) + + +# }}} + + +class CallableOnScalar(InKernelCallable): + + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super(CallableOnScalar, self).__init__(name=name, + subkernel=None, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) + + def with_types(self, arg_id_to_dtype, target): if self.arg_id_to_dtype is not None: # specializing an already specialized function. 
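A minimal sketch of how this specialization is meant to be driven (assuming
"sin" is among the target's function identifiers; the CTarget instance is a
stand-in for whatever target the kernel actually uses). Positional arguments
are keyed by non-negative integers and return values by negative integers
starting at -1:

    import numpy as np
    from loopy.types import NumpyType
    from loopy.target.c import CTarget
    from loopy.kernel.function_interface import CallableOnScalar

    sin_fn = CallableOnScalar("sin")   # unspecialized: arg_id_to_dtype is None
    sin_fn = sin_fn.with_types({0: NumpyType(np.dtype(np.float64))}, CTarget())

    # The target's with_types() fills in the return type under key -1 and
    # records the name to emit, roughly:
    #     sin_fn.arg_id_to_dtype  -> {0: float64, -1: float64}
    #     sin_fn.name_in_target   -> "sin"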
@@ -177,9 +265,9 @@ class InKernelCallable(ImmutableRecord): if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: raise LoopyError("Overwriting a specialized" " function is illegal--maybe start with new instance of" - " InKernelCallable?") + " CallableScalar?") - # {{{ attempt to specialize using scalar functions + # {{{ attempt to specialize using scalar functions present in target if self.name in target.get_device_ast_builder().function_identifiers(): new_in_knl_callable = target.get_device_ast_builder().with_types( @@ -190,13 +278,93 @@ class InKernelCallable(ImmutableRecord): # }}} - if self.subkernel is None: - # did not find a scalar function and function prototype does not - # even have subkernel registered => no match found - raise LoopyError("Function %s not present within" - " the %s namespace" % (self.name, target)) + # did not find a scalar function and function prototype does not + # even have subkernel registered => no match found + raise LoopyError("Function %s not present within" + " the %s namespace" % (self.name, target)) + + def with_descrs(self, arg_id_to_descr): + + # This is a scalar call + # need to assert that the name is in funtion indentifiers + arg_id_to_descr[-1] = ValueArgDescriptor() + return self.copy(arg_id_to_descr=arg_id_to_descr) + + def with_iname_tag_usage(self, unusable, concurrent_shape): + + raise NotImplementedError() + + def is_ready_for_code_gen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None and + self.name_in_target is not None) + + # {{{ code generation + + def generate_preambles(self, target): + """ This would generate the target specific preamble. + """ + raise NotImplementedError() + + def emit_call(self, expression_to_code_mapper, expression, target): + + assert self.is_ready_for_code_gen() + + # must have single assignee + assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 + arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in + range(len(self.arg_id_to_dtype)-1)) + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + expression.parameters) + + from loopy.expression import dtype_to_type_context + # processing the parameters with the required dtypes + processed_parameters = tuple( + expression_to_code_mapper.rec(par, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expression.parameters, par_dtypes, arg_dtypes)) + + from pymbolic import var + return var(self.name_in_target)(*processed_parameters) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + # TODO: Need to add support for functions like sincos(x) + # which would give multiple outputs but takes in scalar arguments - # {{{ attempt to specialization with array functions + raise NotImplementedError("emit_call_insn only applies for" + " CallableKernels") + + # }}} + + def __eq__(self, other): + return (self.name == other.name + and self.arg_id_to_descr == other.arg_id_to_descr + and self.arg_id_to_dtype == other.arg_id_to_dtype + and self.subkernel == other.subkernel) + + def __hash__(self): + return hash((self.name, self.subkernel, self.name_in_target)) + + +class CallableKernel(InKernelCallable): + + def __init__(self, name, subkernel, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + if name_in_target is not None and subkernel is not None: + subkernel = subkernel.copy(name=name_in_target) + + super(CallableKernel, self).__init__(name=name, + subkernel=subkernel, + arg_id_to_dtype=arg_id_to_dtype, + 
arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) + + def with_types(self, arg_id_to_dtype, target): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) @@ -239,76 +407,37 @@ class InKernelCallable(ImmutableRecord): new_arg_id_to_dtype[read_count] = arg.dtype read_count += 1 - # }}} - # Returning the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype return self.copy(subkernel=specialized_kernel, arg_id_to_dtype=new_arg_id_to_dtype) def with_descrs(self, arg_id_to_descr): - """ - :arg arg_id_to_descr: a mapping from argument identifiers - (integers for positional arguments, names for keyword - arguments) to :class:`loopy.ArrayArgDescriptor` instances. - Unspecified/unknown types are not represented in *arg_id_to_descr*. - Return values are denoted by negative integers, with the - first returned value identified as *-1*. + # tuning the subkernel so that we have the the matching shapes and + # dim_tags. + # FIXME: Although We receive input if the argument is + # `local/global`. We do not use it to set the subkernel function + # signature. Need to do it, so that we can handle teporary inputs + # in the array call. - :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a - new :class:`InKernelCallable` specialized for the given types, - and *arg_id_to_descr* is a mapping of the same form as the - argument above, however it may have more information present. - Any argument information exists both by its positional and - its keyword identifier. - """ + # Collecting the parameters + new_args = self.subkernel.args.copy() + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - if self.subkernel is None: - # This is a scalar call - # need to assert that the name is in funtion indentifiers - arg_id_to_descr[-1] = ValueArgDescriptor() - return self.copy(arg_id_to_descr=arg_id_to_descr) + for id, descr in arg_id_to_descr.items(): + if isinstance(id, str): + id = kw_to_pos[id] + assert isinstance(id, int) + new_args[id] = new_args[id].copy(shape=descr.shape, + dim_tags=descr.dim_tags) - else: - # this ia a kernel call - # tuning the subkernel so that we have the the matching shapes and - # dim_tags. - # FIXME: Although We receive input if the argument is - # `local/global`. We do not use it to set the subkernel function - # signature. Need to do it, so that we can handle teporary inputs - # in the array call. - - # Collecting the parameters - new_args = self.subkernel.args.copy() - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - for id, descr in arg_id_to_descr.items(): - if isinstance(id, str): - id = kw_to_pos[id] - assert isinstance(id, int) - new_args[id] = new_args[id].copy(shape=descr.shape, - dim_tags=descr.dim_tags) - - descriptor_specialized_knl = self.subkernel.copy(args=new_args) - - return self.copy(subkernel=descriptor_specialized_knl, - arg_id_to_descr=arg_id_to_descr) + descriptor_specialized_knl = self.subkernel.copy(args=new_args) - def with_iname_tag_usage(self, unusable, concurrent_shape): - """ - :arg unusable: a set of iname tags that may not be used in the callee. - :arg concurrent_shape: an list of tuples ``(iname_tag, bound)`` for - concurrent inames that are used in the calller but also available - for mapping by the callee. *bound* is given as a - :class:`islpy.PwAff`. 
+ return self.copy(subkernel=descriptor_specialized_knl, + arg_id_to_descr=arg_id_to_descr) - :returns: a list of the same type as *concurrent*, potentially modified - by increasing bounds or adding further iname tag entries. - - All iname tags not explicitly listed in *concurrent* or *unusable* are - available for mapping by the callee. - """ + def with_iname_tag_usage(self, unusable, concurrent_shape): raise NotImplementedError() @@ -327,30 +456,7 @@ class InKernelCallable(ImmutableRecord): def emit_call(self, expression_to_code_mapper, expression, target): - assert self.is_ready_for_code_gen() - - if self.subkernel: - raise NotImplementedError() - - # must have single assignee - assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 - arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in - range(len(self.arg_id_to_dtype)-1)) - - par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in - expression.parameters) - - from loopy.expression import dtype_to_type_context - # processing the parameters with the required dtypes - processed_parameters = tuple( - expression_to_code_mapper.rec(par, - dtype_to_type_context(target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expression.parameters, par_dtypes, arg_dtypes)) - - from pymbolic import var - return var(self.name_in_target)(*processed_parameters) + raise NotImplementedError("emit_call only works on scalar operations") def emit_call_insn(self, insn, target, expression_to_code_mapper): @@ -402,111 +508,7 @@ class InKernelCallable(ImmutableRecord): and self.subkernel == other.subkernel) def __hash__(self): - return hash((self.name, self.subkernel)) - -# {{{ callable kernel - - -class CallableKernel(InKernelCallable): - """ - - ..attribute:: name - - This would be the name by which the function would be called in the loopy - kernel. - - .. attribute:: subkernel - - The subkernel associated with the call. 
- - """ - - # {{{ constructor - - def __init__(self, name=None, subkernel=None): - - super(CallableKernel, self).__init__(name=name) - - if not name == subkernel.name: - subkernel = subkernel.copy(name=name) - - self.subkernel = subkernel - - # }}} - - # {{{ copy - - def copy(self, name=None, subkernel=None): - if name is None: - name = self.name - - if subkernel is None: - subkernel = self.subkernel - - return self.__class__(name=name, - subkernel=subkernel) - - # }}} - - # {{{ with_types - - def with_types(self, arg_id_to_dtype): - - # {{{ sanity checks for arg_id_to_dtype - - for id in arg_id_to_dtype: - if not isinstance(id, str): - raise LoopyError("For Callable kernels the input should be all given" - "as KWargs") - - # }}} - - # }}} - - # {{{ with_descriptors - - def with_descriptors(self, arg_id_to_descr): - for id, arg_descr in arg_id_to_descr.items(): - # The dimensions don't match => reject it - if len(arg_descr.dim_tags) != len(self.subkernel.arg_dict[id].shape): - raise LoopyError("The number of dimensions do not match between the" - "caller kernel and callee kernel for the variable name %s in" - "the callee kernel" % id) - - new_args = [] - for arg in self.subkernel.args: - if arg.name in arg_id_to_descr: - new_args.copy(arg.copy(dim_tags=arg_id_to_descr[arg.name])) - pass - else: - new_args.append(arg.copy()) - - specialized_kernel = self.subkernel.copy(args=new_args) - - new_arg_id_to_descr = {} - - for id, arg in specialized_kernel.arg_dict.items(): - new_arg_id_to_descr[id] = ArrayArgDescriptor(arg.dim_tags, "GLOBAL") - - return self.copy(subkernel=specialized_kernel), new_arg_id_to_descr - - # }}} - - # {{{ get_target_specific_name - - def get_target_specific_name(self, target): - return self.subkernel.name - - # }}} - - # {{{ get preamble - - def get_preamble(self, target): - return "" - - # }}} - -# }}} + return hash((self.name, self.subkernel, self.name_in_target)) # {{{ new pymbolic calls to scoped functions -- GitLab From 01410750b1271f6058422ee62428217bd5abaa8f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Mar 2018 13:07:34 -0500 Subject: [PATCH 026/916] Added support for multiple assignment scalars. 
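A sketch of the kind of instruction this commit targets ("sincos" is only a
placeholder for a scalar callable with two results; it is not registered by
this patch):

    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n}",
            "s[i], c[i] = sincos(x[i])")

    # CallableOnScalar.emit_call_insn (below) turns such a CallInstruction
    # into a single call, passing each assignee by reference after the
    # arguments, roughly:  sincos(x[i], &s[i], &c[i])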
--- loopy/kernel/creation.py | 4 +- loopy/kernel/function_interface.py | 85 +++++++++++++++++++----------- loopy/target/c/__init__.py | 4 ++ loopy/transform/register_knl.py | 4 +- 4 files changed, 62 insertions(+), 35 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index c0c8e73be..165607a05 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1914,8 +1914,8 @@ def scope_functions(kernel): type(insn)) # Need to combine the scoped functions into a dict - from loopy.kernel.function_interface import InKernelCallable - scoped_function_dict = dict((func, InKernelCallable(func)) for func in + from loopy.kernel.function_interface import CallableOnScalar + scoped_function_dict = dict((func, CallableOnScalar(func)) for func in scoped_functions) return kernel.copy(instructions=new_insns, scoped_functions=scoped_function_dict) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e0c086eb8..bbd6e43cc 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -107,7 +107,6 @@ def get_kw_pos_association(kernel): # }}} - # {{{ template class @@ -141,10 +140,13 @@ class InKernelCallable(ImmutableRecord): # {{{ sanity checks if not isinstance(name, str): - raise LoopyError("name of a CallableOnScalar should be a string") + raise LoopyError("name of an InKernelCallable should be a string") # }}} + if name_in_target is not None and subkernel is not None: + subkernel = subkernel.copy(name=name_in_target) + super(InKernelCallable, self).__init__(name=name, subkernel=subkernel, arg_id_to_dtype=arg_id_to_dtype, @@ -246,15 +248,6 @@ class InKernelCallable(ImmutableRecord): class CallableOnScalar(InKernelCallable): - def __init__(self, name, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): - - super(CallableOnScalar, self).__init__(name=name, - subkernel=None, - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr, - name_in_target=name_in_target) - def with_types(self, arg_id_to_dtype, target): if self.arg_id_to_dtype is not None: @@ -335,34 +328,64 @@ class CallableOnScalar(InKernelCallable): # TODO: Need to add support for functions like sincos(x) # which would give multiple outputs but takes in scalar arguments - raise NotImplementedError("emit_call_insn only applies for" - " CallableKernels") + # FIXME: needs to get information about whether the callable has should + # do pass by reference by all values or should return one value for + # pass by value return. - # }}} + # For example: The code generation of `sincos` would be different for + # C-Target and OpenCL-target. - def __eq__(self, other): - return (self.name == other.name - and self.arg_id_to_descr == other.arg_id_to_descr - and self.arg_id_to_dtype == other.arg_id_to_dtype - and self.subkernel == other.subkernel) + # Currently doing pass by value for all the assignees. 
- def __hash__(self): - return hash((self.name, self.subkernel, self.name_in_target)) + assert self.is_ready_for_code_gen() + from loopy.kernel.instruction import CallInstruction -class CallableKernel(InKernelCallable): + assert isinstance(insn, CallInstruction) - def __init__(self, name, subkernel, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): + parameters = insn.expression.parameters + assignees = insn.assignees - if name_in_target is not None and subkernel is not None: - subkernel = subkernel.copy(name=name_in_target) + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + parameters) + arg_dtypes = tuple(self.arg_id_to_dtype[i] for i, _ in + enumerate(parameters)) - super(CallableKernel, self).__init__(name=name, - subkernel=subkernel, - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr, - name_in_target=name_in_target) + assignee_dtypes = tuple(self.arg_id_to_dtype[-i-1] for i, _ in + enumerate(assignees)) + + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic import var + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(self.target, tgt_dtype), + tgt_dtype).expr + for par, par_dtype, tgt_dtype in zip( + parameters, par_dtypes, arg_dtypes)] + + for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)): + if tgt_dtype != expression_to_code_mapper.infer_type(a): + raise LoopyError("Type Mismach in funciton %s. Expected: %s" + "Got: %s" % (self.name, tgt_dtype, + expression_to_code_mapper.infer_type(a))) + c_parameters.append( + var("&")( + expression_to_code_mapper(a, PREC_NONE, + dtype_to_type_context(self.target, tgt_dtype), + tgt_dtype).expr)) + + from pymbolic import var + return var(self.name_in_target)(*c_parameters) + + raise NotImplementedError("emit_call_insn only applies for" + " CallableKernels") + + # }}} + + +class CallableKernel(InKernelCallable): def with_types(self, arg_id_to_dtype, target): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 5ebcd67e1..2fb902830 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -953,6 +953,10 @@ class CASTBuilder(ASTBuilderBase): expression_to_code_mapper=ecm) from cgen import ExpressionStatement + # FIXME: Depending on the function this can be either an + # ExpressionStatement or Assignment. + # Refer: CallableOnScalar::emit_call_insn. It is discussed in detail + # over there. return ExpressionStatement( CExpression(self.get_c_expression_to_code_mapper(), in_knl_callable_as_call)) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index f43550b5b..05a298d11 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -25,7 +25,7 @@ THE SOFTWARE. 
from loopy.kernel import LoopKernel from loopy.kernel.creation import FunctionScoper from loopy.diagnostic import LoopyError -from loopy.kernel.function_interface import InKernelCallable +from loopy.kernel.function_interface import CallableKernel from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) @@ -97,7 +97,7 @@ def register_callable_kernel(parent, function_name, child): raise LoopyError("%s is already being used as a funciton name -- maybe" "use a different name for registering the subkernel") - scoped_functions[function_name] = InKernelCallable(name=function_name, + scoped_functions[function_name] = CallableKernel(name=function_name, subkernel=child) # returning the parent kernel with the new scoped function dictionary -- GitLab From a626687c655d697182349432b98fde82e87054fa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Mar 2018 17:07:33 -0500 Subject: [PATCH 027/916] Changed from collectors to combine mappers --- loopy/kernel/creation.py | 21 ++++++++++++++------- loopy/preprocess.py | 30 +++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 165607a05..124984ea3 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -24,12 +24,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - import numpy as np -from pymbolic.mapper import CSECachingMapperMixin, Collector +from pymbolic.mapper import CSECachingMapperMixin from loopy.tools import intern_frozenset_of_ids -from loopy.symbolic import IdentityMapper, WalkMapper +from loopy.symbolic import IdentityMapper, WalkMapper, CombineMapper from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, @@ -43,6 +42,8 @@ from six.moves import range, zip, intern import re +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -1880,16 +1881,22 @@ class FunctionScoper(IdentityMapper): return IdentityMapper.map_call(self, expr) -class ScopedFunctionCollector(Collector): +class ScopedFunctionCollector(CombineMapper): """ This mapper would collect all the instances of :class:`ScopedFunction` occurring in the expression and written all of them as a :class:`set`. 
""" + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) def map_scoped_function(self, expr): - return set([expr.name]) + return frozenset([expr.name]) - def map_sub_array_ref(self, expr): - return set() + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant def scope_functions(kernel): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index eedfca6f9..e7472ddd6 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2105,12 +2105,36 @@ def check_atomic_loads(kernel): # {{{ check for unscoped calls -class UnScopedCallCollector(Collector): +class UnScopedCallCollector(CombineMapper): + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + def map_call(self, expr): if not isinstance(expr.function, ScopedFunction): - return set([expr.function.name]) + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if not isinstance(expr.function, ScopedFunction): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + expr.kw_parameter.values()))) else: - return set() + return self.combine((self.rec(child) for child in + expr.parameters+expr.kw_parameters.values())) + + def map_scoped_function(self, expr): + return frozenset([expr.name]) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant def check_functions_are_scoped(kernel): -- GitLab From 8826c9f2c021fd950ff72ad45c09f3d9f30e3ad3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Mar 2018 14:29:54 -0500 Subject: [PATCH 028/916] Need to remove some of these changes. --- loopy/library/reduction.py | 7 ------- loopy/preprocess.py | 17 ++++++++--------- loopy/type_inference.py | 35 +++++++++++++++++++---------------- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 5daa1528a..0e5a093b7 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -422,13 +422,6 @@ def parse_reduction_op(name): # }}} -def reduction_function_identifiers(): - """ Return a :class:`set` of the type of the reduction identifiers that can be - encountered in a kernel. 
- """ - return set(op for op in _REDUCTION_OPS) - - def reduction_function_mangler(kernel, func_id, arg_dtypes): if isinstance(func_id, ArgExtOp): from loopy.target.opencl import CTarget diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e7472ddd6..34fe6e830 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -893,6 +893,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} + def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): @@ -1093,6 +1094,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames_is_final=insn.within_inames_is_final, predicates=insn.predicates,) + reduction_insn = scope_function_in_insn(reduction_insn, kenrel) + generated_insns.append(reduction_insn) new_insn_add_depends_on.add(reduction_insn.id) @@ -2145,7 +2148,7 @@ def check_functions_are_scoped(kernel): unscoped_calls = UnScopedCallCollector()(insn.expression) if unscoped_calls: raise LoopyError("Unknown function '%s' obtained -- register a function" - " or a kernel corresponding to it." % unscoped_calls.pop()) + " or a kernel corresponding to it." % set(unscoped_calls).pop()) # }}} @@ -2362,10 +2365,6 @@ def preprocess_kernel(kernel, device=None): from loopy.transform.subst import expand_subst kernel = expand_subst(kernel) - # Checking if all the functions being used in the kernel and scoped to a - # finite namespace - check_functions_are_scoped(kernel) - # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. @@ -2382,6 +2381,10 @@ def preprocess_kernel(kernel, device=None): from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel) + # inferring the shape and dim_tags of the arguments involved in a function + # call. + kernel = infer_arg_descr(kernel) + # Ordering restrictions: # # - realize_reduction must happen after type inference because it needs @@ -2410,10 +2413,6 @@ def preprocess_kernel(kernel, device=None): # have been established kernel = check_atomic_loads(kernel) - # inferring the shape and dim_tags of the arguments involved in a function - # call. 
- kernel = infer_arg_descr(kernel) - kernel = kernel.target.preprocess(kernel) logger.info("%s: preprocess done" % kernel.name) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index f974e3fab..11113538e 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -269,27 +269,24 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) - if None in arg_dtypes: - return [] - - arg_id_to_dtype = dict((i, dtype) for (i, dtype) in - enumerate(arg_dtypes)) + arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in + enumerate(expr.parameters)) # specializing the known function wrt type - in_knl_callable = ( - self.scoped_functions[expr.function.name].with_types( - arg_id_to_dtype, self.kernel.target)) + if isinstance(expr.function, ScopedFunction): + in_knl_callable = ( + self.scoped_functions[expr.function.name].with_types( + arg_id_to_dtype, self.kernel.target)) - # storing the type specialized function so that it can be used for - # later use - self.specialized_functions[expr] = in_knl_callable + # storing the type specialized function so that it can be used for + # later use + self.specialized_functions[expr] = in_knl_callable - new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype - # collecting result dtypes in order of the assignees - if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: - return [new_arg_id_to_dtype[-1]] + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + return [new_arg_id_to_dtype[-1]] return [] @@ -501,6 +498,12 @@ def infer_unknown_types(kernel, expect_completion=False): logger.debug("%s: infer types" % kernel.name) + if expect_completion: + # if completion is expected, then it is important that all the + # callables are scoped. + from loopy.preprocess import check_functions_are_scoped + check_functions_are_scoped(kernel) + from functools import partial debug = partial(_debug, kernel) -- GitLab From 00f158b3ed84054bc0a4d193637f082e761f5cf1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Mar 2018 17:14:27 -0500 Subject: [PATCH 029/916] Started adding the reduction interface --- loopy/kernel/creation.py | 69 ++++++++++++-- loopy/kernel/function_interface.py | 142 +++++++++++++++++++++++------ loopy/kernel/reduction_callable.py | 85 +++++++++++++++++ loopy/library/reduction.py | 7 ++ loopy/symbolic.py | 49 +++++----- 5 files changed, 293 insertions(+), 59 deletions(-) create mode 100644 loopy/kernel/reduction_callable.py diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 124984ea3..5a6423220 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1832,7 +1832,7 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # }}} -# {{{ lookup functions +# {{{ scope functions class FunctionScoper(IdentityMapper): """ @@ -1880,6 +1880,29 @@ class FunctionScoper(IdentityMapper): # This is an unknown function as of yet, not modifying it. 
return IdentityMapper.map_call(self, expr) + def map_reduction(self, expr): + from pymbolic.primitives import Variable + from loopy.symbolic import ScopedFunction + + mapped_inames = [self.rec(Variable(iname)) for iname in expr.inames] + + new_inames = [] + for iname, new_sym_iname in zip(expr.inames, mapped_inames): + if not isinstance(new_sym_iname, Variable): + from loopy.diagnostic import LoopyError + raise LoopyError("%s did not map iname '%s' to a variable" + % (type(self).__name__, iname)) + + new_inames.append(new_sym_iname.name) + + from loopy.symbolic import Reduction + + return Reduction( + ScopedFunction(expr.operation.name), + tuple(new_inames), + self.rec(expr.expr), + allow_simultaneous=expr.allow_simultaneous) + class ScopedFunctionCollector(CombineMapper): """ This mapper would collect all the instances of :class:`ScopedFunction` @@ -1890,7 +1913,44 @@ class ScopedFunctionCollector(CombineMapper): return reduce(operator.or_, values, frozenset()) def map_scoped_function(self, expr): - return frozenset([expr.name]) + from loopy.kernel.function_interface import CallableOnScalar + return frozenset([(expr.name, CallableOnScalar(expr.name))]) + + def map_reduction(self, expr): + from loopy.kernel.reduction_callable import CallableReduction + from loopy.symbolic import Reduction + + callable_reduction = CallableReduction(expr.operation.name) + + # sanity checks + + if isinstance(expr.expr, tuple): + num_args = len(expr.expr) + else: + num_args = 1 + + if num_args != callable_reduction.operation.arg_count: + raise RuntimeError("invalid invocation of " + "reduction operation '%s': expected %d arguments, " + "got %d instead" % (expr.function.name, + callable_reduction.operation.arg_count, + len(expr.parameters))) + + if callable_reduction.operation.arg_count > 1: + from pymbolic.primitives import Call + + if not isinstance(expr, (tuple, Reduction, Call)): + raise LoopyError("reduction argument must be one of " + "a tuple, reduction, or call; " + "got '%s'" % type(expr).__name__) + else: + if isinstance(expr, tuple): + raise LoopyError("got a tuple argument to a scalar reduction") + elif isinstance(expr, Reduction) and callable_reduction.is_tuple_typed: + raise LoopyError("got a tuple typed argument to a scalar reduction") + + return frozenset([(expr.operation.name, + callable_reduction)]) def map_constant(self, expr): return frozenset() @@ -1921,10 +1981,7 @@ def scope_functions(kernel): type(insn)) # Need to combine the scoped functions into a dict - from loopy.kernel.function_interface import CallableOnScalar - scoped_function_dict = dict((func, CallableOnScalar(func)) for func in - scoped_functions) - + scoped_function_dict = dict(scoped_functions) return kernel.copy(instructions=new_insns, scoped_functions=scoped_function_dict) # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index bbd6e43cc..a87c1670a 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -134,8 +134,7 @@ class InKernelCallable(ImmutableRecord): """ - def __init__(self, name, subkernel=None, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): + def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None): # {{{ sanity checks @@ -144,14 +143,9 @@ class InKernelCallable(ImmutableRecord): # }}} - if name_in_target is not None and subkernel is not None: - subkernel = subkernel.copy(name=name_in_target) - super(InKernelCallable, self).__init__(name=name, - subkernel=subkernel, arg_id_to_dtype=arg_id_to_dtype, - 
arg_id_to_descr=arg_id_to_descr, - name_in_target=name_in_target) + arg_id_to_descr=arg_id_to_descr) def with_types(self, arg_id_to_dtype, target): """ @@ -233,20 +227,29 @@ class InKernelCallable(ImmutableRecord): # }}} - def __eq__(self, other): - return (self.name == other.name - and self.arg_id_to_descr == other.arg_id_to_descr - and self.arg_id_to_dtype == other.arg_id_to_dtype - and self.subkernel == other.subkernel) +# }}} - def __hash__(self): - return hash((self.name, self.subkernel, self.name_in_target)) +# {{{ callables on scalar -# }}} +class CallableOnScalar(InKernelCallable): + fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") -class CallableOnScalar(InKernelCallable): + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super(InKernelCallable, self).__init__(name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.name_in_target = name_in_target + + def __getinitargs__(self): + return (self.name, self.arg_id_to_dtype, self.arg_id_to_descr, + self.name_in_target) def with_types(self, arg_id_to_dtype, target): if self.arg_id_to_dtype is not None: @@ -384,9 +387,32 @@ class CallableOnScalar(InKernelCallable): # }}} +# }}} + + +# {{{ callable kernel class CallableKernel(InKernelCallable): + fields = set(["name", "subkernel", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + init_arg_names = ("name", "subkernel", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") + + def __init__(self, name, subkernel, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super(InKernelCallable, self).__init__(name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.name_in_target = name_in_target + self.subkernel = subkernel + + def __getinitargs__(self): + return (self.name, self.subkernel, self.arg_id_to_dtype, + self.arg_id_to_descr, self.name_in_target) + def with_types(self, arg_id_to_dtype, target): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) @@ -475,12 +501,9 @@ class CallableKernel(InKernelCallable): def generate_preambles(self, target): """ This would generate the target specific preamble. """ + # Transfer the preambel of the subkernel over here raise NotImplementedError() - def emit_call(self, expression_to_code_mapper, expression, target): - - raise NotImplementedError("emit_call only works on scalar operations") - def emit_call_insn(self, insn, target, expression_to_code_mapper): assert self.is_ready_for_code_gen() @@ -524,14 +547,77 @@ class CallableKernel(InKernelCallable): # }}} - def __eq__(self, other): - return (self.name == other.name - and self.arg_id_to_descr == other.arg_id_to_descr - and self.arg_id_to_dtype == other.arg_id_to_dtype - and self.subkernel == other.subkernel) +# }}} + + + + + + +class ReductionCallable(InKernelCallable): + + fields = set(["name", "operation", "arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("name", "operation", "arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, name, operation, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super(InKernelCallable, self).__init__(name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.operation = operation + + def with_types(self, arg_id_to_dtype, target): + if self.arg_id_to_dtype is not None: + + # specializing an already specialized function. 
+ + for id, dtype in arg_id_to_dtype.items(): + # only checking for the ones which have been provided + if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + raise LoopyError("Overwriting a specialized" + " function is illegal--maybe start with new instance of" + " CallableScalar?") + + if self.name in target.get_device_ast_builder().function_identifiers(): + new_in_knl_callable = target.get_device_ast_builder().with_types( + self, arg_id_to_dtype) + if new_in_knl_callable is None: + new_in_knl_callable = self.copy() + return new_in_knl_callable + + # did not find a scalar function and function prototype does not + # even have subkernel registered => no match found + raise LoopyError("Function %s not present within" + " the %s namespace" % (self.name, target)) + + def with_descrs(self, arg_id_to_descr): + + # This is a scalar call + # need to assert that the name is in funtion indentifiers + arg_id_to_descr[-1] = ValueArgDescriptor() + return self.copy(arg_id_to_descr=arg_id_to_descr) + + def with_iname_tag_usage(self, unusable, concurrent_shape): + + raise NotImplementedError() + + def is_ready_for_code_gen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None and + self.name_in_target is not None) + + + + + + + + + - def __hash__(self): - return hash((self.name, self.subkernel, self.name_in_target)) # {{{ new pymbolic calls to scoped functions diff --git a/loopy/kernel/reduction_callable.py b/loopy/kernel/reduction_callable.py new file mode 100644 index 000000000..1682f7160 --- /dev/null +++ b/loopy/kernel/reduction_callable.py @@ -0,0 +1,85 @@ +# Note: this file is just for convenience purposes. This would go back into +# kernel/function_interface.py. +# keeping it over here until everythin starts working. + + +from __future__ import division, absolute_import + +from loopy.diagnostic import LoopyError + +from loopy.kernel.function_interface import (InKernelCallable, + ValueArgDescriptor) + + +class CallableReduction(InKernelCallable): + + fields = set(["operation", "arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("operation", "arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, operation, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + if isinstance(operation, str): + from loopy.library.reduction import parse_reduction_op + operation = parse_reduction_op(operation) + + from loopy.library.reduction import ReductionOperation + assert isinstance(operation, ReductionOperation) + + self.operation = operation + + super(InKernelCallable, self).__init__(name="", + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + def __getinitargs__(self): + return (self.operation, self.arg_id_to_dtype, + self.arg_id_to_descr) + + @property + def is_tuple_typed(self): + return self.operation.arg_count > 1 + + def with_types(self, arg_id_to_dtype, target): + if self.arg_id_to_dtype is not None: + + # specializing an already specialized function. 
+ + for id, dtype in arg_id_to_dtype.items(): + # only checking for the ones which have been provided + if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + raise LoopyError("Overwriting a specialized" + " function is illegal--maybe start with new instance of" + " CallableScalar?") + + if self.name in target.get_device_ast_builder().function_identifiers(): + new_in_knl_callable = target.get_device_ast_builder().with_types( + self, arg_id_to_dtype) + if new_in_knl_callable is None: + new_in_knl_callable = self.copy() + return new_in_knl_callable + + # did not find a scalar function and function prototype does not + # even have subkernel registered => no match found + raise LoopyError("Function %s not present within" + " the %s namespace" % (self.name, target)) + + def with_descrs(self, arg_id_to_descr): + + # This is a scalar call + # need to assert that the name is in funtion indentifiers + arg_id_to_descr[-1] = ValueArgDescriptor() + return self.copy(arg_id_to_descr=arg_id_to_descr) + + def with_iname_tag_usage(self, unusable, concurrent_shape): + + raise NotImplementedError() + + def is_ready_for_code_gen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None and + self.name_in_target is not None) + + +# vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 0e5a093b7..5daa1528a 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -422,6 +422,13 @@ def parse_reduction_op(name): # }}} +def reduction_function_identifiers(): + """ Return a :class:`set` of the type of the reduction identifiers that can be + encountered in a kernel. + """ + return set(op for op in _REDUCTION_OPS) + + def reduction_function_mangler(kernel, func_id, arg_dtypes): if isinstance(func_id, ArgExtOp): from loopy.target.opencl import CTarget diff --git a/loopy/symbolic.py b/loopy/symbolic.py index bdfe57982..e8e39a24f 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -537,9 +537,11 @@ class Reduction(p.Expression): """Represents a reduction operation on :attr:`exprs` across :attr:`inames`. - .. attribute:: operation + ..attribute:: operation - an instance of :class:`loopy.library.reduction.ReductionOperation` + an instance of :class:`pymbolic.primitives.Variable` which indicates + the reduction callable that the reduction would point to in the dict + `kernel.scoped_functions` .. attribute:: inames @@ -563,6 +565,8 @@ class Reduction(p.Expression): init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") def __init__(self, operation, inames, expr, allow_simultaneous=False): + assert isinstance(operation, p.Variable) + if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -580,6 +584,8 @@ class Reduction(p.Expression): inames = tuple(strip_var(iname) for iname in inames) + """ + # Removed by KK. In order to move to the new interface if isinstance(operation, str): from loopy.library.reduction import parse_reduction_op operation = parse_reduction_op(operation) @@ -602,6 +608,7 @@ class Reduction(p.Expression): raise LoopyError("got a tuple argument to a scalar reduction") elif isinstance(expr, Reduction) and expr.is_tuple_typed: raise LoopyError("got a tuple typed argument to a scalar reduction") + """ self.operation = operation self.inames = inames @@ -622,10 +629,12 @@ class Reduction(p.Expression): def stringifier(self): return StringifyMapper - + """ + # Removed by KK. 
In order to move to the new interface @property def is_tuple_typed(self): return self.operation.arg_count > 1 + """ @property @memoize_method @@ -1139,6 +1148,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): def _parse_reduction(self, operation, inames, red_exprs, allow_simultaneous=False): + assert isinstance(operation, str) + operation = p.Variable(operation) if isinstance(inames, p.Variable): inames = (inames,) @@ -1161,7 +1172,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): allow_simultaneous=allow_simultaneous) def map_call(self, expr): - from loopy.library.reduction import parse_reduction_op + from loopy.library.reduction import reduction_function_identifiers if not isinstance(expr.function, p.Variable): return IdentityMapper.map_call(self, expr) @@ -1181,18 +1192,22 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: raise TypeError("cse takes two arguments") - elif name in ["reduce", "simul_reduce"]: - + elif name in set(["reduce, simul_reduce"]): if len(expr.parameters) >= 3: operation, inames = expr.parameters[:2] red_exprs = expr.parameters[2:] - operation = parse_reduction_op(str(operation)) - return self._parse_reduction(operation, inames, + return self._parse_reduction(str(operation), inames, tuple(self.rec(red_expr) for red_expr in red_exprs), allow_simultaneous=(name == "simul_reduce")) else: + raise TypeError("invalid 'reduce' calling sequence") + elif name in reduction_function_identifiers(): + # KK -- maybe add a check for the arg count? + inames = expr.parameters[0] + red_exprs = tuple(self.rec(param) for param in expr.parameters[1:]) + return self._parse_reduction(name, inames, red_exprs) elif name == "if": if len(expr.parameters) == 3: @@ -1203,23 +1218,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: # see if 'name' is an existing reduction op - - operation = parse_reduction_op(name) - if operation: - # arg_count counts arguments but not inames - if len(expr.parameters) != 1 + operation.arg_count: - raise RuntimeError("invalid invocation of " - "reduction operation '%s': expected %d arguments, " - "got %d instead" % (expr.function.name, - 1 + operation.arg_count, - len(expr.parameters))) - - inames = expr.parameters[0] - red_exprs = tuple(self.rec(param) for param in expr.parameters[1:]) - return self._parse_reduction(operation, inames, red_exprs) - - else: - return IdentityMapper.map_call(self, expr) + return IdentityMapper.map_call(self, expr) def map_call_with_kwargs(self, expr): for par in expr.kw_parameters.values(): -- GitLab From 02bd5cfbd99d8a67b609a2cede0892708169a508 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Mar 2018 17:45:50 -0500 Subject: [PATCH 030/916] Much needed cleaning after the bifurcation! 
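After this cleanup the base class carries only the arg_id_to_dtype and
arg_id_to_descr mappings, and each flavour of callable adds its own state.
A minimal sketch of the resulting hierarchy (assuming the classes as defined
in this series; child_knl stands in for an already-built callee kernel):

    from loopy.kernel.function_interface import (
            CallableOnScalar, CallableKernel)
    from loopy.kernel.reduction_callable import CallableReduction

    exp_fn = CallableOnScalar("exp")          # adds name, name_in_target
    mv_fn = CallableKernel("matvec",          # adds subkernel, as used by
            subkernel=child_knl)              # register_callable_kernel
    sum_fn = CallableReduction("sum")         # wraps parse_reduction_op("sum")

    # All three share the InKernelCallable interface: with_types() and
    # with_descrs() return specialized copies, and is_ready_for_code_gen()
    # reports whether code generation can proceed.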
--- loopy/kernel/function_interface.py | 98 +++++------------------------- 1 file changed, 15 insertions(+), 83 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index a87c1670a..bc5d178b1 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -134,19 +134,24 @@ class InKernelCallable(ImmutableRecord): """ + fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr") + def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None): - # {{{ sanity checks + # sanity checks if not isinstance(name, str): raise LoopyError("name of an InKernelCallable should be a string") - # }}} - super(InKernelCallable, self).__init__(name=name, arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) + def __getinitargs__(self): + return (self.name, self.arg_id_to_dtype, self.arg_id_to_descr, + self.name_in_target) + def with_types(self, arg_id_to_dtype, target): """ :arg arg_id_to_type: a mapping from argument identifiers @@ -207,10 +212,7 @@ class InKernelCallable(ImmutableRecord): def is_ready_for_code_gen(self): return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None and - self.name_in_target is not None) - - # {{{ code generation + self.arg_id_to_descr is not None) def generate_preambles(self, target): """ This would generate the target specific preamble. @@ -225,7 +227,9 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - # }}} + def __hash__(self): + + return hash(tuple(self.fields)) # }}} @@ -405,6 +409,8 @@ class CallableKernel(InKernelCallable): super(InKernelCallable, self).__init__(name=name, arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) + if name_in_target is not None: + subkernel = subkernel.copy(name=name_in_target) self.name_in_target = name_in_target self.subkernel = subkernel @@ -496,12 +502,10 @@ class CallableKernel(InKernelCallable): self.arg_id_to_descr is not None and self.name_in_target is not None) - # {{{ code generation - def generate_preambles(self, target): """ This would generate the target specific preamble. """ - # Transfer the preambel of the subkernel over here + # TODO: Transfer the preamble of the subkernel over here raise NotImplementedError() def emit_call_insn(self, insn, target, expression_to_code_mapper): @@ -545,81 +549,9 @@ class CallableKernel(InKernelCallable): from pymbolic import var return var(self.name_in_target)(*c_parameters) - # }}} - # }}} - - - - -class ReductionCallable(InKernelCallable): - - fields = set(["name", "operation", "arg_id_to_dtype", "arg_id_to_descr"]) - init_arg_names = ("name", "operation", "arg_id_to_dtype", "arg_id_to_descr") - - def __init__(self, name, operation, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): - - super(InKernelCallable, self).__init__(name=name, - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr) - - self.operation = operation - - def with_types(self, arg_id_to_dtype, target): - if self.arg_id_to_dtype is not None: - - # specializing an already specialized function. 
- - for id, dtype in arg_id_to_dtype.items(): - # only checking for the ones which have been provided - if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: - raise LoopyError("Overwriting a specialized" - " function is illegal--maybe start with new instance of" - " CallableScalar?") - - if self.name in target.get_device_ast_builder().function_identifiers(): - new_in_knl_callable = target.get_device_ast_builder().with_types( - self, arg_id_to_dtype) - if new_in_knl_callable is None: - new_in_knl_callable = self.copy() - return new_in_knl_callable - - # did not find a scalar function and function prototype does not - # even have subkernel registered => no match found - raise LoopyError("Function %s not present within" - " the %s namespace" % (self.name, target)) - - def with_descrs(self, arg_id_to_descr): - - # This is a scalar call - # need to assert that the name is in funtion indentifiers - arg_id_to_descr[-1] = ValueArgDescriptor() - return self.copy(arg_id_to_descr=arg_id_to_descr) - - def with_iname_tag_usage(self, unusable, concurrent_shape): - - raise NotImplementedError() - - def is_ready_for_code_gen(self): - - return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None and - self.name_in_target is not None) - - - - - - - - - - - - # {{{ new pymbolic calls to scoped functions def next_indexed_name(name): -- GitLab From c36eb5263283aba4a6564da2dce43a73bc0759e2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 26 Mar 2018 11:22:34 -0500 Subject: [PATCH 031/916] Added the support for a reduction callable. --- loopy/kernel/creation.py | 15 +++-- loopy/kernel/function_interface.py | 26 ++++----- loopy/kernel/reduction_callable.py | 31 ++++------ loopy/library/reduction.py | 90 ++++++++++++++++++++++++------ loopy/preprocess.py | 23 ++++---- loopy/symbolic.py | 34 +++++------ loopy/target/opencl.py | 2 +- loopy/type_inference.py | 54 +++++++++++++----- 8 files changed, 178 insertions(+), 97 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 5a6423220..343c85014 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1898,7 +1898,7 @@ class FunctionScoper(IdentityMapper): from loopy.symbolic import Reduction return Reduction( - ScopedFunction(expr.operation.name), + ScopedFunction(expr.function.name), tuple(new_inames), self.rec(expr.expr), allow_simultaneous=expr.allow_simultaneous) @@ -1918,9 +1918,10 @@ class ScopedFunctionCollector(CombineMapper): def map_reduction(self, expr): from loopy.kernel.reduction_callable import CallableReduction + from loopy.kernel.function_interface import CallableOnScalar from loopy.symbolic import Reduction - callable_reduction = CallableReduction(expr.operation.name) + callable_reduction = CallableReduction(expr.function.name) # sanity checks @@ -1949,8 +1950,14 @@ class ScopedFunctionCollector(CombineMapper): elif isinstance(expr, Reduction) and callable_reduction.is_tuple_typed: raise LoopyError("got a tuple typed argument to a scalar reduction") - return frozenset([(expr.operation.name, - callable_reduction)]) + hidden_function = callable_reduction.operation.hidden_function() + if hidden_function is not None: + return frozenset([(expr.function.name, + callable_reduction), (hidden_function, + CallableOnScalar(hidden_function))]) + else: + return frozenset([(expr.function.name, + callable_reduction)]) def map_constant(self, expr): return frozenset() diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index bc5d178b1..fb80c5876 100644 --- 
a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -134,23 +134,17 @@ class InKernelCallable(ImmutableRecord): """ - fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr"]) - init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr") + fields = set(["arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") - def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None): + def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): - # sanity checks - - if not isinstance(name, str): - raise LoopyError("name of an InKernelCallable should be a string") - - super(InKernelCallable, self).__init__(name=name, + super(InKernelCallable, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) def __getinitargs__(self): - return (self.name, self.arg_id_to_dtype, self.arg_id_to_descr, - self.name_in_target) + return (self.arg_id_to_dtype, self.arg_id_to_descr) def with_types(self, arg_id_to_dtype, target): """ @@ -245,10 +239,11 @@ class CallableOnScalar(InKernelCallable): def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): - super(InKernelCallable, self).__init__(name=name, + super(InKernelCallable, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) + self.name = name self.name_in_target = name_in_target def __getinitargs__(self): @@ -265,7 +260,7 @@ class CallableOnScalar(InKernelCallable): if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: raise LoopyError("Overwriting a specialized" " function is illegal--maybe start with new instance of" - " CallableScalar?") + " CallableOnScalar?") # {{{ attempt to specialize using scalar functions present in target @@ -406,12 +401,13 @@ class CallableKernel(InKernelCallable): def __init__(self, name, subkernel, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): - super(InKernelCallable, self).__init__(name=name, + super(InKernelCallable, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) if name_in_target is not None: subkernel = subkernel.copy(name=name_in_target) + self.name = name self.name_in_target = name_in_target self.subkernel = subkernel @@ -628,7 +624,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, unique_name = next_indexed_name(unique_name) # book-keeping of the functions and names mappings for later use - if in_knl_callable.subkernel is not None: + if isinstance(in_knl_callable, CallableKernel): # for array calls the name in the target is the name of the # scoped funciton in_knl_callable = in_knl_callable.copy( diff --git a/loopy/kernel/reduction_callable.py b/loopy/kernel/reduction_callable.py index 1682f7160..1ad2acd8d 100644 --- a/loopy/kernel/reduction_callable.py +++ b/loopy/kernel/reduction_callable.py @@ -28,7 +28,7 @@ class CallableReduction(InKernelCallable): self.operation = operation - super(InKernelCallable, self).__init__(name="", + super(InKernelCallable, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -47,39 +47,32 @@ class CallableReduction(InKernelCallable): for id, dtype in arg_id_to_dtype.items(): # only checking for the ones which have been provided - if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + if id in self.arg_id_to_dtype and ( + self.arg_id_to_dtype[id] != arg_id_to_dtype[id]): raise LoopyError("Overwriting a specialized" " function is illegal--maybe start with new instance of" - " CallableScalar?") - - if self.name in 
target.get_device_ast_builder().function_identifiers(): - new_in_knl_callable = target.get_device_ast_builder().with_types( - self, arg_id_to_dtype) - if new_in_knl_callable is None: - new_in_knl_callable = self.copy() - return new_in_knl_callable - - # did not find a scalar function and function prototype does not - # even have subkernel registered => no match found - raise LoopyError("Function %s not present within" - " the %s namespace" % (self.name, target)) + " CallableReduction?") + updated_arg_id_to_dtype = self.operation.with_types(arg_id_to_dtype, + target) + return self.copy(arg_id_to_dtype=updated_arg_id_to_dtype) def with_descrs(self, arg_id_to_descr): + # not sure what would be the reson of having this over here # This is a scalar call # need to assert that the name is in funtion indentifiers arg_id_to_descr[-1] = ValueArgDescriptor() return self.copy(arg_id_to_descr=arg_id_to_descr) - def with_iname_tag_usage(self, unusable, concurrent_shape): - - raise NotImplementedError() + def inline(self, kernel): + # Replaces the job of realize_reduction + raise NotImplementedError def is_ready_for_code_gen(self): return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None and - self.name_in_target is not None) + self.operation is not None) # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 5daa1528a..f4444c886 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -36,7 +36,7 @@ class ReductionOperation(object): equality-comparable. """ - def result_dtypes(self, target, *arg_dtypes): + def with_types(self, arg_id_to_dtype, target): """ :arg arg_dtypes: may be None if not known :returns: None if not known, otherwise the returned type @@ -51,6 +51,9 @@ class ReductionOperation(object): def neutral_element(self, *dtypes): raise NotImplementedError + def hidden_function(self): + return None + def __hash__(self): # Force subclasses to override raise NotImplementedError @@ -95,15 +98,22 @@ class ScalarReductionOperation(ReductionOperation): def arg_count(self): return 1 - def result_dtypes(self, kernel, arg_dtype): + def with_types(self, arg_id_to_dtype, target): + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # do not have enough info to figure out the type. 
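            # (Convention used throughout this interface, illustrated for a
            # plain float64 sum: non-negative keys of arg_id_to_dtype are
            # argument dtypes, negative keys are result dtypes, so a fully
            # specialized sum carries {0: float64, -1: float64}.)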
+ return arg_id_to_dtype.copy() + + arg_dtype = arg_id_to_dtype[0] + + updated_arg_id_to_dtype = arg_id_to_dtype.copy() if self.forced_result_type is not None: - return (self.parse_result_type( - kernel.target, self.forced_result_type),) + updated_arg_id_to_dtype[-1] = (self.parse_result_type( + target, self.forced_result_type),) + return updated_arg_id_to_dtype - if arg_dtype is None: - return None + updated_arg_id_to_dtype[-1] = arg_dtype - return (arg_dtype,) + return updated_arg_id_to_dtype def __hash__(self): return hash((type(self), self.forced_result_type)) @@ -180,7 +190,11 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("max")(operand1, operand2) + from loopy.symbolic import ScopedFunction + return ScopedFunction("max")(operand1, operand2) + + def hidden_function(self): + return "max" class MinReductionOperation(ScalarReductionOperation): @@ -188,7 +202,11 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("min")(operand1, operand2) + from loopy.symbolic import ScopedFunction + return ScopedFunction("min")(operand1, operand2) + + def hidden_function(self): + return "min" # {{{ base class for symbolic reduction ops @@ -233,9 +251,22 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return var("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) - def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): - return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) - + (segment_flag_dtype,)) + def with_types(self, arg_id_to_dtype, target): + for id in range(self.arg_count): + if id not in arg_id_to_dtype or arg_id_to_dtype[id] is None: + # types of arguemnts not known => result type cannot be + # determined. + return arg_id_to_dtype.copy() + + scalar_dtype = arg_id_to_dtype[0] + segment_flag_dtype = arg_id_to_dtype[1] + + updated_arg_id_to_dtype = arg_id_to_dtype.copy() + updated_arg_id_to_dtype[-1] = self.inner_reduction.with_types( + {0: scalar_dtype}, target)[-1] + updated_arg_id_to_dtype[-2] = segment_flag_dtype + + return updated_arg_id_to_dtype def __str__(self): return "segmented(%s)" % self.which @@ -299,8 +330,22 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__) - def result_dtypes(self, kernel, scalar_dtype, index_dtype): - return (scalar_dtype, index_dtype) + def with_types(self, arg_id_to_dtype, target): + for id in range(self.arg_count): + if id not in arg_id_to_dtype or arg_id_to_dtype[id] is None: + # types of arguemnts not known => result type cannot be + # determined. 
+ return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + + updated_arg_id_to_dtype = arg_id_to_dtype.copy() + + updated_arg_id_to_dtype[-1] = scalar_dtype + updated_arg_id_to_dtype[-2] = index_dtype + + return updated_arg_id_to_dtype def neutral_element(self, scalar_dtype, index_dtype): scalar_neutral_func = ( @@ -331,12 +376,18 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 + def hidden_function(self): + return "max" + class ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 + def hidden_function(self): + return "min" + def get_argext_preamble(kernel, func_id, arg_dtypes): op = func_id.reduction_op @@ -377,8 +428,8 @@ def get_argext_preamble(kernel, func_id, arg_dtypes): _REDUCTION_OPS = { "sum": SumReductionOperation, "product": ProductReductionOperation, - "max": MaxReductionOperation, - "min": MinReductionOperation, + "maximum": MaxReductionOperation, + "minimum": MinReductionOperation, "argmax": ArgMaxReductionOperation, "argmin": ArgMinReductionOperation, "segmented(sum)": SegmentedSumReductionOperation, @@ -429,6 +480,12 @@ def reduction_function_identifiers(): return set(op for op in _REDUCTION_OPS) +def reduction_function_mangler(kernel, func_id, arg_dtypes): + raise NotImplementedError("Reduction Function Mangler!") + + +''' +# KK -- we will replace this with the new interface def reduction_function_mangler(kernel, func_id, arg_dtypes): if isinstance(func_id, ArgExtOp): from loopy.target.opencl import CTarget @@ -475,6 +532,7 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): ) return None +''' def reduction_preamble_generator(preamble_info): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 34fe6e830..51389f4f5 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -39,7 +39,6 @@ from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types from loopy.symbolic import ScopedFunction, CombineMapper -from pymbolic.mapper import Collector from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -893,7 +892,6 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} - def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): @@ -1041,13 +1039,16 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) + reduction_operation = kernel.scoped_functions[ + expr.function.name].operation + init_insn = make_assignment( id=init_id, assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes), + expression=reduction_operation.neutral_element(*arg_dtypes), predicates=insn.predicates,) generated_insns.append(init_insn) @@ -1082,10 +1083,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: reduction_expr = expr.expr + reduction_operation = kernel.scoped_functions[ + expr.function.name].operation reduction_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( + expression=reduction_operation( arg_dtypes, 
_strip_if_scalar(acc_vars, acc_vars), reduction_expr), @@ -1094,8 +1097,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, within_inames_is_final=insn.within_inames_is_final, predicates=insn.predicates,) - reduction_insn = scope_function_in_insn(reduction_insn, kenrel) - generated_insns.append(reduction_insn) new_insn_add_depends_on.add(reduction_insn.id) @@ -1944,6 +1945,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, kernel = lp.tag_inames(kernel, new_iname_tags) + # making changes to the scoped function that are arising + # TODO: remove unused inames... kernel = ( @@ -2381,10 +2384,6 @@ def preprocess_kernel(kernel, device=None): from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel) - # inferring the shape and dim_tags of the arguments involved in a function - # call. - kernel = infer_arg_descr(kernel) - # Ordering restrictions: # # - realize_reduction must happen after type inference because it needs @@ -2396,6 +2395,10 @@ def preprocess_kernel(kernel, device=None): kernel = realize_reduction(kernel, unknown_types_ok=False) + # inferring the shape and dim_tags of the arguments involved in a function + # call. + kernel = infer_arg_descr(kernel) + # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators # need to be duplicated by this. diff --git a/loopy/symbolic.py b/loopy/symbolic.py index e8e39a24f..32670c1cc 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -96,7 +96,7 @@ class IdentityMapperMixin(object): new_inames.append(new_sym_iname.name) return Reduction( - expr.operation, tuple(new_inames), + expr.function, tuple(new_inames), self.rec(expr.expr, *args), allow_simultaneous=expr.allow_simultaneous) @@ -226,7 +226,7 @@ class StringifyMapper(StringifyMapperBase): return "%sreduce(%s, [%s], %s)" % ( "simul_" if expr.allow_simultaneous else "", - expr.operation, ", ".join(expr.inames), + expr.function, ", ".join(expr.inames), self.rec(expr.expr, PREC_NONE)) def map_tagged_variable(self, expr, prec): @@ -266,7 +266,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation) # noqa + or type(expr.function) != type(other.function) # noqa ): return [] @@ -537,7 +537,7 @@ class Reduction(p.Expression): """Represents a reduction operation on :attr:`exprs` across :attr:`inames`. - ..attribute:: operation + ..attribute:: function an instance of :class:`pymbolic.primitives.Variable` which indicates the reduction callable that the reduction would point to in the dict @@ -562,10 +562,10 @@ class Reduction(p.Expression): in precisely one reduction, to avoid mis-nesting errors. 
""" - init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") + init_arg_names = ("function", "inames", "expr", "allow_simultaneous") - def __init__(self, operation, inames, expr, allow_simultaneous=False): - assert isinstance(operation, p.Variable) + def __init__(self, function, inames, expr, allow_simultaneous=False): + assert isinstance(function, p.Variable) if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -610,20 +610,20 @@ class Reduction(p.Expression): raise LoopyError("got a tuple typed argument to a scalar reduction") """ - self.operation = operation + self.function = function self.inames = inames self.expr = expr self.allow_simultaneous = allow_simultaneous def __getinitargs__(self): - return (self.operation, self.inames, self.expr, self.allow_simultaneous) + return (self.funciton, self.inames, self.expr, self.allow_simultaneous) def get_hash(self): - return hash((self.__class__, self.operation, self.inames, self.expr)) + return hash((self.__class__, self.function, self.inames, self.expr)) def is_equal(self, other): return (other.__class__ == self.__class__ - and other.operation == self.operation + and other.function == self.function and other.inames == self.inames and other.expr == self.expr) @@ -1146,10 +1146,10 @@ class FunctionToPrimitiveMapper(IdentityMapper): turns those into the actual pymbolic primitives used for that. """ - def _parse_reduction(self, operation, inames, red_exprs, + def _parse_reduction(self, function, inames, red_exprs, allow_simultaneous=False): - assert isinstance(operation, str) - operation = p.Variable(operation) + assert isinstance(function, str) + function = p.Variable(function) if isinstance(inames, p.Variable): inames = (inames,) @@ -1168,7 +1168,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): if len(red_exprs) == 1: red_exprs = red_exprs[0] - return Reduction(operation, tuple(processed_inames), red_exprs, + return Reduction(function, tuple(processed_inames), red_exprs, allow_simultaneous=allow_simultaneous) def map_call(self, expr): @@ -1194,10 +1194,10 @@ class FunctionToPrimitiveMapper(IdentityMapper): elif name in set(["reduce, simul_reduce"]): if len(expr.parameters) >= 3: - operation, inames = expr.parameters[:2] + function, inames = expr.parameters[:2] red_exprs = expr.parameters[2:] - return self._parse_reduction(str(operation), inames, + return self._parse_reduction(str(function), inames, tuple(self.rec(red_expr) for red_expr in red_exprs), allow_simultaneous=(name == "simul_reduce")) else: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 7aec34a22..7ffd91309 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -255,7 +255,7 @@ def opencl_with_types(in_knl_callable, arg_id_to_dtype): dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in - arg_id_to_dtype.values() if id >= 0]) + arg_id_to_dtype.items() if id >= 0]) if dtype.kind == "i": dtype = NumpyType(dtype) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 11113538e..8df9773a9 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -396,7 +396,10 @@ class TypeInferenceMapper(CombineMapper): from loopy.symbolic import Reduction from pymbolic.primitives import Call - if not return_tuple and expr.is_tuple_typed: + reduction_callable = self.scoped_functions[ + expr.function.name] + + if not return_tuple and reduction_callable.is_tuple_typed: raise LoopyError("reductions with more or fewer than one " "return value may only be used in direct " 
"assignments") @@ -416,12 +419,23 @@ class TypeInferenceMapper(CombineMapper): else: rec_results = self.rec(expr.expr) - if return_tuple: - return [expr.operation.result_dtypes(self.kernel, *rec_result) - for rec_result in rec_results] - else: - return [expr.operation.result_dtypes(self.kernel, rec_result)[0] - for rec_result in rec_results] + arg_id_to_dtype = dict(enumerate(rec_results)) + + in_knl_callable = ( + self.scoped_functions[expr.function.name].with_types( + arg_id_to_dtype, self.kernel.target)) + + # storing the type specialized function so that it can be used for + # later use + self.specialized_functions[expr] = in_knl_callable + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + return [new_arg_id_to_dtype[-1]] + + return [] def map_sub_array_ref(self, expr): return self.rec(expr.get_begin_subscript()) @@ -691,8 +705,9 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( kernel, expr, unknown_types_ok): type_inf_mapper = TypeInferenceMapper(kernel) import loopy as lp + callable_reduction = kernel.scoped_functions[expr.function.name] - if expr.is_tuple_typed: + if callable_reduction.is_tuple_typed: arg_dtypes_result = type_inf_mapper( expr, return_tuple=True, return_dtype_set=True) @@ -700,7 +715,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( arg_dtypes = arg_dtypes_result[0] else: if unknown_types_ok: - arg_dtypes = [lp.auto] * expr.operation.arg_count + arg_dtypes = [lp.auto] * callable_reduction.operation.arg_count else: raise LoopyError("failed to determine types of accumulators for " "reduction '%s'" % expr) @@ -714,13 +729,22 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( raise LoopyError("failed to determine type of accumulator for " "reduction '%s'" % expr) - reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes) - reduction_dtypes = tuple( - dt.with_target(kernel.target) - if dt is not lp.auto else dt - for dt in reduction_dtypes) + # TODODODODODODODODODO + + new_arg_id_to_dtype = callable_reduction.with_types( + dict(enumerate(arg_dtypes)), kernel.target).arg_id_to_dtype + + num_result = len([id for id in new_arg_id_to_dtype if id < 0]) + reduction_dtypes = [] + + for id in range(num_result): + dt = new_arg_id_to_dtype[-id-1] + if dt is not lp.auto: + reduction_dtypes.append(dt.with_target(kernel.target)) + else: + reduction_dtypes.append(dt) - return tuple(arg_dtypes), reduction_dtypes + return tuple(arg_dtypes), tuple(reduction_dtypes) # }}} -- GitLab From bbe4926009c7623d0944bcc33a7e50720a529cc8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 26 Mar 2018 12:52:43 -0500 Subject: [PATCH 032/916] Everything working. Needs some cleaning business and adding tests. 
--- loopy/kernel/function_interface.py | 14 +++--- loopy/preprocess.py | 73 ++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index fb80c5876..5066cff5c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -203,7 +203,7 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - def is_ready_for_code_gen(self): + def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None) @@ -289,7 +289,7 @@ class CallableOnScalar(InKernelCallable): raise NotImplementedError() - def is_ready_for_code_gen(self): + def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None and @@ -304,7 +304,7 @@ class CallableOnScalar(InKernelCallable): def emit_call(self, expression_to_code_mapper, expression, target): - assert self.is_ready_for_code_gen() + assert self.is_ready_for_codegen() # must have single assignee assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 @@ -339,7 +339,7 @@ class CallableOnScalar(InKernelCallable): # Currently doing pass by value for all the assignees. - assert self.is_ready_for_code_gen() + assert self.is_ready_for_codegen() from loopy.kernel.instruction import CallInstruction @@ -492,7 +492,7 @@ class CallableKernel(InKernelCallable): raise NotImplementedError() - def is_ready_for_code_gen(self): + def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None and @@ -506,7 +506,7 @@ class CallableKernel(InKernelCallable): def emit_call_insn(self, insn, target, expression_to_code_mapper): - assert self.is_ready_for_code_gen() + assert self.is_ready_for_codegen() from loopy.kernel.instruction import CallInstruction from pymbolic.primitives import CallWithKwargs @@ -653,4 +653,6 @@ def register_pymbolic_calls_to_knl_callables(kernel, # }}} + + # vim: foldmethod=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 51389f4f5..3f3c1c472 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2319,6 +2319,76 @@ def infer_arg_descr(kernel): # }}} +# {{{ final sweep over the callables to make them ready for codegen + +class ReadyForCodegen(CombineMapper): + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + return all(values) + + def map_call(self, expr, *args, **kwargs): + is_ready_for_codegen = self.kernel.scoped_functions[ + expr.function.name].is_ready_for_codegen() + return self.combine( + (is_ready_for_codegen,) + + tuple( + self.rec(child, *args, **kwargs) for child in expr.parameters) + ) + + def map_call_with_kwargs(self, expr, *args, **kwargs): + is_ready_for_codegen = self.kernel.scoped_functions[ + expr.function.name].is_ready_for_codegen() + return self.combine( + (is_ready_for_codegen,) + + tuple( + self.rec(child, *args, **kwargs) + for child in expr.parameters) + + tuple( + self.rec(child, *args, **kwargs) + for child in expr.kw_parameters.values()) + ) + + def map_constant(self, expr): + return True + + map_variable = map_constant + map_function_symbol = map_constant + + +def try_making_callable_ready_for_codegen(kernel): + from loopy.type_inference import TypeInferenceMapper + from loopy.symbolic import SubstitutionRuleExpander + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + + ready_for_codegen = ReadyForCodegen(kernel) + subst_expander = 
SubstitutionRuleExpander(kernel.substitutions) + type_inf_mapper = TypeInferenceMapper(kernel) + + inferred_functions = {} + for insn in kernel.instructions: + if isinstance(insn, (MultiAssignmentBase, CallInstruction)): + expr = subst_expander(insn.expression) + if not ready_for_codegen(expr): + # only trying to specialize the functions which are not ready + # for codegen + type_inf_mapper(expr) + inferred_functions = {**inferred_functions, + **type_inf_mapper.specialized_functions} + + elif isinstance(insn, (_DataObliviousInstruction)): + pass + else: + NotImplementedError("Unknown Instruction") + + return register_pymbolic_calls_to_knl_callables(kernel, + inferred_functions) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2399,6 +2469,9 @@ def preprocess_kernel(kernel, device=None): # call. kernel = infer_arg_descr(kernel) + # try specializing callables one last time. + kernel = try_making_callable_ready_for_codegen(kernel) + # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators # need to be duplicated by this. -- GitLab From 00fd25fa3e6a64c29ada79f7d6752b379a90ec86 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 26 Mar 2018 15:07:38 -0500 Subject: [PATCH 033/916] Attempt to complete reduction. --- loopy/kernel/creation.py | 13 ++++++++++--- loopy/kernel/function_interface.py | 20 +++++++++++++++++--- loopy/library/reduction.py | 4 ++-- loopy/preprocess.py | 20 +++++++++++++++++--- loopy/symbolic.py | 2 +- 5 files changed, 47 insertions(+), 12 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 343c85014..ae18a9294 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1897,8 +1897,12 @@ class FunctionScoper(IdentityMapper): from loopy.symbolic import Reduction + # Adding _reduce at the end of the reduction in order to avoid + # confusion between reduce(max, ...) and max(a, b) in the + # `scoped_functions` dictionary. + return Reduction( - ScopedFunction(expr.function.name), + ScopedFunction(expr.function.name+"_reduce"), tuple(new_inames), self.rec(expr.expr), allow_simultaneous=expr.allow_simultaneous) @@ -1921,7 +1925,10 @@ class ScopedFunctionCollector(CombineMapper): from loopy.kernel.function_interface import CallableOnScalar from loopy.symbolic import Reduction - callable_reduction = CallableReduction(expr.function.name) + # Refer to map_reduction subroutine of FunctionScoper. + assert expr.function.name[-7:] == "_reduce" + + callable_reduction = CallableReduction(expr.function.name[:-7]) # sanity checks @@ -1986,7 +1993,7 @@ def scope_functions(kernel): else: raise NotImplementedError("scope_functions not implemented for %s" % type(insn)) - + # Need to combine the scoped functions into a dict scoped_function_dict = dict(scoped_functions) return kernel.copy(instructions=new_insns, scoped_functions=scoped_function_dict) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 5066cff5c..2fbb931cb 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -566,11 +566,14 @@ def next_indexed_name(name): class FunctionScopeChanger(IdentityMapper): - #TODO: Make it sophisticated as in I don't like the if-else systems. Needs + # TODO: Make it sophisticated as in I don't like the if-else systems. Needs # something else. + # Explain what this is doing. + # The name should be more like "NameChanger" more like "GameChanger" LOl. 
+ # Wow my jokes are baaad. Anyways back to work!! + def __init__(self, new_names): self.new_names = new_names - self.new_names_set = frozenset(new_names.values()) def map_call(self, expr): if expr in self.new_names: @@ -594,6 +597,18 @@ class FunctionScopeChanger(IdentityMapper): else: return IdentityMapper.map_call_with_kwargs(self, expr) + def map_reduction(self, expr): + from loopy.symbolic import Reduction + + if self.new_names: + return Reduction( + ScopedFunction(self.new_names[expr]), + tuple(expr.inames), + self.rec(expr.expr), + allow_simultaneous=expr.allow_simultaneous) + else: + return IdentityMapper.map_reduction(self, expr) + def register_pymbolic_calls_to_knl_callables(kernel, pymbolic_calls_to_knl_callables): @@ -654,5 +669,4 @@ def register_pymbolic_calls_to_knl_callables(kernel, # }}} - # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index f4444c886..f1c5607fe 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -428,8 +428,8 @@ def get_argext_preamble(kernel, func_id, arg_dtypes): _REDUCTION_OPS = { "sum": SumReductionOperation, "product": ProductReductionOperation, - "maximum": MaxReductionOperation, - "minimum": MinReductionOperation, + "max": MaxReductionOperation, + "min": MinReductionOperation, "argmax": ArgMaxReductionOperation, "argmin": ArgMinReductionOperation, "segmented(sum)": SegmentedSumReductionOperation, diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 3f3c1c472..8950f1590 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2357,7 +2357,22 @@ class ReadyForCodegen(CombineMapper): map_function_symbol = map_constant -def try_making_callable_ready_for_codegen(kernel): +def specializing_incomplete_callables(kernel): + """ + Transformation necessary to type-specialize the callables which are missed + in type inference. For example consider: + ``` + knl = lp.make_kernel( + "{[i]: 0<=i<16}", + "a[i] = sin[b[i]]", + [lp.GlobalArg('a', dtype=np.float64), + lp.GlobalArg('b', dtype=np.float64)]) + ``` + In this case, none of the instructions undergo type inference as the type + inference is already resolved. But this would be a problem during + code-generation as `sin` is not type specialized. + + """ from loopy.type_inference import TypeInferenceMapper from loopy.symbolic import SubstitutionRuleExpander from loopy.kernel.function_interface import ( @@ -2462,7 +2477,6 @@ def preprocess_kernel(kernel, device=None): # - realize_reduction must happen after default dependencies are added # because it manipulates the depends_on field, which could prevent # defaults from being applied. - kernel = realize_reduction(kernel, unknown_types_ok=False) # inferring the shape and dim_tags of the arguments involved in a function @@ -2470,7 +2484,7 @@ def preprocess_kernel(kernel, device=None): kernel = infer_arg_descr(kernel) # try specializing callables one last time. 
- kernel = try_making_callable_ready_for_codegen(kernel) + kernel = specializing_incomplete_callables(kernel) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 32670c1cc..831bab5c2 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -616,7 +616,7 @@ class Reduction(p.Expression): self.allow_simultaneous = allow_simultaneous def __getinitargs__(self): - return (self.funciton, self.inames, self.expr, self.allow_simultaneous) + return (self.function, self.inames, self.expr, self.allow_simultaneous) def get_hash(self): return hash((self.__class__, self.function, self.inames, self.expr)) -- GitLab From 0bda08491ee5bee4248723490b331dcc6a7b7935 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 26 Mar 2018 15:11:16 -0500 Subject: [PATCH 034/916] Removed the temp file reduction_callable --- loopy/kernel/function_interface.py | 69 ++++++++++++++++++++++++++ loopy/kernel/reduction_callable.py | 78 ------------------------------ 2 files changed, 69 insertions(+), 78 deletions(-) delete mode 100644 loopy/kernel/reduction_callable.py diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 2fbb931cb..4168f647a 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -548,6 +548,75 @@ class CallableKernel(InKernelCallable): # }}} +# {{{ callable reduction + +class CallableReduction(InKernelCallable): + + fields = set(["operation", "arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("operation", "arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, operation, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + if isinstance(operation, str): + from loopy.library.reduction import parse_reduction_op + operation = parse_reduction_op(operation) + + from loopy.library.reduction import ReductionOperation + assert isinstance(operation, ReductionOperation) + + self.operation = operation + + super(InKernelCallable, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + def __getinitargs__(self): + return (self.operation, self.arg_id_to_dtype, + self.arg_id_to_descr) + + @property + def is_tuple_typed(self): + return self.operation.arg_count > 1 + + def with_types(self, arg_id_to_dtype, target): + if self.arg_id_to_dtype is not None: + + # specializing an already specialized function. + + for id, dtype in arg_id_to_dtype.items(): + # only checking for the ones which have been provided + if id in self.arg_id_to_dtype and ( + self.arg_id_to_dtype[id] != arg_id_to_dtype[id]): + raise LoopyError("Overwriting a specialized" + " function is illegal--maybe start with new instance of" + " CallableReduction?") + updated_arg_id_to_dtype = self.operation.with_types(arg_id_to_dtype, + target) + return self.copy(arg_id_to_dtype=updated_arg_id_to_dtype) + + def with_descrs(self, arg_id_to_descr): + # not sure what would be the reson of having this over here + + # This is a scalar call + # need to assert that the name is in funtion indentifiers + arg_id_to_descr[-1] = ValueArgDescriptor() + return self.copy(arg_id_to_descr=arg_id_to_descr) + + def inline(self, kernel): + # TODO: In the future. 
This should replace the job done by + # `lp.preprocess.realize_reductions` + raise NotImplementedError + + def is_ready_for_code_gen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None and + self.operation is not None) + +# }}} + + # {{{ new pymbolic calls to scoped functions def next_indexed_name(name): diff --git a/loopy/kernel/reduction_callable.py b/loopy/kernel/reduction_callable.py deleted file mode 100644 index 1ad2acd8d..000000000 --- a/loopy/kernel/reduction_callable.py +++ /dev/null @@ -1,78 +0,0 @@ -# Note: this file is just for convenience purposes. This would go back into -# kernel/function_interface.py. -# keeping it over here until everythin starts working. - - -from __future__ import division, absolute_import - -from loopy.diagnostic import LoopyError - -from loopy.kernel.function_interface import (InKernelCallable, - ValueArgDescriptor) - - -class CallableReduction(InKernelCallable): - - fields = set(["operation", "arg_id_to_dtype", "arg_id_to_descr"]) - init_arg_names = ("operation", "arg_id_to_dtype", "arg_id_to_descr") - - def __init__(self, operation, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): - - if isinstance(operation, str): - from loopy.library.reduction import parse_reduction_op - operation = parse_reduction_op(operation) - - from loopy.library.reduction import ReductionOperation - assert isinstance(operation, ReductionOperation) - - self.operation = operation - - super(InKernelCallable, self).__init__( - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr) - - def __getinitargs__(self): - return (self.operation, self.arg_id_to_dtype, - self.arg_id_to_descr) - - @property - def is_tuple_typed(self): - return self.operation.arg_count > 1 - - def with_types(self, arg_id_to_dtype, target): - if self.arg_id_to_dtype is not None: - - # specializing an already specialized function. 
- - for id, dtype in arg_id_to_dtype.items(): - # only checking for the ones which have been provided - if id in self.arg_id_to_dtype and ( - self.arg_id_to_dtype[id] != arg_id_to_dtype[id]): - raise LoopyError("Overwriting a specialized" - " function is illegal--maybe start with new instance of" - " CallableReduction?") - updated_arg_id_to_dtype = self.operation.with_types(arg_id_to_dtype, - target) - return self.copy(arg_id_to_dtype=updated_arg_id_to_dtype) - - def with_descrs(self, arg_id_to_descr): - # not sure what would be the reson of having this over here - - # This is a scalar call - # need to assert that the name is in funtion indentifiers - arg_id_to_descr[-1] = ValueArgDescriptor() - return self.copy(arg_id_to_descr=arg_id_to_descr) - - def inline(self, kernel): - # Replaces the job of realize_reduction - raise NotImplementedError - - def is_ready_for_code_gen(self): - - return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None and - self.operation is not None) - - -# vim: foldmethod=marker -- GitLab From 1bcf4e9889e547feb0d58a1cd70ca442b513737f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 26 Mar 2018 15:52:05 -0500 Subject: [PATCH 035/916] Added test and minor cleaning --- loopy/kernel/creation.py | 6 +-- loopy/kernel/function_interface.py | 60 ++++++++++++++++++++---------- loopy/preprocess.py | 2 +- test/test_transform.py | 48 ++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 24 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index ae18a9294..097a9b749 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1921,11 +1921,11 @@ class ScopedFunctionCollector(CombineMapper): return frozenset([(expr.name, CallableOnScalar(expr.name))]) def map_reduction(self, expr): - from loopy.kernel.reduction_callable import CallableReduction - from loopy.kernel.function_interface import CallableOnScalar + from loopy.kernel.function_interface import (CallableOnScalar, + CallableReduction) from loopy.symbolic import Reduction - # Refer to map_reduction subroutine of FunctionScoper. + # Refer to `map_reduction` subroutine of `FunctionScoper`. assert expr.function.name[-7:] == "_reduce" callable_reduction = CallableReduction(expr.function.name[:-7]) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 4168f647a..9111aebab 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -1,5 +1,26 @@ from __future__ import division, absolute_import +__copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + import re import six @@ -83,7 +104,7 @@ class ArrayArgDescriptor(ArgDescriptor): # }}} -# {{{ kw_to_pos +# {{{ helper function for callable kenrel -- kw_to_pos def get_kw_pos_association(kernel): kw_to_pos = {} @@ -109,7 +130,6 @@ def get_kw_pos_association(kernel): # {{{ template class - class InKernelCallable(ImmutableRecord): """ @@ -634,29 +654,29 @@ def next_indexed_name(name): num=int(match.group('num'))+1) -class FunctionScopeChanger(IdentityMapper): - # TODO: Make it sophisticated as in I don't like the if-else systems. Needs - # something else. - # Explain what this is doing. - # The name should be more like "NameChanger" more like "GameChanger" LOl. - # Wow my jokes are baaad. Anyways back to work!! +class ScopedFunctionNameChanger(IdentityMapper): + """ + Mapper that takes in a mapping `expr_to_new_names` and maps the + corresponding expression to the new names, which correspond to the names in + `kernel.scoped_functions`. + """ - def __init__(self, new_names): - self.new_names = new_names + def __init__(self, expr_to_new_names): + self.expr_to_new_names = expr_to_new_names def map_call(self, expr): - if expr in self.new_names: + if expr in self.expr_to_new_names: return type(expr)( - ScopedFunction(self.new_names[expr]), + ScopedFunction(self.expr_to_new_names[expr]), tuple(self.rec(child) for child in expr.parameters)) else: return IdentityMapper.map_call(self, expr) def map_call_with_kwargs(self, expr): - if expr in self.new_names: + if expr in self.expr_to_new_names: return type(expr)( - ScopedFunction(self.new_names[expr]), + ScopedFunction(self.expr_to_new_names[expr]), tuple(self.rec(child) for child in expr.parameters), dict( @@ -669,9 +689,9 @@ class FunctionScopeChanger(IdentityMapper): def map_reduction(self, expr): from loopy.symbolic import Reduction - if self.new_names: + if self.expr_to_new_names: return Reduction( - ScopedFunction(self.new_names[expr]), + ScopedFunction(self.expr_to_new_names[expr]), tuple(expr.inames), self.rec(expr.expr), allow_simultaneous=expr.allow_simultaneous) @@ -680,8 +700,8 @@ class FunctionScopeChanger(IdentityMapper): def register_pymbolic_calls_to_knl_callables(kernel, - pymbolic_calls_to_knl_callables): - """ Takes in a mapping :arg:`pymbolic_calls_to_knl_callables` and returns a + pymbolic_exprs_to_knl_callables): + """ Takes in a mapping :arg:`pymbolic_exprs_to_knl_callables` and returns a new kernel which includes an association with the given pymbolic calls to instances of :class:`InKernelCallable` """ @@ -696,7 +716,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, # corresponding pymbolic call pymbolic_calls_to_new_names = {} - for pymbolic_call, in_knl_callable in pymbolic_calls_to_knl_callables.items(): + for pymbolic_call, in_knl_callable in pymbolic_exprs_to_knl_callables.items(): # checking if such a in-kernel callable already exists. if in_knl_callable not in scoped_functions_to_names: # No matching in_knl_callable found => make a new one with a new @@ -722,7 +742,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, # Using the data populated in pymbolic_calls_to_new_names to change the # names of the scoped functions of all the calls in the kernel. 
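    # (For example, if two call sites of "sin" end up specialized to
    # different dtypes, the second specialization is registered under a
    # fresh name from next_indexed_name, something like "sin_0", and only
    # that call site's ScopedFunction is renamed here.)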
new_insns = [] - scope_changer = FunctionScopeChanger(pymbolic_calls_to_new_names) + scope_changer = ScopedFunctionNameChanger(pymbolic_calls_to_new_names) for insn in kernel.instructions: if isinstance(insn, (MultiAssignmentBase, CInstruction)): expr = scope_changer(insn.expression) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 8950f1590..bc4c84524 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2384,7 +2384,7 @@ def specializing_incomplete_callables(kernel): inferred_functions = {} for insn in kernel.instructions: - if isinstance(insn, (MultiAssignmentBase, CallInstruction)): + if isinstance(insn, (MultiAssignmentBase, CInstruction)): expr = subst_expander(insn.expression) if not ready_for_codegen(expr): # only trying to specialize the functions which are not ready diff --git a/test/test_transform.py b/test/test_transform.py index 2f98fe34d..b01024f23 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -182,6 +182,54 @@ def test_add_barrier(ctx_factory): assert (np.linalg.norm(out-2*a.T) < 1e-16) +def test_register_knl(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + grandchild_knl = lp.make_kernel( + "{[i, j]:0<= i, j< 16}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """) + + child_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) + """) + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + child_knl = lp.register_callable_kernel( + child_knl, 'linear_combo1', grandchild_knl) + knl = lp.register_callable_kernel( + parent_knl, 'linear_combo2', child_knl) + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From f85423c023a5e83d4a0d4c7a59cab60874f21c07 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 26 Mar 2018 15:59:37 -0500 Subject: [PATCH 036/916] Fix Flake8 --- loopy/kernel/creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 097a9b749..b8100f3ab 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1993,7 +1993,7 @@ def scope_functions(kernel): else: raise NotImplementedError("scope_functions not implemented for %s" % type(insn)) - + # Need to combine the scoped functions into a dict scoped_function_dict = dict(scoped_functions) return kernel.copy(instructions=new_insns, scoped_functions=scoped_function_dict) -- GitLab From 735ec7b79dfdb8fcfa0e90e5e33a7c9c8160eb57 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 26 Mar 2018 16:28:33 -0500 Subject: [PATCH 037/916] Minor changes --- loopy/codegen/__init__.py | 2 +- loopy/codegen/auxiliary_kernels.py | 2 +- loopy/kernel/__init__.py | 15 ++++++++------- loopy/kernel/creation.py | 2 +- loopy/library/random123.py | 1 + 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 57bf4c6a8..4d847612b 100644 --- a/loopy/codegen/__init__.py +++ 
b/loopy/codegen/__init__.py @@ -513,7 +513,7 @@ def generate_code_v2(kernel): _DataObliviousInstruction)): pass else: - raise NotImplementedError("register_knl not made for %s type of" + raise NotImplementedError("register_knl not made for %s type of " "instruciton" % (str(type(insn)))) # }}} diff --git a/loopy/codegen/auxiliary_kernels.py b/loopy/codegen/auxiliary_kernels.py index 799ab59bf..6c4166bd3 100644 --- a/loopy/codegen/auxiliary_kernels.py +++ b/loopy/codegen/auxiliary_kernels.py @@ -153,7 +153,7 @@ def generate_auxiliary_kernel_device_code(kernel, target): _DataObliviousInstruction)): pass else: - raise NotImplementedError("register_knl not made for %s type of" + raise NotImplementedError("register_knl not made for %s type of " "instruciton" % (str(type(insn)))) # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 25737786c..b87e55ca9 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -143,7 +143,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): to instances of :class:`loopy.kernel.data.IndexTag`. .. attribute:: function_manglers - .. attribute:: function_identifiers .. attribute:: symbol_manglers .. attribute:: substitutions @@ -201,7 +200,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): default_function_mangler, single_arg_function_mangler, ], - function_identifiers=set(), scoped_functions={}, symbol_manglers=[], @@ -268,10 +266,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT - # Populating the function identifiers based on the target and the default - # function identifiers - function_identifiers = target.get_device_ast_builder().function_identifiers() - ImmutableRecordWithoutPickling.__init__(self, domains=domains, instructions=instructions, @@ -291,7 +285,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, function_manglers=function_manglers, - function_identifiers=function_identifiers, scoped_functions=scoped_functions, symbol_manglers=symbol_manglers, index_dtype=index_dtype, @@ -350,6 +343,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} + # {{{ target function identifiers + + @property + def function_identifiers(self): + return self.target.get_device_ast_builder().function_identifiers() + + # }}} + # {{{ symbol mangling def mangle_symbol(self, ast_builder, identifier): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index b8100f3ab..b97639c91 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1974,7 +1974,7 @@ class ScopedFunctionCollector(CombineMapper): def scope_functions(kernel): - func_ids = kernel.function_identifiers.copy() + func_ids = kernel.function_identifiers from loopy.kernel.instruction import CInstruction, _DataObliviousInstruction function_scoper = FunctionScoper(func_ids) diff --git a/loopy/library/random123.py b/loopy/library/random123.py index b28d11ba6..5cc3dd9ce 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -225,6 +225,7 @@ def random123_function_mangler(kernel, name, arg_dtypes): def random123_with_types(in_knl_callable, arg_id_to_dtype, target): + # FIXME: Translate the mangler to this. 
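    # (Presumably along the lines of opencl_with_types earlier in this
    # series: take the callable plus arg_id_to_dtype, look the name up in
    # FUNC_NAMES_TO_RNG, and return a dtype-specialized copy instead of
    # going through random123_function_mangler.)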
name = in_knl_callable.name if name not in FUNC_NAMES_TO_RNG: -- GitLab From 1fcd98c91758e3c02d5bcb1cd9be1de0021c38a1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 26 Mar 2018 16:41:09 -0500 Subject: [PATCH 038/916] Added docstrings explaing `hidden_functions` --- loopy/library/reduction.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index f1c5607fe..d2a4e90ac 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -52,6 +52,13 @@ class ReductionOperation(object): raise NotImplementedError def hidden_function(self): + """ + A reduction may result into a scalar callable during the codegen phase. + This function would return an instance of :class:`str` to scope such + functions that may result during "realize_reduction". For example: + `reduce(max(...))` results into another callable `max(a, b)` which is + the "hidden function" the operation is pointing to. + """ return None def __hash__(self): -- GitLab From da2d437d0e2ec914e841adc6241b45d5578790ec Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 27 Mar 2018 01:15:09 -0500 Subject: [PATCH 039/916] Added support for slices for arguments with known shapes --- loopy/kernel/creation.py | 123 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index b97639c91..69767d5e6 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -27,12 +27,14 @@ THE SOFTWARE. import numpy as np from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids -from loopy.symbolic import IdentityMapper, WalkMapper, CombineMapper +from loopy.symbolic import IdentityMapper, WalkMapper, CombineMapper, SubArrayRef from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, SubstitutionRule) +from loopy.kernel.instruction import CInstruction, _DataObliviousInstruction from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -498,7 +500,7 @@ def parse_insn(groups, insn_options): if isinstance(inner_lhs_i, Lookup): inner_lhs_i = inner_lhs_i.aggregate - from loopy.symbolic import LinearSubscript, SubArrayRef + from loopy.symbolic import LinearSubscript if isinstance(inner_lhs_i, Variable): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): @@ -2001,6 +2003,119 @@ def scope_functions(kernel): # }}} +# {{{ slice to sub array ref + +def get_slice_params(expr, domain_length): + """ + Either reads the params from the slice or initiates the value to defaults. + """ + start, stop, step = expr.start, expr.stop, expr.step + + if start is None: + start = 0 + + if stop is None: + stop = domain_length + + if step is None: + step = 1 + + return start, stop, step + + +class SliceToInameReplacer(IdentityMapper): + """ + Mapper that converts slices to instances of :class:`SubArrayRef`. 
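    For example, for an argument of shape (16, 16, 16) an access like
    ``x[i, :, k]`` is (roughly) rewritten to the sub-array reference
    ``[islice]: x[i, islice, k]``, with the extra domain ``0 <= islice < 16``
    recorded in :attr:`iname_domains`.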
+ """ + def __init__(self, knl, var_name_gen): + self.var_name_gen = var_name_gen + self.knl = knl + self.iname_domains = {} + + def map_subscript(self, expr): + updated_index = [] + swept_inames = [] + for i, index in enumerate(expr.index_tuple): + if isinstance(index, Slice): + unique_var_name = self.var_name_gen(based_on="islice") + if expr.aggregate.name in self.knl.arg_dict: + domain_length = self.knl.arg_dict[expr.aggregate.name].shape[i] + elif expr.aggregate.name in self.knl.temporary_variables: + domain_length = self.knl.temporary_variables[ + expr.aggregate.name].shape[i] + else: + raise LoopyError("Slice notation is only supported for " + "variables whose shapes are known at creation time " + "-- maybe add the shape for the sliced argument.") + start, stop, step = get_slice_params( + index, domain_length) + self.iname_domains[unique_var_name] = (start, stop, step) + + updated_index.append(step*Variable(unique_var_name)) + swept_inames.append(Variable(unique_var_name)) + else: + updated_index.append(index) + + if swept_inames: + return SubArrayRef(tuple(swept_inames), Subscript( + self.rec(expr.aggregate), + self.rec(tuple(updated_index)))) + else: + return IdentityMapper.map_subscript(self, expr) + + def get_iname_domain_as_isl_set(self): + """ + Returns the extra domain constraints imposed by the slice inames. + """ + if not self.iname_domains: + return None + + ctx = self.knl.isl_context + space = isl.Space.create_from_names(ctx, + set=list(self.iname_domains.keys())) + iname_set = isl.BasicSet.universe(space) + + for iname, (start, stop, step) in self.iname_domains.items(): + iname_set = (iname_set + .add_constraint(isl.Constraint.ineq_from_names(space, {1: + -start, iname: step})) + .add_constraint(isl.Constraint.ineq_from_names(space, {1: + stop-1, iname: -step}))) + + return iname_set + + +def realize_slices_as_sub_array_refs(kernel): + """ + Transformation that returns a kernel with the instances of + :class:`pymbolic.primitives.Slice` to `loopy.symbolic.SubArrayRef` + """ + unique_var_name_generator = kernel.get_var_name_generator() + slice_replacer = SliceToInameReplacer(kernel, unique_var_name_generator) + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, (MultiAssignmentBase, CInstruction)): + new_expr = slice_replacer(insn.expression) + new_insns.append(insn.copy(expression=new_expr)) + elif isinstance(insn, _DataObliviousInstruction): + new_insns.append(insn) + else: + raise NotImplementedError("parse_slices not implemented for %s" % + type(insn)) + + slice_iname_domains = slice_replacer.get_iname_domain_as_isl_set() + + if slice_iname_domains: + d1, d2 = isl.align_two(kernel.domains[0], slice_iname_domains) + return kernel.copy(domains=[d1 & d2], + instructions=new_insns) + else: + return kernel.copy(instructions=new_insns) + +# }}} + + # {{{ kernel creation top-level def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): @@ -2298,6 +2413,10 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_nonexistent_iname_deps(knl) knl = create_temporaries(knl, default_order) + + # Convert slices to iname domains + knl = realize_slices_as_sub_array_refs(knl) + # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- -- GitLab From 535a8755cdbd73f2467d813f67b1c53a3bb16a27 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 27 Mar 2018 01:23:58 -0500 Subject: [PATCH 040/916] Added a 
test for slice --- test/test_transform.py | 43 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/test/test_transform.py b/test/test_transform.py index b01024f23..ea7237633 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -230,6 +230,49 @@ def test_register_knl(ctx_factory): np.linalg.norm(2*x+3*y))) < 1e-15 +def test_slices(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + child_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """) + + parent_knl = lp.make_kernel( + "{[i, k, m]: 0<=i, k, m<16}", + """ + z[i, :, k, :, m] = linear_combo(x[i, :, k, :, m], y[i, :, k, :, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='z', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, 'linear_combo', child_knl) + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 8f61e63ece310b820dab6380eee194a0fe43f94b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 27 Mar 2018 09:01:40 -0500 Subject: [PATCH 041/916] Supports slices. --- loopy/kernel/creation.py | 12 ++++++++---- loopy/kernel/instruction.py | 21 +++++++++++++-------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 69767d5e6..0bc3d5bc2 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -34,7 +34,8 @@ from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, SubstitutionRule) -from loopy.kernel.instruction import CInstruction, _DataObliviousInstruction +from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, + CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -2095,10 +2096,13 @@ def realize_slices_as_sub_array_refs(kernel): new_insns = [] for insn in kernel.instructions: - if isinstance(insn, (MultiAssignmentBase, CInstruction)): + if isinstance(insn, CallInstruction): new_expr = slice_replacer(insn.expression) - new_insns.append(insn.copy(expression=new_expr)) - elif isinstance(insn, _DataObliviousInstruction): + new_assignees = slice_replacer(insn.assignees) + new_insns.append(insn.copy(assignees=new_assignees, + expression=new_expr)) + elif isinstance(insn, (CInstruction, MultiAssignmentBase, + _DataObliviousInstruction)): new_insns.append(insn) else: raise NotImplementedError("parse_slices not implemented for %s" % diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index d9b6384c8..d2d0c5457 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1046,22 +1046,27 @@ class CallInstruction(MultiAssignmentBase): # }}} +def subscript_contains_slice(subscript): + from pymbolic.primitives import Subscript, Slice + assert isinstance(subscript, Subscript) + return any(isinstance(index, Slice) for index in subscript.index_tuple) + + def is_array_call(assignees, expression): - from pymbolic.primitives import Call, CallWithKwargs + from pymbolic.primitives import Call, CallWithKwargs, Subscript from 
loopy.symbolic import SubArrayRef if not isinstance(expression, (Call, CallWithKwargs)): return False - for assignee in assignees: - if isinstance(assignee, SubArrayRef): - return True - - for par in expression.parameters: - if isinstance(assignee, SubArrayRef): + for par in expression.parameters+assignees: + if isinstance(par, SubArrayRef): return True + elif isinstance(par, Subscript): + if subscript_contains_slice(par): + return True - # did not encounter SubArrayRef, hence must be a normal call + # did not encounter SubArrayRef/Slice, hence must be a normal call return False -- GitLab From 334ab645c00c7bb2255c826c0cf7956f23695ae5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 27 Mar 2018 09:57:23 -0500 Subject: [PATCH 042/916] Fixes minor error regarding realizing simil_reduce, reduce --- loopy/preprocess.py | 10 +++++++++- loopy/symbolic.py | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index bc4c84524..f6bf6ab88 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2141,14 +2141,20 @@ class UnScopedCallCollector(CombineMapper): map_variable = map_constant map_function_symbol = map_constant + map_tagged_variable = map_constant def check_functions_are_scoped(kernel): """ Checks if all the calls in the instruction expression have been scoped, otherwise indicate to what all calls we await signature. """ + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + for insn in kernel.instructions: - unscoped_calls = UnScopedCallCollector()(insn.expression) + unscoped_calls = UnScopedCallCollector()(subst_expander( + insn.expression)) if unscoped_calls: raise LoopyError("Unknown function '%s' obtained -- register a function" " or a kernel corresponding to it." % set(unscoped_calls).pop()) @@ -2278,6 +2284,7 @@ class ArgDescriptionInferer(CombineMapper): map_variable = map_constant map_function_symbol = map_constant + map_tagged_variable = map_constant def infer_arg_descr(kernel): @@ -2355,6 +2362,7 @@ class ReadyForCodegen(CombineMapper): map_variable = map_constant map_function_symbol = map_constant + map_tagged_variable = map_constant def specializing_incomplete_callables(kernel): diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 831bab5c2..62de58e76 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1192,12 +1192,12 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: raise TypeError("cse takes two arguments") - elif name in set(["reduce, simul_reduce"]): + elif name in ["reduce", "simul_reduce"]: if len(expr.parameters) >= 3: function, inames = expr.parameters[:2] red_exprs = expr.parameters[2:] - return self._parse_reduction(str(function), inames, + return self._parse_reduction(str(function.name), inames, tuple(self.rec(red_expr) for red_expr in red_exprs), allow_simultaneous=(name == "simul_reduce")) else: -- GitLab From f56be725e739f5477f85742ab2919e179de83091 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 28 Mar 2018 21:25:41 -0500 Subject: [PATCH 043/916] Removed a FIXME comment which has already been handled. 
--- loopy/type_inference.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 8df9773a9..1b5edae41 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -679,17 +679,6 @@ def infer_unknown_types(kernel, expect_completion=False): args=[new_arg_dict[arg.name] for arg in kernel.args], ) - #------------------------------------------------------------------------ - # KK: - # FIXME: - # for example if an instruction is : - # `[i]:z[i] = a_kernel_function([j]:x[j], [k]: y[k])` - # and if the user already provided the types of the args: x, y, z. - # Then the instruction would not go through the TypeInferenceMapper and hence - # the function: `a_kernel_function` would not undergo type specialization, - # which would create problems in the future. - #------------------------------------------------------------------------ - from loopy.kernel.function_interface import ( register_pymbolic_calls_to_knl_callables) return register_pymbolic_calls_to_knl_callables( -- GitLab From cd690f8ed66870516ec667a3121d4c3830c439b3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Mar 2018 13:53:21 -0500 Subject: [PATCH 044/916] no more pytest cache --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e4a64f214..6cac4589a 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ htmlcov .ipynb_checkpoints lextab.py yacctab.py +.pytest_cache/* loopy/_git_rev.py -- GitLab From a2b1821186880faf7a414264759bf6ed28242050 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Mar 2018 17:25:14 -0500 Subject: [PATCH 045/916] Handles substitutions/precompute --- loopy/kernel/creation.py | 13 +++- loopy/kernel/function_interface.py | 97 ++++++++++++++++++++---------- 2 files changed, 76 insertions(+), 34 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 0bc3d5bc2..1379d726f 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1974,6 +1974,7 @@ class ScopedFunctionCollector(CombineMapper): map_variable = map_constant map_function_symbol = map_constant + map_tagged_variable = map_constant def scope_functions(kernel): @@ -1997,9 +1998,19 @@ def scope_functions(kernel): raise NotImplementedError("scope_functions not implemented for %s" % type(insn)) + scoped_substitutions = {} + + for name, rule in kernel.substitutions.items(): + scoped_rule = rule.copy( + expression=function_scoper(rule.expression)) + scoped_substitutions[name] = scoped_rule + scoped_functions.update(scoped_function_collector(scoped_rule.expression)) + # Need to combine the scoped functions into a dict scoped_function_dict = dict(scoped_functions) - return kernel.copy(instructions=new_insns, scoped_functions=scoped_function_dict) + return kernel.copy(instructions=new_insns, + scoped_functions=scoped_function_dict, + substitutions=scoped_substitutions) # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 9111aebab..852b9ee1d 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -29,10 +29,13 @@ from six.moves import zip from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction) +from pymbolic.primitives import Variable +from loopy.symbolic import parse_tagged_name -from loopy.symbolic import IdentityMapper, ScopedFunction + +from loopy.symbolic import (IdentityMapper, 
ScopedFunction, + SubstitutionRuleMappingContext, RuleAwareIdentityMapper, + SubstitutionRuleExpander) # {{{ argument descriptors @@ -654,49 +657,82 @@ def next_indexed_name(name): num=int(match.group('num'))+1) -class ScopedFunctionNameChanger(IdentityMapper): +class ScopedFunctionNameChanger(RuleAwareIdentityMapper): """ Mapper that takes in a mapping `expr_to_new_names` and maps the corresponding expression to the new names, which correspond to the names in `kernel.scoped_functions`. """ - def __init__(self, expr_to_new_names): + def __init__(self, rule_mapping_context, expr_to_new_names, subst_expander): + super(ScopedFunctionNameChanger, self).__init__(rule_mapping_context) self.expr_to_new_names = expr_to_new_names - - def map_call(self, expr): - if expr in self.expr_to_new_names: - return type(expr)( - ScopedFunction(self.expr_to_new_names[expr]), - tuple(self.rec(child) - for child in expr.parameters)) + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + if not isinstance(expr.function, Variable): + return IdentityMapper.map_call(self, expr, expn_state) + + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.expr_to_new_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expr]), + tuple(self.rec(child) + for child in expr.parameters)) + elif expanded_expr in self.expr_to_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expanded_expr]), + tuple(self.rec(child) + for child in expr.parameters)) + else: + return IdentityMapper.map_call(self, expr) else: - return IdentityMapper.map_call(self, expr) + return self.map_substitution(name, tag, expr.parameters, expn_state) - def map_call_with_kwargs(self, expr): + def map_call_with_kwargs(self, expr, expn_state): + expanded_expr = self.subst_expander(expr) if expr in self.expr_to_new_names: return type(expr)( ScopedFunction(self.expr_to_new_names[expr]), - tuple(self.rec(child) + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + elif expanded_expr in self.expr_to_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) for child in expr.parameters), dict( - (key, self.rec(val)) + (key, self.rec(val, expn_state)) for key, val in six.iteritems(expr.kw_parameters)) ) else: return IdentityMapper.map_call_with_kwargs(self, expr) - def map_reduction(self, expr): + def map_reduction(self, expr, expn_state): from loopy.symbolic import Reduction + expanded_expr = self.subst_expander(expr) - if self.expr_to_new_names: + if expr in self.expr_to_new_names: return Reduction( ScopedFunction(self.expr_to_new_names[expr]), tuple(expr.inames), - self.rec(expr.expr), + self.rec(expr.expr, expn_state), + allow_simultaneous=expr.allow_simultaneous) + elif expanded_expr in self.expr_to_new_names: + return Reduction( + ScopedFunction(self.expr_to_new_names[expanded_expr]), + tuple(expr.inames), + self.rec(expr.expr, expn_state), allow_simultaneous=expr.allow_simultaneous) else: - return IdentityMapper.map_reduction(self, expr) + return IdentityMapper.map_reduction(self, expr, expn_state) def register_pymbolic_calls_to_knl_callables(kernel, @@ -741,19 +777,14 @@ def register_pymbolic_calls_to_knl_callables(kernel, # Using the data populated in pymbolic_calls_to_new_names to change the # names of the scoped 
functions of all the calls in the kernel. - new_insns = [] - scope_changer = ScopedFunctionNameChanger(pymbolic_calls_to_new_names) - for insn in kernel.instructions: - if isinstance(insn, (MultiAssignmentBase, CInstruction)): - expr = scope_changer(insn.expression) - new_insns.append(insn.copy(expression=expr)) - elif isinstance(insn, _DataObliviousInstruction): - new_insns.append(insn) - else: - raise NotImplementedError("Type Inference Specialization not" - "implemented for %s instruciton" % type(insn)) - return kernel.copy(scoped_functions=scoped_names_to_functions, - instructions=new_insns) + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + scope_changer = ScopedFunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + scoped_kernel = scope_changer.map_kernel(kernel) + + return scoped_kernel.copy(scoped_functions=scoped_names_to_functions) # }}} -- GitLab From 0d98db9831bda0983fe0c272f97b50fed7d20591 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Mar 2018 17:34:48 -0500 Subject: [PATCH 046/916] Fixes minor typo in ScopeFunctionCollector --- loopy/kernel/function_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 852b9ee1d..eb63d364c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -682,7 +682,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): ScopedFunction(self.expr_to_new_names[expr]), tuple(self.rec(child) for child in expr.parameters)) - elif expanded_expr in self.expr_to_names: + elif expanded_expr in self.expr_to_new_names: return type(expr)( ScopedFunction(self.expr_to_new_names[expanded_expr]), tuple(self.rec(child) @@ -703,7 +703,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): (key, self.rec(val, expn_state)) for key, val in six.iteritems(expr.kw_parameters)) ) - elif expanded_expr in self.expr_to_names: + elif expanded_expr in self.expr_to_new_names: return type(expr)( ScopedFunction(self.expr_to_new_names[expanded_expr]), tuple(self.rec(child, expn_state) -- GitLab From 9daa667cfcddcc229395befcfb27045409d5696a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Mar 2018 18:02:04 -0500 Subject: [PATCH 047/916] Changes in TypeInference in order to handle tests --- loopy/type_inference.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 1b5edae41..9ffdb983e 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -475,7 +475,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types, None + return None, type_inf_mapper.symbols_with_unknown_types, {} result = type_inf_mapper.combine(dtype_sets) @@ -630,8 +630,7 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") - specialized_functions = {**specialized_functions, - **new_specialized_functions} + specialized_functions.update(new_specialized_functions) else: debug(" failure") -- GitLab From 9bcf27ba6d432e94a4a97fafac15d7a95dbbd085 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Mar 2018 18:27:28 -0500 Subject: [PATCH 048/916] 
TODO for replacing the inplace updates in a dictionary --- loopy/preprocess.py | 7 +++++-- loopy/type_inference.py | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f6bf6ab88..2ed004e07 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2230,12 +2230,15 @@ class ArgDescriptionInferer(CombineMapper): assignee_id_to_descr[-i-1] = ValueArgDescriptor() # gathering all the descriptors - combined_arg_id_to_dtype = {**arg_id_to_descr, **assignee_id_to_descr} + # TODO: I dont like in place updates. Change this to somthing else. + # Perhaps make a function? + combined_arg_id_to_descr = arg_id_to_descr.copy() + combined_arg_id_to_descr.update(assignee_id_to_descr) # specializing the function according to the parameter description new_scoped_function = ( self.kernel.scoped_functions[expr.function.name].with_descrs( - combined_arg_id_to_dtype)) + combined_arg_id_to_descr)) # collecting the descriptors for args, kwargs, assignees return (frozenset(((expr, new_scoped_function), )) | diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 9ffdb983e..861e59852 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -630,6 +630,9 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + # TODO: I dont like in place updates. Change this to something + # else. Perhaps add a function for doing this, which does it + # using a bunch of copies? specialized_functions.update(new_specialized_functions) else: debug(" failure") -- GitLab From 665eafb120922f444b31dcb669057c3c2bd9a122 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Mar 2018 19:08:34 -0500 Subject: [PATCH 049/916] Syntax changes in order to comply with python 2 --- loopy/preprocess.py | 5 ++++- loopy/type_inference.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 2ed004e07..7b05efd0b 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2270,7 +2270,10 @@ class ArgDescriptionInferer(CombineMapper): assignee_id_to_descr[-i-1] = ValueArgDescriptor() # gathering all the descriptors - combined_arg_id_to_descr = {**arg_id_to_descr, **assignee_id_to_descr} + # TODO: I dont like in place updates. Change this to somthing else. + # Perhaps make a function? + combined_arg_id_to_descr = arg_id_to_descr.copy() + combined_arg_id_to_descr.update(assignee_id_to_descr) # specializing the function according to the parameter description new_scoped_function = ( diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 861e59852..2d35d7cfa 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -630,7 +630,7 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") - # TODO: I dont like in place updates. Change this to something + # TODO: I dont like in-place updates. Change this to something # else. Perhaps add a function for doing this, which does it # using a bunch of copies? 
specialized_functions.update(new_specialized_functions) -- GitLab From 0bfbd6996ecb971f3fc67c7be1a276b3d54700cf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Mar 2018 19:41:31 -0500 Subject: [PATCH 050/916] Inplace dict update./ --- loopy/preprocess.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 7b05efd0b..812f6d265 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2404,8 +2404,7 @@ def specializing_incomplete_callables(kernel): # only trying to specialize the functions which are not ready # for codegen type_inf_mapper(expr) - inferred_functions = {**inferred_functions, - **type_inf_mapper.specialized_functions} + inferred_functions.update(type_inf_mapper.specialized_functions) elif isinstance(insn, (_DataObliviousInstruction)): pass -- GitLab From 7095ac70bd25e1f0f4d99545d18bd70c3c633ce5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Mar 2018 21:26:23 -0500 Subject: [PATCH 051/916] Resolving the type inference error, by passing an empty dictionary --- loopy/type_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 2d35d7cfa..3128a1d52 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -448,7 +448,7 @@ class TypeInferenceMapper(CombineMapper): def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): - return [kernel.index_dtype], [] + return [kernel.index_dtype], [], {} from functools import partial debug = partial(_debug, kernel) @@ -475,7 +475,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types, {} + return None, type_inf_mapper.symbols_with_unknown_types, None result = type_inf_mapper.combine(dtype_sets) -- GitLab From 36790774a06ac49cd42126a811ce5a1ba243e308 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Mar 2018 22:02:00 -0500 Subject: [PATCH 052/916] Adding a missing argument to IdentityMapper --- loopy/kernel/function_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index eb63d364c..d99c531ab 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -688,7 +688,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): tuple(self.rec(child) for child in expr.parameters)) else: - return IdentityMapper.map_call(self, expr) + return IdentityMapper.map_call(self, expr, expn_state) else: return self.map_substitution(name, tag, expr.parameters, expn_state) @@ -713,7 +713,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): for key, val in six.iteritems(expr.kw_parameters)) ) else: - return IdentityMapper.map_call_with_kwargs(self, expr) + return IdentityMapper.map_call_with_kwargs(self, expr, expn_state) def map_reduction(self, expr, expn_state): from loopy.symbolic import Reduction -- GitLab From b2c5e712c4598486eaa0530c1ca7cff1e181ea81 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 14:01:23 -0500 Subject: [PATCH 053/916] Handling different instruction types in check_functions_are_scoped --- loopy/preprocess.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 812f6d265..0857a5e72 100644 --- a/loopy/preprocess.py +++ 
b/loopy/preprocess.py @@ -2153,8 +2153,15 @@ def check_functions_are_scoped(kernel): subst_expander = SubstitutionRuleExpander(kernel.substitutions) for insn in kernel.instructions: - unscoped_calls = UnScopedCallCollector()(subst_expander( - insn.expression)) + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnScopedCallCollector()(subst_expander( + insn.expression)) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("check_function_are_scoped not " + "implemented for %s type of instruction." % type(insn)) + if unscoped_calls: raise LoopyError("Unknown function '%s' obtained -- register a function" " or a kernel corresponding to it." % set(unscoped_calls).pop()) -- GitLab From fc4cb54f28b9cc21cf349c360b52922dafdf9d01 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 14:26:34 -0500 Subject: [PATCH 054/916] Fixes minor error --- loopy/preprocess.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 0857a5e72..4309f9ae1 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2156,15 +2156,16 @@ def check_functions_are_scoped(kernel): if isinstance(insn, MultiAssignmentBase): unscoped_calls = UnScopedCallCollector()(subst_expander( insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' obtained -- register a " + "function or a kernel corresponding to it." % + set(unscoped_calls).pop()) elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): pass else: raise NotImplementedError("check_function_are_scoped not " "implemented for %s type of instruction." % type(insn)) - if unscoped_calls: - raise LoopyError("Unknown function '%s' obtained -- register a function" - " or a kernel corresponding to it." 
% set(unscoped_calls).pop()) # }}} -- GitLab From dd2e1c047eb394244f2c2ed094a6122659877c2d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 15:25:23 -0500 Subject: [PATCH 055/916] Fixes error to collect scoped functions within a reduction expre --- loopy/kernel/creation.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1379d726f..883db10dc 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1962,12 +1962,16 @@ class ScopedFunctionCollector(CombineMapper): hidden_function = callable_reduction.operation.hidden_function() if hidden_function is not None: - return frozenset([(expr.function.name, - callable_reduction), (hidden_function, - CallableOnScalar(hidden_function))]) + + return ( + frozenset([(expr.function.name, callable_reduction), + (hidden_function, CallableOnScalar(hidden_function))]) | + self.rec(expr.expr)) else: - return frozenset([(expr.function.name, - callable_reduction)]) + return ( + frozenset([(expr.function.name, + callable_reduction)]) | + self.rec(expr.expr)) def map_constant(self, expr): return frozenset() -- GitLab From 145c175581663c574fad14714d99fb2ba4d49697 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 15:26:26 -0500 Subject: [PATCH 056/916] Passed an expn_state to ScopefFunctoinNameChanger --- loopy/kernel/function_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index d99c531ab..c71280520 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -680,12 +680,12 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): if expr in self.expr_to_new_names: return type(expr)( ScopedFunction(self.expr_to_new_names[expr]), - tuple(self.rec(child) + tuple(self.rec(child, expn_state) for child in expr.parameters)) elif expanded_expr in self.expr_to_new_names: return type(expr)( ScopedFunction(self.expr_to_new_names[expanded_expr]), - tuple(self.rec(child) + tuple(self.rec(child, expn_state) for child in expr.parameters)) else: return IdentityMapper.map_call(self, expr, expn_state) -- GitLab From 05f7d0cfea90ecf8d933e9ec359ac2f2eeda4206 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 16:27:17 -0500 Subject: [PATCH 057/916] adds ability to call scope_functions at any point of the loopy pipeline --- loopy/kernel/creation.py | 48 ++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 883db10dc..3a2f888f8 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1855,7 +1855,8 @@ class FunctionScoper(IdentityMapper): def map_call(self, expr): from loopy.symbolic import ScopedFunction - if expr.function.name in self.function_ids: + if not isinstance(expr.function, ScopedFunction) and ( + expr.function.name in self.function_ids): # The function is one of the known function hence scoping it. 
from pymbolic.primitives import Call @@ -1868,9 +1869,10 @@ class FunctionScoper(IdentityMapper): return IdentityMapper.map_call(self, expr) def map_call_with_kwargs(self, expr): - if expr.function.name in self.function_ids: + from loopy.symbolic import ScopedFunction + if not isinstance(expr.function, ScopedFunction) and ( + expr.function.name in self.function_ids): from pymbolic.primitives import CallWithKwargs - from loopy.symbolic import ScopedFunction return CallWithKwargs( ScopedFunction(expr.function.name), tuple(self.rec(child) @@ -1887,6 +1889,10 @@ class FunctionScoper(IdentityMapper): from pymbolic.primitives import Variable from loopy.symbolic import ScopedFunction + if isinstance(expr.function, ScopedFunction): + # we have already scoped this function. + return IdentityMapper.map_reduction(self, expr) + mapped_inames = [self.rec(Variable(iname)) for iname in expr.inames] new_inames = [] @@ -1915,13 +1921,20 @@ class ScopedFunctionCollector(CombineMapper): """ This mapper would collect all the instances of :class:`ScopedFunction` occurring in the expression and written all of them as a :class:`set`. """ + def __init__(self, already_scoped_functions={}): + self.already_scoped_functions = already_scoped_functions + def combine(self, values): import operator return reduce(operator.or_, values, frozenset()) def map_scoped_function(self, expr): from loopy.kernel.function_interface import CallableOnScalar - return frozenset([(expr.name, CallableOnScalar(expr.name))]) + if expr.name in self.already_scoped_functions: + # functions is already scoped + return frozenset() + else: + return frozenset([(expr.name, CallableOnScalar(expr.name))]) def map_reduction(self, expr): from loopy.kernel.function_interface import (CallableOnScalar, @@ -1931,6 +1944,10 @@ class ScopedFunctionCollector(CombineMapper): # Refer to `map_reduction` subroutine of `FunctionScoper`. 
assert expr.function.name[-7:] == "_reduce" + if expr.function.name in self.already_scoped_functions: + # the function is already scoped + return self.rec(expr.expr) + callable_reduction = CallableReduction(expr.function.name[:-7]) # sanity checks @@ -1962,7 +1979,6 @@ class ScopedFunctionCollector(CombineMapper): hidden_function = callable_reduction.operation.hidden_function() if hidden_function is not None: - return ( frozenset([(expr.function.name, callable_reduction), (hidden_function, CallableOnScalar(hidden_function))]) | @@ -1986,15 +2002,17 @@ def scope_functions(kernel): from loopy.kernel.instruction import CInstruction, _DataObliviousInstruction function_scoper = FunctionScoper(func_ids) - scoped_function_collector = ScopedFunctionCollector() - scoped_functions = set() + scoped_function_collector = ScopedFunctionCollector( + kernel.scoped_functions) + new_scoped_functions = set() new_insns = [] for insn in kernel.instructions: if isinstance(insn, (MultiAssignmentBase, CInstruction)): new_insn = insn.copy(expression=function_scoper(insn.expression)) - scoped_functions.update(scoped_function_collector(new_insn.expression)) + new_scoped_functions.update(scoped_function_collector( + new_insn.expression)) new_insns.append(new_insn) elif isinstance(insn, _DataObliviousInstruction): new_insns.append(insn) @@ -2002,19 +2020,21 @@ def scope_functions(kernel): raise NotImplementedError("scope_functions not implemented for %s" % type(insn)) - scoped_substitutions = {} + substitutions_with_scoped_expr = {} for name, rule in kernel.substitutions.items(): scoped_rule = rule.copy( expression=function_scoper(rule.expression)) - scoped_substitutions[name] = scoped_rule - scoped_functions.update(scoped_function_collector(scoped_rule.expression)) + substitutions_with_scoped_expr[name] = scoped_rule + new_scoped_functions.update(scoped_function_collector( + scoped_rule.expression)) # Need to combine the scoped functions into a dict - scoped_function_dict = dict(scoped_functions) + updated_scoped_functions = kernel.scoped_functions.copy() + updated_scoped_functions.update(dict(new_scoped_functions)) return kernel.copy(instructions=new_insns, - scoped_functions=scoped_function_dict, - substitutions=scoped_substitutions) + scoped_functions=updated_scoped_functions, + substitutions=substitutions_with_scoped_expr) # }}} -- GitLab From b5916208301c0da9c6d454bbb53a0162929f4f14 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 16:28:15 -0500 Subject: [PATCH 058/916] scopes functions that arise out of differentiation. --- loopy/transform/diff.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d4dcb3701..86bc056e9 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -398,7 +398,14 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", # }}} - return diff_context.get_new_kernel(), result + # Differentiation lead to addition of new functions to the kernel. + # For example differentiating `sin(x)` -> `cos(x)`. Hence we would need to + # scope `cos(x)`. 
+ from loopy.kernel.creation import scope_functions + differentiated_scoped_kernel = ( + scope_functions(diff_context.get_new_kernel())) + + return differentiated_scoped_kernel, result # }}} -- GitLab From 1bed0a254a8a430b5e03e61d321a14fe01b8842e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 16:38:06 -0500 Subject: [PATCH 059/916] Added NumpyTypes for the type inference --- loopy/target/opencl.py | 2 +- loopy/target/pyopencl.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 7ffd91309..77ae6a957 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -276,7 +276,7 @@ def opencl_with_types(in_knl_callable, arg_id_to_dtype): dtype = arg_id_to_dtype[0] scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] return in_knl_callable.copy(name_in_target=name, - arg_id_to_dtype={-1: scalar_dtype, 0: dtype, 1: dtype}) + arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}) if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 4dace7ec2..295296444 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -264,11 +264,12 @@ def pyopencl_with_types(in_knl_callable, arg_id_to_dtype): "sinh", "cosh", "tanh", "conj"]: return in_knl_callable.copy(name_in_target="%s_%s" % (tpname, name), - arg_id_to_dtype={0: dtype, -1: dtype}) + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) if name in ["real", "imag", "abs"]: return in_knl_callable.copy(name_in_target="%s_%s" % (tpname, name), - arg_id_to_dtype={0: dtype, -1: dtype.numpy_dtype.type(0).real}) + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType( + dtype.numpy_dtype.type(0).real)}) return None -- GitLab From 8f3791a0154e9228cfc32e6d8a525f1ca249511f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 17:24:57 -0500 Subject: [PATCH 060/916] Fixes minor error in identifying the NumpyType --- loopy/target/pyopencl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 295296444..2fd6af935 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -268,8 +268,8 @@ def pyopencl_with_types(in_knl_callable, arg_id_to_dtype): if name in ["real", "imag", "abs"]: return in_knl_callable.copy(name_in_target="%s_%s" % (tpname, name), - arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType( - dtype.numpy_dtype.type(0).real)}) + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}) return None -- GitLab From 137afed2153d8f943ca313d5f02602c846d72cbf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 17:25:15 -0500 Subject: [PATCH 061/916] Fixes the map_reduction according to the new reduction type --- loopy/transform/iname.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 2347cef3c..125cd9a41 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -144,7 +144,10 @@ class _InameSplitter(RuleAwareIdentityMapper): new_inames.extend([self.outer_iname, self.inner_iname]) from loopy.symbolic import Reduction - return Reduction(expr.operation, tuple(new_inames), + reduction_callable = ( + self.rule_mapping_context.kernel.scoped_functions[ + expr.function.name]) + return Reduction(reduction_callable.operation, tuple(new_inames), self.rec(expr.expr, 
expn_state), expr.allow_simultaneous) else: -- GitLab From cdb280b3ab6b7f0e52c8121020fe0ca71306d339 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 17:46:58 -0500 Subject: [PATCH 062/916] handles minor errors. --- loopy/kernel/creation.py | 4 ++-- loopy/preprocess.py | 14 ++++++++------ loopy/symbolic.py | 3 +++ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 3a2f888f8..f324645a9 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2009,12 +2009,12 @@ def scope_functions(kernel): new_insns = [] for insn in kernel.instructions: - if isinstance(insn, (MultiAssignmentBase, CInstruction)): + if isinstance(insn, MultiAssignmentBase): new_insn = insn.copy(expression=function_scoper(insn.expression)) new_scoped_functions.update(scoped_function_collector( new_insn.expression)) new_insns.append(new_insn) - elif isinstance(insn, _DataObliviousInstruction): + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): new_insns.append(insn) else: raise NotImplementedError("scope_functions not implemented for %s" % diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 4309f9ae1..8b4cfb1de 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2215,6 +2215,8 @@ class ArgDescriptionInferer(CombineMapper): def map_call(self, expr, **kwargs): from loopy.kernel.function_interface import ValueArgDescriptor from loopy.symbolic import SubArrayRef + if not isinstance(expr.function, ScopedFunction): + return CombineMapper.map_call(self, expr, **kwargs) # descriptors for the args arg_id_to_descr = dict((i, @@ -2317,10 +2319,10 @@ def infer_arg_descr(kernel): pymbolic_calls_to_functions.update( arg_description_modifier(insn.expression, assignees=insn.assignees)) - elif isinstance(insn, (MultiAssignmentBase, CInstruction)): + elif isinstance(insn, MultiAssignmentBase): pymbolic_calls_to_functions.update(arg_description_modifier( insn.expression)) - elif isinstance(insn, _DataObliviousInstruction): + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): pass else: raise NotImplementedError("arg_descr_inference for %s instruction" % @@ -2379,7 +2381,7 @@ class ReadyForCodegen(CombineMapper): map_tagged_variable = map_constant -def specializing_incomplete_callables(kernel): +def specialize_incomplete_callables(kernel): """ Transformation necessary to type-specialize the callables which are missed in type inference. For example consider: @@ -2406,7 +2408,7 @@ def specializing_incomplete_callables(kernel): inferred_functions = {} for insn in kernel.instructions: - if isinstance(insn, (MultiAssignmentBase, CInstruction)): + if isinstance(insn, MultiAssignmentBase): expr = subst_expander(insn.expression) if not ready_for_codegen(expr): # only trying to specialize the functions which are not ready @@ -2414,7 +2416,7 @@ def specializing_incomplete_callables(kernel): type_inf_mapper(expr) inferred_functions.update(type_inf_mapper.specialized_functions) - elif isinstance(insn, (_DataObliviousInstruction)): + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): pass else: NotImplementedError("Unknown Instruction") @@ -2505,7 +2507,7 @@ def preprocess_kernel(kernel, device=None): kernel = infer_arg_descr(kernel) # try specializing callables one last time. 
- kernel = specializing_incomplete_callables(kernel) + kernel = specialize_incomplete_callables(kernel) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 62de58e76..5374303fb 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -565,6 +565,9 @@ class Reduction(p.Expression): init_arg_names = ("function", "inames", "expr", "allow_simultaneous") def __init__(self, function, inames, expr, allow_simultaneous=False): + if isinstance(function, str): + function = p.Variable(function) + assert isinstance(function, p.Variable) if isinstance(inames, str): -- GitLab From 08671c4a2adefdcc3c17f9d7aec16bb22b6d3833 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Mar 2018 18:35:06 -0500 Subject: [PATCH 063/916] Added a copy of the list, compatible with Python 2 --- loopy/kernel/function_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index c71280520..bf8b9766a 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -496,7 +496,7 @@ class CallableKernel(InKernelCallable): # in the array call. # Collecting the parameters - new_args = self.subkernel.args.copy() + new_args = self.subkernel.args[:] kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) for id, descr in arg_id_to_descr.items(): -- GitLab From ede0021e7d4228199fe56d57873b7c80555a345a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 1 Apr 2018 16:57:19 -0500 Subject: [PATCH 064/916] Switched back to old reduction interface. :) --- loopy/kernel/creation.py | 84 ------------------------ loopy/kernel/function_interface.py | 69 -------------------- loopy/library/function.py | 2 +- loopy/library/reduction.py | 100 ++++------------------------- loopy/preprocess.py | 11 +--- loopy/symbolic.py | 73 ++++++++++----------- loopy/type_inference.py | 54 +++++----------- 7 files changed, 67 insertions(+), 326 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f324645a9..ed6c0605b 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1885,37 +1885,6 @@ class FunctionScoper(IdentityMapper): # This is an unknown function as of yet, not modifying it. return IdentityMapper.map_call(self, expr) - def map_reduction(self, expr): - from pymbolic.primitives import Variable - from loopy.symbolic import ScopedFunction - - if isinstance(expr.function, ScopedFunction): - # we have already scoped this function. - return IdentityMapper.map_reduction(self, expr) - - mapped_inames = [self.rec(Variable(iname)) for iname in expr.inames] - - new_inames = [] - for iname, new_sym_iname in zip(expr.inames, mapped_inames): - if not isinstance(new_sym_iname, Variable): - from loopy.diagnostic import LoopyError - raise LoopyError("%s did not map iname '%s' to a variable" - % (type(self).__name__, iname)) - - new_inames.append(new_sym_iname.name) - - from loopy.symbolic import Reduction - - # Adding _reduce at the end of the reduction in order to avoid - # confusion between reduce(max, ...) and max(a, b) in the - # `scoped_functions` dictionary. 
- - return Reduction( - ScopedFunction(expr.function.name+"_reduce"), - tuple(new_inames), - self.rec(expr.expr), - allow_simultaneous=expr.allow_simultaneous) - class ScopedFunctionCollector(CombineMapper): """ This mapper would collect all the instances of :class:`ScopedFunction` @@ -1936,59 +1905,6 @@ class ScopedFunctionCollector(CombineMapper): else: return frozenset([(expr.name, CallableOnScalar(expr.name))]) - def map_reduction(self, expr): - from loopy.kernel.function_interface import (CallableOnScalar, - CallableReduction) - from loopy.symbolic import Reduction - - # Refer to `map_reduction` subroutine of `FunctionScoper`. - assert expr.function.name[-7:] == "_reduce" - - if expr.function.name in self.already_scoped_functions: - # the function is already scoped - return self.rec(expr.expr) - - callable_reduction = CallableReduction(expr.function.name[:-7]) - - # sanity checks - - if isinstance(expr.expr, tuple): - num_args = len(expr.expr) - else: - num_args = 1 - - if num_args != callable_reduction.operation.arg_count: - raise RuntimeError("invalid invocation of " - "reduction operation '%s': expected %d arguments, " - "got %d instead" % (expr.function.name, - callable_reduction.operation.arg_count, - len(expr.parameters))) - - if callable_reduction.operation.arg_count > 1: - from pymbolic.primitives import Call - - if not isinstance(expr, (tuple, Reduction, Call)): - raise LoopyError("reduction argument must be one of " - "a tuple, reduction, or call; " - "got '%s'" % type(expr).__name__) - else: - if isinstance(expr, tuple): - raise LoopyError("got a tuple argument to a scalar reduction") - elif isinstance(expr, Reduction) and callable_reduction.is_tuple_typed: - raise LoopyError("got a tuple typed argument to a scalar reduction") - - hidden_function = callable_reduction.operation.hidden_function() - if hidden_function is not None: - return ( - frozenset([(expr.function.name, callable_reduction), - (hidden_function, CallableOnScalar(hidden_function))]) | - self.rec(expr.expr)) - else: - return ( - frozenset([(expr.function.name, - callable_reduction)]) | - self.rec(expr.expr)) - def map_constant(self, expr): return frozenset() diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index bf8b9766a..57f5d0747 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -571,75 +571,6 @@ class CallableKernel(InKernelCallable): # }}} -# {{{ callable reduction - -class CallableReduction(InKernelCallable): - - fields = set(["operation", "arg_id_to_dtype", "arg_id_to_descr"]) - init_arg_names = ("operation", "arg_id_to_dtype", "arg_id_to_descr") - - def __init__(self, operation, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): - - if isinstance(operation, str): - from loopy.library.reduction import parse_reduction_op - operation = parse_reduction_op(operation) - - from loopy.library.reduction import ReductionOperation - assert isinstance(operation, ReductionOperation) - - self.operation = operation - - super(InKernelCallable, self).__init__( - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr) - - def __getinitargs__(self): - return (self.operation, self.arg_id_to_dtype, - self.arg_id_to_descr) - - @property - def is_tuple_typed(self): - return self.operation.arg_count > 1 - - def with_types(self, arg_id_to_dtype, target): - if self.arg_id_to_dtype is not None: - - # specializing an already specialized function. 
- - for id, dtype in arg_id_to_dtype.items(): - # only checking for the ones which have been provided - if id in self.arg_id_to_dtype and ( - self.arg_id_to_dtype[id] != arg_id_to_dtype[id]): - raise LoopyError("Overwriting a specialized" - " function is illegal--maybe start with new instance of" - " CallableReduction?") - updated_arg_id_to_dtype = self.operation.with_types(arg_id_to_dtype, - target) - return self.copy(arg_id_to_dtype=updated_arg_id_to_dtype) - - def with_descrs(self, arg_id_to_descr): - # not sure what would be the reson of having this over here - - # This is a scalar call - # need to assert that the name is in funtion indentifiers - arg_id_to_descr[-1] = ValueArgDescriptor() - return self.copy(arg_id_to_descr=arg_id_to_descr) - - def inline(self, kernel): - # TODO: In the future. This should replace the job done by - # `lp.preprocess.realize_reductions` - raise NotImplementedError - - def is_ready_for_code_gen(self): - - return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None and - self.operation is not None) - -# }}} - - # {{{ new pymbolic calls to scoped functions def next_indexed_name(name): diff --git a/loopy/library/function.py b/loopy/library/function.py index 3573f1d54..9d557ac9f 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -24,7 +24,6 @@ THE SOFTWARE. def default_function_mangler(kernel, name, arg_dtypes): - from loopy.library.reduction import reduction_function_mangler manglers = [reduction_function_mangler, tuple_function_mangler] @@ -56,4 +55,5 @@ def tuple_function_mangler(kernel, name, arg_dtypes): return None + # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index d2a4e90ac..0e5a093b7 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -36,7 +36,7 @@ class ReductionOperation(object): equality-comparable. """ - def with_types(self, arg_id_to_dtype, target): + def result_dtypes(self, target, *arg_dtypes): """ :arg arg_dtypes: may be None if not known :returns: None if not known, otherwise the returned type @@ -51,16 +51,6 @@ class ReductionOperation(object): def neutral_element(self, *dtypes): raise NotImplementedError - def hidden_function(self): - """ - A reduction may result into a scalar callable during the codegen phase. - This function would return an instance of :class:`str` to scope such - functions that may result during "realize_reduction". For example: - `reduce(max(...))` results into another callable `max(a, b)` which is - the "hidden function" the operation is pointing to. - """ - return None - def __hash__(self): # Force subclasses to override raise NotImplementedError @@ -105,22 +95,15 @@ class ScalarReductionOperation(ReductionOperation): def arg_count(self): return 1 - def with_types(self, arg_id_to_dtype, target): - if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: - # do not have enough info to figure out the type. 
- return arg_id_to_dtype.copy() - - arg_dtype = arg_id_to_dtype[0] - - updated_arg_id_to_dtype = arg_id_to_dtype.copy() + def result_dtypes(self, kernel, arg_dtype): if self.forced_result_type is not None: - updated_arg_id_to_dtype[-1] = (self.parse_result_type( - target, self.forced_result_type),) - return updated_arg_id_to_dtype + return (self.parse_result_type( + kernel.target, self.forced_result_type),) - updated_arg_id_to_dtype[-1] = arg_dtype + if arg_dtype is None: + return None - return updated_arg_id_to_dtype + return (arg_dtype,) def __hash__(self): return hash((type(self), self.forced_result_type)) @@ -197,11 +180,7 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - from loopy.symbolic import ScopedFunction - return ScopedFunction("max")(operand1, operand2) - - def hidden_function(self): - return "max" + return var("max")(operand1, operand2) class MinReductionOperation(ScalarReductionOperation): @@ -209,11 +188,7 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - from loopy.symbolic import ScopedFunction - return ScopedFunction("min")(operand1, operand2) - - def hidden_function(self): - return "min" + return var("min")(operand1, operand2) # {{{ base class for symbolic reduction ops @@ -258,22 +233,9 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return var("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) - def with_types(self, arg_id_to_dtype, target): - for id in range(self.arg_count): - if id not in arg_id_to_dtype or arg_id_to_dtype[id] is None: - # types of arguemnts not known => result type cannot be - # determined. - return arg_id_to_dtype.copy() - - scalar_dtype = arg_id_to_dtype[0] - segment_flag_dtype = arg_id_to_dtype[1] - - updated_arg_id_to_dtype = arg_id_to_dtype.copy() - updated_arg_id_to_dtype[-1] = self.inner_reduction.with_types( - {0: scalar_dtype}, target)[-1] - updated_arg_id_to_dtype[-2] = segment_flag_dtype - - return updated_arg_id_to_dtype + def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): + return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) + + (segment_flag_dtype,)) def __str__(self): return "segmented(%s)" % self.which @@ -337,22 +299,8 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__) - def with_types(self, arg_id_to_dtype, target): - for id in range(self.arg_count): - if id not in arg_id_to_dtype or arg_id_to_dtype[id] is None: - # types of arguemnts not known => result type cannot be - # determined. 
- return self.copy(arg_id_to_dtype=arg_id_to_dtype) - - scalar_dtype = arg_id_to_dtype[0] - index_dtype = arg_id_to_dtype[1] - - updated_arg_id_to_dtype = arg_id_to_dtype.copy() - - updated_arg_id_to_dtype[-1] = scalar_dtype - updated_arg_id_to_dtype[-2] = index_dtype - - return updated_arg_id_to_dtype + def result_dtypes(self, kernel, scalar_dtype, index_dtype): + return (scalar_dtype, index_dtype) def neutral_element(self, scalar_dtype, index_dtype): scalar_neutral_func = ( @@ -383,18 +331,12 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 - def hidden_function(self): - return "max" - class ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 - def hidden_function(self): - return "min" - def get_argext_preamble(kernel, func_id, arg_dtypes): op = func_id.reduction_op @@ -480,19 +422,6 @@ def parse_reduction_op(name): # }}} -def reduction_function_identifiers(): - """ Return a :class:`set` of the type of the reduction identifiers that can be - encountered in a kernel. - """ - return set(op for op in _REDUCTION_OPS) - - -def reduction_function_mangler(kernel, func_id, arg_dtypes): - raise NotImplementedError("Reduction Function Mangler!") - - -''' -# KK -- we will replace this with the new interface def reduction_function_mangler(kernel, func_id, arg_dtypes): if isinstance(func_id, ArgExtOp): from loopy.target.opencl import CTarget @@ -539,7 +468,6 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): ) return None -''' def reduction_preamble_generator(preamble_info): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 8b4cfb1de..968bbf0dc 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1039,16 +1039,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) - reduction_operation = kernel.scoped_functions[ - expr.function.name].operation - init_insn = make_assignment( id=init_id, assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=reduction_operation.neutral_element(*arg_dtypes), + expression=expr.operation.neutral_element(*arg_dtypes), predicates=insn.predicates,) generated_insns.append(init_insn) @@ -1083,12 +1080,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: reduction_expr = expr.expr - reduction_operation = kernel.scoped_functions[ - expr.function.name].operation reduction_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=reduction_operation( + expression=expr.operation( arg_dtypes, _strip_if_scalar(acc_vars, acc_vars), reduction_expr), @@ -1945,8 +1940,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, kernel = lp.tag_inames(kernel, new_iname_tags) - # making changes to the scoped function that are arising - # TODO: remove unused inames... 
kernel = ( diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 5374303fb..5dce66ac8 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -96,7 +96,7 @@ class IdentityMapperMixin(object): new_inames.append(new_sym_iname.name) return Reduction( - expr.function, tuple(new_inames), + expr.operation, tuple(new_inames), self.rec(expr.expr, *args), allow_simultaneous=expr.allow_simultaneous) @@ -226,7 +226,7 @@ class StringifyMapper(StringifyMapperBase): return "%sreduce(%s, [%s], %s)" % ( "simul_" if expr.allow_simultaneous else "", - expr.function, ", ".join(expr.inames), + expr.operation, ", ".join(expr.inames), self.rec(expr.expr, PREC_NONE)) def map_tagged_variable(self, expr, prec): @@ -537,11 +537,8 @@ class Reduction(p.Expression): """Represents a reduction operation on :attr:`exprs` across :attr:`inames`. - ..attribute:: function - - an instance of :class:`pymbolic.primitives.Variable` which indicates - the reduction callable that the reduction would point to in the dict - `kernel.scoped_functions` + .. attribute:: operation + an instance of :class:`loopy.library.reduction.ReductionOperation` .. attribute:: inames @@ -562,14 +559,9 @@ class Reduction(p.Expression): in precisely one reduction, to avoid mis-nesting errors. """ - init_arg_names = ("function", "inames", "expr", "allow_simultaneous") - - def __init__(self, function, inames, expr, allow_simultaneous=False): - if isinstance(function, str): - function = p.Variable(function) - - assert isinstance(function, p.Variable) + init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") + def __init__(self, operation, inames, expr, allow_simultaneous=False): if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -587,8 +579,6 @@ class Reduction(p.Expression): inames = tuple(strip_var(iname) for iname in inames) - """ - # Removed by KK. In order to move to the new interface if isinstance(operation, str): from loopy.library.reduction import parse_reduction_op operation = parse_reduction_op(operation) @@ -611,33 +601,30 @@ class Reduction(p.Expression): raise LoopyError("got a tuple argument to a scalar reduction") elif isinstance(expr, Reduction) and expr.is_tuple_typed: raise LoopyError("got a tuple typed argument to a scalar reduction") - """ - self.function = function + self.operation = operation self.inames = inames self.expr = expr self.allow_simultaneous = allow_simultaneous def __getinitargs__(self): - return (self.function, self.inames, self.expr, self.allow_simultaneous) + return (self.operation, self.inames, self.expr, self.allow_simultaneous) def get_hash(self): - return hash((self.__class__, self.function, self.inames, self.expr)) + return hash((self.__class__, self.operation, self.inames, self.expr)) def is_equal(self, other): return (other.__class__ == self.__class__ - and other.function == self.function + and other.operation == self.operation and other.inames == self.inames and other.expr == self.expr) def stringifier(self): return StringifyMapper - """ - # Removed by KK. In order to move to the new interface + @property def is_tuple_typed(self): return self.operation.arg_count > 1 - """ @property @memoize_method @@ -1149,10 +1136,8 @@ class FunctionToPrimitiveMapper(IdentityMapper): turns those into the actual pymbolic primitives used for that. 
""" - def _parse_reduction(self, function, inames, red_exprs, + def _parse_reduction(self, operation, inames, red_exprs, allow_simultaneous=False): - assert isinstance(function, str) - function = p.Variable(function) if isinstance(inames, p.Variable): inames = (inames,) @@ -1171,11 +1156,11 @@ class FunctionToPrimitiveMapper(IdentityMapper): if len(red_exprs) == 1: red_exprs = red_exprs[0] - return Reduction(function, tuple(processed_inames), red_exprs, + return Reduction(operation, tuple(processed_inames), red_exprs, allow_simultaneous=allow_simultaneous) def map_call(self, expr): - from loopy.library.reduction import reduction_function_identifiers + from loopy.library.reduction import parse_reduction_op if not isinstance(expr.function, p.Variable): return IdentityMapper.map_call(self, expr) @@ -1196,21 +1181,17 @@ class FunctionToPrimitiveMapper(IdentityMapper): raise TypeError("cse takes two arguments") elif name in ["reduce", "simul_reduce"]: + if len(expr.parameters) >= 3: - function, inames = expr.parameters[:2] + operation, inames = expr.parameters[:2] red_exprs = expr.parameters[2:] - return self._parse_reduction(str(function.name), inames, + operation = parse_reduction_op(str(operation)) + return self._parse_reduction(operation, inames, tuple(self.rec(red_expr) for red_expr in red_exprs), allow_simultaneous=(name == "simul_reduce")) else: - raise TypeError("invalid 'reduce' calling sequence") - elif name in reduction_function_identifiers(): - # KK -- maybe add a check for the arg count? - inames = expr.parameters[0] - red_exprs = tuple(self.rec(param) for param in expr.parameters[1:]) - return self._parse_reduction(name, inames, red_exprs) elif name == "if": if len(expr.parameters) == 3: @@ -1221,7 +1202,23 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: # see if 'name' is an existing reduction op - return IdentityMapper.map_call(self, expr) + + operation = parse_reduction_op(name) + if operation: + # arg_count counts arguments but not inames + if len(expr.parameters) != 1 + operation.arg_count: + raise RuntimeError("invalid invocation of " + "reduction operation '%s': expected %d arguments, " + "got %d instead" % (expr.function.name, + 1 + operation.arg_count, + len(expr.parameters))) + + inames = expr.parameters[0] + red_exprs = tuple(self.rec(param) for param in expr.parameters[1:]) + return self._parse_reduction(operation, inames, red_exprs) + + else: + return IdentityMapper.map_call(self, expr) def map_call_with_kwargs(self, expr): for par in expr.kw_parameters.values(): diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 3128a1d52..1c1f47fa0 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -396,10 +396,7 @@ class TypeInferenceMapper(CombineMapper): from loopy.symbolic import Reduction from pymbolic.primitives import Call - reduction_callable = self.scoped_functions[ - expr.function.name] - - if not return_tuple and reduction_callable.is_tuple_typed: + if not return_tuple and expr.is_tuple_typed: raise LoopyError("reductions with more or fewer than one " "return value may only be used in direct " "assignments") @@ -419,23 +416,12 @@ class TypeInferenceMapper(CombineMapper): else: rec_results = self.rec(expr.expr) - arg_id_to_dtype = dict(enumerate(rec_results)) - - in_knl_callable = ( - self.scoped_functions[expr.function.name].with_types( - arg_id_to_dtype, self.kernel.target)) - - # storing the type specialized function so that it can be used for - # later use - self.specialized_functions[expr] = in_knl_callable - - 
new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype - - # collecting result dtypes in order of the assignees - if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: - return [new_arg_id_to_dtype[-1]] - - return [] + if return_tuple: + return [expr.operation.result_dtypes(self.kernel, *rec_result) + for rec_result in rec_results] + else: + return [expr.operation.result_dtypes(self.kernel, rec_result)[0] + for rec_result in rec_results] def map_sub_array_ref(self, expr): return self.rec(expr.get_begin_subscript()) @@ -696,9 +682,8 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( kernel, expr, unknown_types_ok): type_inf_mapper = TypeInferenceMapper(kernel) import loopy as lp - callable_reduction = kernel.scoped_functions[expr.function.name] - if callable_reduction.is_tuple_typed: + if expr.is_tuple_typed: arg_dtypes_result = type_inf_mapper( expr, return_tuple=True, return_dtype_set=True) @@ -706,7 +691,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( arg_dtypes = arg_dtypes_result[0] else: if unknown_types_ok: - arg_dtypes = [lp.auto] * callable_reduction.operation.arg_count + arg_dtypes = [lp.auto] * expr.operation.arg_count else: raise LoopyError("failed to determine types of accumulators for " "reduction '%s'" % expr) @@ -720,22 +705,13 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( raise LoopyError("failed to determine type of accumulator for " "reduction '%s'" % expr) - # TODODODODODODODODODO - - new_arg_id_to_dtype = callable_reduction.with_types( - dict(enumerate(arg_dtypes)), kernel.target).arg_id_to_dtype - - num_result = len([id for id in new_arg_id_to_dtype if id < 0]) - reduction_dtypes = [] - - for id in range(num_result): - dt = new_arg_id_to_dtype[-id-1] - if dt is not lp.auto: - reduction_dtypes.append(dt.with_target(kernel.target)) - else: - reduction_dtypes.append(dt) + reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes) + reduction_dtypes = tuple( + dt.with_target(kernel.target) + if dt is not lp.auto else dt + for dt in reduction_dtypes) - return tuple(arg_dtypes), tuple(reduction_dtypes) + return tuple(arg_dtypes), reduction_dtypes # }}} -- GitLab From 635512882edf2b6d0bb9dfb41a0986dd1d5a3eae Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 1 Apr 2018 17:26:37 -0500 Subject: [PATCH 065/916] fixes small wrinkle so that we could move back to the old reduction interface. 
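A minimal sketch of the restored interface (illustrative only, not part of
the diff below; it assumes the Reduction and parse_reduction_op definitions
reinstated earlier in this series):

    from pymbolic import var
    from loopy.library.reduction import parse_reduction_op
    from loopy.symbolic import Reduction

    op = parse_reduction_op("max")          # a MaxReductionOperation instance
    red = Reduction(op, ("i",), var("a")[var("i")])

    # Transforms can read the operation straight off the node and rebuild it
    # as Reduction(red.operation, new_inames, new_expr), with no lookup in
    # kernel.scoped_functions -- which is what _InameSplitter does in the
    # hunk below.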
--- loopy/transform/iname.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 125cd9a41..2347cef3c 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -144,10 +144,7 @@ class _InameSplitter(RuleAwareIdentityMapper): new_inames.extend([self.outer_iname, self.inner_iname]) from loopy.symbolic import Reduction - reduction_callable = ( - self.rule_mapping_context.kernel.scoped_functions[ - expr.function.name]) - return Reduction(reduction_callable.operation, tuple(new_inames), + return Reduction(expr.operation, tuple(new_inames), self.rec(expr.expr, expn_state), expr.allow_simultaneous) else: -- GitLab From 7782b78e6e2c4f63965bbca4f639cc4cf4fc4297 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 1 Apr 2018 20:55:22 -0500 Subject: [PATCH 066/916] Passing some more tests --- loopy/kernel/__init__.py | 3 +- loopy/kernel/creation.py | 1 + loopy/kernel/function_interface.py | 35 ++++++------- loopy/preprocess.py | 5 +- loopy/target/c/__init__.py | 3 +- loopy/target/c/codegen/expression.py | 14 ++--- loopy/target/cuda.py | 77 ++++++++++++++++++++++++++++ loopy/target/opencl.py | 30 ++++------- loopy/target/python.py | 14 +++++ loopy/type_inference.py | 5 +- 10 files changed, 133 insertions(+), 54 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index b87e55ca9..5aa0691ec 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -347,7 +347,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): @property def function_identifiers(self): - return self.target.get_device_ast_builder().function_identifiers() + return self.target.get_device_ast_builder().function_identifiers() | ( + set(["indexof", "indexof_vec"])) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index ed6c0605b..33f368196 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1911,6 +1911,7 @@ class ScopedFunctionCollector(CombineMapper): map_variable = map_constant map_function_symbol = map_constant map_tagged_variable = map_constant + map_type_cast = map_constant def scope_functions(kernel): diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 57f5d0747..cb0240425 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -169,7 +169,7 @@ class InKernelCallable(ImmutableRecord): def __getinitargs__(self): return (self.arg_id_to_dtype, self.arg_id_to_descr) - def with_types(self, arg_id_to_dtype, target): + def with_types(self, arg_id_to_dtype, kernel): """ :arg arg_id_to_type: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -273,7 +273,7 @@ class CallableOnScalar(InKernelCallable): return (self.name, self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, target): + def with_types(self, arg_id_to_dtype, kernel): if self.arg_id_to_dtype is not None: # specializing an already specialized function. 
@@ -285,21 +285,23 @@ class CallableOnScalar(InKernelCallable): " function is illegal--maybe start with new instance of" " CallableOnScalar?") - # {{{ attempt to specialize using scalar functions present in target - - if self.name in target.get_device_ast_builder().function_identifiers(): - new_in_knl_callable = target.get_device_ast_builder().with_types( + if self.name in kernel.target.get_device_ast_builder( + ).function_identifiers(): + new_in_knl_callable = kernel.target.get_device_ast_builder().with_types( self, arg_id_to_dtype) if new_in_knl_callable is None: new_in_knl_callable = self.copy() return new_in_knl_callable + elif self.name in ["indexof", "indexof_vec"]: + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = kernel.index_dtype - # }}} - - # did not find a scalar function and function prototype does not - # even have subkernel registered => no match found - raise LoopyError("Function %s not present within" - " the %s namespace" % (self.name, target)) + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + else: + # did not find a scalar function and function prototype does not + # even have subkernel registered => no match found + raise LoopyError("Function %s not present within" + " the %s namespace" % (self.name, kernel.target)) def with_descrs(self, arg_id_to_descr): @@ -308,15 +310,10 @@ class CallableOnScalar(InKernelCallable): arg_id_to_descr[-1] = ValueArgDescriptor() return self.copy(arg_id_to_descr=arg_id_to_descr) - def with_iname_tag_usage(self, unusable, concurrent_shape): - - raise NotImplementedError() - def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None and - self.name_in_target is not None) + self.arg_id_to_descr is not None) # {{{ code generation @@ -438,7 +435,7 @@ class CallableKernel(InKernelCallable): return (self.name, self.subkernel, self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, target): + def with_types(self, arg_id_to_dtype, kernel): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 968bbf0dc..fafabfb58 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2135,6 +2135,7 @@ class UnScopedCallCollector(CombineMapper): map_variable = map_constant map_function_symbol = map_constant map_tagged_variable = map_constant + map_type_cast = map_constant def check_functions_are_scoped(kernel): @@ -2288,12 +2289,13 @@ class ArgDescriptionInferer(CombineMapper): frozenset(((expr, new_scoped_function), )) | self.combine((self.rec(child) for child in expr.parameters))) - def map_constant(self, expr): + def map_constant(self, expr, **kwargs): return frozenset() map_variable = map_constant map_function_symbol = map_constant map_tagged_variable = map_constant + map_type_cast = map_constant def infer_arg_descr(kernel): @@ -2372,6 +2374,7 @@ class ReadyForCodegen(CombineMapper): map_variable = map_constant map_function_symbol = map_constant map_tagged_variable = map_constant + map_type_cast = map_constant def specialize_incomplete_callables(kernel): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 2fb902830..28068df75 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -540,7 +540,8 @@ class CASTBuilder(ASTBuilderBase): ]) def with_types(self, in_knl_callable, arg_id_to_dtype): - new_callable = c_with_types(in_knl_callable, arg_id_to_dtype) + new_callable = c_with_types(in_knl_callable, 
arg_id_to_dtype, + modify_name=True) if new_callable is not None: return new_callable return super(CASTBuilder, self).with_types(in_knl_callable, diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 7d05f228f..2dd1a14ea 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -390,14 +390,14 @@ class ExpressionToCExpressionMapper(IdentityMapper): # {{{ implement indexof, indexof_vec - identifier = expr.function - if identifier.name in ["indexof", "indexof_vec"]: + identifier_name = self.kernel.scoped_functions[expr.function.name].name + if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) + raise LoopyError("%s takes exactly one argument" % identifier_name) arg, = expr.parameters if not isinstance(arg, Subscript): raise LoopyError( - "argument to %s must be a subscript" % identifier.name) + "argument to %s must be a subscript" % identifier_name) ary = self.find_array(arg) @@ -409,11 +409,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.kernel.data import ImageArg if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) + raise LoopyError("%s does not support images" % identifier_name) - if identifier.name == "indexof": + if identifier_name == "indexof": return access_info.subscripts[0] - elif identifier.name == "indexof_vec": + elif identifier_name == "indexof_vec": from loopy.kernel.array import VectorArrayDimTag ivec = None for iaxis, dim_tag in enumerate(ary.dim_tags): diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 027f27838..75606945a 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -30,6 +30,7 @@ from pytools import memoize_method from loopy.target.c import CTarget, CASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper +from loopy.target.c import (c_math_identifiers, c_with_types) from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import temp_var_scope @@ -112,6 +113,16 @@ def _register_vector_types(dtype_registry): # {{{ function mangler +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } + + +def cuda_function_identifiers(): + return set(_CUDA_SPECIFIC_FUNCTIONS) + + def cuda_function_mangler(kernel, name, arg_dtypes): if not isinstance(name, str): return None @@ -136,6 +147,57 @@ def cuda_function_mangler(kernel, name, arg_dtypes): return None + +def cuda_with_types(in_knl_callable, arg_id_to_dtype): + + name = in_knl_callable.name + + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return None + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] + return in_knl_callable.copy(name_in_target=name, + arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}) + + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." 
% (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return None + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return in_knl_callable.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype) + + return None + + # }}} @@ -224,6 +286,21 @@ class CUDACASTBuilder(CASTBuilder): cuda_function_mangler ]) + def function_identifiers(self): + return (cuda_function_identifiers() | c_math_identifiers() | + super(CUDACASTBuilder, self).function_identifiers()) + + def with_types(self, in_knl_callable, arg_id_to_dtype): + new_callable = cuda_with_types(in_knl_callable, arg_id_to_dtype) + if new_callable is not None: + return new_callable + + new_callable = c_with_types(in_knl_callable, arg_id_to_dtype, + modify_name=True) + if new_callable is not None: + return new_callable + return super(CUDACASTBuilder, self).with_types(in_knl_callable, + arg_id_to_dtype) # }}} # {{{ top-level codegen diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 77ae6a957..87c77b2c2 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -140,28 +140,10 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function identifiers - -_CL_SIMPLE_MULTI_ARG_FUNC_IDS = set(["clamp", "atan2"]) - - -VECTOR_LITERAL_FUNC_IDS = set("make_%s%d" % (name, count) - for name in ['char', 'uchar', 'short', 'ushort', 'int', 'uint', 'long', - 'ulong', 'float', 'double'] - for count in [2, 3, 4, 8, 16] - ) - - -def opencl_function_identifiers(): - return set(["max", "min", "dot"]) | (_CL_SIMPLE_MULTI_ARG_FUNC_IDS | - VECTOR_LITERAL_FUNC_IDS) - -# }}} - - # {{{ function mangler _CL_SIMPLE_MULTI_ARG_FUNCTIONS = { + "rsqrt": 1, "clamp": 3, "atan2": 2, } @@ -185,6 +167,11 @@ VECTOR_LITERAL_FUNCS = dict( ) +def opencl_function_identifiers(): + return set(["max", "min", "dot"]) | (set(_CL_SIMPLE_MULTI_ARG_FUNCTIONS) | + set(VECTOR_LITERAL_FUNCS)) + + def opencl_function_mangler(kernel, name, arg_dtypes): if not isinstance(name, str): return None @@ -279,6 +266,7 @@ def opencl_with_types(in_knl_callable, arg_id_to_dtype): arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}) if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + print(arg_id_to_dtype) num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] for id in arg_id_to_dtype: if not -1 <= id < num_args: @@ -286,14 +274,14 @@ def opencl_with_types(in_knl_callable, arg_id_to_dtype): num_args)) for i in range(num_args): - if i not in arg_id_to_dtype or arg_id_to_dtype[i] is not None: + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: # the types provided aren't mature enough to specialize the # callable return None dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in - arg_id_to_dtype.values() if id >= 0]) + arg_id_to_dtype.items() if id >= 0]) if dtype.kind == "c": raise LoopyError("%s does not support complex numbers" diff --git a/loopy/target/python.py b/loopy/target/python.py index ce04986d3..dcc1be9bc 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -195,6 +195,20 @@ class PythonASTBuilderBase(ASTBuilderBase): _numpy_single_arg_function_mangler, ]) + def function_identifiers(self): + from loopy.target.c import c_math_identifiers + 
return ( + super(PythonASTBuilderBase, self).function_identifiers() | + c_math_identifiers()) + + def with_types(self, in_knl_callable, arg_id_to_dtype): + from loopy.target.c import c_with_types + new_callable = c_with_types(in_knl_callable, arg_id_to_dtype) + if new_callable is not None: + return new_callable + return super(PythonASTBuilderBase, self).with_types(in_knl_callable, + arg_id_to_dtype) + def preamble_generators(self): return ( super(PythonASTBuilderBase, self).preamble_generators() + [ diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 1c1f47fa0..02121ed9e 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -259,9 +259,6 @@ class TypeInferenceMapper(CombineMapper): if isinstance(identifier, (Variable, ScopedFunction)): identifier = identifier.name - if identifier in ["indexof", "indexof_vec"]: - return [self.kernel.index_dtype] - def none_if_empty(d): if d: d, = d @@ -276,7 +273,7 @@ class TypeInferenceMapper(CombineMapper): if isinstance(expr.function, ScopedFunction): in_knl_callable = ( self.scoped_functions[expr.function.name].with_types( - arg_id_to_dtype, self.kernel.target)) + arg_id_to_dtype, self.kernel)) # storing the type specialized function so that it can be used for # later use -- GitLab From 28daffc0327362fe3132df0cd478654b7c204551 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 1 Apr 2018 21:32:55 -0500 Subject: [PATCH 067/916] Scopes reduction functions(until we convert the reductions also into callables). --- loopy/kernel/creation.py | 14 ++++++++++++++ loopy/library/reduction.py | 5 +++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 33f368196..794a99945 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1905,6 +1905,20 @@ class ScopedFunctionCollector(CombineMapper): else: return frozenset([(expr.name, CallableOnScalar(expr.name))]) + def map_reduction(self, expr): + from loopy.kernel.function_interface import CallableOnScalar + from loopy.library.reduction import (MaxReductionOperation, + MinReductionOperation, ArgMinReductionOperation, + ArgMaxReductionOperation) + if isinstance(expr.operation, (MaxReductionOperation, + ArgMaxReductionOperation)): + return frozenset([("max", CallableOnScalar("max"))]) + if isinstance(expr.operation, (MinReductionOperation, + ArgMinReductionOperation)): + return frozenset([("min", CallableOnScalar("min"))]) + else: + return frozenset() + def map_constant(self, expr): return frozenset() diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 0e5a093b7..70c6d68d2 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -24,6 +24,7 @@ THE SOFTWARE. 
from pymbolic import var +from loopy.symbolic import ScopedFunction import numpy as np from loopy.symbolic import FunctionIdentifier @@ -180,7 +181,7 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("max")(operand1, operand2) + return ScopedFunction("max")(operand1, operand2) class MinReductionOperation(ScalarReductionOperation): @@ -188,7 +189,7 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("min")(operand1, operand2) + return ScopedFunction("min")(operand1, operand2) # {{{ base class for symbolic reduction ops -- GitLab From 169481b3a5dfffd82557d8afc62a585ced9cf63c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 1 Apr 2018 22:53:20 -0500 Subject: [PATCH 068/916] fixes small bug about not scoping the expression within an expression --- loopy/kernel/creation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 794a99945..3c9d621a4 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1883,7 +1883,7 @@ class FunctionScoper(IdentityMapper): ) # This is an unknown function as of yet, not modifying it. - return IdentityMapper.map_call(self, expr) + return IdentityMapper.map_call_with_kwargs(self, expr) class ScopedFunctionCollector(CombineMapper): @@ -1912,12 +1912,14 @@ class ScopedFunctionCollector(CombineMapper): ArgMaxReductionOperation) if isinstance(expr.operation, (MaxReductionOperation, ArgMaxReductionOperation)): - return frozenset([("max", CallableOnScalar("max"))]) + return frozenset([("max", CallableOnScalar("max"))]) | ( + self.rec(expr.expr)) if isinstance(expr.operation, (MinReductionOperation, ArgMinReductionOperation)): - return frozenset([("min", CallableOnScalar("min"))]) + return frozenset([("min", CallableOnScalar("min"))]) | ( + self.rec(expr.expr)) else: - return frozenset() + return self.rec(expr.expr) def map_constant(self, expr): return frozenset() -- GitLab From db97460a3ebea26915d48f5bef3d22e6c317d51f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Apr 2018 15:20:39 -0500 Subject: [PATCH 069/916] Still fixing some of the tests --- loopy/codegen/__init__.py | 3 ++- loopy/kernel/__init__.py | 2 +- loopy/kernel/creation.py | 14 +++++++---- loopy/kernel/function_interface.py | 14 +++++++---- loopy/library/reduction.py | 4 ++-- loopy/preprocess.py | 4 +--- loopy/type_inference.py | 38 ++++++++++++++++++++++++++++-- 7 files changed, 62 insertions(+), 17 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 4d847612b..6023a4b55 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -503,7 +503,8 @@ def generate_code_v2(kernel): for insn in kernel.instructions: if isinstance(insn, CallInstruction): in_knl_callable = kernel.scoped_functions[insn.expression.function.name] - if in_knl_callable.subkernel is not None: + from loopy.kernel.function_interface import CallableKernel + if isinstance(in_knl_callable, CallableKernel): auxiliary_dev_prog = generate_auxiliary_kernel_device_code( in_knl_callable.subkernel, kernel.target).device_programs[0].ast diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 5aa0691ec..892c8a5cb 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -348,7 +348,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @property def 
function_identifiers(self): return self.target.get_device_ast_builder().function_identifiers() | ( - set(["indexof", "indexof_vec"])) + set(["indexof", "indexof_vec", "make_tuple"])) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 3c9d621a4..834fdce20 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1910,14 +1910,20 @@ class ScopedFunctionCollector(CombineMapper): from loopy.library.reduction import (MaxReductionOperation, MinReductionOperation, ArgMinReductionOperation, ArgMaxReductionOperation) - if isinstance(expr.operation, (MaxReductionOperation, - ArgMaxReductionOperation)): + if isinstance(expr.operation, MaxReductionOperation): return frozenset([("max", CallableOnScalar("max"))]) | ( self.rec(expr.expr)) - if isinstance(expr.operation, (MinReductionOperation, - ArgMinReductionOperation)): + elif isinstance(expr.operation, MinReductionOperation): return frozenset([("min", CallableOnScalar("min"))]) | ( self.rec(expr.expr)) + elif isinstance(expr.operation, ArgMaxReductionOperation): + return frozenset([("max", CallableOnScalar("min")), ("make_tuple", + CallableOnScalar("make_tuple"))]) | ( + self.rec(expr.expr)) + elif isinstance(expr.operation, ArgMinReductionOperation): + return frozenset([("min", CallableOnScalar("min")), ("make_tuple", + CallableOnScalar("make_tuple"))]) | ( + self.rec(expr.expr)) else: return self.rec(expr.expr) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index cb0240425..5d7585d0c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -297,6 +297,14 @@ class CallableOnScalar(InKernelCallable): new_arg_id_to_dtype[-1] = kernel.index_dtype return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + elif self.name == "make_tuple": + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = arg_id_to_dtype[i] + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple") else: # did not find a scalar function and function prototype does not # even have subkernel registered => no match found @@ -347,8 +355,6 @@ class CallableOnScalar(InKernelCallable): return var(self.name_in_target)(*processed_parameters) def emit_call_insn(self, insn, target, expression_to_code_mapper): - # TODO: Need to add support for functions like sincos(x) - # which would give multiple outputs but takes in scalar arguments # FIXME: needs to get information about whether the callable has should # do pass by reference by all values or should return one value for @@ -382,7 +388,7 @@ class CallableOnScalar(InKernelCallable): c_parameters = [ expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), + dtype_to_type_context(target, tgt_dtype), tgt_dtype).expr for par, par_dtype, tgt_dtype in zip( parameters, par_dtypes, arg_dtypes)] @@ -395,7 +401,7 @@ class CallableOnScalar(InKernelCallable): c_parameters.append( var("&")( expression_to_code_mapper(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), + dtype_to_type_context(target, tgt_dtype), tgt_dtype).expr)) from pymbolic import var diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 70c6d68d2..fc8afd330 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -231,7 +231,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, 
segment_flag_dtype): scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ScopedFunction("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): @@ -307,7 +307,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ScopedFunction("make_tuple")(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)) def __str__(self): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fafabfb58..6c5c9cc08 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2333,7 +2333,6 @@ def infer_arg_descr(kernel): return register_pymbolic_calls_to_knl_callables(kernel, pymbolic_calls_to_functions) - # }}} @@ -2479,8 +2478,7 @@ def preprocess_kernel(kernel, device=None): kernel = infer_unknown_types(kernel, expect_completion=False) # TODO: Specializng based on: - # 1. ArgDescriptors - # 2. InameTags + # 1. InameTags check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 02121ed9e..89866124c 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -44,6 +44,19 @@ def _debug(kernel, s, *args): logger.debug("%s: %s" % (kernel.name, logstr)) +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :param arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. + """ + return_arg_id_to_dtype = dict((id, dtype) for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_pos = sorted(return_arg_id_to_dtype.keys()) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): @@ -283,7 +296,10 @@ class TypeInferenceMapper(CombineMapper): # collecting result dtypes in order of the assignees if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: - return [new_arg_id_to_dtype[-1]] + if return_tuple: + return [get_return_types_as_tuple(new_arg_id_to_dtype)] + else: + return [new_arg_id_to_dtype[-1]] return [] @@ -450,8 +466,26 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): expr = subst_expander(writer_insn.expression) debug(" via expr %s", expr) + if isinstance(writer_insn, lp.Assignment): + result = type_inf_mapper(expr, return_dtype_set=True) + elif isinstance(writer_insn, lp.CallInstruction): + return_dtype_set = type_inf_mapper(expr, return_tuple=True, + return_dtype_set=True) + + result = [] + for return_dtype_set in return_dtype_set: + result_i = None + found = False + for assignee, comp_dtype_set in zip( + writer_insn.assignee_var_names(), return_dtype_set): + if assignee == var_name: + found = True + result_i = comp_dtype_set + break - result = type_inf_mapper(expr, return_dtype_set=True) + assert found + if result_i is not None: + result.append(result_i) debug(" result: %s", result) -- GitLab From 945e6d1fc886ce39aaeda3a37aa5884dda8384a8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 3 Apr 2018 13:39:15 -0500 Subject: [PATCH 070/916] Factored auxiliary kernel's codegen into the main codegen --- loopy/codegen/__init__.py | 17 ++++++----------- 1 
file changed, 6 insertions(+), 11 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 6023a4b55..4cff83a03 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -394,7 +394,7 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel): +def generate_code_v2(kernel, is_generating_master_kernel=True): """ :returns: a :class:`CodeGenerationResult` """ @@ -491,7 +491,7 @@ def generate_code_v2(kernel): + kernel.name + kernel.target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), - is_generating_master_kernel=True) + is_generating_master_kernel=is_generating_master_kernel) from loopy.codegen.result import generate_host_or_device_program @@ -499,15 +499,14 @@ def generate_code_v2(kernel): auxiliary_dev_progs = [] - from loopy.codegen.auxiliary_kernels import generate_auxiliary_kernel_device_code for insn in kernel.instructions: if isinstance(insn, CallInstruction): in_knl_callable = kernel.scoped_functions[insn.expression.function.name] from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel): - auxiliary_dev_prog = generate_auxiliary_kernel_device_code( - in_knl_callable.subkernel, - kernel.target).device_programs[0].ast + auxiliary_dev_prog = generate_code_v2( + in_knl_callable.subkernel.copy(target=kernel.target), + is_generating_master_kernel=False).device_programs[0].ast auxiliary_dev_progs.append(auxiliary_dev_prog) elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, BarrierInstruction, CInstruction, @@ -515,7 +514,7 @@ def generate_code_v2(kernel): pass else: raise NotImplementedError("register_knl not made for %s type of " - "instruciton" % (str(type(insn)))) + "instruction" % (str(type(insn)))) # }}} @@ -523,8 +522,6 @@ def generate_code_v2(kernel): codegen_state, schedule_index=0) - # {{{ pasting the auxiliary functions code to the first device program - new_dev_prog = codegen_result.device_programs[0] for auxiliary_dev_prog in auxiliary_dev_progs: new_dev_prog = new_dev_prog.copy( @@ -532,8 +529,6 @@ def generate_code_v2(kernel): new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:] codegen_result = codegen_result.copy(device_programs=new_device_programs) - # }}} - device_code_str = codegen_result.device_code() from loopy.check import check_implemented_domains -- GitLab From 72bf1cb5254d6db49c4e95ff517ed6882558a6b6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 3 Apr 2018 13:46:27 -0500 Subject: [PATCH 071/916] Removed auxiliary_kernels.oy --- loopy/codegen/auxiliary_kernels.py | 188 ----------------------------- 1 file changed, 188 deletions(-) delete mode 100644 loopy/codegen/auxiliary_kernels.py diff --git a/loopy/codegen/auxiliary_kernels.py b/loopy/codegen/auxiliary_kernels.py deleted file mode 100644 index 6c4166bd3..000000000 --- a/loopy/codegen/auxiliary_kernels.py +++ /dev/null @@ -1,188 +0,0 @@ -from __future__ import division, absolute_import - -__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above 
copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -""" - -import six -import islpy as isl - -from loopy.codegen import ( - ImplementedDataInfo, - CodeGenerationState) -from loopy.diagnostic import LoopyError -from loopy.kernel.instruction import ( - Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, - CInstruction, _DataObliviousInstruction) -from cgen import Collection - -import logging -logger = logging.getLogger(__name__) - - -__doc__ = """ -.. currentmodule:: loopy - -.. autofunction:: generate_auxiliary_kernel_device_code - -""" - - -# {{{ code generation for the auxiliary kernel - -def generate_auxiliary_kernel_device_code(kernel, target): - """ - Generates device programs for the given auxiliary kernel, with the target - specified by the parent kernel - :returns: a :class:`CodeGenerationResult` - """ - kernel = kernel.copy(target=target) - - from loopy.kernel import kernel_state - if kernel.state == kernel_state.INITIAL: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - - if kernel.schedule is None: - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) - - if kernel.state != kernel_state.SCHEDULED: - raise LoopyError( - "cannot generate code for a kernel that has not been " - "scheduled") - - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) - - logger.info("%s: generate Auxillary Kernel code: start" % kernel.name) - - # {{{ examine arg list - - from loopy.kernel.data import ValueArg - from loopy.kernel.array import ArrayBase - - implemented_data_info = [] - - for arg in kernel.args: - is_written = arg.name in kernel.get_written_variables() - if isinstance(arg, ArrayBase): - implemented_data_info.extend( - arg.decl_info( - kernel.target, - is_written=is_written, - index_dtype=kernel.index_dtype)) - - elif isinstance(arg, ValueArg): - implemented_data_info.append(ImplementedDataInfo( - target=kernel.target, - name=arg.name, - dtype=arg.dtype, - arg_class=ValueArg, - is_written=is_written)) - - else: - raise ValueError("argument type not understood: '%s'" % type(arg)) - - allow_complex = False - for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): - if var.dtype.involves_complex(): - allow_complex = True - - # }}} - - seen_dtypes = set() - seen_functions = set() - seen_atomic_dtypes = set() - - initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) - codegen_state = CodeGenerationState( - kernel=kernel, - implemented_data_info=implemented_data_info, - implemented_domain=initial_implemented_domain, - implemented_predicates=frozenset(), - seen_dtypes=seen_dtypes, - seen_functions=seen_functions, - seen_atomic_dtypes=seen_atomic_dtypes, - var_subst_map={}, - allow_complex=allow_complex, - var_name_generator=kernel.get_var_name_generator(), - is_generating_device_code=False, - 
gen_program_name=kernel.name, - schedule_index_end=len(kernel.schedule), - is_generating_master_kernel=False) - - from loopy.codegen.result import generate_host_or_device_program - - # {{{ collecting ASTs of auxiliary kernels - - auxiliary_dev_progs = [] - - from loopy.codegen.auxiliary_kernels import generate_auxiliary_kernel_device_code - for insn in kernel.instructions: - if isinstance(insn, CallInstruction): - in_knl_callable = kernel.scoped_functions[insn.expression.function.name] - if in_knl_callable.subkernel is not None: - auxiliary_dev_prog = generate_auxiliary_kernel_device_code( - in_knl_callable.subkernel, - kernel.target).device_programs[0].ast - auxiliary_dev_progs.append(auxiliary_dev_prog) - elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, - BarrierInstruction, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("register_knl not made for %s type of " - "instruciton" % (str(type(insn)))) - - # }}} - - codegen_result = generate_host_or_device_program( - codegen_state, - schedule_index=0) - - # {{{ pasting the auxiliary functions code to the first device program - - new_dev_prog = codegen_result.device_programs[0] - for auxiliary_dev_prog in auxiliary_dev_progs: - new_dev_prog = new_dev_prog.copy( - ast=Collection([auxiliary_dev_prog, new_dev_prog.ast])) - new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:] - codegen_result = codegen_result.copy(device_programs=new_device_programs) - - # }}} - - # For faster unpickling in the common case when implemented_domains isn't needed. - from loopy.tools import LazilyUnpicklingDict - codegen_result = codegen_result.copy( - implemented_domains=LazilyUnpicklingDict( - codegen_result.implemented_domains)) - - logger.info("%s: generate code: done" % kernel.name) - - return codegen_result - -# }}} - -# vim: foldmethod=marker -- GitLab From be0317998e2b331fd21a0a78286e18b0a5e3e6c4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 4 Apr 2018 13:11:01 -0500 Subject: [PATCH 072/916] Added support for multi-args in kernel calls --- loopy/codegen/__init__.py | 5 +++++ loopy/kernel/__init__.py | 4 ++++ loopy/kernel/creation.py | 26 +++++++++++++----------- loopy/kernel/function_interface.py | 29 ++++++++++++++++----------- loopy/kernel/instruction.py | 32 +++++++++++++++++++++++++----- loopy/preprocess.py | 6 +++--- loopy/target/c/__init__.py | 3 ++- loopy/target/opencl.py | 4 ++-- loopy/target/pyopencl.py | 7 +++++-- loopy/transform/register_knl.py | 2 +- loopy/type_inference.py | 5 ++++- 11 files changed, 84 insertions(+), 39 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 4cff83a03..e3b3d077d 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -518,6 +518,9 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): # }}} + # {{{ pasting the device codes generated by the auxiliary kernels to the + # first device program + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) @@ -529,6 +532,8 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:] codegen_result = codegen_result.copy(device_programs=new_device_programs) + # }}} + device_code_str = codegen_result.device_code() from loopy.check import check_implemented_domains diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 892c8a5cb..f998cb9a0 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ 
-347,6 +347,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): @property def function_identifiers(self): + """ + Returns the function identifiers as an instance of :class:`set` which + are known to the kernel at creation time. + """ return self.target.get_device_ast_builder().function_identifiers() | ( set(["indexof", "indexof_vec", "make_tuple"])) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 834fdce20..07376b7bb 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1842,13 +1842,11 @@ class FunctionScoper(IdentityMapper): Converts functions known to the kernel as instances of :class:`ScopedFunction`. - .. _example: - - If given an expression of the form `sin(x) + unknown_function(y) + - log(z)`, then the mapper would return `ScopedFunction('sin')(x) + - unknown_function(y) + ScopedFunction('log')(z)`. Since the - `unknown_function` is not known to the kernel it is not marked as a - `ScopedFunction`. + **Example**: If given an expression of the form ``sin(x) + unknown_function(y) + + log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + + unknown_function(y) + ScopedFunction('log')(z)``. Since the + ``unknown_function`` is not known to the kernel it is not marked as a + :class:`loopy.symbolic.ScopedFunction`. """ def __init__(self, function_ids): self.function_ids = function_ids @@ -1866,7 +1864,7 @@ class FunctionScoper(IdentityMapper): for child in expr.parameters)) # This is an unknown function as of yet, not modifying it. - return IdentityMapper.map_call(self, expr) + return super(FunctionScoper, self).map_call(expr) def map_call_with_kwargs(self, expr): from loopy.symbolic import ScopedFunction @@ -1883,14 +1881,18 @@ class FunctionScoper(IdentityMapper): ) # This is an unknown function as of yet, not modifying it. - return IdentityMapper.map_call_with_kwargs(self, expr) + return super(FunctionScoper, self).map_call_with_kwargs(expr) class ScopedFunctionCollector(CombineMapper): - """ This mapper would collect all the instances of :class:`ScopedFunction` - occurring in the expression and written all of them as a :class:`set`. """ - def __init__(self, already_scoped_functions={}): + Mapper to collect the instances of :class:`loopy.symbolic.ScopedFunction` + in an expression. 
+ + :returns: an instance of :class:`frozenset` of tuples ``(function_name, + in_kernel_callable)`` + """ + def __init__(self, already_scoped_functions=frozenset()): self.already_scoped_functions = already_scoped_functions def combine(self, values): diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 5d7585d0c..9f24e9c43 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -1,6 +1,6 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" +__copyright__ = "Copyright (C) 2018 Andreas Klöckner, Kaushik Kulkarni" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -407,9 +407,6 @@ class CallableOnScalar(InKernelCallable): from pymbolic import var return var(self.name_in_target)(*c_parameters) - raise NotImplementedError("emit_call_insn only applies for" - " CallableKernels") - # }}} # }}} @@ -456,12 +453,6 @@ class CallableKernel(InKernelCallable): new_args.append(arg.copy( dtype=arg_id_to_dtype[kw_to_pos[kw]])) else: - if kw in self.subkernel.get_read_variables(): - # need to know the type of the input arguments for type - # inference - raise LoopyError("Type of %s variable not supplied to the" - " subkernel, which is needed for type" - " inference." % kw) new_args.append(arg) from loopy.type_inference import infer_unknown_types @@ -472,6 +463,7 @@ class CallableKernel(InKernelCallable): # of the types of the arguments supplied specialized_kernel = infer_unknown_types(pre_specialized_subkernel, expect_completion=True) + new_arg_id_to_dtype = {} read_count = 0 write_count = -1 @@ -506,8 +498,15 @@ class CallableKernel(InKernelCallable): if isinstance(id, str): id = kw_to_pos[id] assert isinstance(id, int) - new_args[id] = new_args[id].copy(shape=descr.shape, - dim_tags=descr.dim_tags) + if isinstance(descr, ArrayArgDescriptor): + new_args[id] = new_args[id].copy(shape=descr.shape, + dim_tags=descr.dim_tags) + elif isinstance(descr, ValueArgDescriptor): + pass + else: + raise LoopyError("Descriptor must be either an instance of " + "ArrayArgDescriptor or ValueArgDescriptor -- got %s." % + type(descr)) descriptor_specialized_knl = self.subkernel.copy(args=new_args) @@ -561,7 +560,13 @@ class CallableKernel(InKernelCallable): # Note that we are not going to do any type casting in array calls. 
from loopy.expression import dtype_to_type_context from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else expression_to_code_mapper(par, PREC_NONE, dtype_to_type_context(target, par_dtype), par_dtype).expr diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index d2d0c5457..fb0c6690b 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1070,6 +1070,20 @@ def is_array_call(assignees, expression): return False +def get_array_call_assignee(assignee): + from pymbolic.primitives import Subscript, Variable + from loopy.symbolic import SubArrayRef + if isinstance(assignee, SubArrayRef): + return assignee + elif isinstance(assignee, Subscript): + return SubArrayRef((), assignee) + elif isinstance(assignee, Variable): + return SubArrayRef((), Subscript(assignee, 0)) + else: + raise LoopyError("ArrayCall only takes Variable, Subscript or " + "SubArrayRef as its inputs") + + def make_assignment(assignees, expression, temp_var_types=None, **kwargs): if len(assignees) > 1 or len(assignees) == 0 or is_array_call(assignees, expression): @@ -1084,11 +1098,19 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): raise LoopyError("right-hand side in multiple assignment must be " "function call or reduction, got: '%s'" % expression) - return CallInstruction( - assignees=assignees, - expression=expression, - temp_var_types=temp_var_types, - **kwargs) + if not is_array_call(assignees, expression): + return CallInstruction( + assignees=assignees, + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + else: + return CallInstruction( + assignees=tuple(get_array_call_assignee(assignee) for + assignee in assignees), + expression=expression, + temp_var_types=temp_var_types, + **kwargs) else: return Assignment( diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 6c5c9cc08..9e8956a59 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1942,9 +1942,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # TODO: remove unused inames... 
- kernel = ( - _hackily_ensure_multi_assignment_return_values_are_scoped_private( - kernel)) + # kernel = ( + # _hackily_ensure_multi_assignment_return_values_are_scoped_private( + # kernel)) return kernel diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 28068df75..5ee7401c3 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -338,7 +338,7 @@ class _ConstRestrictPointer(Pointer): class _ConstPointer(Pointer): - def get_decl_pait(self): + def get_decl_pair(self): sub_tp, sub_decl = self.subdecl.get_decl_pair() return sub_tp, ("*const %s" % sub_decl) @@ -828,6 +828,7 @@ class CASTBuilder(ASTBuilderBase): assert shape == () result = POD(self, dtype, name) + if not is_written: from cgen import Const result = Const(result) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 87c77b2c2..af194335f 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -241,8 +241,8 @@ def opencl_with_types(in_knl_callable, arg_id_to_dtype): return None dtype = np.find_common_type( - [], [dtype.numpy_dtype for id, dtype in - arg_id_to_dtype.items() if id >= 0]) + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) if dtype.kind == "i": dtype = NumpyType(dtype) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 2fd6af935..138f02137 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -242,7 +242,7 @@ def pyopencl_with_types(in_knl_callable, arg_id_to_dtype): for id in arg_id_to_dtype: if not -1 <= id <= 0: - raise LoopyError("%s can take only one argument." % name) + return None if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the @@ -809,10 +809,13 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): if new_callable is not None: return new_callable - new_callable = pyopencl_with_types(in_knl_callable, arg_id_to_dtype) + return pyopencl_with_types(in_knl_callable, arg_id_to_dtype) + ''' + # Till the time we have written the RNG with types if new_callable is not None: return new_callable return random123_with_types(in_knl_callable, arg_id_to_dtype) + ''' # }}} diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 05a298d11..38615ed70 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -98,7 +98,7 @@ def register_callable_kernel(parent, function_name, child): "use a different name for registering the subkernel") scoped_functions[function_name] = CallableKernel(name=function_name, - subkernel=child) + subkernel=child.copy(target=parent.target)) # returning the parent kernel with the new scoped function dictionary return parent.copy(scoped_functions=scoped_functions, diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 89866124c..dee893715 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -52,7 +52,7 @@ def get_return_types_as_tuple(arg_id_to_dtype): """ return_arg_id_to_dtype = dict((id, dtype) for id, dtype in arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) - return_arg_pos = sorted(return_arg_id_to_dtype.keys()) + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) @@ -294,6 +294,9 @@ class TypeInferenceMapper(CombineMapper): new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + if new_arg_id_to_dtype is None: + return [] + # collecting result dtypes in order of the assignees if -1 in 
new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: if return_tuple: -- GitLab From c6be75d4c307a3b8d8078dcfc3f1cbeed5ce5646 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 9 Apr 2018 00:23:10 -0500 Subject: [PATCH 073/916] Fixes negative strides in a slice --- loopy/check.py | 63 +++++++- loopy/codegen/__init__.py | 5 - loopy/isl_helpers.py | 29 ++-- loopy/kernel/creation.py | 237 +++++++++++++++-------------- loopy/kernel/function_interface.py | 125 ++++++++------- loopy/kernel/instruction.py | 16 +- loopy/preprocess.py | 121 ++++----------- loopy/symbolic.py | 91 +++++++++-- loopy/target/c/__init__.py | 31 +++- loopy/target/cuda.py | 4 +- loopy/target/opencl.py | 21 ++- loopy/target/python.py | 4 +- loopy/transform/diff.py | 4 +- loopy/transform/register_knl.py | 78 +++------- loopy/type_inference.py | 3 +- 15 files changed, 469 insertions(+), 363 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 146391bf2..6afeb86ac 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -27,9 +27,13 @@ from six.moves import range from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper +from loopy.symbolic import WalkMapper, CombineMapper, ScopedFunction from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -55,6 +59,63 @@ def check_identifiers_in_subst_rules(knl): "kernel-global identifiers" % (knl.name, ", ".join(deps-rule_allowed_identifiers))) + +class UnScopedCallCollector(CombineMapper): + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + if not isinstance(expr.function, ScopedFunction): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if not isinstance(expr.function, ScopedFunction): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + expr.kw_parameter.values()))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+expr.kw_parameters.values())) + + def map_scoped_function(self, expr): + return frozenset([expr.name]) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_scoped(kernel): + """ Checks if all the calls in the instruction expression have been scoped, + otherwise indicate to what all calls we await signature. + """ + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnScopedCallCollector()(subst_expander( + insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' obtained -- register a " + "function or a kernel corresponding to it." % + set(unscoped_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("check_function_are_scoped not " + "implemented for %s type of instruction." 
% type(insn)) + # }}} diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e3b3d077d..2e217b779 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -516,11 +516,6 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): raise NotImplementedError("register_knl not made for %s type of " "instruction" % (str(type(insn)))) - # }}} - - # {{{ pasting the device codes generated by the auxiliary kernels to the - # first device program - codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 5a747d070..f0c37933a 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -27,7 +27,7 @@ THE SOFTWARE. from six.moves import range, zip -from loopy.diagnostic import StaticValueFindingError +from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl from islpy import dim_type @@ -62,7 +62,7 @@ def dump_space(ls): # {{{ make_slab -def make_slab(space, iname, start, stop): +def make_slab(space, iname, start, stop, step=1): zero = isl.Aff.zero_on_domain(space) if isinstance(start, (isl.Aff, isl.PwAff)): @@ -91,13 +91,24 @@ def make_slab(space, iname, start, stop): iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1) - result = (isl.BasicSet.universe(space) - # start <= iname - .add_constraint(isl.Constraint.inequality_from_aff( - iname_aff - start)) - # iname < stop - .add_constraint(isl.Constraint.inequality_from_aff( - stop-1 - iname_aff))) + if step > 0: + result = (isl.BasicSet.universe(space) + # start <= iname + .add_constraint(isl.Constraint.inequality_from_aff( + step*iname_aff - start)) + # iname < stop + .add_constraint(isl.Constraint.inequality_from_aff( + stop-1 - step*iname_aff))) + elif step < 0: + result = (isl.BasicSet.universe(space) + # start <= iname + .add_constraint(isl.Constraint.inequality_from_aff( + step*iname_aff + start)) + # iname < stop + .add_constraint(isl.Constraint.inequality_from_aff( + -stop-1 - step*iname_aff))) + else: + raise LoopyError("0 step not allowed in make_slab.") return result diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 07376b7bb..e6813aa4a 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -29,7 +29,9 @@ import numpy as np from pymbolic.mapper import CSECachingMapperMixin from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids -from loopy.symbolic import IdentityMapper, WalkMapper, CombineMapper, SubArrayRef +from loopy.symbolic import ( + IdentityMapper, WalkMapper, SubArrayRef, + RuleAwareIdentityMapper) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, @@ -45,8 +47,6 @@ from six.moves import range, zip, intern import re -from functools import reduce - import logging logger = logging.getLogger(__name__) @@ -1837,172 +1837,174 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # {{{ scope functions -class FunctionScoper(IdentityMapper): +class FunctionScoper(RuleAwareIdentityMapper): """ Converts functions known to the kernel as instances of - :class:`ScopedFunction`. + :class:`loopy.symbolic.ScopedFunction`. **Example**: If given an expression of the form ``sin(x) + unknown_function(y) + log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + - unknown_function(y) + ScopedFunction('log')(z)``. Since the - ``unknown_function`` is not known to the kernel it is not marked as a - :class:`loopy.symbolic.ScopedFunction`. 
+ unknown_function(y) + ScopedFunction('log')(z)``. """ - def __init__(self, function_ids): + def __init__(self, rule_mapping_context, function_ids): + super(FunctionScoper, self).__init__(rule_mapping_context) self.function_ids = function_ids + self.scoped_functions = {} - def map_call(self, expr): + def map_call(self, expr, expn_state): from loopy.symbolic import ScopedFunction if not isinstance(expr.function, ScopedFunction) and ( expr.function.name in self.function_ids): # The function is one of the known function hence scoping it. from pymbolic.primitives import Call + from loopy.kernel.function_interface import ScalarCallable + + # Associating the newly created ScopedFunction with a `CallableScalar` + self.scoped_functions[expr.function.name] = ScalarCallable( + expr.function.name) return Call( ScopedFunction(expr.function.name), - tuple(self.rec(child) + tuple(self.rec(child, expn_state) for child in expr.parameters)) - # This is an unknown function as of yet, not modifying it. - return super(FunctionScoper, self).map_call(expr) + # This is an unknown function as of yet, hence not modifying it. + return super(FunctionScoper, self).map_call(expr, expn_state) - def map_call_with_kwargs(self, expr): + def map_call_with_kwargs(self, expr, expn_state): from loopy.symbolic import ScopedFunction if not isinstance(expr.function, ScopedFunction) and ( expr.function.name in self.function_ids): from pymbolic.primitives import CallWithKwargs + from loopy.kernel.function_interface import ScalarCallable + + # Associating the newly created ScopedFunction with a `CallableScalar` + self.scoped_functions[expr.function.name] = ScalarCallable( + expr.function.name) return CallWithKwargs( ScopedFunction(expr.function.name), - tuple(self.rec(child) + tuple(self.rec(child, expn_state) for child in expr.parameters), dict( - (key, self.rec(val)) + (key, self.rec(val, expn_state)) for key, val in six.iteritems(expr.kw_parameters)) ) - # This is an unknown function as of yet, not modifying it. - return super(FunctionScoper, self).map_call_with_kwargs(expr) - + # This is an unknown function as of yet, hence not modifying it. + return super(FunctionScoper, self).map_call_with_kwargs(expr, + expn_state) -class ScopedFunctionCollector(CombineMapper): - """ - Mapper to collect the instances of :class:`loopy.symbolic.ScopedFunction` - in an expression. 
- - :returns: an instance of :class:`frozenset` of tuples ``(function_name, - in_kernel_callable)`` - """ - def __init__(self, already_scoped_functions=frozenset()): - self.already_scoped_functions = already_scoped_functions - - def combine(self, values): - import operator - return reduce(operator.or_, values, frozenset()) - - def map_scoped_function(self, expr): - from loopy.kernel.function_interface import CallableOnScalar - if expr.name in self.already_scoped_functions: - # functions is already scoped - return frozenset() - else: - return frozenset([(expr.name, CallableOnScalar(expr.name))]) - - def map_reduction(self, expr): - from loopy.kernel.function_interface import CallableOnScalar + def map_reduction(self, expr, expn_state): + from loopy.kernel.function_interface import ScalarCallable from loopy.library.reduction import (MaxReductionOperation, MinReductionOperation, ArgMinReductionOperation, ArgMaxReductionOperation) + if isinstance(expr.operation, MaxReductionOperation): - return frozenset([("max", CallableOnScalar("max"))]) | ( - self.rec(expr.expr)) + self.scoped_functions["max"] = ScalarCallable("max") elif isinstance(expr.operation, MinReductionOperation): - return frozenset([("min", CallableOnScalar("min"))]) | ( - self.rec(expr.expr)) + self.scoped_functions["min"] = ScalarCallable("min") elif isinstance(expr.operation, ArgMaxReductionOperation): - return frozenset([("max", CallableOnScalar("min")), ("make_tuple", - CallableOnScalar("make_tuple"))]) | ( - self.rec(expr.expr)) + self.scoped_functions["max"] = ScalarCallable("max") + self.scoped_functions["make_tuple"] = ScalarCallable("make_tuple") elif isinstance(expr.operation, ArgMinReductionOperation): - return frozenset([("min", CallableOnScalar("min")), ("make_tuple", - CallableOnScalar("make_tuple"))]) | ( - self.rec(expr.expr)) - else: - return self.rec(expr.expr) - - def map_constant(self, expr): - return frozenset() - - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant + self.scoped_functions["min"] = ScalarCallable("min") + self.scoped_functions["make_tuple"] = ScalarCallable("make_tuple") + return super(FunctionScoper, self).map_reduction(expr, expn_state) -def scope_functions(kernel): - func_ids = kernel.function_identifiers - from loopy.kernel.instruction import CInstruction, _DataObliviousInstruction - function_scoper = FunctionScoper(func_ids) - scoped_function_collector = ScopedFunctionCollector( - kernel.scoped_functions) - new_scoped_functions = set() +def scope_functions(kernel, function_identifiers=None): + """ + Returns a kernel with the pymbolic nodes involving known functions realized + as instances of :class:`loopy.symbolic.ScopedFunction`. - new_insns = [] + :arg function_identifiers: The functions which are to be looked up in the + kernel. 
+ """ + if function_identifiers is None: + # Adding the default fucnction identifiers if none provided + function_identifiers = kernel.function_identifiers - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - new_insn = insn.copy(expression=function_scoper(insn.expression)) - new_scoped_functions.update(scoped_function_collector( - new_insn.expression)) - new_insns.append(new_insn) - elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): - new_insns.append(insn) - else: - raise NotImplementedError("scope_functions not implemented for %s" % - type(insn)) + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) - substitutions_with_scoped_expr = {} + function_scoper = FunctionScoper(rule_mapping_context, function_identifiers) - for name, rule in kernel.substitutions.items(): - scoped_rule = rule.copy( - expression=function_scoper(rule.expression)) - substitutions_with_scoped_expr[name] = scoped_rule - new_scoped_functions.update(scoped_function_collector( - scoped_rule.expression)) + # scoping fucntions and collecting the scoped functions + kernel_with_scoped_functions = function_scoper.map_kernel(kernel) - # Need to combine the scoped functions into a dict + # updating the functions collected during the scoped functions updated_scoped_functions = kernel.scoped_functions.copy() - updated_scoped_functions.update(dict(new_scoped_functions)) - return kernel.copy(instructions=new_insns, - scoped_functions=updated_scoped_functions, - substitutions=substitutions_with_scoped_expr) + updated_scoped_functions.update(function_scoper.scoped_functions) + + return kernel_with_scoped_functions.copy( + scoped_functions=updated_scoped_functions) # }}} # {{{ slice to sub array ref -def get_slice_params(expr, domain_length): +def get_slice_params(slice, dimension_length): """ - Either reads the params from the slice or initiates the value to defaults. + Returns the slice parameters across an axes spanning *domain_length* as a + tuple of ``(start, stop, step)``. + + :arg slice: An instance of :class:`pymbolic.primitives.Slice`. + :arg dimension_length: The axes length swept by *slice*. """ - start, stop, step = expr.start, expr.stop, expr.step + from pymbolic.primitives import Slice + assert isinstance(slice, Slice) + start, stop, step = slice.start, slice.stop, slice.step + + if step is None: + step = 1 + + if step == 0: + raise LoopyError("Slice cannot have 0 step size.") if start is None: - start = 0 + if step > 0: + start = 0 + else: + start = dimension_length-1 if stop is None: - stop = domain_length - - if step is None: - step = 1 + if step > 0: + stop = dimension_length + else: + stop = -1 return start, stop, step class SliceToInameReplacer(IdentityMapper): """ - Mapper that converts slices to instances of :class:`SubArrayRef`. + Converts slices to instances of :class:`loopy.symbolic.SubArrayRef`. + + :attribute var_name_gen: + + Variable name generator, in order to generate unique inames within the + kernel domain. + + :attribute knl: + + An instance of :clas:`loopy.LoopKernel` + + :attribute iname_domains: + + An instance of :class:`dict` to store the slices enountered in the + expressions as a mapping from ``iname`` to a tuple of ``(start, stop, + step)``, which describes the affine constraint imposed on the ``iname`` + by the corresponding slice notation its intended to replace. 
+ + :Example: + + ``x[:, i, :, j]`` would be mapped to ``[islice_0, islice_1]: + x[islice_0, i, islice_1, j]`` + """ def __init__(self, knl, var_name_gen): self.var_name_gen = var_name_gen @@ -2028,7 +2030,11 @@ class SliceToInameReplacer(IdentityMapper): index, domain_length) self.iname_domains[unique_var_name] = (start, stop, step) - updated_index.append(step*Variable(unique_var_name)) + if step > 0: + updated_index.append(step*Variable(unique_var_name)) + else: + updated_index.append(start+step*Variable(unique_var_name)) + swept_inames.append(Variable(unique_var_name)) else: updated_index.append(index) @@ -2042,7 +2048,8 @@ class SliceToInameReplacer(IdentityMapper): def get_iname_domain_as_isl_set(self): """ - Returns the extra domain constraints imposed by the slice inames. + Returns the extra domain constraints imposed by the slice inames, + recorded in :attr:`iname_domains` """ if not self.iname_domains: return None @@ -2052,20 +2059,17 @@ class SliceToInameReplacer(IdentityMapper): set=list(self.iname_domains.keys())) iname_set = isl.BasicSet.universe(space) + from loopy.isl_helpers import make_slab for iname, (start, stop, step) in self.iname_domains.items(): - iname_set = (iname_set - .add_constraint(isl.Constraint.ineq_from_names(space, {1: - -start, iname: step})) - .add_constraint(isl.Constraint.ineq_from_names(space, {1: - stop-1, iname: -step}))) + iname_set = iname_set & make_slab(space, iname, start, stop, step) return iname_set def realize_slices_as_sub_array_refs(kernel): """ - Transformation that returns a kernel with the instances of - :class:`pymbolic.primitives.Slice` to `loopy.symbolic.SubArrayRef` + Returns a kernel with the instances of :class:`pymbolic.primitives.Slice` + interpreted as `loopy.symbolic.SubArrayRef`. """ unique_var_name_generator = kernel.get_var_name_generator() slice_replacer = SliceToInameReplacer(kernel, unique_var_name_generator) @@ -2074,14 +2078,15 @@ def realize_slices_as_sub_array_refs(kernel): for insn in kernel.instructions: if isinstance(insn, CallInstruction): new_expr = slice_replacer(insn.expression) - new_assignees = slice_replacer(insn.assignees) + new_assignees = tuple(slice_replacer(assignee) for assignee in + insn.assignees) new_insns.append(insn.copy(assignees=new_assignees, expression=new_expr)) elif isinstance(insn, (CInstruction, MultiAssignmentBase, _DataObliviousInstruction)): new_insns.append(insn) else: - raise NotImplementedError("parse_slices not implemented for %s" % + raise NotImplementedError("Unknown type of instruction -- %s" % type(insn)) slice_iname_domains = slice_replacer.get_iname_domain_as_isl_set() @@ -2435,7 +2440,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_written_variable_names(knl) # Function Lookup - knl = scope_functions(knl) + knl = scope_functions(knl, knl.function_identifiers) from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 9f24e9c43..a70ea2af6 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -9,8 +9,10 @@ in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 
Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -40,52 +42,46 @@ from loopy.symbolic import (IdentityMapper, ScopedFunction, # {{{ argument descriptors -class ArgDescriptor(ImmutableRecord): - """Base type of argument description about the variable type that is supposed to - be encountered in a function signature. - .. attribute:: mem_scope - .. attribute:: shape - .. attribute:: dim_tags - """ +class ValueArgDescriptor(ImmutableRecord): + pass - def __init__(self, - mem_scope=None, - shape=None, - dim_tags=None): - super(ArgDescriptor, self).__init__(mem_scope=mem_scope, - shape=shape, - dim_tags=dim_tags) +class ArrayArgDescriptor(ImmutableRecord): + """ + Records information about an array argument to an in-kernel callable, to be + passed to and returned from + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used + for matching shape and scope of caller and callee kernels. -class ValueArgDescriptor(ArgDescriptor): - def __init__(self): - super(ValueArgDescriptor, self).__init__() + ..attribute:: shape - def __str__(self): - return "ValueArgDescriptor" + Shape of the array. - def __repr__(self): - return "ValueArgDescriptor" + .. attribute:: mem_scope + Can be either "LOCAL" or "GLOBAL", definiing where the argument is + supposed to reside in the device memory. -class ArrayArgDescriptor(ArgDescriptor): - """ - .. attribute:: mem_scope .. attribute:: dim_tags + + A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` """ - def __init__(self, - shape=None, - mem_scope=None, - dim_tags=None): + def __init__(self, shape, mem_scope, dim_tags): # {{{ sanity checks + from loopy.kernel.array import FixedStrideArrayDimTag + assert isinstance(shape, tuple) + assert isinstance(mem_scope, str) + assert isinstance(dim_tags, tuple) + assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in + dim_tags) # }}} - super(ArgDescriptor, self).__init__(shape=shape, + super(ArrayArgDescriptor, self).__init__(shape=shape, mem_scope=mem_scope, dim_tags=dim_tags) @@ -110,6 +106,10 @@ class ArrayArgDescriptor(ArgDescriptor): # {{{ helper function for callable kenrel -- kw_to_pos def get_kw_pos_association(kernel): + """ + Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments present of + the kernel. + """ kw_to_pos = {} pos_to_kw = {} @@ -117,14 +117,18 @@ def get_kw_pos_association(kernel): write_count = -1 for arg in kernel.args: - if arg.name in kernel.get_written_variables(): - kw_to_pos[arg.name] = write_count - pos_to_kw[write_count] = arg.name - write_count -= 1 - else: + # FIXME: Confused about the written and read variables ordering. + # Confirm it with Prof. Andreas. + if arg.name not in kernel.get_written_variables(): kw_to_pos[arg.name] = read_count pos_to_kw[read_count] = arg.name read_count += 1 + else: + # These args are not read in the kernel. Hence, assuming that they + # must be returned. + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 return kw_to_pos, pos_to_kw @@ -135,6 +139,7 @@ def get_kw_pos_association(kernel): class InKernelCallable(ImmutableRecord): """ + Describes a callable encountered in a kernel. .. attribute:: name @@ -147,9 +152,9 @@ class InKernelCallable(ImmutableRecord): .. 
attribute:: arg_id_to_descr - A mapping which gives indicates the argument shape and `dim_tags` it + A mapping which gives indicates the argument shape and ``dim_tags`` it would be responsible for generating code. These parameters would be set, - once it is shape and stride(`dim_tags`) specialized. + once it is shape and stride(``dim_tags``) specialized. .. note:: @@ -253,7 +258,12 @@ class InKernelCallable(ImmutableRecord): # {{{ callables on scalar -class CallableOnScalar(InKernelCallable): +class ScalarCallable(InKernelCallable): + """ + Records the information about a scalar callable encountered in a kernel. + The :meth:`ScalarCallable.with_types` is intended to assist with type + specialization of the funciton. + """ fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", @@ -283,7 +293,7 @@ class CallableOnScalar(InKernelCallable): if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: raise LoopyError("Overwriting a specialized" " function is illegal--maybe start with new instance of" - " CallableOnScalar?") + " ScalarCallable?") if self.name in kernel.target.get_device_ast_builder( ).function_identifiers(): @@ -313,8 +323,6 @@ class CallableOnScalar(InKernelCallable): def with_descrs(self, arg_id_to_descr): - # This is a scalar call - # need to assert that the name is in funtion indentifiers arg_id_to_descr[-1] = ValueArgDescriptor() return self.copy(arg_id_to_descr=arg_id_to_descr) @@ -325,11 +333,6 @@ class CallableOnScalar(InKernelCallable): # {{{ code generation - def generate_preambles(self, target): - """ This would generate the target specific preamble. - """ - raise NotImplementedError() - def emit_call(self, expression_to_code_mapper, expression, target): assert self.is_ready_for_codegen() @@ -395,7 +398,7 @@ class CallableOnScalar(InKernelCallable): for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)): if tgt_dtype != expression_to_code_mapper.infer_type(a): - raise LoopyError("Type Mismach in funciton %s. Expected: %s" + raise LoopyError("Type Mismatch in funciton %s. Expected: %s" "Got: %s" % (self.name, tgt_dtype, expression_to_code_mapper.infer_type(a))) c_parameters.append( @@ -415,6 +418,20 @@ class CallableOnScalar(InKernelCallable): # {{{ callable kernel class CallableKernel(InKernelCallable): + """ + Records information about in order to make the callee kernel compatible to be + called from a caller kernel. The :meth:`loopy.register_callable_kernel` + should be called in order to initiate association between a funciton in + caller kernel and the callee kernel. + + The :meth:`CallableKernel.with_types` should be called in order to match + the ``dtypes`` of the arguments that are shared between the caller and the + callee kernel. + + The :meth:`CallableKernel.with_descrs` should be called in order to match + the ``dim_tags, shape, mem_scopes`` of the arguments shared between the + caller and the callee kernel. + """ fields = set(["name", "subkernel", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) @@ -465,16 +482,11 @@ class CallableKernel(InKernelCallable): expect_completion=True) new_arg_id_to_dtype = {} - read_count = 0 - write_count = -1 for arg in specialized_kernel.args: + # associating the updated_arg_id_to_dtype with keyword as well as + # positional id. 
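+            # For instance (a sketch; the argument names and dtypes are made
+            # up), a callee kernel with args (x, y_out) where only y_out is
+            # written would, per get_kw_pos_association() above, end up with
+            #     {"x": dtype_x, 0: dtype_x, "y_out": dtype_y, -1: dtype_y}
+            # so callers can look a result up by keyword or by position.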
new_arg_id_to_dtype[arg.name] = arg.dtype - if arg.name in specialized_kernel.get_written_variables(): - new_arg_id_to_dtype[write_count] = arg.dtype - write_count -= 1 - else: - new_arg_id_to_dtype[read_count] = arg.dtype - read_count += 1 + new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype # Returning the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype @@ -573,7 +585,6 @@ class CallableKernel(InKernelCallable): for par, par_dtype in zip( parameters, par_dtypes)] - from pymbolic import var return var(self.name_in_target)(*c_parameters) # }}} @@ -598,9 +609,9 @@ def next_indexed_name(name): class ScopedFunctionNameChanger(RuleAwareIdentityMapper): """ - Mapper that takes in a mapping `expr_to_new_names` and maps the + Mapper that takes in a mapping ``expr_to_new_names`` and maps the corresponding expression to the new names, which correspond to the names in - `kernel.scoped_functions`. + ``kernel.scoped_functions``. """ def __init__(self, rule_mapping_context, expr_to_new_names, subst_expander): diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index fb0c6690b..c81553b45 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1047,6 +1047,9 @@ class CallInstruction(MultiAssignmentBase): def subscript_contains_slice(subscript): + """Return *True* if the *subscript* contains an instance of + :class:`pymbolic.primitives.Slice` as of its indices. + """ from pymbolic.primitives import Subscript, Slice assert isinstance(subscript, Subscript) return any(isinstance(index, Slice) for index in subscript.index_tuple) @@ -1071,12 +1074,20 @@ def is_array_call(assignees, expression): def get_array_call_assignee(assignee): + """ + Converts the assignee subscript or variable as a SubArrayRef. + """ from pymbolic.primitives import Subscript, Variable from loopy.symbolic import SubArrayRef if isinstance(assignee, SubArrayRef): return assignee elif isinstance(assignee, Subscript): - return SubArrayRef((), assignee) + if subscript_contains_slice(assignee): + # Slice subscripted array are treated as SubArrayRef in the kernel + # Hence, making the behavior similar to that of `SubArrayref` + return assignee + else: + return SubArrayRef((), assignee) elif isinstance(assignee, Variable): return SubArrayRef((), Subscript(assignee, 0)) else: @@ -1105,6 +1116,9 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): temp_var_types=temp_var_types, **kwargs) else: + # In the case of an array call, it is important to have each + # assignee as an instance of SubArrayRef. 
If not given as a + # SubArrayRef return CallInstruction( assignees=tuple(get_array_call_assignee(assignee) for assignee in assignees), diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 9e8956a59..49103931f 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2102,68 +2102,6 @@ def check_atomic_loads(kernel): # }}} -# {{{ check for unscoped calls - -class UnScopedCallCollector(CombineMapper): - - def combine(self, values): - import operator - return reduce(operator.or_, values, frozenset()) - - def map_call(self, expr): - if not isinstance(expr.function, ScopedFunction): - return (frozenset([expr.function.name]) | - self.combine((self.rec(child) for child in expr.parameters))) - else: - return self.combine((self.rec(child) for child in expr.parameters)) - - def map_call_with_kwargs(self, expr): - if not isinstance(expr.function, ScopedFunction): - return (frozenset([expr.function.name]) | - self.combine((self.rec(child) for child in expr.parameters - + expr.kw_parameter.values()))) - else: - return self.combine((self.rec(child) for child in - expr.parameters+expr.kw_parameters.values())) - - def map_scoped_function(self, expr): - return frozenset([expr.name]) - - def map_constant(self, expr): - return frozenset() - - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant - - -def check_functions_are_scoped(kernel): - """ Checks if all the calls in the instruction expression have been scoped, - otherwise indicate to what all calls we await signature. - """ - - from loopy.symbolic import SubstitutionRuleExpander - subst_expander = SubstitutionRuleExpander(kernel.substitutions) - - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - unscoped_calls = UnScopedCallCollector()(subst_expander( - insn.expression)) - if unscoped_calls: - raise LoopyError("Unknown function '%s' obtained -- register a " - "function or a kernel corresponding to it." % - set(unscoped_calls).pop()) - elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("check_function_are_scoped not " - "implemented for %s type of instruction." % type(insn)) - - -# }}} - - # {{{ arg_descr_inference def get_arg_description_from_sub_array_ref(sub_array, kernel): @@ -2172,15 +2110,18 @@ def get_arg_description_from_sub_array_ref(sub_array, kernel): :class:`ArrayArgDescriptor`. """ from loopy.kernel.function_interface import ArrayArgDescriptor + # from loopy.kernel.data import temp_var_scope name = sub_array.subscript.aggregate.name if name in kernel.temporary_variables: + # mem_scope = temp_var_scope.LOCAL mem_scope = "LOCAL" arg = kernel.temporary_variables[name] assert name not in kernel.arg_dict else: assert name in kernel.arg_dict + # mem_scope = temp_var_scope.GLOBAL mem_scope = "GLOBAL" arg = kernel.arg_dict[name] @@ -2192,7 +2133,7 @@ def get_arg_description_from_sub_array_ref(sub_array, kernel): shape=sub_shape) -class ArgDescriptionInferer(CombineMapper): +class ArgDescrInferenceMapper(CombineMapper): """ Returns a set with elements as instances of :class:`tuple` (expr, in_kenrel_callable). The mapped `in_kenrel_callable` of the :class:`InKernelCallable` are descriptor specialized for the given @@ -2303,7 +2244,7 @@ def infer_arg_descr(kernel): shape and dimensions of the arguments too. 
""" - arg_description_modifier = ArgDescriptionInferer(kernel) + arg_description_modifier = ArgDescrInferenceMapper(kernel) pymbolic_calls_to_functions = set() for insn in kernel.instructions: @@ -2336,9 +2277,13 @@ def infer_arg_descr(kernel): # }}} -# {{{ final sweep over the callables to make them ready for codegen +# {{{ catching functions that are not ready for codegen -class ReadyForCodegen(CombineMapper): +class FunctionsNotReadyForCodegenCollector(CombineMapper): + """ + Returns all instances of function calls in an expression which are + not ready for code generation. + """ def __init__(self, kernel): self.kernel = kernel @@ -2376,48 +2321,48 @@ class ReadyForCodegen(CombineMapper): map_type_cast = map_constant -def specialize_incomplete_callables(kernel): +def make_functions_ready_for_codegen(kernel): """ - Transformation necessary to type-specialize the callables which are missed - in type inference. For example consider: - ``` - knl = lp.make_kernel( - "{[i]: 0<=i<16}", - "a[i] = sin[b[i]]", - [lp.GlobalArg('a', dtype=np.float64), - lp.GlobalArg('b', dtype=np.float64)]) - ``` - In this case, none of the instructions undergo type inference as the type - inference is already resolved. But this would be a problem during - code-generation as `sin` is not type specialized. + Specializes the functions in the kernel that are missed during type + inference. + + .. code:: python + knl = lp.make_kernel( + "{[i]: 0<=i<16}", + "a[i] = sin(b[i])", + [lp.GlobalArg('a', dtype=np.float64), + lp.GlobalArg('b', dtype=np.float64)]) + + In the above case, none of the instructions undergo type-specialization, as + all the arguments' types have been realized. But, this would be a problem + during the code generation phase as ``sin`` did not undergo type + specialization, and hence must be fixed through this function. """ from loopy.type_inference import TypeInferenceMapper from loopy.symbolic import SubstitutionRuleExpander from loopy.kernel.function_interface import ( register_pymbolic_calls_to_knl_callables) - ready_for_codegen = ReadyForCodegen(kernel) + unready_functions_collector = FunctionsNotReadyForCodegenCollector(kernel) subst_expander = SubstitutionRuleExpander(kernel.substitutions) type_inf_mapper = TypeInferenceMapper(kernel) - inferred_functions = {} for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): expr = subst_expander(insn.expression) - if not ready_for_codegen(expr): - # only trying to specialize the functions which are not ready - # for codegen + if not unready_functions_collector(expr): + # Infer the type of the functions that are not type specialized. type_inf_mapper(expr) - inferred_functions.update(type_inf_mapper.specialized_functions) elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): pass + else: NotImplementedError("Unknown Instruction") return register_pymbolic_calls_to_knl_callables(kernel, - inferred_functions) + type_inf_mapper.specialized_functions) # }}} @@ -2500,8 +2445,8 @@ def preprocess_kernel(kernel, device=None): # call. kernel = infer_arg_descr(kernel) - # try specializing callables one last time. - kernel = specialize_incomplete_callables(kernel) + # type specialize functions that were missed during the type inference. 
+ kernel = make_functions_ready_for_codegen(kernel) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 5dce66ac8..c455d08fd 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -108,7 +108,8 @@ class IdentityMapperMixin(object): return type(expr)(expr.type, self.rec(expr.child)) def map_sub_array_ref(self, expr, *args): - return SubArrayRef(expr.swept_inames, expr.subscript) + return SubArrayRef(self.rec(expr.swept_inames, *args), + self.rec(expr.subscript, *args)) map_type_cast = map_type_annotation @@ -683,6 +684,35 @@ class ScopedFunction(p.Variable): return StringifyMapper +class EvaluatorWithDeficientContext(PartialEvaluationMapper): + """Evaluation Mapper that does not need values of all the variables + involved in the expression. + + Returns the expression with the values mapped from :attr:`context`. + """ + def map_variable(self, expr): + if expr.name in self.context: + return self.context[expr.name] + else: + return expr + + +class VariableInAnExpression(CombineMapper): + def __init__(self, variables_to_search): + assert(all(isinstance(variable, p.Variable) for variable in + variables_to_search)) + self.variables_to_search = variables_to_search + + def combine(self, values): + return any(values) + + def map_variable(self, expr): + return expr in self.variables_to_search + + def map_constant(self, expr): + return False + + class SubArrayRef(p.Expression): """Represents a generalized sliced notation of an array. @@ -697,7 +727,7 @@ class SubArrayRef(p.Expression): init_arg_names = ("swept_inames", "subscript") - def __init__(self, swept_inames=None, subscript=None): + def __init__(self, swept_inames, subscript): # {{{ sanity checks @@ -717,22 +747,54 @@ class SubArrayRef(p.Expression): self.subscript = subscript def get_begin_subscript(self): - starting_inames = [] - for iname in self.subscript.index_tuple: - if iname in self.swept_inames: - starting_inames.append(parse('0')) - else: - starting_inames.append(iname) - return p.Subscript(self.subscript.aggregate, tuple(starting_inames)) + """ + Returns an instance of :class:`pymbolic.primitives.Subscript`, the + beginning subscript of the array swept by the *SubArrayRef*. + + **Example:** Consider ``[i, k]: a[i, j, k, l]``. The beginning + subscript would be ``a[0, j, 0, l]`` + """ + swept_inames_to_zeros = dict( + (swept_iname.name, 0) for swept_iname in self.swept_inames) + + return EvaluatorWithDeficientContext(swept_inames_to_zeros)( + self.subscript) def get_sub_array_dim_tags_and_shape(self, arg_dim_tags, arg_shape): - """ Gives the dim tags for the inner inames. - This would be used for stride calculation in the child kernel. - This might need to go, once we start calculating the stride length - using the upper and lower bounds of the involved inames. + """Returns the dim tags for the inner inames. + + .. arg:: arg_dim_tags + + a list of :class:`loopy.kernel.array.FixedStrideArrayDimTag` of the + argument referred by the *SubArrayRef*. + + .. arg:: arg_shape + + a tuple indicating the shape of the argument referred by the + *SubArrayRef*. """ from loopy.kernel.array import FixedStrideArrayDimTag as DimTag sub_dim_tags = [] + sub_shape = [] # need to figure out an elegant way of finding this out. 
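+        # Worked example (a sketch, assuming row-major strides): for
+        # ``[i, k]: a[i, j, k, l]`` with a.shape == (2, 3, 4, 5), i.e.
+        # arg_dim_tags strides (60, 20, 5, 1), the linearized index below is
+        # 60*i + 20*j + 5*k + l. Collecting the coefficients of the swept
+        # inames i and k gives strides (60, 5), and the swept axes contribute
+        # sub_shape == (2, 4).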
+ linearized_index = sum(dim_tag.stride*iname for dim_tag, iname in + zip(arg_dim_tags, self.subscript.index_tuple)) + + print(self.subscript) + print(linearized_index) + + strides_as_dict = CoefficientCollector(tuple(iname.name for iname in + self.swept_inames))(linearized_index) + sub_dim_tags = tuple(DimTag(strides_as_dict[iname]) for iname in + self.swept_inames) + sub_shape = tuple(dim_shape for dim_shape, index in zip( + arg_shape, self.subscript.index_tuple) if VariableInAnExpression( + self.swept_inames)(index)) + + return sub_dim_tags, sub_shape + """ + # Trying out new things + from loopy.kernel.array import FixedStrideArrayDimTag as DimTag + sub_dim_tags = [] sub_shape = [] for dim_tag, axis_length, iname in zip( arg_dim_tags, arg_shape, self.subscript.index_tuple): @@ -740,7 +802,8 @@ class SubArrayRef(p.Expression): sub_dim_tags.append(DimTag(dim_tag.stride)) sub_shape.append(axis_length) - return sub_dim_tags, tuple(sub_shape) + return tuple(sub_dim_tags), tuple(sub_shape) + """ def __getinitargs__(self): return (self.swept_inames, self.subscript) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 5ee7401c3..b9690b511 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -427,18 +427,37 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): return None -def c_with_types(in_knl_callable, arg_id_to_dtype, modify_name=False): - # Function mangler for math functions defined in C standard +def with_types_for_c_target(in_knl_callable, arg_id_to_dtype, modify_name=False): + """Target facing function for C-like targets in order to map the math + functions encountered in a kernel to the equivalent function signature. + + .. arg in_knl_callable:: + + An instance of :class:`loopy.kernel.function_interface.ScalarCallable`, + which is supposed to be mapped in the target. + + .. arg arg_id_to_dtype:: + + Same as the maapping in :meth:`ScalarCallable.with_types` + + .. arg modify_name:: + + Must be set *True* for C and Cuda targets and *False* for OpenCL targets. + + :return: An updated instance of + :class:`loopy.kernel.function_interface.ScalarCallable` tuned for the + target. Or *None* if could not find a corresponding C-function for the given + pair *in_knl_callable*, *arg_id_to_dtype*. + """ # Convert abs, min, max to fabs, fmin, fmax. # If modify_name is set to True, function names are modified according to # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL name = in_knl_callable.name if name in ["abs", "min", "max"]: name = "f" + name - # unitary functions + # unary functions if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "tan"]: @@ -540,7 +559,7 @@ class CASTBuilder(ASTBuilderBase): ]) def with_types(self, in_knl_callable, arg_id_to_dtype): - new_callable = c_with_types(in_knl_callable, arg_id_to_dtype, + new_callable = with_types_for_c_target(in_knl_callable, arg_id_to_dtype, modify_name=True) if new_callable is not None: return new_callable @@ -957,7 +976,7 @@ class CASTBuilder(ASTBuilderBase): from cgen import ExpressionStatement # FIXME: Depending on the function this can be either an # ExpressionStatement or Assignment. - # Refer: CallableOnScalar::emit_call_insn. It is discussed in detail + # Refer: ScalarCallable::emit_call_insn. It is discussed in detail # over there. 
return ExpressionStatement( CExpression(self.get_c_expression_to_code_mapper(), diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 75606945a..d2dac07a0 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -30,7 +30,7 @@ from pytools import memoize_method from loopy.target.c import CTarget, CASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper -from loopy.target.c import (c_math_identifiers, c_with_types) +from loopy.target.c import (c_math_identifiers, with_types_for_c_target) from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import temp_var_scope @@ -295,7 +295,7 @@ class CUDACASTBuilder(CASTBuilder): if new_callable is not None: return new_callable - new_callable = c_with_types(in_knl_callable, arg_id_to_dtype, + new_callable = with_types_for_c_target(in_knl_callable, arg_id_to_dtype, modify_name=True) if new_callable is not None: return new_callable diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index af194335f..60546a7a6 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -32,7 +32,7 @@ from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.target.c import (DTypeRegistryWrapper, c_math_identifiers, - c_math_mangler, c_with_types) + c_math_mangler, with_types_for_c_target) from loopy.kernel.data import temp_var_scope, CallMangleInfo from pymbolic import var @@ -229,7 +229,20 @@ def opencl_function_mangler(kernel, name, arg_dtypes): return None -def opencl_with_types(in_knl_callable, arg_id_to_dtype): +def with_types_for_opencl_target(in_knl_callable, arg_id_to_dtype): + """Returns an updated ``in_knl_callable`` specifically tuned for OpenCL + targets. Returns *None*, if does not match with any of the OpenCL function + signatures. + + .. arg in_knl_callable:: + + An instance of :class:`loopy.kernel.function_interface.ScalarCallable`. + + .. arg arg_id_to_dtype:: + + A mapping which provides information from argument id to its type. Same + format as in :meth:`ScalarCallable.with_types`. 
+ """ name = in_knl_callable.name @@ -489,11 +502,11 @@ class OpenCLCASTBuilder(CASTBuilder): ]) def with_types(self, in_knl_callable, arg_id_to_dtype): - new_callable = opencl_with_types(in_knl_callable, arg_id_to_dtype) + new_callable = with_types_for_opencl_target(in_knl_callable, arg_id_to_dtype) if new_callable is not None: return new_callable - new_callable = c_with_types(in_knl_callable, arg_id_to_dtype) + new_callable = with_types_for_c_target(in_knl_callable, arg_id_to_dtype) if new_callable is not None: return new_callable return super(OpenCLCASTBuilder, self).with_types(in_knl_callable, diff --git a/loopy/target/python.py b/loopy/target/python.py index dcc1be9bc..8d1a0345b 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -202,8 +202,8 @@ class PythonASTBuilderBase(ASTBuilderBase): c_math_identifiers()) def with_types(self, in_knl_callable, arg_id_to_dtype): - from loopy.target.c import c_with_types - new_callable = c_with_types(in_knl_callable, arg_id_to_dtype) + from loopy.target.c import with_types_for_c_target + new_callable = with_types_for_c_target(in_knl_callable, arg_id_to_dtype) if new_callable is not None: return new_callable return super(PythonASTBuilderBase, self).with_types(in_knl_callable, diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 86bc056e9..d0edcfd78 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -402,8 +402,8 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", # For example differentiating `sin(x)` -> `cos(x)`. Hence we would need to # scope `cos(x)`. from loopy.kernel.creation import scope_functions - differentiated_scoped_kernel = ( - scope_functions(diff_context.get_new_kernel())) + differentiated_scoped_kernel = scope_functions( + diff_context.get_new_kernel()) return differentiated_scoped_kernel, result diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 38615ed70..49b19fd89 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -23,13 +23,9 @@ THE SOFTWARE. """ from loopy.kernel import LoopKernel -from loopy.kernel.creation import FunctionScoper from loopy.diagnostic import LoopyError from loopy.kernel.function_interface import CallableKernel -from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, - CInstruction, _DataObliviousInstruction) - __doc__ = """ .. currentmodule:: loopy @@ -39,70 +35,42 @@ __doc__ = """ # {{{ main entrypoint -def register_callable_kernel(parent, function_name, child): - """ - The purpose of this transformation is so that one can inoke the child - kernel in the parent kernel. - - :arg parent - - This is the "main" kernel which will mostly remain unaltered and one - can interpret it as stitching up the child kernel in the parent kernel. - - :arg function_name - - The name of the function call with which the child kernel must be - associated in the parent kernel - - :arg child +def register_callable_kernel(caller_kernel, function_name, callee_kernel): + """Returns a copy of *caller_kernel* which identifies *function_name* in an + expression as a call to *callee_kernel*. - This is like a function in every other language and this might be - invoked in one of the instructions of the parent kernel. - - ..note:: - - One should note that the kernels would go under stringent compatibilty - tests so that both of them can be confirmed to be made for each other. + :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. 
+ :arg function_name: An instance of :class:`str`. + :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. """ # {{{ sanity checks - assert isinstance(parent, LoopKernel) - assert isinstance(child, LoopKernel) + assert isinstance(caller_kernel, LoopKernel) + assert isinstance(callee_kernel, LoopKernel) assert isinstance(function_name, str) - # }}} + if function_name in caller_kernel.function_identifiers: + raise LoopyError("%s is being used a default function " + "identifier--maybe use a different function name in order to " + "associate with a callable kernel." % function_name) - # scoping the function - function_scoper = FunctionScoper(set([function_name])) - new_insns = [] - - for insn in parent.instructions: - if isinstance(insn, CallInstruction): - new_insn = insn.copy(expression=function_scoper(insn.expression)) - new_insns.append(new_insn) - elif isinstance(insn, (_DataObliviousInstruction, MultiAssignmentBase, - CInstruction)): - new_insns.append(insn) - else: - raise NotImplementedError("scope_functions not implemented for %s" % - type(insn)) - - # adding the scoped function to the scoped function dict of the parent - # kernel. + # }}} - scoped_functions = parent.scoped_functions.copy() + # now we know some new functions, and hence scoping them. + from loopy.kernel.creation import scope_functions - if function_name in scoped_functions: - raise LoopyError("%s is already being used as a funciton name -- maybe" - "use a different name for registering the subkernel") + # scoping the function corresponding to kernel call + caller_kernel = scope_functions(caller_kernel, set([function_name])) + updated_scoped_functions = caller_kernel.scoped_functions - scoped_functions[function_name] = CallableKernel(name=function_name, - subkernel=child.copy(target=parent.target)) + # making the target of the child kernel to be same as the target of parent + # kernel. + updated_scoped_functions[function_name] = CallableKernel(name=function_name, + subkernel=callee_kernel.copy(target=caller_kernel.target)) # returning the parent kernel with the new scoped function dictionary - return parent.copy(scoped_functions=scoped_functions, - instructions=new_insns) + return caller_kernel.copy(scoped_functions=updated_scoped_functions) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index dee893715..8e36a0a96 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -300,6 +300,7 @@ class TypeInferenceMapper(CombineMapper): # collecting result dtypes in order of the assignees if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: if return_tuple: + print(get_return_types_as_tuple(new_arg_id_to_dtype)) return [get_return_types_as_tuple(new_arg_id_to_dtype)] else: return [new_arg_id_to_dtype[-1]] @@ -535,7 +536,7 @@ def infer_unknown_types(kernel, expect_completion=False): if expect_completion: # if completion is expected, then it is important that all the # callables are scoped. - from loopy.preprocess import check_functions_are_scoped + from loopy.check import check_functions_are_scoped check_functions_are_scoped(kernel) from functools import partial -- GitLab From 8edfa5285dca489d66a6677b6714cd1b7e977f8c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 9 Apr 2018 16:18:40 -0500 Subject: [PATCH 074/916] Better error handling for sub array refs. 
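The sweep of a sub array ref must pair each swept iname with exactly one
axis of the subscript. A short illustration of the rule this patch enforces
(the array names are made up):

    [i, k]: a[i, j, k, l]   # fine: i and k each sweep their own axis
    [i]: a[i, i]            # rejected: one swept iname spans two axes

For the rejected form, ``get_sub_array_dim_tags_and_shape`` finds two swept
axes but only one swept iname, so it now raises a ``LoopyError`` instead of
silently producing an inconsistent shape/stride pair.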
--- loopy/symbolic.py | 23 ++++++----------------- loopy/type_inference.py | 1 - 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index c455d08fd..d13f1f558 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -775,13 +775,10 @@ class SubArrayRef(p.Expression): """ from loopy.kernel.array import FixedStrideArrayDimTag as DimTag sub_dim_tags = [] - sub_shape = [] # need to figure out an elegant way of finding this out. + sub_shape = [] linearized_index = sum(dim_tag.stride*iname for dim_tag, iname in zip(arg_dim_tags, self.subscript.index_tuple)) - print(self.subscript) - print(linearized_index) - strides_as_dict = CoefficientCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) sub_dim_tags = tuple(DimTag(strides_as_dict[iname]) for iname in @@ -790,20 +787,12 @@ class SubArrayRef(p.Expression): arg_shape, self.subscript.index_tuple) if VariableInAnExpression( self.swept_inames)(index)) - return sub_dim_tags, sub_shape - """ - # Trying out new things - from loopy.kernel.array import FixedStrideArrayDimTag as DimTag - sub_dim_tags = [] - sub_shape = [] - for dim_tag, axis_length, iname in zip( - arg_dim_tags, arg_shape, self.subscript.index_tuple): - if iname in self.swept_inames: - sub_dim_tags.append(DimTag(dim_tag.stride)) - sub_shape.append(axis_length) + if len(sub_shape) != len(self.swept_inames): + # Not allowed something like: [i]: a[i, i] + raise LoopyError("Number of axes swept must be equal to the number " + "of inames declared for sweeping.") - return tuple(sub_dim_tags), tuple(sub_shape) - """ + return sub_dim_tags, sub_shape def __getinitargs__(self): return (self.swept_inames, self.subscript) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 8e36a0a96..233da62d1 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -300,7 +300,6 @@ class TypeInferenceMapper(CombineMapper): # collecting result dtypes in order of the assignees if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: if return_tuple: - print(get_return_types_as_tuple(new_arg_id_to_dtype)) return [get_return_types_as_tuple(new_arg_id_to_dtype)] else: return [new_arg_id_to_dtype[-1]] -- GitLab From 7a38cf5f2d66e18e86384789f22fc75ad2f9b7e6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 9 Apr 2018 16:43:22 -0500 Subject: [PATCH 075/916] Changed the structure of ScopedFunction --- loopy/symbolic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index d13f1f558..8c0424a08 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -675,9 +675,14 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") -class ScopedFunction(p.Variable): +class ScopedFunction(p.Expression): """ Connects a call to a callable available in a kernel. """ + + def __init__(self, function): + from loopy.library.reduction import ArgExtOp + assert isinstance(function, (p.Variable, ArgExtOp)) + mapper_method = intern("map_scoped_function") def stringifier(self): -- GitLab From 872bc4df9084a1df738b2b4ed85b01fe9bb2325b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 9 Apr 2018 16:45:33 -0500 Subject: [PATCH 076/916] Reverted ScopedFunction back to its earlier stage for some other debugging. 
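With this revert, ``ScopedFunction`` is again a plain
``pymbolic.primitives.Variable`` subclass, so a scoped call keeps the shape it
had before PATCH 075. A minimal sketch of what scoping produces under that
assumption (the expression values are made up):

    from pymbolic.primitives import Call, Variable
    from loopy.symbolic import ScopedFunction

    # what FunctionScoper turns a known call such as sin(x) into
    scoped_call = Call(ScopedFunction("sin"), (Variable("x"),))
    assert scoped_call.function.name == "sin"

The wrapped-node variant of PATCH 075, which stores the original ``Variable``
or ``ArgExtOp`` node under ``expr.function.function``, is reintroduced in
PATCH 079 below.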
--- loopy/symbolic.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8c0424a08..d13f1f558 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -675,14 +675,9 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") -class ScopedFunction(p.Expression): +class ScopedFunction(p.Variable): """ Connects a call to a callable available in a kernel. """ - - def __init__(self, function): - from loopy.library.reduction import ArgExtOp - assert isinstance(function, (p.Variable, ArgExtOp)) - mapper_method = intern("map_scoped_function") def stringifier(self): -- GitLab From b617a7acfdbd79e3a153426f917093672c4b59e7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 9 Apr 2018 19:39:53 -0500 Subject: [PATCH 077/916] Implemented domain changes using loopy.kernel.tools.DomainChanger --- loopy/kernel/creation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e6813aa4a..1323ad458 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2092,9 +2092,9 @@ def realize_slices_as_sub_array_refs(kernel): slice_iname_domains = slice_replacer.get_iname_domain_as_isl_set() if slice_iname_domains: - d1, d2 = isl.align_two(kernel.domains[0], slice_iname_domains) - return kernel.copy(domains=[d1 & d2], - instructions=new_insns) + from loopy.kernel.tools import DomainChanger + domch = DomainChanger(kernel.copy(instructions=new_insns), frozenset()) + return domch.get_kernel_with(slice_iname_domains) else: return kernel.copy(instructions=new_insns) -- GitLab From f7729e3e095608feee7aa6d7ab5fb34e83c8d8e1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 9 Apr 2018 19:42:08 -0500 Subject: [PATCH 078/916] Callable kernel does not have name attribute any more. --- loopy/kernel/function_interface.py | 7 +++---- loopy/transform/register_knl.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index a70ea2af6..b7e9023d7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -433,12 +433,12 @@ class CallableKernel(InKernelCallable): caller and the callee kernel. """ - fields = set(["name", "subkernel", "arg_id_to_dtype", "arg_id_to_descr", + fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) - init_arg_names = ("name", "subkernel", "arg_id_to_dtype", "arg_id_to_descr", + init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") - def __init__(self, name, subkernel, arg_id_to_dtype=None, + def __init__(self, subkernel, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): super(InKernelCallable, self).__init__( @@ -447,7 +447,6 @@ class CallableKernel(InKernelCallable): if name_in_target is not None: subkernel = subkernel.copy(name=name_in_target) - self.name = name self.name_in_target = name_in_target self.subkernel = subkernel diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 49b19fd89..20e3817f9 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -66,7 +66,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # making the target of the child kernel to be same as the target of parent # kernel. 
- updated_scoped_functions[function_name] = CallableKernel(name=function_name, + updated_scoped_functions[function_name] = CallableKernel( subkernel=callee_kernel.copy(target=caller_kernel.target)) # returning the parent kernel with the new scoped function dictionary -- GitLab From 7075aefe58a21d90b882978c52c540726b1421fd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 10 Apr 2018 18:53:24 -0500 Subject: [PATCH 079/916] Changed the structure of ScopedFunction --- loopy/check.py | 7 ++-- loopy/kernel/creation.py | 2 +- loopy/kernel/function_interface.py | 47 ++------------------------- loopy/symbolic.py | 52 ++++++++++++++++++++++-------- 4 files changed, 45 insertions(+), 63 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 6afeb86ac..e7d1a0580 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -68,6 +68,8 @@ class UnScopedCallCollector(CombineMapper): def map_call(self, expr): if not isinstance(expr.function, ScopedFunction): + print(expr) + print(type(expr.function)) return (frozenset([expr.function.name]) | self.combine((self.rec(child) for child in expr.parameters))) else: @@ -82,9 +84,6 @@ class UnScopedCallCollector(CombineMapper): return self.combine((self.rec(child) for child in expr.parameters+expr.kw_parameters.values())) - def map_scoped_function(self, expr): - return frozenset([expr.name]) - def map_constant(self, expr): return frozenset() @@ -99,7 +98,7 @@ def check_functions_are_scoped(kernel): otherwise indicate to what all calls we await signature. """ - from loopy.symbolic import SubstitutionRuleExpander + from loopy.symbolic import SubstitutionRuleExpander, IdentityMapper subst_expander = SubstitutionRuleExpander(kernel.substitutions) for insn in kernel.instructions: diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1323ad458..5b5ea07c9 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1864,7 +1864,7 @@ class FunctionScoper(RuleAwareIdentityMapper): expr.function.name) return Call( - ScopedFunction(expr.function.name), + ScopedFunction(expr.function), tuple(self.rec(child, expn_state) for child in expr.parameters)) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index b7e9023d7..ac2554e4f 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -619,10 +619,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): self.subst_expander = subst_expander def map_call(self, expr, expn_state): - if not isinstance(expr.function, Variable): - return IdentityMapper.map_call(self, expr, expn_state) - - name, tag = parse_tagged_name(expr.function) + name, tag = parse_tagged_name(expr.function.function) if name not in self.rule_mapping_context.old_subst_rules: expanded_expr = self.subst_expander(expr) @@ -641,47 +638,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): else: return self.map_substitution(name, tag, expr.parameters, expn_state) - def map_call_with_kwargs(self, expr, expn_state): - expanded_expr = self.subst_expander(expr) - if expr in self.expr_to_new_names: - return type(expr)( - ScopedFunction(self.expr_to_new_names[expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - elif expanded_expr in self.expr_to_new_names: - return type(expr)( - ScopedFunction(self.expr_to_new_names[expanded_expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, 
expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - else: - return IdentityMapper.map_call_with_kwargs(self, expr, expn_state) - - def map_reduction(self, expr, expn_state): - from loopy.symbolic import Reduction - expanded_expr = self.subst_expander(expr) - - if expr in self.expr_to_new_names: - return Reduction( - ScopedFunction(self.expr_to_new_names[expr]), - tuple(expr.inames), - self.rec(expr.expr, expn_state), - allow_simultaneous=expr.allow_simultaneous) - elif expanded_expr in self.expr_to_new_names: - return Reduction( - ScopedFunction(self.expr_to_new_names[expanded_expr]), - tuple(expr.inames), - self.rec(expr.expr, expn_state), - allow_simultaneous=expr.allow_simultaneous) - else: - return IdentityMapper.map_reduction(self, expr, expn_state) + # TODO: Add a method map_call_with_kwargs def register_pymbolic_calls_to_knl_callables(kernel, diff --git a/loopy/symbolic.py b/loopy/symbolic.py index d13f1f558..4aa9d2790 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -111,14 +111,18 @@ class IdentityMapperMixin(object): return SubArrayRef(self.rec(expr.swept_inames, *args), self.rec(expr.subscript, *args)) + def map_scoped_function(self, expr, *args): + if isinstance(expr.function, p.Variable): + return ScopedFunction(self.rec(expr.function, *args)) + else: + return ScopedFunction(expr.function, *args) + map_type_cast = map_type_annotation map_linear_subscript = IdentityMapperBase.map_subscript map_rule_argument = map_group_hw_index - map_scoped_function = IdentityMapperBase.map_variable - class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): pass @@ -132,8 +136,6 @@ class PartialEvaluationMapper( def map_common_subexpression_uncached(self, expr): return type(expr)(self.rec(expr.child), expr.prefix, expr.scope) - map_scoped_function = map_variable - class WalkMapper(WalkMapperBase): def map_literal(self, expr, *args): @@ -172,8 +174,6 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index - map_scoped_function = WalkMapperBase.map_variable - def map_sub_array_ref(self, expr, *args): if not self.visit(expr): return @@ -181,6 +181,13 @@ class WalkMapper(WalkMapperBase): self.rec(expr.swept_inames, *args) self.rec(expr.subscript, *args) + def map_scoped_function(self, expr, *args): + if not self.visit(expr): + return + + if isinstance(expr.function, p.Variable): + self.rec(expr.function, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant @@ -193,9 +200,10 @@ class CombineMapper(CombineMapperBase): def map_sub_array_ref(self, expr): return self.rec(expr.get_begin_subscript()) - map_linear_subscript = CombineMapperBase.map_subscript + def map_scoped_function(self, expr): + return self.rec(expr.funciton) - map_scoped_function = CombineMapperBase.map_variable + map_linear_subscript = CombineMapperBase.map_subscript class SubstitutionMapper( @@ -254,7 +262,7 @@ class StringifyMapper(StringifyMapperBase): return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) def map_scoped_function(self, expr, prec): - return "ScopedFunction('%s')" % expr.name + return "ScopedFunction('%s')" % self.rec(expr.function, prec) def map_sub_array_ref(self, expr, prec): return "SubArrayRef({inames}, ({subscr}))".format( @@ -359,8 +367,6 @@ class SubstitutionRuleExpander(IdentityMapper): return self.rec(expr) - map_scoped_function = map_variable - # }}} @@ -675,14 +681,34 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") -class 
ScopedFunction(p.Variable): +class ScopedFunction(p.Expression): """ Connects a call to a callable available in a kernel. + + .. attribute:: function + + An instance of :class:`pymbolic.primitives.Variable` or + `loopy.library.reduction.ArgExtOp`. """ - mapper_method = intern("map_scoped_function") + init_arg_names = ("function", ) + + def __init__(self, function): + if isinstance(function, str): + function = p.Variable(function) + assert isinstance(function, p.Variable) + self.function = function + + @property + def name(self): + return self.function.name def stringifier(self): return StringifyMapper + def __getinitargs__(self): + return self.function, + + mapper_method = intern("map_scoped_function") + class EvaluatorWithDeficientContext(PartialEvaluationMapper): """Evaluation Mapper that does not need values of all the variables -- GitLab From 36c8473bf1805cb363dded936d8fab2ed06ccb48 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 10 Apr 2018 23:52:35 -0500 Subject: [PATCH 080/916] ArgExtOp working after some gymnastics --- loopy/check.py | 11 ++-- loopy/codegen/__init__.py | 3 + loopy/kernel/data.py | 8 --- loopy/kernel/function_interface.py | 4 ++ loopy/preprocess.py | 22 +++++-- loopy/symbolic.py | 5 +- loopy/target/c/__init__.py | 96 +++++++++++++++++++++++++++--- 7 files changed, 120 insertions(+), 29 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index e7d1a0580..10f828ed1 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -60,16 +60,15 @@ def check_identifiers_in_subst_rules(knl): % (knl.name, ", ".join(deps-rule_allowed_identifiers))) -class UnScopedCallCollector(CombineMapper): +class UnscopedCallCollector(CombineMapper): def combine(self, values): import operator return reduce(operator.or_, values, frozenset()) def map_call(self, expr): - if not isinstance(expr.function, ScopedFunction): - print(expr) - print(type(expr.function)) + from loopy.library.reduction import ArgExtOp + if not isinstance(expr.function, (ScopedFunction, ArgExtOp)): return (frozenset([expr.function.name]) | self.combine((self.rec(child) for child in expr.parameters))) else: @@ -98,12 +97,12 @@ def check_functions_are_scoped(kernel): otherwise indicate to what all calls we await signature. 
""" - from loopy.symbolic import SubstitutionRuleExpander, IdentityMapper + from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): - unscoped_calls = UnScopedCallCollector()(subst_expander( + unscoped_calls = UnscopedCallCollector()(subst_expander( insn.expression)) if unscoped_calls: raise LoopyError("Unknown function '%s' obtained -- register a " diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 2e217b779..735c16d15 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -501,6 +501,9 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): for insn in kernel.instructions: if isinstance(insn, CallInstruction): + from loopy.library.reduction import ArgExtOp + if isinstance(insn.expression.function, ArgExtOp): + continue in_knl_callable = kernel.scoped_functions[insn.expression.function.name] from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel): diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 59297e475..c90e8a64b 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -607,13 +607,6 @@ class SubstitutionRule(ImmutableRecord): # {{{ function call mangling class CallMangleInfo(ImmutableRecord): - def __init__(self): - raise NotImplementedError("New Mangler interface expected") - - -# FIXME: Uncomment it once everything is done. -# KK: Removed it for the duration the new mangler interface starts working. -''' """ .. attribute:: target_name @@ -638,7 +631,6 @@ class CallMangleInfo(ImmutableRecord): target_name=target_name, result_dtypes=result_dtypes, arg_dtypes=arg_dtypes) -''' # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index ac2554e4f..3812400b5 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -619,6 +619,10 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): self.subst_expander = subst_expander def map_call(self, expr, expn_state): + from loopy.library.reduction import ArgExtOp + if isinstance(expr.function, ArgExtOp): + return IdentityMapper.map_call(self, expr, expn_state) + name, tag = parse_tagged_name(expr.function.function) if name not in self.rule_mapping_context.old_subst_rules: diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 49103931f..1064f0f93 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -38,7 +38,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import ScopedFunction, CombineMapper +from loopy.symbolic import CombineMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -1942,9 +1942,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # TODO: remove unused inames... 
- # kernel = ( - # _hackily_ensure_multi_assignment_return_values_are_scoped_private( - # kernel)) + kernel = ( + _hackily_ensure_multi_assignment_return_values_are_scoped_private( + kernel)) return kernel @@ -2150,8 +2150,11 @@ class ArgDescrInferenceMapper(CombineMapper): def map_call(self, expr, **kwargs): from loopy.kernel.function_interface import ValueArgDescriptor from loopy.symbolic import SubArrayRef - if not isinstance(expr.function, ScopedFunction): - return CombineMapper.map_call(self, expr, **kwargs) + from loopy.library.reduction import ArgExtOp + + if isinstance(expr.function, ArgExtOp): + # Special treatment to ArgExtOp + return self.combine((self.rec(child) for child in expr.parameters)) # descriptors for the args arg_id_to_descr = dict((i, @@ -2291,6 +2294,13 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): return all(values) def map_call(self, expr, *args, **kwargs): + from loopy.library.reduction import ArgExtOp + if isinstance(expr.function, ArgExtOp): + return self.combine( + tuple( + self.rec(child, *args, **kwargs) for child in + expr.parameters)) + is_ready_for_codegen = self.kernel.scoped_functions[ expr.function.name].is_ready_for_codegen() return self.combine( diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 4aa9d2790..0a27d1044 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -115,7 +115,7 @@ class IdentityMapperMixin(object): if isinstance(expr.function, p.Variable): return ScopedFunction(self.rec(expr.function, *args)) else: - return ScopedFunction(expr.function, *args) + return ScopedFunction(expr.function) map_type_cast = map_type_annotation @@ -694,7 +694,8 @@ class ScopedFunction(p.Expression): def __init__(self, function): if isinstance(function, str): function = p.Variable(function) - assert isinstance(function, p.Variable) + from loopy.library.reduction import ArgExtOp + assert isinstance(function, (p.Variable, ArgExtOp)) self.function = function @property diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index b9690b511..0438c4158 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -934,10 +934,86 @@ class CASTBuilder(ASTBuilderBase): lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic updates in %s" % type(self).__name__) - # FIXME: With the new mangler interface this should not be present, - # Commenting this part so that this does not get used anywhere in the - # meantime - ''' + def emit_code_specially_for_the_special_arg_extop(self, codegen_state, + insn): + + ecm = codegen_state.expression_to_code_mapper + + from pymbolic.primitives import Variable + from pymbolic.mapper.stringifier import PREC_NONE + + func_id = insn.expression.function + parameters = insn.expression.parameters + + if isinstance(func_id, Variable): + func_id = func_id.name + + assignee_var_descriptors = [ + codegen_state.kernel.get_var_descriptor(a) + for a in insn.assignee_var_names()] + + par_dtypes = tuple(ecm.infer_type(par) for par in parameters) + + mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) + if mangle_result is None: + raise RuntimeError("function '%s' unknown--" + "maybe you need to register a function mangler?" + % func_id) + + assert mangle_result.arg_dtypes is not None + + if mangle_result.target_name == "loopy_make_tuple": + # This shorcut avoids actually having to emit a 'make_tuple' function. 
+ return self.emit_tuple_assignment(codegen_state, insn) + + from loopy.expression import dtype_to_type_context + c_parameters = [ + ecm(par, PREC_NONE, + dtype_to_type_context(self.target, tgt_dtype), + tgt_dtype).expr + for par, par_dtype, tgt_dtype in zip( + parameters, par_dtypes, mangle_result.arg_dtypes)] + + from loopy.codegen import SeenFunction + codegen_state.seen_functions.add( + SeenFunction(func_id, + mangle_result.target_name, + mangle_result.arg_dtypes)) + + from pymbolic import var + for i, (a, tgt_dtype) in enumerate( + zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): + if tgt_dtype != ecm.infer_type(a): + raise LoopyError("type mismatch in %d'th (1-based) left-hand " + "side of instruction '%s'" % (i+1, insn.id)) + c_parameters.append( + # TODO Yuck: The "where-at function": &(...) + var("&")( + ecm(a, PREC_NONE, + dtype_to_type_context(self.target, tgt_dtype), + tgt_dtype).expr)) + + from pymbolic import var + result = var(mangle_result.target_name)(*c_parameters) + + # In case of no assignees, we are done + if len(mangle_result.result_dtypes) == 0: + from cgen import ExpressionStatement + return ExpressionStatement( + CExpression(self.get_c_expression_to_code_mapper(), result)) + + result = ecm.wrap_in_typecast( + mangle_result.result_dtypes[0], + assignee_var_descriptors[0].dtype, + result) + + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + + from cgen import Assign + return Assign( + lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), result)) + def emit_tuple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper @@ -960,14 +1036,20 @@ class CASTBuilder(ASTBuilderBase): assignments.append(Assign(lhs_code, rhs_code)) return block_if_necessary(assignments) - ''' def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper + from loopy.library.reduction import ArgExtOp + if isinstance(insn.expression.function, ArgExtOp): + return self.emit_code_specially_for_the_special_arg_extop(codegen_state, + insn) + ecm = codegen_state.expression_to_code_mapper func_id = insn.expression.function.name - in_knl_callable = codegen_state.kernel.scoped_functions[func_id] + + if in_knl_callable.name == 'make_tuple': + return self.emit_tuple_assignment(codegen_state, insn) + in_knl_callable_as_call = in_knl_callable.emit_call_insn( insn=insn, target=self.target, -- GitLab From de8d4df1e7c351d2de0a537062b212102bfd7d73 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 11 Apr 2018 01:16:51 -0500 Subject: [PATCH 081/916] Some more adjustments --- loopy/preprocess.py | 7 ++++++- loopy/target/c/__init__.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1064f0f93..a48dd421a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -754,7 +754,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): # }}} - from loopy.kernel.instruction import CallInstruction + from loopy.kernel.instruction import CallInstruction, is_array_call for insn in kernel.instructions: if not isinstance(insn, CallInstruction): continue @@ -762,6 +762,9 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): if len(insn.assignees) <= 1: continue + if is_array_call(insn.assignees, insn.expression): + continue + assignees = insn.assignees assignee_var_names = insn.assignee_var_names() @@ -1687,6 +1690,8 @@ def realize_reduction(kernel, insn_id_filter=None, 
unknown_types_ok=True, arg_dtypes, reduction_dtypes = ( infer_arg_and_reduction_dtypes_for_reduction_expression( temp_kernel, expr, unknown_types_ok)) + print(type(expr)) + print(rec) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 0438c4158..aa2e89ab8 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1047,7 +1047,7 @@ class CASTBuilder(ASTBuilderBase): func_id = insn.expression.function.name in_knl_callable = codegen_state.kernel.scoped_functions[func_id] - if in_knl_callable.name == 'make_tuple': + if in_knl_callable.name_in_target == 'loopy_make_tuple': return self.emit_tuple_assignment(codegen_state, insn) in_knl_callable_as_call = in_knl_callable.emit_call_insn( -- GitLab From f23f1a63eb3682afdfe1a84bdae66a23a4312479 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 11 Apr 2018 01:35:19 -0500 Subject: [PATCH 082/916] Everything is working. --- loopy/kernel/creation.py | 2 +- loopy/kernel/function_interface.py | 2 +- loopy/preprocess.py | 2 -- loopy/symbolic.py | 37 +++++------------------------- 4 files changed, 8 insertions(+), 35 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 5b5ea07c9..1323ad458 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1864,7 +1864,7 @@ class FunctionScoper(RuleAwareIdentityMapper): expr.function.name) return Call( - ScopedFunction(expr.function), + ScopedFunction(expr.function.name), tuple(self.rec(child, expn_state) for child in expr.parameters)) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 3812400b5..6004de9ee 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -623,7 +623,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): if isinstance(expr.function, ArgExtOp): return IdentityMapper.map_call(self, expr, expn_state) - name, tag = parse_tagged_name(expr.function.function) + name, tag = parse_tagged_name(expr.function) if name not in self.rule_mapping_context.old_subst_rules: expanded_expr = self.subst_expander(expr) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index a48dd421a..c581fa2ad 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1690,8 +1690,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, arg_dtypes, reduction_dtypes = ( infer_arg_and_reduction_dtypes_for_reduction_expression( temp_kernel, expr, unknown_types_ok)) - print(type(expr)) - print(rec) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 0a27d1044..7ce713004 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -111,11 +111,7 @@ class IdentityMapperMixin(object): return SubArrayRef(self.rec(expr.swept_inames, *args), self.rec(expr.subscript, *args)) - def map_scoped_function(self, expr, *args): - if isinstance(expr.function, p.Variable): - return ScopedFunction(self.rec(expr.function, *args)) - else: - return ScopedFunction(expr.function) + map_scoped_function = IdentityMapperBase.map_variable map_type_cast = map_type_annotation @@ -181,12 +177,7 @@ class WalkMapper(WalkMapperBase): self.rec(expr.swept_inames, *args) self.rec(expr.subscript, *args) - def map_scoped_function(self, expr, *args): - if not self.visit(expr): - return - - if isinstance(expr.function, p.Variable): - self.rec(expr.function, 
*args) + map_scoped_function = WalkMapperBase.map_variable class CallbackMapper(CallbackMapperBase, IdentityMapper): @@ -200,8 +191,7 @@ class CombineMapper(CombineMapperBase): def map_sub_array_ref(self, expr): return self.rec(expr.get_begin_subscript()) - def map_scoped_function(self, expr): - return self.rec(expr.funciton) + map_scoped_function = CombineMapperBase.map_variable map_linear_subscript = CombineMapperBase.map_subscript @@ -262,7 +252,7 @@ class StringifyMapper(StringifyMapperBase): return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) def map_scoped_function(self, expr, prec): - return "ScopedFunction('%s')" % self.rec(expr.function, prec) + return "ScopedFunction('%s')" % expr.name def map_sub_array_ref(self, expr, prec): return "SubArrayRef({inames}, ({subscr}))".format( @@ -681,33 +671,18 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") -class ScopedFunction(p.Expression): +class ScopedFunction(p.Variable): """ Connects a call to a callable available in a kernel. - .. attribute:: function + .. attribute:: name An instance of :class:`pymbolic.primitives.Variable` or `loopy.library.reduction.ArgExtOp`. """ - init_arg_names = ("function", ) - - def __init__(self, function): - if isinstance(function, str): - function = p.Variable(function) - from loopy.library.reduction import ArgExtOp - assert isinstance(function, (p.Variable, ArgExtOp)) - self.function = function - - @property - def name(self): - return self.function.name def stringifier(self): return StringifyMapper - def __getinitargs__(self): - return self.function, - mapper_method = intern("map_scoped_function") -- GitLab From 453133f23e1cb68e16e6c547626b226caf485472 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 11 Apr 2018 01:48:55 -0500 Subject: [PATCH 083/916] Changed the name of the arg_ext_op emitter --- loopy/target/c/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index aa2e89ab8..3dcc846c7 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -934,7 +934,7 @@ class CASTBuilder(ASTBuilderBase): lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic updates in %s" % type(self).__name__) - def emit_code_specially_for_the_special_arg_extop(self, codegen_state, + def emit_arg_extop(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper @@ -1040,7 +1040,7 @@ class CASTBuilder(ASTBuilderBase): def emit_multiple_assignment(self, codegen_state, insn): from loopy.library.reduction import ArgExtOp if isinstance(insn.expression.function, ArgExtOp): - return self.emit_code_specially_for_the_special_arg_extop(codegen_state, + return self.emit_arg_extop(codegen_state, insn) ecm = codegen_state.expression_to_code_mapper -- GitLab From f541d313302a657a7490b37aca3fe4c95ac371bb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 11 Apr 2018 03:03:11 -0500 Subject: [PATCH 084/916] Added tests for slices and multi arg array calls. 
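The first of the new tests below exercises a slice with a negative step on the assignee. Read in plain numpy terms (two dimensions instead of the test's five, and assuming, as the test's assertion suggests, that z[i, 15:-1:-1, ...] walks axis 1 of z from 15 down to 0):

    import numpy as np

    x = np.random.rand(16, 16)
    y = np.random.rand(16, 16)

    z = np.empty_like(x)
    z[:, ::-1] = 2*x + 3*y          # per row, axis 1 of z is filled in reverse
    assert np.allclose(z, (2*x + 3*y)[:, ::-1])

Note that numpy itself would treat 15:-1:-1 on a length-16 axis as empty, so the numpy spelling of the reversed slice is [::-1], matching the out[:, ::-1, :, :, :] used in the test's assertion.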
--- test/test_transform.py | 50 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/test/test_transform.py b/test/test_transform.py index ea7237633..c18369e1e 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -230,7 +230,7 @@ def test_register_knl(ctx_factory): np.linalg.norm(2*x+3*y))) < 1e-15 -def test_slices(ctx_factory): +def test_slices_with_negative_step(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) n = 2 ** 4 @@ -247,7 +247,8 @@ def test_slices(ctx_factory): parent_knl = lp.make_kernel( "{[i, k, m]: 0<=i, k, m<16}", """ - z[i, :, k, :, m] = linear_combo(x[i, :, k, :, m], y[i, :, k, :, m]) + z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + y[i, :, k, :, m]) """, kernel_data=[ lp.GlobalArg( @@ -269,10 +270,53 @@ def test_slices(ctx_factory): evt, (out, ) = knl(queue, x=x, y=y) - assert (np.linalg.norm(2*x+3*y-out)/( + assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( np.linalg.norm(2*x+3*y))) < 1e-15 +def test_multi_arg_array_call(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + import pymbolic.primitives as p + n = 10 + acc_i = p.Variable("acc_i") + i = p.Variable("i") + index = p.Variable("index") + a_i = p.Subscript(p.Variable("a"), p.Variable("i")) + argmin_kernel = lp.make_kernel( + "{[i]: 0 <= i < n}", + [ + lp.Assignment(id="init2", assignee=index, + expression=0), + lp.Assignment(id="init1", assignee=acc_i, + expression="214748367"), + lp.Assignment(id="insn", assignee=index, + expression=p.If(p.Expression.eq(acc_i, a_i), i, index), + depends_on="update"), + lp.Assignment(id="update", assignee=acc_i, + expression=p.Variable("min")(acc_i, a_i), + depends_on="init1,init2")]) + + argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) + + knl = lp.make_kernel( + "{[i]:0<=i Date: Wed, 11 Apr 2018 04:12:42 -0500 Subject: [PATCH 085/916] Added comments for make_slab --- loopy/isl_helpers.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index f0c37933a..847eb0d97 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -63,6 +63,26 @@ def dump_space(ls): # {{{ make_slab def make_slab(space, iname, start, stop, step=1): + """ + Returns an instance of :class:`islpy._isl.BasicSet`, which satisfies the + constraint ``start <= step*iname < stop``. + + :arg space: An instance of :class:`islpy._isl.Space`. + + :arg iname: + Either an instance of :class:`str` as a name of the ``iname`` or a + tuple of ``(iname_dt, iname_dx)`` indicating the *iname* in the space. + + :arg start: + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the lower bound of + ``step*iname``(inclusive). + + :arg stop: + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the upper bound of + ``step*iname``. 
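To make the behaviour documented just above concrete, a hedged usage sketch (islpy only; the exact printed form of the set may differ):

    import islpy as isl
    from loopy.isl_helpers import make_slab

    space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, set=["i"])
    slab = make_slab(space, "i", 0, 16)

    # slab contains exactly the integer points with 0 <= i < 16,
    # i.e. something like { [i] : 0 <= i <= 15 }
    print(slab)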
+ """ zero = isl.Aff.zero_on_domain(space) if isinstance(start, (isl.Aff, isl.PwAff)): @@ -93,21 +113,22 @@ def make_slab(space, iname, start, stop, step=1): if step > 0: result = (isl.BasicSet.universe(space) - # start <= iname + # start <= step*iname .add_constraint(isl.Constraint.inequality_from_aff( step*iname_aff - start)) - # iname < stop + # step*iname < stop .add_constraint(isl.Constraint.inequality_from_aff( stop-1 - step*iname_aff))) elif step < 0: result = (isl.BasicSet.universe(space) - # start <= iname + # start >= (-step)*iname .add_constraint(isl.Constraint.inequality_from_aff( step*iname_aff + start)) - # iname < stop + # (-step)*iname > stop .add_constraint(isl.Constraint.inequality_from_aff( -stop-1 - step*iname_aff))) else: + # step = 0 raise LoopyError("0 step not allowed in make_slab.") return result -- GitLab From 12d2d6f3589a466b24b2a8a03d09f8977bd8597e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 11 Apr 2018 23:14:58 -0500 Subject: [PATCH 086/916] Able to handle argmin --- loopy/check.py | 13 +++- loopy/codegen/__init__.py | 54 +++++++++++++++-- loopy/kernel/creation.py | 21 ++++--- loopy/kernel/data.py | 2 + loopy/kernel/function_interface.py | 88 +++++++++++++++++++++------- loopy/library/reduction.py | 9 ++- loopy/preprocess.py | 9 +-- loopy/symbolic.py | 36 +++++++++--- loopy/target/c/__init__.py | 86 +-------------------------- loopy/target/c/codegen/expression.py | 4 +- loopy/type_inference.py | 2 +- 11 files changed, 185 insertions(+), 139 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 10f828ed1..95da2d531 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -61,6 +61,17 @@ def check_identifiers_in_subst_rules(knl): class UnscopedCallCollector(CombineMapper): + """ + Collects all the unscoped calls within a kernel. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline. + """ def combine(self, values): import operator @@ -94,7 +105,7 @@ class UnscopedCallCollector(CombineMapper): def check_functions_are_scoped(kernel): """ Checks if all the calls in the instruction expression have been scoped, - otherwise indicate to what all calls we await signature. + otherwise indicates to what all calls we await signature. """ from loopy.symbolic import SubstitutionRuleExpander diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 735c16d15..d308d288e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -33,10 +33,13 @@ from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION from cgen import Collection +from loopy.symbolic import CombineMapper from loopy.kernel.instruction import ( Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, - CInstruction, _DataObliviousInstruction) + CInstruction, _DataObliviousInstruction, MultiAssignmentBase) + +from functools import reduce import logging @@ -259,6 +262,8 @@ class CodeGenerationState(object): schedule_index_end = self.schedule_index_end if is_generating_master_kernel is None: + # By default assumes that code is being generated for a master + # kernel. is_generating_master_kernel = self.is_generating_master_kernel return CodeGenerationState( @@ -382,6 +387,30 @@ code_gen_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) +class InKernelCallablesCollector(CombineMapper): + """ + Yields the preambles from all the scoped functions in the kernel. 
+ """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_scoped_function(self, expr): + return frozenset([self.kernel.scoped_functions[ + expr.function]]) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + class PreambleInfo(ImmutableRecord): """ .. attribute:: kernel @@ -396,6 +425,9 @@ class PreambleInfo(ImmutableRecord): def generate_code_v2(kernel, is_generating_master_kernel=True): """ + :arg is_generating_master_kernel: An instance of :class:`bool`. *True* if + the code is being generated for a master kernel, otherwise *False*. + :returns: a :class:`CodeGenerationResult` """ @@ -501,10 +533,8 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): for insn in kernel.instructions: if isinstance(insn, CallInstruction): - from loopy.library.reduction import ArgExtOp - if isinstance(insn.expression.function, ArgExtOp): - continue - in_knl_callable = kernel.scoped_functions[insn.expression.function.name] + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.function] from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel): auxiliary_dev_prog = generate_code_v2( @@ -523,6 +553,8 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): codegen_state, schedule_index=0) + # Modifying the first device program to add the auxiliary kernels + # as functions. new_dev_prog = codegen_result.device_programs[0] for auxiliary_dev_prog in auxiliary_dev_progs: new_dev_prog = new_dev_prog.copy( @@ -561,6 +593,18 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) + in_knl_callable_collector = InKernelCallablesCollector(kernel) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + for in_knl_callable in in_knl_callable_collector(insn.expression): + preambles.extend(in_knl_callable.generate_preambles(kernel.target)) + + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unkown instruction %s" % type(insn)) + codegen_result = codegen_result.copy(device_preambles=preambles) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1323ad458..ca64a3157 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1860,7 +1860,7 @@ class FunctionScoper(RuleAwareIdentityMapper): from loopy.kernel.function_interface import ScalarCallable # Associating the newly created ScopedFunction with a `CallableScalar` - self.scoped_functions[expr.function.name] = ScalarCallable( + self.scoped_functions[expr.function] = ScalarCallable( expr.function.name) return Call( @@ -1879,7 +1879,7 @@ class FunctionScoper(RuleAwareIdentityMapper): from loopy.kernel.function_interface import ScalarCallable # Associating the newly created ScopedFunction with a `CallableScalar` - self.scoped_functions[expr.function.name] = ScalarCallable( + self.scoped_functions[expr.function.function] = ScalarCallable( expr.function.name) return CallWithKwargs( ScopedFunction(expr.function.name), @@ -1899,17 +1899,22 @@ class FunctionScoper(RuleAwareIdentityMapper): from loopy.library.reduction import (MaxReductionOperation, MinReductionOperation, ArgMinReductionOperation, ArgMaxReductionOperation) + from 
pymbolic import var + from loopy.library.reduction import ArgExtOp if isinstance(expr.operation, MaxReductionOperation): - self.scoped_functions["max"] = ScalarCallable("max") + self.scoped_functions[var("max")] = ScalarCallable("max") elif isinstance(expr.operation, MinReductionOperation): - self.scoped_functions["min"] = ScalarCallable("min") + self.scoped_functions[var("min")] = ScalarCallable("min") elif isinstance(expr.operation, ArgMaxReductionOperation): - self.scoped_functions["max"] = ScalarCallable("max") - self.scoped_functions["make_tuple"] = ScalarCallable("make_tuple") + self.scoped_functions[var("max")] = ScalarCallable("max") + self.scoped_functions[var("make_tuple")] = ScalarCallable("make_tuple") + elif isinstance(expr.operation, ArgMinReductionOperation): - self.scoped_functions["min"] = ScalarCallable("min") - self.scoped_functions["make_tuple"] = ScalarCallable("make_tuple") + self.scoped_functions[var("min")] = ScalarCallable("min") + self.scoped_functions[var("make_tuple")] = ScalarCallable("make_tuple") + self.scoped_functions[ArgExtOp(expr.operation)] = ScalarCallable( + expr.operation) return super(FunctionScoper, self).map_reduction(expr, expn_state) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index c90e8a64b..f60e1ddb1 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -625,6 +625,8 @@ class CallMangleInfo(ImmutableRecord): """ def __init__(self, target_name, result_dtypes, arg_dtypes): + # added for debugging + raise NotImplementedError("Please use the new interface! :-)") assert isinstance(result_dtypes, tuple) super(CallMangleInfo, self).__init__( diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 6004de9ee..001f23808 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -24,7 +24,6 @@ THE SOFTWARE. import re -import six from six.moves import zip @@ -34,6 +33,8 @@ from loopy.diagnostic import LoopyError from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name +from loopy.library.reduction import ArgExtOp +from loopy.library.reduction import _ArgExtremumReductionOperation from loopy.symbolic import (IdentityMapper, ScopedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, @@ -315,6 +316,19 @@ class ScalarCallable(InKernelCallable): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target="loopy_make_tuple") + elif isinstance(self.name, _ArgExtremumReductionOperation): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = result_dtypes[0] + new_arg_id_to_dtype[-2] = result_dtypes[1] + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_arg%s_%s_%s_op" % (self.name.which, + scalar_dtype.numpy_dtype.type.__name__, + index_dtype.numpy_dtype.type.__name__)) + else: # did not find a scalar function and function prototype does not # even have subkernel registered => no match found @@ -398,7 +412,7 @@ class ScalarCallable(InKernelCallable): for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)): if tgt_dtype != expression_to_code_mapper.infer_type(a): - raise LoopyError("Type Mismatch in funciton %s. Expected: %s" + raise LoopyError("Type Mismatch in function %s. 
Expected: %s" "Got: %s" % (self.name, tgt_dtype, expression_to_code_mapper.infer_type(a))) c_parameters.append( @@ -410,6 +424,40 @@ class ScalarCallable(InKernelCallable): from pymbolic import var return var(self.name_in_target)(*c_parameters) + def generate_preambles(self, target): + if isinstance(self.name, _ArgExtremumReductionOperation): + op = self.name + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, index_dtype) + + yield (prefix, """ + inline void %(prefix)s_op( + %(scalar_t)s op1, %(index_t)s index1, + %(scalar_t)s op2, %(index_t)s index2, + %(scalar_t)s *op, %(index_t)s *index_out) + { + if (op2 %(comp)s op1) + { + *index_out = index2; + *op = op2; + } + else + { + *index_out = index1; + *op = op1; + } + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + + return + # }}} # }}} @@ -537,7 +585,6 @@ class CallableKernel(InKernelCallable): def generate_preambles(self, target): """ This would generate the target specific preamble. """ - # TODO: Transfer the preamble of the subkernel over here raise NotImplementedError() def emit_call_insn(self, insn, target, expression_to_code_mapper): @@ -591,19 +638,21 @@ class CallableKernel(InKernelCallable): # {{{ new pymbolic calls to scoped functions -def next_indexed_name(name): +def next_indexed_variable(function): + if isinstance(function, ArgExtOp): + return function.copy() func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") - match = func_name.match(name) + match = func_name.match(function.name) if match is None: - if name[-1] == '_': - return "{old_name}0".format(old_name=name) + if function.name[-1] == '_': + return Variable("{old_name}0".format(old_name=function.name)) else: - return "{old_name}_0".format(old_name=name) + return Variable("{old_name}_0".format(old_name=function.name)) - return "{alpha}_{num}".format(alpha=match.group('alpha'), - num=int(match.group('num'))+1) + return Variable("{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1)) class ScopedFunctionNameChanger(RuleAwareIdentityMapper): @@ -619,11 +668,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): self.subst_expander = subst_expander def map_call(self, expr, expn_state): - from loopy.library.reduction import ArgExtOp - if isinstance(expr.function, ArgExtOp): - return IdentityMapper.map_call(self, expr, expn_state) - - name, tag = parse_tagged_name(expr.function) + name, tag = parse_tagged_name(expr.function.function) if name not in self.rule_mapping_context.old_subst_rules: expanded_expr = self.subst_expander(expr) @@ -668,19 +713,20 @@ def register_pymbolic_calls_to_knl_callables(kernel, # No matching in_knl_callable found => make a new one with a new # name. - unique_name = next_indexed_name(pymbolic_call.function.name) - while unique_name in scoped_names_to_functions: + unique_var = next_indexed_variable(pymbolic_call.function.function) + while unique_var in scoped_names_to_functions and not isinstance( + unique_var, ArgExtOp): # keep on finding new names till one a unique one is found. 
- unique_name = next_indexed_name(unique_name) + unique_var = next_indexed_variable(unique_var) # book-keeping of the functions and names mappings for later use if isinstance(in_knl_callable, CallableKernel): # for array calls the name in the target is the name of the # scoped funciton in_knl_callable = in_knl_callable.copy( - name_in_target=unique_name) - scoped_names_to_functions[unique_name] = in_knl_callable - scoped_functions_to_names[in_knl_callable] = unique_name + name_in_target=unique_var.name) + scoped_names_to_functions[unique_var] = in_knl_callable + scoped_functions_to_names[in_knl_callable] = unique_var pymbolic_calls_to_new_names[pymbolic_call] = ( scoped_functions_to_names[in_knl_callable]) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index fc8afd330..c72d5da19 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -207,6 +207,13 @@ class ReductionOpFunction(FunctionIdentifier): def name(self): return self.__class__.__name__ + def copy(self, reduction_op=None): + if reduction_op is None: + reduction_op = self.reduction_op + + return type(self)(reduction_op) + + # }}} @@ -324,7 +331,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + return ScopedFunction(ArgExtOp(self))(*(operand1 + operand2)) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c581fa2ad..101a2d496 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2153,11 +2153,6 @@ class ArgDescrInferenceMapper(CombineMapper): def map_call(self, expr, **kwargs): from loopy.kernel.function_interface import ValueArgDescriptor from loopy.symbolic import SubArrayRef - from loopy.library.reduction import ArgExtOp - - if isinstance(expr.function, ArgExtOp): - # Special treatment to ArgExtOp - return self.combine((self.rec(child) for child in expr.parameters)) # descriptors for the args arg_id_to_descr = dict((i, @@ -2188,7 +2183,7 @@ class ArgDescrInferenceMapper(CombineMapper): # specializing the function according to the parameter description new_scoped_function = ( - self.kernel.scoped_functions[expr.function.name].with_descrs( + self.kernel.scoped_functions[expr.function.function].with_descrs( combined_arg_id_to_descr)) # collecting the descriptors for args, kwargs, assignees @@ -2305,7 +2300,7 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): expr.parameters)) is_ready_for_codegen = self.kernel.scoped_functions[ - expr.function.name].is_ready_for_codegen() + expr.function.function].is_ready_for_codegen() return self.combine( (is_ready_for_codegen,) + tuple( diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 7ce713004..9aa464dc3 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -111,7 +111,8 @@ class IdentityMapperMixin(object): return SubArrayRef(self.rec(expr.swept_inames, *args), self.rec(expr.subscript, *args)) - map_scoped_function = IdentityMapperBase.map_variable + def map_scoped_function(self, expr, *args): + return ScopedFunction(self.rec(expr.function, *args)) map_type_cast = map_type_annotation @@ -177,7 +178,11 @@ class WalkMapper(WalkMapperBase): self.rec(expr.swept_inames, *args) self.rec(expr.subscript, *args) - map_scoped_function = WalkMapperBase.map_variable + def map_scoped_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) class CallbackMapper(CallbackMapperBase, IdentityMapper): @@ 
-191,8 +196,6 @@ class CombineMapper(CombineMapperBase): def map_sub_array_ref(self, expr): return self.rec(expr.get_begin_subscript()) - map_scoped_function = CombineMapperBase.map_variable - map_linear_subscript = CombineMapperBase.map_subscript @@ -320,7 +323,8 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) - map_scoped_function = DependencyMapperBase.map_variable + def map_scoped_function(self, expr): + return self.rec(expr.function) class SubstitutionRuleExpander(IdentityMapper): @@ -671,14 +675,29 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") -class ScopedFunction(p.Variable): +class ScopedFunction(p.Expression): """ Connects a call to a callable available in a kernel. - .. attribute:: name + .. attribute:: function An instance of :class:`pymbolic.primitives.Variable` or `loopy.library.reduction.ArgExtOp`. """ + init_arg_names = ("function", ) + + def __init__(self, function): + if isinstance(function, str): + function = p.Variable(function) + from loopy.library.reduction import ArgExtOp + assert isinstance(function, (p.Variable, ArgExtOp)) + self.function = function + + @property + def name(self): + return self.function.name + + def __getinitargs__(self): + return (self.function, ) def stringifier(self): return StringifyMapper @@ -824,9 +843,10 @@ def get_dependencies(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, p.Variable): + elif isinstance(expr, (p.Variable, ArgExtOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 3dcc846c7..036a6f64b 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -934,86 +934,6 @@ class CASTBuilder(ASTBuilderBase): lhs_expr, rhs_expr, lhs_dtype): raise NotImplementedError("atomic updates in %s" % type(self).__name__) - def emit_arg_extop(self, codegen_state, - insn): - - ecm = codegen_state.expression_to_code_mapper - - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None - - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. 
- return self.emit_tuple_assignment(codegen_state, insn) - - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes)) - - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) - var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) - - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: - from cgen import ExpressionStatement - return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast( - mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) - def emit_tuple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper @@ -1038,13 +958,9 @@ class CASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - from loopy.library.reduction import ArgExtOp - if isinstance(insn.expression.function, ArgExtOp): - return self.emit_arg_extop(codegen_state, - insn) ecm = codegen_state.expression_to_code_mapper - func_id = insn.expression.function.name + func_id = insn.expression.function.function in_knl_callable = codegen_state.kernel.scoped_functions[func_id] if in_knl_callable.name_in_target == 'loopy_make_tuple': diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 2dd1a14ea..4dc5a54bc 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -390,7 +390,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): # {{{ implement indexof, indexof_vec - identifier_name = self.kernel.scoped_functions[expr.function.name].name + identifier_name = self.kernel.scoped_functions[expr.function.function].name if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: raise LoopyError("%s takes exactly one argument" % identifier_name) @@ -432,7 +432,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} - return self.kernel.scoped_functions[expr.function.name].emit_call( + return self.kernel.scoped_functions[expr.function.function].emit_call( expression_to_code_mapper=self, expression=expr, target=self.kernel.target) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 233da62d1..de4fcfc1f 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -285,7 +285,7 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type if isinstance(expr.function, ScopedFunction): in_knl_callable = ( - self.scoped_functions[expr.function.name].with_types( + 
self.scoped_functions[expr.function.function].with_types( arg_id_to_dtype, self.kernel)) # storing the type specialized function so that it can be used for -- GitLab From 2c79b03647788d66c7aa60aada999a2581e2a638 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 12 Apr 2018 00:46:52 -0500 Subject: [PATCH 087/916] Fixes test_dg --- loopy/kernel/function_interface.py | 2 +- loopy/symbolic.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 001f23808..eff2f8941 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -668,7 +668,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): self.subst_expander = subst_expander def map_call(self, expr, expn_state): - name, tag = parse_tagged_name(expr.function.function) + name, tag = parse_tagged_name(expr.function) if name not in self.rule_mapping_context.old_subst_rules: expanded_expr = self.subst_expander(expr) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 9aa464dc3..7310df23a 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -187,6 +187,7 @@ class WalkMapper(WalkMapperBase): class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_scoped_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): @@ -846,6 +847,8 @@ def parse_tagged_name(expr): from loopy.library.reduction import ArgExtOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag + elif isinstance(expr, ScopedFunction): + return parse_tagged_name(expr.function) elif isinstance(expr, (p.Variable, ArgExtOp)): return expr.name, None else: -- GitLab From c4b030d4cca8400e147148d6403c4d5da1f84906 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 14 Apr 2018 23:40:59 -0500 Subject: [PATCH 088/916] Old mangler interface given. --- loopy/kernel/data.py | 2 - loopy/kernel/function_interface.py | 85 +++++++++++++++++++++++++++- loopy/preprocess.py | 36 ++++++++---- loopy/target/c/codegen/expression.py | 10 ++++ loopy/transform/register_knl.py | 3 +- loopy/type_inference.py | 77 +++++++++++++++++-------- 6 files changed, 173 insertions(+), 40 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index f60e1ddb1..c90e8a64b 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -625,8 +625,6 @@ class CallMangleInfo(ImmutableRecord): """ def __init__(self, target_name, result_dtypes, arg_dtypes): - # added for debugging - raise NotImplementedError("Please use the new interface! :-)") assert isinstance(result_dtypes, tuple) super(CallMangleInfo, self).__init__( diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index eff2f8941..f7cf5fd1c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -257,7 +257,7 @@ class InKernelCallable(ImmutableRecord): # }}} -# {{{ callables on scalar +# {{{ scalar callable class ScalarCallable(InKernelCallable): """ @@ -585,7 +585,13 @@ class CallableKernel(InKernelCallable): def generate_preambles(self, target): """ This would generate the target specific preamble. """ - raise NotImplementedError() + # FIXME: This is not correct, as the code code preamble generated + # during the code generationg of the child kernel, does not guarantee + # that this thing would be updated. 
+ for preamble in self.subkernel.preambles: + yield preamble + + return def emit_call_insn(self, insn, target, expression_to_code_mapper): @@ -636,6 +642,72 @@ class CallableKernel(InKernelCallable): # }}} +class ManglerCallable(ScalarCallable): + """ + A callable whose characateristic is defined by a function mangler. + + .. attribute function_mangler:: + + A function of signature ``(target, name , arg_dtypes)`` and returns an + instance of ``loopy.CallMangleInfo``. + """ + fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", + "arg_id_to_descr", "name_in_target") + + def __init__(self, name, function_mangler, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + self.function_mangler = function_mangler + + super(ManglerCallable, self).__init__( + name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) + + def __getinitargs__(self): + return (self.name, self.function_mangler, self.arg_id_to_dtype, + self.arg_id_to_descr, self.name_in_target) + + def with_types(self, arg_id_to_dtype, kernel): + if self.arg_id_to_dtype is not None: + # specializing an already specialized function. + for id, dtype in arg_id_to_dtype.items(): + # only checking for the ones which have been provided + # if does not match, returns an error. + if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + raise LoopyError("Overwriting a specialized" + " function is illegal--maybe start with new instance of" + " ManglerCallable?") + + sorted_keys = sorted(arg_id_to_dtype.keys()) + arg_dtypes = tuple(arg_id_to_dtype[key] for key in sorted_keys if + key >= 0) + + mangle_result = self.function_mangler(kernel.target, self.name, + arg_dtypes) + if mangle_result: + new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) + new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in + enumerate(mangle_result.result_dtypes))) + return self.copy(name_in_target=mangle_result.target_name, + arg_id_to_dtype=new_arg_id_to_dtype) + else: + # The function mangler does not agree with the arg id to dtypes + # provided. Indicating that is illegal. + raise LoopyError("Function %s not coherent with the provided types." % ( + self.name, kernel.target)) + + def mangle_result(self, kernel): + sorted_keys = sorted(self.arg_id_to_dtype.keys()) + arg_dtypes = tuple(self.arg_id_to_dtype[key] for key in sorted_keys if + key >= 0) + + return self.function_mangler(kernel.target, self.name, arg_dtypes) + + # {{{ new pymbolic calls to scoped functions def next_indexed_variable(function): @@ -712,8 +784,15 @@ def register_pymbolic_calls_to_knl_callables(kernel, if in_knl_callable not in scoped_functions_to_names: # No matching in_knl_callable found => make a new one with a new # name. + if isinstance(pymbolic_call.function, Variable): + pymbolic_call_function = pymbolic_call.function + elif isinstance(pymbolic_call.function, ScopedFunction): + pymbolic_call_function = pymbolic_call.function.function + else: + raise NotImplementedError("Unknown type %s for pymbolic call " + "function." % type(pymbolic_call)) - unique_var = next_indexed_variable(pymbolic_call.function.function) + unique_var = next_indexed_variable(pymbolic_call_function) while unique_var in scoped_names_to_functions and not isinstance( unique_var, ArgExtOp): # keep on finding new names till one a unique one is found. 
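ManglerCallable above wraps the old function-mangler interface, so a mangler keeps its familiar shape: a callable taking (target, name, arg_dtypes) that returns a CallMangleInfo or None. A hedged sketch of one (my_func is a made-up name); a mangler like this is what the type-inference fallback below iterates over via kernel.function_manglers:

    import loopy as lp

    def my_mangler(target, name, arg_dtypes):
        # return a CallMangleInfo when the mangler recognizes the call,
        # None otherwise so other manglers get a chance
        if name == "my_func" and len(arg_dtypes) == 1:
            return lp.CallMangleInfo(
                    target_name="my_func",
                    result_dtypes=(arg_dtypes[0],),
                    arg_dtypes=tuple(arg_dtypes))
        return None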
diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 101a2d496..998ad502b 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2152,7 +2152,11 @@ class ArgDescrInferenceMapper(CombineMapper): def map_call(self, expr, **kwargs): from loopy.kernel.function_interface import ValueArgDescriptor - from loopy.symbolic import SubArrayRef + from loopy.symbolic import SubArrayRef, ScopedFunction + + # ignoring if the call is not to a ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) # descriptors for the args arg_id_to_descr = dict((i, @@ -2293,19 +2297,30 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): def map_call(self, expr, *args, **kwargs): from loopy.library.reduction import ArgExtOp + from pymbolic.primitives import Variable + from loopy.symbolic import ScopedFunction + if isinstance(expr.function, ArgExtOp): return self.combine( tuple( self.rec(child, *args, **kwargs) for child in expr.parameters)) - - is_ready_for_codegen = self.kernel.scoped_functions[ - expr.function.function].is_ready_for_codegen() - return self.combine( - (is_ready_for_codegen,) + - tuple( - self.rec(child, *args, **kwargs) for child in expr.parameters) - ) + elif isinstance(expr.function, Variable): + # UnScopedFunction obtained and hence clearly not ready for + # codegen. + return False + + elif isinstance(expr.function, ScopedFunction): + is_ready_for_codegen = self.kernel.scoped_functions[ + expr.function.function].is_ready_for_codegen() + return self.combine( + (is_ready_for_codegen,) + + tuple( + self.rec(child, *args, **kwargs) + for child in expr.parameters)) + else: + raise LoopyError("Unexpected function type %s obtained in %s" + % (type(expr.function), expr)) def map_call_with_kwargs(self, expr, *args, **kwargs): is_ready_for_codegen = self.kernel.scoped_functions[ @@ -2361,7 +2376,8 @@ def make_functions_ready_for_codegen(kernel): expr = subst_expander(insn.expression) if not unready_functions_collector(expr): # Infer the type of the functions that are not type specialized. 
- type_inf_mapper(expr) + type_inf_mapper(expr, return_tuple=isinstance(insn, + CallInstruction), return_dtype_set=True) elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): pass diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 4dc5a54bc..27a62b649 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -431,6 +431,16 @@ class ExpressionToCExpressionMapper(IdentityMapper): raise RuntimeError("should not get here") # }}} + from loopy.kernel.function_interface import ManglerCallable + if isinstance(self.kernel.scoped_functions[expr.function.function], + ManglerCallable): + from loopy.codegen import SeenFunction + in_knl_callable = self.kernel.scoped_functions[expr.function.function] + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) return self.kernel.scoped_functions[expr.function.function].emit_call( expression_to_code_mapper=self, diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 20e3817f9..221f2abef 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -66,7 +66,8 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # making the target of the child kernel to be same as the target of parent # kernel. - updated_scoped_functions[function_name] = CallableKernel( + from pymbolic.primitives import Variable + updated_scoped_functions[Variable(function_name)] = CallableKernel( subkernel=callee_kernel.copy(target=caller_kernel.target)) # returning the parent kernel with the new scoped function dictionary diff --git a/loopy/type_inference.py b/loopy/type_inference.py index de4fcfc1f..20c7dc8a2 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -304,27 +304,56 @@ class TypeInferenceMapper(CombineMapper): else: return [new_arg_id_to_dtype[-1]] - return [] + elif isinstance(expr.function, Variable): + # Since, the function is not "scoped", attempt to infer using + # kernel.function_manlgers + arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in + expr.parameters) + + # finding the function_mangler which would be associated with the + # realized function. + mangle_result = None + for function_mangler in self.kernel.function_manglers: + mangle_result = function_mangler(self.kernel.target, identifier, + arg_dtypes) + if mangle_result: + # found a match. 
+ break - """ - # Letting this stay over here, as it maybe needed later for maintaining - # backward compatibility: ~KK - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] - else: if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct assignments") + from loopy.kernel.function_interface import (ManglerCallable, + ValueArgDescriptor) + + # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes + arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) + arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in + enumerate(mangle_result.result_dtypes))) + arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.arg_dtypes)) + res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.result_dtypes)) + arg_id_to_descr = dict(arg_descrs+res_descrs) + + # creating the ManglerCallable object corresponding to the + # function. + self.specialized_functions[expr] = ManglerCallable( + identifier, function_mangler, arg_id_to_dtype, + arg_id_to_descr, mangle_result.target_name) + + # Returning the type. + if return_tuple: + if mangle_result is not None: + return [mangle_result.result_dtypes] + else: + if mangle_result is not None: + if len(mangle_result.result_dtypes) != 1 and not return_tuple: + raise LoopyError("functions with more or fewer than one " + "return value may only be used in direct " + "assignments") - return [mangle_result.result_dtypes[0]] + return [mangle_result.result_dtypes[0]] - raise RuntimeError("unable to resolve " - "function '%s' with %d given arguments" - % (identifier, len(arg_dtypes))) - """ + return [] def map_variable(self, expr): if expr.name in self.kernel.all_inames(): @@ -532,12 +561,6 @@ def infer_unknown_types(kernel, expect_completion=False): logger.debug("%s: infer types" % kernel.name) - if expect_completion: - # if completion is expected, then it is important that all the - # callables are scoped. - from loopy.check import check_functions_are_scoped - check_functions_are_scoped(kernel) - from functools import partial debug = partial(_debug, kernel) @@ -703,9 +726,15 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.function_interface import ( register_pymbolic_calls_to_knl_callables) - return register_pymbolic_calls_to_knl_callables( + type_specialized_kernel = register_pymbolic_calls_to_knl_callables( pre_type_specialized_knl, specialized_functions) + if expect_completion: + # if completion is expected, then it is important that all the + # callables are scoped. 
+ from loopy.check import check_functions_are_scoped + check_functions_are_scoped(type_specialized_kernel) + return type_specialized_kernel # }}} -- GitLab From 0aba2097c1cfe21b0cc5370b8ca1b13642535262 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 15 Apr 2018 11:38:45 -0500 Subject: [PATCH 089/916] Suports arg_max --- loopy/kernel/__init__.py | 10 +++++----- loopy/kernel/creation.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index f998cb9a0..051f080c7 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -35,9 +35,9 @@ import re from pytools import UniqueNameGenerator, generate_unique_names -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) +# from loopy.library.function import ( +# default_function_mangler, +# single_arg_function_mangler) from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted @@ -197,8 +197,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): iname_to_tag={}, substitutions={}, function_manglers=[ - default_function_mangler, - single_arg_function_mangler, + # default_function_mangler, + # single_arg_function_mangler, ], scoped_functions={}, symbol_manglers=[], diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index ca64a3157..4b7fd8a22 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1909,7 +1909,8 @@ class FunctionScoper(RuleAwareIdentityMapper): elif isinstance(expr.operation, ArgMaxReductionOperation): self.scoped_functions[var("max")] = ScalarCallable("max") self.scoped_functions[var("make_tuple")] = ScalarCallable("make_tuple") - + self.scoped_functions[ArgExtOp(expr.operation)] = ScalarCallable( + expr.operation) elif isinstance(expr.operation, ArgMinReductionOperation): self.scoped_functions[var("min")] = ScalarCallable("min") self.scoped_functions[var("make_tuple")] = ScalarCallable("make_tuple") -- GitLab From daae8fae81860c1837eb76eaf236ec55270cc14b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 15 Apr 2018 23:40:09 -0500 Subject: [PATCH 090/916] Got rid of debug statements :-) --- loopy/target/opencl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 60546a7a6..199b8854b 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -279,7 +279,6 @@ def with_types_for_opencl_target(in_knl_callable, arg_id_to_dtype): arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}) if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - print(arg_id_to_dtype) num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] for id in arg_id_to_dtype: if not -1 <= id < num_args: -- GitLab From be3078fb7d26719d1f1eff4f0374a977a21c8631 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 15 Apr 2018 23:40:41 -0500 Subject: [PATCH 091/916] Added missing finish_kenrel for a subclass of RuleAwareIdentityMapper --- loopy/kernel/creation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 4b7fd8a22..2e49b7b74 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1939,7 +1939,8 @@ def scope_functions(kernel, function_identifiers=None): function_scoper = FunctionScoper(rule_mapping_context, function_identifiers) # scoping fucntions and collecting the scoped functions - kernel_with_scoped_functions = function_scoper.map_kernel(kernel) + kernel_with_scoped_functions = 
rule_mapping_context.finish_kernel( + function_scoper.map_kernel(kernel)) # updating the functions collected during the scoped functions updated_scoped_functions = kernel.scoped_functions.copy() -- GitLab From 7bf054312f6151780dde614d3306d08e9dec1445 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Apr 2018 01:48:12 -0500 Subject: [PATCH 092/916] supports segmented scan operations. --- loopy/kernel/creation.py | 7 +++- loopy/kernel/function_interface.py | 59 +++++++++++++++++++++++++----- loopy/library/reduction.py | 2 +- loopy/preprocess.py | 4 +- loopy/symbolic.py | 8 ++-- loopy/target/c/__init__.py | 11 ++---- 6 files changed, 67 insertions(+), 24 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 2e49b7b74..a306280b0 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1898,7 +1898,8 @@ class FunctionScoper(RuleAwareIdentityMapper): from loopy.kernel.function_interface import ScalarCallable from loopy.library.reduction import (MaxReductionOperation, MinReductionOperation, ArgMinReductionOperation, - ArgMaxReductionOperation) + ArgMaxReductionOperation, _SegmentedScalarReductionOperation, + SegmentedOp) from pymbolic import var from loopy.library.reduction import ArgExtOp @@ -1916,6 +1917,10 @@ class FunctionScoper(RuleAwareIdentityMapper): self.scoped_functions[var("make_tuple")] = ScalarCallable("make_tuple") self.scoped_functions[ArgExtOp(expr.operation)] = ScalarCallable( expr.operation) + elif isinstance(expr.operation, _SegmentedScalarReductionOperation): + self.scoped_functions[var("make_tuple")] = ScalarCallable("make_tuple") + self.scoped_functions[SegmentedOp(expr.operation)] = ScalarCallable( + expr.operation) return super(FunctionScoper, self).map_reduction(expr, expn_state) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index f7cf5fd1c..d08cc2e2f 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -34,7 +34,8 @@ from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name from loopy.library.reduction import ArgExtOp -from loopy.library.reduction import _ArgExtremumReductionOperation +from loopy.library.reduction import (_ArgExtremumReductionOperation, + _SegmentedScalarReductionOperation) from loopy.symbolic import (IdentityMapper, ScopedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, @@ -328,7 +329,18 @@ class ScalarCallable(InKernelCallable): name_in_target="loopy_arg%s_%s_%s_op" % (self.name.which, scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__)) - + elif isinstance(self.name, _SegmentedScalarReductionOperation): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = result_dtypes[0] + new_arg_id_to_dtype[-2] = result_dtypes[1] + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_segmented_%s_%s_%s_op" % (self.name.which, + scalar_dtype.numpy_dtype.type.__name__, + index_dtype.numpy_dtype.type.__name__)) else: # did not find a scalar function and function prototype does not # even have subkernel registered => no match found @@ -380,7 +392,8 @@ class ScalarCallable(InKernelCallable): # For example: The code generation of `sincos` would be different for # C-Target and OpenCL-target. - # Currently doing pass by value for all the assignees. 
+ # Currently this is formulated such that the first argument is returned + # and rest all are passed by reference as arguments to the function. assert self.is_ready_for_codegen() @@ -389,14 +402,14 @@ class ScalarCallable(InKernelCallable): assert isinstance(insn, CallInstruction) parameters = insn.expression.parameters - assignees = insn.assignees + assignees = insn.assignees[1:] par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in parameters) arg_dtypes = tuple(self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)) - assignee_dtypes = tuple(self.arg_id_to_dtype[-i-1] for i, _ in + assignee_dtypes = tuple(self.arg_id_to_dtype[-i-2] for i, _ in enumerate(assignees)) from loopy.expression import dtype_to_type_context @@ -425,6 +438,7 @@ class ScalarCallable(InKernelCallable): return var(self.name_in_target)(*c_parameters) def generate_preambles(self, target): + print(self.name) if isinstance(self.name, _ArgExtremumReductionOperation): op = self.name scalar_dtype = self.arg_id_to_dtype[-1] @@ -433,20 +447,20 @@ class ScalarCallable(InKernelCallable): prefix = op.prefix(scalar_dtype, index_dtype) yield (prefix, """ - inline void %(prefix)s_op( + inline %(scalar_t)s %(prefix)s_op( %(scalar_t)s op1, %(index_t)s index1, %(scalar_t)s op2, %(index_t)s index2, - %(scalar_t)s *op, %(index_t)s *index_out) + %(index_t)s *index_out) { if (op2 %(comp)s op1) { *index_out = index2; - *op = op2; + return op2; } else { *index_out = index1; - *op = op1; + return op1; } } """ % dict( @@ -455,6 +469,29 @@ class ScalarCallable(InKernelCallable): index_t=target.dtype_to_typename(index_dtype), comp=op.update_comparison, )) + elif isinstance(self.name, _SegmentedScalarReductionOperation): + print('Danda') + op = self.name + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + print(prefix) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? op2 : %(combined)s; + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) return @@ -642,6 +679,8 @@ class CallableKernel(InKernelCallable): # }}} +# {{{ mangler callable + class ManglerCallable(ScalarCallable): """ A callable whose characateristic is defined by a function mangler. 
@@ -707,6 +746,8 @@ class ManglerCallable(ScalarCallable): return self.function_mangler(kernel.target, self.name, arg_dtypes) +# }}} + # {{{ new pymbolic calls to scoped functions diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index c72d5da19..0c2297ab9 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -255,7 +255,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return SegmentedOp(self)(*(operand1 + operand2)) + return ScopedFunction(SegmentedOp(self))(*(operand1 + operand2)) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 998ad502b..0c5c0096b 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2296,11 +2296,11 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): return all(values) def map_call(self, expr, *args, **kwargs): - from loopy.library.reduction import ArgExtOp + from loopy.library.reduction import ArgExtOp, SegmentedOp from pymbolic.primitives import Variable from loopy.symbolic import ScopedFunction - if isinstance(expr.function, ArgExtOp): + if isinstance(expr.function, (ArgExtOp, SegmentedOp)): return self.combine( tuple( self.rec(child, *args, **kwargs) for child in diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 7310df23a..8da8f4d5f 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -689,8 +689,8 @@ class ScopedFunction(p.Expression): def __init__(self, function): if isinstance(function, str): function = p.Variable(function) - from loopy.library.reduction import ArgExtOp - assert isinstance(function, (p.Variable, ArgExtOp)) + from loopy.library.reduction import ArgExtOp, SegmentedOp + assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp)) self.function = function @property @@ -844,12 +844,12 @@ def get_dependencies(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): - from loopy.library.reduction import ArgExtOp + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag elif isinstance(expr, ScopedFunction): return parse_tagged_name(expr.function) - elif isinstance(expr, (p.Variable, ArgExtOp)): + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 036a6f64b..e40d61687 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -971,14 +971,11 @@ class CASTBuilder(ASTBuilderBase): target=self.target, expression_to_code_mapper=ecm) - from cgen import ExpressionStatement - # FIXME: Depending on the function this can be either an - # ExpressionStatement or Assignment. - # Refer: ScalarCallable::emit_call_insn. It is discussed in detail - # over there. 
- return ExpressionStatement( + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, CExpression(self.get_c_expression_to_code_mapper(), - in_knl_callable_as_call)) + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): -- GitLab From f239599c9c2f81e934d07c81c7f594a428e37f35 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Apr 2018 02:21:34 -0500 Subject: [PATCH 093/916] Removed debug statements --- loopy/kernel/function_interface.py | 3 --- loopy/target/c/__init__.py | 20 +++++++++++++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index d08cc2e2f..97a1bba02 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -438,7 +438,6 @@ class ScalarCallable(InKernelCallable): return var(self.name_in_target)(*c_parameters) def generate_preambles(self, target): - print(self.name) if isinstance(self.name, _ArgExtremumReductionOperation): op = self.name scalar_dtype = self.arg_id_to_dtype[-1] @@ -470,12 +469,10 @@ class ScalarCallable(InKernelCallable): comp=op.update_comparison, )) elif isinstance(self.name, _SegmentedScalarReductionOperation): - print('Danda') op = self.name scalar_dtype = self.arg_id_to_dtype[-1] segment_flag_dtype = self.arg_id_to_dtype[-2] prefix = op.prefix(scalar_dtype, segment_flag_dtype) - print(prefix) yield (prefix, """ inline %(scalar_t)s %(prefix)s_op( diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index e40d61687..965978fed 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -971,11 +971,21 @@ class CASTBuilder(ASTBuilderBase): target=self.target, expression_to_code_mapper=ecm) - from cgen import Assign - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - return Assign(lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), - in_knl_callable_as_call)) + from loopy.kernel.function_interface import (ScalarCallable, + CallableKernel) + if isinstance(in_knl_callable, ScalarCallable): + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + elif isinstance(in_knl_callable, CallableKernel): + from cgen import ExpressionStatement + return ExpressionStatement( + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: + raise NotImplementedError("Unexpected type of In Kernel Callable.") def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): -- GitLab From bd0390dedcfd21f9e903b8c4ca3473122a6fb89a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Apr 2018 02:21:55 -0500 Subject: [PATCH 094/916] Restores support for CallInstructions --- loopy/target/c/__init__.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 965978fed..80bc8114c 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -974,18 +974,27 @@ class CASTBuilder(ASTBuilderBase): from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) if isinstance(in_knl_callable, ScalarCallable): - from cgen import Assign - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - return Assign(lhs_code, - 
CExpression(self.get_c_expression_to_code_mapper(), - in_knl_callable_as_call)) + if insn.assignees: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: + # No return scalar callables + from cgen import ExpressionStatement + return ExpressionStatement( + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + elif isinstance(in_knl_callable, CallableKernel): from cgen import ExpressionStatement return ExpressionStatement( CExpression(self.get_c_expression_to_code_mapper(), in_knl_callable_as_call)) else: - raise NotImplementedError("Unexpected type of In Kernel Callable.") + raise NotImplementedError("Unexpected type %s of In Kernel " + "Callable." % type(in_knl_callable)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): -- GitLab From d33750763d22069359cd09f9707b9a22b02e691f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Apr 2018 02:56:00 -0500 Subject: [PATCH 095/916] switching to loopy syntax fabs -> abs --- test/test_scan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_scan.py b/test/test_scan.py index c45afd0d6..40ef4048b 100644 --- a/test/test_scan.py +++ b/test/test_scan.py @@ -351,7 +351,7 @@ def test_argmax(ctx_factory, i_tag): knl = lp.make_kernel( "{[i,j]: 0<=i<%d and 0<=j<=i}" % n, """ - max_vals[i], max_indices[i] = argmax(j, fabs(a[j]), j) + max_vals[i], max_indices[i] = argmax(j, abs(a[j]), j) """) knl = lp.tag_inames(knl, dict(i=i_tag)) -- GitLab From 77b3dfad32c362acee4fd74287ecd88af5570cbe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Apr 2018 02:59:48 -0500 Subject: [PATCH 096/916] Flake8 --- loopy/target/pyopencl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 138f02137..3a9b75e8f 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -803,7 +803,7 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) def with_types(self, in_knl_callable, arg_id_to_dtype): - from loopy.library.random123 import random123_with_types + # from loopy.library.random123 import random123_with_types new_callable = super(PyOpenCLCASTBuilder, self).with_types(in_knl_callable, arg_id_to_dtype) if new_callable is not None: -- GitLab From 53fb149213d6e97683dc1e98900705096e30af2b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Apr 2018 03:33:04 -0500 Subject: [PATCH 097/916] Moved to the new function interface --- loopy/statistics.py | 9 ++++++++- test/test_reduction.py | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 5e929b618..defc4f6d7 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -711,9 +711,16 @@ class ExpressionOpCounter(CounterBase): map_variable = map_constant def map_call(self, expr): + from loopy.symbolic import ScopedFunction + if isinstance(expr.function, ScopedFunction): + function_identifier = self.knl.scoped_functions[ + expr.function.function].name + else: + function_identifier = expr.function.name + return ToCountMap( {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function), + name='func:'+function_identifier, count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.parameters) diff --git a/test/test_reduction.py 
b/test/test_reduction.py index 866ae9f58..d1754f82f 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -300,7 +300,7 @@ def test_argmax(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i<%d}" % n, """ - max_val, max_idx = argmax(i, fabs(a[i]), i) + max_val, max_idx = argmax(i, abs(a[i]), i) """) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) @@ -400,7 +400,7 @@ def test_parallel_multi_output_reduction(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i<128}", """ - max_val, max_indices = argmax(i, fabs(a[i]), i) + max_val, max_indices = argmax(i, abs(a[i]), i) """) knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.add_dtypes(knl, dict(a=np.float64)) -- GitLab From 745b091de5327ba7923a12ee1ca63dec54344a6a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Apr 2018 18:17:48 -0500 Subject: [PATCH 098/916] Making InKernelCallables pickables. --- loopy/kernel/function_interface.py | 58 +++++++++++++++++++++--------- loopy/type_inference.py | 8 +++-- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 97a1bba02..c87813774 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -105,7 +105,7 @@ class ArrayArgDescriptor(ImmutableRecord): # }}} -# {{{ helper function for callable kenrel -- kw_to_pos +# {{{ helper function for in kernel callables def get_kw_pos_association(kernel): """ @@ -134,6 +134,25 @@ def get_kw_pos_association(kernel): return kw_to_pos, pos_to_kw + +def with_target(in_knl_callable, target): + + if target is None: + raise RuntimeError() + + def with_target_if_not_None(dtype): + if dtype: + return dtype.with_target(target) + else: + return None + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype.copy() + if in_knl_callable.arg_id_to_dtype: + new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, + dtype in in_knl_callable.arg_id_to_dtype.items()) + + return in_knl_callable.copy(arg_id_to_dtype=new_arg_id_to_dtype) + # }}} @@ -274,7 +293,7 @@ class ScalarCallable(InKernelCallable): def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): - super(InKernelCallable, self).__init__( + super(ScalarCallable, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -299,24 +318,27 @@ class ScalarCallable(InKernelCallable): if self.name in kernel.target.get_device_ast_builder( ).function_identifiers(): + # Searching the function within the namespace of the target. 
new_in_knl_callable = kernel.target.get_device_ast_builder().with_types( self, arg_id_to_dtype) + # adding target attribute to the NumpyTypes if new_in_knl_callable is None: new_in_knl_callable = self.copy() - return new_in_knl_callable + return with_target(new_in_knl_callable, kernel.target) elif self.name in ["indexof", "indexof_vec"]: new_arg_id_to_dtype = arg_id_to_dtype.copy() new_arg_id_to_dtype[-1] = kernel.index_dtype - return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype), + kernel.target) elif self.name == "make_tuple": new_arg_id_to_dtype = arg_id_to_dtype.copy() for i in range(len(arg_id_to_dtype)): if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: - new_arg_id_to_dtype[-i-1] = arg_id_to_dtype[i] + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] - return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target="loopy_make_tuple") + return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple"), kernel.target) elif isinstance(self.name, _ArgExtremumReductionOperation): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] @@ -325,10 +347,10 @@ class ScalarCallable(InKernelCallable): new_arg_id_to_dtype = arg_id_to_dtype.copy() new_arg_id_to_dtype[-1] = result_dtypes[0] new_arg_id_to_dtype[-2] = result_dtypes[1] - return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target="loopy_arg%s_%s_%s_op" % (self.name.which, scalar_dtype.numpy_dtype.type.__name__, - index_dtype.numpy_dtype.type.__name__)) + index_dtype.numpy_dtype.type.__name__)), kernel.target) elif isinstance(self.name, _SegmentedScalarReductionOperation): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] @@ -337,10 +359,10 @@ class ScalarCallable(InKernelCallable): new_arg_id_to_dtype = arg_id_to_dtype.copy() new_arg_id_to_dtype[-1] = result_dtypes[0] new_arg_id_to_dtype[-2] = result_dtypes[1] - return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target="loopy_segmented_%s_%s_%s_op" % (self.name.which, scalar_dtype.numpy_dtype.type.__name__, - index_dtype.numpy_dtype.type.__name__)) + index_dtype.numpy_dtype.type.__name__)), kernel.target) else: # did not find a scalar function and function prototype does not # even have subkernel registered => no match found @@ -523,14 +545,16 @@ class CallableKernel(InKernelCallable): def __init__(self, subkernel, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): - super(InKernelCallable, self).__init__( + super(CallableKernel, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) if name_in_target is not None: subkernel = subkernel.copy(name=name_in_target) self.name_in_target = name_in_target - self.subkernel = subkernel + self.subkernel = subkernel.copy( + args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) + if arg.dtype is not None else arg for arg in subkernel.args]) def __getinitargs__(self): return (self.name, self.subkernel, self.arg_id_to_dtype, @@ -571,8 +595,8 @@ class CallableKernel(InKernelCallable): # Returning the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype - return self.copy(subkernel=specialized_kernel, - arg_id_to_dtype=new_arg_id_to_dtype) + return with_target(self.copy(subkernel=specialized_kernel, + arg_id_to_dtype=new_arg_id_to_dtype), kernel.target) def 
with_descrs(self, arg_id_to_descr): @@ -728,8 +752,8 @@ class ManglerCallable(ScalarCallable): new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in enumerate(mangle_result.result_dtypes))) - return self.copy(name_in_target=mangle_result.target_name, - arg_id_to_dtype=new_arg_id_to_dtype) + return with_target(self.copy(name_in_target=mangle_result.target_name, + arg_id_to_dtype=new_arg_id_to_dtype), kernel.target) else: # The function mangler does not agree with the arg id to dtypes # provided. Indicating that is illegal. diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 20c7dc8a2..51555ab3b 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -325,9 +325,11 @@ class TypeInferenceMapper(CombineMapper): ValueArgDescriptor) # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes - arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) - arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in - enumerate(mangle_result.result_dtypes))) + arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) + for i, dt in enumerate(mangle_result.arg_dtypes)) + arg_id_to_dtype.update(dict((-i-1, + dtype.with_target(self.kernel.target)) for i, dtype in enumerate( + mangle_result.result_dtypes))) arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in enumerate(mangle_result.arg_dtypes)) res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in -- GitLab From 592e2b9ab12048396b8d52960bae937e9ecfcc9c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 15:26:38 -0500 Subject: [PATCH 099/916] fixes small error in map_type_annotation --- loopy/symbolic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8da8f4d5f..301cb4898 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -105,7 +105,7 @@ class IdentityMapperMixin(object): return expr def map_type_annotation(self, expr, *args): - return type(expr)(expr.type, self.rec(expr.child)) + return type(expr)(expr.type, self.rec(expr.child, *args)) def map_sub_array_ref(self, expr, *args): return SubArrayRef(self.rec(expr.swept_inames, *args), -- GitLab From 38114fce1a40f02db1ea2cf3592a907358203557 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 15:27:38 -0500 Subject: [PATCH 100/916] fixes small error to take care of None arg_id_to_dtypes --- loopy/kernel/function_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index c87813774..9fb427fd7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -146,7 +146,7 @@ def with_target(in_knl_callable, target): else: return None - new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype.copy() + new_arg_id_to_dtype = None if in_knl_callable.arg_id_to_dtype: new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, dtype in in_knl_callable.arg_id_to_dtype.items()) -- GitLab From bfaf375d9198824327ea66b697f332aa6d9aa444 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 15:28:06 -0500 Subject: [PATCH 101/916] nice looking code --- loopy/target/c/codegen/expression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 27a62b649..110f3f035 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -431,6 +431,7 @@ class 
ExpressionToCExpressionMapper(IdentityMapper): raise RuntimeError("should not get here") # }}} + from loopy.kernel.function_interface import ManglerCallable if isinstance(self.kernel.scoped_functions[expr.function.function], ManglerCallable): -- GitLab From 2db932266977cf8193cb5d90d31e7ee21b17e2fa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 15:28:25 -0500 Subject: [PATCH 102/916] switchiing to new function interface. --- loopy/target/python.py | 44 +++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/loopy/target/python.py b/loopy/target/python.py index 8d1a0345b..696f3245e 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -82,47 +82,35 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.kernel.scoped_functions[expr.function.function].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + from loopy.kernel.function_interface import ManglerCallable + in_knl_callable = self.kernel.scoped_functions[expr.function.function] + if isinstance(in_knl_callable, ManglerCallable): + from loopy.codegen import SeenFunction + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) str_parameters = None + number_of_assignees = len([key for key in + in_knl_callable.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") -- GitLab From 990a342b0b7c7211a8202330daea710a450b67f7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 15:35:30 -0500 Subject: [PATCH 103/916] Fixes a small error in the conditional statement. 
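The condition guarding the _CL_SIMPLE_MULTI_ARG_FUNCTIONS specialization was inverted: it bailed out as soon as one of the argument dtypes was already known, instead of bailing out when one was still missing. The intended readiness check is, roughly (a sketch only; the real code loops over range(count) inside with_types_for_opencl_target):

    def types_are_ready(arg_id_to_dtype, num_args):
        # specialize only once every positional argument has a known dtype
        return all(i in arg_id_to_dtype and arg_id_to_dtype[i] is not None
                   for i in range(num_args))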
--- loopy/target/opencl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 199b8854b..cd9f73fa9 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -314,7 +314,7 @@ def with_types_for_opencl_target(in_knl_callable, arg_id_to_dtype): num_args)) for i in range(count): - if i not in arg_id_to_dtype or arg_id_to_dtype[i] is not None: + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: # the types provided aren't mature enough to specialize the # callable return None -- GitLab From a8d435f1d89105b26ea65a4dfb6020caae5115a1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 20:22:22 -0500 Subject: [PATCH 104/916] Added with_types for random123 functions --- loopy/library/random123.py | 77 +++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 5cc3dd9ce..31fdb527e 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -163,21 +163,18 @@ double${ width } ${ name }_f64( # }}} -def random123_preamble_generator(preamble_info): - for f in preamble_info.seen_functions: - try: - rng_variant = FUNC_NAMES_TO_RNG[f.name] - except KeyError: - continue - - from loopy.target.pyopencl import PyOpenCLTarget - yield ("90-random123-"+rng_variant.full_name, - PREAMBLE_TEMPLATE.render( - is_pyopencl_target=isinstance( - preamble_info.kernel.target, - PyOpenCLTarget), - rng_variant=rng_variant, - )) +def random123_preamble_generator(name, target): + + rng_variant = FUNC_NAMES_TO_RNG[name] + + from loopy.target.pyopencl import PyOpenCLTarget + return ("90-random123-"+rng_variant.full_name, + PREAMBLE_TEMPLATE.render( + is_pyopencl_target=isinstance( + target, + PyOpenCLTarget), + rng_variant=rng_variant, + )) def random123_function_identifiers(): @@ -225,44 +222,54 @@ def random123_function_mangler(kernel, name, arg_dtypes): def random123_with_types(in_knl_callable, arg_id_to_dtype, target): - # FIXME: Translate the mangler to this. 
name = in_knl_callable.name if name not in FUNC_NAMES_TO_RNG: return None rng_variant = FUNC_NAMES_TO_RNG[name] - 1/0 from loopy.types import NumpyType base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - from loopy.kernel.data import CallMangleInfo fn = rng_variant.full_name if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) + new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return in_knl_callable.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=fn+"_gen") elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return None + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + if arg_id_to_dtype[0] != new_arg_id_to_dtype[0]: + print(arg_id_to_dtype) + print(new_arg_id_to_dtype) + 1/0 + + if arg_id_to_dtype[1] != new_arg_id_to_dtype[1]: + print(arg_id_to_dtype) + print(new_arg_id_to_dtype) + 1/0 + return in_knl_callable.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name) elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return in_knl_callable.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name) else: return None -- GitLab From db6f5b1efebab3ad989661651e630880f59aa780 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 20:23:22 -0500 Subject: [PATCH 105/916] Added support for random123 functions and ignored the difference between unint and int --- loopy/kernel/function_interface.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 9fb427fd7..811a1b993 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -312,6 +312,14 @@ class ScalarCallable(InKernelCallable): for id, dtype in arg_id_to_dtype.items(): # only checking for the ones which have been provided if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + import numpy as np + if self.arg_id_to_dtype[id].dtype.type == np.uint32 and ( + arg_id_to_dtype[id].dtype.type == np.int32): + continue + if self.arg_id_to_dtype[id].dtype.type == np.uint64 and ( + arg_id_to_dtype[id].dtype.type == np.int64): + continue + raise LoopyError("Overwriting a specialized" " function is illegal--maybe start with new instance of" " ScalarCallable?") @@ -460,7 +468,12 @@ class ScalarCallable(InKernelCallable): return var(self.name_in_target)(*c_parameters) def generate_preambles(self, target): - if isinstance(self.name, _ArgExtremumReductionOperation): + from loopy.library.random123 import (random123_function_identifiers, + random123_preamble_generator) 
+ if self.name in random123_function_identifiers(): + yield random123_preamble_generator(self.name, target) + + elif isinstance(self.name, _ArgExtremumReductionOperation): op = self.name scalar_dtype = self.arg_id_to_dtype[-1] index_dtype = self.arg_id_to_dtype[-2] @@ -512,6 +525,7 @@ class ScalarCallable(InKernelCallable): combined=op.op % ("op1", "op2"), )) + return # }}} -- GitLab From c678228e74f02836f120c7f9c0e44271b0c9fde5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 20:24:12 -0500 Subject: [PATCH 106/916] streamlined a few lines --- loopy/type_inference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 51555ab3b..d0c1d1e98 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -284,8 +284,10 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type if isinstance(expr.function, ScopedFunction): + in_knl_callable = self.scoped_functions[expr.function.function] + in_knl_callable = ( - self.scoped_functions[expr.function.function].with_types( + in_knl_callable.with_types( arg_id_to_dtype, self.kernel)) # storing the type specialized function so that it can be used for -- GitLab From f8f934181f38d023fa84920e9cd0be4fdd842181 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 20:25:44 -0500 Subject: [PATCH 107/916] Added support for random123_with_types --- loopy/target/pyopencl.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 3a9b75e8f..a9e5f2963 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -796,26 +796,22 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): ]) def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) def with_types(self, in_knl_callable, arg_id_to_dtype): - # from loopy.library.random123 import random123_with_types new_callable = super(PyOpenCLCASTBuilder, self).with_types(in_knl_callable, arg_id_to_dtype) if new_callable is not None: return new_callable - return pyopencl_with_types(in_knl_callable, arg_id_to_dtype) - ''' - # Till the time we have written the RNG with types + new_callable = pyopencl_with_types(in_knl_callable, arg_id_to_dtype) if new_callable is not None: return new_callable - return random123_with_types(in_knl_callable, arg_id_to_dtype) - ''' + from loopy.library.random123 import random123_with_types + return random123_with_types(in_knl_callable, arg_id_to_dtype, + self.target) # }}} -- GitLab From b47531d16d353ccc2b9057e7f1d8ee5bf0608450 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 20:44:45 -0500 Subject: [PATCH 108/916] Placate Flake8 --- loopy/kernel/function_interface.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 811a1b993..984e0a0a0 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -525,7 +525,6 @@ class ScalarCallable(InKernelCallable): combined=op.op % ("op1", "op2"), )) - return # }}} -- GitLab From 1b92beea83da7226ea9369a68ed9ae9df6a640b1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Apr 2018 23:36:37 -0500 Subject: [PATCH 109/916] Fixes the un-pickability of slices in instructions. 
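realize_slices_as_sub_array_refs replaces Slice nodes in expressions with SubArrayRefs over freshly created inames and collects the domains those inames need. The return path now rebuilds the result from the original kernel, carrying both the new slice-iname domains and the rewritten instructions, instead of returning DomainChanger's intermediate kernel. For intuition, the start/stop/step defaulting performed by the slice machinery (get_slice_params) matches Python's own slice.indices:

    # analogy only: resolving an open-ended slice against a dimension length
    print(slice(None, None, 2).indices(10))    # (0, 10, 2)
    print(slice(None, None, -1).indices(10))   # (9, -1, -1)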
--- loopy/kernel/creation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index a306280b0..2f2f753b7 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2106,7 +2106,8 @@ def realize_slices_as_sub_array_refs(kernel): if slice_iname_domains: from loopy.kernel.tools import DomainChanger domch = DomainChanger(kernel.copy(instructions=new_insns), frozenset()) - return domch.get_kernel_with(slice_iname_domains) + return kernel.copy(domains=domch.get_domains_with(slice_iname_domains), + instructions=new_insns) else: return kernel.copy(instructions=new_insns) -- GitLab From 0b142bf2b914d04504e6f3b73adebf3ad37ba6c1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Apr 2018 16:34:37 -0500 Subject: [PATCH 110/916] Added helpful error strings --- loopy/check.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 95da2d531..0b5c50053 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -122,8 +122,7 @@ def check_functions_are_scoped(kernel): elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): pass else: - raise NotImplementedError("check_function_are_scoped not " - "implemented for %s type of instruction." % type(insn)) + raise NotImplementedError("Unknown type of instruction %s." % type(insn)) # }}} -- GitLab From 867f8d0ca5e9b31950adbbc190d61bc372007484 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Apr 2018 16:37:30 -0500 Subject: [PATCH 111/916] removes unhelpful comments --- loopy/codegen/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d308d288e..37294a993 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -262,8 +262,6 @@ class CodeGenerationState(object): schedule_index_end = self.schedule_index_end if is_generating_master_kernel is None: - # By default assumes that code is being generated for a master - # kernel. is_generating_master_kernel = self.is_generating_master_kernel return CodeGenerationState( -- GitLab From ff2c883a7245b688a038ecdbf5134a6e3f3661aa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Apr 2018 16:50:24 -0500 Subject: [PATCH 112/916] Added some helpful comments --- loopy/codegen/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 37294a993..ba04170e2 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -529,6 +529,8 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): auxiliary_dev_progs = [] + # scanning through all the call instructions if there is any instance of + # CallableKernel, whose code is to be generated. for insn in kernel.instructions: if isinstance(insn, CallInstruction): in_knl_callable = kernel.scoped_functions[ @@ -544,8 +546,8 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): _DataObliviousInstruction)): pass else: - raise NotImplementedError("register_knl not made for %s type of " - "instruction" % (str(type(insn)))) + raise NotImplementedError("Unknown type of instruction %s." % ( + str(type(insn)))) codegen_result = generate_host_or_device_program( codegen_state, @@ -591,6 +593,8 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) + # {{{ collecting preambles from all the in kernel callables. 
+ in_knl_callable_collector = InKernelCallablesCollector(kernel) for insn in kernel.instructions: @@ -603,6 +607,8 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): else: raise NotImplementedError("Unkown instruction %s" % type(insn)) + # }}} + codegen_result = codegen_result.copy(device_preambles=preambles) # }}} -- GitLab From d52434cf86617492f143ded09344b2d2b29ee83b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Apr 2018 16:50:42 -0500 Subject: [PATCH 113/916] Removed the default manglers. --- loopy/kernel/__init__.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 051f080c7..e0e2d6776 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -35,10 +35,6 @@ import re from pytools import UniqueNameGenerator, generate_unique_names -# from loopy.library.function import ( -# default_function_mangler, -# single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted from loopy.diagnostic import StaticValueFindingError @@ -196,10 +192,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): temporary_variables={}, iname_to_tag={}, substitutions={}, - function_manglers=[ - # default_function_mangler, - # single_arg_function_mangler, - ], + function_manglers=[], scoped_functions={}, symbol_manglers=[], -- GitLab From 13831f469e80b867cb18f3e14dec885850b0fce0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Apr 2018 17:08:52 -0500 Subject: [PATCH 114/916] Some comments. --- loopy/kernel/creation.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 2f2f753b7..d78ad982e 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1845,6 +1845,11 @@ class FunctionScoper(RuleAwareIdentityMapper): **Example**: If given an expression of the form ``sin(x) + unknown_function(y) + log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + unknown_function(y) + ScopedFunction('log')(z)``. + + :arg rule_mapping_context: An instance of + :class:`loopy.symbolic.RuleMappingContext`. + :arg function_ids: A container with instances of :class:`str` indicating + the function identifiers to look for while scoping functions. """ def __init__(self, rule_mapping_context, function_ids): super(FunctionScoper, self).__init__(rule_mapping_context) @@ -1903,6 +1908,7 @@ class FunctionScoper(RuleAwareIdentityMapper): from pymbolic import var from loopy.library.reduction import ArgExtOp + # Noting down the extra functions arising due to certain reductions. 
if isinstance(expr.operation, MaxReductionOperation): self.scoped_functions[var("max")] = ScalarCallable("max") elif isinstance(expr.operation, MinReductionOperation): @@ -1971,6 +1977,8 @@ def get_slice_params(slice, dimension_length): assert isinstance(slice, Slice) start, stop, step = slice.start, slice.stop, slice.step + # {{{ defaulting parameters + if step is None: step = 1 @@ -1989,6 +1997,8 @@ def get_slice_params(slice, dimension_length): else: stop = -1 + # }}} + return start, stop, step @@ -2003,7 +2013,7 @@ class SliceToInameReplacer(IdentityMapper): :attribute knl: - An instance of :clas:`loopy.LoopKernel` + An instance of :class:`loopy.LoopKernel` :attribute iname_domains: @@ -2061,7 +2071,7 @@ class SliceToInameReplacer(IdentityMapper): def get_iname_domain_as_isl_set(self): """ Returns the extra domain constraints imposed by the slice inames, - recorded in :attr:`iname_domains` + recorded in :attr:`iname_domains`. """ if not self.iname_domains: return None @@ -2081,7 +2091,7 @@ class SliceToInameReplacer(IdentityMapper): def realize_slices_as_sub_array_refs(kernel): """ Returns a kernel with the instances of :class:`pymbolic.primitives.Slice` - interpreted as `loopy.symbolic.SubArrayRef`. + encountered in expressions replaced as `loopy.symbolic.SubArrayRef`. """ unique_var_name_generator = kernel.get_var_name_generator() slice_replacer = SliceToInameReplacer(kernel, unique_var_name_generator) -- GitLab From 5d7bf5e7def390d8f41f13af523165164c9e345e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Apr 2018 17:26:11 -0500 Subject: [PATCH 115/916] Added some comments. More to come! --- loopy/kernel/function_interface.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 984e0a0a0..bee6f9850 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -120,7 +120,6 @@ def get_kw_pos_association(kernel): for arg in kernel.args: # FIXME: Confused about the written and read variables ordering. - # Confirm it with Prof. Andreas. if arg.name not in kernel.get_written_variables(): kw_to_pos[arg.name] = read_count pos_to_kw[read_count] = arg.name @@ -136,11 +135,24 @@ def get_kw_pos_association(kernel): def with_target(in_knl_callable, target): + """ + Returns a copy of :arg:`in_knl_callable` with all the ``dtypes`` in + ``in_knl_callable.arg_id_to_dtype`` as instances of + :class:`loopy.LoopyType`. + + :arg in_knl_callable: An instance of + :class:`loopy.kernel.function_interface.InKernelCallable`. + :arg target: An instance of :class:`loopy.target.TargetBase`. + """ if target is None: raise RuntimeError() def with_target_if_not_None(dtype): + """ + Returns a copy of :arg:`dtype` associated with the target. If + ``dtype`` is *None* returns *None*. + """ if dtype: return dtype.with_target(target) else: -- GitLab From 8e0a3680f8200c3392f65285aead93d24ab75f97 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 Apr 2018 12:11:21 -0500 Subject: [PATCH 116/916] Added comments. 
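Among the behaviours documented in this commit: ScalarCallable.emit_call_insn returns the first assignee and passes the remaining assignees by reference (c, d = f(a, b) becomes c = f(a, b, &d)), and next_indexed_variable bumps a trailing index when a fresh scoped-function name is needed. A standalone sketch of the latter (assuming, beyond the docstring's example, that an un-indexed name simply gains a "_0" suffix; ArgExtOp/SegmentedOp are passed through unchanged by the real helper):

    import re

    def next_indexed_name(name):
        # "sin"   -> "sin_0"
        # "sin_0" -> "sin_1"
        match = re.match(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$", name)
        if match is not None:
            return "%s_%d" % (match.group("alpha"), int(match.group("num")) + 1)
        return name + "_0"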
--- loopy/kernel/function_interface.py | 67 ++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index bee6f9850..630ae76b7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -33,7 +33,7 @@ from loopy.diagnostic import LoopyError from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name -from loopy.library.reduction import ArgExtOp +from loopy.library.reduction import ArgExtOp, SegmentedOp from loopy.library.reduction import (_ArgExtremumReductionOperation, _SegmentedScalarReductionOperation) @@ -320,7 +320,6 @@ class ScalarCallable(InKernelCallable): if self.arg_id_to_dtype is not None: # specializing an already specialized function. - for id, dtype in arg_id_to_dtype.items(): # only checking for the ones which have been provided if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: @@ -336,21 +335,31 @@ class ScalarCallable(InKernelCallable): " function is illegal--maybe start with new instance of" " ScalarCallable?") + # {{{ target specific callables + if self.name in kernel.target.get_device_ast_builder( ).function_identifiers(): - # Searching the function within the namespace of the target. new_in_knl_callable = kernel.target.get_device_ast_builder().with_types( self, arg_id_to_dtype) # adding target attribute to the NumpyTypes if new_in_knl_callable is None: new_in_knl_callable = self.copy() return with_target(new_in_knl_callable, kernel.target) + + # }}} + + # {{{ indexof, indexof_vec + elif self.name in ["indexof", "indexof_vec"]: new_arg_id_to_dtype = arg_id_to_dtype.copy() new_arg_id_to_dtype[-1] = kernel.index_dtype return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype), kernel.target) + # }}} + + # {{{ make_tuple + elif self.name == "make_tuple": new_arg_id_to_dtype = arg_id_to_dtype.copy() for i in range(len(arg_id_to_dtype)): @@ -359,6 +368,11 @@ class ScalarCallable(InKernelCallable): return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target="loopy_make_tuple"), kernel.target) + + # }}} + + # {{{ ArgExtOp, SegmentedOp + elif isinstance(self.name, _ArgExtremumReductionOperation): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] @@ -383,6 +397,9 @@ class ScalarCallable(InKernelCallable): name_in_target="loopy_segmented_%s_%s_%s_op" % (self.name.which, scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__)), kernel.target) + + # }}} + else: # did not find a scalar function and function prototype does not # even have subkernel registered => no match found @@ -426,6 +443,20 @@ class ScalarCallable(InKernelCallable): return var(self.name_in_target)(*processed_parameters) def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a pymbolic call for C-based targets, when the instructions + involve multiple return values along with the required type casting. + The first assignee is returned, but the rest of them are appended to + the parameters and passed by reference. + + :Example: ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` + + :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`. + :arg target: An instance of :class:`loopy.target.TargetBase`. + :arg expression_to_code_mapper: An instance of :class:`IdentityMapper` + responsible for code mapping from :mod:`loopy` syntax to the + **target syntax**. 
+        """
 
         # FIXME: needs to get information about whether the callable has should
         # do pass by reference by all values or should return one value for
@@ -476,7 +507,6 @@ class ScalarCallable(InKernelCallable):
                         dtype_to_type_context(target, tgt_dtype),
                         tgt_dtype).expr))
 
-        from pymbolic import var
         return var(self.name_in_target)(*c_parameters)
 
     def generate_preambles(self, target):
@@ -786,6 +816,10 @@ class ManglerCallable(ScalarCallable):
                     self.name, kernel.target))
 
     def mangle_result(self, kernel):
+        """
+        Returns an instance of :class:`loopy.kernel.data.CallMangleInfo` for
+        the given pair :attr:`function_mangler` and :attr:`arg_id_to_dtype`.
+        """
         sorted_keys = sorted(self.arg_id_to_dtype.keys())
         arg_dtypes = tuple(self.arg_id_to_dtype[key] for key in sorted_keys
                 if key >= 0)
@@ -798,7 +832,17 @@
 # {{{ new pymbolic calls to scoped functions
 
 def next_indexed_variable(function):
-    if isinstance(function, ArgExtOp):
+    """
+    Returns a copy of :arg:`function` with the next indexed-name in the
+    sequence.
+
+    :Example: ``Variable('sin_0')`` will return ``Variable('sin_1')``.
+
+    :arg function: Either an instance of :class:`pymbolic.primitives.Variable`
+        or :class:`loopy.library.reduction.ArgExtOp` or
+        :class:`loopy.library.reduction.SegmentedOp`.
+    """
+    if isinstance(function, (ArgExtOp, SegmentedOp)):
         return function.copy()
     func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$")
 
@@ -851,9 +895,16 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper):
 
 def register_pymbolic_calls_to_knl_callables(kernel,
         pymbolic_exprs_to_knl_callables):
-    """ Takes in a mapping :arg:`pymbolic_exprs_to_knl_callables` and returns a
-    new kernel which includes an association with the given pymbolic calls to
-    instances of :class:`InKernelCallable`
+    """
+    Returns a copy of :arg:`kernel` in which the given pymbolic expressions are
+    associated with the corresponding instances of :class:`InKernelCallable`,
+    as given by the mapping :arg:`pymbolic_exprs_to_knl_callables`.
+
+    :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`.
+
+    :arg pymbolic_exprs_to_knl_callables: A mapping from pymbolic expressions
+        to the instances of
+        :class:`loopy.kernel.function_interface.InKernelCallable`.
     """
 
     scoped_names_to_functions = kernel.scoped_functions.copy()
-- 
GitLab


From 050f93bc2b9b60d8ac057b51d81f0cdb16cba6b2 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Thu, 19 Apr 2018 12:24:56 -0500
Subject: [PATCH 117/916] Added a few comments.

---
 loopy/target/__init__.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 336985ede..5a90dd51e 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -151,6 +151,11 @@ class ASTBuilderBase(object):
     # {{{ library
 
     def function_identifiers(self):
+        """
+        Returns an instance of :class:`set` containing instances of
+        :class:`str` indicating the names of the functions known to the
+        :attr:`ASTBuilderBase.target`.
+        """
         return set()
 
     def function_manglers(self):
@@ -164,10 +169,14 @@
 
     def with_types(self, in_knl_callable, arg_id_to_dtype):
         """
-        Checks the in-kernel callable with the target specific functions and then
-        returns either `None` when no match is found or returns a new type
-        specialized instance of :class:`InKernelCallable`.
-
+        Returns a copy of :arg:`in_knl_callable` along with the return type for
+        the argument types specified by :arg:`arg_id_to_dtype`. Returns *None*
+        if no such function exists for the given types.
+ + :arg in_knl_callable: An instance of + :class:`loopy.kernel.function_interface`. + :arg arg_id_to_dtype: A mapping similar + :meth:`loopy.kernel.function_interface.with_types()` """ return None -- GitLab From 6b1e7a05eb03fe1b6ac3071df0518e75816f6aa1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 22 Apr 2018 23:43:01 -0500 Subject: [PATCH 118/916] Added code for register_function_scoper interface. --- loopy/__init__.py | 3 - loopy/kernel/__init__.py | 37 ++-- loopy/kernel/creation.py | 76 ++++----- loopy/kernel/function_interface.py | 218 ++++-------------------- loopy/library/function.py | 45 +++++ loopy/library/random123.py | 166 +++++++----------- loopy/library/reduction.py | 206 ++++++++++------------- loopy/target/__init__.py | 26 +-- loopy/target/c/__init__.py | 261 ++++++++++------------------- loopy/target/cuda.py | 135 ++++++--------- loopy/target/opencl.py | 260 +++++++++++----------------- loopy/target/pyopencl.py | 143 +++++++--------- loopy/type_inference.py | 29 +++- 13 files changed, 616 insertions(+), 989 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 4fa8c5fc5..f77449d19 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -33,9 +33,6 @@ from loopy.diagnostic import LoopyError, LoopyWarning # {{{ imported user interface -from loopy.library.function import ( - default_function_mangler, single_arg_function_mangler) - from loopy.kernel.instruction import ( memory_ordering, memory_scope, VarAtomicity, AtomicInit, AtomicUpdate, InstructionBase, diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index e0e2d6776..0ea2a2557 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -141,6 +141,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: function_manglers .. attribute:: symbol_manglers + .. attribute:: function_scopers + + A list of functions of signature ``(target, name)`` returning a + :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + .. 
attribute:: substitutions a mapping from substitution names to @@ -193,6 +198,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): iname_to_tag={}, substitutions={}, function_manglers=[], + function_scopers=frozenset(), scoped_functions={}, symbol_manglers=[], @@ -259,6 +265,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT + from loopy.library.function import loopy_specific_callable_scopers + # populating the function scopers from the target and the loopy + # specific callable scopers + function_scopers = frozenset([loopy_specific_callable_scopers]) | ( + target.get_device_ast_builder().function_scopers()) + ImmutableRecordWithoutPickling.__init__(self, domains=domains, instructions=instructions, @@ -278,6 +290,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, function_manglers=function_manglers, + function_scopers=function_scopers, scoped_functions=scoped_functions, symbol_manglers=symbol_manglers, index_dtype=index_dtype, @@ -291,7 +304,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling + # {{{ function mangling/scoping def mangle_function(self, identifier, arg_dtypes, ast_builder=None): if ast_builder is None: @@ -334,18 +347,19 @@ class LoopKernel(ImmutableRecordWithoutPickling): return None - # }}} - - # {{{ target function identifiers - - @property - def function_identifiers(self): + def lookup_function(self, identifier, ast_builder=None): """ - Returns the function identifiers as an instance of :class:`set` which - are known to the kernel at creation time. + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if the + :arg:`identifier` is known to any kernel function scoper, otherwise returns + *None*. """ - return self.target.get_device_ast_builder().function_identifiers() | ( - set(["indexof", "indexof_vec", "make_tuple"])) + for scoper in self.function_scopers: + in_knl_callable = scoper(self.target, identifier) + if in_knl_callable: + return in_knl_callable + + return None # }}} @@ -1359,6 +1373,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "preamble_generators", "function_manglers", + "function_scopers", "symbol_manglers", "scoped_functions", ) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index d78ad982e..412debc43 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1851,49 +1851,49 @@ class FunctionScoper(RuleAwareIdentityMapper): :arg function_ids: A container with instances of :class:`str` indicating the function identifiers to look for while scoping functions. """ - def __init__(self, rule_mapping_context, function_ids): + def __init__(self, rule_mapping_context, kernel): super(FunctionScoper, self).__init__(rule_mapping_context) - self.function_ids = function_ids + self.kernel = kernel self.scoped_functions = {} def map_call(self, expr, expn_state): from loopy.symbolic import ScopedFunction - if not isinstance(expr.function, ScopedFunction) and ( - expr.function.name in self.function_ids): - # The function is one of the known function hence scoping it. 
- from pymbolic.primitives import Call - from loopy.kernel.function_interface import ScalarCallable + if not isinstance(expr.function, ScopedFunction): - # Associating the newly created ScopedFunction with a `CallableScalar` - self.scoped_functions[expr.function] = ScalarCallable( - expr.function.name) + # searching the kernel for the function. + in_knl_callable = self.kernel.lookup_function(expr.function.name) + if in_knl_callable: + # Associating the newly created ScopedFunction with the + # resolved in-kernel callable. + self.scoped_functions[expr.function] = in_knl_callable - return Call( - ScopedFunction(expr.function.name), - tuple(self.rec(child, expn_state) - for child in expr.parameters)) + return type(expr)( + ScopedFunction(expr.function.name), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) # This is an unknown function as of yet, hence not modifying it. return super(FunctionScoper, self).map_call(expr, expn_state) def map_call_with_kwargs(self, expr, expn_state): from loopy.symbolic import ScopedFunction - if not isinstance(expr.function, ScopedFunction) and ( - expr.function.name in self.function_ids): - from pymbolic.primitives import CallWithKwargs - from loopy.kernel.function_interface import ScalarCallable - - # Associating the newly created ScopedFunction with a `CallableScalar` - self.scoped_functions[expr.function.function] = ScalarCallable( - expr.function.name) - return CallWithKwargs( - ScopedFunction(expr.function.name), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) + if not isinstance(expr.function, ScopedFunction): + + # searching the kernel for the function. + in_knl_callable = self.kernel.lookup_function(expr.function.name) + + if in_knl_callable: + # Associating the newly created ScopedFunction with the + # resolved in-kernel callable. + self.scoped_functions[expr.function.function] = in_knl_callable + return type(expr)( + ScopedFunction(expr.function.name), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) # This is an unknown function as of yet, hence not modifying it. return super(FunctionScoper, self).map_call_with_kwargs(expr, @@ -1931,23 +1931,19 @@ class FunctionScoper(RuleAwareIdentityMapper): return super(FunctionScoper, self).map_reduction(expr, expn_state) -def scope_functions(kernel, function_identifiers=None): +def scope_functions(kernel): """ Returns a kernel with the pymbolic nodes involving known functions realized - as instances of :class:`loopy.symbolic.ScopedFunction`. - - :arg function_identifiers: The functions which are to be looked up in the - kernel. + as instances of :class:`loopy.symbolic.ScopedFunction`, along with the + resolved functions being added to the ``scoped_functions`` dictionary of + the kernel. 
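+
+    A rough usage sketch (the kernel below is illustrative only; in practice
+    :func:`make_kernel` performs this step itself)::
+
+        import loopy as lp
+
+        knl = lp.make_kernel(
+            "{[i]: 0 <= i < 10}",
+            "out[i] = sin(x[i]) + foo(x[i])")
+        # "sin" is resolved by a function scoper and recorded in
+        # knl.scoped_functions; "foo" stays an unresolved Variable.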
""" - if function_identifiers is None: - # Adding the default fucnction identifiers if none provided - function_identifiers = kernel.function_identifiers from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) - function_scoper = FunctionScoper(rule_mapping_context, function_identifiers) + function_scoper = FunctionScoper(rule_mapping_context, kernel) # scoping fucntions and collecting the scoped functions kernel_with_scoped_functions = rule_mapping_context.finish_kernel( @@ -2463,7 +2459,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_written_variable_names(knl) # Function Lookup - knl = scope_functions(knl, knl.function_identifiers) + knl = scope_functions(knl) from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 630ae76b7..d225e2528 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -34,8 +34,6 @@ from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name from loopy.library.reduction import ArgExtOp, SegmentedOp -from loopy.library.reduction import (_ArgExtremumReductionOperation, - _SegmentedScalarReductionOperation) from loopy.symbolic import (IdentityMapper, ScopedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, @@ -133,38 +131,6 @@ def get_kw_pos_association(kernel): return kw_to_pos, pos_to_kw - -def with_target(in_knl_callable, target): - """ - Returns a copy of :arg:`in_knl_callable` with all the ``dtypes`` in - ``in_knl_callable.arg_id_to_dtype`` as instances of - :class:`loopy.LoopyType`. - - :arg in_knl_callable: An instance of - :class:`loopy.kernel.function_interface.InKernelCallable`. - :arg target: An instance of :class:`loopy.target.TargetBase`. - """ - - if target is None: - raise RuntimeError() - - def with_target_if_not_None(dtype): - """ - Returns a copy of :arg:`dtype` associated with the target. If - ``dtype`` is *None* returns *None*. - """ - if dtype: - return dtype.with_target(target) - else: - return None - - new_arg_id_to_dtype = None - if in_knl_callable.arg_id_to_dtype: - new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, - dtype in in_knl_callable.arg_id_to_dtype.items()) - - return in_knl_callable.copy(arg_id_to_dtype=new_arg_id_to_dtype) - # }}} @@ -247,6 +213,35 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() + def with_target(self, target): + """ + Returns a copy with all the ``dtypes`` in + ``in_knl_callable.arg_id_to_dtype`` as instances of + :class:`loopy.LoopyType`. + + :arg target: An instance of :class:`loopy.target.TargetBase`. + """ + + if target is None: + raise RuntimeError() + + def with_target_if_not_None(dtype): + """ + Returns a copy of :arg:`dtype` associated with the target. If + ``dtype`` is *None* returns *None*. + """ + if dtype: + return dtype.with_target(target) + else: + return None + + new_arg_id_to_dtype = None + if self.arg_id_to_dtype: + new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, + dtype in self.arg_id_to_dtype.items()) + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + def with_iname_tag_usage(self, unusable, concurrent_shape): """ :arg unusable: a set of iname tags that may not be used in the callee. 
@@ -317,94 +312,8 @@ class ScalarCallable(InKernelCallable): self.name_in_target) def with_types(self, arg_id_to_dtype, kernel): - if self.arg_id_to_dtype is not None: - - # specializing an already specialized function. - for id, dtype in arg_id_to_dtype.items(): - # only checking for the ones which have been provided - if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: - import numpy as np - if self.arg_id_to_dtype[id].dtype.type == np.uint32 and ( - arg_id_to_dtype[id].dtype.type == np.int32): - continue - if self.arg_id_to_dtype[id].dtype.type == np.uint64 and ( - arg_id_to_dtype[id].dtype.type == np.int64): - continue - - raise LoopyError("Overwriting a specialized" - " function is illegal--maybe start with new instance of" - " ScalarCallable?") - - # {{{ target specific callables - - if self.name in kernel.target.get_device_ast_builder( - ).function_identifiers(): - new_in_knl_callable = kernel.target.get_device_ast_builder().with_types( - self, arg_id_to_dtype) - # adding target attribute to the NumpyTypes - if new_in_knl_callable is None: - new_in_knl_callable = self.copy() - return with_target(new_in_knl_callable, kernel.target) - - # }}} - - # {{{ indexof, indexof_vec - - elif self.name in ["indexof", "indexof_vec"]: - new_arg_id_to_dtype = arg_id_to_dtype.copy() - new_arg_id_to_dtype[-1] = kernel.index_dtype - - return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype), - kernel.target) - # }}} - - # {{{ make_tuple - - elif self.name == "make_tuple": - new_arg_id_to_dtype = arg_id_to_dtype.copy() - for i in range(len(arg_id_to_dtype)): - if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: - new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] - - return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target="loopy_make_tuple"), kernel.target) - - # }}} - - # {{{ ArgExtOp, SegmentedOp - - elif isinstance(self.name, _ArgExtremumReductionOperation): - scalar_dtype = arg_id_to_dtype[0] - index_dtype = arg_id_to_dtype[1] - result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, - index_dtype) - new_arg_id_to_dtype = arg_id_to_dtype.copy() - new_arg_id_to_dtype[-1] = result_dtypes[0] - new_arg_id_to_dtype[-2] = result_dtypes[1] - return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target="loopy_arg%s_%s_%s_op" % (self.name.which, - scalar_dtype.numpy_dtype.type.__name__, - index_dtype.numpy_dtype.type.__name__)), kernel.target) - elif isinstance(self.name, _SegmentedScalarReductionOperation): - scalar_dtype = arg_id_to_dtype[0] - index_dtype = arg_id_to_dtype[1] - result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, - index_dtype) - new_arg_id_to_dtype = arg_id_to_dtype.copy() - new_arg_id_to_dtype[-1] = result_dtypes[0] - new_arg_id_to_dtype[-2] = result_dtypes[1] - return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target="loopy_segmented_%s_%s_%s_op" % (self.name.which, - scalar_dtype.numpy_dtype.type.__name__, - index_dtype.numpy_dtype.type.__name__)), kernel.target) - - # }}} - - else: - # did not find a scalar function and function prototype does not - # even have subkernel registered => no match found - raise LoopyError("Function %s not present within" - " the %s namespace" % (self.name, kernel.target)) + raise LoopyError("No type inference information present for " + "the function %s." 
% (self.name)) def with_descrs(self, arg_id_to_descr): @@ -510,63 +419,6 @@ class ScalarCallable(InKernelCallable): return var(self.name_in_target)(*c_parameters) def generate_preambles(self, target): - from loopy.library.random123 import (random123_function_identifiers, - random123_preamble_generator) - if self.name in random123_function_identifiers(): - yield random123_preamble_generator(self.name, target) - - elif isinstance(self.name, _ArgExtremumReductionOperation): - op = self.name - scalar_dtype = self.arg_id_to_dtype[-1] - index_dtype = self.arg_id_to_dtype[-2] - - prefix = op.prefix(scalar_dtype, index_dtype) - - yield (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { - *index_out = index2; - return op2; - } - else - { - *index_out = index1; - return op1; - } - } - """ % dict( - scalar_t=target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) - elif isinstance(self.name, _SegmentedScalarReductionOperation): - op = self.name - scalar_dtype = self.arg_id_to_dtype[-1] - segment_flag_dtype = self.arg_id_to_dtype[-2] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - yield (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? op2 : %(combined)s; - } - """ % dict( - scalar_t=target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) - return # }}} @@ -650,8 +502,8 @@ class CallableKernel(InKernelCallable): # Returning the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype - return with_target(self.copy(subkernel=specialized_kernel, - arg_id_to_dtype=new_arg_id_to_dtype), kernel.target) + return self.copy(subkernel=specialized_kernel, + arg_id_to_dtype=new_arg_id_to_dtype) def with_descrs(self, arg_id_to_descr): @@ -807,8 +659,8 @@ class ManglerCallable(ScalarCallable): new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in enumerate(mangle_result.result_dtypes))) - return with_target(self.copy(name_in_target=mangle_result.target_name, - arg_id_to_dtype=new_arg_id_to_dtype), kernel.target) + return self.copy(name_in_target=mangle_result.target_name, + arg_id_to_dtype=new_arg_id_to_dtype) else: # The function mangler does not agree with the arg id to dtypes # provided. Indicating that is illegal. diff --git a/loopy/library/function.py b/loopy/library/function.py index 9d557ac9f..57a8ac53c 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -22,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +from loopy.kernel.function_interface import ScalarCallable + def default_function_mangler(kernel, name, arg_dtypes): from loopy.library.reduction import reduction_function_mangler @@ -56,4 +58,47 @@ def tuple_function_mangler(kernel, name, arg_dtypes): return None +class MakeTupleCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] + + from loopy.kernel.function_interface import with_target + return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple"), kernel.target) + + def with_descrs(self, arg_id_to_descr): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), + (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) + + return self.copy(arg_id_to_descr=new_arg_id_to_descr) + + +class IndexOfCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = kernel.index_dtype + + from loopy.kernel.function_interface import with_target + return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype), + kernel.target) + + +def loopy_specific_callable_scopers(target, identifier): + if identifier == "make_tuple": + return MakeTupleCallable(name="make_tuple") + + if identifier in ["indexof", "indexof_vec"]: + return IndexOfCallable(name=identifier) + + return None + # FIXME: Reduction callables are an important part, but there are some + # import related issues, which I am planning to handle later! + # from loopy.library.reduction import reduction_specific_callables + # return reduction_specific_callables(target, identifier) + + # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 31fdb527e..a2880bfb8 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -27,6 +27,7 @@ THE SOFTWARE. from pytools import ImmutableRecord from mako.template import Template +from loopy.kernel.function_interface import ScalarCallable import numpy as np @@ -163,114 +164,73 @@ double${ width } ${ name }_f64( # }}} -def random123_preamble_generator(name, target): +class Random123Callable(ScalarCallable): + """ + Records information about for the random123 functions. 
+ """ - rng_variant = FUNC_NAMES_TO_RNG[name] + def with_types(self, arg_id_to_dtype, kernel): - from loopy.target.pyopencl import PyOpenCLTarget - return ("90-random123-"+rng_variant.full_name, - PREAMBLE_TEMPLATE.render( - is_pyopencl_target=isinstance( - target, - PyOpenCLTarget), - rng_variant=rng_variant, - )) - - -def random123_function_identifiers(): - return set(FUNC_NAMES_TO_RNG) - - -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - except KeyError: - return None - - from loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None - - -def random123_with_types(in_knl_callable, arg_id_to_dtype, target): - name = in_knl_callable.name - - if name not in FUNC_NAMES_TO_RNG: - return None - - rng_variant = FUNC_NAMES_TO_RNG[name] - - from loopy.types import NumpyType - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - fn = rng_variant.full_name - if name == fn: - new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: - key_dtype} - return in_knl_callable.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=fn+"_gen") - - elif name == fn + "_f32": if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): # the types provided aren't mature enough to specialize the # callable - return None - new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), - rng_variant.width), - -2: ctr_dtype, 0: ctr_dtype, 1: - key_dtype} - if arg_id_to_dtype[0] != new_arg_id_to_dtype[0]: - print(arg_id_to_dtype) - print(new_arg_id_to_dtype) - 1/0 - - if arg_id_to_dtype[1] != new_arg_id_to_dtype[1]: - print(arg_id_to_dtype) - print(new_arg_id_to_dtype) - 1/0 - return in_knl_callable.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name) - - elif name == fn + "_f64": - new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), - rng_variant.width), - -2: ctr_dtype, 0: ctr_dtype, 1: - key_dtype} - return in_knl_callable.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name) - else: - return None + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + name = self.name + target = kernel.target + + rng_variant = FUNC_NAMES_TO_RNG[name] + + from loopy.types import NumpyType + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + fn = 
rng_variant.full_name + if name == fn: + new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=fn+"_gen") + + elif name == fn + "_f32": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name) + + elif name == fn + "_f64": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name) + + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + def generate_preambles(self, target): + rng_variant = FUNC_NAMES_TO_RNG[self.name] + + from loopy.target.pyopencl import PyOpenCLTarget + yield ("90-random123-"+rng_variant.full_name, + PREAMBLE_TEMPLATE.render( + is_pyopencl_target=isinstance( + target, + PyOpenCLTarget), + rng_variant=rng_variant, + )) + + return + + +def random123_function_scoper(target, identifier): + if identifier in FUNC_NAMES_TO_RNG: + return Random123Callable(name=identifier) + + return None # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 0c2297ab9..1dd6f00f1 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -25,6 +25,7 @@ THE SOFTWARE. from pymbolic import var from loopy.symbolic import ScopedFunction +# from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier @@ -269,29 +270,6 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): op = "((%s) * (%s))" which = "product" - -def get_segmented_function_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? 
op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) - # }}} @@ -345,38 +323,6 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): update_comparison = "<=" neutral_sign = +1 - -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { - *index_out = index2; - return op2; - } - else - { - *index_out = index1; - return op1; - } - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=kernel.target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) - # }}} @@ -430,70 +376,94 @@ def parse_reduction_op(name): # }}} -def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, index_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, index_dtype), - arg_dtypes=( - scalar_dtype, - index_dtype, - scalar_dtype, - index_dtype), - ) - - elif isinstance(func_id, SegmentedOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, segment_flag_dtype), - arg_dtypes=( - scalar_dtype, - segment_flag_dtype, - scalar_dtype, - segment_flag_dtype), - ) +# {{{ reduction specific callables + +''' +class ReductionCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, in_knl_callable, kernel): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = result_dtypes[0] + new_arg_id_to_dtype[-2] = result_dtypes[1] + name_in_target = self.name.prefix(scalar_dtype, index_dtype) + + from loopy.library.kernel.function_interface import with_target + + return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name_in_target), kernel.target) + + def with_descr(self, arg_id_to_descr): + from loopy.library.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = arg_id_to_descr.copy() + new_arg_id_to_descr[-1] = ValueArgDescriptor() + return self.copy(arg_id_to_descr=arg_id_to_descr) + + def generate_preambles(self, target): + if isinstance(self.name, _ArgExtremumReductionOperation): + op = self.name + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, 
index_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(index_t)s index1, + %(scalar_t)s op2, %(index_t)s index2, + %(index_t)s *index_out) + { + if (op2 %(comp)s op1) + { + *index_out = index2; + return op2; + } + else + { + *index_out = index1; + return op1; + } + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + elif isinstance(self.name, _SegmentedScalarReductionOperation): + op = self.name + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? op2 : %(combined)s; + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return + + +def reduction_specific_callable(target, identifier): + if isinstance(identifier, (_ArgExtremumReductionOperation, + _SegmentedScalarReductionOperation)): + return ReductionCallable(name=identifier) return None - - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +''' +# }}} # vim: fdm=marker diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5a90dd51e..53e5ccbc3 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -150,16 +150,13 @@ class ASTBuilderBase(object): # {{{ library - def function_identifiers(self): + def function_scopers(self): """ - Returns an instance of :class:`set` containing instances of - :class:`str` indicating the names of the functions known to the - :attr:`ASTBuilderBase.target`. + Returns an instance of :class:`frozenset` of the functions of signature + ``(target, identifiers)`` returning either an instance of + :class:`InKernelCallable` if a match is found or *None*. """ - return set() - - def function_manglers(self): - return [] + return frozenset() def symbol_manglers(self): return [] @@ -167,19 +164,6 @@ class ASTBuilderBase(object): def preamble_generators(self): return [] - def with_types(self, in_knl_callable, arg_id_to_dtype): - """ - Returns a copy of :arg:`in_knl_callable` along with the return type for - the argument types specified by :arg:`arg_id_to_dtype`. Returns *None* - if no such function exists for the given types. - - :arg in_knl_callable: An instance of - :class:`loopy.kernel.function_interface`. 
- :arg arg_id_to_dtype: A mapping similar - :meth:`loopy.kernel.function_interface.with_types()` - """ - return None - # }}} # {{{ code generation guts diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 80bc8114c..36c9601b5 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,7 +27,6 @@ THE SOFTWARE. import six import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -35,6 +34,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from pytools import memoize_method @@ -354,179 +354,104 @@ def c_symbol_mangler(kernel, name): # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_identifiers(): - return set(["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "tanh", - "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]) - - -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None - - if name in ["abs", "min", "max"]: - name = "f" + name +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. + """ - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): + def with_types(self, arg_id_to_dtype, kernel): + name = self.name - dtype = arg_dtypes[0].numpy_dtype + if name in ["abs", "min", "max"]: + name = "f" + name - if modify_name: - if dtype == np.float64: - pass # fabs - elif dtype == np.float32: - name = name + "f" # fabsf - elif dtype == np.float128: - name = name + "l" # fabsl - else: - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: - return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." 
% name) - # binary functions - if (name in ["fmax", "fmin"] - and len(arg_dtypes) == 2): + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype - if dtype.kind == "c": - raise LoopyTypeError("%s does not support complex numbers") + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == 'c': + raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - elif dtype.kind == "f": - if modify_name: + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified if dtype == np.float64: - pass # fmin + pass # fabs elif dtype == np.float32: - name = name + "f" # fminf + name = name + "f" # fabsf elif dtype == np.float128: - name = name + "l" # fminl + name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" - % (name, dtype)) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - return None - - -def with_types_for_c_target(in_knl_callable, arg_id_to_dtype, modify_name=False): - """Target facing function for C-like targets in order to map the math - functions encountered in a kernel to the equivalent function signature. - - .. arg in_knl_callable:: - - An instance of :class:`loopy.kernel.function_interface.ScalarCallable`, - which is supposed to be mapped in the target. - - .. arg arg_id_to_dtype:: - - Same as the maapping in :meth:`ScalarCallable.with_types` + raise LoopyTypeError("%s does not support type %s" % (name, + dtype)) + + return self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + + # binary functions + if name in ["fmax", "fmin"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + elif dtype.kind == "f": + if not isinstance(kernel.target, (OpenCLTarget)): + if dtype == np.float64: + pass # fmin + elif dtype == np.float32: + name = name + "f" # fminf + elif dtype == np.float128: + name = name + "l" # fminl + else: + raise LoopyTypeError("%s does not support type %s" + % (name, dtype)) + dtype = NumpyType(dtype) + return self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) - .. arg modify_name:: + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - Must be set *True* for C and Cuda targets and *False* for OpenCL targets. - :return: An updated instance of - :class:`loopy.kernel.function_interface.ScalarCallable` tuned for the - target. Or *None* if could not find a corresponding C-function for the given - pair *in_knl_callable*, *arg_id_to_dtype*. +def scope_c_math_functions(target, identifier): """ - # Convert abs, min, max to fabs, fmin, fmax. 
- # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - name = in_knl_callable.name - - if name in ["abs", "min", "max"]: - name = "f" + name - - # unary functions - if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "tan"]: - - for id in arg_id_to_dtype: - if not -1 <= id <= 0: - raise LoopyError("%s can take only one argument." % name) - - if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: - # the types provided aren't mature enough to specialize the - # callable - return None - - dtype = arg_id_to_dtype[0] - dtype = dtype.numpy_dtype - - if dtype.kind in ('u', 'i'): - # ints and unsigned casted to float32 - dtype = np.float32 - elif dtype.kind == 'c': - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - - if modify_name: - if dtype == np.float64: - pass # fabs - elif dtype == np.float32: - name = name + "f" # fabsf - elif dtype == np.float128: - name = name + "l" # fabsl - else: - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - - return in_knl_callable.copy(name_in_target=name, - arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) - - # binary functions - if name in ["fmax", "fmin"]: - - for id in arg_id_to_dtype: - if not -1 <= id <= 1: - raise LoopyError("%s can take only two arguments." % name) - - if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( - arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): - # the types provided aren't mature enough to specialize the - # callable - return None - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() - if id >= 0]) - - if dtype.kind == "c": - raise LoopyTypeError("%s does not support complex numbers") - - elif dtype.kind == "f": - if modify_name: - if dtype == np.float64: - pass # fmin - elif dtype == np.float32: - name = name + "f" # fminf - elif dtype == np.float128: - name = name + "l" # fminl - else: - raise LoopyTypeError("%s does not support type %s" - % (name, dtype)) - dtype = NumpyType(dtype) - return in_knl_callable.copy(name_in_target=name, - arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) - + Returns an instance of :class:`InKernelCallable` if the function + represented by :arg:`identifier` is known in C, otherwise returns *None*. 
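+
+    For example (illustrative)::
+
+        scope_c_math_functions(target, "sin")     # -> CMathCallable(name="sin")
+        scope_c_math_functions(target, "printf")  # -> None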
+ """ + if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]: + return CMathCallable(name=identifier) return None # }}} @@ -535,17 +460,6 @@ def with_types_for_c_target(in_knl_callable, arg_id_to_dtype, modify_name=False) class CASTBuilder(ASTBuilderBase): # {{{ library - def function_identifiers(self): - return ( - super(CASTBuilder, self).function_identifiers() | - c_math_identifiers()) - - def function_manglers(self): - return ( - super(CASTBuilder, self).function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super(CASTBuilder, self).symbol_manglers() + [ @@ -558,13 +472,10 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) - def with_types(self, in_knl_callable, arg_id_to_dtype): - new_callable = with_types_for_c_target(in_knl_callable, arg_id_to_dtype, - modify_name=True) - if new_callable is not None: - return new_callable - return super(CASTBuilder, self).with_types(in_knl_callable, - arg_id_to_dtype) + def function_scopers(self): + return ( + super(CASTBuilder, self).function_scopers() | frozenset([ + scope_c_math_functions])) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index d2dac07a0..2651abc94 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -30,11 +30,11 @@ from pytools import memoize_method from loopy.target.c import CTarget, CASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper -from loopy.target.c import (c_math_identifiers, with_types_for_c_target) from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import temp_var_scope from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -111,7 +111,7 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper _CUDA_SPECIFIC_FUNCTIONS = { "rsqrt": 1, @@ -119,85 +119,66 @@ _CUDA_SPECIFIC_FUNCTIONS = { } -def cuda_function_identifiers(): - return set(_CUDA_SPECIFIC_FUNCTIONS) +class CudaCallable(ScalarCallable): + def cuda_with_types(self, arg_id_to_dtype, kernel): -def cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None + name = self.name - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") - - if dtype.kind == "f": - name = "f" + name - - return dtype, name - - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name - - return None - - -def cuda_with_types(in_knl_callable, arg_id_to_dtype): + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - name = in_knl_callable.name + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] + return self.copy(name_in_target=name, + arg_id_to_dtype={-1: NumpyType(scalar_dtype), + 0: dtype, 1: dtype}) - if name == "dot": - for id in arg_id_to_dtype: - if not -1 <= id <= 1: - raise LoopyError("%s can take only 2 arguments." % name) + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) - if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( - arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): - # the types provided aren't mature enough to specialize the - # callable - return None + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - dtype = arg_id_to_dtype[0] - scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] - return in_knl_callable.copy(name_in_target=name, - arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}) + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) - if name in _CUDA_SPECIFIC_FUNCTIONS: - num_args = _CUDA_SPECIFIC_FUNCTIONS[name] - for id in arg_id_to_dtype: - if not -1 <= id < num_args: - raise LoopyError("%s can take only %d arguments." 
% (name, - num_args)) + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) - for i in range(num_args): - if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: - # the types provided aren't mature enough to specialize the - # callable - return None + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) - dtype = np.find_common_type( - [], [dtype.numpy_dtype for id, dtype in - arg_id_to_dtype.items() if id >= 0]) + return self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype) - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, - num_args)) - return in_knl_callable.copy(name_in_target=name, - arg_id_to_dtype=updated_arg_id_to_dtype) +def scope_cuda_functions(target, identifier): + if identifier in frozenset(["dot"]) | frozenset( + _CUDA_SPECIFIC_FUNCTIONS): + return CudaCallable(name=identifier) return None - # }}} @@ -278,29 +259,13 @@ class CudaTarget(CTarget): # {{{ ast builder class CUDACASTBuilder(CASTBuilder): + # {{{ library - def function_manglers(self): - return ( - super(CUDACASTBuilder, self).function_manglers() + [ - cuda_function_mangler - ]) - - def function_identifiers(self): - return (cuda_function_identifiers() | c_math_identifiers() | - super(CUDACASTBuilder, self).function_identifiers()) - - def with_types(self, in_knl_callable, arg_id_to_dtype): - new_callable = cuda_with_types(in_knl_callable, arg_id_to_dtype) - if new_callable is not None: - return new_callable - - new_callable = with_types_for_c_target(in_knl_callable, arg_id_to_dtype, - modify_name=True) - if new_callable is not None: - return new_callable - return super(CUDACASTBuilder, self).with_types(in_knl_callable, - arg_id_to_dtype) + def function_scopers(self): + return frozenset([scope_cuda_functions]) | ( + super(CUDACASTBuilder, self).function_scopers()) + # }}} # {{{ top-level codegen diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index cd9f73fa9..367d06bdd 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -31,12 +31,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.target.c import (DTypeRegistryWrapper, c_math_identifiers, - c_math_mangler, with_types_for_c_target) -from loopy.kernel.data import temp_var_scope, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import temp_var_scope +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -167,168 +166,117 @@ VECTOR_LITERAL_FUNCS = dict( ) -def opencl_function_identifiers(): - return set(["max", "min", "dot"]) | (set(_CL_SIMPLE_MULTI_ARG_FUNCTIONS) | - set(VECTOR_LITERAL_FUNCS)) +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + :class:`loopy.target.c.CMathCallable`. + """ + def with_types(self, arg_id_to_dtype, kernel): + name = self.name -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None + if name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - - if count != len(arg_dtypes): - return None - - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) - return None + if dtype.kind in ['u', 'i', 'f']: + dtype = NumpyType(dtype) + return self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) + else: + # Unsupported type. + raise LoopyError("%s function not supported for the types %s" % + (name, dtype)) + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) -def with_types_for_opencl_target(in_knl_callable, arg_id_to_dtype): - """Returns an updated ``in_knl_callable`` specifically tuned for OpenCL - targets. Returns *None*, if does not match with any of the OpenCL function - signatures. + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - .. arg in_knl_callable:: + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] + return self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), 0: dtype, 1: dtype}) - An instance of :class:`loopy.kernel.function_interface.ScalarCallable`. + if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) - .. arg arg_id_to_dtype:: + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - A mapping which provides information from argument id to its type. Same - format as in :meth:`ScalarCallable.with_types`. 
- """ + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) - name = in_knl_callable.name - - if name in ["max", "min"]: - for id in arg_id_to_dtype: - if not -1 <= id <= 1: - raise LoopyError("%s can take only 2 arguments." % name) - if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: - return None - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() - if (id >= 0 and dtype is not None)]) - - if dtype.kind == "i": - dtype = NumpyType(dtype) - return in_knl_callable.copy(name_in_target=name, - arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) - - if name == "dot": - for id in arg_id_to_dtype: - if not -1 <= id <= 1: - raise LoopyError("%s can take only 2 arguments." % name) - - if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( - arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): - # the types provided aren't mature enough to specialize the - # callable - return None - - dtype = arg_id_to_dtype[0] - scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] - return in_knl_callable.copy(name_in_target=name, - arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - for id in arg_id_to_dtype: - if not -1 <= id < num_args: - raise LoopyError("%s can take only %d arguments." % (name, - num_args)) - - for i in range(num_args): - if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: - # the types provided aren't mature enough to specialize the - # callable - return None + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) - dtype = np.find_common_type( - [], [dtype.numpy_dtype for id, dtype in - arg_id_to_dtype.items() if id >= 0]) + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) + return self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, - num_args)) + if name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - return in_knl_callable.copy(name_in_target=name, - arg_id_to_dtype=updated_arg_id_to_dtype) + for id in arg_id_to_dtype: + if not -1 <= id < count: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + for i in range(count): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - for id in arg_id_to_dtype: - if not -1 <= id < count: - raise LoopyError("%s can take only %d arguments." 
% (name, - num_args)) + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in + range(count)) + updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( + NumpyType(dtype), count) - for i in range(count): - if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: - # the types provided aren't mature enough to specialize the - # callable - return None + return self.copy(name_in_target="(%s%d) " % (base_tp_name, count), + arg_id_to_dtype=updated_arg_id_to_dtype) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in - range(count)) - updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( - NumpyType(dtype), count) + # does not satisfy any of the conditions needed for specialization. + # hence just returning a copy of the callable. + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - return in_knl_callable.copy(name_in_target="(%s%d) " % (base_tp_name, count), - arg_id_to_dtype=updated_arg_id_to_dtype) - return None +def scope_opencl_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function defined by + *identifier* is known in OpenCL. + """ + opencl_function_ids = frozenset(["max", "min", "dot"]) | frozenset( + _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | frozenset(VECTOR_LITERAL_FUNCS) + if identifier in opencl_function_ids: + return OpenCLCallable(name=identifier) + + return None # }}} @@ -473,17 +421,10 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_manglers(self): + def function_scopers(self): return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super(OpenCLCASTBuilder, self).function_manglers()) - - def function_identifiers(self): - return (opencl_function_identifiers() | c_math_identifiers() | - super(OpenCLCASTBuilder, self).function_identifiers()) + frozenset([scope_opencl_functions]) | + super(OpenCLCASTBuilder, self).function_scopers()) def symbol_manglers(self): return ( @@ -500,17 +441,6 @@ class OpenCLCASTBuilder(CASTBuilder): reduction_preamble_generator, ]) - def with_types(self, in_knl_callable, arg_id_to_dtype): - new_callable = with_types_for_opencl_target(in_knl_callable, arg_id_to_dtype) - if new_callable is not None: - return new_callable - - new_callable = with_types_for_c_target(in_knl_callable, arg_id_to_dtype) - if new_callable is not None: - return new_callable - return super(OpenCLCASTBuilder, self).with_types(in_knl_callable, - arg_id_to_dtype) - # }}} # {{{ top-level codegen diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index a9e5f2963..ddda6247b 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,12 +31,12 @@ from six.moves import range import numpy as np -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -199,80 +199,75 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_identifiers(): - return set(["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", "tanh", - "conj", "real", "imag", "abs"]) +# {{{ pyopencl function scopers +class PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered 
by + :class:`loopy.target.opencl.OpenCLCallable` + """ + def with_types(self, arg_id_to_dtype, kernel): -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes - - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" - else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) - - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) - - return None - - -def pyopencl_with_types(in_knl_callable, arg_id_to_dtype): - - name = in_knl_callable.name + name = self.name - for id in arg_id_to_dtype: - if not -1 <= id <= 0: - return None + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." % name) - if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: - # the types provided aren't mature enough to specialize the - # callable - return None + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - dtype = arg_id_to_dtype[0] + dtype = arg_id_to_dtype[0] - if dtype.is_complex(): - if dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif dtype.numpy_dtype == np.complex128: - tpname = "cdouble" - else: - raise RuntimeError("unexpected complex type '%s'" % dtype) + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + return self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}) if name in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", "tanh", "conj"]: - return in_knl_callable.copy(name_in_target="%s_%s" % (tpname, name), + if dtype.is_complex(): + # function parameters are complex. + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + else: + # function calls for real parameters. 
+ if dtype.kind in ('u', 'i'): + dtype = np.float32 + return self.copy(name_in_target=name, arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) - if name in ["real", "imag", "abs"]: - return in_knl_callable.copy(name_in_target="%s_%s" % (tpname, name), - arg_id_to_dtype={0: dtype, -1: NumpyType( - np.dtype(dtype.numpy_dtype.type(0).real))}) + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + +def pyopencl_function_scoper(target, identifier): + if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"]: + return PyOpenCLCallable(name=identifier) return None +# }}} + # {{{ preamble generator @@ -782,37 +777,17 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_identifiers(self): - from loopy.library.random123 import random123_function_identifiers - return (super(PyOpenCLCASTBuilder, self).function_identifiers() | - pyopencl_function_identifiers() | random123_function_identifiers()) - - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler + def function_scopers(self): + from loopy.library.random123 import random123_function_scoper return ( - super(PyOpenCLCASTBuilder, self).function_manglers() + [ - pyopencl_function_mangler, - random123_function_mangler - ]) + frozenset([pyopencl_function_scoper, random123_function_scoper]) | + super(PyOpenCLCASTBuilder, self).function_scopers()) def preamble_generators(self): return ([ pyopencl_preamble_generator, ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) - def with_types(self, in_knl_callable, arg_id_to_dtype): - new_callable = super(PyOpenCLCASTBuilder, self).with_types(in_knl_callable, - arg_id_to_dtype) - if new_callable is not None: - return new_callable - - new_callable = pyopencl_with_types(in_knl_callable, arg_id_to_dtype) - if new_callable is not None: - return new_callable - from loopy.library.random123 import random123_with_types - return random123_with_types(in_knl_callable, arg_id_to_dtype, - self.target) - # }}} # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index d0c1d1e98..697cfddf5 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -286,13 +286,40 @@ class TypeInferenceMapper(CombineMapper): if isinstance(expr.function, ScopedFunction): in_knl_callable = self.scoped_functions[expr.function.function] + # {{{ checking that there is no overwriting of in_knl_callable + + if in_knl_callable.arg_id_to_dtype is not None: + + # specializing an already specialized function. 
+ for id, dtype in arg_id_to_dtype.items(): + # Ignoring the the cases when there is a discrepancy + # between np.uint and np.int + if in_knl_callable.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + import numpy as np + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint32) and ( + arg_id_to_dtype[id].dtype.type == np.int32): + continue + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint64) and ( + arg_id_to_dtype[id].dtype.type == + np.int64): + continue + + raise LoopyError("Overwriting a specialized function " + "is illegal--maybe start with new instance of " + "InKernelCallable?") + + # }}} + in_knl_callable = ( in_knl_callable.with_types( arg_id_to_dtype, self.kernel)) # storing the type specialized function so that it can be used for # later use - self.specialized_functions[expr] = in_knl_callable + self.specialized_functions[expr] = in_knl_callable.with_target( + self.kernel.target) new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype -- GitLab From 7809e5135f47a31ae6faae3444e6ed8dad70a7b5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 23 Apr 2018 13:06:54 -0500 Subject: [PATCH 119/916] Switched to new function lookup interface. --- loopy/__init__.py | 9 ++++-- loopy/kernel/__init__.py | 13 ++++---- loopy/kernel/creation.py | 30 ++++++++++--------- loopy/kernel/function_interface.py | 6 ++-- loopy/library/function.py | 16 ++++------ loopy/library/reduction.py | 19 +++++------- loopy/target/opencl.py | 7 ++--- loopy/target/pyopencl.py | 14 +++++---- loopy/target/python.py | 22 +++----------- .../{register_knl.py => register_callable.py} | 24 ++++++++++++++- 10 files changed, 86 insertions(+), 74 deletions(-) rename loopy/transform/{register_knl.py => register_callable.py} (79%) diff --git a/loopy/__init__.py b/loopy/__init__.py index f77449d19..7650e303c 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -45,6 +45,8 @@ from loopy.kernel.data import ( temp_var_scope, TemporaryVariable, SubstitutionRule, CallMangleInfo) +from loopy.kernel.function_interface import ( + ScalarCallable) from loopy.kernel import LoopKernel, kernel_state from loopy.kernel.tools import ( @@ -113,7 +115,8 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier -from loopy.transform.register_knl import register_callable_kernel +from loopy.transform.register_callable import (register_callable_kernel, + register_function_lookup) # }}} @@ -160,6 +163,8 @@ __all__ = [ "CallInstruction", "CInstruction", "NoOpInstruction", "BarrierInstruction", + "ScalarCallable", + "KernelArgument", "ValueArg", "GlobalArg", "ConstantArg", "ImageArg", "temp_var_scope", "TemporaryVariable", @@ -221,7 +226,7 @@ __all__ = [ "add_barrier", - "register_callable_kernel", + "register_callable_kernel", "register_function_lookup", # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 0ea2a2557..b99fc6dc2 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -198,7 +198,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): iname_to_tag={}, substitutions={}, function_manglers=[], - function_scopers=frozenset(), + function_scopers=None, scoped_functions={}, symbol_manglers=[], @@ -265,11 +265,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT - from loopy.library.function 
import loopy_specific_callable_scopers - # populating the function scopers from the target and the loopy - # specific callable scopers - function_scopers = frozenset([loopy_specific_callable_scopers]) | ( - target.get_device_ast_builder().function_scopers()) + if function_scopers is None: + from loopy.library.function import loopy_specific_callable_scopers + # populating the function scopers from the target and the loopy + # specific callable scopers + function_scopers = frozenset([loopy_specific_callable_scopers]) | ( + target.get_device_ast_builder().function_scopers()) ImmutableRecordWithoutPickling.__init__(self, domains=domains, diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 412debc43..219042de4 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1900,7 +1900,6 @@ class FunctionScoper(RuleAwareIdentityMapper): expn_state) def map_reduction(self, expr, expn_state): - from loopy.kernel.function_interface import ScalarCallable from loopy.library.reduction import (MaxReductionOperation, MinReductionOperation, ArgMinReductionOperation, ArgMaxReductionOperation, _SegmentedScalarReductionOperation, @@ -1910,23 +1909,26 @@ class FunctionScoper(RuleAwareIdentityMapper): # Noting down the extra functions arising due to certain reductions. if isinstance(expr.operation, MaxReductionOperation): - self.scoped_functions[var("max")] = ScalarCallable("max") + self.scoped_functions[var("max")] = self.kernel.lookup_function("max") elif isinstance(expr.operation, MinReductionOperation): - self.scoped_functions[var("min")] = ScalarCallable("min") + self.scoped_functions[var("min")] = self.kernel.lookup_function("min") elif isinstance(expr.operation, ArgMaxReductionOperation): - self.scoped_functions[var("max")] = ScalarCallable("max") - self.scoped_functions[var("make_tuple")] = ScalarCallable("make_tuple") - self.scoped_functions[ArgExtOp(expr.operation)] = ScalarCallable( - expr.operation) + self.scoped_functions[var("max")] = self.kernel.lookup_function("max") + self.scoped_functions[var("make_tuple")] = self.kernel.lookup_function( + "make_tuple") + self.scoped_functions[ArgExtOp(expr.operation)] = ( + self.kernel.lookup_function(expr.operation)) elif isinstance(expr.operation, ArgMinReductionOperation): - self.scoped_functions[var("min")] = ScalarCallable("min") - self.scoped_functions[var("make_tuple")] = ScalarCallable("make_tuple") - self.scoped_functions[ArgExtOp(expr.operation)] = ScalarCallable( - expr.operation) + self.scoped_functions[var("min")] = self.kernel.lookup_function("min") + self.scoped_functions[var("make_tuple")] = self.kernel.lookup_function( + "make_tuple") + self.scoped_functions[ArgExtOp(expr.operation)] = ( + self.kernel.lookup_function(expr.operation)) elif isinstance(expr.operation, _SegmentedScalarReductionOperation): - self.scoped_functions[var("make_tuple")] = ScalarCallable("make_tuple") - self.scoped_functions[SegmentedOp(expr.operation)] = ScalarCallable( - expr.operation) + self.scoped_functions[var("make_tuple")] = self.kernel.lookup_function( + "make_tuple") + self.scoped_functions[SegmentedOp(expr.operation)] = ( + self.kernel.lookup_function(expr.operation)) return super(FunctionScoper, self).map_reduction(expr, expn_state) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index d225e2528..7c3aac1f6 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -33,7 +33,6 @@ from loopy.diagnostic import LoopyError from pymbolic.primitives import Variable 
from loopy.symbolic import parse_tagged_name -from loopy.library.reduction import ArgExtOp, SegmentedOp from loopy.symbolic import (IdentityMapper, ScopedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, @@ -420,6 +419,7 @@ class ScalarCallable(InKernelCallable): def generate_preambles(self, target): return + yield # }}} @@ -694,6 +694,7 @@ def next_indexed_variable(function): or :class:`loopy.reduction.ArgExtOp` or :class:`loopy.reduction.SegmentedOp`. """ + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(function, (ArgExtOp, SegmentedOp)): return function.copy() func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") @@ -783,8 +784,9 @@ def register_pymbolic_calls_to_knl_callables(kernel, "function." % type(pymbolic_call)) unique_var = next_indexed_variable(pymbolic_call_function) + from loopy.library.reduction import ArgExtOp, SegmentedOp while unique_var in scoped_names_to_functions and not isinstance( - unique_var, ArgExtOp): + unique_var, (ArgExtOp, SegmentedOp)): # keep on finding new names till one a unique one is found. unique_var = next_indexed_variable(unique_var) diff --git a/loopy/library/function.py b/loopy/library/function.py index 57a8ac53c..4873eca91 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -65,9 +65,8 @@ class MakeTupleCallable(ScalarCallable): if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] - from loopy.kernel.function_interface import with_target - return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target="loopy_make_tuple"), kernel.target) + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple") def with_descrs(self, arg_id_to_descr): from loopy.kernel.function_interface import ValueArgDescriptor @@ -82,9 +81,7 @@ class IndexOfCallable(ScalarCallable): new_arg_id_to_dtype = arg_id_to_dtype.copy() new_arg_id_to_dtype[-1] = kernel.index_dtype - from loopy.kernel.function_interface import with_target - return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype), - kernel.target) + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) def loopy_specific_callable_scopers(target, identifier): @@ -94,11 +91,8 @@ def loopy_specific_callable_scopers(target, identifier): if identifier in ["indexof", "indexof_vec"]: return IndexOfCallable(name=identifier) - return None - # FIXME: Reduction callables are an important part, but there are some - # import related issues, which I am planning to handle later! - # from loopy.library.reduction import reduction_specific_callables - # return reduction_specific_callables(target, identifier) + from loopy.library.reduction import reduction_scoper + return reduction_scoper(target, identifier) # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 1dd6f00f1..ca2f02347 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -25,7 +25,7 @@ THE SOFTWARE. 
from pymbolic import var from loopy.symbolic import ScopedFunction -# from loopy.kernel.function_interface import ScalarCallable +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier @@ -378,9 +378,8 @@ def parse_reduction_op(name): # {{{ reduction specific callables -''' class ReductionCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, in_knl_callable, kernel): + def with_types(self, arg_id_to_dtype, kernel): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, @@ -388,12 +387,10 @@ class ReductionCallable(ScalarCallable): new_arg_id_to_dtype = arg_id_to_dtype.copy() new_arg_id_to_dtype[-1] = result_dtypes[0] new_arg_id_to_dtype[-2] = result_dtypes[1] - name_in_target = self.name.prefix(scalar_dtype, index_dtype) + name_in_target = self.name.prefix(scalar_dtype, index_dtype) + "_op" - from loopy.library.kernel.function_interface import with_target - - return with_target(self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name_in_target), kernel.target) + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name_in_target) def with_descr(self, arg_id_to_descr): from loopy.library.kernel.function_interface import ValueArgDescriptor @@ -457,13 +454,13 @@ class ReductionCallable(ScalarCallable): return -def reduction_specific_callable(target, identifier): +def reduction_scoper(target, identifier): if isinstance(identifier, (_ArgExtremumReductionOperation, _SegmentedScalarReductionOperation)): return ReductionCallable(name=identifier) return None -''' + # }}} # vim: fdm=marker diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 367d06bdd..a882628d7 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -187,6 +187,8 @@ class OpenCLCallable(ScalarCallable): if (id >= 0 and dtype is not None)]) if dtype.kind in ['u', 'i', 'f']: + if dtype.kind == 'f': + name = 'f'+name dtype = NumpyType(dtype) return self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) @@ -433,13 +435,10 @@ class OpenCLCASTBuilder(CASTBuilder): ]) def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator return ( super(OpenCLCASTBuilder, self).preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator, - ]) + opencl_preamble_generator]) # }}} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index ddda6247b..ef884c698 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -230,14 +230,15 @@ class PyOpenCLCallable(ScalarCallable): tpname = "cdouble" else: raise LoopyTypeError("unexpected complex type '%s'" % dtype) - return self.copy(name_in_target="%s_%s" % (tpname, name), - arg_id_to_dtype={0: dtype, -1: NumpyType( - np.dtype(dtype.numpy_dtype.type(0).real))}) + + return self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}) if name in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", "tanh", - "conj"]: + "conj", "abs"]: if dtype.is_complex(): # function parameters are complex. if dtype.numpy_dtype == np.complex64: @@ -250,9 +251,12 @@ class PyOpenCLCallable(ScalarCallable): return self.copy(name_in_target="%s_%s" % (tpname, name), arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) else: - # function calls for real parameters. + # function calls for floating parameters. 
+ dtype = dtype.numpy_dtype if dtype.kind in ('u', 'i'): dtype = np.float32 + if name == 'abs': + name = 'fabs' return self.copy(name_in_target=name, arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) diff --git a/loopy/target/python.py b/loopy/target/python.py index 696f3245e..c25404268 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -177,25 +177,11 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_manglers(self): + def function_scopers(self): + from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_manglers() + [ - _numpy_single_arg_function_mangler, - ]) - - def function_identifiers(self): - from loopy.target.c import c_math_identifiers - return ( - super(PythonASTBuilderBase, self).function_identifiers() | - c_math_identifiers()) - - def with_types(self, in_knl_callable, arg_id_to_dtype): - from loopy.target.c import with_types_for_c_target - new_callable = with_types_for_c_target(in_knl_callable, arg_id_to_dtype) - if new_callable is not None: - return new_callable - return super(PythonASTBuilderBase, self).with_types(in_knl_callable, - arg_id_to_dtype) + super(PythonASTBuilderBase, self).function_scopers() | + frozenset([scope_c_math_functions])) def preamble_generators(self): return ( diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_callable.py similarity index 79% rename from loopy/transform/register_knl.py rename to loopy/transform/register_callable.py index 221f2abef..ac68f60d9 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_callable.py @@ -33,7 +33,7 @@ __doc__ = """ """ -# {{{ main entrypoint +# {{{ register_callable_kernel def register_callable_kernel(caller_kernel, function_name, callee_kernel): """Returns a copy of *caller_kernel* which identifies *function_name* in an @@ -75,4 +75,26 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # }}} + +# {{{ register scalar callable + +def register_function_lookup(kernel, function_lookup): + """ + Returns a copy of *kernel* with the *function_lookup* registered. + + :arg function_lookup: A function of signature ``(target, identifier)`` + returning a :class:`loopy.kernel.function_interface.InKernelCallable`. + """ + + # adding the function lookup to the set of function lookers in the kernel. + new_function_scopers = kernel.function_scopers | frozenset([function_lookup]) + registered_kernel = kernel.copy(function_scopers=new_function_scopers) + from loopy.kernel.creation import scope_functions + + # returning the scoped_version of the kernel, as new functions maybe + # resolved. + return scope_functions(registered_kernel) + +# }}} + # vim: foldmethod=marker -- GitLab From 4d032e771977782adbd76c500dc92268f7527d6b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 23 Apr 2018 13:48:11 -0500 Subject: [PATCH 120/916] Made changes in CallableKernel to include register scoper function interface. 
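
As a rough usage sketch of the new lookup interface (adapted from the test
added in this series; the log2 callable, its names, and the kernel `knl`
below are purely illustrative assumptions, not part of the library):

    import loopy as lp

    class Log2Callable(lp.ScalarCallable):
        def with_types(self, arg_id_to_dtype, kernel):
            if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
                # not enough type information yet: return unspecialized copy
                return self.copy(arg_id_to_dtype=arg_id_to_dtype)
            dtype = arg_id_to_dtype[0]
            # specialize: result type matches the (floating) argument type
            return self.copy(name_in_target="log2",
                    arg_id_to_dtype={0: dtype, -1: dtype})

    def register_log2_lookup(target, identifier):
        # a lookup/"scoper" has signature (target, identifier) and returns
        # an InKernelCallable, or None if the identifier is not recognized
        if identifier == "log2":
            return Log2Callable(name="log2")
        return None

    # 'knl' is assumed to be an existing kernel that calls log2(...)
    knl = lp.register_function_lookup(knl, register_log2_lookup)

(Integer-argument promotion and target-specific renaming, e.g. log2f for
single precision on C-like targets, are omitted here; see test/testlib.py
in this patch for the full version.)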
--- loopy/kernel/__init__.py | 2 +- loopy/target/__init__.py | 4 +- loopy/target/c/__init__.py | 4 +- loopy/target/cuda.py | 4 +- loopy/target/opencl.py | 8 ++-- loopy/target/pyopencl.py | 4 +- loopy/target/python.py | 4 +- loopy/transform/register_callable.py | 69 ++++++++++++---------------- test/test_transform.py | 22 +++++++++ test/testlib.py | 40 ++++++++++++++++ 10 files changed, 107 insertions(+), 54 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index b99fc6dc2..6ac773d29 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -269,7 +269,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): from loopy.library.function import loopy_specific_callable_scopers # populating the function scopers from the target and the loopy # specific callable scopers - function_scopers = frozenset([loopy_specific_callable_scopers]) | ( + function_scopers = [loopy_specific_callable_scopers] + ( target.get_device_ast_builder().function_scopers()) ImmutableRecordWithoutPickling.__init__(self, diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 53e5ccbc3..0f90ca414 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -152,11 +152,11 @@ class ASTBuilderBase(object): def function_scopers(self): """ - Returns an instance of :class:`frozenset` of the functions of signature + Returns an instance of list of the functions of signature ``(target, identifiers)`` returning either an instance of :class:`InKernelCallable` if a match is found or *None*. """ - return frozenset() + return [] def symbol_manglers(self): return [] diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 36c9601b5..87904f07f 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -474,8 +474,8 @@ class CASTBuilder(ASTBuilderBase): def function_scopers(self): return ( - super(CASTBuilder, self).function_scopers() | frozenset([ - scope_c_math_functions])) + super(CASTBuilder, self).function_scopers() + [ + scope_c_math_functions]) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 2651abc94..4265716ad 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -173,7 +173,7 @@ class CudaCallable(ScalarCallable): def scope_cuda_functions(target, identifier): - if identifier in frozenset(["dot"]) | frozenset( + if identifier in set(["dot"]) | set( _CUDA_SPECIFIC_FUNCTIONS): return CudaCallable(name=identifier) @@ -263,7 +263,7 @@ class CUDACASTBuilder(CASTBuilder): # {{{ library def function_scopers(self): - return frozenset([scope_cuda_functions]) | ( + return [scope_cuda_functions] + ( super(CUDACASTBuilder, self).function_scopers()) # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index a882628d7..4366b08ef 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -272,8 +272,8 @@ def scope_opencl_functions(target, identifier): Returns an instance of :class:`InKernelCallable` if the function defined by *identifier* is known in OpenCL. 
""" - opencl_function_ids = frozenset(["max", "min", "dot"]) | frozenset( - _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | frozenset(VECTOR_LITERAL_FUNCS) + opencl_function_ids = set(["max", "min", "dot"]) | set( + _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) if identifier in opencl_function_ids: return OpenCLCallable(name=identifier) @@ -425,8 +425,8 @@ class OpenCLCASTBuilder(CASTBuilder): def function_scopers(self): return ( - frozenset([scope_opencl_functions]) | - super(OpenCLCASTBuilder, self).function_scopers()) + [scope_opencl_functions] + super( + OpenCLCASTBuilder, self).function_scopers()) def symbol_manglers(self): return ( diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index ef884c698..bae98d14a 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -784,8 +784,8 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): def function_scopers(self): from loopy.library.random123 import random123_function_scoper return ( - frozenset([pyopencl_function_scoper, random123_function_scoper]) | - super(PyOpenCLCASTBuilder, self).function_scopers()) + [pyopencl_function_scoper, random123_function_scoper] + super( + PyOpenCLCASTBuilder, self).function_scopers()) def preamble_generators(self): return ([ diff --git a/loopy/target/python.py b/loopy/target/python.py index c25404268..e20b7965f 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -180,8 +180,8 @@ class PythonASTBuilderBase(ASTBuilderBase): def function_scopers(self): from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_scopers() | - frozenset([scope_c_math_functions])) + super(PythonASTBuilderBase, self).function_scopers() + + [scope_c_math_functions]) def preamble_generators(self): return ( diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index ac68f60d9..19e463113 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -23,7 +23,6 @@ THE SOFTWARE. """ from loopy.kernel import LoopKernel -from loopy.diagnostic import LoopyError from loopy.kernel.function_interface import CallableKernel __doc__ = """ @@ -33,6 +32,28 @@ __doc__ = """ """ +# {{{ register function lookup + +def register_function_lookup(kernel, function_lookup): + """ + Returns a copy of *kernel* with the *function_lookup* registered. + + :arg function_lookup: A function of signature ``(target, identifier)`` + returning a :class:`loopy.kernel.function_interface.InKernelCallable`. + """ + + # adding the function lookup to the set of function lookers in the kernel. + new_function_scopers = kernel.function_scopers + [function_lookup] + registered_kernel = kernel.copy(function_scopers=new_function_scopers) + from loopy.kernel.creation import scope_functions + + # returning the scoped_version of the kernel, as new functions maybe + # resolved. + return scope_functions(registered_kernel) + +# }}} + + # {{{ register_callable_kernel def register_callable_kernel(caller_kernel, function_name, callee_kernel): @@ -50,50 +71,20 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): assert isinstance(callee_kernel, LoopKernel) assert isinstance(function_name, str) - if function_name in caller_kernel.function_identifiers: - raise LoopyError("%s is being used a default function " - "identifier--maybe use a different function name in order to " - "associate with a callable kernel." % function_name) - # }}} - # now we know some new functions, and hence scoping them. 
- from loopy.kernel.creation import scope_functions - - # scoping the function corresponding to kernel call - caller_kernel = scope_functions(caller_kernel, set([function_name])) - updated_scoped_functions = caller_kernel.scoped_functions - # making the target of the child kernel to be same as the target of parent # kernel. - from pymbolic.primitives import Variable - updated_scoped_functions[Variable(function_name)] = CallableKernel( - subkernel=callee_kernel.copy(target=caller_kernel.target)) - - # returning the parent kernel with the new scoped function dictionary - return caller_kernel.copy(scoped_functions=updated_scoped_functions) - -# }}} - + callable_kernel = CallableKernel(subkernel=callee_kernel.copy( + target=caller_kernel.target)) -# {{{ register scalar callable + def register_callee_kernel(target, identifier): + if identifier == function_name: + return callable_kernel + return None -def register_function_lookup(kernel, function_lookup): - """ - Returns a copy of *kernel* with the *function_lookup* registered. - - :arg function_lookup: A function of signature ``(target, identifier)`` - returning a :class:`loopy.kernel.function_interface.InKernelCallable`. - """ - - # adding the function lookup to the set of function lookers in the kernel. - new_function_scopers = kernel.function_scopers | frozenset([function_lookup]) - registered_kernel = kernel.copy(function_scopers=new_function_scopers) - from loopy.kernel.creation import scope_functions - - # returning the scoped_version of the kernel, as new functions maybe - # resolved. - return scope_functions(registered_kernel) + return register_function_lookup(caller_kernel, + register_callee_kernel) # }}} diff --git a/test/test_transform.py b/test/test_transform.py index c18369e1e..8c11c0efb 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -182,6 +182,28 @@ def test_add_barrier(ctx_factory): assert (np.linalg.norm(out-2*a.T) < 1e-16) +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import register_log2_lookup + + x = np.random.rand(10) + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + knl = lp.register_function_lookup(knl, register_log2_lookup) + + evt, (out, ) = knl(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + def test_register_knl(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) diff --git a/test/testlib.py b/test/testlib.py index 73de4199d..f0e90d95a 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -114,4 +115,43 @@ class SeparateTemporariesPreambleTestHelper: # }}} + +# {{{ test_register_function_lookup + +class Log2Callable(lp.ScalarCallable): + + def with_types(self, arg_id_to_dtype, kernel): + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = arg_id_to_dtype[0].numpy_dtype + + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + + from loopy.target.opencl import OpenCLTarget + name_in_target = "log2" + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified + if dtype == np.float32: + name_in_target = "log2f" + elif dtype == np.float128: + 
name_in_target = "log2l" + + from loopy.types import NumpyType + return self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + + +def register_log2_lookup(target, identifier): + if identifier == 'log2': + return Log2Callable(name='log2') + return None + +# }}} + # vim: foldmethod=marker -- GitLab From 8a57a5a45d6124340e376b00190692faae1f7065 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 23 Apr 2018 14:16:34 -0500 Subject: [PATCH 121/916] Added default_function_mangler from temp purposes. --- loopy/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/loopy/__init__.py b/loopy/__init__.py index 7650e303c..eb43249a6 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -33,6 +33,9 @@ from loopy.diagnostic import LoopyError, LoopyWarning # {{{ imported user interface +from loopy.library.function import ( + default_function_mangler, single_arg_function_mangler) + from loopy.kernel.instruction import ( memory_ordering, memory_scope, VarAtomicity, AtomicInit, AtomicUpdate, InstructionBase, -- GitLab From 413e660c4ed714f576ce005f8704a26c4bf4793c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 23 Apr 2018 14:38:08 -0500 Subject: [PATCH 122/916] straightens small wrinkle in the with_types for CTarget --- loopy/target/c/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 87904f07f..fa9ca27bf 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -427,7 +427,8 @@ class CMathCallable(ScalarCallable): raise LoopyTypeError("%s does not support complex numbers") elif dtype.kind == "f": - if not isinstance(kernel.target, (OpenCLTarget)): + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): if dtype == np.float64: pass # fmin elif dtype == np.float32: -- GitLab From e95155384e76986861c0f1ec293a668dd95391e7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 24 Apr 2018 10:54:27 -0500 Subject: [PATCH 123/916] Helpful comments for infer_arg_descr --- loopy/preprocess.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 0c5c0096b..2073a14df 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2245,8 +2245,10 @@ class ArgDescrInferenceMapper(CombineMapper): def infer_arg_descr(kernel): - """ Specializes the kernel functions in way that the functions agree upon - shape and dimensions of the arguments too. + """ + Returns a copy of *kernel* with the argument shapes and strides matching for + scoped functions in the *kernel*. Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. """ arg_description_modifier = ArgDescrInferenceMapper(kernel) -- GitLab From 82175cb5599ff9f93d8d4229804c7dec3b77e474 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 24 Apr 2018 11:33:12 -0500 Subject: [PATCH 124/916] Added the conflicting iname check betweent the caller and the callee. --- loopy/check.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/loopy/check.py b/loopy/check.py index 0b5c50053..94250c621 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -182,8 +182,21 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def _get_all_unique_iname_tags(kernel): + """ Returns a frozenset of all the unique iname tags in the *kernel*. 
+ """ + from loopy.kernel.data import UniqueTag + iname_tags = frozenset(kernel.iname_to_tag.get(iname) for iname in + kernel.all_inames()) - frozenset([None]) + unique_iname_tags = frozenset([tag for tag in iname_tags if + isinstance(tag, UniqueTag)]) + return unique_iname_tags + + def check_for_double_use_of_hw_axes(kernel): from loopy.kernel.data import UniqueTag + from loopy.kernel.instructions import CallInstruction + from loopy.kernel.function_interface import CallableKernel for insn in kernel.instructions: insn_tag_keys = set() @@ -197,6 +210,21 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) + # checking usage of iname tags in the callee kernel. + if isinstance(insn, CallInstruction): + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.function] + if isinstance(in_knl_callable, CallableKernel): + # checking for collision in iname_tag keys in the instruction + # due to the callee kernel. + common_iname_tags = frozenset(tag for tag in + _get_all_unique_iname_tags(in_knl_callable.subkernel) + if tag.key in insn_tag_keys) + if common_iname_tags: + raise LoopyError("instruction '%s' has multiple " + "inames tagged '%s'" % (insn.id, + common_iname_tags.pop())) + def check_for_inactive_iname_access(kernel): for insn in kernel.instructions: -- GitLab From a0ac9d30c896bc047078b4e500a6a427f37d00aa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 24 Apr 2018 19:28:06 -0500 Subject: [PATCH 125/916] Added partial support for checking the with_iname_tags and also switched back to old kernel.scoped_functions, where we make the association str->InKernelCallable. --- loopy/check.py | 18 +++++++++--------- loopy/codegen/__init__.py | 4 ++-- loopy/kernel/creation.py | 19 +++++++++---------- loopy/kernel/function_interface.py | 14 +++++--------- loopy/preprocess.py | 4 ++-- loopy/statistics.py | 2 +- loopy/symbolic.py | 9 ++++++++- loopy/target/c/__init__.py | 2 +- loopy/target/c/codegen/expression.py | 8 ++++---- loopy/target/python.py | 4 ++-- loopy/type_inference.py | 2 +- 11 files changed, 44 insertions(+), 42 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 94250c621..b55b0cf99 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -183,19 +183,19 @@ def check_loop_priority_inames_known(kernel): def _get_all_unique_iname_tags(kernel): - """ Returns a frozenset of all the unique iname tags in the *kernel*. + """ Returns a list of all the unique iname tags in the *kernel*. """ from loopy.kernel.data import UniqueTag - iname_tags = frozenset(kernel.iname_to_tag.get(iname) for iname in - kernel.all_inames()) - frozenset([None]) - unique_iname_tags = frozenset([tag for tag in iname_tags if - isinstance(tag, UniqueTag)]) + iname_tags = [kernel.iname_to_tag.get(iname) for iname in + kernel.all_inames()] + unique_iname_tags = [tag for tag in iname_tags if + isinstance(tag, UniqueTag)] return unique_iname_tags def check_for_double_use_of_hw_axes(kernel): from loopy.kernel.data import UniqueTag - from loopy.kernel.instructions import CallInstruction + from loopy.kernel.instruction import CallInstruction from loopy.kernel.function_interface import CallableKernel for insn in kernel.instructions: @@ -213,13 +213,13 @@ def check_for_double_use_of_hw_axes(kernel): # checking usage of iname tags in the callee kernel. 
if isinstance(insn, CallInstruction): in_knl_callable = kernel.scoped_functions[ - insn.expression.function.function] + insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): # checking for collision in iname_tag keys in the instruction # due to the callee kernel. - common_iname_tags = frozenset(tag for tag in + common_iname_tags = [tag for tag in _get_all_unique_iname_tags(in_knl_callable.subkernel) - if tag.key in insn_tag_keys) + if tag.key in insn_tag_keys] if common_iname_tags: raise LoopyError("instruction '%s' has multiple " "inames tagged '%s'" % (insn.id, diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index ba04170e2..c48492597 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -398,7 +398,7 @@ class InKernelCallablesCollector(CombineMapper): def map_scoped_function(self, expr): return frozenset([self.kernel.scoped_functions[ - expr.function]]) + expr.name]]) def map_constant(self, expr): return frozenset() @@ -534,7 +534,7 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): for insn in kernel.instructions: if isinstance(insn, CallInstruction): in_knl_callable = kernel.scoped_functions[ - insn.expression.function.function] + insn.expression.function.name] from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel): auxiliary_dev_prog = generate_code_v2( diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 219042de4..4fa7a643f 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1865,7 +1865,7 @@ class FunctionScoper(RuleAwareIdentityMapper): if in_knl_callable: # Associating the newly created ScopedFunction with the # resolved in-kernel callable. - self.scoped_functions[expr.function] = in_knl_callable + self.scoped_functions[expr.function.name] = in_knl_callable return type(expr)( ScopedFunction(expr.function.name), @@ -1885,7 +1885,7 @@ class FunctionScoper(RuleAwareIdentityMapper): if in_knl_callable: # Associating the newly created ScopedFunction with the # resolved in-kernel callable. - self.scoped_functions[expr.function.function] = in_knl_callable + self.scoped_functions[expr.function.name] = in_knl_callable return type(expr)( ScopedFunction(expr.function.name), tuple(self.rec(child, expn_state) @@ -1904,28 +1904,27 @@ class FunctionScoper(RuleAwareIdentityMapper): MinReductionOperation, ArgMinReductionOperation, ArgMaxReductionOperation, _SegmentedScalarReductionOperation, SegmentedOp) - from pymbolic import var from loopy.library.reduction import ArgExtOp # Noting down the extra functions arising due to certain reductions. 
if isinstance(expr.operation, MaxReductionOperation): - self.scoped_functions[var("max")] = self.kernel.lookup_function("max") + self.scoped_functions["max"] = self.kernel.lookup_function("max") elif isinstance(expr.operation, MinReductionOperation): - self.scoped_functions[var("min")] = self.kernel.lookup_function("min") + self.scoped_functions["min"] = self.kernel.lookup_function("min") elif isinstance(expr.operation, ArgMaxReductionOperation): - self.scoped_functions[var("max")] = self.kernel.lookup_function("max") - self.scoped_functions[var("make_tuple")] = self.kernel.lookup_function( + self.scoped_functions["max"] = self.kernel.lookup_function("max") + self.scoped_functions["make_tuple"] = self.kernel.lookup_function( "make_tuple") self.scoped_functions[ArgExtOp(expr.operation)] = ( self.kernel.lookup_function(expr.operation)) elif isinstance(expr.operation, ArgMinReductionOperation): - self.scoped_functions[var("min")] = self.kernel.lookup_function("min") - self.scoped_functions[var("make_tuple")] = self.kernel.lookup_function( + self.scoped_functions["min"] = self.kernel.lookup_function("min") + self.scoped_functions["make_tuple"] = self.kernel.lookup_function( "make_tuple") self.scoped_functions[ArgExtOp(expr.operation)] = ( self.kernel.lookup_function(expr.operation)) elif isinstance(expr.operation, _SegmentedScalarReductionOperation): - self.scoped_functions[var("make_tuple")] = self.kernel.lookup_function( + self.scoped_functions["make_tuple"] = self.kernel.lookup_function( "make_tuple") self.scoped_functions[SegmentedOp(expr.operation)] = ( self.kernel.lookup_function(expr.operation)) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 7c3aac1f6..d988054ca 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -537,10 +537,6 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr) - def with_iname_tag_usage(self, unusable, concurrent_shape): - - raise NotImplementedError() - def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and @@ -703,12 +699,12 @@ def next_indexed_variable(function): if match is None: if function.name[-1] == '_': - return Variable("{old_name}0".format(old_name=function.name)) + return "{old_name}0".format(old_name=function.name) else: - return Variable("{old_name}_0".format(old_name=function.name)) + return "{old_name}_0".format(old_name=function.name) - return Variable("{alpha}_{num}".format(alpha=match.group('alpha'), - num=int(match.group('num'))+1)) + return "{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1) class ScopedFunctionNameChanger(RuleAwareIdentityMapper): @@ -795,7 +791,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, # for array calls the name in the target is the name of the # scoped funciton in_knl_callable = in_knl_callable.copy( - name_in_target=unique_var.name) + name_in_target=unique_var) scoped_names_to_functions[unique_var] = in_knl_callable scoped_functions_to_names[in_knl_callable] = unique_var diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 2073a14df..369daa45d 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2187,7 +2187,7 @@ class ArgDescrInferenceMapper(CombineMapper): # specializing the function according to the parameter description new_scoped_function = ( - self.kernel.scoped_functions[expr.function.function].with_descrs( + self.kernel.scoped_functions[expr.function.name].with_descrs( 
combined_arg_id_to_descr)) # collecting the descriptors for args, kwargs, assignees @@ -2314,7 +2314,7 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): elif isinstance(expr.function, ScopedFunction): is_ready_for_codegen = self.kernel.scoped_functions[ - expr.function.function].is_ready_for_codegen() + expr.function.name].is_ready_for_codegen() return self.combine( (is_ready_for_codegen,) + tuple( diff --git a/loopy/statistics.py b/loopy/statistics.py index defc4f6d7..0bf227617 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -714,7 +714,7 @@ class ExpressionOpCounter(CounterBase): from loopy.symbolic import ScopedFunction if isinstance(expr.function, ScopedFunction): function_identifier = self.knl.scoped_functions[ - expr.function.function].name + expr.function.name].name else: function_identifier = expr.function.name diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 301cb4898..e4cdfa05d 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -695,7 +695,14 @@ class ScopedFunction(p.Expression): @property def name(self): - return self.function.name + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(self.function, p.Variable): + return self.function.name + elif isinstance(self.function, (ArgExtOp, SegmentedOp)): + return self.function + else: + raise LoopyError("Unexpected function type %s in ScopedFunction." % + type(self.function)) def __getinitargs__(self): return (self.function, ) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index fa9ca27bf..9ce9f04bf 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -872,7 +872,7 @@ class CASTBuilder(ASTBuilderBase): def emit_multiple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper - func_id = insn.expression.function.function + func_id = insn.expression.function.name in_knl_callable = codegen_state.kernel.scoped_functions[func_id] if in_knl_callable.name_in_target == 'loopy_make_tuple': diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 110f3f035..385d10c4e 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -390,7 +390,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): # {{{ implement indexof, indexof_vec - identifier_name = self.kernel.scoped_functions[expr.function.function].name + identifier_name = self.kernel.scoped_functions[expr.function.name].name if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: raise LoopyError("%s takes exactly one argument" % identifier_name) @@ -433,17 +433,17 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} from loopy.kernel.function_interface import ManglerCallable - if isinstance(self.kernel.scoped_functions[expr.function.function], + if isinstance(self.kernel.scoped_functions[expr.function.name], ManglerCallable): from loopy.codegen import SeenFunction - in_knl_callable = self.kernel.scoped_functions[expr.function.function] + in_knl_callable = self.kernel.scoped_functions[expr.function.name] mangle_result = in_knl_callable.mangle_result(self.kernel) self.codegen_state.seen_functions.add( SeenFunction(identifier_name, mangle_result.target_name, mangle_result.arg_dtypes)) - return self.kernel.scoped_functions[expr.function.function].emit_call( + return self.kernel.scoped_functions[expr.function.name].emit_call( expression_to_code_mapper=self, expression=expr, target=self.kernel.target) diff --git a/loopy/target/python.py 
b/loopy/target/python.py index e20b7965f..2804b0fb9 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -84,14 +84,14 @@ class ExpressionToPythonMapper(StringifyMapper): def map_call(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - identifier_name = self.kernel.scoped_functions[expr.function.function].name + identifier_name = self.kernel.scoped_functions[expr.function.name].name if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") from loopy.kernel.function_interface import ManglerCallable - in_knl_callable = self.kernel.scoped_functions[expr.function.function] + in_knl_callable = self.kernel.scoped_functions[expr.function.name] if isinstance(in_knl_callable, ManglerCallable): from loopy.codegen import SeenFunction mangle_result = in_knl_callable.mangle_result(self.kernel) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 697cfddf5..cc3b9e8e4 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -284,7 +284,7 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type if isinstance(expr.function, ScopedFunction): - in_knl_callable = self.scoped_functions[expr.function.function] + in_knl_callable = self.scoped_functions[expr.function.name] # {{{ checking that there is no overwriting of in_knl_callable -- GitLab From 68c8fea311693ce2b976a0333f3911689f5ced67 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 24 Apr 2018 19:59:36 -0500 Subject: [PATCH 126/916] Fixes small error to convert str to variable while passing to unique_var_generator --- loopy/kernel/function_interface.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index d988054ca..ed79f092d 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -681,10 +681,10 @@ class ManglerCallable(ScalarCallable): def next_indexed_variable(function): """ - Returns a copy a :arg:`function` with the next indexed-name in the - sequence. + Returns an instance of :class:`str` with the next indexed-name in the + sequence for the name of *function*. - :Example: ``Variable('sin_0')`` will return ``Variable('sin_1'). + :Example: ``Variable('sin_0')`` will return ``'sin_1'``. :arg function: Either an instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.reduction.ArgExtOp` or @@ -784,7 +784,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, while unique_var in scoped_names_to_functions and not isinstance( unique_var, (ArgExtOp, SegmentedOp)): # keep on finding new names till one a unique one is found. - unique_var = next_indexed_variable(unique_var) + unique_var = next_indexed_variable(Variable(unique_var)) # book-keeping of the functions and names mappings for later use if isinstance(in_knl_callable, CallableKernel): -- GitLab From c5baa387c8a922edcc0e429a97a0cd9055bf76ab Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 25 Apr 2018 11:13:14 -0500 Subject: [PATCH 127/916] starts making changes in order to take in memory_address_scope. 
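
This commit adds a 'mem_address_space' class to loopy/kernel/data.py
and turns 'temp_var_scope' into a deprecated shim that forwards to it.
A minimal illustrative sketch of the intended behavior (not part of
the diff below; it only uses the names this commit introduces):

    from loopy.kernel.data import mem_address_space, temp_var_scope

    # Values are ordered by increasing 'globality', so max() over a
    # set of scopes picks the most global one.
    assert (mem_address_space.PRIVATE
            < mem_address_space.LOCAL
            < mem_address_space.GLOBAL)

    # The deprecated spelling still resolves to the new class, but
    # emits a DeprecationWarning when accessed.
    assert temp_var_scope.LOCAL == mem_address_space.LOCAL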
--- loopy/kernel/data.py | 43 +++++++++++++++++++++++++++++++++++++++---- loopy/preprocess.py | 9 ++++----- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index c90e8a64b..0129b7ee6 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -310,10 +310,10 @@ class InameArg(ValueArg): # }}} -# {{{ temporary variable +# {{{ memory address space -class temp_var_scope: # noqa - """Storage location of a temporary +class mem_address_space: # noqa + """Storage location of a variable. .. attribute:: PRIVATE .. attribute:: LOCAL @@ -336,7 +336,42 @@ class temp_var_scope: # noqa elif val == cls.GLOBAL: return "global" else: - raise ValueError("unexpected value of temp_var_scope") + raise ValueError("unexpected value of mem_address_space.") + +# }}} + + +# {{{ temporary variable + +class _deprecated_temp_var_scope_property(property): # noqa + def __get__(self, cls, owner): + from warnings import warn + warn("'temp_var_scope' is deprecated. Use 'mem_address_space'.", + DeprecationWarning, stacklevel=2) + return classmethod(self.fget).__get__(None, owner)() + +class temp_var_scope: # noqa + """Deprecated. Use :class:`mem_adress_space` instead. + """ + + @_deprecated_temp_var_scope_property + def PRIVATE(self): + return mem_address_space.PRIVATE + + @_deprecated_temp_var_scope_property + def LOCAL(self): + return mem_address_space.LOCAL + + @_deprecated_temp_var_scope_property + def GLOBAL(self): + return mem_address_space.GLOBAL + + @classmethod + def stringify(cls, val): + from warnings import warn + warn("'temp_var_scope' is deprecated. Use 'mem_address_space'.", + DeprecationWarning, stacklevel=2) + return mem_address_space.stringify(cls, val) class TemporaryVariable(ArrayBase): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 369daa45d..3bd18d7fe 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2113,19 +2113,18 @@ def get_arg_description_from_sub_array_ref(sub_array, kernel): :class:`ArrayArgDescriptor`. """ from loopy.kernel.function_interface import ArrayArgDescriptor - # from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import mem_address_space name = sub_array.subscript.aggregate.name if name in kernel.temporary_variables: - # mem_scope = temp_var_scope.LOCAL - mem_scope = "LOCAL" arg = kernel.temporary_variables[name] + mem_scope = arg.mem_scope assert name not in kernel.arg_dict else: assert name in kernel.arg_dict - # mem_scope = temp_var_scope.GLOBAL - mem_scope = "GLOBAL" + mem_scope = mem_address_space + mem_scope = kernel.arg_dict[name].mem_scope arg = kernel.arg_dict[name] sub_dim_tags, sub_shape = sub_array.get_sub_array_dim_tags_and_shape( -- GitLab From b3b73a1194ff03b07554bd4281c3458ff6858103 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 25 Apr 2018 21:29:54 -0500 Subject: [PATCH 128/916] Made register_callee_kernel picklable. --- loopy/preprocess.py | 10 +++++++--- loopy/transform/register_callable.py | 26 ++++++++++++++++++++------ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 3bd18d7fe..bd0d871f1 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2119,12 +2119,16 @@ def get_arg_description_from_sub_array_ref(sub_array, kernel): if name in kernel.temporary_variables: arg = kernel.temporary_variables[name] - mem_scope = arg.mem_scope + # FIXME: This is temporary change them back to the necessary ones. 
+        # mem_scope = arg.mem_scope
+        mem_scope = 'Local'
         assert name not in kernel.arg_dict
     else:
         assert name in kernel.arg_dict
-        mem_scope = mem_address_space
-        mem_scope = kernel.arg_dict[name].mem_scope
+        # FIXME: This is just temporary, change them back to the needed
+        # changes.
+        # mem_scope = kernel.arg_dict[name].mem_scope
+        mem_scope = 'Global'
         arg = kernel.arg_dict[name]
 
     sub_dim_tags, sub_shape = sub_array.get_sub_array_dim_tags_and_shape(
diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py
index 19e463113..1a0aadec6 100644
--- a/loopy/transform/register_callable.py
+++ b/loopy/transform/register_callable.py
@@ -24,6 +24,7 @@ THE SOFTWARE.
 
 from loopy.kernel import LoopKernel
 from loopy.kernel.function_interface import CallableKernel
+from pytools import ImmutableRecord
 
 __doc__ = """
 .. currentmodule:: loopy
@@ -56,6 +57,24 @@ def register_function_lookup(kernel, function_lookup):
 
 # {{{ register_callable_kernel
 
+class RegisterCalleeKernel(ImmutableRecord):
+    """
+    Helper class to make the function scoper from
+    :func:`loopy.transform.register_callable_kernel` picklable, as Python
+    cannot pickle lexical closures.
+    """
+    fields = set(['function_name', 'callable_kernel'])
+
+    def __init__(self, function_name, callable_kernel):
+        self.function_name = function_name
+        self.callable_kernel = callable_kernel
+
+    def __call__(self, target, identifier):
+        if identifier == self.function_name:
+            return self.callable_kernel
+        return None
+
+
 def register_callable_kernel(caller_kernel, function_name, callee_kernel):
     """Returns a copy of *caller_kernel* which identifies *function_name* in an
     expression as a call to *callee_kernel*.
@@ -78,13 +97,8 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel):
     callable_kernel = CallableKernel(subkernel=callee_kernel.copy(
         target=caller_kernel.target))
 
-    def register_callee_kernel(target, identifier):
-        if identifier == function_name:
-            return callable_kernel
-        return None
-
     return register_function_lookup(caller_kernel,
-            register_callee_kernel)
+            RegisterCalleeKernel(function_name, callable_kernel))
 
 # }}}
 
-- 
GitLab


From ecd52672db3d46e80eadb188510a326d62ed3560 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Fri, 27 Apr 2018 07:26:28 -0500
Subject: [PATCH 129/916] Two major changes: 1. Moved from GlobalArg ->
 ArrayArg, 2. Switched from temp_var_scope -> MemoryAddressSpace.
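
With this, temp_var_scope stays behind only as a deprecated alias and
GlobalArg now constructs an ArrayArg whose address space defaults to
global. An illustrative sketch of the resulting interface (not part
of the diff below; the argument names and shapes are placeholders):

    import numpy as np
    import loopy as lp

    # ArrayArg subsumes GlobalArg; the address space can be stated
    # explicitly and defaults to MemoryAddressSpace.GLOBAL.
    a = lp.ArrayArg("a", dtype=np.float64, shape=(16,),
            memory_address_space=lp.MemoryAddressSpace.GLOBAL)

    # The old spelling keeps working for now, but warns and hands
    # back an ArrayArg instance.
    b = lp.GlobalArg("b", dtype=np.float64, shape=(16,))
    assert isinstance(b, lp.ArrayArg)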
--- loopy/__init__.py | 8 +- loopy/auto_test.py | 10 +-- loopy/check.py | 30 +++---- loopy/cli.py | 2 +- loopy/codegen/control.py | 4 +- loopy/frontend/fortran/translator.py | 2 +- loopy/kernel/__init__.py | 16 ++-- loopy/kernel/creation.py | 10 +-- loopy/kernel/data.py | 124 ++++++++++++++++----------- loopy/kernel/function_interface.py | 6 +- loopy/preprocess.py | 65 +++++++------- loopy/schedule/tools.py | 4 +- loopy/statistics.py | 8 +- loopy/target/c/__init__.py | 12 +-- loopy/target/c/codegen/expression.py | 6 +- loopy/target/cuda.py | 10 ++- loopy/target/execution.py | 10 +-- loopy/target/ispc.py | 16 ++-- loopy/target/opencl.py | 47 +++++++--- loopy/target/pyopencl.py | 10 +-- loopy/target/pyopencl_execution.py | 8 +- loopy/transform/batch.py | 8 +- loopy/transform/buffer.py | 12 +-- loopy/transform/data.py | 14 +-- loopy/transform/diff.py | 2 +- loopy/transform/precompute.py | 12 +-- loopy/transform/save.py | 8 +- 27 files changed, 256 insertions(+), 208 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index eb43249a6..a5850ec0a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -44,8 +44,8 @@ from loopy.kernel.instruction import ( from loopy.kernel.data import ( auto, KernelArgument, - ValueArg, GlobalArg, ConstantArg, ImageArg, - temp_var_scope, TemporaryVariable, + ValueArg, ArrayArg, GlobalArg, ConstantArg, ImageArg, + temp_var_scope, TemporaryVariable, MemoryAddressSpace, SubstitutionRule, CallMangleInfo) from loopy.kernel.function_interface import ( @@ -169,8 +169,8 @@ __all__ = [ "ScalarCallable", "KernelArgument", - "ValueArg", "GlobalArg", "ConstantArg", "ImageArg", - "temp_var_scope", "TemporaryVariable", + "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", + "MemoryAddressSpace", "temp_var_scope", "TemporaryVariable", "SubstitutionRule", "CallMangleInfo", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index a91eb51a0..35a27fb0d 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -79,7 +79,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): import pyopencl as cl import pyopencl.array as cl_array - from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, \ + from loopy.kernel.data import ValueArg, ArrayArg, ImageArg, \ TemporaryVariable, ConstantArg from pymbolic import evaluate @@ -108,7 +108,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): ref_arg_data.append(None) - elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg \ + elif arg.arg_class is ArrayArg or arg.arg_class is ImageArg \ or arg.arg_class is ConstantArg: if arg.shape is None or any(saxis is None for saxis in arg.shape): raise LoopyError("array '%s' needs known shape to use automatic " @@ -185,7 +185,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): pass else: - raise LoopyError("arg type not understood") + raise LoopyError("arg type %s not understood" % type(arg)) return ref_args, ref_arg_data @@ -198,7 +198,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): import pyopencl as cl import pyopencl.array as cl_array - from loopy.kernel.data import ValueArg, GlobalArg, ImageArg,\ + from loopy.kernel.data import ValueArg, ArrayArg, ImageArg,\ TemporaryVariable, ConstantArg from pymbolic import evaluate @@ -232,7 +232,7 @@ def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): args[arg.name] = cl.image_from_array( queue.context, arg_desc.ref_pre_run_array.get()) - elif arg.arg_class is GlobalArg or\ + elif arg.arg_class is ArrayArg or\ arg.arg_class is ConstantArg: 
shape = evaluate(arg.unvec_shape, parameters) strides = evaluate(arg.unvec_strides, parameters) diff --git a/loopy/check.py b/loopy/check.py index b55b0cf99..744bc27aa 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -239,20 +239,20 @@ def check_for_inactive_iname_access(kernel): def _is_racing_iname_tag(tv, tag): - from loopy.kernel.data import (temp_var_scope, + from loopy.kernel.data import (MemoryAddressSpace, LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) - if tv.scope == temp_var_scope.PRIVATE: + if tv.scope == MemoryAddressSpace.PRIVATE: return ( isinstance(tag, ConcurrentTag) and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag))) - elif tv.scope == temp_var_scope.LOCAL: + elif tv.scope == MemoryAddressSpace.LOCAL: return ( isinstance(tag, ConcurrentTag) and not isinstance(tag, GroupIndexTag)) - elif tv.scope == temp_var_scope.GLOBAL: + elif tv.scope == MemoryAddressSpace.GLOBAL: return isinstance(tag, ConcurrentTag) elif tv.scope == auto: @@ -517,15 +517,15 @@ class IndirectDependencyEdgeFinder(object): def declares_nosync_with(kernel, var_scope, dep_a, dep_b): - from loopy.kernel.data import temp_var_scope - if var_scope == temp_var_scope.GLOBAL: + from loopy.kernel.data import MemoryAddressSpace + if var_scope == MemoryAddressSpace.GLOBAL: search_scopes = ["global", "any"] - elif var_scope == temp_var_scope.LOCAL: + elif var_scope == MemoryAddressSpace.LOCAL: search_scopes = ["local", "any"] - elif var_scope == temp_var_scope.PRIVATE: + elif var_scope == MemoryAddressSpace.PRIVATE: search_scopes = ["any"] else: - raise ValueError("unexpected value of 'temp_var_scope'") + raise ValueError("unexpected value of 'MemoryAddressSpace'") ab_nosync = False ba_nosync = False @@ -548,7 +548,7 @@ def _check_variable_access_ordered_inner(kernel): wmap = kernel.writer_map() rmap = kernel.reader_map() - from loopy.kernel.data import GlobalArg, ValueArg, temp_var_scope + from loopy.kernel.data import ValueArg, MemoryAddressSpace, ArrayArg from loopy.kernel.tools import find_aliasing_equivalence_classes depfind = IndirectDependencyEdgeFinder(kernel) @@ -574,10 +574,10 @@ def _check_variable_access_ordered_inner(kernel): scope = kernel.temporary_variables[name].scope else: arg = kernel.arg_dict[name] - if isinstance(arg, GlobalArg): - scope = temp_var_scope.GLOBAL + if isinstance(arg, ArrayArg): + scope = arg.memory_address_space elif isinstance(arg, ValueArg): - scope = temp_var_scope.PRIVATE + scope = MemoryAddressSpace.PRIVATE else: # No need to consider ConstantArg and ImageArg (for now) # because those won't be written. 
@@ -843,7 +843,7 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # {{{ check that temporaries are defined in subkernels where used def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace from loopy.kernel.tools import get_subkernels for subkernel in get_subkernels(kernel): @@ -874,7 +874,7 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): "aliases have a definition" % (temporary, subkernel)) continue - if tval.scope in (temp_var_scope.PRIVATE, temp_var_scope.LOCAL): + if tval.scope in (MemoryAddressSpace.PRIVATE, MemoryAddressSpace.LOCAL): from loopy.diagnostic import MissingDefinitionError raise MissingDefinitionError("temporary variable '%s' gets used " "in subkernel '%s' without a definition (maybe you forgot " diff --git a/loopy/cli.py b/loopy/cli.py index 060340d59..a92922b18 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -205,7 +205,7 @@ def main(): new_kernels = [] for kernel in kernels: new_args = [ - lp.GlobalArg("occa_info", np.int32, shape=None) + lp.ArrayArg("occa_info", np.int32, shape=None) ] + kernel.args new_kernels.append(kernel.copy(args=new_args)) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e3e209726..dd9cda618 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -63,7 +63,7 @@ def synthesize_idis_for_extra_args(kernel, schedule_index): sched_item = kernel.schedule[schedule_index] from loopy.codegen import ImplementedDataInfo - from loopy.kernel.data import InameArg, temp_var_scope + from loopy.kernel.data import InameArg, MemoryAddressSpace assert isinstance(sched_item, CallKernel) @@ -71,7 +71,7 @@ def synthesize_idis_for_extra_args(kernel, schedule_index): for arg in sched_item.extra_args: temporary = kernel.temporary_variables[arg] - assert temporary.scope == temp_var_scope.GLOBAL + assert temporary.scope == MemoryAddressSpace.GLOBAL idis.extend( temporary.decl_info( kernel.target, diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index bcbe41874..70415c333 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -679,7 +679,7 @@ class F2LoopyTranslator(FTreeWalkerBase): if dims is not None: # default order is set to "F" in kernel creation below kernel_data.append( - lp.GlobalArg( + lp.ArrayArg( arg_name, dtype=sub.get_type(arg_name), shape=sub.get_loopy_shape(arg_name), diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6ac773d29..9a4ea7027 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -873,17 +873,17 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def global_var_names(self): - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace - from loopy.kernel.data import GlobalArg + from loopy.kernel.data import ArrayArg return ( set( arg.name for arg in self.args - if isinstance(arg, GlobalArg)) + if isinstance(arg, ArrayArg)) | set( tv.name for tv in six.itervalues(self.temporary_variables) - if tv.scope == temp_var_scope.GLOBAL)) + if tv.scope == MemoryAddressSpace.GLOBAL)) # }}} @@ -1075,17 +1075,17 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def local_var_names(self): - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace return set( tv.name for tv in six.itervalues(self.temporary_variables) - if tv.scope == 
temp_var_scope.LOCAL) + if tv.scope == MemoryAddressSpace.LOCAL) def local_mem_use(self): - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace return sum( tv.nbytes for tv in six.itervalues(self.temporary_variables) - if tv.scope == temp_var_scope.LOCAL) + if tv.scope == MemoryAddressSpace.LOCAL) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 4fa7a643f..781d8b986 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1143,7 +1143,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() - from loopy.kernel.data import ValueArg, GlobalArg + from loopy.kernel.data import ValueArg, ArrayArg import loopy as lp if arg_name in self.all_params: @@ -1153,7 +1153,7 @@ class ArgumentGuesser: # It's not a temp var, and thereby not a domain parameter--the only # other writable type of variable is an argument. - return GlobalArg(arg_name, + return ArrayArg(arg_name, shape=lp.auto, offset=self.default_offset) irank = self.find_index_rank(arg_name) @@ -1161,7 +1161,7 @@ class ArgumentGuesser: # read-only, no indices return ValueArg(arg_name) else: - return GlobalArg(arg_name, shape=lp.auto, offset=self.default_offset) + return ArrayArg(arg_name, shape=lp.auto, offset=self.default_offset) def convert_names_to_full_args(self, kernel_args): new_kernel_args = [] @@ -2144,7 +2144,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): :arg kernel_data: - A list of :class:`ValueArg`, :class:`GlobalArg`, ... (etc.) instances. + A list of :class:`ValueArg`, :class:`ArrayArg`, ... (etc.) instances. The order of these arguments determines the order of the arguments to the generated kernel. @@ -2175,7 +2175,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): (name, c_name, arg_dtypes), generating extra entries for *preambles*. :arg default_order: "C" (default) or "F" :arg default_offset: 0 or :class:`loopy.auto`. The default value of - *offset* in :attr:`GlobalArg` for guessed arguments. + *offset* in :attr:`ArrayArg` for guessed arguments. Defaults to 0. :arg function_manglers: list of functions of signature ``(target, name, arg_dtypes)`` diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 0129b7ee6..db08de00a 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -207,6 +207,38 @@ def parse_tag(tag): # }}} +# {{{ memory address space + +class MemoryAddressSpace: + """ + Storage location of a variable. + + .. attribute:: PRIVATE + .. attribute:: LOCAL + .. attribute:: GLOBAL + """ + + # These must occur in ascending order of 'globality' so that + # max(scope) does the right thing. + + PRIVATE = 0 + LOCAL = 1 + GLOBAL = 2 + + @classmethod + def stringify(cls, val): + if val == cls.PRIVATE: + return "private" + elif val == cls.LOCAL: + return "local" + elif val == cls.GLOBAL: + return "global" + else: + raise ValueError("unexpected value of MemoryAddressScope") + +# }}} + + # {{{ arguments class KernelArgument(ImmutableRecord): @@ -236,14 +268,34 @@ class KernelArgument(ImmutableRecord): ImmutableRecord.__init__(self, **kwargs) -class GlobalArg(ArrayBase, KernelArgument): +class ArrayArg(ArrayBase, KernelArgument): + + allowed_extra_kwargs = [ + "memory_address_space"] + + def __init__(self, *args, **kwargs): + # Defaulting the memory_address_space to be GLOBAL. 
+ kwargs["memory_address_space"] = kwargs.pop( + "memory_address_space", MemoryAddressSpace.GLOBAL) + + super(ArrayArg, self).__init__(*args, **kwargs) + __doc__ = ArrayBase.__doc__ min_target_axes = 0 max_target_axes = 1 def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): - return ast_builder.get_global_arg_decl(self.name + name_suffix, shape, - dtype, is_written) + return ast_builder.get_array_arg_decl(self.name + name_suffix, + self.memory_address_space, shape, dtype, is_written) + + +class GlobalArg(ArrayBase, KernelArgument): + def __new__(cls, *args, **kwargs): + from warnings import warn + warn("Use of 'GlobalArg' is deprecated use 'ArrayArg' instead.", + DeprecationWarning, stacklevel=2) + + return ArrayArg(*args, **kwargs) class ConstantArg(ArrayBase, KernelArgument): @@ -310,44 +362,14 @@ class InameArg(ValueArg): # }}} -# {{{ memory address space - -class mem_address_space: # noqa - """Storage location of a variable. - - .. attribute:: PRIVATE - .. attribute:: LOCAL - .. attribute:: GLOBAL - """ - - # These must occur in ascending order of 'globality' so that - # max(scope) does the right thing. - - PRIVATE = 0 - LOCAL = 1 - GLOBAL = 2 - - @classmethod - def stringify(cls, val): - if val == cls.PRIVATE: - return "private" - elif val == cls.LOCAL: - return "local" - elif val == cls.GLOBAL: - return "global" - else: - raise ValueError("unexpected value of mem_address_space.") - -# }}} - - # {{{ temporary variable class _deprecated_temp_var_scope_property(property): # noqa def __get__(self, cls, owner): from warnings import warn - warn("'temp_var_scope' is deprecated. Use 'mem_address_space'.", + warn("'temp_var_scope' is deprecated. Use 'MemoryAddressSpace'.", DeprecationWarning, stacklevel=2) + return classmethod(self.fget).__get__(None, owner)() class temp_var_scope: # noqa @@ -356,22 +378,22 @@ class temp_var_scope: # noqa @_deprecated_temp_var_scope_property def PRIVATE(self): - return mem_address_space.PRIVATE + return MemoryAddressSpace.PRIVATE @_deprecated_temp_var_scope_property def LOCAL(self): - return mem_address_space.LOCAL + return MemoryAddressSpace.LOCAL @_deprecated_temp_var_scope_property def GLOBAL(self): - return mem_address_space.GLOBAL + return MemoryAddressSpace.GLOBAL @classmethod def stringify(cls, val): from warnings import warn - warn("'temp_var_scope' is deprecated. Use 'mem_address_space'.", + warn("'temp_var_scope' is deprecated. Use 'MemoryAddressSpace'.", DeprecationWarning, stacklevel=2) - return mem_address_space.stringify(cls, val) + return MemoryAddressSpace.stringify class TemporaryVariable(ArrayBase): @@ -381,7 +403,7 @@ class TemporaryVariable(ArrayBase): .. attribute:: scope What memory this temporary variable lives in. - One of the values in :class:`temp_var_scope`, + One of the values in :class:`MemoryAddressSpace`, or :class:`loopy.auto` if this is to be automatically determined. @@ -393,7 +415,7 @@ class TemporaryVariable(ArrayBase): .. attribute:: scope - One of :class:`temp_var_scope`. + One of :class:`MemoryAddressSpace`. .. 
attribute:: initializer @@ -509,15 +531,15 @@ class TemporaryVariable(ArrayBase): @property def is_local(self): - """One of :class:`loopy.temp_var_scope`.""" + """One of :class:`loopy.MemoryAddressSpace`.""" if self.scope is auto: return auto - elif self.scope == temp_var_scope.LOCAL: + elif self.scope == MemoryAddressSpace.LOCAL: return True - elif self.scope == temp_var_scope.PRIVATE: + elif self.scope == MemoryAddressSpace.PRIVATE: return False - elif self.scope == temp_var_scope.GLOBAL: + elif self.scope == MemoryAddressSpace.GLOBAL: raise LoopyError("TemporaryVariable.is_local called on " "global temporary variable '%s'" % self.name) else: @@ -538,7 +560,7 @@ class TemporaryVariable(ArrayBase): shape_override=self.storage_shape) def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): - if self.scope == temp_var_scope.GLOBAL: + if self.scope == MemoryAddressSpace.GLOBAL: return ast_builder.get_global_arg_decl(self.name + name_suffix, shape, dtype, is_written) else: @@ -549,7 +571,7 @@ class TemporaryVariable(ArrayBase): if self.scope is auto: scope_str = "auto" else: - scope_str = temp_var_scope.stringify(self.scope) + scope_str = MemoryAddressSpace.stringify(self.scope) return ( self.stringify(include_typename=False) @@ -598,11 +620,11 @@ def iname_tag_to_temp_var_scope(iname_tag): iname_tag = parse_tag(iname_tag) if isinstance(iname_tag, GroupIndexTag): - return temp_var_scope.GLOBAL + return MemoryAddressSpace.GLOBAL elif isinstance(iname_tag, LocalIndexTag): - return temp_var_scope.LOCAL + return MemoryAddressSpace.LOCAL else: - return temp_var_scope.PRIVATE + return MemoryAddressSpace.PRIVATE # {{{ substitution rule diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index ed79f092d..e755cb6c4 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -73,7 +73,6 @@ class ArrayArgDescriptor(ImmutableRecord): from loopy.kernel.array import FixedStrideArrayDimTag assert isinstance(shape, tuple) - assert isinstance(mem_scope, str) assert isinstance(dim_tags, tuple) assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in dim_tags) @@ -522,16 +521,17 @@ class CallableKernel(InKernelCallable): if isinstance(id, str): id = kw_to_pos[id] assert isinstance(id, int) + if isinstance(descr, ArrayArgDescriptor): new_args[id] = new_args[id].copy(shape=descr.shape, - dim_tags=descr.dim_tags) + dim_tags=descr.dim_tags, + memory_address_space=descr.mem_scope) elif isinstance(descr, ValueArgDescriptor): pass else: raise LoopyError("Descriptor must be either an instance of " "ArrayArgDescriptor or ValueArgDescriptor -- got %s." 
% type(descr)) - descriptor_specialized_knl = self.subkernel.copy(args=new_args) return self.copy(subkernel=descriptor_specialized_knl, diff --git a/loopy/preprocess.py b/loopy/preprocess.py index bd0d871f1..48651b777 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -160,7 +160,7 @@ def find_temporary_scope(kernel): new_temp_vars = {} from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag, - temp_var_scope) + MemoryAddressSpace) import loopy as lp writers = kernel.writer_map() @@ -221,12 +221,12 @@ def find_temporary_scope(kernel): assert locparallel_assignee_inames <= locparallel_compute_inames assert grpparallel_assignee_inames <= grpparallel_compute_inames - desired_scope = temp_var_scope.PRIVATE + desired_scope = MemoryAddressSpace.PRIVATE for iname_descr, scope_descr, apin, cpin, scope in [ ("local", "local", locparallel_assignee_inames, - locparallel_compute_inames, temp_var_scope.LOCAL), + locparallel_compute_inames, MemoryAddressSpace.LOCAL), ("group", "global", grpparallel_assignee_inames, - grpparallel_compute_inames, temp_var_scope.GLOBAL), + grpparallel_compute_inames, MemoryAddressSpace.GLOBAL), ]: if (apin != cpin and bool(apin)): @@ -774,7 +774,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): last_added_insn_id = insn.id - from loopy.kernel.data import temp_var_scope, TemporaryVariable + from loopy.kernel.data import MemoryAddressSpace, TemporaryVariable FIRST_POINTER_ASSIGNEE_IDX = 1 # noqa @@ -787,7 +787,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): assignee_var_name in kernel.temporary_variables and (kernel.temporary_variables[assignee_var_name].scope - == temp_var_scope.PRIVATE)): + == MemoryAddressSpace.PRIVATE)): new_assignees.append(assignee) continue @@ -809,7 +809,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): TemporaryVariable( name=new_assignee_name, dtype=None, - scope=temp_var_scope.PRIVATE)) + scope=MemoryAddressSpace.PRIVATE)) from pymbolic import var new_assignee = var(new_assignee_name) @@ -990,12 +990,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for i in range(nresults)] for name in temp_var_names: - from loopy.kernel.data import TemporaryVariable, temp_var_scope + from loopy.kernel.data import TemporaryVariable, MemoryAddressSpace new_temporary_variables[name] = TemporaryVariable( name=name, shape=(), dtype=None, - scope=temp_var_scope.PRIVATE) + scope=MemoryAddressSpace.PRIVATE) from pymbolic import var temp_vars = tuple(var(n) for n in temp_var_names) @@ -1021,13 +1021,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace acc_var_names = make_temporaries( name_based_on="acc_"+"_".join(expr.inames), nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=temp_var_scope.PRIVATE) + scope=MemoryAddressSpace.PRIVATE) init_insn_depends_on = frozenset() @@ -1159,21 +1159,21 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _get_int_iname_size(oiname) for oiname in outer_local_inames) - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace neutral_var_names = make_temporaries( name_based_on="neutral_"+red_iname, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=temp_var_scope.PRIVATE) + scope=MemoryAddressSpace.PRIVATE) 
acc_var_names = make_temporaries( name_based_on="acc_"+red_iname, nvars=nresults, shape=outer_local_iname_sizes + (size,), dtypes=reduction_dtypes, - scope=temp_var_scope.LOCAL) + scope=MemoryAddressSpace.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) @@ -1393,13 +1393,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, track_iname) - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace acc_var_names = make_temporaries( name_based_on="acc_" + scan_iname, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=temp_var_scope.PRIVATE) + scope=MemoryAddressSpace.PRIVATE) from pymbolic import var acc_vars = tuple(var(n) for n in acc_var_names) @@ -1518,21 +1518,21 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # }}} - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace read_var_names = make_temporaries( name_based_on="read_"+scan_iname+"_arg_{index}", nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=temp_var_scope.PRIVATE) + scope=MemoryAddressSpace.PRIVATE) acc_var_names = make_temporaries( name_based_on="acc_"+scan_iname, nvars=nresults, shape=outer_local_iname_sizes + (scan_size,), dtypes=reduction_dtypes, - scope=temp_var_scope.LOCAL) + scope=MemoryAddressSpace.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) read_vars = tuple(var(n) for n in read_var_names) @@ -2113,23 +2113,17 @@ def get_arg_description_from_sub_array_ref(sub_array, kernel): :class:`ArrayArgDescriptor`. """ from loopy.kernel.function_interface import ArrayArgDescriptor - from loopy.kernel.data import mem_address_space name = sub_array.subscript.aggregate.name if name in kernel.temporary_variables: arg = kernel.temporary_variables[name] - # FIXME: This is temporary change them back to the necessary ones. - # mem_scope = arg.mem_scope - mem_scope = 'Local' + mem_scope = arg.scope assert name not in kernel.arg_dict else: assert name in kernel.arg_dict - # FIXME: This is just temporary, change them back to the needed - # changes. - # mem_scope = kernel.arg_dict[name].mem_scope - mem_scope = 'Global' arg = kernel.arg_dict[name] + mem_scope = arg.memory_address_space sub_dim_tags, sub_shape = sub_array.get_sub_array_dim_tags_and_shape( arg.dim_tags, arg.shape) @@ -2140,8 +2134,9 @@ def get_arg_description_from_sub_array_ref(sub_array, kernel): class ArgDescrInferenceMapper(CombineMapper): - """ Returns a set with elements as instances of :class:`tuple` (expr, - in_kenrel_callable). The mapped `in_kenrel_callable` of the + """ + Returns a set of instances of :class:`tuple` (expr, + in_kernel_callable). The mapped `in_kernel_callable` of the :class:`InKernelCallable` are descriptor specialized for the given arguments. """ @@ -2359,8 +2354,8 @@ def make_functions_ready_for_codegen(kernel): knl = lp.make_kernel( "{[i]: 0<=i<16}", "a[i] = sin(b[i])", - [lp.GlobalArg('a', dtype=np.float64), - lp.GlobalArg('b', dtype=np.float64)]) + [lp.ArrayArg('a', dtype=np.float64), + lp.ArrayArg('b', dtype=np.float64)]) In the above case, none of the instructions undergo type-specialization, as all the arguments' types have been realized. But, this would be a problem @@ -2470,10 +2465,6 @@ def preprocess_kernel(kernel, device=None): # defaults from being applied. kernel = realize_reduction(kernel, unknown_types_ok=False) - # inferring the shape and dim_tags of the arguments involved in a function - # call. 
- kernel = infer_arg_descr(kernel) - # type specialize functions that were missed during the type inference. kernel = make_functions_ready_for_codegen(kernel) @@ -2486,6 +2477,10 @@ def preprocess_kernel(kernel, device=None): kernel = find_temporary_scope(kernel) + # inferring the shape and dim_tags of the arguments involved in a function + # call. + kernel = infer_arg_descr(kernel) + # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index f9b08d343..00c2df142 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -22,7 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from loopy.kernel.data import temp_var_scope +from loopy.kernel.data import MemoryAddressSpace # {{{ block boundary finder @@ -91,7 +91,7 @@ def add_extra_args_to_schedule(kernel): more_args = set(tv for tv in used_temporaries if - kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL + kernel.temporary_variables[tv].scope == MemoryAddressSpace.GLOBAL and kernel.temporary_variables[tv].initializer is None and diff --git a/loopy/statistics.py b/loopy/statistics.py index 0bf227617..5cebbee3c 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -30,7 +30,7 @@ import islpy as isl from pymbolic.mapper import CombineMapper from functools import reduce from loopy.kernel.data import ( - MultiAssignmentBase, TemporaryVariable, temp_var_scope) + MultiAssignmentBase, TemporaryVariable, MemoryAddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from pytools import Record @@ -848,7 +848,7 @@ class LocalMemAccessCounter(MemAccessCounter): if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( - array.scope == temp_var_scope.LOCAL): + array.scope == MemoryAddressSpace.LOCAL): sub_map[MemAccess(mtype='local', dtype=dtype, count_granularity=CountGranularity.WORKITEM)] = 1 return sub_map @@ -880,7 +880,7 @@ class GlobalMemAccessCounter(MemAccessCounter): # this is a temporary variable return ToCountMap() - if not isinstance(array, lp.GlobalArg): + if not isinstance(array, lp.ArrayArg): # this array is not in global memory return ToCountMap() @@ -899,7 +899,7 @@ class GlobalMemAccessCounter(MemAccessCounter): # this is a temporary variable return self.rec(expr.index) - if not isinstance(array, lp.GlobalArg): + if not isinstance(array, lp.ArrayArg): # this array is not in global memory return self.rec(expr.index) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 9ce9f04bf..88f780304 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -497,7 +497,7 @@ class CASTBuilder(ASTBuilderBase): result = [] - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace from loopy.schedule import CallKernel # We only need to write declarations for global variables with # the first device program. 
`is_first_dev_prog` determines @@ -512,7 +512,7 @@ class CASTBuilder(ASTBuilderBase): six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): - if tv.scope == temp_var_scope.GLOBAL and tv.initializer is not None: + if tv.scope == MemoryAddressSpace.GLOBAL and tv.initializer is not None: assert tv.read_only decl_info, = tv.decl_info(self.target, @@ -573,7 +573,7 @@ class CASTBuilder(ASTBuilderBase): return None def get_temporary_decls(self, codegen_state, schedule_index): - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace kernel = codegen_state.kernel @@ -605,7 +605,7 @@ class CASTBuilder(ASTBuilderBase): if not tv.base_storage: for idi in decl_info: # global temp vars are mapped to arguments or global declarations - if tv.scope != temp_var_scope.GLOBAL and ( + if tv.scope != MemoryAddressSpace.GLOBAL and ( tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( @@ -770,7 +770,7 @@ class CASTBuilder(ASTBuilderBase): return result - def get_global_arg_decl(self, name, shape, dtype, is_written): + def get_array_arg_decl(self, name, shape, dtype, is_written): from cgen import RestrictPointer, Const arg_decl = RestrictPointer(POD(self, dtype, name)) @@ -780,6 +780,8 @@ class CASTBuilder(ASTBuilderBase): return arg_decl + get_global_arg_decl = get_array_arg_decl + def get_constant_arg_decl(self, name, shape, dtype, is_written): from loopy.target.c import POD # uses the correct complex type from cgen import RestrictPointer, Const diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 385d10c4e..9f55ce851 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -198,7 +198,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state.vectorization_info) from loopy.kernel.data import ( - ImageArg, GlobalArg, TemporaryVariable, ConstantArg) + ImageArg, ArrayArg, TemporaryVariable, ConstantArg) if isinstance(ary, ImageArg): extra_axes = 0 @@ -231,10 +231,10 @@ class ExpressionToCExpressionMapper(IdentityMapper): raise NotImplementedError( "non-floating-point images not supported for now") - elif isinstance(ary, (GlobalArg, TemporaryVariable, ConstantArg)): + elif isinstance(ary, (ArrayArg, TemporaryVariable, ConstantArg)): if len(access_info.subscripts) == 0: if ( - (isinstance(ary, (ConstantArg, GlobalArg)) or + (isinstance(ary, (ConstantArg, ArrayArg)) or (isinstance(ary, TemporaryVariable) and ary.base_storage))): # unsubscripted global args are pointers result = make_var(access_info.array_name)[0] diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 4265716ad..6340bec92 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -32,7 +32,7 @@ from loopy.target.c import CTarget, CASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.kernel.data import temp_var_scope +from loopy.kernel.data import MemoryAddressSpace from pymbolic import var from loopy.kernel.function_interface import ScalarCallable @@ -351,10 +351,10 @@ class CUDACASTBuilder(CASTBuilder): raise LoopyError("unknown barrier kind") def wrap_temporary_decl(self, decl, scope): - if scope == temp_var_scope.LOCAL: + if scope == MemoryAddressSpace.LOCAL: from cgen.cuda import CudaShared return CudaShared(decl) - elif scope == temp_var_scope.PRIVATE: + elif scope == MemoryAddressSpace.PRIVATE: return decl else: raise 
ValueError("unexpected temporary variable scope: %s" @@ -364,7 +364,7 @@ class CUDACASTBuilder(CASTBuilder): from cgen.cuda import CudaConstant return CudaConstant(decl) - def get_global_arg_decl(self, name, shape, dtype, is_written): + def get_array_arg_decl(self, name, shape, dtype, is_written): from loopy.target.c import POD # uses the correct complex type from cgen import Const from cgen.cuda import CudaRestrictPointer @@ -376,6 +376,8 @@ class CUDACASTBuilder(CASTBuilder): return arg_decl + get_global_arg_decl = get_array_arg_decl + def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError("not yet: texture arguments in CUDA") diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3a3ea0a70..b3b1ef7b9 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -150,14 +150,14 @@ class ExecutionWrapperGeneratorBase(object): # returning the desired integer argument. iarg_to_sources = {} - from loopy.kernel.data import GlobalArg + from loopy.kernel.data import ArrayArg from loopy.symbolic import DependencyMapper, StringifyMapper from loopy.diagnostic import ParameterFinderWarning dep_map = DependencyMapper() from pymbolic import var for arg in implemented_data_info: - if arg.arg_class is GlobalArg: + if arg.arg_class is ArrayArg: sym_shape = var(arg.name).attr("shape") for axis_nr, shape_i in enumerate(arg.shape): if shape_i is None: @@ -432,7 +432,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ allocate written arrays, if needed - if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ + if is_written and arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \ and arg.shape is not None \ and all(si is not None for si in arg.shape): @@ -455,7 +455,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ argument checking - if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ + if arg.arg_class in [lp.ArrayArg, lp.ConstantArg] \ and not options.skip_arg_checks: if possibly_made_by_loopy: gen("if not _lpy_made_by_loopy:") @@ -568,7 +568,7 @@ class ExecutionWrapperGeneratorBase(object): gen("del _lpy_made_by_loopy") gen("") - if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: + if arg.arg_class in [lp.ArrayArg, lp.ConstantArg]: args.append(self.get_arg_pass(arg)) else: args.append("%s" % arg.name) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 45a59847b..583da7dee 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -32,7 +32,7 @@ from loopy.diagnostic import LoopyError from loopy.symbolic import Literal from pymbolic import var import pymbolic.primitives as p -from loopy.kernel.data import temp_var_scope +from loopy.kernel.data import MemoryAddressSpace from pymbolic.mapper.stringifier import PREC_NONE from pytools import memoize_method @@ -82,7 +82,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): def map_variable(self, expr, type_context): tv = self.kernel.temporary_variables.get(expr.name) - if tv is not None and tv.scope == temp_var_scope.PRIVATE: + if tv is not None and tv.scope == MemoryAddressSpace.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? 
(See also # below in decl generation) @@ -102,7 +102,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): ary = self.find_array(expr) if (isinstance(ary, TemporaryVariable) - and ary.scope == temp_var_scope.PRIVATE): + and ary.scope == MemoryAddressSpace.PRIVATE): # generate access code for acccess to private-index temporaries gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() @@ -308,7 +308,7 @@ class ISPCASTBuilder(CASTBuilder): shape = decl_info.shape - if temp_var.scope == temp_var_scope.PRIVATE: + if temp_var.scope == MemoryAddressSpace.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also # above in expr to code mapper) @@ -329,7 +329,7 @@ class ISPCASTBuilder(CASTBuilder): from cgen.ispc import ISPCUniform return ISPCUniform(decl) - def get_global_arg_decl(self, name, shape, dtype, is_written): + def get_array_arg_decl(self, name, shape, dtype, is_written): from loopy.target.c import POD # uses the correct complex type from cgen import Const from cgen.ispc import ISPCUniformPointer, ISPCUniform @@ -343,6 +343,8 @@ class ISPCASTBuilder(CASTBuilder): return arg_decl + get_global_arg_decl = get_array_arg_decl + def get_value_arg_decl(self, name, shape, dtype, is_written): result = super(ISPCASTBuilder, self).get_value_arg_decl( name, shape, dtype, is_written) @@ -400,9 +402,9 @@ class ISPCASTBuilder(CASTBuilder): lambda expr: evaluate(expr, self.codegen_state.var_subst_map), codegen_state.vectorization_info) - from loopy.kernel.data import GlobalArg, TemporaryVariable + from loopy.kernel.data import ArrayArg, TemporaryVariable - if not isinstance(ary, (GlobalArg, TemporaryVariable)): + if not isinstance(ary, (ArrayArg, TemporaryVariable)): raise LoopyError("array type not supported in ISPC: %s" % type(ary).__name) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 4366b08ef..d849e7223 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -32,7 +32,7 @@ from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper -from loopy.kernel.data import temp_var_scope +from loopy.kernel.data import MemoryAddressSpace from loopy.kernel.function_interface import ScalarCallable from pymbolic import var @@ -517,10 +517,10 @@ class OpenCLCASTBuilder(CASTBuilder): raise LoopyError("unknown barrier kind") def wrap_temporary_decl(self, decl, scope): - if scope == temp_var_scope.LOCAL: + if scope == MemoryAddressSpace.LOCAL: from cgen.opencl import CLLocal return CLLocal(decl) - elif scope == temp_var_scope.PRIVATE: + elif scope == MemoryAddressSpace.PRIVATE: return decl else: raise ValueError("unexpected temporary variable scope: %s" @@ -530,11 +530,28 @@ class OpenCLCASTBuilder(CASTBuilder): from cgen.opencl import CLConstant return CLConstant(decl) + def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): + from cgen.opencl import CLGlobal, CLLocal + from loopy.kernel.data import MemoryAddressSpace + + if mem_address_space == MemoryAddressSpace.LOCAL: + return CLLocal(super(OpenCLCASTBuilder, self).get_array_arg_decl( + name, shape, dtype, is_written)) + elif mem_address_space == MemoryAddressSpace.PRIVATE: + return super(OpenCLCASTBuilder, self).get_array_arg_decl( + name, shape, dtype, is_written) + elif mem_address_space == MemoryAddressSpace.GLOBAL: + return CLGlobal(super(OpenCLCASTBuilder, self).get_array_arg_decl( + name, shape, dtype, is_written)) + else: + 
raise ValueError("unexpected array argument scope: %s" + % mem_address_space) + def get_global_arg_decl(self, name, shape, dtype, is_written): - from cgen.opencl import CLGlobal + from loopy.kernel.data import MemoryAddressSpace - return CLGlobal(super(OpenCLCASTBuilder, self).get_global_arg_decl( - name, shape, dtype, is_written)) + return self.get_array_arg_decl(name, MemoryAddressSpace.GLOBAL, shape, + dtype, is_written) def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): if is_written: @@ -585,7 +602,7 @@ class OpenCLCASTBuilder(CASTBuilder): old_val_var = codegen_state.var_name_generator("loopy_old_val") new_val_var = codegen_state.var_name_generator("loopy_new_val") - from loopy.kernel.data import TemporaryVariable, temp_var_scope + from loopy.kernel.data import TemporaryVariable, MemoryAddressSpace ecm = codegen_state.expression_to_code_mapper.with_assignments( { old_val_var: TemporaryVariable(old_val_var, lhs_dtype), @@ -623,16 +640,24 @@ class OpenCLCASTBuilder(CASTBuilder): else: assert False - from loopy.kernel.data import TemporaryVariable, GlobalArg - if isinstance(lhs_var, GlobalArg): + from loopy.kernel.data import (TemporaryVariable, ArrayArg) + if ( + isinstance(lhs_var, ArrayArg) + and + lhs_var.memory_address_space == MemoryAddressSpace.GLOBAL): var_kind = "__global" + elif ( + isinstance(lhs_var, ArrayArg) + and + lhs_var.memory_address_space == MemoryAddressSpace.LOCAL): + var_kind = "__local" elif ( isinstance(lhs_var, TemporaryVariable) - and lhs_var.scope == temp_var_scope.LOCAL): + and lhs_var.scope == MemoryAddressSpace.LOCAL): var_kind = "__local" elif ( isinstance(lhs_var, TemporaryVariable) - and lhs_var.scope == temp_var_scope.GLOBAL): + and lhs_var.scope == MemoryAddressSpace.GLOBAL): var_kind = "__global" else: raise LoopyError("unexpected kind of variable '%s' in " diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index bae98d14a..fe2f15b67 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -52,11 +52,11 @@ def adjust_local_temp_var_storage(kernel, device): new_temp_vars = {} - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace lmem_size = cl_char.usable_local_mem_size(device) for temp_var in six.itervalues(kernel.temporary_variables): - if temp_var.scope != temp_var_scope.LOCAL: + if temp_var.scope != MemoryAddressSpace.LOCAL: new_temp_vars[temp_var.name] = \ temp_var.copy(storage_shape=temp_var.shape) continue @@ -69,7 +69,7 @@ def adjust_local_temp_var_storage(kernel, device): other_loctemp_nbytes = [ tv.nbytes for tv in six.itervalues(kernel.temporary_variables) - if tv.scope == temp_var_scope.LOCAL + if tv.scope == MemoryAddressSpace.LOCAL and tv.name != temp_var.name] storage_shape = temp_var.storage_shape @@ -698,11 +698,11 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): from operator import mul return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace global_temporaries = sorted( (tv for tv in six.itervalues(codegen_state.kernel.temporary_variables) - if tv.scope == temp_var_scope.GLOBAL), + if tv.scope == MemoryAddressSpace.GLOBAL), key=lambda tv: tv.name) from pymbolic.mapper.stringifier import PREC_NONE diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index bef3152d0..29249e5f4 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -160,9 +160,9 @@ class 
PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): """) gen("") - from loopy.kernel.data import GlobalArg + from loopy.kernel.data import ArrayArg for arg in implemented_data_info: - if issubclass(arg.arg_class, GlobalArg): + if issubclass(arg.arg_class, ArrayArg): gen( "wait_for.extend({arg_name}.events)" .format(arg_name=arg.name)) @@ -179,9 +179,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if kernel.options.cl_exec_manage_array_events: gen("") - from loopy.kernel.data import GlobalArg + from loopy.kernel.data import ArrayArg for arg in implemented_data_info: - if (issubclass(arg.arg_class, GlobalArg) + if (issubclass(arg.arg_class, ArrayArg) and arg.base_name in kernel.get_written_variables()): gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 7e6b03581..b576e539e 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -26,7 +26,7 @@ THE SOFTWARE. import six from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) -from loopy.kernel.data import ValueArg, GlobalArg +from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl __doc__ = """ @@ -39,14 +39,14 @@ __doc__ = """ # {{{ to_batched def temp_needs_batching_if_not_sequential(tv, batch_varying_args): - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace if tv.name in batch_varying_args: return True if tv.initializer is not None and tv.read_only: # do not batch read_only temps if not in # `batch_varying_args` return False - if tv.scope == temp_var_scope.PRIVATE: + if tv.scope == MemoryAddressSpace.PRIVATE: # do not batch private temps if not in `batch_varying args` return False return True @@ -147,7 +147,7 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", for arg in knl.args: if arg.name in batch_varying_args: if isinstance(arg, ValueArg): - arg = GlobalArg(arg.name, arg.dtype, shape=(nbatches_expr,), + arg = ArrayArg(arg.name, arg.dtype, shape=(nbatches_expr,), dim_tags="c") else: arg = arg.copy( diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 1b059b6a7..058919a77 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -137,7 +137,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable - :class:`loopy.temp_var_scope` and shape is created. + :class:`loopy.MemoryAddressSpace` and shape is created. By default, the value of the buffered cells in *var_name* are read prior to any (read/write) use, and the modified values are written out after use has @@ -159,8 +159,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, :arg within: If not None, limit the action of the transformation to matching contexts. See :func:`loopy.match.parse_stack_match` for syntax. - :arg temp_var_scope: If given, override the choice of :class:`temp_var_scope` - for the created temporary. + :arg temporary_scope: If given, override the choice of + :class:`MemoryAddressSpace` for the created temporary. 
:arg default_tag: The default :ref:`iname-tags` to be assigned to the inames used for fetching and storing :arg fetch_bounding_box: If the access footprint is non-convex @@ -171,7 +171,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, # {{{ unify temporary_scope / temporary_is_local - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace if temporary_is_local is not None: from warnings import warn warn("temporary_is_local is deprecated. Use temporary_scope instead", @@ -182,9 +182,9 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, "temporary_scope") if temporary_is_local: - temporary_scope = temp_var_scope.LOCAL + temporary_scope = MemoryAddressSpace.LOCAL else: - temporary_scope = temp_var_scope.PRIVATE + temporary_scope = MemoryAddressSpace.PRIVATE del temporary_is_local diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 575311b11..a1ad951be 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -175,7 +175,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, :arg rule_name: base name of the generated temporary variable. :arg temporary_name: The name of the temporary to be used. - :arg temporary_scope: The :class:`temp_var_scope` to use for the + :arg temporary_scope: The :class:`MemoryAddressSpace` to use for the temporary. :arg temporary_is_local: Deprecated, use *temporary_scope* instead. :arg footprint_subscripts: A list of tuples indicating the index (i.e. @@ -647,24 +647,24 @@ def set_temporary_scope(kernel, temp_var_names, scope): :arg temp_var_names: a container with membership checking, or a comma-separated string of variables for which the scope is to be set. - :arg scope: One of the values from :class:`temp_var_scope`, or one + :arg scope: One of the values from :class:`MemoryAddressSpace`, or one of the strings ``"private"``, ``"local"``, or ``"global"``. """ if isinstance(temp_var_names, str): temp_var_names = [s.strip() for s in temp_var_names.split(",")] - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace if isinstance(scope, str): try: - scope = getattr(temp_var_scope, scope.upper()) + scope = getattr(MemoryAddressSpace, scope.upper()) except AttributeError: raise LoopyError("scope '%s' unknown" % scope) if not isinstance(scope, int) or scope not in [ - temp_var_scope.PRIVATE, - temp_var_scope.LOCAL, - temp_var_scope.GLOBAL]: + MemoryAddressSpace.PRIVATE, + MemoryAddressSpace.LOCAL, + MemoryAddressSpace.GLOBAL]: raise LoopyError("invalid scope '%s'" % scope) new_temp_vars = kernel.temporary_variables.copy() diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d0edcfd78..f1a015413 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -336,7 +336,7 @@ class DifferentiationContext(object): if var_name in self.kernel.arg_dict: self.new_args.append( - lp.GlobalArg( + lp.ArrayArg( new_var_name, arg.dtype, shape=shape, diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 4755ca177..82d2d3b34 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -341,7 +341,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # {{{ unify temporary_scope / temporary_is_local - from loopy.kernel.data import temp_var_scope + from loopy.kernel.data import MemoryAddressSpace if temporary_is_local is not None: from warnings import warn warn("temporary_is_local is deprecated. 
Use temporary_scope instead", @@ -352,9 +352,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, "temporary_scope") if temporary_is_local: - temporary_scope = temp_var_scope.LOCAL + temporary_scope = MemoryAddressSpace.LOCAL else: - temporary_scope = temp_var_scope.PRIVATE + temporary_scope = MemoryAddressSpace.PRIVATE del temporary_is_local @@ -804,7 +804,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, compute_dep_id = compute_insn_id added_compute_insns = [compute_insn] - if temporary_scope == temp_var_scope.GLOBAL: + if temporary_scope == MemoryAddressSpace.GLOBAL: barrier_insn_id = kernel.make_unique_instruction_id( based_on=c_subst_name+"_barrier") from loopy.kernel.instruction import BarrierInstruction @@ -976,8 +976,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, raise LoopyError("Existing and new temporary '%s' do not " "have matching scopes (existing: %s, new: %s)" % (temporary_name, - temp_var_scope.stringify(temp_var.scope), - temp_var_scope.stringify(temporary_scope))) + MemoryAddressSpace.stringify(temp_var.scope), + MemoryAddressSpace.stringify(temporary_scope))) temp_var = temp_var.copy(scope=temporary_scope) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index e3d8368a7..2ac84a681 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -27,7 +27,7 @@ from loopy.diagnostic import LoopyError import loopy as lp import six -from loopy.kernel.data import auto, temp_var_scope +from loopy.kernel.data import auto, MemoryAddressSpace from pytools import memoize_method, Record from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, @@ -228,7 +228,7 @@ class TemporarySaver(object): return TemporaryVariable( name=self.name, dtype=temporary.dtype, - scope=temp_var_scope.GLOBAL, + scope=MemoryAddressSpace.GLOBAL, shape=self.new_shape) @property @@ -439,7 +439,7 @@ class TemporarySaver(object): group_sizes, local_sizes = ( self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) - if temporary.scope == lp.temp_var_scope.LOCAL: + if temporary.scope == lp.MemoryAddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. del local_tags[:] local_sizes = () @@ -452,7 +452,7 @@ class TemporarySaver(object): def auto_promote_temporary(self, temporary_name): temporary = self.kernel.temporary_variables[temporary_name] - if temporary.scope == temp_var_scope.GLOBAL: + if temporary.scope == MemoryAddressSpace.GLOBAL: # Nothing to be done for global temporaries (I hope) return None -- GitLab From c8d56ebd4484e2a3564c5a8857d456ce8bf8bd9c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 27 Apr 2018 07:41:10 -0500 Subject: [PATCH 130/916] Resolve Flake8 errors. 
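
(An aside on the scope-enum rename finished just above -- a minimal usage
sketch, not part of this commit. It assumes a kernel object ``knl`` that
contains a temporary named ``tmp``; both imports refer to the modules touched
by the preceding patches.)

    from loopy.transform.data import set_temporary_scope
    from loopy.kernel.data import MemoryAddressSpace

    # the string spelling and the enum spelling are interchangeable
    knl = set_temporary_scope(knl, "tmp", "local")
    knl = set_temporary_scope(knl, "tmp", MemoryAddressSpace.LOCAL)
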
--- loopy/target/c/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 88f780304..b5b9bb542 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -512,7 +512,8 @@ class CASTBuilder(ASTBuilderBase): six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): - if tv.scope == MemoryAddressSpace.GLOBAL and tv.initializer is not None: + if tv.scope == MemoryAddressSpace.GLOBAL and ( + tv.initializer is not None): assert tv.read_only decl_info, = tv.decl_info(self.target, -- GitLab From 3cee6045595efa11085f3fd7a9068dacf2ac1b0d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 27 Apr 2018 08:16:19 -0500 Subject: [PATCH 131/916] Fixes minor error interfering in get_global_arg_decl --- loopy/kernel/data.py | 4 ++-- loopy/target/__init__.py | 3 +++ loopy/target/c/__init__.py | 10 ++++++++-- loopy/target/cuda.py | 9 +++++++-- loopy/target/ispc.py | 9 +++++++-- loopy/target/opencl.py | 9 ++++++--- 6 files changed, 33 insertions(+), 11 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index db08de00a..2d5dc8976 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -561,8 +561,8 @@ class TemporaryVariable(ArrayBase): def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): if self.scope == MemoryAddressSpace.GLOBAL: - return ast_builder.get_global_arg_decl(self.name + name_suffix, shape, - dtype, is_written) + return ast_builder.get_array_arg_decl(self.name + name_suffix, + MemoryAddressSpace.GLOBAL, shape, dtype, is_written) else: raise LoopyError("unexpected request for argument declaration of " "non-global temporary") diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 0f90ca414..9733fa446 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -202,6 +202,9 @@ class ASTBuilderBase(object): """ raise NotImplementedError() + def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): + raise NotImplementedError() + def get_global_arg_decl(self, name, shape, dtype, is_written): raise NotImplementedError() diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index b5b9bb542..86e7bea81 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -771,7 +771,7 @@ class CASTBuilder(ASTBuilderBase): return result - def get_array_arg_decl(self, name, shape, dtype, is_written): + def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): from cgen import RestrictPointer, Const arg_decl = RestrictPointer(POD(self, dtype, name)) @@ -781,7 +781,13 @@ class CASTBuilder(ASTBuilderBase): return arg_decl - get_global_arg_decl = get_array_arg_decl + def get_global_arg_decl(self, name, shape, dtype, is_written): + from warnings import warn + warn("get_global_arg_decl is deprecated use get_array_arg_decl " + "instead.", DeprecationWarning, stacklevel=2) + from loopy.kernel.data import MemoryAddressSpace + return self.get_array_arg_decl(name, MemoryAddressSpace.GLOBAL, shape, + dtype, is_written) def get_constant_arg_decl(self, name, shape, dtype, is_written): from loopy.target.c import POD # uses the correct complex type diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 6340bec92..7e3724a3a 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -364,7 +364,7 @@ class CUDACASTBuilder(CASTBuilder): from cgen.cuda import CudaConstant return CudaConstant(decl) - def get_array_arg_decl(self, name, shape, dtype, 
is_written): + def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): from loopy.target.c import POD # uses the correct complex type from cgen import Const from cgen.cuda import CudaRestrictPointer @@ -376,7 +376,12 @@ class CUDACASTBuilder(CASTBuilder): return arg_decl - get_global_arg_decl = get_array_arg_decl + def get_global_arg_decl(self, name, shape, dtype, is_written): + from warnings import warn + warn("get_global_arg_decl is deprecated use get_array_arg_decl " + "instead.", DeprecationWarning, stacklevel=2) + return self.get_array_arg_decl(name, MemoryAddressSpace.GLOBAL, shape, + dtype, is_written) def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): raise NotImplementedError("not yet: texture arguments in CUDA") diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 583da7dee..0a4299033 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -329,7 +329,7 @@ class ISPCASTBuilder(CASTBuilder): from cgen.ispc import ISPCUniform return ISPCUniform(decl) - def get_array_arg_decl(self, name, shape, dtype, is_written): + def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): from loopy.target.c import POD # uses the correct complex type from cgen import Const from cgen.ispc import ISPCUniformPointer, ISPCUniform @@ -343,7 +343,12 @@ class ISPCASTBuilder(CASTBuilder): return arg_decl - get_global_arg_decl = get_array_arg_decl + def get_global_arg_decl(self, name, shape, dtype, is_written): + from warnings import warn + warn("get_global_arg_decl is deprecated use get_array_arg_decl " + "instead.", DeprecationWarning, stacklevel=2) + return self.get_array_arg_decl(name, MemoryAddressSpace.GLOBAL, shape, + dtype, is_written) def get_value_arg_decl(self, name, shape, dtype, is_written): result = super(ISPCASTBuilder, self).get_value_arg_decl( diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index d849e7223..d8d013101 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -536,19 +536,22 @@ class OpenCLCASTBuilder(CASTBuilder): if mem_address_space == MemoryAddressSpace.LOCAL: return CLLocal(super(OpenCLCASTBuilder, self).get_array_arg_decl( - name, shape, dtype, is_written)) + name, mem_address_space, shape, dtype, is_written)) elif mem_address_space == MemoryAddressSpace.PRIVATE: return super(OpenCLCASTBuilder, self).get_array_arg_decl( - name, shape, dtype, is_written) + name, mem_address_space, shape, dtype, is_written) elif mem_address_space == MemoryAddressSpace.GLOBAL: return CLGlobal(super(OpenCLCASTBuilder, self).get_array_arg_decl( - name, shape, dtype, is_written)) + name, mem_address_space, shape, dtype, is_written)) else: raise ValueError("unexpected array argument scope: %s" % mem_address_space) def get_global_arg_decl(self, name, shape, dtype, is_written): from loopy.kernel.data import MemoryAddressSpace + from warnings import warn + warn("get_global_arg_decl is deprecated use get_array_arg_decl " + "instead.", DeprecationWarning, stacklevel=2) return self.get_array_arg_decl(name, MemoryAddressSpace.GLOBAL, shape, dtype, is_written) -- GitLab From a89beaa87a165669578011c825f83bfdfbebde20 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 27 Apr 2018 08:39:17 -0500 Subject: [PATCH 132/916] Changed from GlobalArg to ArrayArg --- doc/tutorial.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index af8c8281c..345c26b68 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -112,9 +112,9 
@@ always see loopy's view of a kernel by printing it. KERNEL: loopy_kernel --------------------------------------------------------------------------- ARGUMENTS: - a: GlobalArg, type: , shape: (n), dim_tags: (N0:stride:1) + a: ArrayArg, type: , shape: (n), dim_tags: (N0:stride:1) n: ValueArg, type: - out: GlobalArg, type: , shape: (n), dim_tags: (N0:stride:1) + out: ArrayArg, type: , shape: (n), dim_tags: (N0:stride:1) --------------------------------------------------------------------------- DOMAINS: [n] -> { [i] : 0 <= i < n } @@ -1145,7 +1145,7 @@ the right by 1 in parallel: ... end ... """, ... [ - ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), + ... lp.ArrayArg("arr", shape=("n",), dtype=np.int32), ... "...", ... ], ... name="rotate_v1", @@ -1189,7 +1189,7 @@ Let us start with an example. Consider the kernel from above with a ... end ... """, ... [ - ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), + ... lp.ArrayArg("arr", shape=("n",), dtype=np.int32), ... "...", ... ], ... name="rotate_v2", @@ -1321,8 +1321,8 @@ tagged, as in the following example:: "{ [i]: 0<=i Date: Fri, 27 Apr 2018 12:58:54 -0500 Subject: [PATCH 133/916] Removing the FIXME comment about handling temporaries. --- loopy/kernel/function_interface.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e755cb6c4..d3c5ba60c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -508,10 +508,6 @@ class CallableKernel(InKernelCallable): # tuning the subkernel so that we have the the matching shapes and # dim_tags. - # FIXME: Although We receive input if the argument is - # `local/global`. We do not use it to set the subkernel function - # signature. Need to do it, so that we can handle teporary inputs - # in the array call. # Collecting the parameters new_args = self.subkernel.args[:] -- GitLab From 272bc5583cccc0d9f0b1b59b1b4074ee325e8677 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 27 Apr 2018 14:09:27 -0500 Subject: [PATCH 134/916] INtroduced is_master_kernel --- loopy/kernel/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 9a4ea7027..09f31af3a 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -184,6 +184,18 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: target A subclass of :class:`loopy.TargetBase`. + + .. attribute:: is_master_kernel + + # TODO: Naming suggestions? + # is_top_level_kernel + # is_caller_kernel + # is_called_from_host + # is_root_kernel + + An instance of :class:`bool`. Will be set *False* for the kernel which + would be called from another top level kernels. Default value is + *True*. 
""" # {{{ constructor @@ -212,6 +224,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=None, state=kernel_state.INITIAL, + is_master_kernel=True, target=None, overridden_get_grid_sizes_for_insn_ids=None): @@ -297,6 +310,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): index_dtype=index_dtype, options=options, state=state, + is_master_kernel=is_master_kernel, target=target, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids)) @@ -1358,6 +1372,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "silenced_warnings", "options", "state", + "is_master_kernel", "target", ) -- GitLab From 5c9f25f3b3e7ba26eb24f90e32314a9b02481f76 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 27 Apr 2018 17:15:18 -0500 Subject: [PATCH 135/916] removed `is_generating_master_kernel` from CodegenerationState and added it as an attribute to the LoopKernel. --- loopy/codegen/__init__.py | 31 +++++++--------------------- loopy/target/opencl.py | 2 +- loopy/transform/register_callable.py | 3 ++- 3 files changed, 10 insertions(+), 26 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index c48492597..0786af663 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -197,12 +197,6 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end - - .. attribute:: is_generating_master_kernel - - Can be either `True` or `False`. Indicating whether the code is being - generated for a master kernel or an auxiliary kernel. - """ def __init__(self, kernel, @@ -212,8 +206,7 @@ class CodeGenerationState(object): vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None, - is_generating_master_kernel=None): + schedule_index_end=None): self.kernel = kernel self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain @@ -228,7 +221,6 @@ class CodeGenerationState(object): self.is_generating_device_code = is_generating_device_code self.gen_program_name = gen_program_name self.schedule_index_end = schedule_index_end - self.is_generating_master_kernel = is_generating_master_kernel # {{{ copy helpers @@ -237,8 +229,7 @@ class CodeGenerationState(object): var_subst_map=None, vectorization_info=None, is_generating_device_code=None, gen_program_name=None, - schedule_index_end=None, - is_generating_master_kernel=None): + schedule_index_end=None): if kernel is None: kernel = self.kernel @@ -261,9 +252,6 @@ class CodeGenerationState(object): if schedule_index_end is None: schedule_index_end = self.schedule_index_end - if is_generating_master_kernel is None: - is_generating_master_kernel = self.is_generating_master_kernel - return CodeGenerationState( kernel=kernel, implemented_data_info=implemented_data_info, @@ -279,8 +267,7 @@ class CodeGenerationState(object): var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, gen_program_name=gen_program_name, - schedule_index_end=schedule_index_end, - is_generating_master_kernel=is_generating_master_kernel) + schedule_index_end=schedule_index_end) def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" @@ -421,11 +408,8 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel, is_generating_master_kernel=True): +def generate_code_v2(kernel): """ - :arg is_generating_master_kernel: An instance of :class:`bool`. 
*True* if - the code is being generated for a master kernel, otherwise *False*. - :returns: a :class:`CodeGenerationResult` """ @@ -520,8 +504,7 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): kernel.target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule), - is_generating_master_kernel=is_generating_master_kernel) + schedule_index_end=len(kernel.schedule)) from loopy.codegen.result import generate_host_or_device_program @@ -538,8 +521,8 @@ def generate_code_v2(kernel, is_generating_master_kernel=True): from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel): auxiliary_dev_prog = generate_code_v2( - in_knl_callable.subkernel.copy(target=kernel.target), - is_generating_master_kernel=False).device_programs[0].ast + in_knl_callable.subkernel.copy(target=kernel.target) + ).device_programs[0].ast auxiliary_dev_progs.append(auxiliary_dev_prog) elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, BarrierInstruction, CInstruction, diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index d8d013101..5d00dd39a 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -451,7 +451,7 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) - if not codegen_state.is_generating_master_kernel: + if not codegen_state.kernel.is_master_kernel: # auxiliary kernels need not mention opencl speicific qualifiers # for a functions signature return fdecl diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 1a0aadec6..1ae4d70be 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -95,7 +95,8 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # making the target of the child kernel to be same as the target of parent # kernel. callable_kernel = CallableKernel(subkernel=callee_kernel.copy( - target=caller_kernel.target)) + target=caller_kernel.target, + is_master_kernel=False)) return register_function_lookup(caller_kernel, RegisterCalleeKernel(function_name, callable_kernel)) -- GitLab From 250407540acb82204c0868697d99f6f43baff7f6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Apr 2018 11:31:33 -0500 Subject: [PATCH 136/916] Done with with_iname_tag_usage. Need to add comments explaining quite a lot of functions. --- loopy/kernel/__init__.py | 53 ++++++++++++++---- loopy/kernel/function_interface.py | 41 +++++++++----- loopy/kernel/tools.py | 46 ++++++++++++++++ loopy/preprocess.py | 87 ++++++++++++++++++++++++++++++ loopy/schedule/__init__.py | 22 ++++---- 5 files changed, 218 insertions(+), 31 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 09f31af3a..a792d246a 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -187,7 +187,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: is_master_kernel - # TODO: Naming suggestions? + # FIXME: Naming suggestions? 
# is_top_level_kernel # is_caller_kernel # is_called_from_host @@ -950,20 +950,23 @@ class LoopKernel(ImmutableRecordWithoutPickling): constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs - *global_size* and *local_size* are :class:`islpy.PwAff` objects. + *global_size* and *local_size* are instances of :class:`dict` with + mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. """ - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - ignore_auto=ignore_auto) + # {{{ collecting the callee kernels in insn_ids + + from loopy.kernel.tools import get_callee_kernels + callee_kernels = get_callee_kernels(self, insn_ids) + + # }}} all_inames_by_insns = set() for insn_id in insn_ids: @@ -978,6 +981,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): global_sizes = {} local_sizes = {} + # updating the grid sizes from the callee_kernels. + for callee_kernel in callee_kernels: + gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts( + frozenset(insn.id for insn in callee_kernel.instructions)) + + global_sizes.update(gsize) + local_sizes.update(lsize) + from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) @@ -1014,6 +1025,30 @@ class LoopKernel(ImmutableRecordWithoutPickling): tgt_dict[tag.axis] = size + return global_sizes, local_sizes + + @memoize_method + def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
+ """ + + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + ignore_auto=ignore_auto) + + assert self.is_master_kernel, ("Callee kernels do not have sufficient " + "information to compute grid sizes.") + + global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( + insn_ids, ignore_auto=ignore_auto) + def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1033,8 +1068,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert cur_axis is not None if cur_axis > len(size_list): - raise RuntimeError("%s axis %d unused" % ( - which, len(size_list))) + raise RuntimeError("%s axis %d unused for %s" % ( + which, len(size_list), self.name)) size_list.append(size_dict[cur_axis]) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index d3c5ba60c..799f1425c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -129,6 +129,17 @@ def get_kw_pos_association(kernel): return kw_to_pos, pos_to_kw + +class GridOverride(ImmutableRecord): + fields = set(["local_size", "global_size"]) + + def __init__(self, local_size, global_size): + self.local_size = local_size + self.global_size = global_size + + def __call__(self, insn_ids, ignore_auto=True): + return self.local_size, self.global_size + # }}} @@ -240,19 +251,11 @@ class InKernelCallable(ImmutableRecord): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) - def with_iname_tag_usage(self, unusable, concurrent_shape): + def with_hw_axes_sizes(self, local_size, global_size): """ - :arg unusable: a set of iname tags that may not be used in the callee. - :arg concurrent_shape: an list of tuples ``(iname_tag, bound)`` for - concurrent inames that are used in the calller but also available - for mapping by the callee. *bound* is given as a - :class:`islpy.PwAff`. - - :returns: a list of the same type as *concurrent*, potentially modified - by increasing bounds or adding further iname tag entries. - - All iname tags not explicitly listed in *concurrent* or *unusable* are - available for mapping by the callee. + # TODO: docs + :arg local_size: + :arg global_size: """ raise NotImplementedError() @@ -318,6 +321,9 @@ class ScalarCallable(InKernelCallable): arg_id_to_descr[-1] = ValueArgDescriptor() return self.copy(arg_id_to_descr=arg_id_to_descr) + def with_hw_axes_sizes(self, global_size, local_size): + return self.copy() + def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and @@ -533,6 +539,17 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr) + def with_hw_axes_sizes(self, gsize, lsize): + """ + # TODO: docs + :arg gsize: + :arg lsize: + """ + return self.copy( + subkernel=self.subkernel.copy( + overridden_get_grid_sizes_for_insn_ids=GridOverride( + lsize, gsize))) + def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index ec26916f3..ac9b3667d 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1800,4 +1800,50 @@ def find_aliasing_equivalence_classes(kernel): # }}} +# {{{ callee kernel tools + +def get_callee_kernels(kernel, insn_ids=None): + """ + Returns an instance of :class:`frozenset` of all the callee kernels + called in instructions in the *kernel* whose IDs are given in *insn_ids*. + + :arg kernel: An instance of :class:`LoopKernel`. 
+ :arg insn_ids: An instance of :class:`frozenset`. + + If *insn_ids* is *None* returns all the callee kernels called by *kernel*. + """ + + if insn_ids is None: + insn_ids = frozenset(insn.id for insn in kernel.instructions) + + from loopy.kernel.function_interface import CallableKernel + + def _get_callee_kernel_if_insn_has_callable_kernel(insn_id): + """Returns callee kernel if the instruction has a call to a + :class:`loopy.kernel.function_interface.CallableKernel`. Otherwise + returns *None*. + """ + insn = kernel.id_to_insn[insn_id] + from loopy.kernel.instruction import (CallInstruction, + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) + if isinstance(insn, CallInstruction): + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + return in_knl_callable.subkernel + elif isinstance(insn, (MultiAssignmentBase, + CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknoown type of instruction %s." % + type(insn)) + + return None + + return frozenset([_get_callee_kernel_if_insn_has_callable_kernel(id) + for id in insn_ids]) - frozenset([None]) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 48651b777..49824f464 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2282,6 +2282,92 @@ def infer_arg_descr(kernel): # }}} +# {{{ + +class HWAxesInferenceMapper(CombineMapper): + """ + Returns a set of instances of :class:`tuple` (expr, + in_kernel_callable). The mapped `in_kernel_callable` of the + :class:`InKernelCallable` are specialized for the the grid sizes of + :attr:`kernel`. + """ + + def __init__(self, kernel): + self.kernel = kernel + self.local_size, self.global_size = kernel.get_grid_size_upper_bounds() + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr, **kwargs): + # ignoring if the call is not to a ScopedFunction + from loopy.symbolic import ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) + + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes( + self.local_size, self.global_size)) + + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) + + def map_call_with_kwargs(self, expr, **kwargs): + from loopy.symbolic import ScopedFunction + # ignoring if the call is not to a ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) + + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes( + self.local_size, self.global_size)) + + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values())))) + + def map_constant(self, expr, **kwargs): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def infer_hw_axes_sizes(kernel): + """ + Returns a copy of *kernel* with the hardware axes matching for + scoped functions in the *kernel*. Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_hw_axes_sizes`. 
+ """ + hw_axes_modifier = HWAxesInferenceMapper(kernel) + pymbolic_calls_to_functions = set() + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + pymbolic_calls_to_functions.update(hw_axes_modifier( + insn.expression)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("unknown type of instruction %s." % + type(insn)) + + # making it the set of tuples a dict + pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions) + + # Now do the similar treatment as done for type inference. + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + + return register_pymbolic_calls_to_knl_callables(kernel, + pymbolic_calls_to_functions) + +# }}} + + # {{{ catching functions that are not ready for codegen class FunctionsNotReadyForCodegenCollector(CombineMapper): @@ -2480,6 +2566,7 @@ def preprocess_kernel(kernel, device=None): # inferring the shape and dim_tags of the arguments involved in a function # call. kernel = infer_arg_descr(kernel) + kernel = infer_hw_axes_sizes(kernel) # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 2c9964b11..0b9e98564 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1976,18 +1976,20 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, lsize = kernel.get_grid_size_upper_bounds() + if kernel.is_master_kernel: + gsize, lsize = kernel.get_grid_size_upper_bounds() - if (gsize or lsize): - if not kernel.options.disable_global_barriers: - logger.debug("%s: barrier insertion: global" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, - synchronization_kind="global", verify_only=True) + if (gsize or lsize): + if not kernel.options.disable_global_barriers: + logger.debug("%s: barrier insertion: global" % ( + kernel.name)) + gen_sched = insert_barriers(kernel, gen_sched, + synchronization_kind="global", verify_only=True) - logger.debug("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, gen_sched, - synchronization_kind="local", verify_only=False) - logger.debug("%s: barrier insertion: done" % kernel.name) + logger.debug("%s: barrier insertion: local" % kernel.name) + gen_sched = insert_barriers(kernel, gen_sched, + synchronization_kind="local", verify_only=False) + logger.debug("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( schedule=gen_sched, -- GitLab From c23ec98676568bafc97b714fed1ba58fbca1b3f5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Apr 2018 15:46:24 -0500 Subject: [PATCH 137/916] Fixes small typo in get_callee_kernels. 
--- loopy/kernel/tools.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index ac9b3667d..c5c4346d3 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1827,15 +1827,16 @@ def get_callee_kernels(kernel, insn_ids=None): from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) if isinstance(insn, CallInstruction): - in_knl_callable = kernel.scoped_functions[ - insn.expression.function.name] - if isinstance(in_knl_callable, CallableKernel): - return in_knl_callable.subkernel + if insn.expression.function.name in kernel.scoped_functions: + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + return in_knl_callable.subkernel elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass else: - raise NotImplementedError("Unknoown type of instruction %s." % + raise NotImplementedError("Unknown type of instruction %s." % type(insn)) return None -- GitLab From a3fa082c129d1242fd80e7cc343649caa53c10e6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Apr 2018 17:13:09 -0500 Subject: [PATCH 138/916] Rewording of comments. --- loopy/codegen/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 0786af663..d0eb57cb5 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -374,7 +374,9 @@ code_gen_cache = WriteOncePersistentDict( class InKernelCallablesCollector(CombineMapper): """ - Yields the preambles from all the scoped functions in the kernel. + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. """ def __init__(self, kernel): self.kernel = kernel -- GitLab From 07fa72615f451ac149557262b198c42c3d6c3aef Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Apr 2018 17:20:49 -0500 Subject: [PATCH 139/916] Removed unused arguments in lookup_functions --- loopy/kernel/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index a792d246a..b36abc847 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -362,7 +362,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return None - def lookup_function(self, identifier, ast_builder=None): + def lookup_function(self, identifier): """ Returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable` if the @@ -1068,7 +1068,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert cur_axis is not None if cur_axis > len(size_list): - raise RuntimeError("%s axis %d unused for %s" % ( + raise LoopyError("%s axis %d unused for %s" % ( which, len(size_list), self.name)) size_list.append(size_dict[cur_axis]) -- GitLab From 39dde4156d5aa520c5a3ddb70dc63d2da00eb2ec Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Apr 2018 17:30:29 -0500 Subject: [PATCH 140/916] Comment re-wording. 
--- loopy/kernel/data.py | 2 +- loopy/kernel/instruction.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 2d5dc8976..d12c79e2f 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -292,7 +292,7 @@ class ArrayArg(ArrayBase, KernelArgument): class GlobalArg(ArrayBase, KernelArgument): def __new__(cls, *args, **kwargs): from warnings import warn - warn("Use of 'GlobalArg' is deprecated use 'ArrayArg' instead.", + warn("Use of 'GlobalArg' is deprecated, use 'ArrayArg' instead.", DeprecationWarning, stacklevel=2) return ArrayArg(*args, **kwargs) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index c81553b45..506f88c80 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1056,6 +1056,13 @@ def subscript_contains_slice(subscript): def is_array_call(assignees, expression): + """ + Returns *True* is the instruction is an array call. + + An array call is a function call applied to array type objects. If any of + the arguemnts or assignees to the function is an array, + :meth:`is_array_call` will return *True*. + """ from pymbolic.primitives import Call, CallWithKwargs, Subscript from loopy.symbolic import SubArrayRef @@ -1073,7 +1080,7 @@ def is_array_call(assignees, expression): return False -def get_array_call_assignee(assignee): +def modify_assignee_assignee_for_array_call(assignee): """ Converts the assignee subscript or variable as a SubArrayRef. """ -- GitLab From bac6e28cc6b2fde55e6359c02f1dbf220d53441d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Apr 2018 17:51:12 -0500 Subject: [PATCH 141/916] Minors bug fixes. --- loopy/kernel/instruction.py | 4 ++-- loopy/schedule/__init__.py | 23 +++++++++++------------ loopy/transform/register_callable.py | 4 ++++ 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 506f88c80..b456acfb2 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1127,8 +1127,8 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): # assignee as an instance of SubArrayRef. 
If not given as a # SubArrayRef return CallInstruction( - assignees=tuple(get_array_call_assignee(assignee) for - assignee in assignees), + assignees=tuple(modify_assignee_assignee_for_array_call( + assignee) for assignee in assignees), expression=expression, temp_var_types=temp_var_types, **kwargs) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 0b9e98564..ae05b69af 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1976,20 +1976,19 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - if kernel.is_master_kernel: - gsize, lsize = kernel.get_grid_size_upper_bounds() + gsize, lsize = kernel.get_grid_size_upper_bounds() - if (gsize or lsize): - if not kernel.options.disable_global_barriers: - logger.debug("%s: barrier insertion: global" % ( - kernel.name)) - gen_sched = insert_barriers(kernel, gen_sched, - synchronization_kind="global", verify_only=True) - - logger.debug("%s: barrier insertion: local" % kernel.name) + if (gsize or lsize): + if not kernel.options.disable_global_barriers: + logger.debug("%s: barrier insertion: global" % ( + kernel.name)) gen_sched = insert_barriers(kernel, gen_sched, - synchronization_kind="local", verify_only=False) - logger.debug("%s: barrier insertion: done" % kernel.name) + synchronization_kind="global", verify_only=True) + + logger.debug("%s: barrier insertion: local" % kernel.name) + gen_sched = insert_barriers(kernel, gen_sched, + synchronization_kind="local", verify_only=False) + logger.debug("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( schedule=gen_sched, diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 1ae4d70be..be36e62ff 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -98,6 +98,10 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): target=caller_kernel.target, is_master_kernel=False)) + # disabling global barriers for callee kernel + from loopy import set_options + callee_kernel = set_options(callee_kernel, "disable_global_barriers") + return register_function_lookup(caller_kernel, RegisterCalleeKernel(function_name, callable_kernel)) -- GitLab From 0061ceee494f5b3bbd41ce06b213e3d56262fdb2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Apr 2018 19:10:26 -0500 Subject: [PATCH 142/916] adds some helpful comments. --- loopy/kernel/function_interface.py | 56 +++++++++--------------------- loopy/preprocess.py | 4 +-- 2 files changed, 17 insertions(+), 43 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 799f1425c..4150a4091 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -58,13 +58,13 @@ class ArrayArgDescriptor(ImmutableRecord): .. attribute:: mem_scope - Can be either "LOCAL" or "GLOBAL", definiing where the argument is - supposed to reside in the device memory. + An attribute of :class:`loopy.kernel.data.MemoryAddressSpace`. .. 
attribute:: dim_tags A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` """ + fields = set(['shape', 'mem_scope', 'dim_tags']) def __init__(self, shape, mem_scope, dim_tags): @@ -79,25 +79,11 @@ class ArrayArgDescriptor(ImmutableRecord): # }}} - super(ArrayArgDescriptor, self).__init__(shape=shape, + super(ArrayArgDescriptor, self).__init__( + shape=shape, mem_scope=mem_scope, dim_tags=dim_tags) - def copy(self, dtype=None, mem_scope=None, shape=None, dim_tags=None): - if dtype is None: - dtype = self.dtype - - if mem_scope is None: - mem_scope = self.mem_scope - - if dim_tags is None: - dim_tags = self.dim_tags - - return ArrayArgDescriptor( - mem_scope=mem_scope, - dim_tags=dim_tags) - - # }}} @@ -105,8 +91,8 @@ class ArrayArgDescriptor(ImmutableRecord): def get_kw_pos_association(kernel): """ - Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments present of - the kernel. + Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in the + *kernel*. """ kw_to_pos = {} pos_to_kw = {} @@ -130,7 +116,7 @@ def get_kw_pos_association(kernel): return kw_to_pos, pos_to_kw -class GridOverride(ImmutableRecord): +class GridOverrideForCalleeKernel(ImmutableRecord): fields = set(["local_size", "global_size"]) def __init__(self, local_size, global_size): @@ -232,7 +218,7 @@ class InKernelCallable(ImmutableRecord): """ if target is None: - raise RuntimeError() + raise LoopyError("target cannot be None for with_target") def with_target_if_not_None(dtype): """ @@ -253,9 +239,8 @@ class InKernelCallable(ImmutableRecord): def with_hw_axes_sizes(self, local_size, global_size): """ - # TODO: docs - :arg local_size: - :arg global_size: + :arg local_size: An instance of :class:`islpy.PwAff`. + :arg global_size: An instance of :class:`islpy.PwAff`. """ raise NotImplementedError() @@ -540,15 +525,10 @@ class CallableKernel(InKernelCallable): arg_id_to_descr=arg_id_to_descr) def with_hw_axes_sizes(self, gsize, lsize): - """ - # TODO: docs - :arg gsize: - :arg lsize: - """ return self.copy( subkernel=self.subkernel.copy( - overridden_get_grid_sizes_for_insn_ids=GridOverride( - lsize, gsize))) + overridden_get_grid_sizes_for_insn_ids=( + GridOverrideForCalleeKernel(lsize, gsize)))) def is_ready_for_codegen(self): @@ -590,12 +570,11 @@ class CallableKernel(InKernelCallable): parameters.append(kw_parameters[pos_to_kw[i]]) par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) - # TODO: currently no suppport for assignee keywords. parameters = parameters + list(assignees) par_dtypes = par_dtypes + [self.arg_id_to_dtype[-i-1] for i, _ in enumerate(assignees)] - # Note that we are not going to do any type casting in array calls. + # we are not going to do any type casting in array calls. from loopy.expression import dtype_to_type_context from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import SubArrayRef @@ -622,7 +601,7 @@ class ManglerCallable(ScalarCallable): """ A callable whose characateristic is defined by a function mangler. - .. attribute function_mangler:: + .. attribute:: function_mangler A function of signature ``(target, name , arg_dtypes)`` and returns an instance of ``loopy.CallMangleInfo``. @@ -722,9 +701,8 @@ def next_indexed_variable(function): class ScopedFunctionNameChanger(RuleAwareIdentityMapper): """ - Mapper that takes in a mapping ``expr_to_new_names`` and maps the - corresponding expression to the new names, which correspond to the names in - ``kernel.scoped_functions``. 
+ Changes the names of scoped functions in calls of expressions according to + the mapping ``expr_to_new_names`` """ def __init__(self, rule_mapping_context, expr_to_new_names, subst_expander): @@ -752,8 +730,6 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): else: return self.map_substitution(name, tag, expr.parameters, expn_state) - # TODO: Add a method map_call_with_kwargs - def register_pymbolic_calls_to_knl_callables(kernel, pymbolic_exprs_to_knl_callables): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 49824f464..0bf5cd513 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2532,9 +2532,6 @@ def preprocess_kernel(kernel, device=None): kernel = infer_unknown_types(kernel, expect_completion=False) - # TODO: Specializng based on: - # 1. InameTags - check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) @@ -2566,6 +2563,7 @@ def preprocess_kernel(kernel, device=None): # inferring the shape and dim_tags of the arguments involved in a function # call. kernel = infer_arg_descr(kernel) + # tuning the functions in the kernel to align with the grid sizes. kernel = infer_hw_axes_sizes(kernel) # boostability should be removed in 2017.x. -- GitLab From c916519e06bc2f64dc17a2d1dcd4452ff079868e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Apr 2018 19:30:45 -0500 Subject: [PATCH 143/916] Added some helpful comments. --- loopy/kernel/function_interface.py | 3 +++ loopy/transform/register_callable.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 4150a4091..abf9faceb 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -432,6 +432,9 @@ class CallableKernel(InKernelCallable): The :meth:`CallableKernel.with_descrs` should be called in order to match the ``dim_tags, shape, mem_scopes`` of the arguments shared between the caller and the callee kernel. + + The :meth:`CallableKernel.with_hw_axes` should be called to set the grid + sizes for the :attr:`subkernel` of the callable. """ fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index be36e62ff..dfbe9a619 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -76,7 +76,7 @@ class RegisterCalleeKernel(ImmutableRecord): def register_callable_kernel(caller_kernel, function_name, callee_kernel): - """Returns a copy of *caller_kernel* which identifies *function_name* in an + """Returns a copy of *caller_kernel*, which would resolve *function_name* in an expression as a call to *callee_kernel*. :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. -- GitLab From aabb1e281131ad23f93045bc5eae8a11f900b953 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Apr 2018 13:38:33 -0500 Subject: [PATCH 144/916] new attribute for array arg i.e. direction. 
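
(A hedged usage sketch of the new attribute -- illustrative only, not part of
the diff below; the argument names are made up:)

    import numpy as np
    import loopy as lp

    args = [
        lp.ArrayArg("x", dtype=np.float64, shape=("n",), direction="in"),
        lp.ArrayArg("y", dtype=np.float64, shape=("n",), direction="out"),
        lp.ValueArg("n", dtype=np.int32),  # value args are always "in"
    ]

If ``direction`` is not given, ``infer_arg_direction`` below falls back to a
simple heuristic: arguments that are written anywhere in the kernel become
"out", everything else becomes "in".
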
--- loopy/kernel/data.py | 5 +++- loopy/kernel/function_interface.py | 16 ++++++----- loopy/kernel/tools.py | 40 ++++++++++++++++++++++++++++ loopy/transform/register_callable.py | 23 ++++++++++++++++ 4 files changed, 76 insertions(+), 8 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index d12c79e2f..788d4ffc0 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -264,6 +264,7 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype + kwargs["direction"] = kwargs.pop("direction", None) ImmutableRecord.__init__(self, **kwargs) @@ -271,12 +272,14 @@ class KernelArgument(ImmutableRecord): class ArrayArg(ArrayBase, KernelArgument): allowed_extra_kwargs = [ - "memory_address_space"] + "memory_address_space", + "direction"] def __init__(self, *args, **kwargs): # Defaulting the memory_address_space to be GLOBAL. kwargs["memory_address_space"] = kwargs.pop( "memory_address_space", MemoryAddressSpace.GLOBAL) + kwargs["direction"] = kwargs.pop("direction", None) super(ArrayArg, self).__init__(*args, **kwargs) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index abf9faceb..08b18af37 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -87,13 +87,15 @@ class ArrayArgDescriptor(ImmutableRecord): # }}} -# {{{ helper function for in kernel callables +# {{{ helper function for in-kernel callables def get_kw_pos_association(kernel): """ - Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in the + Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in *kernel*. """ + from loopy.kernel.tools import infer_arg_direction + kernel = infer_arg_direction(kernel) kw_to_pos = {} pos_to_kw = {} @@ -101,17 +103,17 @@ def get_kw_pos_association(kernel): write_count = -1 for arg in kernel.args: - # FIXME: Confused about the written and read variables ordering. - if arg.name not in kernel.get_written_variables(): + if arg.direction == 'in': kw_to_pos[arg.name] = read_count pos_to_kw[read_count] = arg.name read_count += 1 - else: - # These args are not read in the kernel. Hence, assuming that they - # must be returned. + elif arg.direction == 'out': kw_to_pos[arg.name] = write_count pos_to_kw[write_count] = arg.name write_count -= 1 + else: + raise LoopyError("Unknown value of kernel argument direction %s for " + "%s" % (arg.direction, arg.name)) return kw_to_pos, pos_to_kw diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index c5c4346d3..436b92223 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1847,4 +1847,44 @@ def get_callee_kernels(kernel, insn_ids=None): # }}} +# {{{ direction helper tools + +def infer_arg_direction(kernel): + """ + Returns a copy of *kernel* with the directions of the argument inferred. + + .. note:: + Implements a simple heuristic -- if the argument direction is not + specified by the user then if the argument is written at any point + during in the kernel then its direction is set to be ``out``, otherwise + ``in``. + """ + from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg + direction_inferred_args = [] + for arg in kernel.args: + if isinstance(arg, (ArrayArg, ImageArg)): + if arg.direction is not None: + if arg.direction not in ['in', 'out']: + raise LoopyError("Unknown value of direction %s for %s." 
% ( + arg.direction, arg.name)) + direction_inferred_args.append(arg) + else: + if arg.name in kernel.get_written_variables(): + direction_inferred_args.append(arg.copy(direction='out')) + else: + direction_inferred_args.append(arg.copy(direction='in')) + elif isinstance(arg, (ValueArg, ConstantArg)): + # For ValueArg, ConstantArg the direction always has to be in. + if arg.direction is not None and arg.direction == 'out': + raise LoopyError("Argument %s cannot have 'out' direction." % + arg.name) + else: + direction_inferred_args.append(arg.copy(direction='in')) + else: + raise NotImplementedError("Unkonwn argument type %s." % type(arg)) + + return kernel.copy(args=direction_inferred_args) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index dfbe9a619..aff35e79e 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -25,6 +25,9 @@ THE SOFTWARE. from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, + CInstruction, _DataObliviousInstruction) __doc__ = """ .. currentmodule:: loopy @@ -90,6 +93,26 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): assert isinstance(callee_kernel, LoopKernel) assert isinstance(function_name, str) + # check to make sure that the variables with 'out' direction is equal to + # the number of assigness in the callee kernel intructions. + from loopy.kernel.tools import infer_arg_direction + callee_kernel = infer_arg_direction(callee_kernel) + expected_num_assignees = len([arg for arg in callee_kernel.args if + arg.direction == 'out']) + for insn in caller_kernel.instructions: + if isinstance(insn, CallInstruction) and ( + insn.expression.function.name == 'function_name'): + if insn.assignees != expected_num_assignees: + raise LoopyError("The number of arguments with 'out' direction " + "in callee kernel %s and the number of assignees in " + "instruction %s do not match." % ( + callee_kernel.name, insn.id)) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("unknown instruction %s" % type(insn)) + # }}} # making the target of the child kernel to be same as the target of parent -- GitLab From ed2ee03f266d32b0ebd10906719581eebff01cbe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Apr 2018 15:30:37 -0500 Subject: [PATCH 145/916] Added CallWithKwargs support for array calls. 
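
(The recurring pattern below is that mappers now walk positional and keyword
arguments together. A tiny self-contained sketch, with made-up names:)

    from pymbolic.primitives import CallWithKwargs, Variable

    call = CallWithKwargs(Variable("f"),
            (Variable("a"),),
            {"b": Variable("c")})

    children = call.parameters + tuple(call.kw_parameters.values())
    # children == (Variable('a'), Variable('c')) -- what the mappers recurse on
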
--- loopy/check.py | 4 ++-- loopy/kernel/function_interface.py | 38 ++++++++++++++++++++++++++---- loopy/preprocess.py | 8 +++---- loopy/symbolic.py | 7 ++++++ loopy/type_inference.py | 32 ++++++++++++++++++------- 5 files changed, 70 insertions(+), 19 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 744bc27aa..080c5721c 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -89,10 +89,10 @@ class UnscopedCallCollector(CombineMapper): if not isinstance(expr.function, ScopedFunction): return (frozenset([expr.function.name]) | self.combine((self.rec(child) for child in expr.parameters - + expr.kw_parameter.values()))) + + tuple(expr.kw_parameters.values())))) else: return self.combine((self.rec(child) for child in - expr.parameters+expr.kw_parameters.values())) + expr.parameters+tuple(expr.kw_parameters.values()))) def map_constant(self, expr): return frozenset() diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 08b18af37..b4a18315a 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -24,6 +24,7 @@ THE SOFTWARE. import re +import six from six.moves import zip @@ -34,9 +35,8 @@ from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name -from loopy.symbolic import (IdentityMapper, ScopedFunction, - SubstitutionRuleMappingContext, RuleAwareIdentityMapper, - SubstitutionRuleExpander) +from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, + RuleAwareIdentityMapper, SubstitutionRuleExpander) # {{{ argument descriptors @@ -731,7 +731,37 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): tuple(self.rec(child, expn_state) for child in expr.parameters)) else: - return IdentityMapper.map_call(self, expr, expn_state) + return super(ScopedFunctionNameChanger, self).map_call( + self, expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.expr_to_new_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + elif expanded_expr in self.expr_to_new_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return super(ScopedFunctionNameChanger, self).map_call_with_kwargs( + expr, expn_state) else: return self.map_substitution(name, tag, expr.parameters, expn_state) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 0bf5cd513..bf1467c16 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2193,15 +2193,15 @@ class ArgDescrInferenceMapper(CombineMapper): self.combine((self.rec(child) for child in expr.parameters))) def map_call_with_kwargs(self, expr, **kwargs): - from loopy.kernel.function_intergace import ValueArgDescriptor + from loopy.kernel.function_interface import ValueArgDescriptor from loopy.symbolic import SubArrayRef # descriptors for the args and kwargs: arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par, self.kernel)) if isinstance(par, SubArrayRef) else 
ValueArgDescriptor() - for i, par in enumerate(expr.parameters) + - expr.kw_parameters.items()) + for i, par in tuple(enumerate(expr.parameters)) + + tuple(expr.kw_parameters.items())) assignee_id_to_descr = {} @@ -2225,7 +2225,7 @@ class ArgDescrInferenceMapper(CombineMapper): # specializing the function according to the parameter description new_scoped_function = ( - self.kernel.scoped_functions[expr.function.name].with_descr( + self.kernel.scoped_functions[expr.function.name].with_descrs( combined_arg_id_to_descr)) # collecting the descriptors for args, kwargs, assignees diff --git a/loopy/symbolic.py b/loopy/symbolic.py index e4cdfa05d..55bd543fc 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -305,6 +305,13 @@ class DependencyMapper(DependencyMapperBase): return self.combine( self.rec(child, *args) for child in expr.parameters) + def map_call_with_kwargs(self, expr, *args): + # Loopy does not have first-class functions. Do not descend + # into 'function' attribute of Call. + return self.combine( + self.rec(child, *args) for child in expr.parameters+tuple( + expr.kw_parameters.values())) + def map_reduction(self, expr): deps = self.rec(expr.expr) return deps - set(p.Variable(iname) for iname in expr.inames) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index cc3b9e8e4..e4f6ec0a4 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -265,9 +265,14 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable + from pymbolic.primitives import Variable, CallWithKwargs from loopy.symbolic import ScopedFunction + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + kw_parameters = {} + identifier = expr.function if isinstance(identifier, (Variable, ScopedFunction)): identifier = identifier.name @@ -280,21 +285,23 @@ class TypeInferenceMapper(CombineMapper): return None arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in - enumerate(expr.parameters)) + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) # specializing the known function wrt type if isinstance(expr.function, ScopedFunction): in_knl_callable = self.scoped_functions[expr.function.name] - # {{{ checking that there is no overwriting of in_knl_callable + # {{{ checking that there is no overwriting of types of in_knl_callable if in_knl_callable.arg_id_to_dtype is not None: # specializing an already specialized function. 
for id, dtype in arg_id_to_dtype.items(): - # Ignoring the the cases when there is a discrepancy - # between np.uint and np.int if in_knl_callable.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + + # {{{ ignoring the the cases when there is a discrepancy + # between np.uint and np.int + import numpy as np if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( np.uint32) and ( @@ -306,15 +313,16 @@ class TypeInferenceMapper(CombineMapper): np.int64): continue + # }}} + raise LoopyError("Overwriting a specialized function " "is illegal--maybe start with new instance of " "InKernelCallable?") # }}} - in_knl_callable = ( - in_knl_callable.with_types( - arg_id_to_dtype, self.kernel)) + in_knl_callable = in_knl_callable.with_types( + arg_id_to_dtype, self.kernel) # storing the type specialized function so that it can be used for # later use @@ -335,7 +343,10 @@ class TypeInferenceMapper(CombineMapper): elif isinstance(expr.function, Variable): # Since, the function is not "scoped", attempt to infer using - # kernel.function_manlgers + # kernel.function_manglers + + # {{{ trying to infer using function manglers + arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) @@ -383,9 +394,12 @@ class TypeInferenceMapper(CombineMapper): "assignments") return [mangle_result.result_dtypes[0]] + # }}} return [] + map_call_with_kwargs = map_call + def map_variable(self, expr): if expr.name in self.kernel.all_inames(): return [self.kernel.index_dtype] -- GitLab From 00819f86128ae029dd46e05d410bb024cd77bb6f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Apr 2018 18:02:57 -0500 Subject: [PATCH 146/916] CallWithKwargs is final. --- loopy/kernel/function_interface.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index b4a18315a..a310106db 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -575,9 +575,12 @@ class CallableKernel(InKernelCallable): parameters.append(kw_parameters[pos_to_kw[i]]) par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) - parameters = parameters + list(assignees) - par_dtypes = par_dtypes + [self.arg_id_to_dtype[-i-1] for i, _ in - enumerate(assignees)] + assignee_write_count = -1 + for i, arg in enumerate(self.subkernel.args): + if arg.direction == 'out': + assignee = assignees[-assignee_write_count-1] + parameters.insert(i, assignee) + par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) # we are not going to do any type casting in array calls. from loopy.expression import dtype_to_type_context -- GitLab From 0dfc9957447590cc36b3e011287c8095c0dbe4b3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Apr 2018 19:21:24 -0500 Subject: [PATCH 147/916] Minor fixes in multiple array output. --- loopy/kernel/function_interface.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index a310106db..56434ba57 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -575,14 +575,16 @@ class CallableKernel(InKernelCallable): parameters.append(kw_parameters[pos_to_kw[i]]) par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) + # inserting the assigness at the required positions. 
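To make the comment above concrete: the callee's output arguments may sit anywhere in its argument list, so the assignees of the call have to be spliced back in at those positions rather than appended at the end. A small self-contained sketch of the same bookkeeping with plain lists (argument names are illustrative only):

    # callee arguments in declaration order: (out, in, out, in)
    arg_directions = ["out", "in", "out", "in"]

    assignees = ["res0", "res1"]        # one entry per 'out' argument
    parameters = ["param0", "param1"]   # one entry per 'in' argument

    # splice each assignee into the parameter list at the position of the
    # corresponding 'out' argument, mirroring the loop below
    assignee_write_count = -1
    for i, direction in enumerate(arg_directions):
        if direction == "out":
            parameters.insert(i, assignees[-assignee_write_count - 1])
            assignee_write_count -= 1

    print(parameters)   # ['res0', 'param0', 'res1', 'param1']
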
assignee_write_count = -1 for i, arg in enumerate(self.subkernel.args): if arg.direction == 'out': assignee = assignees[-assignee_write_count-1] parameters.insert(i, assignee) par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) + assignee_write_count -= 1 - # we are not going to do any type casting in array calls. + # no type casting in array calls. from loopy.expression import dtype_to_type_context from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import SubArrayRef -- GitLab From 6d23d9ff2082196c3e83b798d9466d518e06045c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Apr 2018 19:26:57 -0500 Subject: [PATCH 148/916] Minor tweaks and fixes. --- loopy/kernel/function_interface.py | 2 +- loopy/transform/register_callable.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 56434ba57..ecd00f12e 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -737,7 +737,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): for child in expr.parameters)) else: return super(ScopedFunctionNameChanger, self).map_call( - self, expr, expn_state) + expr, expn_state) else: return self.map_substitution(name, tag, expr.parameters, expn_state) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index aff35e79e..4df55905c 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -99,6 +99,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): callee_kernel = infer_arg_direction(callee_kernel) expected_num_assignees = len([arg for arg in callee_kernel.args if arg.direction == 'out']) + expected_num_parameters = len(callee_kernel.args) - expected_num_assignees for insn in caller_kernel.instructions: if isinstance(insn, CallInstruction) and ( insn.expression.function.name == 'function_name'): @@ -107,6 +108,12 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): "in callee kernel %s and the number of assignees in " "instruction %s do not match." % ( callee_kernel.name, insn.id)) + if insn.expression.prameters != expected_num_parameters: + raise LoopyError("The number of expected arguments " + "for the callee kernel %s and the number of parameters in " + "instruction %s do not match." % ( + callee_kernel.name, insn.id)) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass -- GitLab From 802f3299830a4f04e9c60e7f30c0e1462993bbe2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Apr 2018 19:50:50 -0500 Subject: [PATCH 149/916] Minor bug fix in ValuArg's direction --- loopy/kernel/data.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 788d4ffc0..ab66a5e87 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -326,11 +326,29 @@ class ImageArg(ArrayBase, KernelArgument): class ValueArg(KernelArgument): - def __init__(self, name, dtype=None, approximately=1000, target=None): + def __init__(self, name, dtype=None, approximately=1000, target=None, + direction=None): + + # {{{ sanity checks for direction + + if direction == 'out': + # TODO: Is this only valid for C-like targets? + # Do we need to move this to target.precodegen_checks? 
+ raise LoopyError("ValueArg cannot have 'out' as the direction.") + elif direction is None: + direction = 'in' + elif direction == 'in': + pass + else: + raise LoopyError("Unknown type for direction of %s." % name) + + # }}} + KernelArgument.__init__(self, name=name, dtype=dtype, approximately=approximately, - target=target) + target=target, + direction=direction) def __str__(self): import loopy as lp -- GitLab From bc631eb9c7bcad5fb79b198aa602bb41dfe404dc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 May 2018 01:09:26 -0500 Subject: [PATCH 150/916] Added a few tests for register_kernel and fixed with_descrs --- loopy/kernel/function_interface.py | 13 +++-- test/test_transform.py | 85 ++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index ecd00f12e..368267d76 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -505,19 +505,22 @@ class CallableKernel(InKernelCallable): # tuning the subkernel so that we have the the matching shapes and # dim_tags. - # Collecting the parameters new_args = self.subkernel.args[:] kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) for id, descr in arg_id_to_descr.items(): - if isinstance(id, str): - id = kw_to_pos[id] - assert isinstance(id, int) + if isinstance(id, int): + id = pos_to_kw[id] + assert isinstance(id, str) if isinstance(descr, ArrayArgDescriptor): - new_args[id] = new_args[id].copy(shape=descr.shape, + new_arg = self.subkernel.arg_dict[id].copy( + shape=descr.shape, dim_tags=descr.dim_tags, memory_address_space=descr.mem_scope) + # replacing the new arg with the arg of the same name + new_args = [new_arg if arg.name == id else arg for arg in + new_args] elif isinstance(descr, ValueArgDescriptor): pass else: diff --git a/test/test_transform.py b/test/test_transform.py index 8c11c0efb..09a5de091 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -296,6 +296,91 @@ def test_slices_with_negative_step(ctx_factory): np.linalg.norm(2*x+3*y))) < 1e-15 +def test_register_knl_with_call_with_kwargs(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 2 + + a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.int) + c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_kernel( + "{[i, j]:0<=i, j < %d}" % n, + """ + h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] + <>f1[i, j] = 2*f[i, j] + p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] + """, + [lp.ArrayArg('f'), lp.ArrayArg('e'), lp.ArrayArg('h'), + lp.ArrayArg('g'), ...]) + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, + """ + <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] + [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( + f=[j, l]: a[i, j, k, l, m], + g=[j, l]: d[i, j, k, l, m], + e=[j, l]: c[i, j, k, l, m]) + """) + knl = lp.register_callable_kernel( + caller_knl, 'linear_combo', callee_knl) + + evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) + + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + h = out1.get() # h = 2c + 3a + 8b + p = out2.get() # p = 7c + 8a + 4b + h_exact = 3*a + 8*b + 2*c + p_exact = 8*a + 4*b + 7*c + + assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 + assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 + + +def test_register_knl_with_hw_axes(ctx_factory): + ctx = ctx_factory() + 
queue = cl.CommandQueue(ctx) + + n = 2 ** 4 + + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """) + + callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """ + ) + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") + + knl = lp.register_callable_kernel( + caller_knl, 'linear_combo', callee_knl) + + evt, (out, ) = knl(queue, x=x_dev, y=y_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( + 2*x_host+3*y_host) < 1e-15 + + def test_multi_arg_array_call(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From d84e6a6454e21644ab6a47ba3751fbab8e799cb1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 May 2018 13:06:09 -0500 Subject: [PATCH 151/916] fixes small wrinkle in the tests. --- test/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_transform.py b/test/test_transform.py index 09a5de091..b88f704b8 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -314,7 +314,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory): p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] """, [lp.ArrayArg('f'), lp.ArrayArg('e'), lp.ArrayArg('h'), - lp.ArrayArg('g'), ...]) + lp.ArrayArg('g'), '...']) caller_knl = lp.make_kernel( "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, -- GitLab From 7981215a166de53a8c2fda9981947c35e16a9fda Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 May 2018 14:21:59 -0500 Subject: [PATCH 152/916] f32 randoms for RNG. 
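A plausible reading of this one-line change (the commit message does not spell it out): cl.clrandom's fill routines are geared toward floating-point samples, so float32/float64 are the natural dtypes for these randomly generated test inputs, and the np.int array in the new test is switched to np.float32. A minimal sketch of the setup the tests rely on, with an illustrative size and an available OpenCL device assumed:

    import numpy as np
    import pyopencl as cl
    import pyopencl.clrandom  # noqa

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    n = 4
    # floating-point random test data, as in the updated test
    a_dev = cl.clrandom.rand(queue, (n, n), np.float32)
    c_dev = cl.clrandom.rand(queue, (n, n), np.float64)
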
--- test/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_transform.py b/test/test_transform.py index b88f704b8..76ff4520a 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -303,7 +303,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory): n = 2 ** 2 a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.int) + b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) callee_knl = lp.make_kernel( -- GitLab From 48b887bd4b674ffc138fd63542e2cd70cc37c1c9 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Wed, 11 Apr 2018 18:06:45 +0100 Subject: [PATCH 153/916] kernel inlining prototype --- loopy/transform/register_knl.py | 208 ++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 loopy/transform/register_knl.py diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py new file mode 100644 index 000000000..9997ade35 --- /dev/null +++ b/loopy/transform/register_knl.py @@ -0,0 +1,208 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import six + +from loopy.kernel import LoopKernel +from loopy.kernel.creation import FunctionScoper +from loopy.diagnostic import LoopyError +from loopy.kernel.function_interface import CallableKernel + +from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, + CInstruction, _DataObliviousInstruction) + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_callable_kernel +""" + + +# {{{ main entrypoint + +def register_callable_kernel(parent, function_name, child): + """ + The purpose of this transformation is so that one can inoke the child + kernel in the parent kernel. + + :arg parent + + This is the "main" kernel which will mostly remain unaltered and one + can interpret it as stitching up the child kernel in the parent kernel. + + :arg function_name + + The name of the function call with which the child kernel must be + associated in the parent kernel + + :arg child + + This is like a function in every other language and this might be + invoked in one of the instructions of the parent kernel. + + ..note:: + + One should note that the kernels would go under stringent compatibilty + tests so that both of them can be confirmed to be made for each other. 
+ """ + + # {{{ sanity checks + + assert isinstance(parent, LoopKernel) + assert isinstance(child, LoopKernel) + assert isinstance(function_name, str) + + # }}} + + # scoping the function + function_scoper = FunctionScoper(set([function_name])) + new_insns = [] + + for insn in parent.instructions: + if isinstance(insn, CallInstruction): + new_insn = insn.copy(expression=function_scoper(insn.expression)) + new_insns.append(new_insn) + elif isinstance(insn, (_DataObliviousInstruction, MultiAssignmentBase, + CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("scope_functions not implemented for %s" % + type(insn)) + + # adding the scoped function to the scoped function dict of the parent + # kernel. + + scoped_functions = parent.scoped_functions.copy() + + if function_name in scoped_functions: + raise LoopyError("%s is already being used as a funciton name -- maybe" + "use a different name for registering the subkernel") + + scoped_functions[function_name] = CallableKernel(name=function_name, + subkernel=child) + + # returning the parent kernel with the new scoped function dictionary + return parent.copy(scoped_functions=scoped_functions, + instructions=new_insns) + +# }}} + + + +def inline_kernel(kernel, function, arg_map=None): + + child = kernel.scoped_functions[function].subkernel + vng = kernel.get_var_name_generator() + + # duplicate and rename inames + + import islpy as isl + + dim_type = isl.dim_type.set + + child_iname_map = {} + for iname in child.all_inames(): + child_iname_map[iname] = vng(iname) + + new_domains = [] + for domain in child.domains: + new_domain = domain.copy() + n_dim = new_domain.n_dim() + for i in range(n_dim): + iname = new_domain.get_dim_name(dim_type, i) + new_iname = child_iname_map[iname] + new_domain = new_domain.set_dim_name(dim_type, i, new_iname) + new_domains.append(new_domain) + + kernel = kernel.copy(domains= kernel.domains + new_domains) + + # rename temporaries + child_temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(child.temporary_variables): + new_name = vng(name) + child_temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # rename arguments + + calls = [insn for insn in kernel.instructions if isinstance(insn, CallInstruction) and insn.expression.function.name == function] + assert len(calls) == 1 + call, = calls + + parameters = call.assignees + call.expression.parameters + + child_arg_map = {} # arg -> SubArrayRef + for inside, outside in six.iteritems(arg_map): + child_arg_map[inside], = [p for p in parameters if p.subscript.aggregate.name == outside] + + + # Rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + class KernelInliner(SubstitutionMapper): + def map_subscript(self, expr): + if expr.aggregate.name in child_arg_map: + aggregate = self.subst_func(expr.aggregate) + indices = [self.subst_func(i) for i in expr.index_tuple] + sar = child_arg_map[expr.aggregate.name] # SubArrayRef + # insert non-sweeping indices from outter kernel + for i, index in enumerate(sar.subscript.index_tuple): + if index not in sar.swept_inames: + indices.insert(i, index) + return aggregate.index(tuple(indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + + var_map = dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_iname_map)) + 
var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner(make_subst_func(var_map)) + + inner_insns = [] + for insn in child.instructions: + new_insn = insn.with_transformed_expressions(subst_mapper) + within_inames = [child_iname_map[iname] for iname in insn.within_inames] + within_inames.extend(call.within_inames) + new_insn = new_insn.copy(within_inames=frozenset(within_inames)) + inner_insns.append(new_insn) + + new_insns = [] + for insn in kernel.instructions: + if insn == call: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + return kernel + + +# vim: foldmethod=marker -- GitLab From 073550effb8c2f2df5608b45220716d6b61cad82 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 12 Apr 2018 11:08:06 +0100 Subject: [PATCH 154/916] add test --- loopy/__init__.py | 3 +++ loopy/transform/register_knl.py | 9 ++++++-- test/test_transform.py | 38 +++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a5850ec0a..c695f7df5 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -120,6 +120,8 @@ from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, register_function_lookup) +from loopy.transform.register_knl import (register_callable_kernel, + inline_kernel) # }}} @@ -230,6 +232,7 @@ __all__ = [ "add_barrier", "register_callable_kernel", "register_function_lookup", + "inline_kernel", # }}} diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 9997ade35..faa42b743 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -37,6 +37,8 @@ __doc__ = """ .. currentmodule:: loopy .. autofunction:: register_callable_kernel + +.. autofunction:: inline_kernel """ @@ -139,6 +141,7 @@ def inline_kernel(kernel, function, arg_map=None): kernel = kernel.copy(domains= kernel.domains + new_domains) # rename temporaries + child_temp_map = {} new_temps = kernel.temporary_variables.copy() for name, temp in six.iteritems(child.temporary_variables): @@ -149,7 +152,7 @@ def inline_kernel(kernel, function, arg_map=None): kernel = kernel.copy(temporary_variables=new_temps) # rename arguments - + # TODO: put this in a loop calls = [insn for insn in kernel.instructions if isinstance(insn, CallInstruction) and insn.expression.function.name == function] assert len(calls) == 1 call, = calls @@ -174,6 +177,7 @@ def inline_kernel(kernel, function, arg_map=None): indices = [self.subst_func(i) for i in expr.index_tuple] sar = child_arg_map[expr.aggregate.name] # SubArrayRef # insert non-sweeping indices from outter kernel + # TODO: sweeping indices might flip: [i,j]: A[j, i] for i, index in enumerate(sar.subscript.index_tuple): if index not in sar.swept_inames: indices.insert(i, index) @@ -191,7 +195,8 @@ def inline_kernel(kernel, function, arg_map=None): new_insn = insn.with_transformed_expressions(subst_mapper) within_inames = [child_iname_map[iname] for iname in insn.within_inames] within_inames.extend(call.within_inames) - new_insn = new_insn.copy(within_inames=frozenset(within_inames)) + new_insn = new_insn.copy(within_inames=frozenset(within_inames), priority=call.priority) + # TODO: depends on? 
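One remark on the machinery above: the child kernel's inames, temporaries and argument names are all renamed through a single substitution map, which is applied to every child instruction before it is spliced into the caller (argument subscripts get extra handling in map_subscript). A tiny standalone illustration of that idea, with made-up names:

    from pymbolic import parse, var
    from pymbolic.mapper.substitutor import SubstitutionMapper, make_subst_func

    # child-kernel names on the left, caller-side names on the right
    var_map = {
        var("i"): var("child_i"),   # renamed iname
        var("t"): var("child_t"),   # renamed temporary
        var("c"): var("z"),         # child argument mapped to a caller array
    }

    subst = SubstitutionMapper(make_subst_func(var_map))
    print(subst(parse("c[i] + t")))   # -> z[child_i] + child_t
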
inner_insns.append(new_insn) new_insns = [] diff --git a/test/test_transform.py b/test/test_transform.py index 76ff4520a..92a6c5cc3 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -424,6 +424,44 @@ def test_multi_arg_array_call(ctx_factory): assert(norm(out_dict['min_index'][0] - np.argmin(b)) < tol) +def test_inlining_kernel(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 16 + + x = np.random.rand(n) + y = np.random.rand(n) + + knl1 = lp.make_kernel( + "{[i]: 0 <= i < 16}", + """ + for i + c[i] = a[i] + 2*b[i] + end + """ + ) + knl2 = lp.make_kernel( + "{[i, j]: 0 <= i, j < 16}", + """ + for j + [i]: z[j, i] = func([i]: x[i], [i]: y[i]) + end + """, + kernel_data=[ + lp.GlobalArg("x", np.float64, (16,)), + lp.GlobalArg("y", np.float64, (16,)), "..." + ] + ) + + knl2 = lp.register_callable_kernel(knl2, 'func', knl1) + knl3 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + + evt, (out, ) = knl3(queue, x=x, y=y) + z = np.tile(x + y*2, [16, 1]) + + assert np.allclose(out, z) + + def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 0d223307282c97413e7134fefd1031b0c32a37ed Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 12 Apr 2018 11:26:20 +0100 Subject: [PATCH 155/916] flake8 --- loopy/transform/register_knl.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index faa42b743..2adc2648e 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -112,8 +112,7 @@ def register_callable_kernel(parent, function_name, child): # }}} - -def inline_kernel(kernel, function, arg_map=None): +def inline_kernel(kernel, function, arg_map): child = kernel.scoped_functions[function].subkernel vng = kernel.get_var_name_generator() @@ -138,7 +137,7 @@ def inline_kernel(kernel, function, arg_map=None): new_domain = new_domain.set_dim_name(dim_type, i, new_iname) new_domains.append(new_domain) - kernel = kernel.copy(domains= kernel.domains + new_domains) + kernel = kernel.copy(domains=kernel.domains + new_domains) # rename temporaries @@ -152,8 +151,11 @@ def inline_kernel(kernel, function, arg_map=None): kernel = kernel.copy(temporary_variables=new_temps) # rename arguments + # TODO: automatically figuring out arg map # TODO: put this in a loop - calls = [insn for insn in kernel.instructions if isinstance(insn, CallInstruction) and insn.expression.function.name == function] + calls = [insn for insn in kernel.instructions + if isinstance(insn, CallInstruction) + and insn.expression.function.name == function] assert len(calls) == 1 call, = calls @@ -161,8 +163,8 @@ def inline_kernel(kernel, function, arg_map=None): child_arg_map = {} # arg -> SubArrayRef for inside, outside in six.iteritems(arg_map): - child_arg_map[inside], = [p for p in parameters if p.subscript.aggregate.name == outside] - + child_arg_map[inside], = [p for p in parameters + if p.subscript.aggregate.name == outside] # Rewrite instructions @@ -185,17 +187,21 @@ def inline_kernel(kernel, function, arg_map=None): else: return super(KernelInliner, self).map_subscript(expr) - var_map = dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(arg_map))) - subst_mapper = 
KernelInliner(make_subst_func(var_map)) + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner(make_subst_func(var_map)) inner_insns = [] for insn in child.instructions: new_insn = insn.with_transformed_expressions(subst_mapper) within_inames = [child_iname_map[iname] for iname in insn.within_inames] within_inames.extend(call.within_inames) - new_insn = new_insn.copy(within_inames=frozenset(within_inames), priority=call.priority) + new_insn = new_insn.copy(within_inames=frozenset(within_inames), + priority=call.priority) # TODO: depends on? inner_insns.append(new_insn) -- GitLab From 762e7b2d8ef2c3967e3d384be755609ebbd53739 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 12 Apr 2018 13:12:33 +0100 Subject: [PATCH 156/916] 2d tests --- loopy/transform/register_knl.py | 205 +++++++++++++++++--------------- test/test_transform.py | 85 ++++++++++++- 2 files changed, 193 insertions(+), 97 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 2adc2648e..8c0305154 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -114,105 +114,124 @@ def register_callable_kernel(parent, function_name, child): def inline_kernel(kernel, function, arg_map): + if function not in kernel.scoped_functions: + raise LoopyError("function: {0} does not exist".format(function)) + child = kernel.scoped_functions[function].subkernel vng = kernel.get_var_name_generator() - # duplicate and rename inames - - import islpy as isl - - dim_type = isl.dim_type.set - - child_iname_map = {} - for iname in child.all_inames(): - child_iname_map[iname] = vng(iname) - - new_domains = [] - for domain in child.domains: - new_domain = domain.copy() - n_dim = new_domain.n_dim() - for i in range(n_dim): - iname = new_domain.get_dim_name(dim_type, i) - new_iname = child_iname_map[iname] - new_domain = new_domain.set_dim_name(dim_type, i, new_iname) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains) - - # rename temporaries - - child_temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(child.temporary_variables): - new_name = vng(name) - child_temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # rename arguments - # TODO: automatically figuring out arg map - # TODO: put this in a loop - calls = [insn for insn in kernel.instructions - if isinstance(insn, CallInstruction) - and insn.expression.function.name == function] - assert len(calls) == 1 - call, = calls - - parameters = call.assignees + call.expression.parameters - - child_arg_map = {} # arg -> SubArrayRef - for inside, outside in six.iteritems(arg_map): - child_arg_map[inside], = [p for p in parameters - if p.subscript.aggregate.name == outside] - - # Rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - from loopy.symbolic import SubstitutionMapper - - class KernelInliner(SubstitutionMapper): - def map_subscript(self, expr): - if expr.aggregate.name in child_arg_map: - aggregate = self.subst_func(expr.aggregate) - indices = [self.subst_func(i) for i in expr.index_tuple] - sar = child_arg_map[expr.aggregate.name] # 
SubArrayRef - # insert non-sweeping indices from outter kernel - # TODO: sweeping indices might flip: [i,j]: A[j, i] - for i, index in enumerate(sar.subscript.index_tuple): - if index not in sar.swept_inames: - indices.insert(i, index) - return aggregate.index(tuple(indices)) + for call in kernel.instructions: + if not isinstance(call, CallInstruction): + continue + if call.expression.function.name != function: + continue + + # {{{ duplicate and rename inames + + import islpy as isl + + dim_type = isl.dim_type.set + + child_iname_map = {} + for iname in child.all_inames(): + child_iname_map[iname] = vng(iname) + + new_domains = [] + for domain in child.domains: + new_domain = domain.copy() + n_dim = new_domain.n_dim() + for i in range(n_dim): + iname = new_domain.get_dim_name(dim_type, i) + new_iname = child_iname_map[iname] + new_domain = new_domain.set_dim_name(dim_type, i, new_iname) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains) + + # }}} + + # {{{ rename temporaries + + child_temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(child.temporary_variables): + new_name = vng(name) + child_temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ arguments + # TODO: automatically figuring out arg map + parameters = call.assignees + call.expression.parameters + + child_arg_map = {} # arg -> SubArrayRef + for inside, outside in six.iteritems(arg_map): + child_arg_map[inside], = [p for p in parameters + if p.subscript.aggregate.name == outside] + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + class KernelInliner(SubstitutionMapper): + def map_subscript(self, expr): + if expr.aggregate.name in child_arg_map: + aggregate = self.subst_func(expr.aggregate) + sar = child_arg_map[expr.aggregate.name] # SubArrayRef + indices = [] + for index in sar.subscript.index_tuple: + if index in sar.swept_inames: + # map sweeping index to inner kernel index + pos = sar.swept_inames.index(index) + new_index = self.subst_func(expr.index_tuple[pos]) + else: + # non-sweepting index from outter kernel + new_index = index + indices.append(new_index) + return aggregate.index(tuple(indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner(make_subst_func(var_map)) + + inner_insns = [] + for insn in child.instructions: + new_insn = insn.with_transformed_expressions(subst_mapper) + within_inames = [child_iname_map[iname] for iname in insn.within_inames] + within_inames.extend(call.within_inames) + id = vng(new_insn.id) + new_insn = new_insn.copy( + id=id, + within_inames=frozenset(within_inames), + priority=call.priority, + depends_on=new_insn.depends_on | call.depends_on + ) + # TODO: depends on is too conservative? 
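The TODO above deserves a short expansion: at this point every inlined instruction inherits all of the call's dependencies, which is safe but serializes more than necessary. Later patches in this series refine this by finding the child kernel's root and leaf instructions and bracketing the inlined block between two no-op markers that carry the call's incoming and outgoing dependencies. A plain-Python sketch of that root/leaf computation on a toy dependency graph (instruction names are made up):

    # toy child kernel: instruction id -> ids it depends on
    dep_map = {
        "read_a": set(),
        "read_b": set(),
        "compute": {"read_a", "read_b"},
        "write_out": {"compute"},
    }

    # roots ("heads") depend on nothing; they inherit the call's dependencies
    heads = set(insn for insn, deps in dep_map.items() if not deps)

    # leaves ("tails") have nothing depending on them; whatever replaces the
    # call instruction can depend on exactly these
    tails = set(dep_map)
    for deps in dep_map.values():
        tails -= deps

    print(sorted(heads))   # ['read_a', 'read_b']
    print(sorted(tails))   # ['write_out']
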
+ inner_insns.append(new_insn) + + new_insns = [] + for insn in kernel.instructions: + if insn == call: + new_insns.extend(inner_insns) else: - return super(KernelInliner, self).map_subscript(expr) - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(arg_map))) - subst_mapper = KernelInliner(make_subst_func(var_map)) - - inner_insns = [] - for insn in child.instructions: - new_insn = insn.with_transformed_expressions(subst_mapper) - within_inames = [child_iname_map[iname] for iname in insn.within_inames] - within_inames.extend(call.within_inames) - new_insn = new_insn.copy(within_inames=frozenset(within_inames), - priority=call.priority) - # TODO: depends on? - inner_insns.append(new_insn) + new_insns.append(insn) - new_insns = [] - for insn in kernel.instructions: - if insn == call: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) + kernel = kernel.copy(instructions=new_insns) + + # }}} - kernel = kernel.copy(instructions=new_insns) return kernel diff --git a/test/test_transform.py b/test/test_transform.py index 92a6c5cc3..09b497348 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -424,7 +424,7 @@ def test_multi_arg_array_call(ctx_factory): assert(norm(out_dict['min_index'][0] - np.argmin(b)) < tol) -def test_inlining_kernel(ctx_factory): +def test_inline_kernel(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) n = 16 @@ -440,6 +440,7 @@ def test_inlining_kernel(ctx_factory): end """ ) + knl2 = lp.make_kernel( "{[i, j]: 0 <= i, j < 16}", """ @@ -453,14 +454,90 @@ def test_inlining_kernel(ctx_factory): ] ) + knl3 = lp.make_kernel( + "{[i, j]: 0 <= i, j < 16}", + """ + for j + [i]: z[i, j] = func([i]: x[i], [i]: y[i]) + end + """, + kernel_data=[ + lp.GlobalArg("x", np.float64, (16,)), + lp.GlobalArg("y", np.float64, (16,)), "..." + ] + ) + knl2 = lp.register_callable_kernel(knl2, 'func', knl1) - knl3 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out, ) = knl2(queue, x=x, y=y) + z = np.tile(x + y * 2, [16, 1]) + assert np.allclose(out, z) + + knl3 = lp.register_callable_kernel(knl3, 'func', knl1) + knl3 = lp.inline_kernel(knl3, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out,) = knl3(queue, x=x, y=y) + z = np.tile(x + y * 2, [16, 1]).transpose() + assert np.allclose(out, z) + + +def test_inline_kernel_2d(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 16 + + x = np.random.rand(n ** 2).reshape((n, n)) + y = np.random.rand(n ** 2).reshape((n, n)) + + knl1 = lp.make_kernel( + "{[i, j]: 0 <= i, j < 16}", + """ + for i, j + c[i, j] = a[i, j] + 2*b[i, j] + end + """, + kernel_data=[ + lp.GlobalArg("a", np.float64, (16, 16)), + lp.GlobalArg("b", np.float64, (16, 16)), "..." + ] + ) - evt, (out, ) = knl3(queue, x=x, y=y) - z = np.tile(x + y*2, [16, 1]) + knl2 = lp.make_kernel( + "{[i, j, k]: 0 <= i, j, k < 16}", + """ + for k + [i, j]: z[k, i, j] = func([i, j]: x[i, j], [i, j]: y[i, j]) + end + """, + kernel_data=[ + lp.GlobalArg("x", np.float64, (16, 16)), + lp.GlobalArg("y", np.float64, (16, 16)), "..." 
+ ] + ) + knl3 = lp.make_kernel( + "{[i, j, k]: 0 <= i, j, k < 16}", + """ + for k + [i, j]: z[k, j, i] = func([i, j]: x[i, j], [i, j]: y[i, j]) + end + """, + kernel_data=[ + lp.GlobalArg("x", np.float64, (16, 16)), + lp.GlobalArg("y", np.float64, (16, 16)), "..." + ] + ) + + knl2 = lp.register_callable_kernel(knl2, 'func', knl1) + knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out, ) = knl2(queue, x=x, y=y) + z = np.tile(x + y * 2, [16, 1, 1]) assert np.allclose(out, z) + knl3 = lp.register_callable_kernel(knl3, 'func', knl1) + knl3 = lp.inline_kernel(knl3, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out,) = knl3(queue, x=x, y=y) + z = np.tile(np.transpose(x + y * 2), [16, 1, 1]) + assert np.allclose(out, z) def test_rename_argument(ctx_factory): ctx = ctx_factory() -- GitLab From 0e805a1bb4efee6da2b4c8cb97937e9fba01ca79 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 12 Apr 2018 19:18:15 +0100 Subject: [PATCH 157/916] better subscript mapping --- loopy/transform/register_knl.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 8c0305154..a8d52a3e6 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -180,21 +180,21 @@ def inline_kernel(kernel, function, arg_map): from loopy.symbolic import SubstitutionMapper class KernelInliner(SubstitutionMapper): + """ + Mapper to replace variables (indices, temporaries, arguments) in + the inner kernel. + """ def map_subscript(self, expr): if expr.aggregate.name in child_arg_map: aggregate = self.subst_func(expr.aggregate) sar = child_arg_map[expr.aggregate.name] # SubArrayRef - indices = [] - for index in sar.subscript.index_tuple: - if index in sar.swept_inames: - # map sweeping index to inner kernel index - pos = sar.swept_inames.index(index) - new_index = self.subst_func(expr.index_tuple[pos]) - else: - # non-sweepting index from outter kernel - new_index = index - indices.append(new_index) - return aggregate.index(tuple(indices)) + # first, map inner inames to outer inames + outer_indices = [self.subst_func(i) for i in expr.index_tuple] + # then, map index expressions in SubArrayRef to outer inames + index_map = dict(zip(sar.swept_inames, outer_indices)) + index_mapper = SubstitutionMapper(make_subst_func(index_map)) + new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) + return aggregate.index(tuple(new_indices)) else: return super(KernelInliner, self).map_subscript(expr) -- GitLab From bf70d0a3935ff719bf5e3a75cd9c0c714fb3ad0b Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 13 Apr 2018 14:38:56 +0100 Subject: [PATCH 158/916] add test for affine sweeping index --- test/test_transform.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/test_transform.py b/test/test_transform.py index 09b497348..7f6eed495 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -467,6 +467,19 @@ def test_inline_kernel(ctx_factory): ] ) + knl4 = lp.make_kernel( + "{[i, j]: 0 <= i, j < 16}", + """ + for j + [i]: z[j, 15-i] = func([i]: x[i], [i]: y[i]) + end + """, + kernel_data=[ + lp.GlobalArg("x", np.float64, (16,)), + lp.GlobalArg("y", np.float64, (16,)), "..." 
+ ] + ) + knl2 = lp.register_callable_kernel(knl2, 'func', knl1) knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) evt, (out, ) = knl2(queue, x=x, y=y) @@ -479,6 +492,11 @@ def test_inline_kernel(ctx_factory): z = np.tile(x + y * 2, [16, 1]).transpose() assert np.allclose(out, z) + knl4 = lp.register_callable_kernel(knl4, 'func', knl1) + knl4 = lp.inline_kernel(knl4, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out,) = knl4(queue, x=x, y=y) + z = np.tile(np.flip(x + y * 2, 0), [16, 1]) + assert np.allclose(out, z) def test_inline_kernel_2d(ctx_factory): ctx = ctx_factory() -- GitLab From a74a880ecd0a9d1ebc8aa1d7483c3e49c8f3b272 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 13 Apr 2018 15:20:11 +0100 Subject: [PATCH 159/916] automatic matching of args --- loopy/transform/register_knl.py | 58 ++++++++++++++++++++++++++------- test/test_transform.py | 9 +++-- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index a8d52a3e6..dd3a477bf 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -112,11 +112,13 @@ def register_callable_kernel(parent, function_name, child): # }}} -def inline_kernel(kernel, function, arg_map): +def inline_kernel(knl, function, arg_map=None): - if function not in kernel.scoped_functions: + if function not in knl.scoped_functions: raise LoopyError("function: {0} does not exist".format(function)) + kernel = knl.copy() + child = kernel.scoped_functions[function].subkernel vng = kernel.get_var_name_generator() @@ -163,14 +165,48 @@ def inline_kernel(kernel, function, arg_map): # }}} - # {{{ arguments - # TODO: automatically figuring out arg map - parameters = call.assignees + call.expression.parameters + # {{{ match kernel arguments + + child_arg_map = {} # child arg name -> SubArrayRef + + # for kernel call: out1, out2 = func(in1, in2), we match out1, out2 to + # the written arguments, and in1, in2 to the readonly arguments in + # child kernel, according the order they appear in child.args + writes = child.get_written_variables() + reads = [arg.name for arg in child.args if arg.name not in writes] + writes = [arg.name for arg in child.args if arg.name in writes] + + if arg_map: + for inside, outside in six.iteritems(arg_map): + if inside not in child.arg_dict: + raise LoopyError("arg named '{0}' not in the child " + "kernel".format(inside)) + if inside in writes: + sar = [sar for sar in call.assignees + if sar.subscript.aggregate.name == outside] + if len(sar) != 1: + raise LoopyError("wrong number of variables " + "named '{0}'".format(outside)) + child_arg_map[inside], = sar + else: + sar = [sar for sar in call.expression.parameters + if sar.subscript.aggregate.name == outside] + if len(sar) != 1: + raise LoopyError("wrong number of variables " + "named '{0}'".format(outside)) + child_arg_map[inside], = sar + else: + if len(call.assignees) != len(writes): + raise LoopyError("expect {0} output variable(s), got {1}".format( + len(writes), len(call.assignees))) + if len(call.expression.parameters) != len(reads): + raise LoopyError("expect {0} input variable(s), got {1}".format( + len(reads), len(call.expression.parameters))) + for arg_name, sar in zip(writes, call.assignees): + child_arg_map[arg_name] = sar + for arg_name, sar in zip(reads, call.expression.parameters): + child_arg_map[arg_name] = sar - child_arg_map = {} # arg -> SubArrayRef - for inside, outside in six.iteritems(arg_map): - child_arg_map[inside], = [p for p in parameters 
- if p.subscript.aggregate.name == outside] # }}} # {{{ rewrite instructions @@ -202,8 +238,8 @@ def inline_kernel(kernel, function, arg_map): for k, v in six.iteritems(child_iname_map)) var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(arg_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(child_arg_map))) subst_mapper = KernelInliner(make_subst_func(var_map)) inner_insns = [] diff --git a/test/test_transform.py b/test/test_transform.py index 7f6eed495..c5180ead1 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -481,9 +481,14 @@ def test_inline_kernel(ctx_factory): ) knl2 = lp.register_callable_kernel(knl2, 'func', knl1) - knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) - evt, (out, ) = knl2(queue, x=x, y=y) z = np.tile(x + y * 2, [16, 1]) + + knl2_arg_map = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + evt, (out, ) = knl2_arg_map(queue, x=x, y=y) + assert np.allclose(out, z) + + knl2_no_arg_map = lp.inline_kernel(knl2, "func") + evt, (out, ) = knl2_no_arg_map(queue, x=x, y=y) assert np.allclose(out, z) knl3 = lp.register_callable_kernel(knl3, 'func', knl1) -- GitLab From 8917de2569a2fe0c8756de27540c8da752f1415f Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 13 Apr 2018 19:01:17 +0100 Subject: [PATCH 160/916] add inames to non-sweeping indices --- loopy/transform/register_knl.py | 35 +++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index dd3a477bf..f08269964 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -118,9 +118,8 @@ def inline_kernel(knl, function, arg_map=None): raise LoopyError("function: {0} does not exist".format(function)) kernel = knl.copy() - child = kernel.scoped_functions[function].subkernel - vng = kernel.get_var_name_generator() + for call in kernel.instructions: if not isinstance(call, CallInstruction): @@ -132,6 +131,8 @@ def inline_kernel(knl, function, arg_map=None): import islpy as isl + vng = kernel.get_var_name_generator() + dim_type = isl.dim_type.set child_iname_map = {} @@ -243,24 +244,38 @@ def inline_kernel(knl, function, arg_map=None): subst_mapper = KernelInliner(make_subst_func(var_map)) inner_insns = [] + + ing = kernel.get_instruction_id_generator() + insn_id = {} for insn in child.instructions: - new_insn = insn.with_transformed_expressions(subst_mapper) - within_inames = [child_iname_map[iname] for iname in insn.within_inames] - within_inames.extend(call.within_inames) - id = vng(new_insn.id) - new_insn = new_insn.copy( - id=id, + insn_id[insn.id] = ing(insn.id) + + for _insn in child.instructions: + insn = _insn.with_transformed_expressions(subst_mapper) + within_inames = insn.dependency_names() & kernel.all_inames() + within_inames = within_inames | call.within_inames + depends_on = frozenset(insn_id[dep] for dep in insn.depends_on) + depends_on = depends_on | call.depends_on + insn = insn.copy( + id=insn_id[insn.id], within_inames=frozenset(within_inames), priority=call.priority, - depends_on=new_insn.depends_on | call.depends_on + depends_on=depends_on ) # TODO: depends on is too conservative? 
- inner_insns.append(new_insn) + inner_insns.append(insn) + from loopy.kernel.instruction import NoOpInstruction new_insns = [] for insn in kernel.instructions: if insn == call: new_insns.extend(inner_insns) + noop = NoOpInstruction( + id=call.id, + within_inames=call.within_inames, + depends_on=call.depends_on + ) + new_insns.append(noop) else: new_insns.append(insn) -- GitLab From 32a0b13045d823c0fb06549436a1ee8e2f37512b Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 20 Apr 2018 18:19:55 +0100 Subject: [PATCH 161/916] still some issues with mapping subscripts --- loopy/transform/register_knl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index f08269964..a2c753440 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -226,7 +226,7 @@ def inline_kernel(knl, function, arg_map=None): aggregate = self.subst_func(expr.aggregate) sar = child_arg_map[expr.aggregate.name] # SubArrayRef # first, map inner inames to outer inames - outer_indices = [self.subst_func(i) for i in expr.index_tuple] + outer_indices = self.map_tuple(expr.index_tuple) # then, map index expressions in SubArrayRef to outer inames index_map = dict(zip(sar.swept_inames, outer_indices)) index_mapper = SubstitutionMapper(make_subst_func(index_map)) @@ -250,19 +250,20 @@ def inline_kernel(knl, function, arg_map=None): for insn in child.instructions: insn_id[insn.id] = ing(insn.id) + new_inames = [] + for _insn in child.instructions: insn = _insn.with_transformed_expressions(subst_mapper) - within_inames = insn.dependency_names() & kernel.all_inames() + within_inames = frozenset(child_iname_map[iname] for iname in insn.within_inames) within_inames = within_inames | call.within_inames depends_on = frozenset(insn_id[dep] for dep in insn.depends_on) depends_on = depends_on | call.depends_on insn = insn.copy( id=insn_id[insn.id], - within_inames=frozenset(within_inames), + within_inames=within_inames, priority=call.priority, depends_on=depends_on ) - # TODO: depends on is too conservative? inner_insns.append(insn) from loopy.kernel.instruction import NoOpInstruction -- GitLab From 1b6becb7150bdfa30d5880322251d22a2b964fa6 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Mon, 23 Apr 2018 18:37:16 +0100 Subject: [PATCH 162/916] seems to work now --- loopy/transform/register_knl.py | 38 +++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index a2c753440..bb43dd19d 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -25,6 +25,8 @@ THE SOFTWARE. 
import six +import numpy as np + from loopy.kernel import LoopKernel from loopy.kernel.creation import FunctionScoper from loopy.diagnostic import LoopyError @@ -137,7 +139,7 @@ def inline_kernel(knl, function, arg_map=None): child_iname_map = {} for iname in child.all_inames(): - child_iname_map[iname] = vng(iname) + child_iname_map[iname] = vng("child_"+iname) new_domains = [] for domain in child.domains: @@ -158,7 +160,7 @@ def inline_kernel(knl, function, arg_map=None): child_temp_map = {} new_temps = kernel.temporary_variables.copy() for name, temp in six.iteritems(child.temporary_variables): - new_name = vng(name) + new_name = vng("child_"+name) child_temp_map[name] = new_name new_temps[new_name] = temp.copy(name=new_name) @@ -215,6 +217,8 @@ def inline_kernel(knl, function, arg_map=None): import pymbolic.primitives as p from pymbolic.mapper.substitutor import make_subst_func from loopy.symbolic import SubstitutionMapper + from loopy.isl_helpers import simplify_via_aff + from functools import reduce class KernelInliner(SubstitutionMapper): """ @@ -224,13 +228,33 @@ def inline_kernel(knl, function, arg_map=None): def map_subscript(self, expr): if expr.aggregate.name in child_arg_map: aggregate = self.subst_func(expr.aggregate) - sar = child_arg_map[expr.aggregate.name] # SubArrayRef + sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) + arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) + # first, map inner inames to outer inames outer_indices = self.map_tuple(expr.index_tuple) - # then, map index expressions in SubArrayRef to outer inames - index_map = dict(zip(sar.swept_inames, outer_indices)) + + # next, reshape to match dimension of outer arrays + inner_sizes = [int(np.prod(arg_in.shape[i+1:])) for i in range(len(arg_in.shape))] + make_sum = lambda x, y: p.Sum((x, y)) # TODO: can be more functional? 
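Since the index arithmetic in this hunk is easy to misread, the same flatten-then-unflatten computation with concrete numbers may help: suppose the child argument is declared as B[6] while the caller passes the sub-array [i, j]: A[i, j] with i and j of extent 3 and 2 (plain Python, illustrative names only):

    # row-major strides for a given shape
    def strides(shape):
        out = []
        for i in range(len(shape)):
            s = 1
            for extent in shape[i + 1:]:
                s *= extent
            out.append(s)
        return out

    child_strides = strides((6,))      # [1]    (child argument B[6])
    swept_strides = strides((3, 2))    # [2, 1] (swept inames i, j)

    # a child-side access B[5]: flatten against the child argument's shape ...
    flat = sum(idx * s for idx, s in zip((5,), child_strides))   # 5

    # ... then unflatten against the swept-iname extents to recover the
    # caller-side indices that replace the swept inames in the SubArrayRef
    caller_index = []
    for s in swept_strides:
        caller_index.append(flat // s)
        flat -= (flat // s) * s

    print(caller_index)   # [2, 1], i.e. B[5] maps to A[2, 1]
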
+ flatten_index = reduce(make_sum, map(p.Product, zip(outer_indices, inner_sizes))) + flatten_index = simplify_via_aff(flatten_index) + + from loopy.symbolic import pw_aff_to_expr + bounds = [kernel.get_iname_bounds(i.name) for i in sar.swept_inames] + sizes = [pw_aff_to_expr(b.size) for b in bounds] + sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] + new_indices = [] + for s in sizes: + ind = flatten_index // s + flatten_index = flatten_index - s * ind + new_indices.append(ind) + + # lastly, map sweeping indices to indices in Subscripts in SubArrayRef + index_map = dict(zip(sar.swept_inames, new_indices)) index_mapper = SubstitutionMapper(make_subst_func(index_map)) new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) + new_indices = tuple(simplify_via_aff(i) for i in new_indices) return aggregate.index(tuple(new_indices)) else: return super(KernelInliner, self).map_subscript(expr) @@ -248,7 +272,7 @@ def inline_kernel(knl, function, arg_map=None): ing = kernel.get_instruction_id_generator() insn_id = {} for insn in child.instructions: - insn_id[insn.id] = ing(insn.id) + insn_id[insn.id] = ing("child_"+insn.id) new_inames = [] @@ -274,7 +298,7 @@ def inline_kernel(knl, function, arg_map=None): noop = NoOpInstruction( id=call.id, within_inames=call.within_inames, - depends_on=call.depends_on + depends_on=call.depends_on | set(insn.id for insn in inner_insns) ) new_insns.append(noop) else: -- GitLab From 3877b398df2581024fe5feac044ba32ff4243095 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Tue, 24 Apr 2018 14:05:34 +0100 Subject: [PATCH 163/916] better dependency reasoning and some cleaning up --- loopy/transform/register_knl.py | 94 +++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index bb43dd19d..6d40942c9 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -114,15 +114,13 @@ def register_callable_kernel(parent, function_name, child): # }}} -def inline_kernel(knl, function, arg_map=None): +def inline_kernel(kernel, function, arg_map=None): - if function not in knl.scoped_functions: + if function not in kernel.scoped_functions: raise LoopyError("function: {0} does not exist".format(function)) - kernel = knl.copy() child = kernel.scoped_functions[function].subkernel - for call in kernel.instructions: if not isinstance(call, CallInstruction): continue @@ -134,7 +132,6 @@ def inline_kernel(knl, function, arg_map=None): import islpy as isl vng = kernel.get_var_name_generator() - dim_type = isl.dim_type.set child_iname_map = {} @@ -144,11 +141,10 @@ def inline_kernel(knl, function, arg_map=None): new_domains = [] for domain in child.domains: new_domain = domain.copy() - n_dim = new_domain.n_dim() - for i in range(n_dim): + for i in range(new_domain.n_dim()): iname = new_domain.get_dim_name(dim_type, i) - new_iname = child_iname_map[iname] - new_domain = new_domain.set_dim_name(dim_type, i, new_iname) + new_domain = new_domain.set_dim_name( + dim_type, i, child_iname_map[iname]) new_domains.append(new_domain) kernel = kernel.copy(domains=kernel.domains + new_domains) @@ -231,26 +227,43 @@ def inline_kernel(knl, function, arg_map=None): sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) - # first, map inner inames to outer inames + # Firstly, map inner inames to outer inames. 
outer_indices = self.map_tuple(expr.index_tuple) - # next, reshape to match dimension of outer arrays - inner_sizes = [int(np.prod(arg_in.shape[i+1:])) for i in range(len(arg_in.shape))] - make_sum = lambda x, y: p.Sum((x, y)) # TODO: can be more functional? - flatten_index = reduce(make_sum, map(p.Product, zip(outer_indices, inner_sizes))) + # Next, reshape to match dimension of outer arrays. + # We can have e.g. A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in arg_in.shape): + raise LoopyError( + "Argument: {0} in child kernel: {1} does not have " + "constant shape.".format(arg_in, child.name)) + inner_sizes = [int(np.prod(arg_in.shape[i+1:])) + for i in range(len(arg_in.shape))] + flatten_index = reduce( + lambda x, y: p.Sum((x, y)), + map(p.Product, zip(outer_indices, inner_sizes))) flatten_index = simplify_via_aff(flatten_index) from loopy.symbolic import pw_aff_to_expr - bounds = [kernel.get_iname_bounds(i.name) for i in sar.swept_inames] + bounds = [kernel.get_iname_bounds(i.name) + for i in sar.swept_inames] sizes = [pw_aff_to_expr(b.size) for b in bounds] + if not all(isinstance(d, Integral) for d in sizes): + raise LoopyError( + "SubArrayRef: {0} in parent kernel: {1} does not have " + "swept inames with constant size.".format( + sar, kernel.name)) + sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] + new_indices = [] for s in sizes: ind = flatten_index // s flatten_index = flatten_index - s * ind new_indices.append(ind) - # lastly, map sweeping indices to indices in Subscripts in SubArrayRef + # Lastly, map sweeping indices to indices in Subscripts + # This takes care of cases such as [i, j]: A[i+j, i-j] index_map = dict(zip(sar.swept_inames, new_indices)) index_mapper = SubstitutionMapper(make_subst_func(index_map)) new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) @@ -267,40 +280,63 @@ def inline_kernel(knl, function, arg_map=None): for k, v in six.iteritems(child_arg_map))) subst_mapper = KernelInliner(make_subst_func(var_map)) - inner_insns = [] - ing = kernel.get_instruction_id_generator() insn_id = {} for insn in child.instructions: insn_id[insn.id] = ing("child_"+insn.id) - new_inames = [] + # {{{ root and leave instructions in child kernel + + dep_map = child.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of child kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing("child_start"), + within_inames=call.within_inames, + depends_on=call.depends_on + ) + noop_end = NoOpInstruction( + id=call.id, + within_inames=call.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] for _insn in child.instructions: insn = _insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(child_iname_map[iname] for iname in insn.within_inames) + within_inames = frozenset(map(child_iname_map.get, insn.within_inames)) within_inames = within_inames | call.within_inames - depends_on = frozenset(insn_id[dep] for dep in insn.depends_on) - depends_on = depends_on | call.depends_on + depends_on = frozenset(map(insn_id.get, insn.depends_on)) + if insn.id in heads: + depends_on = depends_on | 
set([noop_start.id]) insn = insn.copy( id=insn_id[insn.id], within_inames=within_inames, + # TODO: probaby need to keep priority in child kernel priority=call.priority, depends_on=depends_on ) inner_insns.append(insn) - from loopy.kernel.instruction import NoOpInstruction + inner_insns.append(noop_end) + new_insns = [] for insn in kernel.instructions: if insn == call: new_insns.extend(inner_insns) - noop = NoOpInstruction( - id=call.id, - within_inames=call.within_inames, - depends_on=call.depends_on | set(insn.id for insn in inner_insns) - ) - new_insns.append(noop) else: new_insns.append(insn) -- GitLab From e2a348275eeaa0de80031a08447230ecd6d56461 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Wed, 25 Apr 2018 12:24:35 +0100 Subject: [PATCH 164/916] rebase to kernel_callables_v3 --- loopy/__init__.py | 4 +- loopy/transform/register_callable.py | 239 +++++++++++++++++++++++++++ 2 files changed, 240 insertions(+), 3 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index c695f7df5..1c7951dc0 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -119,9 +119,7 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, - register_function_lookup) -from loopy.transform.register_knl import (register_callable_kernel, - inline_kernel) + register_function_lookup, inline_kernel) # }}} diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 4df55905c..4ce3c72cc 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -22,6 +22,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import six + +import numpy as np + from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel from pytools import ImmutableRecord @@ -137,4 +141,239 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # }}} + +def inline_kernel(kernel, function, arg_map=None): + + from loopy import CallInstruction, LoopyError + + if function not in kernel.scoped_functions: + raise LoopyError("function: {0} does not exist".format(function)) + + child = kernel.scoped_functions[function].subkernel + + for call in kernel.instructions: + if not isinstance(call, CallInstruction): + continue + if call.expression.function.name != function: + continue + + # {{{ duplicate and rename inames + + import islpy as isl + + vng = kernel.get_var_name_generator() + dim_type = isl.dim_type.set + + child_iname_map = {} + for iname in child.all_inames(): + child_iname_map[iname] = vng("child_"+iname) + + new_domains = [] + for domain in child.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + new_domain = new_domain.set_dim_name( + dim_type, i, child_iname_map[iname]) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains) + + # }}} + + # {{{ rename temporaries + + child_temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(child.temporary_variables): + new_name = vng("child_"+name) + child_temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + child_arg_map = {} # child arg name -> SubArrayRef + + # for kernel call: out1, out2 = func(in1, in2), we match out1, out2 to + # the written arguments, and in1, in2 to the readonly arguments in + # child kernel, according the order they appear in child.args + writes = child.get_written_variables() + reads = [arg.name for arg in child.args if arg.name not in writes] + writes = [arg.name for arg in child.args if arg.name in writes] + + if arg_map: + for inside, outside in six.iteritems(arg_map): + if inside not in child.arg_dict: + raise LoopyError("arg named '{0}' not in the child " + "kernel".format(inside)) + if inside in writes: + sar = [sar for sar in call.assignees + if sar.subscript.aggregate.name == outside] + if len(sar) != 1: + raise LoopyError("wrong number of variables " + "named '{0}'".format(outside)) + child_arg_map[inside], = sar + else: + sar = [sar for sar in call.expression.parameters + if sar.subscript.aggregate.name == outside] + if len(sar) != 1: + raise LoopyError("wrong number of variables " + "named '{0}'".format(outside)) + child_arg_map[inside], = sar + else: + if len(call.assignees) != len(writes): + raise LoopyError("expect {0} output variable(s), got {1}".format( + len(writes), len(call.assignees))) + if len(call.expression.parameters) != len(reads): + raise LoopyError("expect {0} input variable(s), got {1}".format( + len(reads), len(call.expression.parameters))) + for arg_name, sar in zip(writes, call.assignees): + child_arg_map[arg_name] = sar + for arg_name, sar in zip(reads, call.expression.parameters): + child_arg_map[arg_name] = sar + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + from loopy.isl_helpers import simplify_via_aff + from functools import reduce + + class 
KernelInliner(SubstitutionMapper): + """ + Mapper to replace variables (indices, temporaries, arguments) in + the inner kernel. + """ + def map_subscript(self, expr): + if expr.aggregate.name in child_arg_map: + aggregate = self.subst_func(expr.aggregate) + sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) + arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g. A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in arg_in.shape): + raise LoopyError( + "Argument: {0} in child kernel: {1} does not have " + "constant shape.".format(arg_in, child.name)) + inner_sizes = [int(np.prod(arg_in.shape[i+1:])) + for i in range(len(arg_in.shape))] + flatten_index = reduce( + lambda x, y: p.Sum((x, y)), + map(p.Product, zip(outer_indices, inner_sizes))) + flatten_index = simplify_via_aff(flatten_index) + + from loopy.symbolic import pw_aff_to_expr + bounds = [kernel.get_iname_bounds(i.name) + for i in sar.swept_inames] + sizes = [pw_aff_to_expr(b.size) for b in bounds] + if not all(isinstance(d, Integral) for d in sizes): + raise LoopyError( + "SubArrayRef: {0} in parent kernel: {1} does not have " + "swept inames with constant size.".format( + sar, kernel.name)) + + sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] + + new_indices = [] + for s in sizes: + ind = flatten_index // s + flatten_index = flatten_index - s * ind + new_indices.append(ind) + + # Lastly, map sweeping indices to indices in Subscripts + # This takes care of cases such as [i, j]: A[i+j, i-j] + index_map = dict(zip(sar.swept_inames, new_indices)) + index_mapper = SubstitutionMapper(make_subst_func(index_map)) + new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(child_temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(child_arg_map))) + subst_mapper = KernelInliner(make_subst_func(var_map)) + + ing = kernel.get_instruction_id_generator() + insn_id = {} + for insn in child.instructions: + insn_id[insn.id] = ing("child_"+insn.id) + + # {{{ root and leave instructions in child kernel + + dep_map = child.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of child kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing("child_start"), + within_inames=call.within_inames, + depends_on=call.depends_on + ) + noop_end = NoOpInstruction( + id=call.id, + within_inames=call.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for _insn in child.instructions: + insn = _insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(child_iname_map.get, 
insn.within_inames)) + within_inames = within_inames | call.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in child kernel + priority=call.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == call: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + return kernel + # vim: foldmethod=marker -- GitLab From 60704094dd8eb36ab1ee20fb09a33f41147c677f Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 27 Apr 2018 15:42:18 +0100 Subject: [PATCH 165/916] docstring and minor modifications --- loopy/transform/register_knl.py | 25 +++++++++++++++++++++++++ test/test_transform.py | 6 +++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py index 6d40942c9..6804e2972 100644 --- a/loopy/transform/register_knl.py +++ b/loopy/transform/register_knl.py @@ -115,6 +115,31 @@ def register_callable_kernel(parent, function_name, child): def inline_kernel(kernel, function, arg_map=None): + """ + This transformation inlines a callable child kernel into the parent kernel. + + :arg: kernel + + The parent kernel. + + :arg: function + + The name of the function call to which the callable kernel is inlined. + + :arg: arg_map + + Dictionary which maps argument names in the child kernel to variables + in the parnet kernel. If not provided, the arguments will be mapped + according to their access and position, i.e. the first argument in the + child kernel with write access will be mapped to the first assignee in + the function call, and so on. 
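
As a usage sketch, mirroring the calls exercised in test_transform.py later in
this patch (knl1 is the callee kernel and knl4 a caller containing a call to
'func'; the argument names are those of the test, not a general requirement):

    import loopy as lp

    # associate the callee kernel with the function name used in the caller
    knl4 = lp.register_callable_kernel(knl4, 'func', knl1)

    # inline it, explicitly pinning callee args a, b, c to caller variables
    # x, y, z; omitting the dict falls back to the positional matching
    # described above
    knl4 = lp.inline_kernel(knl4, "func", {"a": "x", "b": "y", "c": "z"})
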
+ + """ + + assert isinstance(kernel, LoopKernel) + assert isinstance(function, str) + if not arg_map: + assert isinstance(arg_map, dict) if function not in kernel.scoped_functions: raise LoopyError("function: {0} does not exist".format(function)) diff --git a/test/test_transform.py b/test/test_transform.py index c5180ead1..ee4627cfd 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -500,9 +500,12 @@ def test_inline_kernel(ctx_factory): knl4 = lp.register_callable_kernel(knl4, 'func', knl1) knl4 = lp.inline_kernel(knl4, "func", {"a": "x", "b": "y", "c": "z"}) evt, (out,) = knl4(queue, x=x, y=y) - z = np.tile(np.flip(x + y * 2, 0), [16, 1]) + z = x + y * 2 + z = z[::-1] + z = np.tile(z, [16, 1]) assert np.allclose(out, z) + def test_inline_kernel_2d(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -562,6 +565,7 @@ def test_inline_kernel_2d(ctx_factory): z = np.tile(np.transpose(x + y * 2), [16, 1, 1]) assert np.allclose(out, z) + def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 51cd5945fb12a32f1ef6f8bf72ac41f6a126d6f3 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Tue, 1 May 2018 14:09:20 +0100 Subject: [PATCH 166/916] remove register_knl.py --- loopy/transform/register_callable.py | 11 +- loopy/transform/register_knl.py | 375 --------------------------- 2 files changed, 4 insertions(+), 382 deletions(-) delete mode 100644 loopy/transform/register_knl.py diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 4ce3c72cc..3c5d8fbcf 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -244,7 +244,6 @@ def inline_kernel(kernel, function, arg_map=None): from pymbolic.mapper.substitutor import make_subst_func from loopy.symbolic import SubstitutionMapper from loopy.isl_helpers import simplify_via_aff - from functools import reduce class KernelInliner(SubstitutionMapper): """ @@ -267,11 +266,9 @@ def inline_kernel(kernel, function, arg_map=None): raise LoopyError( "Argument: {0} in child kernel: {1} does not have " "constant shape.".format(arg_in, child.name)) - inner_sizes = [int(np.prod(arg_in.shape[i+1:])) - for i in range(len(arg_in.shape))] - flatten_index = reduce( - lambda x, y: p.Sum((x, y)), - map(p.Product, zip(outer_indices, inner_sizes))) + flatten_index = sum( + idx * tag.stride + for idx, tag in zip(outer_indices, arg_in.dim_tags)) flatten_index = simplify_via_aff(flatten_index) from loopy.symbolic import pw_aff_to_expr @@ -289,7 +286,7 @@ def inline_kernel(kernel, function, arg_map=None): new_indices = [] for s in sizes: ind = flatten_index // s - flatten_index = flatten_index - s * ind + flatten_index -= s * ind new_indices.append(ind) # Lastly, map sweeping indices to indices in Subscripts diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py deleted file mode 100644 index 6804e2972..000000000 --- a/loopy/transform/register_knl.py +++ /dev/null @@ -1,375 +0,0 @@ -from __future__ import division, absolute_import - -__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject 
to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -""" - - -import six - -import numpy as np - -from loopy.kernel import LoopKernel -from loopy.kernel.creation import FunctionScoper -from loopy.diagnostic import LoopyError -from loopy.kernel.function_interface import CallableKernel - -from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, - CInstruction, _DataObliviousInstruction) - -__doc__ = """ -.. currentmodule:: loopy - -.. autofunction:: register_callable_kernel - -.. autofunction:: inline_kernel -""" - - -# {{{ main entrypoint - -def register_callable_kernel(parent, function_name, child): - """ - The purpose of this transformation is so that one can inoke the child - kernel in the parent kernel. - - :arg parent - - This is the "main" kernel which will mostly remain unaltered and one - can interpret it as stitching up the child kernel in the parent kernel. - - :arg function_name - - The name of the function call with which the child kernel must be - associated in the parent kernel - - :arg child - - This is like a function in every other language and this might be - invoked in one of the instructions of the parent kernel. - - ..note:: - - One should note that the kernels would go under stringent compatibilty - tests so that both of them can be confirmed to be made for each other. - """ - - # {{{ sanity checks - - assert isinstance(parent, LoopKernel) - assert isinstance(child, LoopKernel) - assert isinstance(function_name, str) - - # }}} - - # scoping the function - function_scoper = FunctionScoper(set([function_name])) - new_insns = [] - - for insn in parent.instructions: - if isinstance(insn, CallInstruction): - new_insn = insn.copy(expression=function_scoper(insn.expression)) - new_insns.append(new_insn) - elif isinstance(insn, (_DataObliviousInstruction, MultiAssignmentBase, - CInstruction)): - new_insns.append(insn) - else: - raise NotImplementedError("scope_functions not implemented for %s" % - type(insn)) - - # adding the scoped function to the scoped function dict of the parent - # kernel. - - scoped_functions = parent.scoped_functions.copy() - - if function_name in scoped_functions: - raise LoopyError("%s is already being used as a funciton name -- maybe" - "use a different name for registering the subkernel") - - scoped_functions[function_name] = CallableKernel(name=function_name, - subkernel=child) - - # returning the parent kernel with the new scoped function dictionary - return parent.copy(scoped_functions=scoped_functions, - instructions=new_insns) - -# }}} - - -def inline_kernel(kernel, function, arg_map=None): - """ - This transformation inlines a callable child kernel into the parent kernel. - - :arg: kernel - - The parent kernel. - - :arg: function - - The name of the function call to which the callable kernel is inlined. - - :arg: arg_map - - Dictionary which maps argument names in the child kernel to variables - in the parnet kernel. 
If not provided, the arguments will be mapped - according to their access and position, i.e. the first argument in the - child kernel with write access will be mapped to the first assignee in - the function call, and so on. - - """ - - assert isinstance(kernel, LoopKernel) - assert isinstance(function, str) - if not arg_map: - assert isinstance(arg_map, dict) - - if function not in kernel.scoped_functions: - raise LoopyError("function: {0} does not exist".format(function)) - - child = kernel.scoped_functions[function].subkernel - - for call in kernel.instructions: - if not isinstance(call, CallInstruction): - continue - if call.expression.function.name != function: - continue - - # {{{ duplicate and rename inames - - import islpy as isl - - vng = kernel.get_var_name_generator() - dim_type = isl.dim_type.set - - child_iname_map = {} - for iname in child.all_inames(): - child_iname_map[iname] = vng("child_"+iname) - - new_domains = [] - for domain in child.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - new_domain = new_domain.set_dim_name( - dim_type, i, child_iname_map[iname]) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains) - - # }}} - - # {{{ rename temporaries - - child_temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(child.temporary_variables): - new_name = vng("child_"+name) - child_temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - child_arg_map = {} # child arg name -> SubArrayRef - - # for kernel call: out1, out2 = func(in1, in2), we match out1, out2 to - # the written arguments, and in1, in2 to the readonly arguments in - # child kernel, according the order they appear in child.args - writes = child.get_written_variables() - reads = [arg.name for arg in child.args if arg.name not in writes] - writes = [arg.name for arg in child.args if arg.name in writes] - - if arg_map: - for inside, outside in six.iteritems(arg_map): - if inside not in child.arg_dict: - raise LoopyError("arg named '{0}' not in the child " - "kernel".format(inside)) - if inside in writes: - sar = [sar for sar in call.assignees - if sar.subscript.aggregate.name == outside] - if len(sar) != 1: - raise LoopyError("wrong number of variables " - "named '{0}'".format(outside)) - child_arg_map[inside], = sar - else: - sar = [sar for sar in call.expression.parameters - if sar.subscript.aggregate.name == outside] - if len(sar) != 1: - raise LoopyError("wrong number of variables " - "named '{0}'".format(outside)) - child_arg_map[inside], = sar - else: - if len(call.assignees) != len(writes): - raise LoopyError("expect {0} output variable(s), got {1}".format( - len(writes), len(call.assignees))) - if len(call.expression.parameters) != len(reads): - raise LoopyError("expect {0} input variable(s), got {1}".format( - len(reads), len(call.expression.parameters))) - for arg_name, sar in zip(writes, call.assignees): - child_arg_map[arg_name] = sar - for arg_name, sar in zip(reads, call.expression.parameters): - child_arg_map[arg_name] = sar - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - from loopy.symbolic import SubstitutionMapper - from loopy.isl_helpers import simplify_via_aff - from functools import reduce - - class 
KernelInliner(SubstitutionMapper): - """ - Mapper to replace variables (indices, temporaries, arguments) in - the inner kernel. - """ - def map_subscript(self, expr): - if expr.aggregate.name in child_arg_map: - aggregate = self.subst_func(expr.aggregate) - sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) - arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in arg_in.shape): - raise LoopyError( - "Argument: {0} in child kernel: {1} does not have " - "constant shape.".format(arg_in, child.name)) - inner_sizes = [int(np.prod(arg_in.shape[i+1:])) - for i in range(len(arg_in.shape))] - flatten_index = reduce( - lambda x, y: p.Sum((x, y)), - map(p.Product, zip(outer_indices, inner_sizes))) - flatten_index = simplify_via_aff(flatten_index) - - from loopy.symbolic import pw_aff_to_expr - bounds = [kernel.get_iname_bounds(i.name) - for i in sar.swept_inames] - sizes = [pw_aff_to_expr(b.size) for b in bounds] - if not all(isinstance(d, Integral) for d in sizes): - raise LoopyError( - "SubArrayRef: {0} in parent kernel: {1} does not have " - "swept inames with constant size.".format( - sar, kernel.name)) - - sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] - - new_indices = [] - for s in sizes: - ind = flatten_index // s - flatten_index = flatten_index - s * ind - new_indices.append(ind) - - # Lastly, map sweeping indices to indices in Subscripts - # This takes care of cases such as [i, j]: A[i+j, i-j] - index_map = dict(zip(sar.swept_inames, new_indices)) - index_mapper = SubstitutionMapper(make_subst_func(index_map)) - new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(child_arg_map))) - subst_mapper = KernelInliner(make_subst_func(var_map)) - - ing = kernel.get_instruction_id_generator() - insn_id = {} - for insn in child.instructions: - insn_id[insn.id] = ing("child_"+insn.id) - - # {{{ root and leave instructions in child kernel - - dep_map = child.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of child kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing("child_start"), - within_inames=call.within_inames, - depends_on=call.depends_on - ) - noop_end = NoOpInstruction( - id=call.id, - within_inames=call.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for _insn in child.instructions: - insn = _insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(child_iname_map.get, 
insn.within_inames)) - within_inames = within_inames | call.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in child kernel - priority=call.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == call: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} - - return kernel - - -# vim: foldmethod=marker -- GitLab From 1c5cfa2da7167f191640f1d9029b85080d1319a9 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Wed, 2 May 2018 17:40:11 +0100 Subject: [PATCH 167/916] updates based on feedbacks on MR --- loopy/__init__.py | 3 +- loopy/kernel/function_interface.py | 7 +- loopy/preprocess.py | 239 +++++++++++++++++++++++++- loopy/transform/register_callable.py | 242 +-------------------------- test/test_transform.py | 22 +-- 5 files changed, 253 insertions(+), 260 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 1c7951dc0..a5850ec0a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -119,7 +119,7 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, - register_function_lookup, inline_kernel) + register_function_lookup) # }}} @@ -230,7 +230,6 @@ __all__ = [ "add_barrier", "register_callable_kernel", "register_function_lookup", - "inline_kernel", # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 368267d76..79c9cb2e1 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -440,12 +440,12 @@ class CallableKernel(InKernelCallable): """ fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) + "name_in_target", "inline"]) init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target") + "name_in_target", "inline") def __init__(self, subkernel, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): + arg_id_to_descr=None, name_in_target=None, inline=False): super(CallableKernel, self).__init__( arg_id_to_dtype=arg_id_to_dtype, @@ -454,6 +454,7 @@ class CallableKernel(InKernelCallable): subkernel = subkernel.copy(name=name_in_target) self.name_in_target = name_in_target + self.inline = inline self.subkernel = subkernel.copy( args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) if arg.dtype is not None else arg for arg in subkernel.args]) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index bf1467c16..242422d61 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -38,7 +38,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import CombineMapper +from loopy.symbolic import CombineMapper, SubstitutionMapper, pw_aff_to_expr from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -2477,6 +2477,239 @@ def make_functions_ready_for_codegen(kernel): # }}} +# {{{ inline callable kernel + +class 
KernelInliner(SubstitutionMapper): + """Mapper to replace variables (indices, temporaries, arguments) in the + callee kernel with variables in the caller kernel. + + :arg caller: the caller kernel + :arg arg_map: dict of argument name to variables in caller + :arg arg_dict: dict of argument name to arguments in callee + """ + + def __init__(self, subst_func, caller, arg_map, arg_dict): + super(KernelInliner, self).__init__(subst_func) + self.caller = caller + self.arg_map = arg_map + self.arg_dict = arg_dict + + def map_subscript(self, expr): + if expr.aggregate.name in self.arg_map: + import numpy as np + from pymbolic.mapper.substitutor import make_subst_func + + aggregate = self.subst_func(expr.aggregate) + sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller + arg = self.arg_dict[expr.aggregate.name] # Arg in callee + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g. A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in arg.shape): + raise LoopyError( + "Argument: {0} in callee kernel: {1} does not have " + "constant shape.".format(arg)) + flatten_index = sum( + idx * tag.stride + for idx, tag in zip(outer_indices, arg.dim_tags)) + from loopy.isl_helpers import simplify_via_aff + flatten_index = simplify_via_aff(flatten_index) + + bounds = [self.caller.get_iname_bounds(i.name) + for i in sar.swept_inames] + sizes = [pw_aff_to_expr(b.size) for b in bounds] + if not all(isinstance(d, Integral) for d in sizes): + raise LoopyError( + "SubArrayRef: {0} in caller kernel does not have " + "swept inames with constant size.".format(sar)) + + sizes = [int(np.prod(sizes[i + 1:])) for i in range(len(sizes))] + + new_indices = [] + for s in sizes: + ind = flatten_index // s + flatten_index -= s * ind + new_indices.append(ind) + + # Lastly, map sweeping indices to indices in Subscripts + # This takes care of cases such as [i, j]: A[i+j, i-j] + index_map = dict(zip(sar.swept_inames, new_indices)) + index_mapper = SubstitutionMapper(make_subst_func(index_map)) + new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + + +def inline_callable_kernels(kernel): + + from loopy import CallInstruction + import islpy as isl + + for call in kernel.instructions: + if not isinstance(call, CallInstruction): + continue + + callable = kernel.scoped_functions[call.expression.function.name] + if not callable.inline: + continue + + callee = callable.subkernel + callee_label = callee.name[:4] + "_" # label used to generate new names + + # {{{ duplicate and rename inames + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + dim_type = isl.dim_type.set + + iname_map = {} + for iname in callee.all_inames(): + iname_map[iname] = vng(callee_label+iname) + + new_domains = [] + for domain in callee.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + new_domain = new_domain.set_dim_name( + dim_type, i, iname_map[iname]) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains) + + # }}} + + # {{{ rename temporaries + + temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in 
six.iteritems(callee.temporary_variables): + new_name = vng(callee_label+name) + temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) + + assignees = call.assignees # writes + parameters = call.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + if isinstance(call.expression, CallWithKwargs): + from loopy.kernel.function_interface import get_kw_pos_association + + _, pos_to_kw = get_kw_pos_association(callee) + kw_parameters = call.expression.kw_parameters + for i in range(len(parameters), len(parameters) + len(kw_parameters)): + parameters = parameters + (kw_parameters[pos_to_kw[i]],) + + assignee_pos = 0 + parameter_pos = 0 + for i, arg in enumerate(callee.args): + if arg.direction == "out": + arg_map[arg.name] = assignees[assignee_pos] + assignee_pos += 1 + else: + arg_map[arg.name] = parameters[parameter_pos] + parameter_pos += 1 + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner( + make_subst_func(var_map), kernel, arg_map, callee.arg_dict) + + insn_id = {} + for insn in callee.instructions: + insn_id[insn.id] = ing(callee_label+insn.id) + + # {{{ root and leave instructions in callee kernel + + dep_map = callee.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=call.within_inames, + depends_on=call.depends_on + ) + noop_end = NoOpInstruction( + id=call.id, + within_inames=call.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for _insn in callee.instructions: + insn = _insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(iname_map.get, insn.within_inames)) + within_inames = within_inames | call.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=call.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == call: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + return kernel + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2548,6 +2781,9 @@ def preprocess_kernel(kernel, device=None): # defaults from being 
applied. kernel = realize_reduction(kernel, unknown_types_ok=False) + # inlining callable kernels that are marked with inline=True. + kernel = inline_callable_kernels(kernel) + # type specialize functions that were missed during the type inference. kernel = make_functions_ready_for_codegen(kernel) @@ -2563,6 +2799,7 @@ def preprocess_kernel(kernel, device=None): # inferring the shape and dim_tags of the arguments involved in a function # call. kernel = infer_arg_descr(kernel) + # tuning the functions in the kernel to align with the grid sizes. kernel = infer_hw_axes_sizes(kernel) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 3c5d8fbcf..8300fa374 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -22,10 +22,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six - -import numpy as np - from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel from pytools import ImmutableRecord @@ -82,13 +78,15 @@ class RegisterCalleeKernel(ImmutableRecord): return None -def register_callable_kernel(caller_kernel, function_name, callee_kernel): +def register_callable_kernel(caller_kernel, function_name, callee_kernel, + inline=False): """Returns a copy of *caller_kernel*, which would resolve *function_name* in an expression as a call to *callee_kernel*. :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. :arg function_name: An instance of :class:`str`. :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. + :arg inline: Boolean flag of inlining callee kernel into caller. """ # {{{ sanity checks @@ -130,7 +128,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # kernel. 
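
For reference, the caller-side usage that this flag enables, as exercised by
the test changes later in this series (knl1 is the callee and knl2 the caller
from that test; queue, x and y come from the test's PyOpenCL setup):

    import loopy as lp

    # registering with inline=True marks the callee for inlining during
    # preprocessing, so no separate inline_kernel() call is needed
    knl2 = lp.register_callable_kernel(knl2, 'func', knl1, inline=True)
    evt, (out,) = knl2(queue, x=x, y=y)
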
callable_kernel = CallableKernel(subkernel=callee_kernel.copy( target=caller_kernel.target, - is_master_kernel=False)) + is_master_kernel=False), inline=inline) # disabling global barriers for callee kernel from loopy import set_options @@ -141,236 +139,4 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # }}} - -def inline_kernel(kernel, function, arg_map=None): - - from loopy import CallInstruction, LoopyError - - if function not in kernel.scoped_functions: - raise LoopyError("function: {0} does not exist".format(function)) - - child = kernel.scoped_functions[function].subkernel - - for call in kernel.instructions: - if not isinstance(call, CallInstruction): - continue - if call.expression.function.name != function: - continue - - # {{{ duplicate and rename inames - - import islpy as isl - - vng = kernel.get_var_name_generator() - dim_type = isl.dim_type.set - - child_iname_map = {} - for iname in child.all_inames(): - child_iname_map[iname] = vng("child_"+iname) - - new_domains = [] - for domain in child.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - new_domain = new_domain.set_dim_name( - dim_type, i, child_iname_map[iname]) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains) - - # }}} - - # {{{ rename temporaries - - child_temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(child.temporary_variables): - new_name = vng("child_"+name) - child_temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - child_arg_map = {} # child arg name -> SubArrayRef - - # for kernel call: out1, out2 = func(in1, in2), we match out1, out2 to - # the written arguments, and in1, in2 to the readonly arguments in - # child kernel, according the order they appear in child.args - writes = child.get_written_variables() - reads = [arg.name for arg in child.args if arg.name not in writes] - writes = [arg.name for arg in child.args if arg.name in writes] - - if arg_map: - for inside, outside in six.iteritems(arg_map): - if inside not in child.arg_dict: - raise LoopyError("arg named '{0}' not in the child " - "kernel".format(inside)) - if inside in writes: - sar = [sar for sar in call.assignees - if sar.subscript.aggregate.name == outside] - if len(sar) != 1: - raise LoopyError("wrong number of variables " - "named '{0}'".format(outside)) - child_arg_map[inside], = sar - else: - sar = [sar for sar in call.expression.parameters - if sar.subscript.aggregate.name == outside] - if len(sar) != 1: - raise LoopyError("wrong number of variables " - "named '{0}'".format(outside)) - child_arg_map[inside], = sar - else: - if len(call.assignees) != len(writes): - raise LoopyError("expect {0} output variable(s), got {1}".format( - len(writes), len(call.assignees))) - if len(call.expression.parameters) != len(reads): - raise LoopyError("expect {0} input variable(s), got {1}".format( - len(reads), len(call.expression.parameters))) - for arg_name, sar in zip(writes, call.assignees): - child_arg_map[arg_name] = sar - for arg_name, sar in zip(reads, call.expression.parameters): - child_arg_map[arg_name] = sar - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - from loopy.symbolic import SubstitutionMapper - from loopy.isl_helpers import 
simplify_via_aff - - class KernelInliner(SubstitutionMapper): - """ - Mapper to replace variables (indices, temporaries, arguments) in - the inner kernel. - """ - def map_subscript(self, expr): - if expr.aggregate.name in child_arg_map: - aggregate = self.subst_func(expr.aggregate) - sar = child_arg_map[expr.aggregate.name] # SubArrayRef (parent) - arg_in = child.arg_dict[expr.aggregate.name] # Arg (child) - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in arg_in.shape): - raise LoopyError( - "Argument: {0} in child kernel: {1} does not have " - "constant shape.".format(arg_in, child.name)) - flatten_index = sum( - idx * tag.stride - for idx, tag in zip(outer_indices, arg_in.dim_tags)) - flatten_index = simplify_via_aff(flatten_index) - - from loopy.symbolic import pw_aff_to_expr - bounds = [kernel.get_iname_bounds(i.name) - for i in sar.swept_inames] - sizes = [pw_aff_to_expr(b.size) for b in bounds] - if not all(isinstance(d, Integral) for d in sizes): - raise LoopyError( - "SubArrayRef: {0} in parent kernel: {1} does not have " - "swept inames with constant size.".format( - sar, kernel.name)) - - sizes = [int(np.prod(sizes[i+1:])) for i in range(len(sizes))] - - new_indices = [] - for s in sizes: - ind = flatten_index // s - flatten_index -= s * ind - new_indices.append(ind) - - # Lastly, map sweeping indices to indices in Subscripts - # This takes care of cases such as [i, j]: A[i+j, i-j] - index_map = dict(zip(sar.swept_inames, new_indices)) - index_mapper = SubstitutionMapper(make_subst_func(index_map)) - new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(child_temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(child_arg_map))) - subst_mapper = KernelInliner(make_subst_func(var_map)) - - ing = kernel.get_instruction_id_generator() - insn_id = {} - for insn in child.instructions: - insn_id[insn.id] = ing("child_"+insn.id) - - # {{{ root and leave instructions in child kernel - - dep_map = child.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of child kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing("child_start"), - within_inames=call.within_inames, - depends_on=call.depends_on - ) - noop_end = NoOpInstruction( - id=call.id, - within_inames=call.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for _insn in child.instructions: - insn = _insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(child_iname_map.get, insn.within_inames)) - within_inames = within_inames | call.within_inames - depends_on = 
frozenset(map(insn_id.get, insn.depends_on)) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in child kernel - priority=call.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == call: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} - - return kernel - # vim: foldmethod=marker diff --git a/test/test_transform.py b/test/test_transform.py index ee4627cfd..b08d674a5 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -480,25 +480,17 @@ def test_inline_kernel(ctx_factory): ] ) - knl2 = lp.register_callable_kernel(knl2, 'func', knl1) + knl2 = lp.register_callable_kernel(knl2, 'func', knl1, inline=True) z = np.tile(x + y * 2, [16, 1]) - - knl2_arg_map = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) - evt, (out, ) = knl2_arg_map(queue, x=x, y=y) - assert np.allclose(out, z) - - knl2_no_arg_map = lp.inline_kernel(knl2, "func") - evt, (out, ) = knl2_no_arg_map(queue, x=x, y=y) + evt, (out, ) = knl2(queue, x=x, y=y) assert np.allclose(out, z) - knl3 = lp.register_callable_kernel(knl3, 'func', knl1) - knl3 = lp.inline_kernel(knl3, "func", {"a": "x", "b": "y", "c": "z"}) + knl3 = lp.register_callable_kernel(knl3, 'func', knl1, inline=True) evt, (out,) = knl3(queue, x=x, y=y) z = np.tile(x + y * 2, [16, 1]).transpose() assert np.allclose(out, z) - knl4 = lp.register_callable_kernel(knl4, 'func', knl1) - knl4 = lp.inline_kernel(knl4, "func", {"a": "x", "b": "y", "c": "z"}) + knl4 = lp.register_callable_kernel(knl4, 'func', knl1, inline=True) evt, (out,) = knl4(queue, x=x, y=y) z = x + y * 2 z = z[::-1] @@ -553,14 +545,12 @@ def test_inline_kernel_2d(ctx_factory): ] ) - knl2 = lp.register_callable_kernel(knl2, 'func', knl1) - knl2 = lp.inline_kernel(knl2, "func", {"a": "x", "b": "y", "c": "z"}) + knl2 = lp.register_callable_kernel(knl2, 'func', knl1, inline=True) evt, (out, ) = knl2(queue, x=x, y=y) z = np.tile(x + y * 2, [16, 1, 1]) assert np.allclose(out, z) - knl3 = lp.register_callable_kernel(knl3, 'func', knl1) - knl3 = lp.inline_kernel(knl3, "func", {"a": "x", "b": "y", "c": "z"}) + knl3 = lp.register_callable_kernel(knl3, 'func', knl1, inline=True) evt, (out,) = knl3(queue, x=x, y=y) z = np.tile(np.transpose(x + y * 2), [16, 1, 1]) assert np.allclose(out, z) -- GitLab From bc0ca75f385e96b92e1ea90803a769af3e6e8979 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 3 May 2018 11:07:58 +0100 Subject: [PATCH 168/916] test for callable type before inlining --- loopy/preprocess.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 242422d61..e4494bbda 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2549,6 +2549,7 @@ class KernelInliner(SubstitutionMapper): def inline_callable_kernels(kernel): from loopy import CallInstruction + from loopy.kernel.function_interface import CallableKernel import islpy as isl for call in kernel.instructions: @@ -2556,6 +2557,10 @@ def inline_callable_kernels(kernel): continue callable = kernel.scoped_functions[call.expression.function.name] + + if not isinstance(callable, CallableKernel): + continue + if not callable.inline: continue -- GitLab From 18ee74a8aeeb1a718b30e3c6a036347aed034f34 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 3 May 2018 11:48:52 
+0100 Subject: [PATCH 169/916] test for function is scoped before inlining --- loopy/preprocess.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e4494bbda..8fe7acb78 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2556,6 +2556,9 @@ def inline_callable_kernels(kernel): if not isinstance(call, CallInstruction): continue + if call.expression.function.name not in kernel.scoped_functions: + continue + callable = kernel.scoped_functions[call.expression.function.name] if not isinstance(callable, CallableKernel): @@ -2773,6 +2776,10 @@ def preprocess_kernel(kernel, device=None): check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) + # Inlining callable kernels that are marked with inline=True. + # This should happen after type inference but before other transformations. + kernel = inline_callable_kernels(kernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel) @@ -2786,9 +2793,6 @@ def preprocess_kernel(kernel, device=None): # defaults from being applied. kernel = realize_reduction(kernel, unknown_types_ok=False) - # inlining callable kernels that are marked with inline=True. - kernel = inline_callable_kernels(kernel) - # type specialize functions that were missed during the type inference. kernel = make_functions_ready_for_codegen(kernel) -- GitLab From fe3e5166836831486f0946861f262e841008c511 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 3 May 2018 12:31:14 +0100 Subject: [PATCH 170/916] test for Call expression before inlining --- loopy/preprocess.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 8fe7acb78..1b1d9be38 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2550,12 +2550,17 @@ def inline_callable_kernels(kernel): from loopy import CallInstruction from loopy.kernel.function_interface import CallableKernel + from pymbolic.primitives import Call + import islpy as isl for call in kernel.instructions: if not isinstance(call, CallInstruction): continue + if not isinstance(call.expression, Call): + continue + if call.expression.function.name not in kernel.scoped_functions: continue -- GitLab From 22bb8c78378a0477df04b2da4f4a2e8afd284f62 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Tue, 1 May 2018 17:41:37 +0100 Subject: [PATCH 171/916] packing arguments for external functions --- loopy/preprocess.py | 144 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1b1d9be38..321f31e45 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2282,6 +2282,147 @@ def infer_arg_descr(kernel): # }}} +# {{{ + +def need_packing(tags_needed, tags): + if len(tags_needed) != len(tags): + return True + + strides_needed = (tag.stride for tag in tags_needed) + strides = (tag.stride for tag in tags) + return any(s1!=s2 for s1, s2 in zip(strides_needed, strides)) + +def add_pack_and_unpack(kernel): + """ + """ + + new_domains = [] + new_tmps = kernel.temporary_variables.copy() + new_calls = {} + + for call in kernel.instructions: + if not isinstance(call, CallInstruction): + continue + + callable = kernel.scoped_functions[call.expression.function.name] + from loopy.kernel.function_interface import CallableKernel + if isinstance(callable, CallableKernel): + # Not external functions + continue + + vng = kernel.get_var_name_generator() + ing = 
kernel.get_instruction_id_generator() + + parameters = call.expression.parameters + packing = [] + new_params = [] + + from loopy.kernel.data import IlpBaseTag, VectorizeTag + import islpy as isl + from pymbolic import var + + dim_type = isl.dim_type.set + ilp_inames = set(iname for iname in call.within_inames if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag))) + new_ilp_inames = set() + ilp_inames_map = {} + for iname in ilp_inames: + new_iname_name = vng(iname + "_ilp") + ilp_inames_map[var(iname)] = var(new_iname_name) + new_ilp_inames.add(new_iname_name) + for iname in ilp_inames: + new_domain = kernel.get_inames_domain(iname).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + if old_iname in ilp_inames: + new_domain = new_domain.set_dim_name( + dim_type, i, ilp_inames_map[var(old_iname)].name) + new_domains.append(new_domain) + + from loopy.symbolic import SubArrayRef + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + for i,p in enumerate(parameters): + if isinstance(p, SubArrayRef): + des = callable.arg_id_to_descr[i] + name = p.subscript.aggregate.name + if name in kernel.temporary_variables: + array = kernel.temporary_variables[name] + else: + assert name in kernel.arg_dict + array = kernel.arg_dict[name] + dim_tags, _ = p.get_sub_array_dim_tags_and_shape(array.dim_tags, array.shape) + # Check if memory layout match + if need_packing(des.dim_tags, dim_tags): + new_swept_inames = ilp_inames_map.copy() + for iname in p.swept_inames: + new_swept_inames[iname] = var(vng(iname.name + "_pack")) + new_domain = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + new_domain = new_domain.set_dim_name( + dim_type, i, new_swept_inames[var(old_iname)].name) + new_domains.append(new_domain) + + pack_name = vng(name + "_pack") + + from loopy.kernel.data import TemporaryVariable + + pack_tmp = TemporaryVariable( + name=pack_name, + shape=des.shape, + dtype=array.dtype, + scope=array.scope, + dim_tags=des.dim_tags + ) + new_tmps[pack_name] = pack_tmp + + from loopy import Assignment + subst_mapper = SubstitutionMapper(make_subst_func(new_swept_inames)) + + packing.append(Assignment( + assignee=var(pack_name).index(tuple(new_swept_inames[i] for i in p.swept_inames)), + expression=subst_mapper.map_subscript(p.subscript), + within_inames=call.within_inames - ilp_inames | set(new_swept_inames[i].name for i in p.swept_inames) | new_ilp_inames, + depends_on=call.depends_on, + id=ing(call.id+"_pack") + )) + new_params.append(SubArrayRef(p.swept_inames, var(pack_name).index(p.swept_inames))) + else: + new_params.append(p) + else: + new_params.append(p) + if packing: + subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) + _call = call.with_transformed_expressions(subst_mapper) + new_expr = _call.expression.function() + new_params = list(map(subst_mapper, new_params)) + packing.append( + _call.copy( + depends_on=_call.depends_on | set(pack.id for pack in packing), + within_inames=_call.within_inames - ilp_inames | new_ilp_inames, + expression=_call.expression.function(*new_params) + ) + ) + new_calls[call] = packing + + if new_calls: + new_instructions = [] + for insn in kernel.instructions: + if insn in new_calls: + new_instructions.extend(new_calls[insn]) + else: + new_instructions.append(insn) + kernel = kernel.copy( + domains=kernel.domains + new_domains, + 
instructions=new_instructions, + temporary_variables=new_tmps + ) + return kernel + +# }}} + + # {{{ class HWAxesInferenceMapper(CombineMapper): @@ -2814,6 +2955,9 @@ def preprocess_kernel(kernel, device=None): # call. kernel = infer_arg_descr(kernel) + # packing args for external functions if necessary + kernel = add_pack_and_unpack(kernel) + # tuning the functions in the kernel to align with the grid sizes. kernel = infer_hw_axes_sizes(kernel) -- GitLab From f7c3792ec133a701865a69e48857a54dc91d0095 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 15 May 2018 07:41:42 -0500 Subject: [PATCH 172/916] Added comments/minor changes in function_interface::emit_call --- loopy/kernel/function_interface.py | 52 ++++++++++++++++++++++-------- loopy/target/c/__init__.py | 29 +++++------------ 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 79c9cb2e1..f30fc6599 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -156,6 +156,15 @@ class InKernelCallable(ImmutableRecord): Negative ids in the mapping attributes indicate the result arguments + .. automethod:: __init__ + .. automethod:: with_types + .. automethod:: with_descrs + .. automethod:: with_target + .. automethod:: with_hw_axes_sizes + .. automethod:: generate_preambles + .. automethod:: emit_call + .. automethod:: emit_call_insn + .. automethod:: is_ready_for_codegen """ fields = set(["arg_id_to_dtype", "arg_id_to_descr"]) @@ -200,21 +209,20 @@ class InKernelCallable(ImmutableRecord): Return values are denoted by negative integers, with the first returned value identified as *-1*. - :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a - new :class:`InKernelCallable` specialized for the given types, - and *arg_id_to_descr* is a mapping of the same form as the - argument above, however it may have more information present. - Any argument information exists both by its positional and - its keyword identifier. + :returns: a copy of *self* which is a new instance of + :class:`InKernelCallable` specialized for the given types, and + *arg_id_to_descr* is a mapping of the same form as the argument above, + however it may have more information present. Any argument information + exists both by its positional and its keyword identifier. """ raise NotImplementedError() def with_target(self, target): """ - Returns a copy with all the ``dtypes`` in - ``in_knl_callable.arg_id_to_dtype`` as instances of - :class:`loopy.LoopyType`. + Returns a copy of *self* with all the ``dtypes`` in + ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. Refer + :meth:`loopy.types.LoopyType.with_target`. :arg target: An instance of :class:`loopy.target.TargetBase`. """ @@ -241,10 +249,13 @@ class InKernelCallable(ImmutableRecord): def with_hw_axes_sizes(self, local_size, global_size): """ + Returns a copy of *self* with modifications to comply with the grid + sizes ``(local_size, global_size)`` of the kernel in which it is + supposed to be called. + :arg local_size: An instance of :class:`islpy.PwAff`. :arg global_size: An instance of :class:`islpy.PwAff`. """ - raise NotImplementedError() def is_ready_for_codegen(self): @@ -253,7 +264,7 @@ class InKernelCallable(ImmutableRecord): self.arg_id_to_descr is not None) def generate_preambles(self, target): - """ This would generate the target specific preamble. + """ Yields the target specific preamble. 
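
To make the contract concrete, a minimal sketch of an implementation in a
subclass, assuming loopy's usual convention that each yielded preamble is a
(sort_key, code_text) pair; the class name and macro below are made up for
illustration:

    from loopy.kernel.function_interface import ScalarCallable

    class MySquareCallable(ScalarCallable):
        def generate_preambles(self, target):
            # emitted once at the top of the generated source; the "50_"
            # prefix only orders this preamble relative to others
            yield ("50_my_square", "#define MY_SQUARE(x) ((x) * (x))")
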
""" raise NotImplementedError() @@ -262,6 +273,18 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a tuple of ``(call, assignee_is_returned)`` which is the target + facing function call that would be seen in the generated code. ``call`` + is an instance of ``pymbolic.primitives.Call`` ``assignee_is_returned`` + is an instance of :class:`bool` to indicate if the assignee is returned + by value of C-type targets. + + :Example: If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is + interpreted in the target as ``a = f(c, d, &b)``. If + ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted + in the target as the statement ``f(c, d, &a, &b)``. + """ raise NotImplementedError() @@ -407,7 +430,10 @@ class ScalarCallable(InKernelCallable): dtype_to_type_context(target, tgt_dtype), tgt_dtype).expr)) - return var(self.name_in_target)(*c_parameters) + # assignee is returned whenever the size of assignees is non zero. + assignee_is_returned = len(assignees) > 0 + + return var(self.name_in_target)(*c_parameters), assignee_is_returned def generate_preambles(self, target): return @@ -604,7 +630,7 @@ class CallableKernel(InKernelCallable): for par, par_dtype in zip( parameters, par_dtypes)] - return var(self.name_in_target)(*c_parameters) + return var(self.name_in_target)(*c_parameters), False # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 86e7bea81..b8dcfcf77 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -887,35 +887,22 @@ class CASTBuilder(ASTBuilderBase): if in_knl_callable.name_in_target == 'loopy_make_tuple': return self.emit_tuple_assignment(codegen_state, insn) - in_knl_callable_as_call = in_knl_callable.emit_call_insn( + in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( insn=insn, target=self.target, expression_to_code_mapper=ecm) - from loopy.kernel.function_interface import (ScalarCallable, - CallableKernel) - if isinstance(in_knl_callable, ScalarCallable): - if insn.assignees: - from cgen import Assign - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - return Assign(lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), - in_knl_callable_as_call)) - else: - # No return scalar callables - from cgen import ExpressionStatement - return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), - in_knl_callable_as_call)) - - elif isinstance(in_knl_callable, CallableKernel): + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( CExpression(self.get_c_expression_to_code_mapper(), in_knl_callable_as_call)) - else: - raise NotImplementedError("Unexpected type %s of In Kernel " - "Callable." 
% type(in_knl_callable)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): -- GitLab From e6e9632e3fc35402396c10be9e9b8a4762421c0f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 15 May 2018 14:40:50 -0500 Subject: [PATCH 173/916] Change in pattern for TJ's code --- loopy/kernel/function_interface.py | 246 ++++++++++++++++++++++++- loopy/preprocess.py | 258 ++------------------------- loopy/transform/register_callable.py | 6 +- 3 files changed, 258 insertions(+), 252 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index f30fc6599..934a8bad4 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -36,7 +36,8 @@ from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, - RuleAwareIdentityMapper, SubstitutionRuleExpander) + RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper, + pw_aff_to_expr, ) # {{{ argument descriptors @@ -444,6 +445,78 @@ class ScalarCallable(InKernelCallable): # }}} +# {{{ kernel inliner mapper + +class KernelInliner(SubstitutionMapper): + """Mapper to replace variables (indices, temporaries, arguments) in the + callee kernel with variables in the caller kernel. + + :arg caller: the caller kernel + :arg arg_map: dict of argument name to variables in caller + :arg arg_dict: dict of argument name to arguments in callee + """ + + def __init__(self, subst_func, caller, arg_map, arg_dict): + super(KernelInliner, self).__init__(subst_func) + self.caller = caller + self.arg_map = arg_map + self.arg_dict = arg_dict + + def map_subscript(self, expr): + if expr.aggregate.name in self.arg_map: + import numpy as np + from pymbolic.mapper.substitutor import make_subst_func + + aggregate = self.subst_func(expr.aggregate) + sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller + arg = self.arg_dict[expr.aggregate.name] # Arg in callee + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g. 
A[3, 2] from outside and B[6] from inside + from numbers import Integral + print(arg.shape) + if not all(isinstance(d, Integral) for d in arg.shape): + raise LoopyError( + "Argument: {0} in callee kernel: {1} does not have " + "constant shape.".format(arg)) + flatten_index = sum( + idx * tag.stride + for idx, tag in zip(outer_indices, arg.dim_tags)) + from loopy.isl_helpers import simplify_via_aff + flatten_index = simplify_via_aff(flatten_index) + + bounds = [self.caller.get_iname_bounds(i.name) + for i in sar.swept_inames] + sizes = [pw_aff_to_expr(b.size) for b in bounds] + if not all(isinstance(d, Integral) for d in sizes): + raise LoopyError( + "SubArrayRef: {0} in caller kernel does not have " + "swept inames with constant size.".format(sar)) + + sizes = [int(np.prod(sizes[i + 1:])) for i in range(len(sizes))] + + new_indices = [] + for s in sizes: + ind = flatten_index // s + flatten_index -= s * ind + new_indices.append(ind) + + # Lastly, map sweeping indices to indices in Subscripts + # This takes care of cases such as [i, j]: A[i+j, i-j] + index_map = dict(zip(sar.swept_inames, new_indices)) + index_mapper = SubstitutionMapper(make_subst_func(index_map)) + new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + +# }}} + + # {{{ callable kernel class CallableKernel(InKernelCallable): @@ -466,12 +539,12 @@ class CallableKernel(InKernelCallable): """ fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target", "inline"]) + "name_in_target", "should_inline"]) init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target", "inline") + "name_in_target", "should_inline") def __init__(self, subkernel, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None, inline=False): + arg_id_to_descr=None, name_in_target=None, should_inline=False): super(CallableKernel, self).__init__( arg_id_to_dtype=arg_id_to_dtype, @@ -480,7 +553,7 @@ class CallableKernel(InKernelCallable): subkernel = subkernel.copy(name=name_in_target) self.name_in_target = name_in_target - self.inline = inline + self.should_inline = should_inline self.subkernel = subkernel.copy( args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) if arg.dtype is not None else arg for arg in subkernel.args]) @@ -572,9 +645,9 @@ class CallableKernel(InKernelCallable): self.name_in_target is not None) def generate_preambles(self, target): - """ This would generate the target specific preamble. + """ Yields the *target* specific preambles. """ - # FIXME: This is not correct, as the code code preamble generated + # TODO: This is not correct, as the code code preamble generated # during the code generationg of the child kernel, does not guarantee # that this thing would be updated. for preamble in self.subkernel.preambles: @@ -582,6 +655,165 @@ class CallableKernel(InKernelCallable): return + def inline_within_kernel(self, kernel, instruction): + """ + Returns a copy of *kernel* with the *instruction* in the *kernel* + replaced by inlining :attr:`subkernel` within it. 
+ """ + from loopy.preprocess import preprocess_kernel + callee_knl = preprocess_kernel(self.subkernel) + + import islpy as isl + + callee_label = callee_knl.name[:4] + "_" + + # {{{ duplicate and rename inames + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + dim_type = isl.dim_type.set + + iname_map = {} + for iname in callee_knl.all_inames(): + iname_map[iname] = vng(callee_label+iname) + + new_domains = [] + for domain in callee_knl.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + new_domain = new_domain.set_dim_name( + dim_type, i, iname_map[iname]) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains) + + # }}} + + # {{{ rename temporaries + + temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(callee_knl.temporary_variables): + new_name = vng(callee_label+name) + temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) + + assignees = instruction.assignees # writes + parameters = instruction.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + if isinstance(instruction.expression, CallWithKwargs): + from loopy.kernel.function_interface import get_kw_pos_association + + _, pos_to_kw = get_kw_pos_association(callee_knl) + kw_parameters = instruction.expression.kw_parameters + for i in range(len(parameters), len(parameters) + len(kw_parameters)): + parameters = parameters + (kw_parameters[pos_to_kw[i]],) + + assignee_pos = 0 + parameter_pos = 0 + for i, arg in enumerate(callee_knl.args): + if arg.direction == "out": + arg_map[arg.name] = assignees[assignee_pos] + assignee_pos += 1 + else: + arg_map[arg.name] = parameters[parameter_pos] + parameter_pos += 1 + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner( + make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) + + insn_id = {} + for insn in callee_knl.instructions: + insn_id[insn.id] = ing(callee_label+insn.id) + + # {{{ root and leave instructions in callee kernel + + dep_map = callee_knl.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=instruction.within_inames, + depends_on=instruction.depends_on + ) + noop_end = NoOpInstruction( + id=instruction.id, + within_inames=instruction.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for insn in callee_knl.instructions: + print(insn) + 
print('Hurrah') + insn = insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(iname_map.get, insn.within_inames)) + within_inames = within_inames | instruction.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == instruction: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + return kernel + def emit_call_insn(self, insn, target, expression_to_code_mapper): assert self.is_ready_for_codegen() diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1b1d9be38..99acb3ac7 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -38,7 +38,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import CombineMapper, SubstitutionMapper, pw_aff_to_expr +from loopy.symbolic import CombineMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -2479,244 +2479,18 @@ def make_functions_ready_for_codegen(kernel): # {{{ inline callable kernel -class KernelInliner(SubstitutionMapper): - """Mapper to replace variables (indices, temporaries, arguments) in the - callee kernel with variables in the caller kernel. - - :arg caller: the caller kernel - :arg arg_map: dict of argument name to variables in caller - :arg arg_dict: dict of argument name to arguments in callee - """ - - def __init__(self, subst_func, caller, arg_map, arg_dict): - super(KernelInliner, self).__init__(subst_func) - self.caller = caller - self.arg_map = arg_map - self.arg_dict = arg_dict - - def map_subscript(self, expr): - if expr.aggregate.name in self.arg_map: - import numpy as np - from pymbolic.mapper.substitutor import make_subst_func - - aggregate = self.subst_func(expr.aggregate) - sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller - arg = self.arg_dict[expr.aggregate.name] # Arg in callee - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. 
A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in arg.shape): - raise LoopyError( - "Argument: {0} in callee kernel: {1} does not have " - "constant shape.".format(arg)) - flatten_index = sum( - idx * tag.stride - for idx, tag in zip(outer_indices, arg.dim_tags)) - from loopy.isl_helpers import simplify_via_aff - flatten_index = simplify_via_aff(flatten_index) - - bounds = [self.caller.get_iname_bounds(i.name) - for i in sar.swept_inames] - sizes = [pw_aff_to_expr(b.size) for b in bounds] - if not all(isinstance(d, Integral) for d in sizes): - raise LoopyError( - "SubArrayRef: {0} in caller kernel does not have " - "swept inames with constant size.".format(sar)) - - sizes = [int(np.prod(sizes[i + 1:])) for i in range(len(sizes))] - - new_indices = [] - for s in sizes: - ind = flatten_index // s - flatten_index -= s * ind - new_indices.append(ind) - - # Lastly, map sweeping indices to indices in Subscripts - # This takes care of cases such as [i, j]: A[i+j, i-j] - index_map = dict(zip(sar.swept_inames, new_indices)) - index_mapper = SubstitutionMapper(make_subst_func(index_map)) - new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - - def inline_callable_kernels(kernel): - - from loopy import CallInstruction - from loopy.kernel.function_interface import CallableKernel - from pymbolic.primitives import Call - - import islpy as isl - - for call in kernel.instructions: - if not isinstance(call, CallInstruction): - continue - - if not isinstance(call.expression, Call): - continue - - if call.expression.function.name not in kernel.scoped_functions: - continue - - callable = kernel.scoped_functions[call.expression.function.name] - - if not isinstance(callable, CallableKernel): - continue - - if not callable.inline: - continue - - callee = callable.subkernel - callee_label = callee.name[:4] + "_" # label used to generate new names - - # {{{ duplicate and rename inames - - vng = kernel.get_var_name_generator() - ing = kernel.get_instruction_id_generator() - dim_type = isl.dim_type.set - - iname_map = {} - for iname in callee.all_inames(): - iname_map[iname] = vng(callee_label+iname) - - new_domains = [] - for domain in callee.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - new_domain = new_domain.set_dim_name( - dim_type, i, iname_map[iname]) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains) - - # }}} - - # {{{ rename temporaries - - temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(callee.temporary_variables): - new_name = vng(callee_label+name) - temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - arg_map = {} # callee arg name -> caller symbols (e.g. 
SubArrayRef) - - assignees = call.assignees # writes - parameters = call.expression.parameters # reads - - # add keyword parameters - from pymbolic.primitives import CallWithKwargs - - if isinstance(call.expression, CallWithKwargs): - from loopy.kernel.function_interface import get_kw_pos_association - - _, pos_to_kw = get_kw_pos_association(callee) - kw_parameters = call.expression.kw_parameters - for i in range(len(parameters), len(parameters) + len(kw_parameters)): - parameters = parameters + (kw_parameters[pos_to_kw[i]],) - - assignee_pos = 0 - parameter_pos = 0 - for i, arg in enumerate(callee.args): - if arg.direction == "out": - arg_map[arg.name] = assignees[assignee_pos] - assignee_pos += 1 - else: - arg_map[arg.name] = parameters[parameter_pos] - parameter_pos += 1 - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(arg_map))) - subst_mapper = KernelInliner( - make_subst_func(var_map), kernel, arg_map, callee.arg_dict) - - insn_id = {} - for insn in callee.instructions: - insn_id[insn.id] = ing(callee_label+insn.id) - - # {{{ root and leave instructions in callee kernel - - dep_map = callee.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of callee kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing(callee_label+"_start"), - within_inames=call.within_inames, - depends_on=call.depends_on - ) - noop_end = NoOpInstruction( - id=call.id, - within_inames=call.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for _insn in callee.instructions: - insn = _insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(iname_map.get, insn.within_inames)) - within_inames = within_inames | call.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=call.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == call: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} + """ + Returns a copy of *kernel* with the callable kernels inlined. 
+ """ + old_insns = kernel.instructions + for insn in old_insns: + if isinstance(insn, CallInstruction): + in_knl_callable = kernel.scoped_functions[insn.expression.function.name] + from loopy.kernel.function_interface import CallableKernel + if isinstance(in_knl_callable, CallableKernel) and ( + in_knl_callable.should_inline): + kernel = in_knl_callable.inline_within_kernel(kernel, insn) return kernel @@ -2781,10 +2555,6 @@ def preprocess_kernel(kernel, device=None): check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) - # Inlining callable kernels that are marked with inline=True. - # This should happen after type inference but before other transformations. - kernel = inline_callable_kernels(kernel) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel) @@ -2817,6 +2587,10 @@ def preprocess_kernel(kernel, device=None): # tuning the functions in the kernel to align with the grid sizes. kernel = infer_hw_axes_sizes(kernel) + # Inlining callable kernels that are marked with inline=True. + # This should happen after type inference but before other transformations. + kernel = inline_callable_kernels(kernel) + # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 8300fa374..57b86a92f 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -79,14 +79,14 @@ class RegisterCalleeKernel(ImmutableRecord): def register_callable_kernel(caller_kernel, function_name, callee_kernel, - inline=False): + should_inline=False): """Returns a copy of *caller_kernel*, which would resolve *function_name* in an expression as a call to *callee_kernel*. :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. :arg function_name: An instance of :class:`str`. :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - :arg inline: Boolean flag of inlining callee kernel into caller. + :arg should_inline: Boolean flag of inlining callee kernel into caller. """ # {{{ sanity checks @@ -128,7 +128,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel, # kernel. 
callable_kernel = CallableKernel(subkernel=callee_kernel.copy( target=caller_kernel.target, - is_master_kernel=False), inline=inline) + is_master_kernel=False), should_inline=should_inline) # disabling global barriers for callee kernel from loopy import set_options -- GitLab From 542c3906682a2ba27e61d73ae248db58a5326e11 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 15 May 2018 17:10:46 -0500 Subject: [PATCH 174/916] Made changes in TJs code to handle preprocessing correctly --- loopy/kernel/function_interface.py | 50 ++++++++++++------------------ 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 934a8bad4..c9259eb13 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -36,8 +36,7 @@ from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, - RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper, - pw_aff_to_expr, ) + RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper) # {{{ argument descriptors @@ -464,12 +463,14 @@ class KernelInliner(SubstitutionMapper): def map_subscript(self, expr): if expr.aggregate.name in self.arg_map: - import numpy as np - from pymbolic.mapper.substitutor import make_subst_func aggregate = self.subst_func(expr.aggregate) sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller - arg = self.arg_dict[expr.aggregate.name] # Arg in callee + callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee + if aggregate.name in self.caller.arg_dict: + caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller + else: + caller_arg = self.caller.temporary_variables[aggregate.name] # Firstly, map inner inames to outer inames. outer_indices = self.map_tuple(expr.index_tuple) @@ -477,39 +478,30 @@ class KernelInliner(SubstitutionMapper): # Next, reshape to match dimension of outer arrays. # We can have e.g. 
A[3, 2] from outside and B[6] from inside from numbers import Integral - print(arg.shape) - if not all(isinstance(d, Integral) for d in arg.shape): + if not all(isinstance(d, Integral) for d in callee_arg.shape): raise LoopyError( "Argument: {0} in callee kernel: {1} does not have " - "constant shape.".format(arg)) - flatten_index = sum( + "constant shape.".format(callee_arg)) + + flatten_index = 0 + for i, idx in enumerate(sar.get_begin_subscript().index_tuple): + flatten_index += idx*caller_arg.dim_tags[i].stride + + flatten_index += sum( idx * tag.stride - for idx, tag in zip(outer_indices, arg.dim_tags)) + for idx, tag in zip(outer_indices, callee_arg.dim_tags)) + from loopy.isl_helpers import simplify_via_aff flatten_index = simplify_via_aff(flatten_index) - bounds = [self.caller.get_iname_bounds(i.name) - for i in sar.swept_inames] - sizes = [pw_aff_to_expr(b.size) for b in bounds] - if not all(isinstance(d, Integral) for d in sizes): - raise LoopyError( - "SubArrayRef: {0} in caller kernel does not have " - "swept inames with constant size.".format(sar)) - - sizes = [int(np.prod(sizes[i + 1:])) for i in range(len(sizes))] - new_indices = [] - for s in sizes: - ind = flatten_index // s - flatten_index -= s * ind + for dim_tag in caller_arg.dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) new_indices.append(ind) - # Lastly, map sweeping indices to indices in Subscripts - # This takes care of cases such as [i, j]: A[i+j, i-j] - index_map = dict(zip(sar.swept_inames, new_indices)) - index_mapper = SubstitutionMapper(make_subst_func(index_map)) - new_indices = index_mapper.map_tuple(sar.subscript.index_tuple) new_indices = tuple(simplify_via_aff(i) for i in new_indices) + return aggregate.index(tuple(new_indices)) else: return super(KernelInliner, self).map_subscript(expr) @@ -782,8 +774,6 @@ class CallableKernel(InKernelCallable): inner_insns = [noop_start] for insn in callee_knl.instructions: - print(insn) - print('Hurrah') insn = insn.with_transformed_expressions(subst_mapper) within_inames = frozenset(map(iname_map.get, insn.within_inames)) within_inames = within_inames | instruction.within_inames -- GitLab From 48e75db16ba259c7d6da5a8b7e3dec9c6b7eed82 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 15 May 2018 17:58:42 -0500 Subject: [PATCH 175/916] Shortened the tests and made changes to include parallelization within inline kernels. 
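For reference, the pattern the parametrized tests now exercise looks like this
(a minimal sketch; `caller_knl`, `callee_knl`, `queue`, `x`, `y` and the name
'linear_combo' are placeholders, not identifiers added by this commit):

    # should_inline=True asks preprocessing to substitute the callee body
    # into the caller; should_inline=False keeps a separately generated function.
    knl = lp.register_callable_kernel(
            caller_knl, 'linear_combo', callee_knl, should_inline=True)
    evt, (out,) = knl(queue, x=x, y=y)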
--- loopy/kernel/function_interface.py | 9 +- loopy/preprocess.py | 12 ++- test/test_transform.py | 154 +++-------------------------- 3 files changed, 28 insertions(+), 147 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index c9259eb13..4d0ea57a9 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -670,15 +670,22 @@ class CallableKernel(InKernelCallable): iname_map[iname] = vng(callee_label+iname) new_domains = [] + new_iname_to_tag = {} for domain in callee_knl.domains: new_domain = domain.copy() for i in range(new_domain.n_dim()): iname = new_domain.get_dim_name(dim_type, i) + if iname in callee_knl.iname_to_tag: + new_iname_to_tag[iname_map[iname]] = ( + callee_knl.iname_to_tag[iname]) new_domain = new_domain.set_dim_name( dim_type, i, iname_map[iname]) new_domains.append(new_domain) - kernel = kernel.copy(domains=kernel.domains + new_domains) + new_iname_to_tag.update(kernel.iname_to_tag) + + kernel = kernel.copy(domains=kernel.domains + new_domains, + iname_to_tag=new_iname_to_tag) # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 99acb3ac7..63301bab3 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2486,11 +2486,13 @@ def inline_callable_kernels(kernel): old_insns = kernel.instructions for insn in old_insns: if isinstance(insn, CallInstruction): - in_knl_callable = kernel.scoped_functions[insn.expression.function.name] - from loopy.kernel.function_interface import CallableKernel - if isinstance(in_knl_callable, CallableKernel) and ( - in_knl_callable.should_inline): - kernel = in_knl_callable.inline_within_kernel(kernel, insn) + if insn.expression.function.name in kernel.scoped_functions: + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.name] + from loopy.kernel.function_interface import CallableKernel + if isinstance(in_knl_callable, CallableKernel) and ( + in_knl_callable.should_inline): + kernel = in_knl_callable.inline_within_kernel(kernel, insn) return kernel diff --git a/test/test_transform.py b/test/test_transform.py index b08d674a5..26b558165 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -204,7 +204,8 @@ def test_register_function_lookup(ctx_factory): assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 -def test_register_knl(ctx_factory): +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) n = 2 ** 4 @@ -242,9 +243,9 @@ def test_register_knl(ctx_factory): ) child_knl = lp.register_callable_kernel( - child_knl, 'linear_combo1', grandchild_knl) + child_knl, 'linear_combo1', grandchild_knl, inline) knl = lp.register_callable_kernel( - parent_knl, 'linear_combo2', child_knl) + parent_knl, 'linear_combo2', child_knl, inline) evt, (out, ) = knl(queue, x=x, y=y) @@ -252,7 +253,8 @@ def test_register_knl(ctx_factory): np.linalg.norm(2*x+3*y))) < 1e-15 -def test_slices_with_negative_step(ctx_factory): +@pytest.mark.parametrize("inline", [False, True]) +def test_slices_with_negative_step(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) n = 2 ** 4 @@ -288,7 +290,7 @@ def test_slices_with_negative_step(ctx_factory): ) knl = lp.register_callable_kernel( - parent_knl, 'linear_combo', child_knl) + parent_knl, 'linear_combo', child_knl, inline) evt, (out, ) = knl(queue, x=x, y=y) @@ -296,7 +298,8 @@ def test_slices_with_negative_step(ctx_factory): np.linalg.norm(2*x+3*y))) < 1e-15 -def 
test_register_knl_with_call_with_kwargs(ctx_factory): +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_call_with_kwargs(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -326,7 +329,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory): e=[j, l]: c[i, j, k, l, m]) """) knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl) + caller_knl, 'linear_combo', callee_knl, inline) evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) @@ -343,7 +346,8 @@ def test_register_knl_with_call_with_kwargs(ctx_factory): assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 -def test_register_knl_with_hw_axes(ctx_factory): +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_hw_axes(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -370,7 +374,7 @@ def test_register_knl_with_hw_axes(ctx_factory): caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl) + caller_knl, 'linear_combo', callee_knl, inline) evt, (out, ) = knl(queue, x=x_dev, y=y_dev) @@ -424,138 +428,6 @@ def test_multi_arg_array_call(ctx_factory): assert(norm(out_dict['min_index'][0] - np.argmin(b)) < tol) -def test_inline_kernel(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 16 - - x = np.random.rand(n) - y = np.random.rand(n) - - knl1 = lp.make_kernel( - "{[i]: 0 <= i < 16}", - """ - for i - c[i] = a[i] + 2*b[i] - end - """ - ) - - knl2 = lp.make_kernel( - "{[i, j]: 0 <= i, j < 16}", - """ - for j - [i]: z[j, i] = func([i]: x[i], [i]: y[i]) - end - """, - kernel_data=[ - lp.GlobalArg("x", np.float64, (16,)), - lp.GlobalArg("y", np.float64, (16,)), "..." - ] - ) - - knl3 = lp.make_kernel( - "{[i, j]: 0 <= i, j < 16}", - """ - for j - [i]: z[i, j] = func([i]: x[i], [i]: y[i]) - end - """, - kernel_data=[ - lp.GlobalArg("x", np.float64, (16,)), - lp.GlobalArg("y", np.float64, (16,)), "..." - ] - ) - - knl4 = lp.make_kernel( - "{[i, j]: 0 <= i, j < 16}", - """ - for j - [i]: z[j, 15-i] = func([i]: x[i], [i]: y[i]) - end - """, - kernel_data=[ - lp.GlobalArg("x", np.float64, (16,)), - lp.GlobalArg("y", np.float64, (16,)), "..." - ] - ) - - knl2 = lp.register_callable_kernel(knl2, 'func', knl1, inline=True) - z = np.tile(x + y * 2, [16, 1]) - evt, (out, ) = knl2(queue, x=x, y=y) - assert np.allclose(out, z) - - knl3 = lp.register_callable_kernel(knl3, 'func', knl1, inline=True) - evt, (out,) = knl3(queue, x=x, y=y) - z = np.tile(x + y * 2, [16, 1]).transpose() - assert np.allclose(out, z) - - knl4 = lp.register_callable_kernel(knl4, 'func', knl1, inline=True) - evt, (out,) = knl4(queue, x=x, y=y) - z = x + y * 2 - z = z[::-1] - z = np.tile(z, [16, 1]) - assert np.allclose(out, z) - - -def test_inline_kernel_2d(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 16 - - x = np.random.rand(n ** 2).reshape((n, n)) - y = np.random.rand(n ** 2).reshape((n, n)) - - knl1 = lp.make_kernel( - "{[i, j]: 0 <= i, j < 16}", - """ - for i, j - c[i, j] = a[i, j] + 2*b[i, j] - end - """, - kernel_data=[ - lp.GlobalArg("a", np.float64, (16, 16)), - lp.GlobalArg("b", np.float64, (16, 16)), "..." - ] - ) - - knl2 = lp.make_kernel( - "{[i, j, k]: 0 <= i, j, k < 16}", - """ - for k - [i, j]: z[k, i, j] = func([i, j]: x[i, j], [i, j]: y[i, j]) - end - """, - kernel_data=[ - lp.GlobalArg("x", np.float64, (16, 16)), - lp.GlobalArg("y", np.float64, (16, 16)), "..." 
- ] - ) - - knl3 = lp.make_kernel( - "{[i, j, k]: 0 <= i, j, k < 16}", - """ - for k - [i, j]: z[k, j, i] = func([i, j]: x[i, j], [i, j]: y[i, j]) - end - """, - kernel_data=[ - lp.GlobalArg("x", np.float64, (16, 16)), - lp.GlobalArg("y", np.float64, (16, 16)), "..." - ] - ) - - knl2 = lp.register_callable_kernel(knl2, 'func', knl1, inline=True) - evt, (out, ) = knl2(queue, x=x, y=y) - z = np.tile(x + y * 2, [16, 1, 1]) - assert np.allclose(out, z) - - knl3 = lp.register_callable_kernel(knl3, 'func', knl1, inline=True) - evt, (out,) = knl3(queue, x=x, y=y) - z = np.tile(np.transpose(x + y * 2), [16, 1, 1]) - assert np.allclose(out, z) - - def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 0db506694419c3f43e8e07744256165470373e4a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 15 May 2018 21:00:33 -0500 Subject: [PATCH 176/916] comment rewording. --- loopy/kernel/function_interface.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 4d0ea57a9..eb20c26fe 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -135,7 +135,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): class InKernelCallable(ImmutableRecord): """ - Describes a callable encountered in a kernel. + An abstract interface to define a callable encountered in a kernel. .. attribute:: name @@ -513,10 +513,11 @@ class KernelInliner(SubstitutionMapper): class CallableKernel(InKernelCallable): """ - Records information about in order to make the callee kernel compatible to be - called from a caller kernel. The :meth:`loopy.register_callable_kernel` - should be called in order to initiate association between a funciton in - caller kernel and the callee kernel. + Records informations about a callee kernel. Also provides interface through + member methods to make the callee kernel compatible to be called from a + caller kernel. The :meth:`loopy.register_callable_kernel` should be called + in order to initiate association between a function in caller kernel and + the callee kernel. The :meth:`CallableKernel.with_types` should be called in order to match the ``dtypes`` of the arguments that are shared between the caller and the -- GitLab From 6c866f87dab82fea839bfadf8f65ed9cd718b1dd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 16 May 2018 11:28:03 -0500 Subject: [PATCH 177/916] changed the signature of function_magnler --- loopy/__init__.py | 2 +- loopy/kernel/function_interface.py | 6 +++--- loopy/type_inference.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a5850ec0a..49ba932fa 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -359,7 +359,7 @@ def register_symbol_manglers(kernel, manglers): def register_function_manglers(kernel, manglers): """ - :arg manglers: list of functions of signature ``(target, name, arg_dtypes)`` + :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)`` returning a :class:`loopy.CallMangleInfo`. :returns: *kernel* with *manglers* registered """ diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index eb20c26fe..b78a6dbef 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -873,7 +873,7 @@ class ManglerCallable(ScalarCallable): .. 
attribute:: function_mangler - A function of signature ``(target, name , arg_dtypes)`` and returns an + A function of signature ``(kernel, name , arg_dtypes)`` and returns an instance of ``loopy.CallMangleInfo``. """ fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", @@ -911,7 +911,7 @@ class ManglerCallable(ScalarCallable): arg_dtypes = tuple(arg_id_to_dtype[key] for key in sorted_keys if key >= 0) - mangle_result = self.function_mangler(kernel.target, self.name, + mangle_result = self.function_mangler(kernel, self.name, arg_dtypes) if mangle_result: new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) @@ -934,7 +934,7 @@ class ManglerCallable(ScalarCallable): arg_dtypes = tuple(self.arg_id_to_dtype[key] for key in sorted_keys if key >= 0) - return self.function_mangler(kernel.target, self.name, arg_dtypes) + return self.function_mangler(kernel, self.name, arg_dtypes) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index e4f6ec0a4..53d7074f7 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -354,7 +354,7 @@ class TypeInferenceMapper(CombineMapper): # realized function. mangle_result = None for function_mangler in self.kernel.function_manglers: - mangle_result = function_mangler(self.kernel.target, identifier, + mangle_result = function_mangler(self.kernel, identifier, arg_dtypes) if mangle_result: # found a match. -- GitLab From 6a5b2c40a858402f964339e61fe2635af1a29842 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 16 May 2018 12:32:16 -0500 Subject: [PATCH 178/916] Minor error in complex trigonometric functions --- loopy/target/pyopencl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index fe2f15b67..430770803 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -249,7 +249,7 @@ class PyOpenCLCallable(ScalarCallable): raise LoopyTypeError("unexpected complex type '%s'" % dtype) return self.copy(name_in_target="%s_%s" % (tpname, name), - arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + arg_id_to_dtype={0: dtype, -1: dtype}) else: # function calls for floating parameters. dtype = dtype.numpy_dtype -- GitLab From 50ba1929ab769d9bcc600b944adee52ae4ea0e36 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 23 May 2018 12:15:05 -0500 Subject: [PATCH 179/916] Some minor fixes in type inference. --- loopy/kernel/data.py | 9 ++++++++- loopy/preprocess.py | 6 +++--- loopy/target/pyopencl.py | 8 ++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index ab66a5e87..1c927b8af 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -250,9 +250,16 @@ class KernelArgument(ImmutableRecord): target = kwargs.pop("target", None) dtype = kwargs.pop("dtype", None) + + if 'for_atomic' in kwargs: + for_atomic = kwargs['for_atomic'] + else: + for_atomic = False + from loopy.types import to_loopy_type dtype = to_loopy_type( - dtype, allow_auto=True, allow_none=True, target=target) + dtype, allow_auto=True, allow_none=True, for_atomic=for_atomic, + target=target) import loopy as lp if dtype is lp.auto: diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 63301bab3..d4d793971 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2570,9 +2570,6 @@ def preprocess_kernel(kernel, device=None): # defaults from being applied. 
kernel = realize_reduction(kernel, unknown_types_ok=False) - # type specialize functions that were missed during the type inference. - kernel = make_functions_ready_for_codegen(kernel) - # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators # need to be duplicated by this. @@ -2586,6 +2583,9 @@ def preprocess_kernel(kernel, device=None): # call. kernel = infer_arg_descr(kernel) + # type specialize functions that were missed during the type inference. + kernel = make_functions_ready_for_codegen(kernel) + # tuning the functions in the kernel to align with the grid sizes. kernel = infer_hw_axes_sizes(kernel) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 430770803..17d702136 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -252,13 +252,13 @@ class PyOpenCLCallable(ScalarCallable): arg_id_to_dtype={0: dtype, -1: dtype}) else: # function calls for floating parameters. - dtype = dtype.numpy_dtype - if dtype.kind in ('u', 'i'): - dtype = np.float32 + numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ('u', 'i'): + dtype = dtype.copy(numpy_dtype=np.float32) if name == 'abs': name = 'fabs' return self.copy(name_in_target=name, - arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + arg_id_to_dtype={0: dtype, -1: dtype}) return self.copy(arg_id_to_dtype=arg_id_to_dtype) -- GitLab From b48ab2e595eec30a85f2568746656fb5636c019a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 23 May 2018 13:39:16 -0500 Subject: [PATCH 180/916] changes the coefficient collector of swept inames. --- loopy/symbolic.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 55bd543fc..66fa8620f 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -749,6 +749,20 @@ class VariableInAnExpression(CombineMapper): return False +class SweptInameStrideCollector(CoefficientCollectorBase): + """ + Mapper to compute the coefficient swept inames for :class:`SubArrayRef`. + """ + def map_algebraic_leaf(self, expr): + # subscripts that are not involved in :attr:`target_names` are treated + # as constants. + if isinstance(expr, p.Subscript) and (self.target_names is None or + expr.aggregate.name not in self.target_names): + return {1: expr} + + return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) + + class SubArrayRef(p.Expression): """Represents a generalized sliced notation of an array. @@ -790,6 +804,7 @@ class SubArrayRef(p.Expression): **Example:** Consider ``[i, k]: a[i, j, k, l]``. The beginning subscript would be ``a[0, j, 0, l]`` """ + # TODO: Set the zero to the minimum value of the iname. swept_inames_to_zeros = dict( (swept_iname.name, 0) for swept_iname in self.swept_inames) @@ -815,7 +830,7 @@ class SubArrayRef(p.Expression): linearized_index = sum(dim_tag.stride*iname for dim_tag, iname in zip(arg_dim_tags, self.subscript.index_tuple)) - strides_as_dict = CoefficientCollector(tuple(iname.name for iname in + strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) sub_dim_tags = tuple(DimTag(strides_as_dict[iname]) for iname in self.swept_inames) -- GitLab From 68ac270e677944468eb20c93ad6088d277c8af74 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 10 Jun 2018 09:14:52 -0500 Subject: [PATCH 181/916] Added some changes to TJs code. 
--- loopy/kernel/function_interface.py | 24 ++- loopy/preprocess.py | 146 +------------- loopy/transform/pack_and_unpack_args.py | 250 ++++++++++++++++++++++++ loopy/transform/register_callable.py | 8 +- 4 files changed, 277 insertions(+), 151 deletions(-) create mode 100644 loopy/transform/pack_and_unpack_args.py diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 79c9cb2e1..91d9b2911 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -439,12 +439,12 @@ class CallableKernel(InKernelCallable): sizes for the :attr:`subkernel` of the callable. """ - fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", + fields = set(["name", "subkernel", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target", "inline"]) - init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", + init_arg_names = ("name", "subkernel", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target", "inline") - def __init__(self, subkernel, arg_id_to_dtype=None, + def __init__(self, name, subkernel, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None, inline=False): super(CallableKernel, self).__init__( @@ -453,6 +453,7 @@ class CallableKernel(InKernelCallable): if name_in_target is not None: subkernel = subkernel.copy(name=name_in_target) + self.name = name self.name_in_target = name_in_target self.inline = inline self.subkernel = subkernel.copy( @@ -533,6 +534,23 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr) + def with_packing_for_args(self): + from loopy.preprocess import preprocess_kernel + subkernel = preprocess_kernel(self.subkernel) + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + arg_id_to_descr = {} + + for pos, kw in pos_to_kw.items(): + arg = subkernel.arg_dict[kw] + arg_id_to_descr[pos] = ArrayArgDescriptor( + shape=arg.shape, + dim_tags=arg.dim_tags, + mem_scope='Global') + + return self.copy(subkernel=subkernel, + arg_id_to_descr=arg_id_to_descr) + def with_hw_axes_sizes(self, gsize, lsize): return self.copy( subkernel=self.subkernel.copy( diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 321f31e45..3cf1e1df9 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2282,147 +2282,6 @@ def infer_arg_descr(kernel): # }}} -# {{{ - -def need_packing(tags_needed, tags): - if len(tags_needed) != len(tags): - return True - - strides_needed = (tag.stride for tag in tags_needed) - strides = (tag.stride for tag in tags) - return any(s1!=s2 for s1, s2 in zip(strides_needed, strides)) - -def add_pack_and_unpack(kernel): - """ - """ - - new_domains = [] - new_tmps = kernel.temporary_variables.copy() - new_calls = {} - - for call in kernel.instructions: - if not isinstance(call, CallInstruction): - continue - - callable = kernel.scoped_functions[call.expression.function.name] - from loopy.kernel.function_interface import CallableKernel - if isinstance(callable, CallableKernel): - # Not external functions - continue - - vng = kernel.get_var_name_generator() - ing = kernel.get_instruction_id_generator() - - parameters = call.expression.parameters - packing = [] - new_params = [] - - from loopy.kernel.data import IlpBaseTag, VectorizeTag - import islpy as isl - from pymbolic import var - - dim_type = isl.dim_type.set - ilp_inames = set(iname for iname in call.within_inames if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag))) - new_ilp_inames = set() - ilp_inames_map = {} - for iname 
in ilp_inames: - new_iname_name = vng(iname + "_ilp") - ilp_inames_map[var(iname)] = var(new_iname_name) - new_ilp_inames.add(new_iname_name) - for iname in ilp_inames: - new_domain = kernel.get_inames_domain(iname).copy() - for i in range(new_domain.n_dim()): - old_iname = new_domain.get_dim_name(dim_type, i) - if old_iname in ilp_inames: - new_domain = new_domain.set_dim_name( - dim_type, i, ilp_inames_map[var(old_iname)].name) - new_domains.append(new_domain) - - from loopy.symbolic import SubArrayRef - from pymbolic.mapper.substitutor import make_subst_func - from loopy.symbolic import SubstitutionMapper - - for i,p in enumerate(parameters): - if isinstance(p, SubArrayRef): - des = callable.arg_id_to_descr[i] - name = p.subscript.aggregate.name - if name in kernel.temporary_variables: - array = kernel.temporary_variables[name] - else: - assert name in kernel.arg_dict - array = kernel.arg_dict[name] - dim_tags, _ = p.get_sub_array_dim_tags_and_shape(array.dim_tags, array.shape) - # Check if memory layout match - if need_packing(des.dim_tags, dim_tags): - new_swept_inames = ilp_inames_map.copy() - for iname in p.swept_inames: - new_swept_inames[iname] = var(vng(iname.name + "_pack")) - new_domain = kernel.get_inames_domain(iname.name).copy() - for i in range(new_domain.n_dim()): - old_iname = new_domain.get_dim_name(dim_type, i) - new_domain = new_domain.set_dim_name( - dim_type, i, new_swept_inames[var(old_iname)].name) - new_domains.append(new_domain) - - pack_name = vng(name + "_pack") - - from loopy.kernel.data import TemporaryVariable - - pack_tmp = TemporaryVariable( - name=pack_name, - shape=des.shape, - dtype=array.dtype, - scope=array.scope, - dim_tags=des.dim_tags - ) - new_tmps[pack_name] = pack_tmp - - from loopy import Assignment - subst_mapper = SubstitutionMapper(make_subst_func(new_swept_inames)) - - packing.append(Assignment( - assignee=var(pack_name).index(tuple(new_swept_inames[i] for i in p.swept_inames)), - expression=subst_mapper.map_subscript(p.subscript), - within_inames=call.within_inames - ilp_inames | set(new_swept_inames[i].name for i in p.swept_inames) | new_ilp_inames, - depends_on=call.depends_on, - id=ing(call.id+"_pack") - )) - new_params.append(SubArrayRef(p.swept_inames, var(pack_name).index(p.swept_inames))) - else: - new_params.append(p) - else: - new_params.append(p) - if packing: - subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) - _call = call.with_transformed_expressions(subst_mapper) - new_expr = _call.expression.function() - new_params = list(map(subst_mapper, new_params)) - packing.append( - _call.copy( - depends_on=_call.depends_on | set(pack.id for pack in packing), - within_inames=_call.within_inames - ilp_inames | new_ilp_inames, - expression=_call.expression.function(*new_params) - ) - ) - new_calls[call] = packing - - if new_calls: - new_instructions = [] - for insn in kernel.instructions: - if insn in new_calls: - new_instructions.extend(new_calls[insn]) - else: - new_instructions.append(insn) - kernel = kernel.copy( - domains=kernel.domains + new_domains, - instructions=new_instructions, - temporary_variables=new_tmps - ) - return kernel - -# }}} - - # {{{ class HWAxesInferenceMapper(CombineMapper): @@ -2955,11 +2814,8 @@ def preprocess_kernel(kernel, device=None): # call. kernel = infer_arg_descr(kernel) - # packing args for external functions if necessary - kernel = add_pack_and_unpack(kernel) - # tuning the functions in the kernel to align with the grid sizes. 
- kernel = infer_hw_axes_sizes(kernel) + # kernel = infer_hw_axes_sizes(kernel) # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py new file mode 100644 index 000000000..f6a748eef --- /dev/null +++ b/loopy/transform/pack_and_unpack_args.py @@ -0,0 +1,250 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Tianjiao Sun" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import CallInstruction +from loopy.symbolic import SubArrayRef + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: pack_and_unpack_args_for_call +""" + + +# {{{ main entrypoint + +def pack_and_unpack_args_for_call(kernel, call_name, args=None): + """ + """ + new_domains = [] + new_tmps = kernel.temporary_variables.copy() + new_calls = {} + + for insn in kernel.instructions: + if not isinstance(insn, CallInstruction): + # pack and unpack call only be done for CallInstructions. + continue + + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.name] + + if in_knl_callable.name != call_name: + # not the function we're looking for. + continue + in_knl_callable = in_knl_callable.with_packing_for_args() + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + + parameters = insn.expression.parameters + if args is None: + args = [par.subscript.aggregate.name for par in parameters if + isinstance(par, SubArrayRef)] + [assignee.subscript.aggregate.name for + assignee in insn.assignees if isinstance(assignee, SubArrayRef)] + + # {{{ sanity checks for args + + for arg in args: + found_sub_array_ref = False + for par in parameters + insn.assignees: + if isinstance(par, SubArrayRef) and ( + par.subscript.aggregate.name == arg): + found_sub_array_ref = True + break + if not found_sub_array_ref: + raise LoopyError("No match found for packing arg '%s' of call '%s' " + "at insn '%s'." 
% (arg, call_name, insn.id)) + + # }}} + + packing = [] + unpacking = [] + new_id_to_parameters = {} + + from loopy.kernel.data import IlpBaseTag, VectorizeTag + import islpy as isl + from pymbolic import var + + dim_type = isl.dim_type.set + ilp_inames = set(iname for iname in insn.within_inames if isinstance( + kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag))) + new_ilp_inames = set() + ilp_inames_map = {} + for iname in ilp_inames: + new_iname_name = vng(iname + "_ilp") + ilp_inames_map[var(iname)] = var(new_iname_name) + new_ilp_inames.add(new_iname_name) + for iname in ilp_inames: + new_domain = kernel.get_inames_domain(iname).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + if old_iname in ilp_inames: + new_domain = new_domain.set_dim_name( + dim_type, i, ilp_inames_map[var(old_iname)].name) + new_domains.append(new_domain) + + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + id_to_parameters = tuple(enumerate(parameters)) + tuple( + (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) + + for id, p in id_to_parameters: + if isinstance(p, SubArrayRef) and p.subscript.aggregate.name in args: + new_swept_inames = ilp_inames_map.copy() + for iname in p.swept_inames: + new_swept_inames[iname] = var(vng(iname.name + "_pack")) + new_domain = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + new_domain = new_domain.set_dim_name( + dim_type, i, new_swept_inames[var(old_iname)].name) + new_domains.append(new_domain) + + arg = p.subscript.aggregate.name + pack_name = vng(arg + "_pack") + + from loopy.kernel.data import (TemporaryVariable, + temp_var_scope) + + pack_tmp = TemporaryVariable( + name=pack_name, + dtype=kernel.arg_dict[arg].dtype, + scope=temp_var_scope.PRIVATE, + ) + + new_tmps[pack_name] = pack_tmp + + from loopy import Assignment + subst_mapper = SubstitutionMapper(make_subst_func( + new_swept_inames)) + + # {{{ getting the lhs assignee + + arg_in_caller = kernel.arg_dict[arg] + + from loopy.isl_helpers import simplify_via_aff, make_slab + + flatten_index = simplify_via_aff( + sum(dim_tag.stride*idx for dim_tag, idx in + zip(arg_in_caller.dim_tags, p.subscript.index_tuple))) + + new_indices = [] + for dim_tag in in_knl_callable.arg_id_to_descr[id].dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + lhs_assignee = subst_mapper(var(pack_name).index(new_indices)) + + # }}} + + packing.append(Assignment( + assignee=lhs_assignee, + expression=subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | set( + new_swept_inames[i].name for i in p.swept_inames) | ( + new_ilp_inames), + depends_on=insn.depends_on, + id=ing(insn.id+"_pack") + )) + + unpacking.append(Assignment( + expression=lhs_assignee, + assignee=subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | set( + new_swept_inames[i].name for i in p.swept_inames) | ( + new_ilp_inames), + depends_on=frozenset([insn.id]), + id=ing(insn.id+"_unpack") + )) + + # {{{ getting the new swept inames + + updated_swept_inames = [] + + for i, _ in enumerate( + in_knl_callable.arg_id_to_descr[id].shape): + updated_swept_inames.append(var(vng("i_packsweep_"+arg))) + + ctx = kernel.isl_context + space = 
isl.Space.create_from_names(ctx, + set=[iname.name for iname in updated_swept_inames]) + iname_set = isl.BasicSet.universe(space) + for iname, axis_length in zip(updated_swept_inames, + in_knl_callable.arg_id_to_descr[id].shape): + iname_set = iname_set & make_slab(space, iname.name, 0, + axis_length) + new_domains = new_domains + [iname_set] + + # }}} + + new_id_to_parameters[id] = SubArrayRef(tuple(updated_swept_inames), + (var(pack_name).index(tuple(updated_swept_inames)))) + else: + new_id_to_parameters[id] = p + + if packing: + subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) + new_insn = insn.with_transformed_expressions(subst_mapper) + new_params = [new_id_to_parameters[i] for i, _ in + enumerate(parameters)] + new_assignees = [new_id_to_parameters[-i-1] for i, _ in + enumerate(insn.assignees)] + new_params = [subst_mapper(p) for p in new_params] + new_assignees = tuple(subst_mapper(a) for a in new_assignees) + packing.append( + new_insn.copy( + depends_on=new_insn.depends_on | set( + pack.id for pack in packing), + within_inames=new_insn.within_inames - ilp_inames | ( + new_ilp_inames), + expression=new_insn.expression.function(*new_params), + assignees=new_assignees + ) + ) + new_calls[insn] = packing + unpacking + + if new_calls: + new_instructions = [] + for insn in kernel.instructions: + if insn in new_calls: + new_instructions.extend(new_calls[insn]) + else: + new_instructions.append(insn) + kernel = kernel.copy( + domains=kernel.domains + new_domains, + instructions=new_instructions, + temporary_variables=new_tmps + ) + + return kernel + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 8300fa374..1204c9c13 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -126,9 +126,11 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel, # making the target of the child kernel to be same as the target of parent # kernel. - callable_kernel = CallableKernel(subkernel=callee_kernel.copy( - target=caller_kernel.target, - is_master_kernel=False), inline=inline) + callable_kernel = CallableKernel(name=function_name, + subkernel=callee_kernel.copy( + target=caller_kernel.target, + is_master_kernel=False), + inline=inline) # disabling global barriers for callee kernel from loopy import set_options -- GitLab From 4af8ce256a040725ff7c41905f64916dd61cd2f2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 10 Jun 2018 16:02:28 -0500 Subject: [PATCH 182/916] Added pack, unpack. Remaining to comment the code. 
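The new transform copies every SubArrayRef argument of the named call into a contiguous, privately scoped temporary laid out the way the callee expects, points the call at the packed temporaries, and copies the results back out afterwards. A rough usage sketch follows; the kernel contents and the names callee_fn/x/y are placeholders for illustration, and a working pyopencl context is assumed (the registration step mirrors the tests added later in this series):

    import numpy as np
    import pyopencl as cl
    import pyopencl.clrandom  # noqa
    import loopy as lp

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # callee works on a contiguous block of 6 doubles
    callee = lp.make_kernel(
        "{[i]: 0<=i<6}",
        """
        a[i] = 2*b[i]
        """)

    # caller hands it a (3, 2)-shaped view through a SubArrayRef
    knl = lp.make_kernel(
        "{[i, j]: 0<=i<3 and 0<=j<2}",
        """
        [i, j]: y[i, j] = callee_fn([i, j]: x[i, j])
        """)

    knl = lp.register_callable_kernel(knl, 'callee_fn', callee)

    # insert the pack (copy-in) and unpack (copy-out) instructions
    knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn')

    x = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    evt, (y,) = knl(queue, x=x)

The packed temporary is private to the calling instruction, so the caller-side layout of x and y stays unconstrained.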
--- loopy/kernel/function_interface.py | 6 +-- loopy/preprocess.py | 2 +- loopy/transform/pack_and_unpack_args.py | 58 ++++++++++++++++--------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 91d9b2911..cb05a65b8 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -535,20 +535,18 @@ class CallableKernel(InKernelCallable): arg_id_to_descr=arg_id_to_descr) def with_packing_for_args(self): - from loopy.preprocess import preprocess_kernel - subkernel = preprocess_kernel(self.subkernel) kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) arg_id_to_descr = {} for pos, kw in pos_to_kw.items(): - arg = subkernel.arg_dict[kw] + arg = self.subkernel.arg_dict[kw] arg_id_to_descr[pos] = ArrayArgDescriptor( shape=arg.shape, dim_tags=arg.dim_tags, mem_scope='Global') - return self.copy(subkernel=subkernel, + return self.copy(subkernel=self.subkernel, arg_id_to_descr=arg_id_to_descr) def with_hw_axes_sizes(self, gsize, lsize): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 3cf1e1df9..1b1d9be38 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2815,7 +2815,7 @@ def preprocess_kernel(kernel, device=None): kernel = infer_arg_descr(kernel) # tuning the functions in the kernel to align with the grid sizes. - # kernel = infer_hw_axes_sizes(kernel) + kernel = infer_hw_axes_sizes(kernel) # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index f6a748eef..853719c71 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -113,15 +113,21 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): for id, p in id_to_parameters: if isinstance(p, SubArrayRef) and p.subscript.aggregate.name in args: - new_swept_inames = ilp_inames_map.copy() + new_pack_inames = ilp_inames_map.copy() + new_unpack_inames = ilp_inames_map.copy() for iname in p.swept_inames: - new_swept_inames[iname] = var(vng(iname.name + "_pack")) - new_domain = kernel.get_inames_domain(iname.name).copy() - for i in range(new_domain.n_dim()): - old_iname = new_domain.get_dim_name(dim_type, i) - new_domain = new_domain.set_dim_name( - dim_type, i, new_swept_inames[var(old_iname)].name) - new_domains.append(new_domain) + new_pack_inames[iname] = var(vng(iname.name + "_pack")) + new_unpack_inames[iname] = var(vng(iname.name + "_unpack")) + new_domain_pack = kernel.get_inames_domain(iname.name).copy() + new_domain_unpack = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain_pack.n_dim()): + old_iname = new_domain_pack.get_dim_name(dim_type, i) + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) + new_domains.append(new_domain_pack) + new_domains.append(new_domain_unpack) arg = p.subscript.aggregate.name pack_name = vng(arg + "_pack") @@ -132,14 +138,18 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): pack_tmp = TemporaryVariable( name=pack_name, dtype=kernel.arg_dict[arg].dtype, + dim_tags=in_knl_callable.arg_id_to_descr[id].dim_tags, + shape=in_knl_callable.arg_id_to_descr[id].shape, scope=temp_var_scope.PRIVATE, ) new_tmps[pack_name] = pack_tmp from loopy import Assignment - subst_mapper = 
SubstitutionMapper(make_subst_func( - new_swept_inames)) + pack_subst_mapper = SubstitutionMapper(make_subst_func( + new_pack_inames)) + unpack_subst_mapper = SubstitutionMapper(make_subst_func( + new_unpack_inames)) # {{{ getting the lhs assignee @@ -159,28 +169,32 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): new_indices = tuple(simplify_via_aff(i) for i in new_indices) - lhs_assignee = subst_mapper(var(pack_name).index(new_indices)) + pack_lhs_assignee = pack_subst_mapper( + var(pack_name).index(new_indices)) + unpack_rhs = unpack_subst_mapper( + var(pack_name).index(new_indices)) # }}} packing.append(Assignment( - assignee=lhs_assignee, - expression=subst_mapper.map_subscript(p.subscript), + assignee=pack_lhs_assignee, + expression=pack_subst_mapper.map_subscript(p.subscript), within_inames=insn.within_inames - ilp_inames | set( - new_swept_inames[i].name for i in p.swept_inames) | ( + new_pack_inames[i].name for i in p.swept_inames) | ( new_ilp_inames), depends_on=insn.depends_on, - id=ing(insn.id+"_pack") + id=ing(insn.id+"_pack"), + depends_on_is_final=True )) unpacking.append(Assignment( - expression=lhs_assignee, - assignee=subst_mapper.map_subscript(p.subscript), + expression=unpack_rhs, + assignee=unpack_subst_mapper.map_subscript(p.subscript), within_inames=insn.within_inames - ilp_inames | set( - new_swept_inames[i].name for i in p.swept_inames) | ( + new_unpack_inames[i].name for i in p.swept_inames) | ( new_ilp_inames), - depends_on=frozenset([insn.id]), - id=ing(insn.id+"_unpack") + id=ing(insn.id+"_unpack"), + depends_on_is_final=True )) # {{{ getting the new swept inames @@ -227,7 +241,9 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): assignees=new_assignees ) ) - new_calls[insn] = packing + unpacking + new_unpacking = [unpack.copy(depends_on=frozenset( + pack.id for pack in packing)) for unpack in unpacking] + new_calls[insn] = packing + new_unpacking if new_calls: new_instructions = [] -- GitLab From fb63f2d7d0e543145feb5db9a313548f5b21856a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 10 Jun 2018 17:53:37 -0500 Subject: [PATCH 183/916] Added test and a bit of cleanup. --- loopy/__init__.py | 3 ++ loopy/transform/pack_and_unpack_args.py | 61 ++++++++++++++++--------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a5850ec0a..2da4815d3 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -120,6 +120,7 @@ from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, register_function_lookup) +from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call # }}} @@ -231,6 +232,8 @@ __all__ = [ "register_callable_kernel", "register_function_lookup", + "pack_and_unpack_args_for_call", + # }}} "get_dot_dependency_graph", diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 853719c71..cf0003f8a 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -37,10 +37,20 @@ __doc__ = """ def pack_and_unpack_args_for_call(kernel, call_name, args=None): """ + Returns a a copy of *kernel* with instructions appended to copy the + arguments in *args* to match the alignment expected by the *call_name* in + the kernel. The arguments are copied back to *args* with the appropriate + data layout. 
+ + :arg call_name: An instance of :class:`str` denoting the function call in + the *kernel*. + :arg args: A list of the arguments as instances of :class:`str` which must + be packed and unpacked. If set *None*, it is interpreted that all the + array arguments would be packed anf unpacked. """ new_domains = [] new_tmps = kernel.temporary_variables.copy() - new_calls = {} + old_insn_to_new_insns = {} for insn in kernel.instructions: if not isinstance(insn, CallInstruction): @@ -66,6 +76,8 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): # {{{ sanity checks for args + assert isinstance(args, list) + for arg in args: found_sub_array_ref = False for par in parameters + insn.assignees: @@ -81,7 +93,6 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): packing = [] unpacking = [] - new_id_to_parameters = {} from loopy.kernel.data import IlpBaseTag, VectorizeTag import islpy as isl @@ -108,24 +119,31 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): from pymbolic.mapper.substitutor import make_subst_func from loopy.symbolic import SubstitutionMapper + # dict to store the new assignees and parameters, the mapping pattern + # from id to parameters is identical to InKernelCallable.arg_id_to_dtype id_to_parameters = tuple(enumerate(parameters)) + tuple( (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) + new_id_to_parameters = {} for id, p in id_to_parameters: if isinstance(p, SubArrayRef) and p.subscript.aggregate.name in args: - new_pack_inames = ilp_inames_map.copy() - new_unpack_inames = ilp_inames_map.copy() + new_pack_inames = ilp_inames_map.copy() # packing-specific inames + new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname + for iname in p.swept_inames: new_pack_inames[iname] = var(vng(iname.name + "_pack")) new_unpack_inames[iname] = var(vng(iname.name + "_unpack")) + + # Updating the domains corresponding to the new inames. 
new_domain_pack = kernel.get_inames_domain(iname.name).copy() new_domain_unpack = kernel.get_inames_domain(iname.name).copy() for i in range(new_domain_pack.n_dim()): old_iname = new_domain_pack.get_dim_name(dim_type, i) - new_domain_pack = new_domain_pack.set_dim_name( - dim_type, i, new_pack_inames[var(old_iname)].name) - new_domain_unpack = new_domain_unpack.set_dim_name( - dim_type, i, new_unpack_inames[var(old_iname)].name) + if var(old_iname) in new_pack_inames: + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) new_domains.append(new_domain_pack) new_domains.append(new_domain_unpack) @@ -151,7 +169,7 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): unpack_subst_mapper = SubstitutionMapper(make_subst_func( new_unpack_inames)) - # {{{ getting the lhs assignee + # {{{ getting the lhs for packing and rhs for unpacking arg_in_caller = kernel.arg_dict[arg] @@ -194,10 +212,11 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): new_unpack_inames[i].name for i in p.swept_inames) | ( new_ilp_inames), id=ing(insn.id+"_unpack"), + depends_on=frozenset([insn.id]), depends_on_is_final=True )) - # {{{ getting the new swept inames + # {{{ creating the sweep inames for the new sub array refs updated_swept_inames = [] @@ -225,12 +244,10 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): if packing: subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) new_insn = insn.with_transformed_expressions(subst_mapper) - new_params = [new_id_to_parameters[i] for i, _ in - enumerate(parameters)] - new_assignees = [new_id_to_parameters[-i-1] for i, _ in - enumerate(insn.assignees)] - new_params = [subst_mapper(p) for p in new_params] - new_assignees = tuple(subst_mapper(a) for a in new_assignees) + new_params = tuple(subst_mapper(new_id_to_parameters[i]) for i, _ in + enumerate(parameters)) + new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) + for i, _ in enumerate(insn.assignees)) packing.append( new_insn.copy( depends_on=new_insn.depends_on | set( @@ -241,15 +258,15 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): assignees=new_assignees ) ) - new_unpacking = [unpack.copy(depends_on=frozenset( - pack.id for pack in packing)) for unpack in unpacking] - new_calls[insn] = packing + new_unpacking + old_insn_to_new_insns[insn] = packing + unpacking - if new_calls: + if old_insn_to_new_insns: new_instructions = [] for insn in kernel.instructions: - if insn in new_calls: - new_instructions.extend(new_calls[insn]) + if insn in old_insn_to_new_insns: + # Replacing the current instruction with the group of + # instructions including the packing and unpacking instructions + new_instructions.extend(old_insn_to_new_insns[insn]) else: new_instructions.append(insn) kernel = kernel.copy( -- GitLab From 55690f031a0f718c42e26f7fd64109c0b0a3c2f6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 10 Jun 2018 17:56:24 -0500 Subject: [PATCH 184/916] Commiting the tests. 
--- test/test_transform.py | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/test_transform.py b/test/test_transform.py index b08d674a5..8d42b61ff 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -556,6 +556,52 @@ def test_inline_kernel_2d(ctx_factory): assert np.allclose(out, z) +@pytest.mark.parametrize("inline", [False, True]) +def test_packing_unpacking(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + + callee1 = lp.make_kernel( + "{[i]: 0<=i<6}", + """ + a[i] = 2*b[i] + """) + + callee2 = lp.make_kernel( + "{[i, j]: 0<=i<2 and 0 <= j < 3}", + """ + a[i, j] = 3*b[i, j] + """) + + knl = lp.make_kernel( + "{[i, j, k]: 0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}", + """ + [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j]) + [k]: y2[k] = callee_fn2([k]: x2[k]) + """) + + knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1, inline=inline) + knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2, inline=inline) + + knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn1') + knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn2') + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2) + + y1 = out_dict['y1'].get() + y2 = out_dict['y2'].get() + + assert np.linalg.norm(2*x1.get()-y1)/np.linalg.norm( + 2*x1.get()) < 1e-15 + assert np.linalg.norm(3*x2.get()-y2)/np.linalg.norm( + 3*x2.get()) < 1e-15 + + def test_rename_argument(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 488e47a3896fb4266f9ea395a57f76f2104d54ae Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jun 2018 10:32:28 -0500 Subject: [PATCH 185/916] Fixes minor error in getting the iname domains. --- loopy/transform/pack_and_unpack_args.py | 47 ++++++++++++++----------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index cf0003f8a..9ed2766e2 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -56,6 +56,8 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): if not isinstance(insn, CallInstruction): # pack and unpack call only be done for CallInstructions. 
continue + if insn.expression.function.name not in kernel.scoped_functions: + continue in_knl_callable = kernel.scoped_functions[ insn.expression.function.name] @@ -70,9 +72,9 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): parameters = insn.expression.parameters if args is None: - args = [par.subscript.aggregate.name for par in parameters if - isinstance(par, SubArrayRef)] + [assignee.subscript.aggregate.name for - assignee in insn.assignees if isinstance(assignee, SubArrayRef)] + args = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] # {{{ sanity checks for args @@ -130,22 +132,24 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): new_pack_inames = ilp_inames_map.copy() # packing-specific inames new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname - for iname in p.swept_inames: - new_pack_inames[iname] = var(vng(iname.name + "_pack")) - new_unpack_inames[iname] = var(vng(iname.name + "_unpack")) + new_pack_inames = dict((iname, var(vng(iname.name + + "_pack"))) for iname in p.swept_inames) + new_unpack_inames = dict((iname, var(vng(iname.name + + "_unpack"))) for iname in p.swept_inames) # Updating the domains corresponding to the new inames. - new_domain_pack = kernel.get_inames_domain(iname.name).copy() - new_domain_unpack = kernel.get_inames_domain(iname.name).copy() - for i in range(new_domain_pack.n_dim()): - old_iname = new_domain_pack.get_dim_name(dim_type, i) - if var(old_iname) in new_pack_inames: - new_domain_pack = new_domain_pack.set_dim_name( - dim_type, i, new_pack_inames[var(old_iname)].name) - new_domain_unpack = new_domain_unpack.set_dim_name( - dim_type, i, new_unpack_inames[var(old_iname)].name) - new_domains.append(new_domain_pack) - new_domains.append(new_domain_unpack) + for iname in p.swept_inames: + new_domain_pack = kernel.get_inames_domain(iname.name).copy() + new_domain_unpack = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain_pack.n_dim()): + old_iname = new_domain_pack.get_dim_name(dim_type, i) + if var(old_iname) in new_pack_inames: + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) + new_domains.append(new_domain_pack) + new_domains.append(new_domain_unpack) arg = p.subscript.aggregate.name pack_name = vng(arg + "_pack") @@ -153,9 +157,14 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): from loopy.kernel.data import (TemporaryVariable, temp_var_scope) + if arg in kernel.arg_dict: + arg_in_caller = kernel.arg_dict[arg] + else: + arg_in_caller = kernel.temporary_variables[arg] + pack_tmp = TemporaryVariable( name=pack_name, - dtype=kernel.arg_dict[arg].dtype, + dtype=arg_in_caller.dtype, dim_tags=in_knl_callable.arg_id_to_descr[id].dim_tags, shape=in_knl_callable.arg_id_to_descr[id].shape, scope=temp_var_scope.PRIVATE, @@ -171,8 +180,6 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): # {{{ getting the lhs for packing and rhs for unpacking - arg_in_caller = kernel.arg_dict[arg] - from loopy.isl_helpers import simplify_via_aff, make_slab flatten_index = simplify_via_aff( -- GitLab From e0a167ae65df6e3002f0c74e8d8765acb57c17d0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jun 2018 12:27:50 -0500 Subject: [PATCH 186/916] Now transfers scoped functions from caller to callee. 
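What the new collector handles: at inline time, any call that was scoped inside the callee (a library function such as abs, or yet another callee kernel) has to be registered among the caller's scoped functions as well, otherwise it no longer resolves once the callee's body has been spliced into the caller. A minimal sketch of the situation, with made-up kernel and argument names:

    import loopy as lp

    # a callee that itself calls a scoped function ('abs')
    callee = lp.make_kernel(
        "{[i]: 0<=i<6}",
        """
        a[i] = 2*abs(b[i])
        """)

    caller = lp.make_kernel(
        "{[i]: 0<=i<6}",
        """
        [i]: y[i] = callee_fn([i]: x[i])
        """)

    # register and ask for inlining (positional flag as it exists at this
    # point in the series); when the callee is inlined during preprocessing,
    # its 'abs' call is re-scoped in the caller by the new collector.
    caller = lp.register_callable_kernel(caller, 'callee_fn', callee, True)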
--- loopy/kernel/function_interface.py | 8 ++++ loopy/preprocess.py | 71 ++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index cb05a65b8..ea20ae9da 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -38,6 +38,14 @@ from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, SubstitutionRuleExpander) +from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, + RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper, + CombineMapper) + +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) +from functools import reduce + # {{{ argument descriptors diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1b1d9be38..a1964fc7d 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2546,6 +2546,54 @@ class KernelInliner(SubstitutionMapper): return super(KernelInliner, self).map_subscript(expr) +class CalleeScopedCallsCollector(CombineMapper): + """ + Collects the scoped functions which are a part of the callee kernel and + must be transferred to the caller kernel before inlining. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the caller kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline. + """ + + def __init__(self, callee_scoped_functions): + self.callee_scoped_functions = callee_scoped_functions + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + def inline_callable_kernels(kernel): from loopy import CallInstruction @@ -2718,6 +2766,29 @@ def inline_callable_kernels(kernel): # }}} + # {{{ transferring the scoped functions from callee to caller + + callee_scoped_calls_collector = CalleeScopedCallsCollector( + callee.scoped_functions) + callee_scoped_calls_dict = {} + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( + insn.expression))) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown type of instruction %s." 
% type( + insn)) + + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + kernel = register_pymbolic_calls_to_knl_callables(kernel, + callee_scoped_calls_dict) + + # }}} + return kernel # }}} -- GitLab From b534f0b1952f505e826a3106d2568391e07ae9a3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jun 2018 12:32:55 -0500 Subject: [PATCH 187/916] adding unpacking instructions as dependencies. --- loopy/transform/pack_and_unpack_args.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 9ed2766e2..2c06a6fa9 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -275,7 +275,19 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): # instructions including the packing and unpacking instructions new_instructions.extend(old_insn_to_new_insns[insn]) else: - new_instructions.append(insn) + # for the instructions that depend on the call instruction that + # are to be packed and unpacked, we need to add the complete + # instruction block as a dependency for them. + new_depends_on = insn.depends_on + if insn.depends_on & set( + old_insn.id for old_insn in old_insn_to_new_insns): + # need to add the unpack instructions on dependencies. + for old_insn_id in insn.depends_on & set( + old_insn.id for old_insn in old_insn_to_new_insns): + old_insn = kernel.id_to_insn[old_insn_id] + new_depends_on |= frozenset(i.id for i + in old_insn_to_new_insns[old_insn]) + new_instructions.append(insn.copy(depends_on=new_depends_on)) kernel = kernel.copy( domains=kernel.domains + new_domains, instructions=new_instructions, -- GitLab From e9627aac35380f8d8b685bc45223a19a9e04ebe2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 14 Jun 2018 00:09:24 -0500 Subject: [PATCH 188/916] Adds interesting strided caller callee. --- loopy/kernel/function_interface.py | 81 +++++++++++++++++++++++++++- loopy/preprocess.py | 2 +- loopy/symbolic.py | 14 ++--- loopy/target/c/codegen/expression.py | 3 +- test/test_transform.py | 52 ++++++++++++++++++ 5 files changed, 142 insertions(+), 10 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index b78a6dbef..958d9d52d 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -34,9 +34,14 @@ from loopy.diagnostic import LoopyError from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name - from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, - RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper) + RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper, + CombineMapper) + +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) + +from functools import reduce # {{{ argument descriptors @@ -506,6 +511,55 @@ class KernelInliner(SubstitutionMapper): else: return super(KernelInliner, self).map_subscript(expr) + +class CalleeScopedCallsCollector(CombineMapper): + """ + Collects the scoped functions which are a part of the callee kernel and + must be transferred to the caller kernel before inlining. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the caller kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline. 
+ """ + + def __init__(self, callee_scoped_functions): + self.callee_scoped_functions = callee_scoped_functions + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + # }}} @@ -810,6 +864,29 @@ class CallableKernel(InKernelCallable): # }}} + # {{{ transferring the scoped functions from callee to caller + + callee_scoped_calls_collector = CalleeScopedCallsCollector( + callee_knl.scoped_functions) + callee_scoped_calls_dict = {} + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( + insn.expression))) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown type of instruction %s." % type( + insn)) + + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + kernel = register_pymbolic_calls_to_knl_callables(kernel, + callee_scoped_calls_dict) + + # }}} + return kernel def emit_call_insn(self, insn, target, expression_to_code_mapper): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d4d793971..9b69fd5d8 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2126,7 +2126,7 @@ def get_arg_description_from_sub_array_ref(sub_array, kernel): mem_scope = arg.memory_address_space sub_dim_tags, sub_shape = sub_array.get_sub_array_dim_tags_and_shape( - arg.dim_tags, arg.shape) + kernel, arg.dim_tags, arg.shape) return ArrayArgDescriptor(mem_scope=mem_scope, dim_tags=sub_dim_tags, diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 66fa8620f..6628f4e46 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -811,7 +811,7 @@ class SubArrayRef(p.Expression): return EvaluatorWithDeficientContext(swept_inames_to_zeros)( self.subscript) - def get_sub_array_dim_tags_and_shape(self, arg_dim_tags, arg_shape): + def get_sub_array_dim_tags_and_shape(self, kernel, arg_dim_tags, arg_shape): """Returns the dim tags for the inner inames. .. 
arg:: arg_dim_tags @@ -827,16 +827,18 @@ class SubArrayRef(p.Expression): from loopy.kernel.array import FixedStrideArrayDimTag as DimTag sub_dim_tags = [] sub_shape = [] - linearized_index = sum(dim_tag.stride*iname for dim_tag, iname in - zip(arg_dim_tags, self.subscript.index_tuple)) + linearized_index = simplify_using_aff(kernel, + sum(dim_tag.stride*iname for dim_tag, iname in + zip(arg_dim_tags, self.subscript.index_tuple))) strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) sub_dim_tags = tuple(DimTag(strides_as_dict[iname]) for iname in self.swept_inames) - sub_shape = tuple(dim_shape for dim_shape, index in zip( - arg_shape, self.subscript.index_tuple) if VariableInAnExpression( - self.swept_inames)(index)) + sub_shape = tuple( + pw_aff_to_expr( + kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 + for iname in self.swept_inames) if len(sub_shape) != len(self.swept_inames): # Not allowed something like: [i]: a[i, i] diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 9f55ce851..108360b4b 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -246,7 +246,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): else: subscript, = access_info.subscripts - result = make_var(access_info.array_name)[self.rec(subscript, 'i')] + result = make_var(access_info.array_name)[simplify_using_aff( + self.kernel, self.rec(subscript, 'i'))] if access_info.vector_index is not None: return self.codegen_state.ast_builder.add_vector_access( diff --git a/test/test_transform.py b/test/test_transform.py index 26b558165..d381413a4 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -385,6 +385,58 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): 2*x_host+3*y_host) < 1e-15 +@pytest.mark.parametrize("inline", [False, True]) +def test_shape_translation_through_sub_array_ref(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) + + callee1 = lp.make_kernel( + "{[i]: 0<=i<6}", + """ + a[i] = 2*abs(b[i]) + """) + + callee2 = lp.make_kernel( + "{[i, j]: 0<=i<3 and 0 <= j < 2}", + """ + a[i, j] = 3*b[i, j] + """) + + callee3 = lp.make_kernel( + "{[i]: 0<=i<6}", + """ + a[i] = 5*b[i] + """) + + knl = lp.make_kernel( + "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", + """ + [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) + [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) + [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) + """) + + knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1, True) + knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2, True) + knl = lp.register_callable_kernel(knl, 'callee_fn3', callee3, True) + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) + + y1 = out_dict['y1'].get() + y2 = out_dict['y2'].get() + y3 = out_dict['y3'].get() + + assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 + assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 + assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 + + def test_multi_arg_array_call(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 8df0b6f6e594f8f50a01135fd1a8e080a043cd6b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 14 Jun 2018 
00:29:42 -0500 Subject: [PATCH 189/916] Changes because of adding simplify_via_aff while flattening out subscripts. --- doc/tutorial.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 345c26b68..429970a51 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -650,9 +650,9 @@ loop's tag to ``"unr"``: for (int i_outer = 0; i_outer <= int_floor_div_pos_b(-4 + n, 4); ++i_outer) { a[4 * i_outer] = 0.0f; - a[4 * i_outer + 1] = 0.0f; - a[4 * i_outer + 2] = 0.0f; - a[4 * i_outer + 3] = 0.0f; + a[1 + 4 * i_outer] = 0.0f; + a[2 + 4 * i_outer] = 0.0f; + a[3 + 4 * i_outer] = 0.0f; } ... -- GitLab From 91616e5829a8d08be7ed44e29fc4ae989b7ebdb9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 14 Jun 2018 01:03:52 -0500 Subject: [PATCH 190/916] Small errors in docs. --- doc/tutorial.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 429970a51..2e4de1f24 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -771,11 +771,11 @@ assumption: { a[4 * i_outer] = 0.0f; if (-2 + -4 * i_outer + n >= 0) - a[4 * i_outer + 1] = 0.0f; + a[1 + 4 * i_outer] = 0.0f; if (-3 + -4 * i_outer + n >= 0) - a[4 * i_outer + 2] = 0.0f; + a[2 + 4 * i_outer] = 0.0f; if (-4 + -4 * i_outer + n >= 0) - a[4 * i_outer + 3] = 0.0f; + a[3 + 4 * i_outer] = 0.0f; } ... -- GitLab From 5e379ea7bab14068909bb33810cb98ef052f6e7a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 14 Jun 2018 01:38:44 -0500 Subject: [PATCH 191/916] fixes changed in docs. --- doc/tutorial.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 2e4de1f24..dde7586aa 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -800,9 +800,9 @@ enabling some cost savings: for (int i_outer = 0; i_outer <= -2 + ((3 + n) / 4); ++i_outer) { a[4 * i_outer] = 0.0f; - a[4 * i_outer + 1] = 0.0f; - a[4 * i_outer + 2] = 0.0f; - a[4 * i_outer + 3] = 0.0f; + a[1 + 4 * i_outer] = 0.0f; + a[2 + 4 * i_outer] = 0.0f; + a[3 + 4 * i_outer] = 0.0f; } /* final slab for 'i_outer' */ { @@ -812,11 +812,11 @@ enabling some cost savings: { a[4 * i_outer] = 0.0f; if (-2 + -4 * i_outer + n >= 0) - a[4 * i_outer + 1] = 0.0f; + a[1 + 4 * i_outer] = 0.0f; if (-3 + -4 * i_outer + n >= 0) - a[4 * i_outer + 2] = 0.0f; + a[2 + 4 * i_outer] = 0.0f; if (4 + 4 * i_outer + -1 * n == 0) - a[4 * i_outer + 3] = 0.0f; + a[3 + 4 * i_outer] = 0.0f; } } ... -- GitLab From 98758f04eccc6bc1175af9f8acb2b1c0c8c964b5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 14 Jun 2018 12:37:34 -0500 Subject: [PATCH 192/916] minor changes so that strides with axis length 1 are not ignored. --- loopy/symbolic.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 6628f4e46..79052730e 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -825,26 +825,22 @@ class SubArrayRef(p.Expression): *SubArrayRef*. 
""" from loopy.kernel.array import FixedStrideArrayDimTag as DimTag + from loopy.isl_helpers import simplify_via_aff sub_dim_tags = [] sub_shape = [] - linearized_index = simplify_using_aff(kernel, + linearized_index = simplify_via_aff( sum(dim_tag.stride*iname for dim_tag, iname in zip(arg_dim_tags, self.subscript.index_tuple))) strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) - sub_dim_tags = tuple(DimTag(strides_as_dict[iname]) for iname in - self.swept_inames) + sub_dim_tags = tuple( + DimTag(strides_as_dict[iname]) for iname in self.swept_inames) sub_shape = tuple( pw_aff_to_expr( kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 for iname in self.swept_inames) - if len(sub_shape) != len(self.swept_inames): - # Not allowed something like: [i]: a[i, i] - raise LoopyError("Number of axes swept must be equal to the number " - "of inames declared for sweeping.") - return sub_dim_tags, sub_shape def __getinitargs__(self): -- GitLab From 95caba48320e15479b72034b8597524d29a20e00 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 16 Jun 2018 20:50:21 -0500 Subject: [PATCH 193/916] Added the name to the subkernel. --- loopy/transform/register_callable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 57b86a92f..f79b7efe8 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -128,6 +128,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel, # kernel. callable_kernel = CallableKernel(subkernel=callee_kernel.copy( target=caller_kernel.target, + name=function_name, is_master_kernel=False), should_inline=should_inline) # disabling global barriers for callee kernel -- GitLab From 672a859a3fd6c7a4924945d43a874a0063b6093e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 16 Jun 2018 21:17:10 -0500 Subject: [PATCH 194/916] Changed to on-the-fly inlining. 
--- loopy/__init__.py | 3 ++- loopy/kernel/function_interface.py | 12 ++++------- loopy/preprocess.py | 26 ---------------------- loopy/transform/register_callable.py | 32 ++++++++++++++++++++++++---- 4 files changed, 34 insertions(+), 39 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 49ba932fa..4fe83e3f4 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -119,7 +119,7 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, - register_function_lookup) + register_function_lookup, inline_callable) # }}} @@ -230,6 +230,7 @@ __all__ = [ "add_barrier", "register_callable_kernel", "register_function_lookup", + "inline_callable", # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 958d9d52d..00bbdedd2 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -586,21 +586,18 @@ class CallableKernel(InKernelCallable): """ fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target", "should_inline"]) + "name_in_target"]) init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target", "should_inline") + "name_in_target") def __init__(self, subkernel, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None, should_inline=False): + arg_id_to_descr=None, name_in_target=None): super(CallableKernel, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) - if name_in_target is not None: - subkernel = subkernel.copy(name=name_in_target) self.name_in_target = name_in_target - self.should_inline = should_inline self.subkernel = subkernel.copy( args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) if arg.dtype is not None else arg for arg in subkernel.args]) @@ -707,8 +704,7 @@ class CallableKernel(InKernelCallable): Returns a copy of *kernel* with the *instruction* in the *kernel* replaced by inlining :attr:`subkernel` within it. """ - from loopy.preprocess import preprocess_kernel - callee_knl = preprocess_kernel(self.subkernel) + callee_knl = self.subkernel import islpy as isl diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 9b69fd5d8..4d6471da9 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2477,28 +2477,6 @@ def make_functions_ready_for_codegen(kernel): # }}} -# {{{ inline callable kernel - -def inline_callable_kernels(kernel): - """ - Returns a copy of *kernel* with the callable kernels inlined. - """ - old_insns = kernel.instructions - for insn in old_insns: - if isinstance(insn, CallInstruction): - if insn.expression.function.name in kernel.scoped_functions: - in_knl_callable = kernel.scoped_functions[ - insn.expression.function.name] - from loopy.kernel.function_interface import CallableKernel - if isinstance(in_knl_callable, CallableKernel) and ( - in_knl_callable.should_inline): - kernel = in_knl_callable.inline_within_kernel(kernel, insn) - - return kernel - -# }}} - - preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2589,10 +2567,6 @@ def preprocess_kernel(kernel, device=None): # tuning the functions in the kernel to align with the grid sizes. kernel = infer_hw_axes_sizes(kernel) - # Inlining callable kernels that are marked with inline=True. 
- # This should happen after type inference but before other transformations. - kernel = inline_callable_kernels(kernel) - # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index f79b7efe8..c62ec8208 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -78,15 +78,13 @@ class RegisterCalleeKernel(ImmutableRecord): return None -def register_callable_kernel(caller_kernel, function_name, callee_kernel, - should_inline=False): +def register_callable_kernel(caller_kernel, function_name, callee_kernel): """Returns a copy of *caller_kernel*, which would resolve *function_name* in an expression as a call to *callee_kernel*. :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. :arg function_name: An instance of :class:`str`. :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - :arg should_inline: Boolean flag of inlining callee kernel into caller. """ # {{{ sanity checks @@ -129,7 +127,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel, callable_kernel = CallableKernel(subkernel=callee_kernel.copy( target=caller_kernel.target, name=function_name, - is_master_kernel=False), should_inline=should_inline) + is_master_kernel=False)) # disabling global barriers for callee kernel from loopy import set_options @@ -140,4 +138,30 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel, # }}} + +# {{{ inline callable kernel + +def inline_callable(kernel, function_name): + """ + Returns a copy of *kernel* with the callable addresed by *function_name* inlined. + """ + from loopy.preprocess import infer_arg_descr + kernel = infer_arg_descr(kernel) + + old_insns = kernel.instructions + for insn in old_insns: + if isinstance(insn, CallInstruction): + if insn.expression.function.name in kernel.scoped_functions: + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.name] + print(in_knl_callable.subkernel.name) + from loopy.kernel.function_interface import CallableKernel + if isinstance(in_knl_callable, CallableKernel) and ( + in_knl_callable.subkernel.name == function_name): + kernel = in_knl_callable.inline_within_kernel(kernel, insn) + + return kernel + +# }}} + # vim: foldmethod=marker -- GitLab From 838e7633b0e8724319a06366551fc32c1d35d6a7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 16 Jun 2018 22:01:33 -0500 Subject: [PATCH 195/916] changed tests according to the new inline behvior --- loopy/codegen/__init__.py | 4 +++- loopy/transform/register_callable.py | 1 - test/test_transform.py | 32 +++++++++++++++++++++------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d0eb57cb5..e5938dbc4 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -523,7 +523,9 @@ def generate_code_v2(kernel): from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel): auxiliary_dev_prog = generate_code_v2( - in_knl_callable.subkernel.copy(target=kernel.target) + in_knl_callable.subkernel.copy( + name=in_knl_callable.name_in_target, + target=kernel.target) ).device_programs[0].ast auxiliary_dev_progs.append(auxiliary_dev_prog) elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 
c62ec8208..0b6201b64 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -154,7 +154,6 @@ def inline_callable(kernel, function_name): if insn.expression.function.name in kernel.scoped_functions: in_knl_callable = kernel.scoped_functions[ insn.expression.function.name] - print(in_knl_callable.subkernel.name) from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel) and ( in_knl_callable.subkernel.name == function_name): diff --git a/test/test_transform.py b/test/test_transform.py index d381413a4..d24e0b6a0 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -243,9 +243,12 @@ def test_register_knl(ctx_factory, inline): ) child_knl = lp.register_callable_kernel( - child_knl, 'linear_combo1', grandchild_knl, inline) + child_knl, 'linear_combo1', grandchild_knl) knl = lp.register_callable_kernel( - parent_knl, 'linear_combo2', child_knl, inline) + parent_knl, 'linear_combo2', child_knl) + if inline: + knl = lp.inline_callable(knl, 'linear_combo2') + knl = lp.inline_callable(knl, 'linear_combo1') evt, (out, ) = knl(queue, x=x, y=y) @@ -290,7 +293,9 @@ def test_slices_with_negative_step(ctx_factory, inline): ) knl = lp.register_callable_kernel( - parent_knl, 'linear_combo', child_knl, inline) + parent_knl, 'linear_combo', child_knl) + if inline: + knl = lp.inline_callable(knl, 'linear_combo') evt, (out, ) = knl(queue, x=x, y=y) @@ -328,8 +333,11 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): g=[j, l]: d[i, j, k, l, m], e=[j, l]: c[i, j, k, l, m]) """) + knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl, inline) + caller_knl, 'linear_combo', callee_knl) + if inline: + knl = lp.inline_callable(knl, 'linear_combo') evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) @@ -374,7 +382,10 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl, inline) + caller_knl, 'linear_combo', callee_knl) + + if inline: + knl = lp.inline_callable(knl, 'linear_combo') evt, (out, ) = knl(queue, x=x_dev, y=y_dev) @@ -420,9 +431,14 @@ def test_shape_translation_through_sub_array_ref(ctx_factory, inline): [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) """) - knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1, True) - knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2, True) - knl = lp.register_callable_kernel(knl, 'callee_fn3', callee3, True) + knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1) + knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2) + knl = lp.register_callable_kernel(knl, 'callee_fn3', callee3) + + if inline: + knl = lp.inline_callable(knl, 'callee_fn1') + knl = lp.inline_callable(knl, 'callee_fn2') + knl = lp.inline_callable(knl, 'callee_fn3') knl = lp.set_options(knl, "write_cl") knl = lp.set_options(knl, "return_dict") -- GitLab From a26df2030b4a805f4ad26b41a7d5e26df07c6433 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 16 Jun 2018 22:04:44 -0500 Subject: [PATCH 196/916] improved instruction not implementedness. 
--- loopy/transform/register_callable.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 0b6201b64..17a92466d 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -158,6 +158,11 @@ def inline_callable(kernel, function_name): if isinstance(in_knl_callable, CallableKernel) and ( in_knl_callable.subkernel.name == function_name): kernel = in_knl_callable.inline_within_kernel(kernel, insn) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown instruction %s." % type(insn)) return kernel -- GitLab From b09a689d31e3b155b39d124f46e3f5d3f5054c04 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 17 Jun 2018 00:04:30 -0500 Subject: [PATCH 197/916] Changed the sub array arg descriptor invoke patters, --- loopy/preprocess.py | 38 ++---------- loopy/symbolic.py | 33 ++++++---- loopy/transform/register_callable.py | 93 ++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 46 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 4d6471da9..6f11224a6 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2107,32 +2107,6 @@ def check_atomic_loads(kernel): # {{{ arg_descr_inference -def get_arg_description_from_sub_array_ref(sub_array, kernel): - """ Gets the dim_tags, memory scope, shape informations of a - :class:`SubArrayRef` argument in the caller kernel packed into - :class:`ArrayArgDescriptor`. - """ - from loopy.kernel.function_interface import ArrayArgDescriptor - - name = sub_array.subscript.aggregate.name - - if name in kernel.temporary_variables: - arg = kernel.temporary_variables[name] - mem_scope = arg.scope - assert name not in kernel.arg_dict - else: - assert name in kernel.arg_dict - arg = kernel.arg_dict[name] - mem_scope = arg.memory_address_space - - sub_dim_tags, sub_shape = sub_array.get_sub_array_dim_tags_and_shape( - kernel, arg.dim_tags, arg.shape) - - return ArrayArgDescriptor(mem_scope=mem_scope, - dim_tags=sub_dim_tags, - shape=sub_shape) - - class ArgDescrInferenceMapper(CombineMapper): """ Returns a set of instances of :class:`tuple` (expr, @@ -2157,8 +2131,7 @@ class ArgDescrInferenceMapper(CombineMapper): return self.combine((self.rec(child) for child in expr.parameters)) # descriptors for the args - arg_id_to_descr = dict((i, - get_arg_description_from_sub_array_ref(par, self.kernel)) + arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel)) if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) for i, par in enumerate(expr.parameters)) @@ -2172,8 +2145,7 @@ class ArgDescrInferenceMapper(CombineMapper): for i, par in enumerate(assignees): if isinstance(par, SubArrayRef): assignee_id_to_descr[-i-1] = ( - get_arg_description_from_sub_array_ref(par, - self.kernel)) + par.get_array_arg_descriptor(self.kernel)) else: assignee_id_to_descr[-i-1] = ValueArgDescriptor() @@ -2197,8 +2169,7 @@ class ArgDescrInferenceMapper(CombineMapper): from loopy.symbolic import SubArrayRef # descriptors for the args and kwargs: - arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par, - self.kernel)) + arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel)) if isinstance(par, SubArrayRef) else ValueArgDescriptor() for i, par in tuple(enumerate(expr.parameters)) + tuple(expr.kw_parameters.items())) @@ -2212,8 +2183,7 @@ class ArgDescrInferenceMapper(CombineMapper): for i, par in 
enumerate(assignees): if isinstance(par, SubArrayRef): assignee_id_to_descr[-i-1] = ( - get_arg_description_from_sub_array_ref(par, - self.kernel)) + par.get_array_arg_descriptor(self.kernel)) else: assignee_id_to_descr[-i-1] = ValueArgDescriptor() diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 79052730e..ccaa8cdaa 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -811,26 +811,33 @@ class SubArrayRef(p.Expression): return EvaluatorWithDeficientContext(swept_inames_to_zeros)( self.subscript) - def get_sub_array_dim_tags_and_shape(self, kernel, arg_dim_tags, arg_shape): - """Returns the dim tags for the inner inames. - - .. arg:: arg_dim_tags + def get_array_arg_descriptor(self, kernel): + """ + Returns the dim_tags, memory scope, shape informations of a + :class:`SubArrayRef` argument in the caller kernel packed into + :class:`ArrayArgDescriptor` for the instance of :class:`SubArrayRef` in + the given *kernel*. + """ + from loopy.kernel.function_interface import ArrayArgDescriptor - a list of :class:`loopy.kernel.array.FixedStrideArrayDimTag` of the - argument referred by the *SubArrayRef*. + name = self.subscript.aggregate.name - .. arg:: arg_shape + if name in kernel.temporary_variables: + arg = kernel.temporary_variables[name] + mem_scope = arg.scope + assert name not in kernel.arg_dict + else: + assert name in kernel.arg_dict + arg = kernel.arg_dict[name] + mem_scope = arg.memory_address_space - a tuple indicating the shape of the argument referred by the - *SubArrayRef*. - """ from loopy.kernel.array import FixedStrideArrayDimTag as DimTag from loopy.isl_helpers import simplify_via_aff sub_dim_tags = [] sub_shape = [] linearized_index = simplify_via_aff( sum(dim_tag.stride*iname for dim_tag, iname in - zip(arg_dim_tags, self.subscript.index_tuple))) + zip(arg.dim_tags, self.subscript.index_tuple))) strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) @@ -841,7 +848,9 @@ class SubArrayRef(p.Expression): kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 for iname in self.swept_inames) - return sub_dim_tags, sub_shape + return ArrayArgDescriptor(mem_scope=mem_scope, + dim_tags=sub_dim_tags, + shape=sub_shape) def __getinitargs__(self): return (self.swept_inames, self.subscript) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 17a92466d..07980b854 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -28,6 +28,12 @@ from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) +from loopy.symbolic import IdentityMapper +from loopy.isl_helpers import simplify_via_aff +from pymbolic.primitives import CallWithKwargs +from loopy.kernel.function_interface import (get_kw_pos_association, + register_pymbolic_calls_to_knl_callables) + __doc__ = """ .. 
currentmodule:: loopy @@ -168,4 +174,91 @@ def inline_callable(kernel, function_name): # }}} + +# {{{ matching caller to callee args if dimenstions dont match + +class DimChanger(IdentityMapper): + def __init__(self, callee_arg_dict, desired_dim_tag_dict): + self.callee_arg_dict = callee_arg_dict + self.desired_dim_tag_dict = desired_dim_tag_dict + + def map_subscript(self, expr): + callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags + flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in + zip(callee_arg_dim_tags, expr.index_tuple)) + new_indices = [] + + for dim_tag in self.desired_dim_tag_dict[expr.aggregate.name]: + ind = flattened_index // dim_tag.stride + flattened_index -= (dim_tag.stride * ind) + new_indices.append(simplify_via_aff(ind)) + + return expr.aggregate.index(tuple(new_indices)) + + +def _match_caller_callee_argument_dimension(caller_knl, callee_fn): + """ + #TODO: Fix docs. + One must call this after registering the callee kernel into the caller + kernel. + """ + pymbolic_calls_to_new_callables = {} + for insn in caller_knl.instructions: + if not isinstance(insn, CallInstruction) or ( + insn.expression.function.name not in + caller_knl.scoped_functions): + continue + + in_knl_callable = caller_knl.scoped_functions[ + insn.expression.function.name] + + if in_knl_callable.subkernel.name != callee_fn: + continue + + # getting the caller callee arg association + + parameters = insn.expression.parameters[:] + kw_parameters = {} + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + + assignees = insn.assignees + + parameter_dim_tags = [par.get_array_arg_descriptor(caller_knl).dim_tags + for par in parameters] + kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) + for i in range(len(parameters), len(parameters)+len(kw_parameters)): + parameter_dim_tags.append(kw_parameters[pos_to_kw[i]] + .get_array_arg_descriptor(caller_knl).dim_tags) + + # inserting the assigness at the required positions. + assignee_write_count = -1 + for i, arg in enumerate(in_knl_callable.subkernel.args): + if arg.direction == 'out': + assignee = assignees[-assignee_write_count-1] + parameter_dim_tags.insert(i, assignee + .get_array_arg_descriptor(caller_knl).dim_tags) + assignee_write_count -= 1 + + callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in + in_knl_callable.subkernel.args], parameter_dim_tags)) + dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, + callee_arg_to_desired_dim_tag) + new_callee_insns = [] + for callee_insn in in_knl_callable.subkernel.instructions: + if isinstance(callee_insn, MultiAssignmentBase): + new_callee_insns.append(callee_insn.copy(expression=dim_changer( + callee_insn.expression), + assignee=dim_changer(callee_insn.assignee))) + + new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) + + new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) + + pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable + + return register_pymbolic_calls_to_knl_callables(caller_knl, + pymbolic_calls_to_new_callables) + +# }}} # vim: foldmethod=marker -- GitLab From 942c808c1fd877b89c33b04b039f79b4782af834 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 17 Jun 2018 20:04:30 -0500 Subject: [PATCH 198/916] inline_callable->inline_callable_kernel and few changes to the algorithm of changing the dimensions of the callee kernel. 
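Besides renaming inline_callable to inline_callable_kernel, this reworks how DimChanger rewrites subscripts in the callee: an index tuple is flattened against the callee argument's layout and then redistributed over the shape the caller actually passes. The snippet below shows that arithmetic on plain integers only, assuming row-major order; it is an illustration of the idea rather than loopy API, and the mapper itself does the same thing symbolically on pymbolic expressions before cleaning the result up with simplify_via_aff.

    def redistribute_index(index, old_shape, new_shape):
        # flatten against the old shape (row-major)
        flat = 0
        for idx, dim in zip(index, old_shape):
            flat = flat*dim + idx
        # peel indices back off against the new shape, innermost axis first
        new_index = []
        for dim in reversed(new_shape):
            new_index.append(flat % dim)
            flat //= dim
        return tuple(reversed(new_index))

    # (2, 1) in a (3, 2) array is element 5 of the flattened view, and back
    assert redistribute_index((2, 1), (3, 2), (6,)) == (5,)
    assert redistribute_index((5,), (6,), (3, 2)) == (2, 1)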
--- loopy/__init__.py | 4 +- loopy/transform/register_callable.py | 81 ++++++++++++++++++++-------- test/test_transform.py | 16 +++--- 3 files changed, 69 insertions(+), 32 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 4fe83e3f4..d5aebbf22 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -119,7 +119,7 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.register_callable import (register_callable_kernel, - register_function_lookup, inline_callable) + register_function_lookup, inline_callable_kernel) # }}} @@ -230,7 +230,7 @@ __all__ = [ "add_barrier", "register_callable_kernel", "register_function_lookup", - "inline_callable", + "inline_callable_kernel", # }}} diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 07980b854..20240bc7f 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -66,7 +66,7 @@ def register_function_lookup(kernel, function_lookup): # {{{ register_callable_kernel -class RegisterCalleeKernel(ImmutableRecord): +class _RegisterCalleeKernel(ImmutableRecord): """ Helper class to make the function scoper from :func:`loopy.transform.register_callable_kernel` picklable. As python @@ -140,16 +140,17 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): callee_kernel = set_options(callee_kernel, "disable_global_barriers") return register_function_lookup(caller_kernel, - RegisterCalleeKernel(function_name, callable_kernel)) + _RegisterCalleeKernel(function_name, callable_kernel)) # }}} # {{{ inline callable kernel -def inline_callable(kernel, function_name): +def inline_callable_kernel(kernel, function_name): """ - Returns a copy of *kernel* with the callable addresed by *function_name* inlined. + Returns a copy of *kernel* with the callable kernel addresed by + *function_name* inlined. """ from loopy.preprocess import infer_arg_descr kernel = infer_arg_descr(kernel) @@ -178,9 +179,22 @@ def inline_callable(kernel, function_name): # {{{ matching caller to callee args if dimenstions dont match class DimChanger(IdentityMapper): - def __init__(self, callee_arg_dict, desired_dim_tag_dict): + """ + Mapper to change the dimensions of an argument. + + .. attribute:: callee_arg_dict + + A mapping from the argument name (:class:`str`) to instances of + :class:`loopy.kernel.array.ArrayBase`. + + .. attribute:: desried_shape + + A mapping from argument name (:class:`str`) to an instance of + :class:`tuple`. 
+ """ + def __init__(self, callee_arg_dict, desired_shape): self.callee_arg_dict = callee_arg_dict - self.desired_dim_tag_dict = desired_dim_tag_dict + self.desired_shape = desired_shape def map_subscript(self, expr): callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags @@ -188,34 +202,43 @@ class DimChanger(IdentityMapper): zip(callee_arg_dim_tags, expr.index_tuple)) new_indices = [] - for dim_tag in self.desired_dim_tag_dict[expr.aggregate.name]: - ind = flattened_index // dim_tag.stride - flattened_index -= (dim_tag.stride * ind) + from operator import mul + from functools import reduce + stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) + + for length in self.desired_shape[expr.aggregate.name]: + stride /= length + ind = flattened_index // int(stride) + flattened_index -= (int(stride) * ind) new_indices.append(simplify_via_aff(ind)) return expr.aggregate.index(tuple(new_indices)) -def _match_caller_callee_argument_dimension(caller_knl, callee_fn): +def _match_caller_callee_argument_dimension(caller_knl, callee_function_name): """ - #TODO: Fix docs. - One must call this after registering the callee kernel into the caller - kernel. + Returns a copy of *caller_knl* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *caller_knl* aligned with the argument + dimesnsions required by *caller_knl*. """ pymbolic_calls_to_new_callables = {} for insn in caller_knl.instructions: if not isinstance(insn, CallInstruction) or ( insn.expression.function.name not in caller_knl.scoped_functions): + # Call to a callable kernel can only occur through a + # CallInstruction. continue in_knl_callable = caller_knl.scoped_functions[ insn.expression.function.name] - if in_knl_callable.subkernel.name != callee_fn: + if in_knl_callable.subkernel.name != callee_function_name: + # Not the callable we're looking for. continue - # getting the caller callee arg association + # getting the caller->callee arg association parameters = insn.expression.parameters[:] kw_parameters = {} @@ -224,24 +247,24 @@ def _match_caller_callee_argument_dimension(caller_knl, callee_fn): assignees = insn.assignees - parameter_dim_tags = [par.get_array_arg_descriptor(caller_knl).dim_tags + parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape for par in parameters] kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameter_dim_tags.append(kw_parameters[pos_to_kw[i]] - .get_array_arg_descriptor(caller_knl).dim_tags) + parameter_shapes.append(kw_parameters[pos_to_kw[i]] + .get_array_arg_descriptor(caller_knl).shape) # inserting the assigness at the required positions. 
assignee_write_count = -1 for i, arg in enumerate(in_knl_callable.subkernel.args): if arg.direction == 'out': assignee = assignees[-assignee_write_count-1] - parameter_dim_tags.insert(i, assignee - .get_array_arg_descriptor(caller_knl).dim_tags) + parameter_shapes.insert(i, assignee + .get_array_arg_descriptor(caller_knl).shape) assignee_write_count -= 1 callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in - in_knl_callable.subkernel.args], parameter_dim_tags)) + in_knl_callable.subkernel.args], parameter_shapes)) dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, callee_arg_to_desired_dim_tag) new_callee_insns = [] @@ -250,15 +273,29 @@ def _match_caller_callee_argument_dimension(caller_knl, callee_fn): new_callee_insns.append(callee_insn.copy(expression=dim_changer( callee_insn.expression), assignee=dim_changer(callee_insn.assignee))) - + elif isinstance(callee_insn, (CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknwon instruction %s." % + type(insn)) + + # subkernel with instructions adjusted according to the new dimensions. new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable + if not pymbolic_calls_to_new_callables: + # complain if no matching function found. + raise LoopyError("No CallableKernel with the name %s found in %s." % ( + callee_function_name, caller_knl.name)) + return register_pymbolic_calls_to_knl_callables(caller_knl, pymbolic_calls_to_new_callables) # }}} + + # vim: foldmethod=marker diff --git a/test/test_transform.py b/test/test_transform.py index d24e0b6a0..5ada3ed11 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -247,8 +247,8 @@ def test_register_knl(ctx_factory, inline): knl = lp.register_callable_kernel( parent_knl, 'linear_combo2', child_knl) if inline: - knl = lp.inline_callable(knl, 'linear_combo2') - knl = lp.inline_callable(knl, 'linear_combo1') + knl = lp.inline_callable_kernel(knl, 'linear_combo2') + knl = lp.inline_callable_kernel(knl, 'linear_combo1') evt, (out, ) = knl(queue, x=x, y=y) @@ -295,7 +295,7 @@ def test_slices_with_negative_step(ctx_factory, inline): knl = lp.register_callable_kernel( parent_knl, 'linear_combo', child_knl) if inline: - knl = lp.inline_callable(knl, 'linear_combo') + knl = lp.inline_callable_kernel(knl, 'linear_combo') evt, (out, ) = knl(queue, x=x, y=y) @@ -337,7 +337,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): knl = lp.register_callable_kernel( caller_knl, 'linear_combo', callee_knl) if inline: - knl = lp.inline_callable(knl, 'linear_combo') + knl = lp.inline_callable_kernel(knl, 'linear_combo') evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) @@ -385,7 +385,7 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): caller_knl, 'linear_combo', callee_knl) if inline: - knl = lp.inline_callable(knl, 'linear_combo') + knl = lp.inline_callable_kernel(knl, 'linear_combo') evt, (out, ) = knl(queue, x=x_dev, y=y_dev) @@ -436,9 +436,9 @@ def test_shape_translation_through_sub_array_ref(ctx_factory, inline): knl = lp.register_callable_kernel(knl, 'callee_fn3', callee3) if inline: - knl = lp.inline_callable(knl, 'callee_fn1') - knl = lp.inline_callable(knl, 'callee_fn2') - knl = lp.inline_callable(knl, 'callee_fn3') + knl = lp.inline_callable_kernel(knl, 'callee_fn1') + knl = lp.inline_callable_kernel(knl, 'callee_fn2') + knl = 
lp.inline_callable_kernel(knl, 'callee_fn3') knl = lp.set_options(knl, "write_cl") knl = lp.set_options(knl, "return_dict") -- GitLab From 905492e7938841921f720108a8ebb49077d11f1c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 17 Jun 2018 23:47:44 -0500 Subject: [PATCH 199/916] Minor changes to adjust to the new iname_to_tags attribute of the kernel. --- loopy/kernel/function_interface.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 00bbdedd2..e4e3d43ed 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -721,22 +721,24 @@ class CallableKernel(InKernelCallable): iname_map[iname] = vng(callee_label+iname) new_domains = [] - new_iname_to_tag = {} + new_iname_to_tags = {} + + # transferring iname tags info from callee to the caller kernel. for domain in callee_knl.domains: new_domain = domain.copy() for i in range(new_domain.n_dim()): iname = new_domain.get_dim_name(dim_type, i) if iname in callee_knl.iname_to_tag: - new_iname_to_tag[iname_map[iname]] = ( + new_iname_to_tags[iname_map[iname]] = ( callee_knl.iname_to_tag[iname]) new_domain = new_domain.set_dim_name( dim_type, i, iname_map[iname]) new_domains.append(new_domain) - new_iname_to_tag.update(kernel.iname_to_tag) + new_iname_to_tags.update(kernel.iname_to_tag) kernel = kernel.copy(domains=kernel.domains + new_domains, - iname_to_tag=new_iname_to_tag) + iname_to_tags=new_iname_to_tags) # }}} -- GitLab From 6ea3f6e6ab3504c037e0568a5e308c78031a52c2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 18 Jun 2018 00:43:48 -0500 Subject: [PATCH 200/916] fixes minor error in transferring iname tags from callee to the caller kernel. --- loopy/kernel/function_interface.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e4e3d43ed..2e9c81e22 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -721,22 +721,19 @@ class CallableKernel(InKernelCallable): iname_map[iname] = vng(callee_label+iname) new_domains = [] - new_iname_to_tags = {} + new_iname_to_tags = kernel.iname_to_tags.copy() - # transferring iname tags info from callee to the caller kernel. + # transferring iname tags info from the callee to the caller kernel for domain in callee_knl.domains: new_domain = domain.copy() for i in range(new_domain.n_dim()): iname = new_domain.get_dim_name(dim_type, i) - if iname in callee_knl.iname_to_tag: - new_iname_to_tags[iname_map[iname]] = ( - callee_knl.iname_to_tag[iname]) + new_iname_to_tags[iname_map[iname]] = ( + callee_knl.iname_to_tags[iname]) new_domain = new_domain.set_dim_name( dim_type, i, iname_map[iname]) new_domains.append(new_domain) - new_iname_to_tags.update(kernel.iname_to_tag) - kernel = kernel.copy(domains=kernel.domains + new_domains, iname_to_tags=new_iname_to_tags) -- GitLab From 50383f3c6b70ea304912ea688c3db4722b2b9be6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 21 Jun 2018 00:53:56 -0500 Subject: [PATCH 201/916] Changes according to review-I. 
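[Editor's note] The two commits above adjust how iname tags survive inlining: every callee iname is renamed to a caller-unique name, and its tag has to follow it into the caller's `iname_to_tags`. Below is a minimal sketch of that bookkeeping, using plain dicts and strings in place of isl domains and tag objects — `transfer_iname_tags` and the tag spellings are illustrative, not loopy API.

```python
def transfer_iname_tags(caller_iname_to_tags, callee_iname_to_tags, iname_map):
    # start from the caller's existing tags and add the renamed callee inames
    new_iname_to_tags = dict(caller_iname_to_tags)
    for callee_iname, tag in callee_iname_to_tags.items():
        new_iname_to_tags[iname_map[callee_iname]] = tag
    return new_iname_to_tags


# 'i' was tagged in the callee; after inlining it lives on under its new name
# 'callee_fn_i' in the caller and keeps that tag.
tags = transfer_iname_tags({"j": "g.0"}, {"i": "l.0"}, {"i": "callee_fn_i"})
assert tags == {"j": "g.0", "callee_fn_i": "l.0"}
```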
--- loopy/kernel/__init__.py | 18 ++++------- loopy/kernel/creation.py | 36 ++++++++++++---------- loopy/kernel/data.py | 39 +++++++++++------------- loopy/kernel/function_interface.py | 45 ++++++++++++++++++++-------- loopy/kernel/tools.py | 41 ++++++++++++------------- loopy/target/opencl.py | 2 +- loopy/transform/register_callable.py | 10 +++---- 7 files changed, 103 insertions(+), 88 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index b36abc847..cf0467e08 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -185,13 +185,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): A subclass of :class:`loopy.TargetBase`. - .. attribute:: is_master_kernel - - # FIXME: Naming suggestions? - # is_top_level_kernel - # is_caller_kernel - # is_called_from_host - # is_root_kernel + .. attribute:: is_called_from_host An instance of :class:`bool`. Will be set *False* for the kernel which would be called from another top level kernels. Default value is @@ -224,7 +218,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=None, state=kernel_state.INITIAL, - is_master_kernel=True, + is_called_from_host=True, target=None, overridden_get_grid_sizes_for_insn_ids=None): @@ -310,7 +304,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): index_dtype=index_dtype, options=options, state=state, - is_master_kernel=is_master_kernel, + is_called_from_host=is_called_from_host, target=target, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids)) @@ -362,7 +356,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return None - def lookup_function(self, identifier): + def find_scoped_function_identifier(self, identifier): """ Returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable` if the @@ -1043,7 +1037,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): insn_ids, ignore_auto=ignore_auto) - assert self.is_master_kernel, ("Callee kernels do not have sufficient " + assert self.is_called_from_host, ("Callee kernels do not have sufficient " "information to compute grid sizes.") global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( @@ -1407,7 +1401,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "silenced_warnings", "options", "state", - "is_master_kernel", + "is_called_from_host", "target", ) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 781d8b986..d3f12d417 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1861,7 +1861,8 @@ class FunctionScoper(RuleAwareIdentityMapper): if not isinstance(expr.function, ScopedFunction): # searching the kernel for the function. - in_knl_callable = self.kernel.lookup_function(expr.function.name) + in_knl_callable = self.kernel.find_scoped_function_identifier( + expr.function.name) if in_knl_callable: # Associating the newly created ScopedFunction with the # resolved in-kernel callable. @@ -1880,7 +1881,8 @@ class FunctionScoper(RuleAwareIdentityMapper): if not isinstance(expr.function, ScopedFunction): # searching the kernel for the function. - in_knl_callable = self.kernel.lookup_function(expr.function.name) + in_knl_callable = self.kernel.find_scoped_function_identifier( + expr.function.name) if in_knl_callable: # Associating the newly created ScopedFunction with the @@ -1908,26 +1910,30 @@ class FunctionScoper(RuleAwareIdentityMapper): # Noting down the extra functions arising due to certain reductions. 
if isinstance(expr.operation, MaxReductionOperation): - self.scoped_functions["max"] = self.kernel.lookup_function("max") + self.scoped_functions["max"] = ( + self.kernel.find_scoped_function_identifier("max")) elif isinstance(expr.operation, MinReductionOperation): - self.scoped_functions["min"] = self.kernel.lookup_function("min") + self.scoped_functions["min"] = ( + self.kernel.find_scoped_function_identifier("min")) elif isinstance(expr.operation, ArgMaxReductionOperation): - self.scoped_functions["max"] = self.kernel.lookup_function("max") - self.scoped_functions["make_tuple"] = self.kernel.lookup_function( - "make_tuple") + self.scoped_functions["max"] = ( + self.kernel.find_scoped_function_identifier("max")) + self.scoped_functions["make_tuple"] = ( + self.kernel.find_scoped_function_identifier("make_tuple")) self.scoped_functions[ArgExtOp(expr.operation)] = ( - self.kernel.lookup_function(expr.operation)) + self.kernel.find_scoped_function_identifier(expr.operation)) elif isinstance(expr.operation, ArgMinReductionOperation): - self.scoped_functions["min"] = self.kernel.lookup_function("min") - self.scoped_functions["make_tuple"] = self.kernel.lookup_function( - "make_tuple") + self.scoped_functions["min"] = ( + self.kernel.find_scoped_function_identifier("min")) + self.scoped_functions["make_tuple"] = ( + self.kernel.find_scoped_function_identifier("make_tuple")) self.scoped_functions[ArgExtOp(expr.operation)] = ( - self.kernel.lookup_function(expr.operation)) + self.kernel.find_scoped_function_identifier(expr.operation)) elif isinstance(expr.operation, _SegmentedScalarReductionOperation): - self.scoped_functions["make_tuple"] = self.kernel.lookup_function( - "make_tuple") + self.scoped_functions["make_tuple"] = ( + self.kernel.find_scoped_function_identifier("make_tuple")) self.scoped_functions[SegmentedOp(expr.operation)] = ( - self.kernel.lookup_function(expr.operation)) + self.kernel.find_scoped_function_identifier(expr.operation)) return super(FunctionScoper, self).map_reduction(expr, expn_state) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 1c927b8af..ddcb16563 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -271,26 +271,38 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype - kwargs["direction"] = kwargs.pop("direction", None) + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) ImmutableRecord.__init__(self, **kwargs) class ArrayArg(ArrayBase, KernelArgument): + __doc__ = ArrayBase.__doc__ + ( + """ + .. attribute:: memory_address_space + + An attribute of :class:`MemoryAddressSpace` defining the address + space in which the array resides in the target memory layout. + Defaults to ``MemoryAddressSpace.GLOBAL`` + + .. attribute:: is_output_only + + An instance of :class:`bool`. If set to *TRUE*, recorded to be + returned from the kernel. + """) allowed_extra_kwargs = [ "memory_address_space", - "direction"] + "is_output_only"] def __init__(self, *args, **kwargs): # Defaulting the memory_address_space to be GLOBAL. 
kwargs["memory_address_space"] = kwargs.pop( "memory_address_space", MemoryAddressSpace.GLOBAL) - kwargs["direction"] = kwargs.pop("direction", None) + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) super(ArrayArg, self).__init__(*args, **kwargs) - __doc__ = ArrayBase.__doc__ min_target_axes = 0 max_target_axes = 1 @@ -334,28 +346,13 @@ class ImageArg(ArrayBase, KernelArgument): class ValueArg(KernelArgument): def __init__(self, name, dtype=None, approximately=1000, target=None, - direction=None): - - # {{{ sanity checks for direction - - if direction == 'out': - # TODO: Is this only valid for C-like targets? - # Do we need to move this to target.precodegen_checks? - raise LoopyError("ValueArg cannot have 'out' as the direction.") - elif direction is None: - direction = 'in' - elif direction == 'in': - pass - else: - raise LoopyError("Unknown type for direction of %s." % name) - - # }}} + is_output_only=None): KernelArgument.__init__(self, name=name, dtype=dtype, approximately=approximately, target=target, - direction=direction) + is_output_only=is_output_only) def __str__(self): import loopy as lp diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 00bbdedd2..e9aaeefe8 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -99,8 +99,8 @@ def get_kw_pos_association(kernel): Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in *kernel*. """ - from loopy.kernel.tools import infer_arg_direction - kernel = infer_arg_direction(kernel) + from loopy.kernel.tools import infer_arg_is_output_only + kernel = infer_arg_is_output_only(kernel) kw_to_pos = {} pos_to_kw = {} @@ -108,22 +108,39 @@ def get_kw_pos_association(kernel): write_count = -1 for arg in kernel.args: - if arg.direction == 'in': + if not arg.is_output_only: kw_to_pos[arg.name] = read_count pos_to_kw[read_count] = arg.name read_count += 1 - elif arg.direction == 'out': + else: kw_to_pos[arg.name] = write_count pos_to_kw[write_count] = arg.name write_count -= 1 - else: - raise LoopyError("Unknown value of kernel argument direction %s for " - "%s" % (arg.direction, arg.name)) return kw_to_pos, pos_to_kw class GridOverrideForCalleeKernel(ImmutableRecord): + """ + Helper class to set the + :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the + callee kernels. Refer + :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, + :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. + + .. attribute:: local_size + + The local work group size that has to be set in the callee kernel. + + .. attribute:: global_size + + The global work group size that to be set in the callee kernel. + + .. note:: + + This class acts as a pseduo-callable and its significance lies in + solving picklability issues. + """ fields = set(["local_size", "global_size"]) def __init__(self, local_size, global_size): @@ -304,9 +321,13 @@ class InKernelCallable(ImmutableRecord): class ScalarCallable(InKernelCallable): """ - Records the information about a scalar callable encountered in a kernel. - The :meth:`ScalarCallable.with_types` is intended to assist with type - specialization of the funciton. + An abstranct interface the to a scalar callable encountered in a kernel. + + .. note:: + + The :meth:`ScalarCallable.with_types` is intended to assist with type + specialization of the funciton and is expected to be supplemented in the + derived subclasses. 
""" fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) @@ -774,7 +795,7 @@ class CallableKernel(InKernelCallable): assignee_pos = 0 parameter_pos = 0 for i, arg in enumerate(callee_knl.args): - if arg.direction == "out": + if arg.is_output_only: arg_map[arg.name] = assignees[assignee_pos] assignee_pos += 1 else: @@ -911,7 +932,7 @@ class CallableKernel(InKernelCallable): # inserting the assigness at the required positions. assignee_write_count = -1 for i, arg in enumerate(self.subkernel.args): - if arg.direction == 'out': + if arg.is_output_only: assignee = assignees[-assignee_write_count-1] parameters.insert(i, assignee) par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 436b92223..080548005 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1849,41 +1849,38 @@ def get_callee_kernels(kernel, insn_ids=None): # {{{ direction helper tools -def infer_arg_direction(kernel): +def infer_arg_is_output_only(kernel): """ - Returns a copy of *kernel* with the directions of the argument inferred. + Returns a copy of *kernel* with the attribute ``is_output_only`` set. .. note:: - Implements a simple heuristic -- if the argument direction is not - specified by the user then if the argument is written at any point - during in the kernel then its direction is set to be ``out``, otherwise - ``in``. + + If the attribute ``is_output_only`` is not supplied from an user, then + infers it as an output argument if it is written at some point in the + kernel. """ from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg - direction_inferred_args = [] + new_args = [] for arg in kernel.args: - if isinstance(arg, (ArrayArg, ImageArg)): - if arg.direction is not None: - if arg.direction not in ['in', 'out']: - raise LoopyError("Unknown value of direction %s for %s." % ( - arg.direction, arg.name)) - direction_inferred_args.append(arg) + if isinstance(arg, (ArrayArg, ImageArg, ValueArg)): + if arg.is_output_only is not None: + assert isinstance(arg.is_output_only, bool) + new_args.append(arg) else: if arg.name in kernel.get_written_variables(): - direction_inferred_args.append(arg.copy(direction='out')) + new_args.append(arg.copy(is_output_only=True)) else: - direction_inferred_args.append(arg.copy(direction='in')) - elif isinstance(arg, (ValueArg, ConstantArg)): - # For ValueArg, ConstantArg the direction always has to be in. - if arg.direction is not None and arg.direction == 'out': - raise LoopyError("Argument %s cannot have 'out' direction." % - arg.name) + new_args.append(arg.copy(is_output_only=False)) + elif isinstance(arg, ConstantArg): + if arg.is_output_only: + raise LoopyError("Constant Argument %s cannot have " + "is_output_only True" % arg.name) else: - direction_inferred_args.append(arg.copy(direction='in')) + new_args.append(arg.copy(is_output_only=False)) else: raise NotImplementedError("Unkonwn argument type %s." 
% type(arg)) - return kernel.copy(args=direction_inferred_args) + return kernel.copy(args=new_args) # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 5d00dd39a..164bfb7aa 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -451,7 +451,7 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) - if not codegen_state.kernel.is_master_kernel: + if not codegen_state.kernel.is_called_from_host: # auxiliary kernels need not mention opencl speicific qualifiers # for a functions signature return fdecl diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 20240bc7f..dda5a0cc7 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -101,10 +101,10 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # check to make sure that the variables with 'out' direction is equal to # the number of assigness in the callee kernel intructions. - from loopy.kernel.tools import infer_arg_direction - callee_kernel = infer_arg_direction(callee_kernel) + from loopy.kernel.tools import infer_arg_is_output_only + callee_kernel = infer_arg_is_output_only(callee_kernel) expected_num_assignees = len([arg for arg in callee_kernel.args if - arg.direction == 'out']) + arg.is_output_only]) expected_num_parameters = len(callee_kernel.args) - expected_num_assignees for insn in caller_kernel.instructions: if isinstance(insn, CallInstruction) and ( @@ -133,7 +133,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): callable_kernel = CallableKernel(subkernel=callee_kernel.copy( target=caller_kernel.target, name=function_name, - is_master_kernel=False)) + is_called_from_host=False)) # disabling global barriers for callee kernel from loopy import set_options @@ -257,7 +257,7 @@ def _match_caller_callee_argument_dimension(caller_knl, callee_function_name): # inserting the assigness at the required positions. assignee_write_count = -1 for i, arg in enumerate(in_knl_callable.subkernel.args): - if arg.direction == 'out': + if arg.is_output_only: assignee = assignees[-assignee_write_count-1] parameter_shapes.insert(i, assignee .get_array_arg_descriptor(caller_knl).shape) -- GitLab From d1d9e1ed1bab00238ac4bbb527ccee3657f8d595 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 21 Jun 2018 00:58:57 -0500 Subject: [PATCH 202/916] Changes the name from MemoryAddressSpace-> AddressSpace. 
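[Editor's note] Before the rename below, the `is_output_only` inference introduced in the previous commit can be summarized in a few lines: an explicit user-supplied setting is kept, and otherwise an argument is treated as an output exactly when the kernel writes to it. This is a toy restatement operating on plain names rather than loopy argument objects, and it omits the ConstantArg error path.

```python
def guess_is_output_only(arg_name, explicit_setting, written_variables):
    # an explicit user-supplied setting always wins
    if explicit_setting is not None:
        return explicit_setting
    # otherwise: written somewhere in the kernel => output argument
    return arg_name in written_variables


written = {"out", "tmp"}
assert guess_is_output_only("out", None, written) is True
assert guess_is_output_only("x", None, written) is False
assert guess_is_output_only("x", True, written) is True
```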
--- loopy/__init__.py | 4 +-- loopy/check.py | 26 +++++++++--------- loopy/codegen/control.py | 4 +-- loopy/kernel/__init__.py | 12 ++++---- loopy/kernel/data.py | 44 +++++++++++++++--------------- loopy/kernel/function_interface.py | 2 +- loopy/preprocess.py | 38 +++++++++++++------------- loopy/schedule/tools.py | 4 +-- loopy/statistics.py | 4 +-- loopy/target/c/__init__.py | 12 ++++---- loopy/target/cuda.py | 8 +++--- loopy/target/ispc.py | 10 +++---- loopy/target/opencl.py | 28 +++++++++---------- loopy/target/pyopencl.py | 10 +++---- loopy/transform/batch.py | 4 +-- loopy/transform/buffer.py | 10 +++---- loopy/transform/data.py | 14 +++++----- loopy/transform/precompute.py | 12 ++++---- loopy/transform/save.py | 8 +++--- 19 files changed, 127 insertions(+), 127 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index d5aebbf22..cd4f2ad78 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -45,7 +45,7 @@ from loopy.kernel.data import ( auto, KernelArgument, ValueArg, ArrayArg, GlobalArg, ConstantArg, ImageArg, - temp_var_scope, TemporaryVariable, MemoryAddressSpace, + temp_var_scope, TemporaryVariable, AddressSpace, SubstitutionRule, CallMangleInfo) from loopy.kernel.function_interface import ( @@ -170,7 +170,7 @@ __all__ = [ "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", - "MemoryAddressSpace", "temp_var_scope", "TemporaryVariable", + "AddressSpace", "temp_var_scope", "TemporaryVariable", "SubstitutionRule", "CallMangleInfo", diff --git a/loopy/check.py b/loopy/check.py index 080c5721c..8e2f74801 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -239,20 +239,20 @@ def check_for_inactive_iname_access(kernel): def _is_racing_iname_tag(tv, tag): - from loopy.kernel.data import (MemoryAddressSpace, + from loopy.kernel.data import (AddressSpace, LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) - if tv.scope == MemoryAddressSpace.PRIVATE: + if tv.scope == AddressSpace.PRIVATE: return ( isinstance(tag, ConcurrentTag) and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag))) - elif tv.scope == MemoryAddressSpace.LOCAL: + elif tv.scope == AddressSpace.LOCAL: return ( isinstance(tag, ConcurrentTag) and not isinstance(tag, GroupIndexTag)) - elif tv.scope == MemoryAddressSpace.GLOBAL: + elif tv.scope == AddressSpace.GLOBAL: return isinstance(tag, ConcurrentTag) elif tv.scope == auto: @@ -517,15 +517,15 @@ class IndirectDependencyEdgeFinder(object): def declares_nosync_with(kernel, var_scope, dep_a, dep_b): - from loopy.kernel.data import MemoryAddressSpace - if var_scope == MemoryAddressSpace.GLOBAL: + from loopy.kernel.data import AddressSpace + if var_scope == AddressSpace.GLOBAL: search_scopes = ["global", "any"] - elif var_scope == MemoryAddressSpace.LOCAL: + elif var_scope == AddressSpace.LOCAL: search_scopes = ["local", "any"] - elif var_scope == MemoryAddressSpace.PRIVATE: + elif var_scope == AddressSpace.PRIVATE: search_scopes = ["any"] else: - raise ValueError("unexpected value of 'MemoryAddressSpace'") + raise ValueError("unexpected value of 'AddressSpace'") ab_nosync = False ba_nosync = False @@ -548,7 +548,7 @@ def _check_variable_access_ordered_inner(kernel): wmap = kernel.writer_map() rmap = kernel.reader_map() - from loopy.kernel.data import ValueArg, MemoryAddressSpace, ArrayArg + from loopy.kernel.data import ValueArg, AddressSpace, ArrayArg from loopy.kernel.tools import find_aliasing_equivalence_classes depfind = IndirectDependencyEdgeFinder(kernel) @@ -577,7 +577,7 @@ def 
_check_variable_access_ordered_inner(kernel): if isinstance(arg, ArrayArg): scope = arg.memory_address_space elif isinstance(arg, ValueArg): - scope = MemoryAddressSpace.PRIVATE + scope = AddressSpace.PRIVATE else: # No need to consider ConstantArg and ImageArg (for now) # because those won't be written. @@ -843,7 +843,7 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # {{{ check that temporaries are defined in subkernels where used def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace from loopy.kernel.tools import get_subkernels for subkernel in get_subkernels(kernel): @@ -874,7 +874,7 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): "aliases have a definition" % (temporary, subkernel)) continue - if tval.scope in (MemoryAddressSpace.PRIVATE, MemoryAddressSpace.LOCAL): + if tval.scope in (AddressSpace.PRIVATE, AddressSpace.LOCAL): from loopy.diagnostic import MissingDefinitionError raise MissingDefinitionError("temporary variable '%s' gets used " "in subkernel '%s' without a definition (maybe you forgot " diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index dd9cda618..3aecc4bcf 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -63,7 +63,7 @@ def synthesize_idis_for_extra_args(kernel, schedule_index): sched_item = kernel.schedule[schedule_index] from loopy.codegen import ImplementedDataInfo - from loopy.kernel.data import InameArg, MemoryAddressSpace + from loopy.kernel.data import InameArg, AddressSpace assert isinstance(sched_item, CallKernel) @@ -71,7 +71,7 @@ def synthesize_idis_for_extra_args(kernel, schedule_index): for arg in sched_item.extra_args: temporary = kernel.temporary_variables[arg] - assert temporary.scope == MemoryAddressSpace.GLOBAL + assert temporary.scope == AddressSpace.GLOBAL idis.extend( temporary.decl_info( kernel.target, diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index cf0467e08..74a7e7fe7 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -881,7 +881,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def global_var_names(self): - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace from loopy.kernel.data import ArrayArg return ( @@ -891,7 +891,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): | set( tv.name for tv in six.itervalues(self.temporary_variables) - if tv.scope == MemoryAddressSpace.GLOBAL)) + if tv.scope == AddressSpace.GLOBAL)) # }}} @@ -1118,17 +1118,17 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def local_var_names(self): - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace return set( tv.name for tv in six.itervalues(self.temporary_variables) - if tv.scope == MemoryAddressSpace.LOCAL) + if tv.scope == AddressSpace.LOCAL) def local_mem_use(self): - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace return sum( tv.nbytes for tv in six.itervalues(self.temporary_variables) - if tv.scope == MemoryAddressSpace.LOCAL) + if tv.scope == AddressSpace.LOCAL) # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index ddcb16563..6cd28047b 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -209,7 +209,7 @@ def parse_tag(tag): # {{{ memory address space -class MemoryAddressSpace: +class AddressSpace: """ Storage location 
of a variable. @@ -281,9 +281,9 @@ class ArrayArg(ArrayBase, KernelArgument): """ .. attribute:: memory_address_space - An attribute of :class:`MemoryAddressSpace` defining the address + An attribute of :class:`AddressSpace` defining the address space in which the array resides in the target memory layout. - Defaults to ``MemoryAddressSpace.GLOBAL`` + Defaults to ``AddressSpace.GLOBAL`` .. attribute:: is_output_only @@ -298,7 +298,7 @@ class ArrayArg(ArrayBase, KernelArgument): def __init__(self, *args, **kwargs): # Defaulting the memory_address_space to be GLOBAL. kwargs["memory_address_space"] = kwargs.pop( - "memory_address_space", MemoryAddressSpace.GLOBAL) + "memory_address_space", AddressSpace.GLOBAL) kwargs["is_output_only"] = kwargs.pop("is_output_only", None) super(ArrayArg, self).__init__(*args, **kwargs) @@ -392,7 +392,7 @@ class InameArg(ValueArg): class _deprecated_temp_var_scope_property(property): # noqa def __get__(self, cls, owner): from warnings import warn - warn("'temp_var_scope' is deprecated. Use 'MemoryAddressSpace'.", + warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.", DeprecationWarning, stacklevel=2) return classmethod(self.fget).__get__(None, owner)() @@ -403,22 +403,22 @@ class temp_var_scope: # noqa @_deprecated_temp_var_scope_property def PRIVATE(self): - return MemoryAddressSpace.PRIVATE + return AddressSpace.PRIVATE @_deprecated_temp_var_scope_property def LOCAL(self): - return MemoryAddressSpace.LOCAL + return AddressSpace.LOCAL @_deprecated_temp_var_scope_property def GLOBAL(self): - return MemoryAddressSpace.GLOBAL + return AddressSpace.GLOBAL @classmethod def stringify(cls, val): from warnings import warn - warn("'temp_var_scope' is deprecated. Use 'MemoryAddressSpace'.", + warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.", DeprecationWarning, stacklevel=2) - return MemoryAddressSpace.stringify + return AddressSpace.stringify class TemporaryVariable(ArrayBase): @@ -428,7 +428,7 @@ class TemporaryVariable(ArrayBase): .. attribute:: scope What memory this temporary variable lives in. - One of the values in :class:`MemoryAddressSpace`, + One of the values in :class:`AddressSpace`, or :class:`loopy.auto` if this is to be automatically determined. @@ -440,7 +440,7 @@ class TemporaryVariable(ArrayBase): .. attribute:: scope - One of :class:`MemoryAddressSpace`. + One of :class:`AddressSpace`. .. 
attribute:: initializer @@ -556,15 +556,15 @@ class TemporaryVariable(ArrayBase): @property def is_local(self): - """One of :class:`loopy.MemoryAddressSpace`.""" + """One of :class:`loopy.AddressSpace`.""" if self.scope is auto: return auto - elif self.scope == MemoryAddressSpace.LOCAL: + elif self.scope == AddressSpace.LOCAL: return True - elif self.scope == MemoryAddressSpace.PRIVATE: + elif self.scope == AddressSpace.PRIVATE: return False - elif self.scope == MemoryAddressSpace.GLOBAL: + elif self.scope == AddressSpace.GLOBAL: raise LoopyError("TemporaryVariable.is_local called on " "global temporary variable '%s'" % self.name) else: @@ -585,9 +585,9 @@ class TemporaryVariable(ArrayBase): shape_override=self.storage_shape) def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): - if self.scope == MemoryAddressSpace.GLOBAL: + if self.scope == AddressSpace.GLOBAL: return ast_builder.get_array_arg_decl(self.name + name_suffix, - MemoryAddressSpace.GLOBAL, shape, dtype, is_written) + AddressSpace.GLOBAL, shape, dtype, is_written) else: raise LoopyError("unexpected request for argument declaration of " "non-global temporary") @@ -596,7 +596,7 @@ class TemporaryVariable(ArrayBase): if self.scope is auto: scope_str = "auto" else: - scope_str = MemoryAddressSpace.stringify(self.scope) + scope_str = AddressSpace.stringify(self.scope) return ( self.stringify(include_typename=False) @@ -645,11 +645,11 @@ def iname_tag_to_temp_var_scope(iname_tag): iname_tag = parse_tag(iname_tag) if isinstance(iname_tag, GroupIndexTag): - return MemoryAddressSpace.GLOBAL + return AddressSpace.GLOBAL elif isinstance(iname_tag, LocalIndexTag): - return MemoryAddressSpace.LOCAL + return AddressSpace.LOCAL else: - return MemoryAddressSpace.PRIVATE + return AddressSpace.PRIVATE # {{{ substitution rule diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e9aaeefe8..42c0c74c2 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -63,7 +63,7 @@ class ArrayArgDescriptor(ImmutableRecord): .. attribute:: mem_scope - An attribute of :class:`loopy.kernel.data.MemoryAddressSpace`. + An attribute of :class:`loopy.kernel.data.AddressSpace`. .. 
attribute:: dim_tags diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 6f11224a6..4d9e71ef9 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -160,7 +160,7 @@ def find_temporary_scope(kernel): new_temp_vars = {} from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag, - MemoryAddressSpace) + AddressSpace) import loopy as lp writers = kernel.writer_map() @@ -221,12 +221,12 @@ def find_temporary_scope(kernel): assert locparallel_assignee_inames <= locparallel_compute_inames assert grpparallel_assignee_inames <= grpparallel_compute_inames - desired_scope = MemoryAddressSpace.PRIVATE + desired_scope = AddressSpace.PRIVATE for iname_descr, scope_descr, apin, cpin, scope in [ ("local", "local", locparallel_assignee_inames, - locparallel_compute_inames, MemoryAddressSpace.LOCAL), + locparallel_compute_inames, AddressSpace.LOCAL), ("group", "global", grpparallel_assignee_inames, - grpparallel_compute_inames, MemoryAddressSpace.GLOBAL), + grpparallel_compute_inames, AddressSpace.GLOBAL), ]: if (apin != cpin and bool(apin)): @@ -774,7 +774,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): last_added_insn_id = insn.id - from loopy.kernel.data import MemoryAddressSpace, TemporaryVariable + from loopy.kernel.data import AddressSpace, TemporaryVariable FIRST_POINTER_ASSIGNEE_IDX = 1 # noqa @@ -787,7 +787,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): assignee_var_name in kernel.temporary_variables and (kernel.temporary_variables[assignee_var_name].scope - == MemoryAddressSpace.PRIVATE)): + == AddressSpace.PRIVATE)): new_assignees.append(assignee) continue @@ -809,7 +809,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): TemporaryVariable( name=new_assignee_name, dtype=None, - scope=MemoryAddressSpace.PRIVATE)) + scope=AddressSpace.PRIVATE)) from pymbolic import var new_assignee = var(new_assignee_name) @@ -990,12 +990,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for i in range(nresults)] for name in temp_var_names: - from loopy.kernel.data import TemporaryVariable, MemoryAddressSpace + from loopy.kernel.data import TemporaryVariable, AddressSpace new_temporary_variables[name] = TemporaryVariable( name=name, shape=(), dtype=None, - scope=MemoryAddressSpace.PRIVATE) + scope=AddressSpace.PRIVATE) from pymbolic import var temp_vars = tuple(var(n) for n in temp_var_names) @@ -1021,13 +1021,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace acc_var_names = make_temporaries( name_based_on="acc_"+"_".join(expr.inames), nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=MemoryAddressSpace.PRIVATE) + scope=AddressSpace.PRIVATE) init_insn_depends_on = frozenset() @@ -1159,21 +1159,21 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, _get_int_iname_size(oiname) for oiname in outer_local_inames) - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace neutral_var_names = make_temporaries( name_based_on="neutral_"+red_iname, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=MemoryAddressSpace.PRIVATE) + scope=AddressSpace.PRIVATE) acc_var_names = make_temporaries( name_based_on="acc_"+red_iname, nvars=nresults, shape=outer_local_iname_sizes + (size,), dtypes=reduction_dtypes, - 
scope=MemoryAddressSpace.LOCAL) + scope=AddressSpace.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) @@ -1393,13 +1393,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, track_iname) - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace acc_var_names = make_temporaries( name_based_on="acc_" + scan_iname, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=MemoryAddressSpace.PRIVATE) + scope=AddressSpace.PRIVATE) from pymbolic import var acc_vars = tuple(var(n) for n in acc_var_names) @@ -1518,21 +1518,21 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # }}} - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace read_var_names = make_temporaries( name_based_on="read_"+scan_iname+"_arg_{index}", nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=MemoryAddressSpace.PRIVATE) + scope=AddressSpace.PRIVATE) acc_var_names = make_temporaries( name_based_on="acc_"+scan_iname, nvars=nresults, shape=outer_local_iname_sizes + (scan_size,), dtypes=reduction_dtypes, - scope=MemoryAddressSpace.LOCAL) + scope=AddressSpace.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) read_vars = tuple(var(n) for n in read_var_names) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 00c2df142..d1e3a85e9 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -22,7 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from loopy.kernel.data import MemoryAddressSpace +from loopy.kernel.data import AddressSpace # {{{ block boundary finder @@ -91,7 +91,7 @@ def add_extra_args_to_schedule(kernel): more_args = set(tv for tv in used_temporaries if - kernel.temporary_variables[tv].scope == MemoryAddressSpace.GLOBAL + kernel.temporary_variables[tv].scope == AddressSpace.GLOBAL and kernel.temporary_variables[tv].initializer is None and diff --git a/loopy/statistics.py b/loopy/statistics.py index 5cebbee3c..eaca21527 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -30,7 +30,7 @@ import islpy as isl from pymbolic.mapper import CombineMapper from functools import reduce from loopy.kernel.data import ( - MultiAssignmentBase, TemporaryVariable, MemoryAddressSpace) + MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from pytools import Record @@ -848,7 +848,7 @@ class LocalMemAccessCounter(MemAccessCounter): if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( - array.scope == MemoryAddressSpace.LOCAL): + array.scope == AddressSpace.LOCAL): sub_map[MemAccess(mtype='local', dtype=dtype, count_granularity=CountGranularity.WORKITEM)] = 1 return sub_map diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index b8dcfcf77..9be9db38c 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -497,7 +497,7 @@ class CASTBuilder(ASTBuilderBase): result = [] - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace from loopy.schedule import CallKernel # We only need to write declarations for global variables with # the first device program. 
`is_first_dev_prog` determines @@ -512,7 +512,7 @@ class CASTBuilder(ASTBuilderBase): six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): - if tv.scope == MemoryAddressSpace.GLOBAL and ( + if tv.scope == AddressSpace.GLOBAL and ( tv.initializer is not None): assert tv.read_only @@ -574,7 +574,7 @@ class CASTBuilder(ASTBuilderBase): return None def get_temporary_decls(self, codegen_state, schedule_index): - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace kernel = codegen_state.kernel @@ -606,7 +606,7 @@ class CASTBuilder(ASTBuilderBase): if not tv.base_storage: for idi in decl_info: # global temp vars are mapped to arguments or global declarations - if tv.scope != MemoryAddressSpace.GLOBAL and ( + if tv.scope != AddressSpace.GLOBAL and ( tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( @@ -785,8 +785,8 @@ class CASTBuilder(ASTBuilderBase): from warnings import warn warn("get_global_arg_decl is deprecated use get_array_arg_decl " "instead.", DeprecationWarning, stacklevel=2) - from loopy.kernel.data import MemoryAddressSpace - return self.get_array_arg_decl(name, MemoryAddressSpace.GLOBAL, shape, + from loopy.kernel.data import AddressSpace + return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, dtype, is_written) def get_constant_arg_decl(self, name, shape, dtype, is_written): diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 7e3724a3a..11fcf5747 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -32,7 +32,7 @@ from loopy.target.c import CTarget, CASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.kernel.data import MemoryAddressSpace +from loopy.kernel.data import AddressSpace from pymbolic import var from loopy.kernel.function_interface import ScalarCallable @@ -351,10 +351,10 @@ class CUDACASTBuilder(CASTBuilder): raise LoopyError("unknown barrier kind") def wrap_temporary_decl(self, decl, scope): - if scope == MemoryAddressSpace.LOCAL: + if scope == AddressSpace.LOCAL: from cgen.cuda import CudaShared return CudaShared(decl) - elif scope == MemoryAddressSpace.PRIVATE: + elif scope == AddressSpace.PRIVATE: return decl else: raise ValueError("unexpected temporary variable scope: %s" @@ -380,7 +380,7 @@ class CUDACASTBuilder(CASTBuilder): from warnings import warn warn("get_global_arg_decl is deprecated use get_array_arg_decl " "instead.", DeprecationWarning, stacklevel=2) - return self.get_array_arg_decl(name, MemoryAddressSpace.GLOBAL, shape, + return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, dtype, is_written) def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 0a4299033..a9f291a80 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -32,7 +32,7 @@ from loopy.diagnostic import LoopyError from loopy.symbolic import Literal from pymbolic import var import pymbolic.primitives as p -from loopy.kernel.data import MemoryAddressSpace +from loopy.kernel.data import AddressSpace from pymbolic.mapper.stringifier import PREC_NONE from pytools import memoize_method @@ -82,7 +82,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): def map_variable(self, expr, type_context): tv = self.kernel.temporary_variables.get(expr.name) - if tv is not None and tv.scope == MemoryAddressSpace.PRIVATE: + if tv is not None and tv.scope 
== AddressSpace.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also # below in decl generation) @@ -102,7 +102,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): ary = self.find_array(expr) if (isinstance(ary, TemporaryVariable) - and ary.scope == MemoryAddressSpace.PRIVATE): + and ary.scope == AddressSpace.PRIVATE): # generate access code for acccess to private-index temporaries gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() @@ -308,7 +308,7 @@ class ISPCASTBuilder(CASTBuilder): shape = decl_info.shape - if temp_var.scope == MemoryAddressSpace.PRIVATE: + if temp_var.scope == AddressSpace.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also # above in expr to code mapper) @@ -347,7 +347,7 @@ class ISPCASTBuilder(CASTBuilder): from warnings import warn warn("get_global_arg_decl is deprecated use get_array_arg_decl " "instead.", DeprecationWarning, stacklevel=2) - return self.get_array_arg_decl(name, MemoryAddressSpace.GLOBAL, shape, + return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, dtype, is_written) def get_value_arg_decl(self, name, shape, dtype, is_written): diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 164bfb7aa..85af4ece3 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -32,7 +32,7 @@ from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper -from loopy.kernel.data import MemoryAddressSpace +from loopy.kernel.data import AddressSpace from loopy.kernel.function_interface import ScalarCallable from pymbolic import var @@ -517,10 +517,10 @@ class OpenCLCASTBuilder(CASTBuilder): raise LoopyError("unknown barrier kind") def wrap_temporary_decl(self, decl, scope): - if scope == MemoryAddressSpace.LOCAL: + if scope == AddressSpace.LOCAL: from cgen.opencl import CLLocal return CLLocal(decl) - elif scope == MemoryAddressSpace.PRIVATE: + elif scope == AddressSpace.PRIVATE: return decl else: raise ValueError("unexpected temporary variable scope: %s" @@ -532,15 +532,15 @@ class OpenCLCASTBuilder(CASTBuilder): def get_array_arg_decl(self, name, mem_address_space, shape, dtype, is_written): from cgen.opencl import CLGlobal, CLLocal - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace - if mem_address_space == MemoryAddressSpace.LOCAL: + if mem_address_space == AddressSpace.LOCAL: return CLLocal(super(OpenCLCASTBuilder, self).get_array_arg_decl( name, mem_address_space, shape, dtype, is_written)) - elif mem_address_space == MemoryAddressSpace.PRIVATE: + elif mem_address_space == AddressSpace.PRIVATE: return super(OpenCLCASTBuilder, self).get_array_arg_decl( name, mem_address_space, shape, dtype, is_written) - elif mem_address_space == MemoryAddressSpace.GLOBAL: + elif mem_address_space == AddressSpace.GLOBAL: return CLGlobal(super(OpenCLCASTBuilder, self).get_array_arg_decl( name, mem_address_space, shape, dtype, is_written)) else: @@ -548,12 +548,12 @@ class OpenCLCASTBuilder(CASTBuilder): % mem_address_space) def get_global_arg_decl(self, name, shape, dtype, is_written): - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace from warnings import warn warn("get_global_arg_decl is deprecated use get_array_arg_decl " "instead.", DeprecationWarning, stacklevel=2) - return self.get_array_arg_decl(name, 
MemoryAddressSpace.GLOBAL, shape, + return self.get_array_arg_decl(name, AddressSpace.GLOBAL, shape, dtype, is_written) def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written): @@ -605,7 +605,7 @@ class OpenCLCASTBuilder(CASTBuilder): old_val_var = codegen_state.var_name_generator("loopy_old_val") new_val_var = codegen_state.var_name_generator("loopy_new_val") - from loopy.kernel.data import TemporaryVariable, MemoryAddressSpace + from loopy.kernel.data import TemporaryVariable, AddressSpace ecm = codegen_state.expression_to_code_mapper.with_assignments( { old_val_var: TemporaryVariable(old_val_var, lhs_dtype), @@ -647,20 +647,20 @@ class OpenCLCASTBuilder(CASTBuilder): if ( isinstance(lhs_var, ArrayArg) and - lhs_var.memory_address_space == MemoryAddressSpace.GLOBAL): + lhs_var.memory_address_space == AddressSpace.GLOBAL): var_kind = "__global" elif ( isinstance(lhs_var, ArrayArg) and - lhs_var.memory_address_space == MemoryAddressSpace.LOCAL): + lhs_var.memory_address_space == AddressSpace.LOCAL): var_kind = "__local" elif ( isinstance(lhs_var, TemporaryVariable) - and lhs_var.scope == MemoryAddressSpace.LOCAL): + and lhs_var.scope == AddressSpace.LOCAL): var_kind = "__local" elif ( isinstance(lhs_var, TemporaryVariable) - and lhs_var.scope == MemoryAddressSpace.GLOBAL): + and lhs_var.scope == AddressSpace.GLOBAL): var_kind = "__global" else: raise LoopyError("unexpected kind of variable '%s' in " diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 17d702136..7355ceb2c 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -52,11 +52,11 @@ def adjust_local_temp_var_storage(kernel, device): new_temp_vars = {} - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace lmem_size = cl_char.usable_local_mem_size(device) for temp_var in six.itervalues(kernel.temporary_variables): - if temp_var.scope != MemoryAddressSpace.LOCAL: + if temp_var.scope != AddressSpace.LOCAL: new_temp_vars[temp_var.name] = \ temp_var.copy(storage_shape=temp_var.shape) continue @@ -69,7 +69,7 @@ def adjust_local_temp_var_storage(kernel, device): other_loctemp_nbytes = [ tv.nbytes for tv in six.itervalues(kernel.temporary_variables) - if tv.scope == MemoryAddressSpace.LOCAL + if tv.scope == AddressSpace.LOCAL and tv.name != temp_var.name] storage_shape = temp_var.storage_shape @@ -698,11 +698,11 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): from operator import mul return tv.dtype.numpy_dtype.itemsize * reduce(mul, tv.shape, 1) - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace global_temporaries = sorted( (tv for tv in six.itervalues(codegen_state.kernel.temporary_variables) - if tv.scope == MemoryAddressSpace.GLOBAL), + if tv.scope == AddressSpace.GLOBAL), key=lambda tv: tv.name) from pymbolic.mapper.stringifier import PREC_NONE diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index b576e539e..0d3db360d 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -39,14 +39,14 @@ __doc__ = """ # {{{ to_batched def temp_needs_batching_if_not_sequential(tv, batch_varying_args): - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace if tv.name in batch_varying_args: return True if tv.initializer is not None and tv.read_only: # do not batch read_only temps if not in # `batch_varying_args` return False - if tv.scope == MemoryAddressSpace.PRIVATE: + if tv.scope == AddressSpace.PRIVATE: # do not batch 
private temps if not in `batch_varying args` return False return True diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 058919a77..801da4c13 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -137,7 +137,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable - :class:`loopy.MemoryAddressSpace` and shape is created. + :class:`loopy.AddressSpace` and shape is created. By default, the value of the buffered cells in *var_name* are read prior to any (read/write) use, and the modified values are written out after use has @@ -160,7 +160,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, matching contexts. See :func:`loopy.match.parse_stack_match` for syntax. :arg temporary_scope: If given, override the choice of - :class:`MemoryAddressSpace` for the created temporary. + :class:`AddressSpace` for the created temporary. :arg default_tag: The default :ref:`iname-tags` to be assigned to the inames used for fetching and storing :arg fetch_bounding_box: If the access footprint is non-convex @@ -171,7 +171,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, # {{{ unify temporary_scope / temporary_is_local - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace if temporary_is_local is not None: from warnings import warn warn("temporary_is_local is deprecated. Use temporary_scope instead", @@ -182,9 +182,9 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, "temporary_scope") if temporary_is_local: - temporary_scope = MemoryAddressSpace.LOCAL + temporary_scope = AddressSpace.LOCAL else: - temporary_scope = MemoryAddressSpace.PRIVATE + temporary_scope = AddressSpace.PRIVATE del temporary_is_local diff --git a/loopy/transform/data.py b/loopy/transform/data.py index a1ad951be..58cd64714 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -175,7 +175,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, :arg rule_name: base name of the generated temporary variable. :arg temporary_name: The name of the temporary to be used. - :arg temporary_scope: The :class:`MemoryAddressSpace` to use for the + :arg temporary_scope: The :class:`AddressSpace` to use for the temporary. :arg temporary_is_local: Deprecated, use *temporary_scope* instead. :arg footprint_subscripts: A list of tuples indicating the index (i.e. @@ -647,24 +647,24 @@ def set_temporary_scope(kernel, temp_var_names, scope): :arg temp_var_names: a container with membership checking, or a comma-separated string of variables for which the scope is to be set. - :arg scope: One of the values from :class:`MemoryAddressSpace`, or one + :arg scope: One of the values from :class:`AddressSpace`, or one of the strings ``"private"``, ``"local"``, or ``"global"``. 
""" if isinstance(temp_var_names, str): temp_var_names = [s.strip() for s in temp_var_names.split(",")] - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace if isinstance(scope, str): try: - scope = getattr(MemoryAddressSpace, scope.upper()) + scope = getattr(AddressSpace, scope.upper()) except AttributeError: raise LoopyError("scope '%s' unknown" % scope) if not isinstance(scope, int) or scope not in [ - MemoryAddressSpace.PRIVATE, - MemoryAddressSpace.LOCAL, - MemoryAddressSpace.GLOBAL]: + AddressSpace.PRIVATE, + AddressSpace.LOCAL, + AddressSpace.GLOBAL]: raise LoopyError("invalid scope '%s'" % scope) new_temp_vars = kernel.temporary_variables.copy() diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 82d2d3b34..2e3358dc5 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -341,7 +341,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # {{{ unify temporary_scope / temporary_is_local - from loopy.kernel.data import MemoryAddressSpace + from loopy.kernel.data import AddressSpace if temporary_is_local is not None: from warnings import warn warn("temporary_is_local is deprecated. Use temporary_scope instead", @@ -352,9 +352,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, "temporary_scope") if temporary_is_local: - temporary_scope = MemoryAddressSpace.LOCAL + temporary_scope = AddressSpace.LOCAL else: - temporary_scope = MemoryAddressSpace.PRIVATE + temporary_scope = AddressSpace.PRIVATE del temporary_is_local @@ -804,7 +804,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, compute_dep_id = compute_insn_id added_compute_insns = [compute_insn] - if temporary_scope == MemoryAddressSpace.GLOBAL: + if temporary_scope == AddressSpace.GLOBAL: barrier_insn_id = kernel.make_unique_instruction_id( based_on=c_subst_name+"_barrier") from loopy.kernel.instruction import BarrierInstruction @@ -976,8 +976,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, raise LoopyError("Existing and new temporary '%s' do not " "have matching scopes (existing: %s, new: %s)" % (temporary_name, - MemoryAddressSpace.stringify(temp_var.scope), - MemoryAddressSpace.stringify(temporary_scope))) + AddressSpace.stringify(temp_var.scope), + AddressSpace.stringify(temporary_scope))) temp_var = temp_var.copy(scope=temporary_scope) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 2ac84a681..e5c5a99b2 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -27,7 +27,7 @@ from loopy.diagnostic import LoopyError import loopy as lp import six -from loopy.kernel.data import auto, MemoryAddressSpace +from loopy.kernel.data import auto, AddressSpace from pytools import memoize_method, Record from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, @@ -228,7 +228,7 @@ class TemporarySaver(object): return TemporaryVariable( name=self.name, dtype=temporary.dtype, - scope=MemoryAddressSpace.GLOBAL, + scope=AddressSpace.GLOBAL, shape=self.new_shape) @property @@ -439,7 +439,7 @@ class TemporarySaver(object): group_sizes, local_sizes = ( self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) - if temporary.scope == lp.MemoryAddressSpace.LOCAL: + if temporary.scope == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. 
del local_tags[:] local_sizes = () @@ -452,7 +452,7 @@ class TemporarySaver(object): def auto_promote_temporary(self, temporary_name): temporary = self.kernel.temporary_variables[temporary_name] - if temporary.scope == MemoryAddressSpace.GLOBAL: + if temporary.scope == AddressSpace.GLOBAL: # Nothing to be done for global temporaries (I hope) return None -- GitLab From 61511d728f208e4180afdeb1f8969da0e462b8ce Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 21 Jun 2018 13:45:14 -0500 Subject: [PATCH 203/916] comment rewording. --- loopy/kernel/creation.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 7728eddbe..f808c42c2 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1841,8 +1841,12 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): class FunctionScoper(RuleAwareIdentityMapper): """ - Converts functions known to the kernel as instances of - :class:`loopy.symbolic.ScopedFunction`. + Mapper to convert the ``function`` attribute of a + :class:`pymbolic.primitives.Call` known in the kernel to an instance of + :class:`loopy.symbolic.ScopedFunction`. A function is known in the + *kernel* if :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` + returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` for it. **Example**: If given an expression of the form ``sin(x) + unknown_function(y) + log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + -- GitLab From a4773886fd58fff2203a6d97e780d4e79cd58065 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 21 Jun 2018 14:22:04 -0500 Subject: [PATCH 204/916] changes according to new system of iname_to_tags --- loopy/kernel/function_interface.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 8d7bd498b..28737d647 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -749,8 +749,10 @@ class CallableKernel(InKernelCallable): new_domain = domain.copy() for i in range(new_domain.n_dim()): iname = new_domain.get_dim_name(dim_type, i) - new_iname_to_tags[iname_map[iname]] = ( - callee_knl.iname_to_tags[iname]) + + if iname in callee_knl.iname_to_tags: + new_iname_to_tags[iname_map[iname]] = ( + callee_knl.iname_to_tags[iname]) new_domain = new_domain.set_dim_name( dim_type, i, iname_map[iname]) new_domains.append(new_domain) -- GitLab From c2d7fb2999f9377df4f29be8f7cafc2a47e1ff6d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 21 Jun 2018 14:36:32 -0500 Subject: [PATCH 205/916] Some more comments. --- loopy/check.py | 4 +++- loopy/symbolic.py | 12 +++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 77e916328..4a340e6dd 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -105,7 +105,9 @@ class UnscopedCallCollector(CombineMapper): def check_functions_are_scoped(kernel): """ Checks if all the calls in the instruction expression have been scoped, - otherwise indicates to what all calls we await signature. + otherwise indicates which calls still await a signature. Refer to + :class:`loopy.symbolic.ScopedFunction` for a detailed explanation of a + scoped function.
""" from loopy.symbolic import SubstitutionRuleExpander diff --git a/loopy/symbolic.py b/loopy/symbolic.py index ccaa8cdaa..3fdd1aab8 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -684,12 +684,18 @@ class RuleArgument(p.Expression): class ScopedFunction(p.Expression): - """ Connects a call to a callable available in a kernel. + """ + A function invocation whose definition is known in a :mod:`loopy` kernel. + Each instance of :class:`loopy.symbolic.ScopedFunction` in an expression + points to an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` through the + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. .. attribute:: function - An instance of :class:`pymbolic.primitives.Variable` or - `loopy.library.reduction.ArgExtOp`. + An instance of :class:`pymbolic.primitives.Variable`, + :class:`loopy.library.reduction.ArgExtOp` or + :class:`loopy.library.reduction.SegmentedOp`. """ init_arg_names = ("function", ) -- GitLab From 66c6a5bc252fc70d8f60a02bec2b10eb00311e9a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 21 Jun 2018 21:20:40 -0500 Subject: [PATCH 206/916] Added unpicklability testing in function_scopers --- loopy/transform/register_callable.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index dda5a0cc7..455c2e51e 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -38,6 +38,8 @@ from loopy.kernel.function_interface import (get_kw_pos_association, __doc__ = """ .. currentmodule:: loopy +.. autofunction:: register_function_lookup + .. autofunction:: register_callable_kernel """ @@ -53,7 +55,14 @@ def register_function_lookup(kernel, function_lookup): """ # adding the function lookup to the set of function lookers in the kernel. - new_function_scopers = kernel.function_scopers + [function_lookup] + if function_lookup not in kernel.function_scopers: + from loopy.tools import unpickles_equally + if not unpickles_equally(function_lookup): + raise LoopyError("function '%s' does not " + "compare equally after being upickled " + "and would disrupt loopy's caches" + % function_lookup) + new_function_scopers = kernel.function_scopers + [function_lookup] registered_kernel = kernel.copy(function_scopers=new_function_scopers) from loopy.kernel.creation import scope_functions -- GitLab From aef58128e3f2ed55ee5980a3fb318307e8b40931 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 23 Jun 2018 13:26:45 -0500 Subject: [PATCH 207/916] Added documentation for scoped functions. --- doc/index.rst | 1 + doc/ref_scoped_functions.rst | 270 +++++++++++++++++++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 doc/ref_scoped_functions.rst diff --git a/doc/index.rst b/doc/index.rst index d862a8acd..69f08730c 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -68,6 +68,7 @@ Please check :ref:`installation` to get started. ref_creation ref_kernel ref_transform + ref_scoped_functions ref_other misc diff --git a/doc/ref_scoped_functions.rst b/doc/ref_scoped_functions.rst new file mode 100644 index 000000000..c2deaca67 --- /dev/null +++ b/doc/ref_scoped_functions.rst @@ -0,0 +1,270 @@ +ScopedFunctions +=============== + +``ScopedFunctions`` are pymbolic nodes within expressions in a +``Loo.py`` kernel, whose name has been resolved by the kernel. 
+ +A pymbolic ``Call`` node can be converted to a ``ScopedFunction`` if it +is resolved by one of the ``function_scoper`` in a :attr:`LoopKernel.scoped_functions` + +- Functions already registered by the target. Some examples include -- + ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C-targets). +- Functions that are defined in ``Loo.py`` and are realized into + a different set of instructions during code generation. Some examples + include ``make_tuple``, ``ArgExtOp``, ``index_of``, ... +- Functions registered as ``CallableKernels`` using + ``lp.register_callable_kernel(...)``. +- Functions that have been provided through + ``lp.register_function_scoper(...)``. +- Functions that can be made known by the user through + ``lp.register_function_mangler``. This is planned to be deprecated, + as its functionality is superseded by + ``lp.register_function_scoper(...)``. + +Expressions after a function is scoped. +--------------------------------------- + +Consider the following expression. + +:: + + sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i]) + +During the kernel creation phase, the kernel would know that ``sin`` is +a function known to the target and hence it should be scoped. And as +expected, after ``make_kernel`` has been called the above expression +would get converted to: + +:: + + ScopedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + callable_knl_func(c[i])*mangler_call(d[i]) + +This would also make an entry in the kernel's ``scoped_functions`` +dictionary as: + +:: + + {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None)} + +Note that at this step, functions are scoped only through their names, +without any information about the types of the +function. + +Once the user calls the transformation +``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``, +the expression gets converted to: + +:: + + ScopedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + ScopedFunction('callable_knl_func')(c[i])*mangler_call(d[i]) + +This also makes an entry in the ``scoped_functions`` dictionary as: + +:: + + {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None), + Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), + arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)} + +Now, if the user calls +``register_function_mangler(knl, 'mangler_call')``, one might expect +that the mangler call function should get scoped, but that does **not** +happen, because the "old" ``function_manglers`` would return a match +only if all the parameters of the function match, viz. name, argument +arity and argument types. Hence, the ``scoped_functions`` dictionary +would remain unchanged. + +``ScopedFunctions`` and specializations +--------------------------------------- + +Consider the same ``ScopedFunction('sin')`` as above. This function, +although scoped, does not yet know its types, i.e. it does not yet know +whether, for a ``C-Target``, it should emit ``sin`` or ``sinf`` or +``sinl``. Hence, right now the function can be regarded as a +"type-generic" function, as further down the pipeline it can take any one +of the above definitions. The functions go through a "specialization" +process at various points in the pipeline, where the attributes of the +callables are resolved.
+ +- During type inference, the functions go through type specialization + wherein the ``arg_id_to_dtype`` of the functions is realized. +- During descriptor inference, the functions go through a description + specialization where the ``arg_id_to_descr`` is populated. The + ``arg_id_to_descr`` contains important information regarding shape, + strides and scope of the arguments, which form an important part of + ``CallableKernel``, as this information would be helpful to + generate the function signature and make changes to the data access + pattern of the variables in the callee kernel. +- Whenever a ``ScopedFunction`` goes through a specialization, this is + indicated by changing the name in the ``pymbolic`` node. + +If, during type inference, it is inferred that the type of ``a[i]`` is +``np.float32``, the new ``pymbolic`` node would be: + +:: + + ScopedFunction('sin_0')(a[i]) + ... + +This name change indicates that the node now points to a +different ``ScalarCallable`` in the dictionary. Hence, a new entry is +added to the ``scoped_functions`` dictionary as: + +:: + + {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None), + Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), + arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None), + 'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0:np.float32, + -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')} + +Description Inference +--------------------- + +Although this step has no significance for a ``ScalarCallable``, it +forms a very important part of ``CallableKernel``. In which the +``dim_tags``, ``shape`` and ``mem_scope`` of the arguments of the +callable kernel is altered. + +- The ``dim_tags`` attribute helps to ensure that the memory layout + between the caller and the callee kernel is coherent. +- The ``mem_scope`` attribute ensures that, while writing the device + code we emit the appropriate scope qualifiers for the function + declaration arguments. +- The ``shape`` attribute helps in: + + - Storage allocation. + - Memory layout. + - Out of bounds accesses to be caught in ``Loo.py``. + +Hence, in the ``Loo.py`` pipeline, one might expect the following +developments of the ``sin`` pymbolic call expression node. + +:: + + sin -> (Kernel creation) -> ScopedFunction(Variable('sin')) -> + (Type Inference) -> ScopedFunction(Variable('sin_0')) -> + (Descriptor Inference) -> ScopedFunction(Variable('sin_1')) + +Changes on the target side to accommodate the new function interface. +--------------------------------------------------------------------- + +The earlier "function\_mangler" as a member method of the class +``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The +function scopers would return a list of functions with the signature +``(target, identifier)->lp.InKernelCallable``. + +An example of registering Vector callables is shown below. +---------------------------------------------------------- + +..
code:: python + + import loopy as lp + import numpy as np + from loopy.diagnostic import LoopyError + from loopy.target.c import CTarget + + + # {{{ blas callable + + class BLASCallable(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + for i in range(0, 2): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + mat_dtype = arg_id_to_dtype[0].numpy_dtype + vec_dtype = arg_id_to_dtype[1].numpy_dtype + + if mat_dtype != vec_dtype: + raise LoopyError("DGEMV should have same dtype for matrix and " + "vector") + + if vec_dtype == np.float32: + name_in_target = "cblas_sgemv" + elif vec_dtype == np.float64: + name_in_target = "cblas_dgemv" + else: + raise LoopyError("GEMV only supported for float32 and float64 " + "types") + + from loopy.types import NumpyType + return self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), + -1: NumpyType(vec_dtype)}) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + + parameters.append(insn.assignees[0]) + par_dtypes.append(self.arg_id_to_dtype[-1]) + + # no type casting in array calls. + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + + mat_descr = self.arg_id_to_descr[0] + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + c_parameters.insert(0, var('CblasRowMajor')) + c_parameters.insert(1, var('CblasNoTrans')) + c_parameters.insert(2, mat_descr.shape[0]) + c_parameters.insert(3, mat_descr.shape[1]) + c_parameters.insert(4, 1) + c_parameters.insert(6, 1) + c_parameters.insert(8, 1) + c_parameters.insert(10, 1) + return var(self.name_in_target)(*c_parameters), False + + def generate_preambles(self, target): + assert isinstance(target, CTarget) + yield("99_cblas", "#include ") + return + + + def blas_fn_lookup(target, identifier): + if identifier == 'gemv': + return BLASCallable(name='gemv') + return None + + # }}} + + + n = 10 + + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[:] = gemv(A[:, :], x[:]) + """, [ + lp.ArrayArg('A', dtype=np.float64, shape=(n, n)), + lp.ArrayArg('x', dtype=np.float64, shape=(n, )), + lp.ArrayArg('y', shape=(n, )), ...], + target=CTarget()) + knl = lp.register_function_lookup(knl, blas_fn_lookup) + -- GitLab From e22d43dacfe299cc33df674a068096dd549158f6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 23 Jun 2018 13:27:29 -0500 Subject: [PATCH 208/916] improves the comments for sub array refs. 
--- loopy/symbolic.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 3fdd1aab8..1c8461e61 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -770,15 +770,20 @@ class SweptInameStrideCollector(CoefficientCollectorBase): class SubArrayRef(p.Expression): - """Represents a generalized sliced notation of an array. + """ + An algebraic expression to map an affine memory layout pattern (known as + sub-arary) as consecutive elements of the sweeping axes which are defined + using :attr:`SubArrayRef.swept_inames`. .. attribute:: swept_inames - These are a tuple of sweeping inames over the array. + An instance of :class:`tuple` denoting the axes to which the sub array + is supposed to be mapper to. .. attribute:: subscript - The subscript whose adress space is to be referenced + An instance of :class:`pymbolic.primitives.Subscript` denoting the + array in the kernel. """ init_arg_names = ("swept_inames", "subscript") -- GitLab From dcc296384360790e06b39caa97a85ad854a665f4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 23 Jun 2018 15:09:49 -0500 Subject: [PATCH 209/916] Made some minor changes to the improvement of the packing interface. --- loopy/kernel/function_interface.py | 12 ++-- loopy/transform/pack_and_unpack_args.py | 87 +++++++++++++++---------- test/test_transform.py | 4 +- 3 files changed, 62 insertions(+), 41 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index ea20ae9da..1fe33576a 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -289,24 +289,26 @@ class ScalarCallable(InKernelCallable): specialization of the funciton. """ - fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) - init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", + fields = set(["arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr", "name_in_target") - def __init__(self, name, arg_id_to_dtype=None, + def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): super(ScalarCallable, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) - self.name = name self.name_in_target = name_in_target def __getinitargs__(self): - return (self.name, self.arg_id_to_dtype, self.arg_id_to_descr, + return (self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) + def name(self): + return self.subkernel.name + def with_types(self, arg_id_to_dtype, kernel): raise LoopyError("No type inference information present for " "the function %s." % (self.name)) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 2c06a6fa9..89e138844 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -35,7 +35,8 @@ __doc__ = """ # {{{ main entrypoint -def pack_and_unpack_args_for_call(kernel, call_name, args=None): +def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, + args_to_unpack=None): """ Returns a a copy of *kernel* with instructions appended to copy the arguments in *args* to match the alignment expected by the *call_name* in @@ -44,9 +45,12 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): :arg call_name: An instance of :class:`str` denoting the function call in the *kernel*. - :arg args: A list of the arguments as instances of :class:`str` which must - be packed and unpacked. 
If set *None*, it is interpreted that all the - array arguments would be packed anf unpacked. + :arg args_to_pack: A list of the arguments as instances of :class:`str` which + must be packed. If set *None*, it is interpreted that all the array + arguments would be packed. + :arg args_to_unpack: A list of the arguments as instances of :class:`str` + which must be unpacked. If set *None*, it is interpreted that + all the array arguments should be unpacked. """ new_domains = [] new_tmps = kernel.temporary_variables.copy() @@ -71,18 +75,25 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): ing = kernel.get_instruction_id_generator() parameters = insn.expression.parameters - if args is None: - args = [par.subscript.aggregate.name for par in + if args_to_pack is None: + args_to_pack = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] + if args_to_unpack is None: + args_to_unpack = [par.subscript.aggregate.name for par in parameters+insn.assignees if isinstance(par, SubArrayRef) and (par.swept_inames)] # {{{ sanity checks for args - assert isinstance(args, list) + assert isinstance(args_to_pack, list) + assert isinstance(args_to_unpack, list) - for arg in args: + for arg in args_to_pack: found_sub_array_ref = False + for par in parameters + insn.assignees: + # check that the given arg is a sub array ref if isinstance(par, SubArrayRef) and ( par.subscript.aggregate.name == arg): found_sub_array_ref = True @@ -90,11 +101,17 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): if not found_sub_array_ref: raise LoopyError("No match found for packing arg '%s' of call '%s' " "at insn '%s'." % (arg, call_name, insn.id)) + for arg in args_to_unpack: + if arg not in args_to_pack: + raise LoopyError("Argument %s should be packed in order to be " "unpacked."
% arg) # }}} - packing = [] - unpacking = [] + packing_insns = [] + unpacking_insns = [] + + # {{{ handling ilp tags from loopy.kernel.data import IlpBaseTag, VectorizeTag import islpy as isl @@ -118,6 +135,8 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): dim_type, i, ilp_inames_map[var(old_iname)].name) new_domains.append(new_domain) + # }}} + from pymbolic.mapper.substitutor import make_subst_func from loopy.symbolic import SubstitutionMapper @@ -128,7 +147,8 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): new_id_to_parameters = {} for id, p in id_to_parameters: - if isinstance(p, SubArrayRef) and p.subscript.aggregate.name in args: + if isinstance(p, SubArrayRef) and (p.subscript.aggregate.name in + args_to_pack): new_pack_inames = ilp_inames_map.copy() # packing-specific inames new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname @@ -201,7 +221,7 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): # }}} - packing.append(Assignment( + packing_insns.append(Assignment( assignee=pack_lhs_assignee, expression=pack_subst_mapper.map_subscript(p.subscript), within_inames=insn.within_inames - ilp_inames | set( @@ -212,16 +232,17 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): depends_on_is_final=True )) - unpacking.append(Assignment( - expression=unpack_rhs, - assignee=unpack_subst_mapper.map_subscript(p.subscript), - within_inames=insn.within_inames - ilp_inames | set( - new_unpack_inames[i].name for i in p.swept_inames) | ( - new_ilp_inames), - id=ing(insn.id+"_unpack"), - depends_on=frozenset([insn.id]), - depends_on_is_final=True - )) + if p.subscript.aggregate.name in args_to_unpack: + unpacking_insns.append(Assignment( + expression=unpack_rhs, + assignee=unpack_subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | set( + new_unpack_inames[i].name for i in p.swept_inames) | ( + new_ilp_inames), + id=ing(insn.id+"_unpack"), + depends_on=frozenset([insn.id]), + depends_on_is_final=True + )) # {{{ creating the sweep inames for the new sub array refs @@ -248,24 +269,22 @@ def pack_and_unpack_args_for_call(kernel, call_name, args=None): else: new_id_to_parameters[id] = p - if packing: + if packing_insns: subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) - new_insn = insn.with_transformed_expressions(subst_mapper) + new_call_insn = insn.with_transformed_expressions(subst_mapper) new_params = tuple(subst_mapper(new_id_to_parameters[i]) for i, _ in enumerate(parameters)) new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) for i, _ in enumerate(insn.assignees)) - packing.append( - new_insn.copy( - depends_on=new_insn.depends_on | set( - pack.id for pack in packing), - within_inames=new_insn.within_inames - ilp_inames | ( + new_call_insn = new_call_insn.copy( + depends_on=new_call_insn.depends_on | set( + pack.id for pack in packing_insns), + within_inames=new_call_insn.within_inames - ilp_inames | ( new_ilp_inames), - expression=new_insn.expression.function(*new_params), - assignees=new_assignees - ) - ) - old_insn_to_new_insns[insn] = packing + unpacking + expression=new_call_insn.expression.function(*new_params), + assignees=new_assignees) + old_insn_to_new_insns[insn] = (packing_insns + [new_call_insn] + + unpacking_insns) if old_insn_to_new_insns: new_instructions = [] diff --git a/test/test_transform.py b/test/test_transform.py index 8d42b61ff..39ef926f9 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -583,8 +583,8 @@ 
def test_packing_unpacking(ctx_factory, inline): [k]: y2[k] = callee_fn2([k]: x2[k]) """) - knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1, inline=inline) - knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2, inline=inline) + knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1, inline) + knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2, inline) knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn1') knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn2') -- GitLab From c1d80dec395f85f0d30dad9c49d98410d4ed9866 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 23 Jun 2018 15:26:30 -0500 Subject: [PATCH 210/916] Still some minor merge "fixes" --- loopy/kernel/function_interface.py | 9 +++++++-- loopy/transform/pack_and_unpack_args.py | 2 +- test/test_transform.py | 8 ++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index f6511db01..25fd8403b 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -626,9 +626,13 @@ class CallableKernel(InKernelCallable): if arg.dtype is not None else arg for arg in subkernel.args]) def __getinitargs__(self): - return (self.name, self.subkernel, self.arg_id_to_dtype, + return (self.subkernel, self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) + @property + def name(self): + return self.subkernel.name + def with_types(self, arg_id_to_dtype, kernel): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) @@ -874,7 +878,8 @@ class CallableKernel(InKernelCallable): insn = insn.with_transformed_expressions(subst_mapper) within_inames = frozenset(map(iname_map.get, insn.within_inames)) within_inames = within_inames | instruction.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) + depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( + instruction.depends_on) if insn.id in heads: depends_on = depends_on | set([noop_start.id]) insn = insn.copy( diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 89e138844..663c60b2a 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -1,6 +1,6 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2018 Tianjiao Sun" +__copyright__ = "Copyright (C) 2018 Tianjiao Sun, Kaushik Kulkarni" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/test/test_transform.py b/test/test_transform.py index e30d6e263..6e441976a 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -523,12 +523,16 @@ def test_packing_unpacking(ctx_factory, inline): [k]: y2[k] = callee_fn2([k]: x2[k]) """) - knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1, inline) - knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2, inline) + knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1) + knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2) knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn1') knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn2') + if inline: + knl = lp.inline_callable_kernel(knl, 'callee_fn1') + knl = lp.inline_callable_kernel(knl, 'callee_fn2') + knl = lp.set_options(knl, "write_cl") knl = lp.set_options(knl, "return_dict") evt, out_dict = knl(queue, x1=x1, x2=x2) -- GitLab From 77d92ffbad86120ab4bb854310f2725b2d97a9a0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 23 Jun 
2018 16:05:38 -0500 Subject: [PATCH 211/916] Minor error fix. --- loopy/kernel/function_interface.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 25fd8403b..743ca2941 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -330,26 +330,24 @@ class ScalarCallable(InKernelCallable): derived subclasses. """ - fields = set(["arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) - init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr", + fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") - def __init__(self, arg_id_to_dtype=None, + def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): super(ScalarCallable, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) + self.name = name self.name_in_target = name_in_target def __getinitargs__(self): return (self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def name(self): - return self.subkernel.name - def with_types(self, arg_id_to_dtype, kernel): raise LoopyError("No type inference information present for " "the function %s." % (self.name)) -- GitLab From a1e5f6c6ea9845664bd26139efab968ae71f7cfe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 27 Jun 2018 12:01:59 -0500 Subject: [PATCH 212/916] Comment rewording. --- loopy/kernel/function_interface.py | 3 ++- loopy/symbolic.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 743ca2941..089b6cb36 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -702,6 +702,7 @@ class CallableKernel(InKernelCallable): arg_id_to_descr=arg_id_to_descr) def with_packing_for_args(self): + from loopy.kernel.data import AddressSpace kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) arg_id_to_descr = {} @@ -711,7 +712,7 @@ class CallableKernel(InKernelCallable): arg_id_to_descr[pos] = ArrayArgDescriptor( shape=arg.shape, dim_tags=arg.dim_tags, - mem_scope='Global') + mem_scope=AddressSpace.GLOBAL) return self.copy(subkernel=self.subkernel, arg_id_to_descr=arg_id_to_descr) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 1c8461e61..09e6e5747 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -689,7 +689,9 @@ class ScopedFunction(p.Expression): Each instance of :class:`loopy.symbolic.ScopedFunction` in an expression points to an instance of :class:`loopy.kernel.function_interface.InKernelCallable` through the - mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer + :ref:`ref_scoped_function` for a slightly detailed explanation on scoped + functions. .. 
attribute:: function -- GitLab From 50be51a06e4ffc12d3948f190bff6cff5c2012b2 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Tue, 8 May 2018 15:34:14 +0100 Subject: [PATCH 213/916] start working on opaque types --- loopy/codegen/__init__.py | 5 ++++- loopy/preprocess.py | 6 +++++- loopy/target/c/__init__.py | 4 +++- loopy/types.py | 16 ++++++++++++++++ 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e5938dbc4..fcd170316 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -478,9 +478,12 @@ def generate_code_v2(kernel): else: raise ValueError("argument type not understood: '%s'" % type(arg)) + from loopy.types import OpaqueType + allow_complex = False for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): - if var.dtype.involves_complex(): + dtype = var.dtype + if not isinstance(dtype, OpaqueType) and dtype.involves_complex(): allow_complex = True # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c4719ace5..1d5f8c130 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -51,13 +51,17 @@ logger = logging.getLogger(__name__) def prepare_for_caching(kernel): import loopy as lp + from loopy.types import OpaqueType new_args = [] tgt = kernel.target for arg in kernel.args: dtype = arg.dtype - if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: + if (dtype is not None + and not isinstance(dtype, OpaqueType) + and dtype is not lp.auto + and dtype.target is not tgt): arg = arg.copy(dtype=dtype.with_target(kernel.target)) new_args.append(arg) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 9be9db38c..366d167da 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -62,11 +62,13 @@ class DTypeRegistryWrapper(object): return self.wrapped_registry.get_or_register_dtype(names, dtype) def dtype_to_ctype(self, dtype): - from loopy.types import LoopyType, NumpyType + from loopy.types import LoopyType, NumpyType, OpaqueType assert isinstance(dtype, LoopyType) if isinstance(dtype, NumpyType): return self.wrapped_registry.dtype_to_ctype(dtype) + elif isinstance(dtype, OpaqueType): + return dtype.name else: raise LoopyError( "unable to convert type '%s' to C" diff --git a/loopy/types.py b/loopy/types.py index 8f0f310c3..de7890aa8 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -177,6 +177,22 @@ class AtomicNumpyType(NumpyType, AtomicType): # }}} +# {{{ + +class OpaqueType(LoopyType): + def __init__(self, name): + assert isinstance(name, str) + self.name = name + + def is_integral(self): + return False + + def is_complex(self): + return False + +# }}} + + def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False, target=None): from loopy.kernel.data import auto -- GitLab From b4498bc0c55b7add93506176c2b935e508880cb9 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 25 May 2018 11:34:34 +0100 Subject: [PATCH 214/916] const type inference --- loopy/type_inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 53d7074f7..c05cdb2c1 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -314,6 +314,7 @@ class TypeInferenceMapper(CombineMapper): continue # }}} + continue raise LoopyError("Overwriting a specialized function " "is illegal--maybe start with new instance of " -- GitLab From a911a9a38694be8aa1f36ba9d0db13f7fc3ef3c7 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 7 Jun 2018 08:25:41 +0100 Subject: [PATCH 
215/916] bypass argument checking for inlining --- loopy/kernel/function_interface.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 089b6cb36..b48d99001 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -518,16 +518,21 @@ class KernelInliner(SubstitutionMapper): for idx, tag in zip(outer_indices, callee_arg.dim_tags)) from loopy.isl_helpers import simplify_via_aff - flatten_index = simplify_via_aff(flatten_index) + try: + flatten_index = simplify_via_aff(flatten_index) + except: + pass new_indices = [] for dim_tag in caller_arg.dim_tags: ind = flatten_index // dim_tag.stride flatten_index -= (dim_tag.stride * ind) + try: + ind = simplify_via_aff(ind) + except: + pass new_indices.append(ind) - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - return aggregate.index(tuple(new_indices)) else: return super(KernelInliner, self).map_subscript(expr) @@ -696,7 +701,10 @@ class CallableKernel(InKernelCallable): raise LoopyError("Descriptor must be either an instance of " "ArrayArgDescriptor or ValueArgDescriptor -- got %s." % type(descr)) - descriptor_specialized_knl = self.subkernel.copy(args=new_args) + if self.should_inline: + descriptor_specialized_knl = self.subkernel.copy() + else: + descriptor_specialized_knl = self.subkernel.copy(args=new_args) return self.copy(subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr) @@ -900,6 +908,8 @@ class CallableKernel(InKernelCallable): new_insns.append(insn) kernel = kernel.copy(instructions=new_insns) + # TODO: resolve name clash here + kernel.scoped_functions.update(callee_knl.scoped_functions) # }}} -- GitLab From cad54af88ff40afa88edfdcee9c0cea4875c32a4 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Mon, 18 Jun 2018 18:27:06 +0100 Subject: [PATCH 216/916] rebase to kernel_callable --- loopy/check.py | 2 +- loopy/kernel/function_interface.py | 5 +---- loopy/symbolic.py | 10 +++++++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 4a340e6dd..60d2fd698 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -729,7 +729,7 @@ def pre_schedule_checks(kernel): check_for_data_dependent_parallel_bounds(kernel) check_bounds(kernel) check_write_destinations(kernel) - check_has_schedulable_iname_nesting(kernel) + # check_has_schedulable_iname_nesting(kernel) check_variable_access_ordered(kernel) logger.debug("%s: pre-schedule check: done" % kernel.name) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index b48d99001..8363ee810 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -701,10 +701,7 @@ class CallableKernel(InKernelCallable): raise LoopyError("Descriptor must be either an instance of " "ArrayArgDescriptor or ValueArgDescriptor -- got %s." 
% type(descr)) - if self.should_inline: - descriptor_specialized_knl = self.subkernel.copy() - else: - descriptor_specialized_knl = self.subkernel.copy(args=new_args) + descriptor_specialized_knl = self.subkernel.copy() return self.copy(subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 09e6e5747..8800f2845 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -848,9 +848,13 @@ class SubArrayRef(p.Expression): from loopy.isl_helpers import simplify_via_aff sub_dim_tags = [] sub_shape = [] - linearized_index = simplify_via_aff( - sum(dim_tag.stride*iname for dim_tag, iname in - zip(arg.dim_tags, self.subscript.index_tuple))) + linearized_index = sum(dim_tag.stride*iname + for dim_tag, iname + in zip(arg.dim_tags, self.subscript.index_tuple)) + try: + linearized_index = simplify_via_aff(linearized_index) + except: + pass strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) -- GitLab From b06efc14202b21a93571993b593b12aacd9d2bf8 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Wed, 20 Jun 2018 19:29:06 +0100 Subject: [PATCH 217/916] try simplifying with integer variables --- loopy/kernel/function_interface.py | 6 +++--- loopy/symbolic.py | 14 ++++++++++++-- loopy/transform/register_callable.py | 2 ++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 8363ee810..e85a83d37 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -517,9 +517,9 @@ class KernelInliner(SubstitutionMapper): idx * tag.stride for idx, tag in zip(outer_indices, callee_arg.dim_tags)) - from loopy.isl_helpers import simplify_via_aff + from loopy.symbolic import simplify_using_aff try: - flatten_index = simplify_via_aff(flatten_index) + flatten_index = simplify_using_aff(self.caller, flatten_index) except: pass @@ -528,7 +528,7 @@ class KernelInliner(SubstitutionMapper): ind = flatten_index // dim_tag.stride flatten_index -= (dim_tag.stride * ind) try: - ind = simplify_via_aff(ind) + ind = simplify_using_aff(self.caller, ind) except: pass new_indices.append(ind) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8800f2845..47bdc4e30 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1671,7 +1671,8 @@ def guarded_pwaff_from_expr(space, expr, vars_to_zero=None): # {{{ simplify using aff def simplify_using_aff(kernel, expr): - inames = get_dependencies(expr) & kernel.all_inames() + deps = get_dependencies(expr) + inames = deps & kernel.all_inames() domain = kernel.get_inames_domain(inames) @@ -1685,7 +1686,16 @@ def simplify_using_aff(kernel, expr): except TypeError: return expr except UnknownVariableError: - return expr + integers = deps & set(t for t, v in kernel.temporary_variables.items() if np.issubdtype(v.dtype, np.integer)) + names = sorted(list(integers)) # need to sort for deterministic code generation + nd = domain.dim(isl.dim_type.set) + domain = domain.add_dims(isl.dim_type.set, len(names)) + for i, name in enumerate(names): + domain = domain.set_dim_name(isl.dim_type.set, nd + i, name) + try: + aff = aff_from_expr(domain.space, expr) + except: + return expr # FIXME: Deal with assumptions, too. 
aff = aff.gist(domain) diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 455c2e51e..449a53f92 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -206,6 +206,8 @@ class DimChanger(IdentityMapper): self.desired_shape = desired_shape def map_subscript(self, expr): + if expr.aggregate.name not in self.callee_arg_dict: + return super(DimChanger, self).map_subscript(expr) callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in zip(callee_arg_dim_tags, expr.index_tuple)) -- GitLab From 335fa5f69cc2cdae00c4b55b62b0695988b498fa Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 28 Jun 2018 10:39:36 +0100 Subject: [PATCH 218/916] minor changes --- loopy/symbolic.py | 4 ++-- loopy/target/c/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 47bdc4e30..6024d334d 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1686,8 +1686,8 @@ def simplify_using_aff(kernel, expr): except TypeError: return expr except UnknownVariableError: - integers = deps & set(t for t, v in kernel.temporary_variables.items() if np.issubdtype(v.dtype, np.integer)) - names = sorted(list(integers)) # need to sort for deterministic code generation + integer_vars = deps & set(t for t, v in kernel.temporary_variables.items() if np.issubdtype(v.dtype, np.integer)) + names = sorted(list(integer_vars)) # need to sort for deterministic code generation nd = domain.dim(isl.dim_type.set) domain = domain.add_dims(isl.dim_type.set, len(names)) for i, name in enumerate(names): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 366d167da..545f8d925 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -453,7 +453,7 @@ def scope_c_math_functions(target, identifier): represented by :arg:`identifier` is known in C, otherwise returns *None*. """ if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]: + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin"]: return CMathCallable(name=identifier) return None -- GitLab From 7039a728ba4f96dd1ac0d1098d1033ae48a173a4 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 28 Jun 2018 13:51:58 +0100 Subject: [PATCH 219/916] add more C math functions --- loopy/target/c/__init__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 545f8d925..6a8befa95 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -409,7 +409,7 @@ class CMathCallable(ScalarCallable): arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) # binary functions - if name in ["fmax", "fmin"]: + if name in ["fmax", "fmin", "pow", "atan2"]: for id in arg_id_to_dtype: if not -1 <= id <= 1: @@ -428,7 +428,7 @@ class CMathCallable(ScalarCallable): if dtype.kind == "c": raise LoopyTypeError("%s does not support complex numbers") - elif dtype.kind == "f": + elif dtype.kind == "f" and name in ["fmax", "fmin"]: from loopy.target.opencl import OpenCLTarget if not isinstance(kernel.target, OpenCLTarget): if dtype == np.float64: @@ -452,8 +452,10 @@ def scope_c_math_functions(target, identifier): Returns an instance of :class:`InKernelCallable` if the function represented by :arg:`identifier` is known in C, otherwise returns *None*. 
""" - if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin"]: + if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", + "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", + "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs"]: return CMathCallable(name=identifier) return None -- GitLab From 88395a731c044d32a8d54da6ee8be5bd9061646b Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 28 Jun 2018 14:19:56 +0100 Subject: [PATCH 220/916] updates based on discussion on gitlab --- loopy/codegen/__init__.py | 4 +--- loopy/kernel/function_interface.py | 1 - loopy/types.py | 6 ++++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index fcd170316..830718465 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -478,12 +478,10 @@ def generate_code_v2(kernel): else: raise ValueError("argument type not understood: '%s'" % type(arg)) - from loopy.types import OpaqueType - allow_complex = False for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): dtype = var.dtype - if not isinstance(dtype, OpaqueType) and dtype.involves_complex(): + if dtype.involves_complex(): allow_complex = True # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e85a83d37..3f9a84675 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -905,7 +905,6 @@ class CallableKernel(InKernelCallable): new_insns.append(insn) kernel = kernel.copy(instructions=new_insns) - # TODO: resolve name clash here kernel.scoped_functions.update(callee_knl.scoped_functions) # }}} diff --git a/loopy/types.py b/loopy/types.py index de7890aa8..d52e029a5 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -180,9 +180,15 @@ class AtomicNumpyType(NumpyType, AtomicType): # {{{ class OpaqueType(LoopyType): + """An opaque data type is truly opaque - it has no allocations, no + temporaries of that type, etc. The only thing allowed is to be pass in + through one ValueArg and go out to another. It is introduced to accomodate + functional calls to external libraries. 
+ """ def __init__(self, name): assert isinstance(name, str) self.name = name + self.target = None def is_integral(self): return False -- GitLab From 96e18021509b5b0952af74f88f5da72ad33cafb1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 29 Jun 2018 00:53:04 -0500 Subject: [PATCH 221/916] Fixes from a first, partial pass over the kernel_callables MR --- doc/index.rst | 2 +- ...{ref_scoped_functions.rst => ref_call.rst} | 127 +----- doc/ref_kernel.rst | 6 +- examples/python/call-external.py | 105 +++++ loopy/__init__.py | 22 +- loopy/auto_test.py | 6 +- loopy/check.py | 50 ++- loopy/codegen/__init__.py | 29 +- loopy/codegen/control.py | 2 +- loopy/frontend/fortran/translator.py | 2 +- loopy/isl_helpers.py | 3 + loopy/kernel/__init__.py | 27 +- loopy/kernel/creation.py | 50 ++- loopy/kernel/data.py | 194 ++++---- loopy/kernel/function_interface.py | 385 ++-------------- loopy/kernel/instruction.py | 90 ++-- loopy/kernel/tools.py | 8 +- loopy/preprocess.py | 67 +-- loopy/schedule/__init__.py | 12 +- loopy/schedule/device_mapping.py | 4 +- loopy/schedule/tools.py | 3 +- loopy/statistics.py | 10 +- loopy/symbolic.py | 9 +- loopy/target/c/__init__.py | 12 +- loopy/target/ispc.py | 6 +- loopy/target/opencl.py | 8 +- loopy/target/pyopencl.py | 6 +- loopy/transform/batch.py | 2 +- .../{register_callable.py => callable.py} | 337 +++++++++++++- loopy/transform/data.py | 8 +- loopy/transform/diff.py | 2 +- loopy/transform/fusion.py | 4 +- loopy/transform/pack_and_unpack_args.py | 26 +- loopy/transform/precompute.py | 50 ++- loopy/transform/save.py | 6 +- test/test_callables.py | 415 ++++++++++++++++++ test/test_loopy.py | 27 +- test/test_transform.py | 364 --------------- 38 files changed, 1319 insertions(+), 1167 deletions(-) rename doc/{ref_scoped_functions.rst => ref_call.rst} (59%) create mode 100644 examples/python/call-external.py rename loopy/transform/{register_callable.py => callable.py} (50%) create mode 100644 test/test_callables.py diff --git a/doc/index.rst b/doc/index.rst index 69f08730c..0644b34c4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -68,7 +68,7 @@ Please check :ref:`installation` to get started. ref_creation ref_kernel ref_transform - ref_scoped_functions + ref_call ref_other misc diff --git a/doc/ref_scoped_functions.rst b/doc/ref_call.rst similarity index 59% rename from doc/ref_scoped_functions.rst rename to doc/ref_call.rst index c2deaca67..46edc533c 100644 --- a/doc/ref_scoped_functions.rst +++ b/doc/ref_call.rst @@ -1,5 +1,5 @@ -ScopedFunctions -=============== +Calling Loopy Kernels and External Functions +============================================ ``ScopedFunctions`` are pymbolic nodes within expressions in a ``Loo.py`` kernel, whose name has been resolved by the kernel. @@ -21,8 +21,8 @@ is resolved by one of the ``function_scoper`` in a :attr:`LoopKernel.scoped_func as its functionality is superseded by ``lp.register_function_scoper(...)``. -Expressions after a function is scoped. ---------------------------------------- +Expressions after a function is scoped +-------------------------------------- Consider the following expression. @@ -127,12 +127,12 @@ Description Inference Although this step has no significance for a ``ScalarCallable``, it forms a very important part of ``CallableKernel``. In which the -``dim_tags``, ``shape`` and ``mem_scope`` of the arguments of the +``dim_tags``, ``shape`` and ``address_space`` of the arguments of the callable kernel is altered. 
- The ``dim_tags`` attribute helps to ensure that the memory layout between the caller and the callee kernel is coherent. -- The ``mem_scope`` attribute ensures that, while writing the device +- The ``address_space`` attribute ensures that, while writing the device code we emit the appropriate scope qualifiers for the function declaration arguments. - The ``shape`` attribute helps in: @@ -150,121 +150,16 @@ developments of the ``sin`` pymbolic call expression node. (Type Inference) -> ScopedFunction(Variable('sin_0')) -> (Descriptor Inference) -> ScopedFunction(Variable('sin_1')) -Changes on the target side to accommodate the new function interface. ---------------------------------------------------------------------- +Changes on the target side to accommodate the new function interface +-------------------------------------------------------------------- The earlier "function\_mangler" as a member method of the class ``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The function scopers would return a list of functions with the signature ``(target, identifier)->lp.InKernelCallable``. -An example of registering Vector callables is shown below. ----------------------------------------------------------- - -.. code:: python +An example: Calling BLAS +------------------------ - import loopy as lp - import numpy as np - from loopy.diagnostic import LoopyError - from loopy.target.c import CTarget - - - # {{{ blas callable - - class BLASCallable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel): - for i in range(0, 2): - if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: - # the types provided aren't mature enough to specialize the - # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) - - mat_dtype = arg_id_to_dtype[0].numpy_dtype - vec_dtype = arg_id_to_dtype[1].numpy_dtype - - if mat_dtype != vec_dtype: - raise LoopyError("DGEMV should have same dtype for matrix and " - "vector") - - if vec_dtype == np.float32: - name_in_target = "cblas_sgemv" - elif vec_dtype == np.float64: - name_in_target = "cblas_dgemv" - else: - raise LoopyError("GEMV only supported for float32 and float64 " - "types") - - from loopy.types import NumpyType - return self.copy(name_in_target=name_in_target, - arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), - -1: NumpyType(vec_dtype)}) - - def emit_call_insn(self, insn, target, expression_to_code_mapper): - assert self.is_ready_for_codegen() - - from loopy.kernel.instruction import CallInstruction - - assert isinstance(insn, CallInstruction) - - parameters = insn.expression.parameters - - parameters = list(parameters) - par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] - - parameters.append(insn.assignees[0]) - par_dtypes.append(self.arg_id_to_dtype[-1]) - - # no type casting in array calls. 
-    from loopy.expression import dtype_to_type_context -    from pymbolic.mapper.stringifier import PREC_NONE -    from loopy.symbolic import SubArrayRef -    from pymbolic import var - -    mat_descr = self.arg_id_to_descr[0] - -    c_parameters = [ -        expression_to_code_mapper(par, PREC_NONE, -            dtype_to_type_context(target, par_dtype), -            par_dtype).expr if isinstance(par, SubArrayRef) else -        expression_to_code_mapper(par, PREC_NONE, -            dtype_to_type_context(target, par_dtype), -            par_dtype).expr -        for par, par_dtype in zip( -            parameters, par_dtypes)] -    c_parameters.insert(0, var('CblasRowMajor')) -    c_parameters.insert(1, var('CblasNoTrans')) -    c_parameters.insert(2, mat_descr.shape[0]) -    c_parameters.insert(3, mat_descr.shape[1]) -    c_parameters.insert(4, 1) -    c_parameters.insert(6, 1) -    c_parameters.insert(8, 1) -    c_parameters.insert(10, 1) -    return var(self.name_in_target)(*c_parameters), False - -    def generate_preambles(self, target): -        assert isinstance(target, CTarget) -        yield("99_cblas", "#include ") -        return - - -    def blas_fn_lookup(target, identifier): -        if identifier == 'gemv': -            return BLASCallable(name='gemv') -        return None - -    # }}} - - -    n = 10 - -    knl = lp.make_kernel( -        "{[i]: 0<=i<10}", -        """ -        y[:] = gemv(A[:, :], x[:]) -        """, [ -        lp.ArrayArg('A', dtype=np.float64, shape=(n, n)), -        lp.ArrayArg('x', dtype=np.float64, shape=(n, )), -        lp.ArrayArg('y', shape=(n, )), ...], -        target=CTarget()) -    knl = lp.register_function_lookup(knl, blas_fn_lookup) +.. literalinclude:: ../examples/python/call-external.py diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 07b7836d8..c9ce20626 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -363,9 +363,9 @@ C Block Instructions Atomic Operations ^^^^^^^^^^^^^^^^^ -.. autoclass:: memory_ordering +.. autoclass:: MemoryOrdering -.. autoclass:: memory_scope +.. autoclass:: MemoryScope .. autoclass:: VarAtomicity @@ -586,7 +586,7 @@ Do not create :class:`LoopKernel` objects directly. Instead, refer to .. autoclass:: LoopKernel -.. autoclass:: kernel_state +.. 
autoclass:: KernelState :members: :undoc-members: diff --git a/examples/python/call-external.py b/examples/python/call-external.py new file mode 100644 index 000000000..904270472 --- /dev/null +++ b/examples/python/call-external.py @@ -0,0 +1,105 @@ +import loopy as lp +import numpy as np +from loopy.diagnostic import LoopyError +from loopy.target.c import CTarget + + +# {{{ blas callable + +class BLASCallable(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + for i in range(0, 2): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + mat_dtype = arg_id_to_dtype[0].numpy_dtype + vec_dtype = arg_id_to_dtype[1].numpy_dtype + + if mat_dtype != vec_dtype: + raise LoopyError("DGEMV should have same dtype for matrix and " + "vector") + + if vec_dtype == np.float32: + name_in_target = "cblas_sgemv" + elif vec_dtype == np.float64: + name_in_target = "cblas_dgemv" + else: + raise LoopyError("GEMV only supported for float32 and float64 " + "types") + + from loopy.types import NumpyType + return self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), + -1: NumpyType(vec_dtype)}) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + + parameters.append(insn.assignees[0]) + par_dtypes.append(self.arg_id_to_dtype[-1]) + + # no type casting in array calls. 
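+        # the assignee and parameters gathered above are rendered via the +        # caller's expression-to-code mapper below; the remaining CBLAS +        # arguments (layout, transpose flag, matrix sizes, and unit +        # scalars/strides) are then spliced in by position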
+ from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + + mat_descr = self.arg_id_to_descr[0] + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + c_parameters.insert(0, var('CblasRowMajor')) + c_parameters.insert(1, var('CblasNoTrans')) + c_parameters.insert(2, mat_descr.shape[0]) + c_parameters.insert(3, mat_descr.shape[1]) + c_parameters.insert(4, 1) + c_parameters.insert(6, 1) + c_parameters.insert(8, 1) + c_parameters.insert(10, 1) + return var(self.name_in_target)(*c_parameters), False + + def generate_preambles(self, target): + assert isinstance(target, CTarget) + yield("99_cblas", "#include ") + return + + +def blas_fn_lookup(target, identifier): + if identifier == 'gemv': + return BLASCallable(name='gemv') + return None + +# }}} + + +n = 10 + +knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[:] = gemv(A[:, :], x[:]) + """, [ + lp.ArrayArg('A', dtype=np.float64, shape=(n, n)), + lp.ArrayArg('x', dtype=np.float64, shape=(n, )), + lp.ArrayArg('y', shape=(n, )), ...], + target=CTarget()) + +knl = lp.register_function_lookup(knl, blas_fn_lookup) diff --git a/loopy/__init__.py b/loopy/__init__.py index a5931d03a..a552e498e 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -37,7 +37,9 @@ from loopy.library.function import ( default_function_mangler, single_arg_function_mangler) from loopy.kernel.instruction import ( - memory_ordering, memory_scope, VarAtomicity, AtomicInit, AtomicUpdate, + MemoryOrdering, memory_ordering, + MemoryScope, memory_scope, + VarAtomicity, AtomicInit, AtomicUpdate, InstructionBase, MultiAssignmentBase, Assignment, ExpressionInstruction, CallInstruction, CInstruction, NoOpInstruction, BarrierInstruction) @@ -45,13 +47,14 @@ from loopy.kernel.data import ( auto, KernelArgument, ValueArg, ArrayArg, GlobalArg, ConstantArg, ImageArg, - temp_var_scope, TemporaryVariable, AddressSpace, + AddressSpace, temp_var_scope, + TemporaryVariable, SubstitutionRule, CallMangleInfo) from loopy.kernel.function_interface import ( ScalarCallable) -from loopy.kernel import LoopKernel, kernel_state +from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( get_dot_dependency_graph, show_dependency_graph, @@ -118,7 +121,7 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier -from loopy.transform.register_callable import (register_callable_kernel, +from loopy.transform.callable import (register_callable_kernel, register_function_lookup, inline_callable_kernel) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call @@ -158,9 +161,13 @@ __all__ = [ "auto", - "LoopKernel", "kernel_state", + "LoopKernel", + "KernelState", "kernel_state", # lower case is deprecated - "memory_ordering", "memory_scope", "VarAtomicity", + "MemoryOrdering", "memory_ordering", # lower case is deprecated + "MemoryScope", "memory_scope", # lower case is deprecated + + "VarAtomicity", "AtomicInit", "AtomicUpdate", "InstructionBase", "MultiAssignmentBase", "Assignment", 
"ExpressionInstruction", @@ -171,7 +178,8 @@ __all__ = [ "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", - "AddressSpace", "temp_var_scope", "TemporaryVariable", + "AddressSpace", "temp_var_scope", # temp_var_scope is deprecated + "TemporaryVariable", "SubstitutionRule", "CallMangleInfo", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 8e647b02d..015c82dd1 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -515,11 +515,11 @@ def auto_test_vs_ref( properties=cl.command_queue_properties.PROFILING_ENABLE) args = None - from loopy.kernel import kernel_state + from loopy.kernel import KernelState from loopy.target.pyopencl import PyOpenCLTarget if test_knl.state not in [ - kernel_state.PREPROCESSED, - kernel_state.SCHEDULED]: + KernelState.PREPROCESSED, + KernelState.SCHEDULED]: if isinstance(test_knl.target, PyOpenCLTarget): test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) diff --git a/loopy/check.py b/loopy/check.py index 4a340e6dd..86d0d48d3 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -124,7 +124,8 @@ def check_functions_are_scoped(kernel): elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): pass else: - raise NotImplementedError("Unknown type of instruction %s." % type(insn)) + raise NotImplementedError( + "Unknown type of instruction %s" % type(insn).__name__) # }}} @@ -185,14 +186,15 @@ def check_loop_priority_inames_known(kernel): def _get_all_unique_iname_tags(kernel): - """ Returns a list of all the unique iname tags in the *kernel*. + """Returns a set of all the iname tags used in *kernel* that + inherit from :class:`loopy.kernel.data.UniqueTag`. """ from loopy.kernel.data import UniqueTag iname_tags = [kernel.iname_to_tag.get(iname) for iname in kernel.all_inames()] - unique_iname_tags = [tag for tag in iname_tags if - isinstance(tag, UniqueTag)] - return unique_iname_tags + return set( + tag for tag in iname_tags if + isinstance(tag, UniqueTag)) def check_multiple_tags_allowed(kernel): @@ -225,13 +227,13 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) - # checking usage of iname tags in the callee kernel. + # check usage of iname tags in the callee kernel if isinstance(insn, CallInstruction): in_knl_callable = kernel.scoped_functions[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): - # checking for collision in iname_tag keys in the instruction - # due to the callee kernel. 
+ # check for collision in iname_tag keys in the instruction + # due to the callee kernel common_iname_tags = [tag for tag in _get_all_unique_iname_tags(in_knl_callable.subkernel) if tag.key in insn_tag_keys] @@ -257,25 +259,25 @@ def _is_racing_iname_tag(tv, tag): from loopy.kernel.data import (AddressSpace, LocalIndexTagBase, GroupIndexTag, ConcurrentTag, auto) - if tv.scope == AddressSpace.PRIVATE: + if tv.address_space == AddressSpace.PRIVATE: return ( isinstance(tag, ConcurrentTag) and not isinstance(tag, (LocalIndexTagBase, GroupIndexTag))) - elif tv.scope == AddressSpace.LOCAL: + elif tv.address_space == AddressSpace.LOCAL: return ( isinstance(tag, ConcurrentTag) and not isinstance(tag, GroupIndexTag)) - elif tv.scope == AddressSpace.GLOBAL: + elif tv.address_space == AddressSpace.GLOBAL: return isinstance(tag, ConcurrentTag) - elif tv.scope == auto: + elif tv.address_space == auto: raise LoopyError("scope of temp var '%s' has not yet been" "determined" % tv.name) else: - raise ValueError("unexpected value of temp_var.scope for " + raise ValueError("unexpected value of temp_var.address_space for " "temporary variable '%s'" % tv.name) @@ -542,13 +544,13 @@ class IndirectDependencyEdgeFinder(object): return False -def declares_nosync_with(kernel, var_scope, dep_a, dep_b): +def declares_nosync_with(kernel, var_address_space, dep_a, dep_b): from loopy.kernel.data import AddressSpace - if var_scope == AddressSpace.GLOBAL: + if var_address_space == AddressSpace.GLOBAL: search_scopes = ["global", "any"] - elif var_scope == AddressSpace.LOCAL: + elif var_address_space == AddressSpace.LOCAL: search_scopes = ["local", "any"] - elif var_scope == AddressSpace.PRIVATE: + elif var_address_space == AddressSpace.PRIVATE: search_scopes = ["any"] else: raise ValueError("unexpected value of 'AddressSpace'") @@ -597,19 +599,19 @@ def _check_variable_access_ordered_inner(kernel): continue if name in kernel.temporary_variables: - scope = kernel.temporary_variables[name].scope + address_space = kernel.temporary_variables[name].address_space else: arg = kernel.arg_dict[name] if isinstance(arg, ArrayArg): - scope = arg.memory_address_space + address_space = arg.address_space elif isinstance(arg, ValueArg): - scope = AddressSpace.PRIVATE + address_space = AddressSpace.PRIVATE else: # No need to consider ConstantArg and ImageArg (for now) # because those won't be written. - raise ValueError("could not determine scope of '%s'" % name) + raise ValueError("could not determine address_space of '%s'" % name) - # Check even for PRIVATE scope, to ensure intentional program order. + # Check even for PRIVATE address space, to ensure intentional program order. 
from loopy.symbolic import AccessRangeOverlapChecker overlap_checker = AccessRangeOverlapChecker(kernel) @@ -623,7 +625,7 @@ def _check_variable_access_ordered_inner(kernel): other = kernel.id_to_insn[other_id] has_dependency_relationship = ( - declares_nosync_with(kernel, scope, other, writer) + declares_nosync_with(kernel, address_space, other, writer) or depfind(writer_id, other_id) or @@ -907,7 +909,7 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): "aliases have a definition" % (temporary, subkernel)) continue - if tval.scope in (AddressSpace.PRIVATE, AddressSpace.LOCAL): + if tval.address_space in (AddressSpace.PRIVATE, AddressSpace.LOCAL): from loopy.diagnostic import MissingDefinitionError raise MissingDefinitionError("temporary variable '%s' gets used " "in subkernel '%s' without a definition (maybe you forgot " diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e5938dbc4..e9d30d013 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -415,8 +415,8 @@ def generate_code_v2(kernel): :returns: a :class:`CodeGenerationResult` """ - from loopy.kernel import kernel_state - if kernel.state == kernel_state.INITIAL: + from loopy.kernel import KernelState + if kernel.state == KernelState.INITIAL: from loopy.preprocess import preprocess_kernel kernel = preprocess_kernel(kernel) @@ -424,7 +424,7 @@ def generate_code_v2(kernel): from loopy.schedule import get_one_scheduled_kernel kernel = get_one_scheduled_kernel(kernel) - if kernel.state != kernel_state.SCHEDULED: + if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") @@ -510,17 +510,18 @@ def generate_code_v2(kernel): from loopy.codegen.result import generate_host_or_device_program - # {{{ collecting ASTs of auxiliary kernels + # {{{ collect ASTs of auxiliary kernels auxiliary_dev_progs = [] - # scanning through all the call instructions if there is any instance of + # scan through all the call instructions if there is any instance of # CallableKernel, whose code is to be generated. + from loopy.kernel.function_interface import CallableKernel + for insn in kernel.instructions: if isinstance(insn, CallInstruction): in_knl_callable = kernel.scoped_functions[ insn.expression.function.name] - from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel): auxiliary_dev_prog = generate_code_v2( in_knl_callable.subkernel.copy( @@ -528,20 +529,22 @@ def generate_code_v2(kernel): target=kernel.target) ).device_programs[0].ast auxiliary_dev_progs.append(auxiliary_dev_prog) + elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, BarrierInstruction, CInstruction, _DataObliviousInstruction)): pass + else: - raise NotImplementedError("Unknown type of instruction %s." % ( - str(type(insn)))) + raise NotImplementedError("Unknown type of instruction %s" % ( + type(insn).__name__)) codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) - # Modifying the first device program to add the auxiliary kernels - # as functions. + # Modify the first device program to add the auxiliary kernels + # as functions new_dev_prog = codegen_result.device_programs[0] for auxiliary_dev_prog in auxiliary_dev_progs: new_dev_prog = new_dev_prog.copy( @@ -580,7 +583,7 @@ def generate_code_v2(kernel): for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) - # {{{ collecting preambles from all the in kernel callables. 
+ # {{{ collect preambles from all the in kernel callables. in_knl_callable_collector = InKernelCallablesCollector(kernel) @@ -592,7 +595,9 @@ def generate_code_v2(kernel): elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): pass else: - raise NotImplementedError("Unkown instruction %s" % type(insn)) + raise NotImplementedError( + "Unknown instruction type '%s'" + % type(insn).__name__) # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 9969f6ad0..45e2a18c4 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -72,7 +72,7 @@ def synthesize_idis_for_extra_args(kernel, schedule_index): for arg in sched_item.extra_args: temporary = kernel.temporary_variables[arg] - assert temporary.scope == AddressSpace.GLOBAL + assert temporary.address_space == AddressSpace.GLOBAL idis.extend( temporary.decl_info( kernel.target, diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 70415c333..bcbe41874 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -679,7 +679,7 @@ class F2LoopyTranslator(FTreeWalkerBase): if dims is not None: # default order is set to "F" in kernel creation below kernel_data.append( - lp.ArrayArg( + lp.GlobalArg( arg_name, dtype=sub.get_type(arg_name), shape=sub.get_loopy_shape(arg_name), diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 847eb0d97..1de0b621a 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -82,6 +82,9 @@ def make_slab(space, iname, start, stop, step=1): An instance of :class:`int` or an instance of :class:`islpy._isl.Aff` indicating the upper bound of ``step*iname``. + + :arg step: + An instance of :class:`int`. """ zero = isl.Aff.zero_on_domain(space) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 4141ac4cb..fd1550ccb 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -94,12 +94,16 @@ class _UniqueVarNameGenerator(UniqueNameGenerator): # {{{ loop kernel object -class kernel_state: # noqa +class KernelState: # noqa INITIAL = 0 PREPROCESSED = 1 SCHEDULED = 2 +# FIXME Introduce noisy deprecation goop +kernel_state = KernelState + + class LoopKernel(ImmutableRecordWithoutPickling): """These correspond more or less directly to arguments of :func:`loopy.make_kernel`. @@ -189,7 +193,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: state - A value from :class:`kernel_state`. + A value from :class:`KernelState`. .. 
attribute:: target @@ -227,7 +231,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): index_dtype=np.int32, options=None, - state=kernel_state.INITIAL, + state=KernelState.INITIAL, is_called_from_host=True, target=None, @@ -302,9 +306,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): raise TypeError("index_dtype must be signed") if state not in [ - kernel_state.INITIAL, - kernel_state.PREPROCESSED, - kernel_state.SCHEDULED, + KernelState.INITIAL, + KernelState.PREPROCESSED, + KernelState.SCHEDULED, ]: raise ValueError("invalid value for 'state'") @@ -320,9 +324,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT if function_scopers is None: - from loopy.library.function import loopy_specific_callable_scopers - # populating the function scopers from the target and the loopy + # populate the function scopers from the target and the loopy # specific callable scopers + + from loopy.library.function import loopy_specific_callable_scopers function_scopers = [loopy_specific_callable_scopers] + ( target.get_device_ast_builder().function_scopers()) @@ -982,7 +987,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): | set( tv.name for tv in six.itervalues(self.temporary_variables) - if tv.scope == AddressSpace.GLOBAL)) + if tv.address_space == AddressSpace.GLOBAL)) # }}} @@ -1217,13 +1222,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): return set( tv.name for tv in six.itervalues(self.temporary_variables) - if tv.scope == AddressSpace.LOCAL) + if tv.address_space == AddressSpace.LOCAL) def local_mem_use(self): from loopy.kernel.data import AddressSpace return sum( tv.nbytes for tv in six.itervalues(self.temporary_variables) - if tv.scope == AddressSpace.LOCAL) + if tv.address_space == AddressSpace.LOCAL) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f808c42c2..aa53d8ec8 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -35,7 +35,7 @@ from loopy.symbolic import ( from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace) from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel @@ -1156,14 +1156,18 @@ class ArgumentGuesser: # other writable type of variable is an argument. return ArrayArg(arg_name, - shape=lp.auto, offset=self.default_offset) + shape=lp.auto, + offset=self.default_offset, + address_space=AddressSpace.GLOBAL) irank = self.find_index_rank(arg_name) if irank == 0: # read-only, no indices return ValueArg(arg_name) else: - return ArrayArg(arg_name, shape=lp.auto, offset=self.default_offset) + return ArrayArg( + arg_name, shape=lp.auto, offset=self.default_offset, + address_space=AddressSpace.GLOBAL) def convert_names_to_full_args(self, kernel_args): new_kernel_args = [] @@ -1449,7 +1453,7 @@ def create_temporaries(knl, default_order): new_temp_vars[assignee_name] = lp.TemporaryVariable( name=assignee_name, dtype=temp_var_type, - scope=lp.auto, + address_space=lp.auto, base_indices=lp.auto, shape=lp.auto, order=default_order, @@ -1848,7 +1852,7 @@ class FunctionScoper(RuleAwareIdentityMapper): returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable`. 
- **Example**: If given an expression of the form ``sin(x) + unknown_function(y) + + **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + unknown_function(y) + ScopedFunction('log')(z)``. @@ -1866,12 +1870,12 @@ class FunctionScoper(RuleAwareIdentityMapper): from loopy.symbolic import ScopedFunction if not isinstance(expr.function, ScopedFunction): - # searching the kernel for the function. + # search the kernel for the function in_knl_callable = self.kernel.find_scoped_function_identifier( expr.function.name) if in_knl_callable: - # Associating the newly created ScopedFunction with the - # resolved in-kernel callable. + # associate the newly created ScopedFunction with the + # resolved in-kernel callable self.scoped_functions[expr.function.name] = in_knl_callable return type(expr)( @@ -1879,20 +1883,22 @@ class FunctionScoper(RuleAwareIdentityMapper): tuple(self.rec(child, expn_state) for child in expr.parameters)) - # This is an unknown function as of yet, hence not modifying it. + # this is an unknown function as of yet, do not modify it return super(FunctionScoper, self).map_call(expr, expn_state) def map_call_with_kwargs(self, expr, expn_state): + # FIXME duplicated logic with map_call + from loopy.symbolic import ScopedFunction if not isinstance(expr.function, ScopedFunction): - # searching the kernel for the function. + # search the kernel for the function. in_knl_callable = self.kernel.find_scoped_function_identifier( expr.function.name) if in_knl_callable: - # Associating the newly created ScopedFunction with the - # resolved in-kernel callable. + # associate the newly created ScopedFunction with the + # resolved in-kernel callable self.scoped_functions[expr.function.name] = in_knl_callable return type(expr)( ScopedFunction(expr.function.name), @@ -1903,7 +1909,7 @@ class FunctionScoper(RuleAwareIdentityMapper): for key, val in six.iteritems(expr.kw_parameters)) ) - # This is an unknown function as of yet, hence not modifying it. + # this is an unknown function as of yet, do not modify it return super(FunctionScoper, self).map_call_with_kwargs(expr, expn_state) @@ -1914,7 +1920,12 @@ class FunctionScoper(RuleAwareIdentityMapper): SegmentedOp) from loopy.library.reduction import ArgExtOp - # Noting down the extra functions arising due to certain reductions. + # note down the extra functions arising due to certain reductions + + # FIXME Discuss this. It cannot stay the way it is, because non-built-in + # reductions cannot add themselves to this list. We may need to change + # the reduction interface. Why don't reductions generate scoped functions + # in the first place? if isinstance(expr.operation, MaxReductionOperation): self.scoped_functions["max"] = ( self.kernel.find_scoped_function_identifier("max")) @@ -2015,16 +2026,16 @@ class SliceToInameReplacer(IdentityMapper): """ Converts slices to instances of :class:`loopy.symbolic.SubArrayRef`. - :attribute var_name_gen: + .. attribute:: var_name_gen Variable name generator, in order to generate unique inames within the kernel domain. - :attribute knl: + .. attribute:: knl An instance of :class:`loopy.LoopKernel` - :attribute iname_domains: + .. 
attribute:: iname_domains An instance of :class:`dict` to store the slices enountered in the expressions as a mapping from ``iname`` to a tuple of ``(start, stop, @@ -2047,7 +2058,7 @@ class SliceToInameReplacer(IdentityMapper): swept_inames = [] for i, index in enumerate(expr.index_tuple): if isinstance(index, Slice): - unique_var_name = self.var_name_gen(based_on="islice") + unique_var_name = self.var_name_gen(based_on="i") if expr.aggregate.name in self.knl.arg_dict: domain_length = self.knl.arg_dict[expr.aggregate.name].shape[i] elif expr.aggregate.name in self.knl.temporary_variables: @@ -2436,7 +2447,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): knl = create_temporaries(knl, default_order) - # Convert slices to iname domains + # convert slices to iname domains knl = realize_slices_as_sub_array_refs(knl) # ------------------------------------------------------------------------- @@ -2476,7 +2487,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) - # Function Lookup knl = scope_functions(knl) from loopy.preprocess import prepare_for_caching diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 83f98ecd1..f75e1a8c4 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -32,8 +32,8 @@ from loopy.kernel.array import ArrayBase from loopy.diagnostic import LoopyError from loopy.kernel.instruction import ( # noqa InstructionBase, - memory_ordering, - memory_scope, + MemoryOrdering, + MemoryScope, VarAtomicity, AtomicInit, AtomicUpdate, @@ -43,11 +43,12 @@ from loopy.kernel.instruction import ( # noqa CallInstruction, make_assignment, CInstruction) +from warnings import warn class auto(object): # noqa """A generic placeholder object for something that should be automatically - detected. See, for example, the *shape* or *strides* argument of + determined. See, for example, the *shape* or *strides* argument of :class:`GlobalArg`. """ @@ -243,9 +244,8 @@ def parse_tag(tag): # {{{ memory address space -class AddressSpace: - """ - Storage location of a variable. +class AddressSpace(object): + """Storage location of a variable. .. attribute:: PRIVATE .. attribute:: LOCAL @@ -268,7 +268,38 @@ class AddressSpace: elif val == cls.GLOBAL: return "global" else: - raise ValueError("unexpected value of MemoryAddressScope") + raise ValueError("unexpected value of AddressSpace") + + +class _deprecated_temp_var_scope_property(property): # noqa + def __get__(self, cls, owner): + warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.", + DeprecationWarning, stacklevel=2) + + return classmethod(self.fget).__get__(None, owner)() + + +class temp_var_scope(object): # noqa + """Deprecated. Use :class:`AddressSpace` instead. + """ + + @_deprecated_temp_var_scope_property + def PRIVATE(self): + return AddressSpace.PRIVATE + + @_deprecated_temp_var_scope_property + def LOCAL(self): + return AddressSpace.LOCAL + + @_deprecated_temp_var_scope_property + def GLOBAL(self): + return AddressSpace.GLOBAL + + @classmethod + def stringify(cls, val): + warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.", + DeprecationWarning, stacklevel=2) + return AddressSpace.stringify(val) # }}} @@ -297,7 +328,6 @@ class KernelArgument(ImmutableRecord): import loopy as lp if dtype is lp.auto: - from warnings import warn warn("Argument/temporary data type should be None if unspecified, " "not auto. 
This usage will be disallowed in 2018.", DeprecationWarning, stacklevel=2) @@ -313,26 +343,24 @@ class KernelArgument(ImmutableRecord): class ArrayArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ + ( """ - .. attribute:: memory_address_space + .. attribute:: address_space An attribute of :class:`AddressSpace` defining the address - space in which the array resides in the target memory layout. - Defaults to ``AddressSpace.GLOBAL`` + space in which the array resides. .. attribute:: is_output_only - An instance of :class:`bool`. If set to *TRUE*, recorded to be + An instance of :class:`bool`. If set to *True*, recorded to be returned from the kernel. """) allowed_extra_kwargs = [ - "memory_address_space", + "address_space", "is_output_only"] def __init__(self, *args, **kwargs): - # Defaulting the memory_address_space to be GLOBAL. - kwargs["memory_address_space"] = kwargs.pop( - "memory_address_space", AddressSpace.GLOBAL) + if "address_space" not in kwargs: + raise TypeError("'address_space' must be specified") kwargs["is_output_only"] = kwargs.pop("is_output_only", None) super(ArrayArg, self).__init__(*args, **kwargs) @@ -342,16 +370,19 @@ class ArrayArg(ArrayBase, KernelArgument): def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_array_arg_decl(self.name + name_suffix, - self.memory_address_space, shape, dtype, is_written) + self.address_space, shape, dtype, is_written) -class GlobalArg(ArrayBase, KernelArgument): - def __new__(cls, *args, **kwargs): - from warnings import warn - warn("Use of 'GlobalArg' is deprecated, use 'ArrayArg' instead.", - DeprecationWarning, stacklevel=2) +# Making this a function prevents incorrect use in isinstance. +# Note: This is *not* deprecated, as it is super-common and +# incrementally more convenient to use than ArrayArg directly. +def GlobalArg(*args, **kwargs): + address_space = kwargs.pop("address_space", None) + if address_space is not None: + raise TypeError("may not pass 'address_space' to GlobalArg") + kwargs["address_space"] = AddressSpace.GLOBAL - return ArrayArg(*args, **kwargs) + return ArrayArg(*args, **kwargs) class ConstantArg(ArrayBase, KernelArgument): @@ -423,43 +454,12 @@ class InameArg(ValueArg): # {{{ temporary variable -class _deprecated_temp_var_scope_property(property): # noqa - def __get__(self, cls, owner): - from warnings import warn - warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.", - DeprecationWarning, stacklevel=2) - - return classmethod(self.fget).__get__(None, owner)() - -class temp_var_scope: # noqa - """Deprecated. Use :class:`mem_adress_space` instead. - """ - - @_deprecated_temp_var_scope_property - def PRIVATE(self): - return AddressSpace.PRIVATE - - @_deprecated_temp_var_scope_property - def LOCAL(self): - return AddressSpace.LOCAL - - @_deprecated_temp_var_scope_property - def GLOBAL(self): - return AddressSpace.GLOBAL - - @classmethod - def stringify(cls, val): - from warnings import warn - warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.", - DeprecationWarning, stacklevel=2) - return AddressSpace.stringify - class TemporaryVariable(ArrayBase): __doc__ = ArrayBase.__doc__ + """ .. attribute:: storage_shape .. attribute:: base_indices - .. attribute:: scope + .. attribute:: address_space What memory this temporary variable lives in. One of the values in :class:`AddressSpace`, @@ -472,10 +472,6 @@ class TemporaryVariable(ArrayBase): hold the data in this temporary. Note that this storage array must not match any existing variable names. 
- .. attribute:: scope - - One of :class:`AddressSpace`. - .. attribute:: initializer *None* or a :class:`numpy.ndarray` of data to be used to initialize the @@ -501,14 +497,14 @@ class TemporaryVariable(ArrayBase): allowed_extra_kwargs = [ "storage_shape", "base_indices", - "scope", + "address_space", "base_storage", "initializer", "read_only", "_base_storage_access_may_be_aliasing", ] - def __init__(self, name, dtype=None, shape=(), scope=auto, + def __init__(self, name, dtype=None, shape=(), address_space=None, dim_tags=None, offset=0, dim_names=None, strides=None, order=None, base_indices=None, storage_shape=None, base_storage=None, initializer=None, read_only=False, @@ -519,6 +515,28 @@ class TemporaryVariable(ArrayBase): :arg base_indices: :class:`loopy.auto` or a tuple of base indices """ + scope = kwargs.pop("scope", None) + if scope is not None: + warn("Passing 'scope' is deprecated. Use 'address_space' instead.", + DeprecationWarning, stacklevel=2) + + if address_space is not None: + raise ValueError("only one of 'scope' and 'address_space' " + "may be specified") + else: + address_space = scope + + del scope + + if address_space is None: + address_space = auto + + if address_space is None: + raise LoopyError( + "temporary variable '%s': " + "address_space must not be None" + % name) + if initializer is None: pass elif isinstance(initializer, np.ndarray): @@ -579,7 +597,8 @@ class TemporaryVariable(ArrayBase): dtype=dtype, shape=shape, strides=strides, dim_tags=dim_tags, offset=offset, dim_names=dim_names, order=order, - base_indices=base_indices, scope=scope, + base_indices=base_indices, + address_space=address_space, storage_shape=storage_shape, base_storage=base_storage, initializer=initializer, @@ -589,20 +608,33 @@ class TemporaryVariable(ArrayBase): **kwargs) @property - def is_local(self): - """One of :class:`loopy.AddressSpace`.""" - - if self.scope is auto: - return auto - elif self.scope == AddressSpace.LOCAL: - return True - elif self.scope == AddressSpace.PRIVATE: - return False - elif self.scope == AddressSpace.GLOBAL: - raise LoopyError("TemporaryVariable.is_local called on " - "global temporary variable '%s'" % self.name) - else: - raise LoopyError("unexpected value of TemporaryVariable.scope") + def scope(self): + warn("Use of 'TemporaryVariable.scope' is deprecated, " + "use 'TemporaryVariable.address_space' instead.", + DeprecationWarning, stacklevel=2) + + return self.address_space + + def copy(self, **kwargs): + address_space = kwargs.pop("address_space", None) + scope = kwargs.pop("scope", None) + + if scope is not None: + warn("Passing 'scope' is deprecated. 
Use 'address_space' instead.", + DeprecationWarning, stacklevel=2) + + if address_space is not None: + raise ValueError("only one of 'scope' and 'address_space' " + "may be specified") + else: + address_space = scope + + del scope + + if address_space is not None: + kwargs["address_space"] = address_space + + return super(TemporaryVariable, self).copy(**kwargs) @property def nbytes(self): @@ -619,7 +651,7 @@ class TemporaryVariable(ArrayBase): shape_override=self.storage_shape) def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): - if self.scope == AddressSpace.GLOBAL: + if self.address_space == AddressSpace.GLOBAL: return ast_builder.get_array_arg_decl(self.name + name_suffix, AddressSpace.GLOBAL, shape, dtype, is_written) else: @@ -627,10 +659,10 @@ class TemporaryVariable(ArrayBase): "non-global temporary") def __str__(self): - if self.scope is auto: + if self.address_space is auto: scope_str = "auto" else: - scope_str = AddressSpace.stringify(self.scope) + scope_str = AddressSpace.stringify(self.address_space) return ( self.stringify(include_typename=False) @@ -642,7 +674,7 @@ class TemporaryVariable(ArrayBase): super(TemporaryVariable, self).__eq__(other) and self.storage_shape == other.storage_shape and self.base_indices == other.base_indices - and self.scope == other.scope + and self.address_space == other.address_space and self.base_storage == other.base_storage and ( (self.initializer is None and other.initializer is None) @@ -661,7 +693,7 @@ class TemporaryVariable(ArrayBase): self.update_persistent_hash_for_shape(key_hash, key_builder, self.storage_shape) key_builder.rec(key_hash, self.base_indices) - key_builder.rec(key_hash, self.scope) + key_builder.rec(key_hash, self.address_space) key_builder.rec(key_hash, self.base_storage) initializer = self.initializer diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 089b6cb36..edb222ec2 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -35,13 +35,7 @@ from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, - RuleAwareIdentityMapper, SubstitutionRuleExpander, SubstitutionMapper, - CombineMapper) - -from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction) - -from functools import reduce + RuleAwareIdentityMapper, SubstitutionRuleExpander) # {{{ argument descriptors @@ -61,7 +55,7 @@ class ArrayArgDescriptor(ImmutableRecord): Shape of the array. - .. attribute:: mem_scope + .. attribute:: address_space An attribute of :class:`loopy.kernel.data.AddressSpace`. 
@@ -69,9 +63,10 @@ class ArrayArgDescriptor(ImmutableRecord): A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` """ - fields = set(['shape', 'mem_scope', 'dim_tags']) - def __init__(self, shape, mem_scope, dim_tags): + fields = set(['shape', 'address_space', 'dim_tags']) + + def __init__(self, shape, address_space, dim_tags): # {{{ sanity checks @@ -79,6 +74,8 @@ class ArrayArgDescriptor(ImmutableRecord): assert isinstance(shape, tuple) assert isinstance(dim_tags, tuple) + + # FIXME at least vector dim tags should be supported assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in dim_tags) @@ -86,7 +83,7 @@ class ArrayArgDescriptor(ImmutableRecord): super(ArrayArgDescriptor, self).__init__( shape=shape, - mem_scope=mem_scope, + address_space=address_space, dim_tags=dim_tags) # }}} @@ -176,7 +173,8 @@ class InKernelCallable(ImmutableRecord): .. note:: - Negative ids in the mapping attributes indicate the result arguments + Negative "id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. .. automethod:: __init__ .. automethod:: with_types @@ -470,120 +468,6 @@ class ScalarCallable(InKernelCallable): # }}} -# {{{ kernel inliner mapper - -class KernelInliner(SubstitutionMapper): - """Mapper to replace variables (indices, temporaries, arguments) in the - callee kernel with variables in the caller kernel. - - :arg caller: the caller kernel - :arg arg_map: dict of argument name to variables in caller - :arg arg_dict: dict of argument name to arguments in callee - """ - - def __init__(self, subst_func, caller, arg_map, arg_dict): - super(KernelInliner, self).__init__(subst_func) - self.caller = caller - self.arg_map = arg_map - self.arg_dict = arg_dict - - def map_subscript(self, expr): - if expr.aggregate.name in self.arg_map: - - aggregate = self.subst_func(expr.aggregate) - sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller - callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee - if aggregate.name in self.caller.arg_dict: - caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller - else: - caller_arg = self.caller.temporary_variables[aggregate.name] - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in callee_arg.shape): - raise LoopyError( - "Argument: {0} in callee kernel: {1} does not have " - "constant shape.".format(callee_arg)) - - flatten_index = 0 - for i, idx in enumerate(sar.get_begin_subscript().index_tuple): - flatten_index += idx*caller_arg.dim_tags[i].stride - - flatten_index += sum( - idx * tag.stride - for idx, tag in zip(outer_indices, callee_arg.dim_tags)) - - from loopy.isl_helpers import simplify_via_aff - flatten_index = simplify_via_aff(flatten_index) - - new_indices = [] - for dim_tag in caller_arg.dim_tags: - ind = flatten_index // dim_tag.stride - flatten_index -= (dim_tag.stride * ind) - new_indices.append(ind) - - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - - -class CalleeScopedCallsCollector(CombineMapper): - """ - Collects the scoped functions which are a part of the callee kernel and - must be transferred to the caller kernel before inlining. 
- - :returns: - An :class:`frozenset` of function names that are not scoped in - the caller kernel. - - .. note:: - :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are - never scoped in the pipeline. - """ - - def __init__(self, callee_scoped_functions): - self.callee_scoped_functions = callee_scoped_functions - - def combine(self, values): - import operator - return reduce(operator.or_, values, frozenset()) - - def map_call(self, expr): - if expr.function.name in self.callee_scoped_functions: - return (frozenset([(expr, - self.callee_scoped_functions[expr.function.name])]) | - self.combine((self.rec(child) for child in expr.parameters))) - else: - return self.combine((self.rec(child) for child in expr.parameters)) - - def map_call_with_kwargs(self, expr): - if expr.function.name in self.callee_scoped_functions: - return (frozenset([(expr, - self.callee_scoped_functions[expr.function.name])]) | - self.combine((self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values())))) - else: - return self.combine((self.rec(child) for child in - expr.parameters+tuple(expr.kw_parameters.values()))) - - def map_constant(self, expr): - return frozenset() - - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant - - -# }}} - - # {{{ callable kernel class CallableKernel(InKernelCallable): @@ -594,15 +478,16 @@ class CallableKernel(InKernelCallable): in order to initiate association between a function in caller kernel and the callee kernel. - The :meth:`CallableKernel.with_types` should be called in order to match + :meth:`CallableKernel.with_types` should be called in order to match the ``dtypes`` of the arguments that are shared between the caller and the callee kernel. - The :meth:`CallableKernel.with_descrs` should be called in order to match - the ``dim_tags, shape, mem_scopes`` of the arguments shared between the + :meth:`CallableKernel.with_descrs` should be called in order to match + :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape`, + :attr:`ArrayArgDescriptor.address_space`` of the arguments shared between the caller and the callee kernel. - The :meth:`CallableKernel.with_hw_axes` should be called to set the grid + :meth:`CallableKernel.with_hw_axes` should be called to set the grid sizes for the :attr:`subkernel` of the callable. """ @@ -652,43 +537,43 @@ class CallableKernel(InKernelCallable): pre_specialized_subkernel = self.subkernel.copy( args=new_args) - # inferring the types of the written variables based on the knowledge + # infer the types of the written variables based on the knowledge # of the types of the arguments supplied specialized_kernel = infer_unknown_types(pre_specialized_subkernel, expect_completion=True) new_arg_id_to_dtype = {} for arg in specialized_kernel.args: - # associating the updated_arg_id_to_dtype with keyword as well as - # positional id. + # associate the updated_arg_id_to_dtype with keyword as well as + # positional id new_arg_id_to_dtype[arg.name] = arg.dtype new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype - # Returning the kernel call with specialized subkernel and the corresponding + # Return the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype return self.copy(subkernel=specialized_kernel, arg_id_to_dtype=new_arg_id_to_dtype) def with_descrs(self, arg_id_to_descr): - # tuning the subkernel so that we have the the matching shapes and - # dim_tags. 
+ # tune the subkernel so that we have the matching shapes and + # dim_tags new_args = self.subkernel.args[:] kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - for id, descr in arg_id_to_descr.items(): - if isinstance(id, int): - id = pos_to_kw[id] - assert isinstance(id, str) + for arg_id, descr in arg_id_to_descr.items(): + if isinstance(arg_id, int): + arg_id = pos_to_kw[arg_id] + assert isinstance(arg_id, str) if isinstance(descr, ArrayArgDescriptor): - new_arg = self.subkernel.arg_dict[id].copy( + new_arg = self.subkernel.arg_dict[arg_id].copy( shape=descr.shape, dim_tags=descr.dim_tags, - memory_address_space=descr.mem_scope) + address_space=descr.address_space) # replacing the new arg with the arg of the same name - new_args = [new_arg if arg.name == id else arg for arg in + new_args = [new_arg if arg.name == arg_id else arg for arg in new_args] elif isinstance(descr, ValueArgDescriptor): pass @@ -712,7 +597,7 @@ class CallableKernel(InKernelCallable): arg_id_to_descr[pos] = ArrayArgDescriptor( shape=arg.shape, dim_tags=arg.dim_tags, - mem_scope=AddressSpace.GLOBAL) + address_space=AddressSpace.GLOBAL) return self.copy(subkernel=self.subkernel, arg_id_to_descr=arg_id_to_descr) @@ -724,7 +609,6 @@ class CallableKernel(InKernelCallable): GridOverrideForCalleeKernel(lsize, gsize)))) def is_ready_for_codegen(self): - return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None and self.name_in_target is not None) @@ -732,7 +616,7 @@ class CallableKernel(InKernelCallable): def generate_preambles(self, target): """ Yields the *target* specific preambles. """ - # TODO: This is not correct, as the code code preamble generated + # FIXME TODO: This is not correct, as the code code preamble generated # during the code generationg of the child kernel, does not guarantee # that this thing would be updated. for preamble in self.subkernel.preambles: @@ -740,194 +624,6 @@ class CallableKernel(InKernelCallable): return - def inline_within_kernel(self, kernel, instruction): - """ - Returns a copy of *kernel* with the *instruction* in the *kernel* - replaced by inlining :attr:`subkernel` within it. 
- """ - callee_knl = self.subkernel - - import islpy as isl - - callee_label = callee_knl.name[:4] + "_" - - # {{{ duplicate and rename inames - - vng = kernel.get_var_name_generator() - ing = kernel.get_instruction_id_generator() - dim_type = isl.dim_type.set - - iname_map = {} - for iname in callee_knl.all_inames(): - iname_map[iname] = vng(callee_label+iname) - - new_domains = [] - new_iname_to_tags = kernel.iname_to_tags.copy() - - # transferring iname tags info from the callee to the caller kernel - for domain in callee_knl.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - - if iname in callee_knl.iname_to_tags: - new_iname_to_tags[iname_map[iname]] = ( - callee_knl.iname_to_tags[iname]) - new_domain = new_domain.set_dim_name( - dim_type, i, iname_map[iname]) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains, - iname_to_tags=new_iname_to_tags) - - # }}} - - # {{{ rename temporaries - - temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(callee_knl.temporary_variables): - new_name = vng(callee_label+name) - temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) - - assignees = instruction.assignees # writes - parameters = instruction.expression.parameters # reads - - # add keyword parameters - from pymbolic.primitives import CallWithKwargs - - if isinstance(instruction.expression, CallWithKwargs): - from loopy.kernel.function_interface import get_kw_pos_association - - _, pos_to_kw = get_kw_pos_association(callee_knl) - kw_parameters = instruction.expression.kw_parameters - for i in range(len(parameters), len(parameters) + len(kw_parameters)): - parameters = parameters + (kw_parameters[pos_to_kw[i]],) - - assignee_pos = 0 - parameter_pos = 0 - for i, arg in enumerate(callee_knl.args): - if arg.is_output_only: - arg_map[arg.name] = assignees[assignee_pos] - assignee_pos += 1 - else: - arg_map[arg.name] = parameters[parameter_pos] - parameter_pos += 1 - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(arg_map))) - subst_mapper = KernelInliner( - make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) - - insn_id = {} - for insn in callee_knl.instructions: - insn_id[insn.id] = ing(callee_label+insn.id) - - # {{{ root and leave instructions in callee kernel - - dep_map = callee_knl.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of callee kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing(callee_label+"_start"), - within_inames=instruction.within_inames, - depends_on=instruction.depends_on - ) - noop_end = NoOpInstruction( - id=instruction.id, 
- within_inames=instruction.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for insn in callee_knl.instructions: - insn = insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(iname_map.get, insn.within_inames)) - within_inames = within_inames | instruction.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( - instruction.depends_on) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == instruction: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} - - # {{{ transferring the scoped functions from callee to caller - - callee_scoped_calls_collector = CalleeScopedCallsCollector( - callee_knl.scoped_functions) - callee_scoped_calls_dict = {} - - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( - insn.expression))) - elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknown type of instruction %s." % type( - insn)) - - from loopy.kernel.function_interface import ( - register_pymbolic_calls_to_knl_callables) - kernel = register_pymbolic_calls_to_knl_callables(kernel, - callee_scoped_calls_dict) - - # }}} - - return kernel - def emit_call_insn(self, insn, target, expression_to_code_mapper): assert self.is_ready_for_codegen() @@ -951,7 +647,7 @@ class CallableKernel(InKernelCallable): parameters.append(kw_parameters[pos_to_kw[i]]) par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) - # inserting the assigness at the required positions. + # insert the assigness at the required positions assignee_write_count = -1 for i, arg in enumerate(self.subkernel.args): if arg.is_output_only: @@ -960,7 +656,7 @@ class CallableKernel(InKernelCallable): par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) assignee_write_count -= 1 - # no type casting in array calls. + # no type casting in array calls from loopy.expression import dtype_to_type_context from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import SubArrayRef @@ -1015,10 +711,10 @@ class ManglerCallable(ScalarCallable): def with_types(self, arg_id_to_dtype, kernel): if self.arg_id_to_dtype is not None: # specializing an already specialized function. - for id, dtype in arg_id_to_dtype.items(): + for arg_id, dtype in arg_id_to_dtype.items(): # only checking for the ones which have been provided # if does not match, returns an error. - if self.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + if self.arg_id_to_dtype[arg_id] != arg_id_to_dtype[arg_id]: raise LoopyError("Overwriting a specialized" " function is illegal--maybe start with new instance of" " ManglerCallable?") @@ -1057,12 +753,14 @@ class ManglerCallable(ScalarCallable): # {{{ new pymbolic calls to scoped functions +# FIXME Are these identifiers guaranteed to be available? Is there a var name +# generator somewhere ensuring that that's the case? 
def next_indexed_variable(function): """ Returns an instance of :class:`str` with the next indexed-name in the sequence for the name of *function*. - :Example: ``Variable('sin_0')`` will return ``'sin_1'``. + **Example:** ``Variable('sin_0')`` will return ``'sin_1'``. :arg function: Either an instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.reduction.ArgExtOp` or @@ -1149,6 +847,9 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): def register_pymbolic_calls_to_knl_callables(kernel, pymbolic_exprs_to_knl_callables): + # FIXME This could use an example. I have no idea what this does. + # Surely I can't associate arbitrary pymbolic expresions (3+a?) + # with callables? """ Returns a copy of :arg:`kernel` which includes an association with the given pymbolic expressions to the instances of :class:`InKernelCallable` for the @@ -1156,7 +857,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`. - :arg pymbolic_exprs_to_knl_callables: A mapping from pymbolic expressions + :arg pymbolic_exprs_to_knl_callables: A mapping from :mod:`pymbolic` expressions to the instances of :class:`loopy.kernel.function_interface.InKernelCallable`. """ @@ -1182,7 +883,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, pymbolic_call_function = pymbolic_call.function.function else: raise NotImplementedError("Unknown type %s for pymbolic call " - "function." % type(pymbolic_call)) + "function" % type(pymbolic_call).__name__) unique_var = next_indexed_variable(pymbolic_call_function) from loopy.library.reduction import ArgExtOp, SegmentedOp @@ -1203,7 +904,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, pymbolic_calls_to_new_names[pymbolic_call] = ( scoped_functions_to_names[in_knl_callable]) - # Using the data populated in pymbolic_calls_to_new_names to change the + # Use the data populated in pymbolic_calls_to_new_names to change the # names of the scoped functions of all the calls in the kernel. rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index fafebf37d..b09931373 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -538,64 +538,78 @@ def _get_assignee_subscript_deps(expr): # {{{ atomic ops -class memory_ordering: # noqa +class MemoryOrdering: # noqa """Ordering of atomic operations, defined as in C11 and OpenCL. - .. attribute:: relaxed - .. attribute:: acquire - .. attribute:: release - .. attribute:: acq_rel - .. attribute:: seq_cst + .. attribute:: RELAXED + .. attribute:: ACQUIRE + .. attribute:: RELEASE + .. attribute:: ACQ_REL + .. attribute:: SEQ_CST """ - relaxed = 0 - acquire = 1 - release = 2 - acq_rel = 3 - seq_cst = 4 + RELAXED = 0 + ACQUIRE = 1 + RELEASE = 2 + ACQ_REL = 3 + SEQ_CST = 4 + + # FIXME Introduce compat/deprecation goop for now-upper-case enum + # constants @staticmethod def to_string(v): - for i in dir(memory_ordering): + for i in dir(MemoryOrdering): if i.startswith("_"): continue - if getattr(memory_ordering, i) == v: + if getattr(MemoryOrdering, i) == v: return i - raise ValueError("Unknown value of memory_ordering") + raise ValueError("Unknown value of MemoryOrdering") + + +# FIXME Introduce noisy deprecation goop +memory_ordering = MemoryOrdering -class memory_scope: # noqa +class MemoryScope: # noqa """Scope of atomicity, defined as in OpenCL. .. attribute:: auto Scope matches the accessibility of the variable. - .. 
attribute:: work_item - .. attribute:: work_group - .. attribute:: work_device - .. attribute:: all_svm_devices + .. attribute:: WORK_ITEM + .. attribute:: WORK_GROUP + .. attribute:: WORK_DEVICE + .. attribute:: ALL_SVM_DEVICES """ - work_item = 0 - work_group = 1 - device = 2 - all_svm_devices = 2 + WORK_ITEM = 0 + WORK_GROUP = 1 + DEVICE = 2 + ALL_SVM_DEVICES = 2 + + # FIXME Introduce compat/deprecation goop for now-upper-case enum + # constants auto = -1 @staticmethod def to_string(v): - for i in dir(memory_scope): + for i in dir(MemoryScope): if i.startswith("_"): continue - if getattr(memory_scope, i) == v: + if getattr(MemoryScope, i) == v: return i - raise ValueError("Unknown value of memory_scope") + raise ValueError("Unknown value of MemoryScope") + + +# FIXME Introduce noisy deprecation goop +memory_scope = MemoryScope class VarAtomicity(object): @@ -628,15 +642,15 @@ class OrderedAtomic(VarAtomicity): .. attribute:: ordering - One of the values from :class:`memory_ordering` + One of the values from :class:`MemoryOrdering` .. attribute:: scope - One of the values from :class:`memory_scope` + One of the values from :class:`MemoryScope` """ - ordering = memory_ordering.seq_cst - scope = memory_scope.auto + ordering = MemoryOrdering.SEQ_CST + scope = MemoryScope.auto def update_persistent_hash(self, key_hash, key_builder): """Custom hash computation function for use with @@ -657,8 +671,8 @@ class OrderedAtomic(VarAtomicity): return "%s[%s]%s/%s" % ( self.op_name, self.var_name, - memory_ordering.to_string(self.ordering), - memory_scope.to_string(self.scope)) + MemoryOrdering.to_string(self.ordering), + MemoryScope.to_string(self.scope)) class AtomicInit(OrderedAtomic): @@ -667,11 +681,11 @@ class AtomicInit(OrderedAtomic): .. attribute:: ordering - One of the values from :class:`memory_ordering` + One of the values from :class:`MemoryOrdering` .. attribute:: scope - One of the values from :class:`memory_scope` + One of the values from :class:`MemoryScope` """ op_name = 'init' @@ -681,11 +695,11 @@ class AtomicUpdate(OrderedAtomic): .. attribute:: ordering - One of the values from :class:`memory_ordering` + One of the values from :class:`MemoryOrdering` .. attribute:: scope - One of the values from :class:`memory_scope` + One of the values from :class:`MemoryScope` """ op_name = 'update' @@ -695,11 +709,11 @@ class AtomicLoad(OrderedAtomic): .. attribute:: ordering - One of the values from :class:`memory_ordering` + One of the values from :class:`MemoryOrdering` .. attribute:: scope - One of the values from :class:`memory_scope` + One of the values from :class:`MemoryScope` """ op_name = 'load' diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index fb57133e9..ed739c0fd 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1725,8 +1725,8 @@ def get_subkernels(kernel): See also :class:`loopy.schedule.CallKernel`. """ - from loopy.kernel import kernel_state - if kernel.state != kernel_state.SCHEDULED: + from loopy.kernel import KernelState + if kernel.state != KernelState.SCHEDULED: raise LoopyError("Kernel must be scheduled") from loopy.schedule import CallKernel @@ -1742,8 +1742,8 @@ def get_subkernel_to_insn_id_map(kernel): consisting of the instruction ids scheduled within the subkernel. The kernel must be scheduled. 
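# Illustrative sketch (not part of the patch): hypothetical use of the helper
# documented above, on an already-created kernel `knl`. The guard in the
# function requires KernelState.SCHEDULED, so the kernel is preprocessed and
# scheduled first.
import loopy as lp
from loopy.kernel.tools import get_subkernel_to_insn_id_map

knl = lp.preprocess_kernel(knl)
knl = lp.get_one_scheduled_kernel(knl)
# maps each subkernel name to the frozenset of instruction ids scheduled in it
subknl_to_insn_ids = get_subkernel_to_insn_id_map(knl)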
""" - from loopy.kernel import kernel_state - if kernel.state != kernel_state.SCHEDULED: + from loopy.kernel import KernelState + if kernel.state != KernelState.SCHEDULED: raise LoopyError("Kernel must be scheduled") from loopy.schedule import ( diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c4719ace5..777cc1c64 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -137,7 +137,7 @@ def check_reduction_iname_uniqueness(kernel): # }}} -# {{{ decide temporary scope +# {{{ decide temporary address space def _get_compute_inames_tagged(kernel, insn, tag_base): return set(iname for iname in kernel.insn_inames(insn.id) @@ -154,8 +154,8 @@ def _get_assignee_inames_tagged(kernel, insn, tag_base, tv_names): if kernel.iname_tags_of_type(iname, tag_base)) -def find_temporary_scope(kernel): - logger.debug("%s: find temporary scope" % kernel.name) +def find_temporary_address_space(kernel): + logger.debug("%s: find temporary address space" % kernel.name) new_temp_vars = {} from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag, @@ -183,7 +183,7 @@ def find_temporary_scope(kernel): # Only fill out for variables that do not yet know if they're # local. (I.e. those generated by implicit temporary generation.) - if temp_var.scope is not lp.auto: + if temp_var.address_space is not lp.auto: new_temp_vars[temp_var.name] = temp_var continue @@ -194,7 +194,7 @@ def find_temporary_scope(kernel): for alias in base_storage_to_aliases.get(temp_var.base_storage, []): my_writers = my_writers | writers.get(alias, frozenset()) - desired_scope_per_insn = [] + desired_aspace_per_insn = [] for insn_id in my_writers: insn = kernel.id_to_insn[insn_id] @@ -220,8 +220,8 @@ def find_temporary_scope(kernel): assert locparallel_assignee_inames <= locparallel_compute_inames assert grpparallel_assignee_inames <= grpparallel_compute_inames - desired_scope = AddressSpace.PRIVATE - for iname_descr, scope_descr, apin, cpin, scope in [ + desired_aspace = AddressSpace.PRIVATE + for iname_descr, aspace_descr, apin, cpin, aspace in [ ("local", "local", locparallel_assignee_inames, locparallel_compute_inames, AddressSpace.LOCAL), ("group", "global", grpparallel_assignee_inames, @@ -231,46 +231,45 @@ def find_temporary_scope(kernel): if (apin != cpin and bool(apin)): warn_with_kernel( kernel, - "write_race_%s(%s)" % (scope_descr, insn_id), + "write_race_%s(%s)" % (aspace_descr, insn_id), "instruction '%s' looks invalid: " "it assigns to indices based on %s IDs, but " "its temporary '%s' cannot be made %s because " "a write race across the iname(s) '%s' would emerge. 
" "(Do you need to add an extra iname to your prefetch?)" - % (insn_id, iname_descr, temp_var.name, scope_descr, + % (insn_id, iname_descr, temp_var.name, aspace_descr, ", ".join(cpin - apin)), WriteRaceConditionWarning) if (apin == cpin - - # doesn't want to be in this scope if there aren't any - # parallel inames of that kind: + # doesn't want to be in this address space if there + # aren't any parallel inames of that kind and bool(cpin)): - desired_scope = max(desired_scope, scope) + desired_aspace = max(desired_aspace, aspace) - desired_scope_per_insn.append(desired_scope) + desired_aspace_per_insn.append(desired_aspace) - if not desired_scope_per_insn: + if not desired_aspace_per_insn: if temp_var.initializer is None: warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, "temporary variable '%s' never written, eliminating" % temp_var.name, LoopyAdvisory) else: raise LoopyError("temporary variable '%s': never written, " - "cannot automatically determine scope" + "cannot automatically determine address space" % temp_var.name) continue - overall_scope = max(desired_scope_per_insn) + overall_aspace = max(desired_aspace_per_insn) from pytools import all - if not all(iscope == overall_scope for iscope in desired_scope_per_insn): + if not all(iaspace == overall_aspace for iaspace in desired_aspace_per_insn): raise LoopyError("not all instructions agree on the " - "the desired scope (private/local/global) of the " + "the desired address space (private/local/global) of the " "temporary '%s'" % temp_var.name) - new_temp_vars[temp_var.name] = temp_var.copy(scope=overall_scope) + new_temp_vars[temp_var.name] = temp_var.copy(address_space=overall_aspace) return kernel.copy(temporary_variables=new_temp_vars) @@ -785,7 +784,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): if ( assignee_var_name in kernel.temporary_variables and - (kernel.temporary_variables[assignee_var_name].scope + (kernel.temporary_variables[assignee_var_name].address_space == AddressSpace.PRIVATE)): new_assignees.append(assignee) continue @@ -1026,7 +1025,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=AddressSpace.PRIVATE) + address_space=AddressSpace.PRIVATE) init_insn_depends_on = frozenset() @@ -1161,14 +1160,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=AddressSpace.PRIVATE) + address_space=AddressSpace.PRIVATE) acc_var_names = make_temporaries( name_based_on="acc_"+red_iname, nvars=nresults, shape=outer_local_iname_sizes + (size,), dtypes=reduction_dtypes, - scope=AddressSpace.LOCAL) + address_space=AddressSpace.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) @@ -1354,7 +1353,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return mapper(expr, temp_kernel, None) - def make_temporaries(name_based_on, nvars, shape, dtypes, scope): + def make_temporaries(name_based_on, nvars, shape, dtypes, address_space): var_names = [ var_name_gen(name_based_on.format(index=i)) for i in range(nvars)] @@ -1366,7 +1365,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, name=name, shape=shape, dtype=dtype, - scope=scope) + address_space=address_space) return var_names @@ -1394,7 +1393,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=AddressSpace.PRIVATE) + 
address_space=AddressSpace.PRIVATE) from pymbolic import var acc_vars = tuple(var(n) for n in acc_var_names) @@ -1516,14 +1515,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, nvars=nresults, shape=(), dtypes=reduction_dtypes, - scope=AddressSpace.PRIVATE) + address_space=AddressSpace.PRIVATE) acc_var_names = make_temporaries( name_based_on="acc_"+scan_iname, nvars=nresults, shape=outer_local_iname_sizes + (scan_size,), dtypes=reduction_dtypes, - scope=AddressSpace.LOCAL) + address_space=AddressSpace.LOCAL) acc_vars = tuple(var(n) for n in acc_var_names) read_vars = tuple(var(n) for n in read_var_names) @@ -2134,6 +2133,7 @@ class ArgDescrInferenceMapper(CombineMapper): import operator return reduce(operator.or_, values, frozenset()) + # FIXME logic duplication between map_call and map_call_with_kwargs def map_call(self, expr, **kwargs): from loopy.kernel.function_interface import ValueArgDescriptor from loopy.symbolic import SubArrayRef, ScopedFunction @@ -2363,6 +2363,7 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): def combine(self, values): return all(values) + # FIXME logic duplication between map_call and map_call_with_kwargs def map_call(self, expr, *args, **kwargs): from loopy.library.reduction import ArgExtOp, SegmentedOp from pymbolic.primitives import Variable @@ -2470,8 +2471,8 @@ def preprocess_kernel(kernel, device=None): warn("passing 'device' to preprocess_kernel() is deprecated", DeprecationWarning, stacklevel=2) - from loopy.kernel import kernel_state - if kernel.state >= kernel_state.PREPROCESSED: + from loopy.kernel import KernelState + if kernel.state >= KernelState.PREPROCESSED: return kernel # {{{ cache retrieval @@ -2536,7 +2537,7 @@ def preprocess_kernel(kernel, device=None): kernel = realize_ilp(kernel) - kernel = find_temporary_scope(kernel) + kernel = find_temporary_address_space(kernel) # inferring the shape and dim_tags of the arguments involved in a function # call. 
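# Illustrative sketch (not part of the patch): after the scope ->
# address_space rename, temporaries spell out their placement as below.
# Assumes TemporaryVariable accepts the new keyword, as used by
# make_temporaries() in realize_reduction above; the name "acc" is a
# hypothetical accumulator.
import numpy as np
import loopy as lp
from loopy.kernel.data import TemporaryVariable

acc_tmp = TemporaryVariable(
        name="acc",
        dtype=np.float64,
        shape=(),
        address_space=lp.AddressSpace.PRIVATE)  # was: scope=...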
@@ -2561,7 +2562,7 @@ def preprocess_kernel(kernel, device=None): logger.info("%s: preprocess done" % kernel.name) kernel = kernel.copy( - state=kernel_state.PREPROCESSED) + state=KernelState.PREPROCESSED) # {{{ prepare for caching diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 440ac22cb..652f8b893 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1850,8 +1850,8 @@ def generate_loop_schedules(kernel, debug_args={}): def generate_loop_schedules_inner(kernel, debug_args={}): - from loopy.kernel import kernel_state - if kernel.state not in (kernel_state.PREPROCESSED, kernel_state.SCHEDULED): + from loopy.kernel import KernelState + if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") @@ -1862,7 +1862,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): debug = ScheduleDebugger(**debug_args) - preschedule = kernel.schedule if kernel.state == kernel_state.SCHEDULED else () + preschedule = kernel.schedule if kernel.state == KernelState.SCHEDULED else () prescheduled_inames = set( insn.iname @@ -1914,7 +1914,7 @@ def generate_loop_schedules_inner(kernel, debug_args={}): unscheduled_insn_ids=set(insn.id for insn in kernel.instructions), scheduled_insn_ids=frozenset(), - within_subkernel=kernel.state != kernel_state.SCHEDULED, + within_subkernel=kernel.state != KernelState.SCHEDULED, may_schedule_global_barriers=True, preschedule=preschedule, @@ -1984,11 +1984,11 @@ def generate_loop_schedules_inner(kernel, debug_args={}): new_kernel = kernel.copy( schedule=gen_sched, - state=kernel_state.SCHEDULED) + state=KernelState.SCHEDULED) from loopy.schedule.device_mapping import \ map_schedule_onto_host_or_device - if kernel.state != kernel_state.SCHEDULED: + if kernel.state != KernelState.SCHEDULED: # Device mapper only gets run once. new_kernel = map_schedule_onto_host_or_device(new_kernel) diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py index 5c41f0399..59afb07d2 100644 --- a/loopy/schedule/device_mapping.py +++ b/loopy/schedule/device_mapping.py @@ -30,8 +30,8 @@ from loopy.schedule.tools import get_block_boundaries def map_schedule_onto_host_or_device(kernel): # FIXME: Should be idempotent. 
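# Illustrative sketch (not part of the patch): the kernel_state -> KernelState
# rename keeps the same constants, so guards like the assert below can be
# written as follows (hypothetical helper).
from loopy.kernel import KernelState
from loopy.diagnostic import LoopyError

def require_scheduled(kernel):
    # mirrors the checks in get_subkernels() and the scheduler
    if kernel.state != KernelState.SCHEDULED:
        raise LoopyError("kernel must be scheduled")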
- from loopy.kernel import kernel_state - assert kernel.state == kernel_state.SCHEDULED + from loopy.kernel import KernelState + assert kernel.state == KernelState.SCHEDULED from functools import partial device_prog_name_gen = partial( diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index d1e3a85e9..e0129fd98 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -91,7 +91,8 @@ def add_extra_args_to_schedule(kernel): more_args = set(tv for tv in used_temporaries if - kernel.temporary_variables[tv].scope == AddressSpace.GLOBAL + kernel.temporary_variables[tv].address_space + == AddressSpace.GLOBAL and kernel.temporary_variables[tv].initializer is None and diff --git a/loopy/statistics.py b/loopy/statistics.py index 521eaeb5a..6c012ca21 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -919,7 +919,7 @@ class LocalMemAccessCounter(MemAccessCounter): if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( - array.scope == AddressSpace.LOCAL): + array.address_space == AddressSpace.LOCAL): if index is None: # no subscript sub_map[MemAccess( @@ -1739,8 +1739,8 @@ def gather_access_footprints(kernel, ignore_uncountable=False): from loopy.preprocess import preprocess_kernel, infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.kernel import kernel_state - if kernel.state < kernel_state.PREPROCESSED: + from loopy.kernel import KernelState + if kernel.state < KernelState.PREPROCESSED: kernel = preprocess_kernel(kernel) write_footprints = [] @@ -1793,8 +1793,8 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): from loopy.preprocess import preprocess_kernel, infer_unknown_types kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.kernel import kernel_state - if kernel.state < kernel_state.PREPROCESSED: + from loopy.kernel import KernelState + if kernel.state < KernelState.PREPROCESSED: kernel = preprocess_kernel(kernel) result = {} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 09e6e5747..2c235a0d1 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -836,13 +836,13 @@ class SubArrayRef(p.Expression): name = self.subscript.aggregate.name if name in kernel.temporary_variables: - arg = kernel.temporary_variables[name] - mem_scope = arg.scope assert name not in kernel.arg_dict + arg = kernel.temporary_variables[name] else: assert name in kernel.arg_dict arg = kernel.arg_dict[name] - mem_scope = arg.memory_address_space + + aspace = arg.address_space from loopy.kernel.array import FixedStrideArrayDimTag as DimTag from loopy.isl_helpers import simplify_via_aff @@ -861,7 +861,8 @@ class SubArrayRef(p.Expression): kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 for iname in self.swept_inames) - return ArrayArgDescriptor(mem_scope=mem_scope, + return ArrayArgDescriptor( + address_space=aspace, dim_tags=sub_dim_tags, shape=sub_shape) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 9be9db38c..eab1e6afc 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -512,7 +512,7 @@ class CASTBuilder(ASTBuilderBase): six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): - if tv.scope == AddressSpace.GLOBAL and ( + if tv.address_space == AddressSpace.GLOBAL and ( tv.initializer is not None): assert tv.read_only @@ -606,12 +606,12 @@ class CASTBuilder(ASTBuilderBase): if not tv.base_storage: for idi in decl_info: # global temp vars are mapped 
to arguments or global declarations - if tv.scope != AddressSpace.GLOBAL and ( + if tv.address_space != AddressSpace.GLOBAL and ( tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( codegen_state, schedule_index, tv, idi), - tv.scope) + tv.address_space) if tv.initializer is not None: assert tv.read_only @@ -627,7 +627,7 @@ class CASTBuilder(ASTBuilderBase): base_storage_sizes.setdefault(tv.base_storage, []).append( tv.nbytes) base_storage_to_scope.setdefault(tv.base_storage, []).append( - tv.scope) + tv.address_space) align_size = tv.dtype.itemsize @@ -643,9 +643,9 @@ class CASTBuilder(ASTBuilderBase): cast_decl = POD(self, idi.dtype, "") temp_var_decl = POD(self, idi.dtype, idi.name) - cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope) + cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space) temp_var_decl = self.wrap_temporary_decl( - temp_var_decl, tv.scope) + temp_var_decl, tv.address_space) if tv._base_storage_access_may_be_aliasing: ptrtype = _ConstPointer diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index abe49a241..0464270a3 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -82,7 +82,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): def map_variable(self, expr, type_context): tv = self.kernel.temporary_variables.get(expr.name) - if tv is not None and tv.scope == AddressSpace.PRIVATE: + if tv is not None and tv.address_space == AddressSpace.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? (See also # below in decl generation) @@ -102,7 +102,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper): ary = self.find_array(expr) if (isinstance(ary, TemporaryVariable) - and ary.scope == AddressSpace.PRIVATE): + and ary.address_space == AddressSpace.PRIVATE): # generate access code for acccess to private-index temporaries gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() @@ -308,7 +308,7 @@ class ISPCASTBuilder(CASTBuilder): shape = decl_info.shape - if temp_var.scope == AddressSpace.PRIVATE: + if temp_var.address_space == AddressSpace.PRIVATE: # FIXME: This is a pretty coarse way of deciding what # private temporaries get duplicated. Refine? 
(See also # above in expr to code mapper) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 85af4ece3..6ee5969b3 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -647,20 +647,20 @@ class OpenCLCASTBuilder(CASTBuilder): if ( isinstance(lhs_var, ArrayArg) and - lhs_var.memory_address_space == AddressSpace.GLOBAL): + lhs_var.address_space == AddressSpace.GLOBAL): var_kind = "__global" elif ( isinstance(lhs_var, ArrayArg) and - lhs_var.memory_address_space == AddressSpace.LOCAL): + lhs_var.address_space == AddressSpace.LOCAL): var_kind = "__local" elif ( isinstance(lhs_var, TemporaryVariable) - and lhs_var.scope == AddressSpace.LOCAL): + and lhs_var.address_space == AddressSpace.LOCAL): var_kind = "__local" elif ( isinstance(lhs_var, TemporaryVariable) - and lhs_var.scope == AddressSpace.GLOBAL): + and lhs_var.address_space == AddressSpace.GLOBAL): var_kind = "__global" else: raise LoopyError("unexpected kind of variable '%s' in " diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 7355ceb2c..27c4f4ab4 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -56,7 +56,7 @@ def adjust_local_temp_var_storage(kernel, device): lmem_size = cl_char.usable_local_mem_size(device) for temp_var in six.itervalues(kernel.temporary_variables): - if temp_var.scope != AddressSpace.LOCAL: + if temp_var.address_space != AddressSpace.LOCAL: new_temp_vars[temp_var.name] = \ temp_var.copy(storage_shape=temp_var.shape) continue @@ -69,7 +69,7 @@ def adjust_local_temp_var_storage(kernel, device): other_loctemp_nbytes = [ tv.nbytes for tv in six.itervalues(kernel.temporary_variables) - if tv.scope == AddressSpace.LOCAL + if tv.address_space == AddressSpace.LOCAL and tv.name != temp_var.name] storage_shape = temp_var.storage_shape @@ -702,7 +702,7 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): global_temporaries = sorted( (tv for tv in six.itervalues(codegen_state.kernel.temporary_variables) - if tv.scope == AddressSpace.GLOBAL), + if tv.address_space == AddressSpace.GLOBAL), key=lambda tv: tv.name) from pymbolic.mapper.stringifier import PREC_NONE diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 0d3db360d..f0b9814c4 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -46,7 +46,7 @@ def temp_needs_batching_if_not_sequential(tv, batch_varying_args): # do not batch read_only temps if not in # `batch_varying_args` return False - if tv.scope == AddressSpace.PRIVATE: + if tv.address_space == AddressSpace.PRIVATE: # do not batch private temps if not in `batch_varying args` return False return True diff --git a/loopy/transform/register_callable.py b/loopy/transform/callable.py similarity index 50% rename from loopy/transform/register_callable.py rename to loopy/transform/callable.py index 455c2e51e..092cef887 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/callable.py @@ -22,15 +22,19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import six + +import islpy as isl +from pymbolic.primitives import CallWithKwargs + from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) -from loopy.symbolic import IdentityMapper +from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff -from pymbolic.primitives import CallWithKwargs from loopy.kernel.function_interface import (get_kw_pos_association, register_pymbolic_calls_to_knl_callables) @@ -144,7 +148,7 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): name=function_name, is_called_from_host=False)) - # disabling global barriers for callee kernel + # FIXME disabling global barriers for callee kernel (for now) from loopy import set_options callee_kernel = set_options(callee_kernel, "disable_global_barriers") @@ -154,12 +158,321 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # }}} +# {{{ callee scoped calls collector (to support inlining) + +class CalleeScopedCallsCollector(CombineMapper): + """ + Collects the scoped functions which are a part of the callee kernel and + must be transferred to the caller kernel before inlining. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the caller kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline. + """ + + def __init__(self, callee_scoped_functions): + self.callee_scoped_functions = callee_scoped_functions + + def combine(self, values): + import operator + from functools import reduce + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + +# }}} + + +# {{{ kernel inliner mapper + +class KernelInliner(SubstitutionMapper): + """Mapper to replace variables (indices, temporaries, arguments) in the + callee kernel with variables in the caller kernel. 
+ + :arg caller: the caller kernel + :arg arg_map: dict of argument name to variables in caller + :arg arg_dict: dict of argument name to arguments in callee + """ + + def __init__(self, subst_func, caller, arg_map, arg_dict): + super(KernelInliner, self).__init__(subst_func) + self.caller = caller + self.arg_map = arg_map + self.arg_dict = arg_dict + + def map_subscript(self, expr): + if expr.aggregate.name in self.arg_map: + + aggregate = self.subst_func(expr.aggregate) + sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller + callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee + if aggregate.name in self.caller.arg_dict: + caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller + else: + caller_arg = self.caller.temporary_variables[aggregate.name] + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g. A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in callee_arg.shape): + raise LoopyError( + "Argument: {0} in callee kernel: {1} does not have " + "constant shape.".format(callee_arg)) + + flatten_index = 0 + for i, idx in enumerate(sar.get_begin_subscript().index_tuple): + flatten_index += idx*caller_arg.dim_tags[i].stride + + flatten_index += sum( + idx * tag.stride + for idx, tag in zip(outer_indices, callee_arg.dim_tags)) + + from loopy.isl_helpers import simplify_via_aff + flatten_index = simplify_via_aff(flatten_index) + + new_indices = [] + for dim_tag in caller_arg.dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + +# }}} + + +# {{{ inlining of a single call instruction + +def _inline_call_instruction(kernel, callee_knl, instruction): + """ + Returns a copy of *kernel* with the *instruction* in the *kernel* + replaced by inlining :attr:`subkernel` within it. + """ + callee_label = callee_knl.name[:4] + "_" + + # {{{ duplicate and rename inames + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + dim_type = isl.dim_type.set + + iname_map = {} + for iname in callee_knl.all_inames(): + iname_map[iname] = vng(callee_label+iname) + + new_domains = [] + new_iname_to_tags = kernel.iname_to_tags.copy() + + # transferring iname tags info from the callee to the caller kernel + for domain in callee_knl.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + + if iname in callee_knl.iname_to_tags: + new_iname_to_tags[iname_map[iname]] = ( + callee_knl.iname_to_tags[iname]) + new_domain = new_domain.set_dim_name( + dim_type, i, iname_map[iname]) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains, + iname_to_tags=new_iname_to_tags) + + # }}} + + # {{{ rename temporaries + + temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(callee_knl.temporary_variables): + new_name = vng(callee_label+name) + temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + arg_map = {} # callee arg name -> caller symbols (e.g. 
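# Worked example (annotation, not part of the patch) for the reindexing in
# KernelInliner.map_subscript above: a callee argument B with shape (6,)
# bound to a caller slab A with shape (3, 2), both row-major, with the
# SubArrayRef starting at index (0, 0) so there is no base offset.
# Callee access B[5] flattens against the callee stride:
#   flat = 5 * 1 = 5
# and is re-expanded against the caller's dim_tags (strides 2, 1):
#   i0 = 5 // 2 = 2, remainder 1
#   i1 = 1 // 1 = 1
# giving the caller access A[2, 1] -- the "A[3, 2] from outside and B[6]
# from inside" case mentioned in the comment there.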
SubArrayRef) + + assignees = instruction.assignees # writes + parameters = instruction.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + if isinstance(instruction.expression, CallWithKwargs): + from loopy.kernel.function_interface import get_kw_pos_association + + _, pos_to_kw = get_kw_pos_association(callee_knl) + kw_parameters = instruction.expression.kw_parameters + for i in range(len(parameters), len(parameters) + len(kw_parameters)): + parameters = parameters + (kw_parameters[pos_to_kw[i]],) + + assignee_pos = 0 + parameter_pos = 0 + for i, arg in enumerate(callee_knl.args): + if arg.is_output_only: + arg_map[arg.name] = assignees[assignee_pos] + assignee_pos += 1 + else: + arg_map[arg.name] = parameters[parameter_pos] + parameter_pos += 1 + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner( + make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) + + insn_id = {} + for insn in callee_knl.instructions: + insn_id[insn.id] = ing(callee_label+insn.id) + + # {{{ root and leave instructions in callee kernel + + dep_map = callee_knl.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=instruction.within_inames, + depends_on=instruction.depends_on + ) + noop_end = NoOpInstruction( + id=instruction.id, + within_inames=instruction.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for insn in callee_knl.instructions: + insn = insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(iname_map.get, insn.within_inames)) + within_inames = within_inames | instruction.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( + instruction.depends_on) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == instruction: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + # {{{ transferring the scoped functions from callee to caller + + callee_scoped_calls_collector = CalleeScopedCallsCollector( + callee_knl.scoped_functions) + callee_scoped_calls_dict = {} + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( + insn.expression))) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + 
pass + else: + raise NotImplementedError("Unknown type of instruction %s." % type( + insn)) + + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + kernel = register_pymbolic_calls_to_knl_callables(kernel, + callee_scoped_calls_dict) + + # }}} + + return kernel + +# }}} + + # {{{ inline callable kernel +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. def inline_callable_kernel(kernel, function_name): """ - Returns a copy of *kernel* with the callable kernel addresed by - *function_name* inlined. + Returns a copy of *kernel* with the callable kernel addressed by + (scoped) name *function_name* inlined. """ from loopy.preprocess import infer_arg_descr kernel = infer_arg_descr(kernel) @@ -167,25 +480,33 @@ def inline_callable_kernel(kernel, function_name): old_insns = kernel.instructions for insn in old_insns: if isinstance(insn, CallInstruction): + # FIXME This seems to use identifiers across namespaces. Why not + # check whether the function is a scoped function first? if insn.expression.function.name in kernel.scoped_functions: in_knl_callable = kernel.scoped_functions[ insn.expression.function.name] from loopy.kernel.function_interface import CallableKernel if isinstance(in_knl_callable, CallableKernel) and ( in_knl_callable.subkernel.name == function_name): - kernel = in_knl_callable.inline_within_kernel(kernel, insn) + kernel = _inline_call_instruction( + kernel, in_knl_callable.subkernel, insn) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass else: - raise NotImplementedError("Unknown instruction %s." % type(insn)) + raise NotImplementedError( + "Unknown instruction type %s" + % type(insn).__name__) return kernel # }}} -# {{{ matching caller to callee args if dimenstions dont match +# {{{ tools to match caller to callee args by (guessed) automatic reshaping + +# (This is undocumented and not recommended, but it is currently needed +# to support Firedrake.) class DimChanger(IdentityMapper): """ diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 19414424d..5b1ee6cca 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -147,7 +147,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, rule_name=None, temporary_name=None, - temporary_scope=None, temporary_is_local=None, + temporary_address_space=None, temporary_scope=None, footprint_subscripts=None, fetch_bounding_box=False, fetch_outer_inames=None): @@ -184,9 +184,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, :arg rule_name: base name of the generated temporary variable. :arg temporary_name: The name of the temporary to be used. - :arg temporary_scope: The :class:`AddressSpace` to use for the + :arg temporary_address_space: The :class:`AddressSpace` to use for the temporary. - :arg temporary_is_local: Deprecated, use *temporary_scope* instead. :arg footprint_subscripts: A list of tuples indicating the index (i.e. subscript) tuples used to generate the footprint. 
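# Illustrative sketch (not part of the patch): hypothetical use of the new
# add_prefetch keyword on an existing kernel `knl` with an array argument
# "a"; the value is forwarded to precompute() as shown below.
import loopy as lp

knl = lp.add_prefetch(knl, "a", sweep_inames=["i_inner", "j_inner"],
        temporary_address_space=lp.AddressSpace.LOCAL)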
@@ -335,7 +334,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, - temporary_scope=temporary_scope, temporary_is_local=temporary_is_local, + temporary_address_space=temporary_address_space, + temporary_scope=temporary_scope, precompute_outer_inames=fetch_outer_inames) # {{{ remove inames that were temporarily added by slice sweeps diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index f1a015413..d0edcfd78 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -336,7 +336,7 @@ class DifferentiationContext(object): if var_name in self.kernel.arg_dict: self.new_args.append( - lp.ArrayArg( + lp.GlobalArg( new_var_name, arg.dtype, shape=shape, diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 8f8593c2c..49e30a751 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -130,8 +130,8 @@ def _merge_values(item_name, val_a, val_b): # {{{ two-kernel fusion def _fuse_two_kernels(knla, knlb): - from loopy.kernel import kernel_state - if knla.state != kernel_state.INITIAL or knlb.state != kernel_state.INITIAL: + from loopy.kernel import KernelState + if knla.state != KernelState.INITIAL or knlb.state != KernelState.INITIAL: raise LoopyError("can only fuse kernels in INITIAL state") # {{{ fuse domains diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 663c60b2a..87136d017 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -33,8 +33,6 @@ __doc__ = """ """ -# {{{ main entrypoint - def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, args_to_unpack=None): """ @@ -141,12 +139,12 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, from loopy.symbolic import SubstitutionMapper # dict to store the new assignees and parameters, the mapping pattern - # from id to parameters is identical to InKernelCallable.arg_id_to_dtype + # from arg_id to parameters is identical to InKernelCallable.arg_id_to_dtype id_to_parameters = tuple(enumerate(parameters)) + tuple( (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) new_id_to_parameters = {} - for id, p in id_to_parameters: + for arg_id, p in id_to_parameters: if isinstance(p, SubArrayRef) and (p.subscript.aggregate.name in args_to_pack): new_pack_inames = ilp_inames_map.copy() # packing-specific inames @@ -185,8 +183,8 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, pack_tmp = TemporaryVariable( name=pack_name, dtype=arg_in_caller.dtype, - dim_tags=in_knl_callable.arg_id_to_descr[id].dim_tags, - shape=in_knl_callable.arg_id_to_descr[id].shape, + dim_tags=in_knl_callable.arg_id_to_descr[arg_id].dim_tags, + shape=in_knl_callable.arg_id_to_descr[arg_id].shape, scope=temp_var_scope.PRIVATE, ) @@ -207,7 +205,7 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, zip(arg_in_caller.dim_tags, p.subscript.index_tuple))) new_indices = [] - for dim_tag in in_knl_callable.arg_id_to_descr[id].dim_tags: + for dim_tag in in_knl_callable.arg_id_to_descr[arg_id].dim_tags: ind = flatten_index // dim_tag.stride flatten_index -= (dim_tag.stride * ind) new_indices.append(ind) @@ -249,7 +247,7 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, updated_swept_inames = [] for i, _ in enumerate( - in_knl_callable.arg_id_to_descr[id].shape): + 
in_knl_callable.arg_id_to_descr[arg_id].shape): updated_swept_inames.append(var(vng("i_packsweep_"+arg))) ctx = kernel.isl_context @@ -257,17 +255,18 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, set=[iname.name for iname in updated_swept_inames]) iname_set = isl.BasicSet.universe(space) for iname, axis_length in zip(updated_swept_inames, - in_knl_callable.arg_id_to_descr[id].shape): + in_knl_callable.arg_id_to_descr[arg_id].shape): iname_set = iname_set & make_slab(space, iname.name, 0, axis_length) new_domains = new_domains + [iname_set] # }}} - new_id_to_parameters[id] = SubArrayRef(tuple(updated_swept_inames), - (var(pack_name).index(tuple(updated_swept_inames)))) + new_id_to_parameters[arg_id] = SubArrayRef( + tuple(updated_swept_inames), + (var(pack_name).index(tuple(updated_swept_inames)))) else: - new_id_to_parameters[id] = p + new_id_to_parameters[arg_id] = p if packing_insns: subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) @@ -315,7 +314,4 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, return kernel -# }}} - - # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index acc21b09d..52d568975 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -268,8 +268,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, dtype=None, fetch_bounding_box=False, - temporary_scope=None, temporary_is_local=None, - compute_insn_id=None): + temporary_address_space=None, + compute_insn_id=None, + **kwargs): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. A precomputation needs two things to operate, a list of *sweep_inames* (order irrelevant) and an @@ -355,27 +356,30 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, eliminated. """ - # {{{ unify temporary_scope / temporary_is_local + # {{{ unify temporary_address_space / temporary_scope + + temporary_scope = kwargs.pop("temporary_scope", None) from loopy.kernel.data import AddressSpace - if temporary_is_local is not None: + if temporary_scope is not None: from warnings import warn - warn("temporary_is_local is deprecated. Use temporary_scope instead", + warn("temporary_scope is deprecated. 
Use temporary_address_space instead", DeprecationWarning, stacklevel=2) - if temporary_scope is not None: - raise LoopyError("may not specify both temporary_is_local and " + if temporary_address_space is not None: + raise LoopyError("may not specify both temporary_address_space and " "temporary_scope") - if temporary_is_local: - temporary_scope = AddressSpace.LOCAL - else: - temporary_scope = AddressSpace.PRIVATE + temporary_address_space = temporary_scope - del temporary_is_local + del temporary_scope # }}} + if kwargs: + raise TypeError("unrecognized keyword arguments: %s" + % ", ".join(kwargs.keys())) + # {{{ check, standardize arguments if isinstance(sweep_inames, str): @@ -847,7 +851,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, compute_dep_id = compute_insn_id added_compute_insns = [compute_insn] - if temporary_scope == AddressSpace.GLOBAL: + if temporary_address_space == AddressSpace.GLOBAL: barrier_insn_id = kernel.make_unique_instruction_id( based_on=c_subst_name+"_barrier") from loopy.kernel.instruction import BarrierInstruction @@ -959,8 +963,8 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, import loopy as lp - if temporary_scope is None: - temporary_scope = lp.auto + if temporary_address_space is None: + temporary_address_space = lp.auto new_temp_shape = tuple(abm.non1_storage_shape) @@ -971,7 +975,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, dtype=dtype, base_indices=(0,)*len(new_temp_shape), shape=tuple(abm.non1_storage_shape), - scope=temporary_scope, + address_space=temporary_address_space, dim_names=tuple(non1_storage_axis_names)) else: @@ -1009,20 +1013,20 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, temp_var = temp_var.copy(shape=new_temp_shape) - if temporary_scope == temp_var.scope: + if temporary_address_space == temp_var.address_space: pass - elif temporary_scope is lp.auto: - temporary_scope = temp_var.scope - elif temp_var.scope is lp.auto: + elif temporary_address_space is lp.auto: + temporary_address_space = temp_var.address_space + elif temp_var.address_space is lp.auto: pass else: raise LoopyError("Existing and new temporary '%s' do not " "have matching scopes (existing: %s, new: %s)" % (temporary_name, - AddressSpace.stringify(temp_var.scope), - AddressSpace.stringify(temporary_scope))) + AddressSpace.stringify(temp_var.address_space), + AddressSpace.stringify(temporary_address_space))) - temp_var = temp_var.copy(scope=temporary_scope) + temp_var = temp_var.copy(address_space=temporary_address_space) # }}} diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 0283b84f9..cca62bc52 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -441,7 +441,7 @@ class TemporarySaver(object): group_sizes, local_sizes = ( self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) - if temporary.scope == lp.AddressSpace.LOCAL: + if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. 
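            # (Annotation, not part of the patch.) A LOCAL temporary is
            # shared by all work-items of a group, so its save slot only
            # needs one copy per group: the per-work-item ("local") axes and
            # sizes are dropped below.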
del local_tags[:] local_sizes = () @@ -454,7 +454,7 @@ class TemporarySaver(object): def auto_promote_temporary(self, temporary_name): temporary = self.kernel.temporary_variables[temporary_name] - if temporary.scope == AddressSpace.GLOBAL: + if temporary.address_space == AddressSpace.GLOBAL: # Nothing to be done for global temporaries (I hope) return None @@ -673,7 +673,7 @@ class TemporarySaver(object): domain = domain.set_dim_name( isl.dim_type.set, orig_dim + dim_idx, new_iname) - if orig_temporary.is_local: + if orig_temporary.address_space == AddressSpace.LOCAL: # If the temporary has local scope, then loads / stores can # be done in parallel. from loopy.kernel.data import AutoFitLocalIndexTag diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 000000000..3b27b2d5b --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,415 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom # noqa: F401 +import loopy as lp +import pytest +import sys + + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import register_log2_lookup + + x = np.random.rand(10) + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + knl = lp.register_function_lookup(knl, register_log2_lookup) + + evt, (out, ) = knl(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + grandchild_knl = lp.make_kernel( + "{[i, j]:0<= i, j< 16}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """) + + child_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) + """) + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + child_knl = lp.register_callable_kernel( + child_knl, 'linear_combo1', grandchild_knl) + knl = lp.register_callable_kernel( + parent_knl, 'linear_combo2', child_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo2') + knl = lp.inline_callable_kernel(knl, 'linear_combo1') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_slices_with_negative_step(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + child_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """) + + parent_knl = lp.make_kernel( + "{[i, k, m]: 0<=i, k, m<16}", + """ + z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + y[i, :, k, :, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='z', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, 'linear_combo', child_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_call_with_kwargs(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 2 + + a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + 
callee_knl = lp.make_kernel( + "{[i, j]:0<=i, j < %d}" % n, + """ + h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] + <>f1[i, j] = 2*f[i, j] + p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] + """, + [ + lp.GlobalArg('f'), + lp.GlobalArg('e'), + lp.GlobalArg('h'), + lp.GlobalArg('g'), + '...']) + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, + """ + <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] + [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( + f=[j, l]: a[i, j, k, l, m], + g=[j, l]: d[i, j, k, l, m], + e=[j, l]: c[i, j, k, l, m]) + """) + + knl = lp.register_callable_kernel( + caller_knl, 'linear_combo', callee_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) + + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + h = out1.get() # h = 2c + 3a + 8b + p = out2.get() # p = 7c + 8a + 4b + h_exact = 3*a + 8*b + 2*c + p_exact = 8*a + 4*b + 7*c + + assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 + assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_hw_axes(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 4 + + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_kernel( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """) + + callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """ + ) + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") + + knl = lp.register_callable_kernel( + caller_knl, 'linear_combo', callee_knl) + + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x_dev, y=y_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( + 2*x_host+3*y_host) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_shape_translation_through_sub_array_ref(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) + + callee1 = lp.make_kernel( + "{[i]: 0<=i<6}", + """ + a[i] = 2*abs(b[i]) + """) + + callee2 = lp.make_kernel( + "{[i, j]: 0<=i<3 and 0 <= j < 2}", + """ + a[i, j] = 3*b[i, j] + """) + + callee3 = lp.make_kernel( + "{[i]: 0<=i<6}", + """ + a[i] = 5*b[i] + """) + + knl = lp.make_kernel( + "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", + """ + [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) + [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) + [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) + """) + + knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1) + knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2) + knl = lp.register_callable_kernel(knl, 'callee_fn3', callee3) + + if inline: + knl = lp.inline_callable_kernel(knl, 'callee_fn1') + knl = lp.inline_callable_kernel(knl, 'callee_fn2') + knl = lp.inline_callable_kernel(knl, 'callee_fn3') + + knl = lp.set_options(knl, "write_cl") + knl = 
lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) + + y1 = out_dict['y1'].get() + y2 = out_dict['y2'].get() + y3 = out_dict['y3'].get() + + assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 + assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 + assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 + + +def test_multi_arg_array_call(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + import pymbolic.primitives as p + n = 10 + acc_i = p.Variable("acc_i") + i = p.Variable("i") + index = p.Variable("index") + a_i = p.Subscript(p.Variable("a"), p.Variable("i")) + argmin_kernel = lp.make_kernel( + "{[i]: 0 <= i < n}", + [ + lp.Assignment(id="init2", assignee=index, + expression=0), + lp.Assignment(id="init1", assignee=acc_i, + expression="214748367"), + lp.Assignment(id="insn", assignee=index, + expression=p.If(p.Expression.eq(acc_i, a_i), i, index), + depends_on="update"), + lp.Assignment(id="update", assignee=acc_i, + expression=p.Variable("min")(acc_i, a_i), + depends_on="init1,init2")]) + + argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) + + knl = lp.make_kernel( + "{[i]:0<=i 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_loopy.py b/test/test_loopy.py index c069916e5..accf9c1df 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -69,7 +69,7 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory): """, [lp.TemporaryVariable( 'cnst', shape=('n'), initializer=cnst, - scope=lp.temp_var_scope.GLOBAL, + scope=lp.AddressSpace.GLOBAL, read_only=True), '...']) knl = lp.fix_parameters(knl, n=16) knl = lp.add_barrier(knl, "id:first", "id:second") @@ -1070,7 +1070,7 @@ def test_atomic(ctx_factory, dtype): def test_atomic_load(ctx_factory, dtype): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - from loopy.kernel.data import temp_var_scope as scopes + from loopy.kernel.data import AddressSpace n = 10 vec_width = 4 @@ -1108,7 +1108,7 @@ def test_atomic_load(ctx_factory, dtype): lp.GlobalArg("a", dtype, shape=lp.auto), lp.GlobalArg("b", dtype, shape=lp.auto), lp.TemporaryVariable('temp', dtype, for_atomic=True, - scope=scopes.LOCAL), + scope=AddressSpace.LOCAL), "..." ], silenced_warnings=["write_race(init)", "write_race(temp_sum)"]) @@ -1895,8 +1895,8 @@ def test_global_barrier(ctx_factory): print(knl) knl = lp.preprocess_kernel(knl) - assert knl.temporary_variables["z"].scope == lp.temp_var_scope.GLOBAL - assert knl.temporary_variables["v"].scope == lp.temp_var_scope.GLOBAL + assert knl.temporary_variables["z"].address_space == lp.AddressSpace.GLOBAL + assert knl.temporary_variables["v"].address_space == lp.AddressSpace.GLOBAL print(knl) @@ -2023,7 +2023,7 @@ def test_temp_initializer(ctx_factory, src_order, tmp_order): lp.TemporaryVariable("tmp", initializer=a, shape=lp.auto, - scope=lp.temp_var_scope.PRIVATE, + scope=lp.AddressSpace.PRIVATE, read_only=True, order=tmp_order), "..." @@ -2048,7 +2048,7 @@ def test_const_temp_with_initializer_not_saved(): lp.TemporaryVariable("tmp", initializer=np.arange(10), shape=lp.auto, - scope=lp.temp_var_scope.PRIVATE, + scope=lp.AddressSpace.PRIVATE, read_only=True), "..." 
], @@ -2264,7 +2264,6 @@ def test_integer_reduction(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - from loopy.kernel.data import temp_var_scope as scopes from loopy.types import to_loopy_type n = 200 @@ -2272,7 +2271,7 @@ def test_integer_reduction(ctx_factory): var_int = np.random.randint(1000, size=n).astype(vtype) var_lp = lp.TemporaryVariable('var', initializer=var_int, read_only=True, - scope=scopes.PRIVATE, + scope=lp.AddressSpace.PRIVATE, dtype=to_loopy_type(vtype), shape=lp.auto) @@ -2453,8 +2452,6 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): - from loopy.kernel.data import temp_var_scope as scopes - # make simple barrier'd kernel knl = lp.make_kernel('{[i]: 0 <= i < 10}', """ @@ -2465,7 +2462,7 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): end """, [lp.TemporaryVariable("a", np.float32, shape=(10,), order='C', - scope=scopes.LOCAL), + scope=lp.AddressSpace.LOCAL), lp.GlobalArg("b", np.float32, shape=(11,), order='C')], seq_dependencies=True) @@ -2690,7 +2687,6 @@ def test_wildcard_dep_matching(): def test_preamble_with_separate_temporaries(ctx_factory): - from loopy.kernel.data import temp_var_scope as scopes # create a function mangler # and finally create a test @@ -2717,7 +2713,8 @@ def test_preamble_with_separate_temporaries(ctx_factory): """, [lp.GlobalArg('out', shape=('n',)), lp.TemporaryVariable( - 'offsets', shape=(offsets.size,), initializer=offsets, scope=scopes.GLOBAL, + 'offsets', shape=(offsets.size,), initializer=offsets, + scope=lp.AddressSpace.GLOBAL, read_only=True), lp.GlobalArg('data', shape=(data.size,), dtype=np.float64)], ) @@ -2851,7 +2848,7 @@ def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier): """ % second_index, [ lp.TemporaryVariable("a", lp.auto, shape=(256,), - scope=lp.temp_var_scope.LOCAL), + scope=lp.AddressSpace.LOCAL), ]) knl = lp.tag_inames(knl, "i:l.0") diff --git a/test/test_transform.py b/test/test_transform.py index 6e441976a..ed184fb50 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -182,370 +182,6 @@ def test_add_barrier(ctx_factory): assert (np.linalg.norm(out-2*a.T) < 1e-16) -def test_register_function_lookup(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - from testlib import register_log2_lookup - - x = np.random.rand(10) - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) - - knl = lp.make_kernel( - "{[i]: 0<=i<10}", - """ - y[i] = log2(x[i]) - """) - knl = lp.register_function_lookup(knl, register_log2_lookup) - - evt, (out, ) = knl(queue, x=x) - - assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 2 ** 4 - - x = np.random.rand(n, n, n, n, n) - y = np.random.rand(n, n, n, n, n) - - grandchild_knl = lp.make_kernel( - "{[i, j]:0<= i, j< 16}", - """ - c[i, j] = 2*a[i, j] + 3*b[i, j] - """) - - child_knl = lp.make_kernel( - "{[i, j]:0<=i, j < 16}", - """ - [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) - """) - - parent_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", - """ - [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], - [j, l]: y[i, j, k, l, m]) - """, - kernel_data=[ - lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 
16, 16)), '...'], - ) - - child_knl = lp.register_callable_kernel( - child_knl, 'linear_combo1', grandchild_knl) - knl = lp.register_callable_kernel( - parent_knl, 'linear_combo2', child_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo2') - knl = lp.inline_callable_kernel(knl, 'linear_combo1') - - evt, (out, ) = knl(queue, x=x, y=y) - - assert (np.linalg.norm(2*x+3*y-out)/( - np.linalg.norm(2*x+3*y))) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_slices_with_negative_step(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 2 ** 4 - - x = np.random.rand(n, n, n, n, n) - y = np.random.rand(n, n, n, n, n) - - child_knl = lp.make_kernel( - "{[i, j]:0<=i, j < 16}", - """ - g[i, j] = 2*e[i, j] + 3*f[i, j] - """) - - parent_knl = lp.make_kernel( - "{[i, k, m]: 0<=i, k, m<16}", - """ - z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], - y[i, :, k, :, m]) - """, - kernel_data=[ - lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='z', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], - ) - - knl = lp.register_callable_kernel( - parent_knl, 'linear_combo', child_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out, ) = knl(queue, x=x, y=y) - - assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( - np.linalg.norm(2*x+3*y))) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl_with_call_with_kwargs(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 2 ** 2 - - a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - - callee_knl = lp.make_kernel( - "{[i, j]:0<=i, j < %d}" % n, - """ - h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] - <>f1[i, j] = 2*f[i, j] - p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] - """, - [lp.ArrayArg('f'), lp.ArrayArg('e'), lp.ArrayArg('h'), - lp.ArrayArg('g'), '...']) - - caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, - """ - <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] - [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( - f=[j, l]: a[i, j, k, l, m], - g=[j, l]: d[i, j, k, l, m], - e=[j, l]: c[i, j, k, l, m]) - """) - - knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) - - a = a_dev.get() - b = b_dev.get() - c = c_dev.get() - - h = out1.get() # h = 2c + 3a + 8b - p = out2.get() # p = 7c + 8a + 4b - h_exact = 3*a + 8*b + 2*c - p_exact = 8*a + 4*b + 7*c - - assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 - assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl_with_hw_axes(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 2 ** 4 - - x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - - callee_knl = lp.make_kernel( - "{[i, j]:0<=i, j < 16}", - """ - g[i, j] = 2*e[i, j] + 3*f[i, j] - """) - - callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") - - caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 
0<=i, j, k, l, m<16}", - """ - [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], - [j, l]: y[i, j, k, l, m]) - """ - ) - caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") - - knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl) - - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out, ) = knl(queue, x=x_dev, y=y_dev) - - x_host = x_dev.get() - y_host = y_dev.get() - - assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( - 2*x_host+3*y_host) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_shape_translation_through_sub_array_ref(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) - x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) - x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) - - callee1 = lp.make_kernel( - "{[i]: 0<=i<6}", - """ - a[i] = 2*abs(b[i]) - """) - - callee2 = lp.make_kernel( - "{[i, j]: 0<=i<3 and 0 <= j < 2}", - """ - a[i, j] = 3*b[i, j] - """) - - callee3 = lp.make_kernel( - "{[i]: 0<=i<6}", - """ - a[i] = 5*b[i] - """) - - knl = lp.make_kernel( - "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", - """ - [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) - [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) - [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) - """) - - knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1) - knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2) - knl = lp.register_callable_kernel(knl, 'callee_fn3', callee3) - - if inline: - knl = lp.inline_callable_kernel(knl, 'callee_fn1') - knl = lp.inline_callable_kernel(knl, 'callee_fn2') - knl = lp.inline_callable_kernel(knl, 'callee_fn3') - - knl = lp.set_options(knl, "write_cl") - knl = lp.set_options(knl, "return_dict") - evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) - - y1 = out_dict['y1'].get() - y2 = out_dict['y2'].get() - y3 = out_dict['y3'].get() - - assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 - assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 - assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 - - -def test_multi_arg_array_call(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - import pymbolic.primitives as p - n = 10 - acc_i = p.Variable("acc_i") - i = p.Variable("i") - index = p.Variable("index") - a_i = p.Subscript(p.Variable("a"), p.Variable("i")) - argmin_kernel = lp.make_kernel( - "{[i]: 0 <= i < n}", - [ - lp.Assignment(id="init2", assignee=index, - expression=0), - lp.Assignment(id="init1", assignee=acc_i, - expression="214748367"), - lp.Assignment(id="insn", assignee=index, - expression=p.If(p.Expression.eq(acc_i, a_i), i, index), - depends_on="update"), - lp.Assignment(id="update", assignee=acc_i, - expression=p.Variable("min")(acc_i, a_i), - depends_on="init1,init2")]) - - argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) - - knl = lp.make_kernel( - "{[i]:0<=i Date: Fri, 29 Jun 2018 19:48:37 +0100 Subject: [PATCH 222/916] minor update --- loopy/types.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/loopy/types.py b/loopy/types.py index d52e029a5..59d605c85 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -196,6 +196,9 @@ class OpaqueType(LoopyType): def is_complex(self): return False + def involves_complex(self): + return False + # }}} -- GitLab From 2f430adffb1d2eb4933f2c6ec93eb951f3927c19 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Jul 2018 20:24:57 -0500 Subject: [PATCH 
223/916] Hunk edits to isolate the new function interface --- doc/index.rst | 1 + loopy/__init__.py | 8 + loopy/check.py | 102 +++++++- loopy/codegen/__init__.py | 54 ++++ loopy/kernel/__init__.py | 49 ++-- loopy/kernel/creation.py | 156 +++++++++++- loopy/kernel/tools.py | 8 + loopy/library/function.py | 39 +++ loopy/library/random123.py | 104 ++++---- loopy/library/reduction.py | 216 +++++++--------- loopy/preprocess.py | 359 +++++++++++++++++++++++++++ loopy/statistics.py | 9 +- loopy/symbolic.py | 86 ++++++- loopy/target/__init__.py | 7 +- loopy/target/c/__init__.py | 233 ++++++++--------- loopy/target/c/codegen/expression.py | 84 ++----- loopy/target/cuda.py | 84 +++++-- loopy/target/opencl.py | 182 +++++++++----- loopy/target/pyopencl.py | 110 +++++--- loopy/target/python.py | 52 ++-- loopy/transform/diff.py | 9 +- loopy/type_inference.py | 183 ++++++++++++-- test/testlib.py | 40 +++ 23 files changed, 1616 insertions(+), 559 deletions(-) diff --git a/doc/index.rst b/doc/index.rst index d862a8acd..0644b34c4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -68,6 +68,7 @@ Please check :ref:`installation` to get started. ref_creation ref_kernel ref_transform + ref_call ref_other misc diff --git a/loopy/__init__.py b/loopy/__init__.py index f50ce237c..d541f1dae 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -51,6 +51,8 @@ from loopy.kernel.data import ( TemporaryVariable, SubstitutionRule, CallMangleInfo) +from loopy.kernel.function_interface import ( + ScalarCallable) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -119,6 +121,8 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier +from loopy.transform.callable import register_function_lookup + # }}} from loopy.type_inference import infer_unknown_types @@ -168,6 +172,8 @@ __all__ = [ "CallInstruction", "CInstruction", "NoOpInstruction", "BarrierInstruction", + "ScalarCallable", + "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", "AddressSpace", "temp_var_scope", # temp_var_scope is deprecated @@ -230,6 +236,8 @@ __all__ = [ "add_barrier", + "register_function_lookup", + # }}} "get_dot_dependency_graph", diff --git a/loopy/check.py b/loopy/check.py index 84f3b04e0..dd96c1ba6 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -27,9 +27,13 @@ from six.moves import range from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper +from loopy.symbolic import WalkMapper, CombineMapper, ScopedFunction from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -55,6 +59,74 @@ def check_identifiers_in_subst_rules(knl): "kernel-global identifiers" % (knl.name, ", ".join(deps-rule_allowed_identifiers))) + +class UnscopedCallCollector(CombineMapper): + """ + Collects all the unscoped calls within a kernel. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline. 
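    **Example:** (illustrative) mapping the expression
    ``ScopedFunction('sin')(x[i]) + my_func(y[i])``, where ``my_func`` is a
    hypothetical function that no scoper recognized, returns
    ``frozenset({'my_func'})``; the already-scoped ``sin`` contributes
    nothing.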
+ """ + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from loopy.library.reduction import ArgExtOp + if not isinstance(expr.function, (ScopedFunction, ArgExtOp)): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if not isinstance(expr.function, ScopedFunction): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_scoped(kernel): + """ Checks if all the calls in the instruction expression have been scoped, + otherwise indicates to what all calls we await signature. Refer + :class:`loopy.symbolic.ScopedFunction` for a detailed explanation of a + scoped function. + """ + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnscopedCallCollector()(subst_expander( + insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' obtained -- register a " + "function or a kernel corresponding to it." % + set(unscoped_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown type of instruction %s" % type(insn).__name__) + # }}} @@ -113,6 +185,18 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def _get_all_unique_iname_tags(kernel): + """Returns a set of all the iname tags used in *kernel* that + inherit from :class:`loopy.kernel.data.UniqueTag`. 
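    **Example:** (illustrative) for a kernel in which iname ``i`` is tagged
    ``"l.0"`` and iname ``j`` is untagged, the result is the singleton set
    containing the :class:`loopy.kernel.data.LocalIndexTag` for axis 0.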
+ """ + from loopy.kernel.data import UniqueTag + iname_tags = [kernel.iname_to_tag.get(iname) for iname in + kernel.all_inames()] + return set( + tag for tag in iname_tags if + isinstance(tag, UniqueTag)) + + def check_multiple_tags_allowed(kernel): from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag, UnrollTag, ForceSequentialTag, IlpBaseTag, filter_iname_tags_by_type) @@ -129,6 +213,7 @@ def check_multiple_tags_allowed(kernel): def check_for_double_use_of_hw_axes(kernel): from loopy.kernel.data import UniqueTag + from loopy.kernel.instruction import CallInstruction for insn in kernel.instructions: insn_tag_keys = set() @@ -141,6 +226,21 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) + # check usage of iname tags in the callee kernel + if isinstance(insn, CallInstruction): + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + # check for collision in iname_tag keys in the instruction + # due to the callee kernel + common_iname_tags = [tag for tag in + _get_all_unique_iname_tags(in_knl_callable.subkernel) + if tag.key in insn_tag_keys] + if common_iname_tags: + raise LoopyError("instruction '%s' has multiple " + "inames tagged '%s'" % (insn.id, + common_iname_tags.pop())) + def check_for_inactive_iname_access(kernel): for insn in kernel.instructions: diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1b..16fef45b5 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -32,6 +32,16 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from cgen import Collection +from loopy.symbolic import CombineMapper + +from loopy.kernel.instruction import ( + Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, + CInstruction, _DataObliviousInstruction, MultiAssignmentBase) + +from functools import reduce + + import logging logger = logging.getLogger(__name__) @@ -362,6 +372,32 @@ code_gen_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) +class InKernelCallablesCollector(CombineMapper): + """ + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. + """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_scoped_function(self, expr): + return frozenset([self.kernel.scoped_functions[ + expr.name]]) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + class PreambleInfo(ImmutableRecord): """ .. attribute:: kernel @@ -506,6 +542,24 @@ def generate_code_v2(kernel): for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) + # {{{ collect preambles from all the in kernel callables. 
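    # Each resolved callable may contribute target-specific preamble code
    # (for instance the random123 generators or the argmin/argmax helper
    # functions emitted by the reduction callables), so every callable that
    # appears in an instruction expression is asked for its preambles here.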
+ + in_knl_callable_collector = InKernelCallablesCollector(kernel) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + for in_knl_callable in in_knl_callable_collector(insn.expression): + preambles.extend(in_knl_callable.generate_preambles(kernel.target)) + + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown instruction type '%s'" + % type(insn).__name__) + + # }}} + codegen_result = codegen_result.copy(device_preambles=preambles) # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6b0033808..e89455d30 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -37,10 +37,6 @@ import re from pytools import UniqueNameGenerator, generate_unique_names -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted from loopy.diagnostic import StaticValueFindingError @@ -186,6 +182,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: function_manglers .. attribute:: symbol_manglers + .. attribute:: function_scopers + + A list of functions of signature ``(target, name)`` returning a + :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + .. attribute:: substitutions a mapping from substitution names to @@ -238,6 +239,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): iname_to_tags=None, substitutions=None, function_manglers=None, + function_scopers=None, + scoped_functions={}, symbol_manglers=[], iname_slab_increments=None, @@ -277,15 +280,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if substitutions is None: substitutions = {} if function_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] - if symbol_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] + function_manglers = [] if iname_slab_increments is None: iname_slab_increments = {} @@ -348,6 +343,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT + if function_scopers is None: + # populate the function scopers from the target and the loopy + # specific callable scopers + + from loopy.library.function import loopy_specific_callable_scopers + function_scopers = [loopy_specific_callable_scopers] + ( + target.get_device_ast_builder().function_scopers()) + ImmutableRecordWithoutPickling.__init__(self, domains=domains, instructions=instructions, @@ -367,6 +370,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, function_manglers=function_manglers, + function_scopers=function_scopers, + scoped_functions=scoped_functions, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, @@ -380,7 +385,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling + # {{{ function mangling/scoping def mangle_function(self, identifier, arg_dtypes, ast_builder=None): if ast_builder is None: @@ -423,6 +428,20 @@ class LoopKernel(ImmutableRecordWithoutPickling): return None + def find_scoped_function_identifier(self, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if the + :arg:`identifier` is known to any kernel function scoper, otherwise 
returns + *None*. + """ + for scoper in self.function_scopers: + in_knl_callable = scoper(self.target, identifier) + if in_knl_callable: + return in_knl_callable + + return None + # }}} # {{{ symbol mangling @@ -1505,7 +1524,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): "preamble_generators", "function_manglers", + "function_scopers", "symbol_manglers", + "scoped_functions", ) def update_persistent_hash(self, key_hash, key_builder): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index c2b54cf8b..8b371b47d 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -24,16 +24,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - import numpy as np from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids -from loopy.symbolic import IdentityMapper, WalkMapper +from loopy.symbolic import ( + IdentityMapper, WalkMapper, SubArrayRef, + RuleAwareIdentityMapper) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace) +from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, + CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -1139,7 +1143,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() - from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace + from loopy.kernel.data import ValueArg, ArrayArg import loopy as lp if arg_name in self.all_params: @@ -1835,6 +1839,148 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # }}} +# {{{ scope functions + +class FunctionScoper(RuleAwareIdentityMapper): + """ + Mapper to convert the ``function`` attribute of a + :class:`pymbolic.primitives.Call` known in the kernel as instances of + :class:`loopy.symbolic.ScopedFunction`. A function is known in the + *kernel*, :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` + returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + + log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + + unknown_function(y) + ScopedFunction('log')(z)``. + + :arg rule_mapping_context: An instance of + :class:`loopy.symbolic.RuleMappingContext`. + :arg function_ids: A container with instances of :class:`str` indicating + the function identifiers to look for while scoping functions. 
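    Besides rewriting the expression, each successfully resolved identifier
    is recorded in :attr:`scoped_functions`; for the example above this
    mapping would (illustratively) associate ``"sin"`` and ``"log"`` with
    the target's :class:`loopy.kernel.function_interface.InKernelCallable`
    instances, while ``unknown_function`` is left untouched.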
+ """ + def __init__(self, rule_mapping_context, kernel): + super(FunctionScoper, self).__init__(rule_mapping_context) + self.kernel = kernel + self.scoped_functions = {} + + def map_call(self, expr, expn_state): + from loopy.symbolic import ScopedFunction + if not isinstance(expr.function, ScopedFunction): + + # search the kernel for the function + in_knl_callable = self.kernel.find_scoped_function_identifier( + expr.function.name) + if in_knl_callable: + # associate the newly created ScopedFunction with the + # resolved in-kernel callable + self.scoped_functions[expr.function.name] = in_knl_callable + + return type(expr)( + ScopedFunction(expr.function.name), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + + # this is an unknown function as of yet, do not modify it + return super(FunctionScoper, self).map_call(expr, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + # FIXME duplicated logic with map_call + + from loopy.symbolic import ScopedFunction + if not isinstance(expr.function, ScopedFunction): + + # search the kernel for the function. + in_knl_callable = self.kernel.find_scoped_function_identifier( + expr.function.name) + + if in_knl_callable: + # associate the newly created ScopedFunction with the + # resolved in-kernel callable + self.scoped_functions[expr.function.name] = in_knl_callable + return type(expr)( + ScopedFunction(expr.function.name), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + + # this is an unknown function as of yet, do not modify it + return super(FunctionScoper, self).map_call_with_kwargs(expr, + expn_state) + + def map_reduction(self, expr, expn_state): + from loopy.library.reduction import (MaxReductionOperation, + MinReductionOperation, ArgMinReductionOperation, + ArgMaxReductionOperation, _SegmentedScalarReductionOperation, + SegmentedOp) + from loopy.library.reduction import ArgExtOp + + # note down the extra functions arising due to certain reductions + + # FIXME Discuss this. It cannot stay the way it is, because non-built-in + # reductions cannot add themselves to this list. We may need to change + # the reduction interface. Why don't reductions generate scoped functions + # in the first place? 
+ if isinstance(expr.operation, MaxReductionOperation): + self.scoped_functions["max"] = ( + self.kernel.find_scoped_function_identifier("max")) + elif isinstance(expr.operation, MinReductionOperation): + self.scoped_functions["min"] = ( + self.kernel.find_scoped_function_identifier("min")) + elif isinstance(expr.operation, ArgMaxReductionOperation): + self.scoped_functions["max"] = ( + self.kernel.find_scoped_function_identifier("max")) + self.scoped_functions["make_tuple"] = ( + self.kernel.find_scoped_function_identifier("make_tuple")) + self.scoped_functions[ArgExtOp(expr.operation)] = ( + self.kernel.find_scoped_function_identifier(expr.operation)) + elif isinstance(expr.operation, ArgMinReductionOperation): + self.scoped_functions["min"] = ( + self.kernel.find_scoped_function_identifier("min")) + self.scoped_functions["make_tuple"] = ( + self.kernel.find_scoped_function_identifier("make_tuple")) + self.scoped_functions[ArgExtOp(expr.operation)] = ( + self.kernel.find_scoped_function_identifier(expr.operation)) + elif isinstance(expr.operation, _SegmentedScalarReductionOperation): + self.scoped_functions["make_tuple"] = ( + self.kernel.find_scoped_function_identifier("make_tuple")) + self.scoped_functions[SegmentedOp(expr.operation)] = ( + self.kernel.find_scoped_function_identifier(expr.operation)) + + return super(FunctionScoper, self).map_reduction(expr, expn_state) + + +def scope_functions(kernel): + """ + Returns a kernel with the pymbolic nodes involving known functions realized + as instances of :class:`loopy.symbolic.ScopedFunction`, along with the + resolved functions being added to the ``scoped_functions`` dictionary of + the kernel. + """ + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + function_scoper = FunctionScoper(rule_mapping_context, kernel) + + # scoping fucntions and collecting the scoped functions + kernel_with_scoped_functions = rule_mapping_context.finish_kernel( + function_scoper.map_kernel(kernel)) + + # updating the functions collected during the scoped functions + updated_scoped_functions = kernel.scoped_functions.copy() + updated_scoped_functions.update(function_scoper.scoped_functions) + + return kernel_with_scoped_functions.copy( + scoped_functions=updated_scoped_functions) + +# }}} + + # {{{ kernel creation top-level def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): @@ -2174,6 +2320,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + knl = scope_functions(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 95c3c336c..1d79a86d7 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1877,7 +1877,15 @@ def infer_arg_is_output_only(kernel): else: new_args.append(arg.copy(is_output_only=False)) elif isinstance(arg, ConstantArg): +<<<<<<< HEAD + if arg.is_output_only: + raise LoopyError("Constant Argument %s cannot have " + "is_output_only True" % arg.name) + else: + new_args.append(arg.copy(is_output_only=False)) +======= new_args.append(arg) +>>>>>>> master else: raise NotImplementedError("Unkonwn argument type %s." 
% type(arg)) diff --git a/loopy/library/function.py b/loopy/library/function.py index 9d557ac9f..4873eca91 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -22,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from loopy.kernel.function_interface import ScalarCallable + def default_function_mangler(kernel, name, arg_dtypes): from loopy.library.reduction import reduction_function_mangler @@ -56,4 +58,41 @@ def tuple_function_mangler(kernel, name, arg_dtypes): return None +class MakeTupleCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple") + + def with_descrs(self, arg_id_to_descr): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), + (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) + + return self.copy(arg_id_to_descr=new_arg_id_to_descr) + + +class IndexOfCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = kernel.index_dtype + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + + +def loopy_specific_callable_scopers(target, identifier): + if identifier == "make_tuple": + return MakeTupleCallable(name="make_tuple") + + if identifier in ["indexof", "indexof_vec"]: + return IndexOfCallable(name=identifier) + + from loopy.library.reduction import reduction_scoper + return reduction_scoper(target, identifier) + + # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index b8633114d..a2880bfb8 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -27,6 +27,7 @@ THE SOFTWARE. from pytools import ImmutableRecord from mako.template import Template +from loopy.kernel.function_interface import ScalarCallable import numpy as np @@ -163,60 +164,73 @@ double${ width } ${ name }_f64( # }}} -def random123_preamble_generator(preamble_info): - for f in preamble_info.seen_functions: - try: - rng_variant = FUNC_NAMES_TO_RNG[f.name] - except KeyError: - continue +class Random123Callable(ScalarCallable): + """ + Records information about for the random123 functions. 
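    **Example:** (illustrative) for the identifier ``philox4x32``,
    ``with_types`` maps the counter argument and both results to the
    target's 4-wide ``uint32`` vector type, maps the key argument to the
    corresponding key type, and sets ``name_in_target`` to
    ``philox4x32_gen``.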
+ """ + + def with_types(self, arg_id_to_dtype, kernel): + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + name = self.name + target = kernel.target + + rng_variant = FUNC_NAMES_TO_RNG[name] + + from loopy.types import NumpyType + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + fn = rng_variant.full_name + if name == fn: + new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=fn+"_gen") + + elif name == fn + "_f32": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name) + + elif name == fn + "_f64": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name) + + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + def generate_preambles(self, target): + rng_variant = FUNC_NAMES_TO_RNG[self.name] from loopy.target.pyopencl import PyOpenCLTarget yield ("90-random123-"+rng_variant.full_name, PREAMBLE_TEMPLATE.render( is_pyopencl_target=isinstance( - preamble_info.kernel.target, + target, PyOpenCLTarget), rng_variant=rng_variant, )) + return -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - except KeyError: - return None - - from loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None + +def random123_function_scoper(target, identifier): + if identifier in FUNC_NAMES_TO_RNG: + return Random123Callable(name=identifier) + + return None # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 8ed5cbe56..ca2f02347 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -24,6 +24,8 @@ THE SOFTWARE. 
from pymbolic import var +from loopy.symbolic import ScopedFunction +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier @@ -180,7 +182,7 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("max")(operand1, operand2) + return ScopedFunction("max")(operand1, operand2) class MinReductionOperation(ScalarReductionOperation): @@ -188,7 +190,7 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("min")(operand1, operand2) + return ScopedFunction("min")(operand1, operand2) # {{{ base class for symbolic reduction ops @@ -237,7 +239,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype): scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ScopedFunction("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): @@ -254,7 +256,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return SegmentedOp(self)(*(operand1 + operand2)) + return ScopedFunction(SegmentedOp(self))(*(operand1 + operand2)) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -268,29 +270,6 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): op = "((%s) * (%s))" which = "product" - -def get_segmented_function_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? 
op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) - # }}} @@ -313,7 +292,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ScopedFunction("make_tuple")(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)) def __str__(self): @@ -330,7 +309,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + return ScopedFunction(ArgExtOp(self))(*(operand1 + operand2)) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -344,38 +323,6 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): update_comparison = "<=" neutral_sign = +1 - -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { - *index_out = index2; - return op2; - } - else - { - *index_out = index1; - return op1; - } - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=kernel.target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) - # }}} @@ -429,70 +376,91 @@ def parse_reduction_op(name): # }}} -def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, index_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, index_dtype), - arg_dtypes=( - scalar_dtype, - index_dtype, - scalar_dtype, - index_dtype), - ) - - elif isinstance(func_id, SegmentedOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, segment_flag_dtype), - arg_dtypes=( - scalar_dtype, - segment_flag_dtype, - scalar_dtype, - segment_flag_dtype), - ) +# {{{ reduction specific callables + +class ReductionCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = result_dtypes[0] + new_arg_id_to_dtype[-2] = result_dtypes[1] + name_in_target = self.name.prefix(scalar_dtype, index_dtype) + "_op" + + 
return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name_in_target) + + def with_descr(self, arg_id_to_descr): + from loopy.library.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = arg_id_to_descr.copy() + new_arg_id_to_descr[-1] = ValueArgDescriptor() + return self.copy(arg_id_to_descr=arg_id_to_descr) + + def generate_preambles(self, target): + if isinstance(self.name, _ArgExtremumReductionOperation): + op = self.name + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, index_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(index_t)s index1, + %(scalar_t)s op2, %(index_t)s index2, + %(index_t)s *index_out) + { + if (op2 %(comp)s op1) + { + *index_out = index2; + return op2; + } + else + { + *index_out = index1; + return op1; + } + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + elif isinstance(self.name, _SegmentedScalarReductionOperation): + op = self.name + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? op2 : %(combined)s; + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return + + +def reduction_scoper(target, identifier): + if isinstance(identifier, (_ArgExtremumReductionOperation, + _SegmentedScalarReductionOperation)): + return ReductionCallable(name=identifier) return None - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +# }}} # vim: fdm=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fc950c78e..6beadb3de 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -27,6 +27,7 @@ import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) +from functools import reduce import islpy as isl @@ -37,6 +38,10 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.symbolic import CombineMapper + +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + CallInstruction, _DataObliviousInstruction) import logging logger = logging.getLogger(__name__) @@ -2108,6 +2113,350 @@ def check_atomic_loads(kernel): # }}} +# {{{ arg_descr_inference + +class 
ArgDescrInferenceMapper(CombineMapper): + """ + Returns a set of instances of :class:`tuple` (expr, + in_kernel_callable). The mapped `in_kernel_callable` of the + :class:`InKernelCallable` are descriptor specialized for the given + arguments. + """ + + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + # FIXME logic duplication between map_call and map_call_with_kwargs + def map_call(self, expr, **kwargs): + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import SubArrayRef, ScopedFunction + + # ignoring if the call is not to a ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) + + # descriptors for the args + arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel)) + if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) + for i, par in enumerate(expr.parameters)) + + assignee_id_to_descr = {} + + # assignee descriptor + if 'assignees' in kwargs: + # If supplied with assignees then this is a CallInstruction + assignees = kwargs['assignees'] + assert isinstance(assignees, tuple) + for i, par in enumerate(assignees): + if isinstance(par, SubArrayRef): + assignee_id_to_descr[-i-1] = ( + par.get_array_arg_descriptor(self.kernel)) + else: + assignee_id_to_descr[-i-1] = ValueArgDescriptor() + + # gathering all the descriptors + # TODO: I dont like in place updates. Change this to somthing else. + # Perhaps make a function? + combined_arg_id_to_descr = arg_id_to_descr.copy() + combined_arg_id_to_descr.update(assignee_id_to_descr) + + # specializing the function according to the parameter description + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_descrs( + combined_arg_id_to_descr)) + + # collecting the descriptors for args, kwargs, assignees + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) + + def map_call_with_kwargs(self, expr, **kwargs): + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import SubArrayRef + + # descriptors for the args and kwargs: + arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel)) + if isinstance(par, SubArrayRef) else ValueArgDescriptor() + for i, par in tuple(enumerate(expr.parameters)) + + tuple(expr.kw_parameters.items())) + + assignee_id_to_descr = {} + + if 'assignees' in kwargs: + # If supplied with assignees then this is a CallInstruction + assignees = kwargs['assignees'] + assert isinstance(assignees, tuple) + for i, par in enumerate(assignees): + if isinstance(par, SubArrayRef): + assignee_id_to_descr[-i-1] = ( + par.get_array_arg_descriptor(self.kernel)) + else: + assignee_id_to_descr[-i-1] = ValueArgDescriptor() + + # gathering all the descriptors + # TODO: I dont like in place updates. Change this to somthing else. + # Perhaps make a function? 
+ combined_arg_id_to_descr = arg_id_to_descr.copy() + combined_arg_id_to_descr.update(assignee_id_to_descr) + + # specializing the function according to the parameter description + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_descrs( + combined_arg_id_to_descr)) + + # collecting the descriptors for args, kwargs, assignees + return ( + frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) + + def map_constant(self, expr, **kwargs): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def infer_arg_descr(kernel): + """ + Returns a copy of *kernel* with the argument shapes and strides matching for + scoped functions in the *kernel*. Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. + """ + + arg_description_modifier = ArgDescrInferenceMapper(kernel) + pymbolic_calls_to_functions = set() + + for insn in kernel.instructions: + + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important in + # determining the arg_id_to_dtype + pymbolic_calls_to_functions.update( + arg_description_modifier(insn.expression, + assignees=insn.assignees)) + elif isinstance(insn, MultiAssignmentBase): + pymbolic_calls_to_functions.update(arg_description_modifier( + insn.expression)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + # making it the set of tuples a dict + pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions) + + # Now do the similar treatment as done for type inference. + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + + return register_pymbolic_calls_to_knl_callables(kernel, + pymbolic_calls_to_functions) + +# }}} + + +# {{{ + +class HWAxesInferenceMapper(CombineMapper): + """ + Returns a set of instances of :class:`tuple` (expr, + in_kernel_callable). The mapped `in_kernel_callable` of the + :class:`InKernelCallable` are specialized for the the grid sizes of + :attr:`kernel`. 
+ """ + + def __init__(self, kernel): + self.kernel = kernel + self.local_size, self.global_size = kernel.get_grid_size_upper_bounds() + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr, **kwargs): + # ignoring if the call is not to a ScopedFunction + from loopy.symbolic import ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) + + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes( + self.local_size, self.global_size)) + + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in expr.parameters))) + + def map_call_with_kwargs(self, expr, **kwargs): + from loopy.symbolic import ScopedFunction + # ignoring if the call is not to a ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) + + new_scoped_function = ( + self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes( + self.local_size, self.global_size)) + + return (frozenset(((expr, new_scoped_function), )) | + self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values())))) + + def map_constant(self, expr, **kwargs): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def infer_hw_axes_sizes(kernel): + """ + Returns a copy of *kernel* with the hardware axes matching for + scoped functions in the *kernel*. Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_hw_axes_sizes`. + """ + hw_axes_modifier = HWAxesInferenceMapper(kernel) + pymbolic_calls_to_functions = set() + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + pymbolic_calls_to_functions.update(hw_axes_modifier( + insn.expression)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("unknown type of instruction %s." % + type(insn)) + + # making it the set of tuples a dict + pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions) + + # Now do the similar treatment as done for type inference. + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + + return register_pymbolic_calls_to_knl_callables(kernel, + pymbolic_calls_to_functions) + +# }}} + + +# {{{ catching functions that are not ready for codegen + +class FunctionsNotReadyForCodegenCollector(CombineMapper): + """ + Returns all instances of function calls in an expression which are + not ready for code generation. + """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + return all(values) + + # FIXME logic duplication between map_call and map_call_with_kwargs + def map_call(self, expr, *args, **kwargs): + from loopy.library.reduction import ArgExtOp, SegmentedOp + from pymbolic.primitives import Variable + from loopy.symbolic import ScopedFunction + + if isinstance(expr.function, (ArgExtOp, SegmentedOp)): + return self.combine( + tuple( + self.rec(child, *args, **kwargs) for child in + expr.parameters)) + elif isinstance(expr.function, Variable): + # UnScopedFunction obtained and hence clearly not ready for + # codegen. 
+ return False + + elif isinstance(expr.function, ScopedFunction): + is_ready_for_codegen = self.kernel.scoped_functions[ + expr.function.name].is_ready_for_codegen() + return self.combine( + (is_ready_for_codegen,) + + tuple( + self.rec(child, *args, **kwargs) + for child in expr.parameters)) + else: + raise LoopyError("Unexpected function type %s obtained in %s" + % (type(expr.function), expr)) + + def map_call_with_kwargs(self, expr, *args, **kwargs): + is_ready_for_codegen = self.kernel.scoped_functions[ + expr.function.name].is_ready_for_codegen() + return self.combine( + (is_ready_for_codegen,) + + tuple( + self.rec(child, *args, **kwargs) + for child in expr.parameters) + + tuple( + self.rec(child, *args, **kwargs) + for child in expr.kw_parameters.values()) + ) + + def map_constant(self, expr): + return True + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def make_functions_ready_for_codegen(kernel): + """ + Specializes the functions in the kernel that are missed during type + inference. + + .. code:: python + + knl = lp.make_kernel( + "{[i]: 0<=i<16}", + "a[i] = sin(b[i])", + [lp.ArrayArg('a', dtype=np.float64), + lp.ArrayArg('b', dtype=np.float64)]) + + In the above case, none of the instructions undergo type-specialization, as + all the arguments' types have been realized. But, this would be a problem + during the code generation phase as ``sin`` did not undergo type + specialization, and hence must be fixed through this function. + """ + from loopy.type_inference import TypeInferenceMapper + from loopy.symbolic import SubstitutionRuleExpander + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + + unready_functions_collector = FunctionsNotReadyForCodegenCollector(kernel) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + type_inf_mapper = TypeInferenceMapper(kernel) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + expr = subst_expander(insn.expression) + if not unready_functions_collector(expr): + # Infer the type of the functions that are not type specialized. + type_inf_mapper(expr, return_tuple=isinstance(insn, + CallInstruction), return_dtype_set=True) + + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + + else: + NotImplementedError("Unknown Instruction") + + return register_pymbolic_calls_to_knl_callables(kernel, + type_inf_mapper.specialized_functions) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2188,6 +2537,16 @@ def preprocess_kernel(kernel, device=None): kernel = find_temporary_address_space(kernel) + # inferring the shape and dim_tags of the arguments involved in a function + # call. + kernel = infer_arg_descr(kernel) + + # type specialize functions that were missed during the type inference. + kernel = make_functions_ready_for_codegen(kernel) + + # tuning the functions in the kernel to align with the grid sizes. + kernel = infer_hw_axes_sizes(kernel) + # boostability should be removed in 2017.x. 
kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) diff --git a/loopy/statistics.py b/loopy/statistics.py index cee28b24f..6c012ca21 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -712,9 +712,16 @@ class ExpressionOpCounter(CounterBase): map_variable = map_constant def map_call(self, expr): + from loopy.symbolic import ScopedFunction + if isinstance(expr.function, ScopedFunction): + function_identifier = self.knl.scoped_functions[ + expr.function.name].name + else: + function_identifier = expr.function.name + return ToCountMap( {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function), + name='func:'+function_identifier, count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.parameters) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8927cd6fb..770e1128a 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -56,6 +56,7 @@ from pymbolic.mapper.constant_folder import \ ConstantFoldingMapper as ConstantFoldingMapperBase from pymbolic.parser import Parser as ParserBase +from loopy.diagnostic import LoopyError from loopy.diagnostic import ExpressionToAffineConversionError @@ -106,7 +107,10 @@ class IdentityMapperMixin(object): return expr def map_type_annotation(self, expr, *args): - return type(expr)(expr.type, self.rec(expr.child)) + return type(expr)(expr.type, self.rec(expr.child, *args)) + + def map_scoped_function(self, expr, *args): + return ScopedFunction(self.rec(expr.function, *args)) map_type_cast = map_type_annotation @@ -165,9 +169,16 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index + def map_scoped_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_scoped_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): @@ -232,13 +243,16 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_scoped_function(self, expr, prec): + return "ScopedFunction('%s')" % expr.name + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation) # noqa + or type(expr.function) != type(other.function) # noqa ): return [] @@ -274,6 +288,13 @@ class DependencyMapper(DependencyMapperBase): return self.combine( self.rec(child, *args) for child in expr.parameters) + def map_call_with_kwargs(self, expr, *args): + # Loopy does not have first-class functions. Do not descend + # into 'function' attribute of Call. 
+ return self.combine( + self.rec(child, *args) for child in expr.parameters+tuple( + expr.kw_parameters.values())) + def map_reduction(self, expr): deps = self.rec(expr.expr) return deps - set(p.Variable(iname) for iname in expr.inames) @@ -289,6 +310,9 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) + def map_scoped_function(self, expr): + return self.rec(expr.function) + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -638,6 +662,51 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") + +class ScopedFunction(p.Expression): + """ + A function invocation whose definition is known in a :mod:`loopy` kernel. + Each instance of :class:`loopy.symbolic.ScopedFunction` in an expression + points to an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` through the + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer + :ref:`ref_scoped_function` for a slightly detailed explanation on scoped + functions. + + .. attribute:: function + + An instance of :class:`pymbolic.primitives.Variable`, + :class:`loopy.library.reduction.ArgExtOp` or + :class:`loopy.library.reduction.SegmentedOp`. + """ + init_arg_names = ("function", ) + + def __init__(self, function): + if isinstance(function, str): + function = p.Variable(function) + from loopy.library.reduction import ArgExtOp, SegmentedOp + assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp)) + self.function = function + + @property + def name(self): + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(self.function, p.Variable): + return self.function.name + elif isinstance(self.function, (ArgExtOp, SegmentedOp)): + return self.function + else: + raise LoopyError("Unexpected function type %s in ScopedFunction." % + type(self.function)) + + def __getinitargs__(self): + return (self.function, ) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_scoped_function") + # }}} @@ -650,9 +719,12 @@ def get_dependencies(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, p.Variable): + elif isinstance(expr, ScopedFunction): + return parse_tagged_name(expr.function) + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) @@ -1100,6 +1172,14 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: return IdentityMapper.map_call(self, expr) + def map_call_with_kwargs(self, expr): + for par in expr.kw_parameters.values(): + if not isinstance(par, SubArrayRef): + raise LoopyError("Keyword Arguments is only supported for" + " array arguments--use positional order to specify" + " the order of the arguments in the call.") + return IdentityMapper.map_call_with_kwargs(self, expr) + # {{{ customization to pymbolic parser diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a81354e2f..9733fa446 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -150,7 +150,12 @@ class ASTBuilderBase(object): # {{{ library - def function_manglers(self): + def function_scopers(self): + """ + Returns an instance of list of the functions of signature + ``(target, identifiers)`` returning either an instance of + :class:`InKernelCallable` if a match is found or *None*. 
+ """ return [] def symbol_manglers(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 83efecf0e..eab1e6afc 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,7 +27,6 @@ THE SOFTWARE. import six import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -35,6 +34,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from pytools import memoize_method @@ -354,71 +354,105 @@ def c_symbol_mangler(kernel, name): # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. + """ - if name in ["abs", "min", "max"]: - name = "f" + name + def with_types(self, arg_id_to_dtype, kernel): + name = self.name - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): + if name in ["abs", "min", "max"]: + name = "f" + name - dtype = arg_dtypes[0].numpy_dtype + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: - if modify_name: - if dtype == np.float64: - pass # fabs - elif dtype == np.float32: - name = name + "f" # fabsf - elif dtype == np.float128: - name = name + "l" # fabsl - else: - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." 
% name) - return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - # binary functions - if (name in ["fmax", "fmin"] - and len(arg_dtypes) == 2): + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyTypeError("%s does not support complex numbers") + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == 'c': + raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - elif dtype.kind == "f": - if modify_name: + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified if dtype == np.float64: - pass # fmin + pass # fabs elif dtype == np.float32: - name = name + "f" # fminf + name = name + "f" # fabsf elif dtype == np.float128: - name = name + "l" # fminl + name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" - % (name, dtype)) + raise LoopyTypeError("%s does not support type %s" % (name, + dtype)) + + return self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + + # binary functions + if name in ["fmax", "fmin"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + elif dtype.kind == "f": + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + if dtype == np.float64: + pass # fmin + elif dtype == np.float32: + name = name + "f" # fminf + elif dtype == np.float128: + name = name + "l" # fminl + else: + raise LoopyTypeError("%s does not support type %s" + % (name, dtype)) + dtype = NumpyType(dtype) + return self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) + + return self.copy(arg_id_to_dtype=arg_id_to_dtype) - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) +def scope_c_math_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function + represented by :arg:`identifier` is known in C, otherwise returns *None*. 
+ """ + if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]: + return CMathCallable(name=identifier) return None # }}} @@ -427,12 +461,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CASTBuilder(ASTBuilderBase): # {{{ library - def function_manglers(self): - return ( - super(CASTBuilder, self).function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super(CASTBuilder, self).symbol_manglers() + [ @@ -445,6 +473,11 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) + def function_scopers(self): + return ( + super(CASTBuilder, self).function_scopers() + [ + scope_c_math_functions]) + # }}} # {{{ code generation @@ -846,82 +879,30 @@ class CASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None + ecm = codegen_state.expression_to_code_mapper + func_id = insn.expression.function.name + in_knl_callable = codegen_state.kernel.scoped_functions[func_id] - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. + if in_knl_callable.name_in_target == 'loopy_make_tuple': return self.emit_tuple_assignment(codegen_state, insn) - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes)) + in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) 
- var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) - - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast( - mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index dd2104d0c..ecb6ad7d9 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -41,7 +41,7 @@ from pymbolic import var from loopy.expression import dtype_to_type_context from loopy.type_inference import TypeInferenceMapper -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType @@ -383,19 +383,18 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function + from pymbolic.primitives import Subscript # {{{ implement indexof, indexof_vec - if identifier.name in ["indexof", "indexof_vec"]: + identifier_name = self.kernel.scoped_functions[expr.function.name].name + if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) + raise LoopyError("%s takes exactly one argument" % identifier_name) arg, = expr.parameters if not isinstance(arg, Subscript): raise LoopyError( - "argument to %s must be a subscript" % identifier.name) + "argument to %s must be a subscript" % identifier_name) ary = self.find_array(arg) @@ -407,11 +406,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.kernel.data import ImageArg if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) + raise LoopyError("%s does not support images" % identifier_name) - if identifier.name == "indexof": + if identifier_name == "indexof": return access_info.subscripts[0] - elif identifier.name == "indexof_vec": + elif identifier_name == "indexof_vec": from loopy.kernel.array import VectorArrayDimTag ivec = None for iaxis, dim_tag in enumerate(ary.dim_tags): @@ -430,56 +429,21 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need 
to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. - processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + from loopy.kernel.function_interface import ManglerCallable + if isinstance(self.kernel.scoped_functions[expr.function.name], + ManglerCallable): + from loopy.codegen import SeenFunction + in_knl_callable = self.kernel.scoped_functions[expr.function.name] + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) + + return self.kernel.scoped_functions[expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target) # {{{ deal with complex-valued variables diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 673d3b284..b2e4118d2 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -34,6 +34,7 @@ from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import AddressSpace from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -111,29 +112,71 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper -def cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") +class CudaCallable(ScalarCallable): - if dtype.kind == "f": - name = "f" + name + def cuda_with_types(self, arg_id_to_dtype, kernel): - return dtype, name + name = self.name - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] + return self.copy(name_in_target=name, + arg_id_to_dtype={-1: NumpyType(scalar_dtype), + 0: dtype, 1: dtype}) + + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype) + + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + +def scope_cuda_functions(target, identifier): + if identifier in set(["dot"]) | set( + _CUDA_SPECIFIC_FUNCTIONS): + return CudaCallable(name=identifier) return None @@ -217,13 +260,12 @@ class CudaTarget(CTarget): # {{{ ast builder class CUDACASTBuilder(CASTBuilder): + # {{{ library - def function_manglers(self): - return ( - super(CUDACASTBuilder, self).function_manglers() + [ - cuda_function_mangler - ]) + def function_scopers(self): + return [scope_cuda_functions] + ( + super(CUDACASTBuilder, self).function_scopers()) # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 432c95ef3..de07adf97 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -31,11 +31,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler -from loopy.kernel.data import AddressSpace, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -166,59 +166,117 @@ VECTOR_LITERAL_FUNCS = dict( ) -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + :class:`loopy.target.c.CMathCallable`. + """ + + def with_types(self, arg_id_to_dtype, kernel): + name = self.name + + if name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." 
% name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if dtype.kind in ['u', 'i', 'f']: + if dtype.kind == 'f': + name = 'f'+name + dtype = NumpyType(dtype) + return self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) + else: + # Unsupported type. + raise LoopyError("%s function not supported for the types %s" % + (name, dtype)) + + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] + return self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), 0: dtype, 1: dtype}) + + if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype) + + if name in VECTOR_LITERAL_FUNCS: + base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] + + for id in arg_id_to_dtype: + if not -1 <= id < count: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + for i in range(count): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in + range(count)) + updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( + NumpyType(dtype), count) + + return self.copy(name_in_target="(%s%d) " % (base_tp_name, count), + arg_id_to_dtype=updated_arg_id_to_dtype) + + # does not satisfy any of the conditions needed for specialization. + # hence just returning a copy of the callable. + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + +def scope_opencl_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function defined by + *identifier* is known in OpenCL. 
+ """ + opencl_function_ids = set(["max", "min", "dot"]) | set( + _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - - if count != len(arg_dtypes): - return None - - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) + if identifier in opencl_function_ids: + return OpenCLCallable(name=identifier) return None @@ -365,13 +423,10 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_manglers(self): + def function_scopers(self): return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super(OpenCLCASTBuilder, self).function_manglers()) + [scope_opencl_functions] + super( + OpenCLCASTBuilder, self).function_scopers()) def symbol_manglers(self): return ( @@ -380,13 +435,10 @@ class OpenCLCASTBuilder(CASTBuilder): ]) def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator return ( super(OpenCLCASTBuilder, self).preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator, - ]) + opencl_preamble_generator]) # }}} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 73e8e0092..27c4f4ab4 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,12 +31,12 @@ from six.moves import range import numpy as np -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -199,37 +199,79 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +# {{{ pyopencl function scopers - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" +class 
PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered by + :class:`loopy.target.opencl.OpenCLCallable` + """ + def with_types(self, arg_id_to_dtype, kernel): + + name = self.name + + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." % name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = arg_id_to_dtype[0] + + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj", "abs"]: + if dtype.is_complex(): + # function parameters are complex. + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: dtype}) else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) - - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) + # function calls for floating parameters. 
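+ # Integer and unsigned arguments are promoted to float32 below, and 'abs'
+ # is emitted as C's 'fabs'.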
+ numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ('u', 'i'): + dtype = dtype.copy(numpy_dtype=np.float32) + if name == 'abs': + name = 'fabs' + return self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}) + + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + +def pyopencl_function_scoper(target, identifier): + if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"]: + return PyOpenCLCallable(name=identifier) return None +# }}} + # {{{ preamble generator @@ -739,19 +781,15 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler + def function_scopers(self): + from loopy.library.random123 import random123_function_scoper return ( - super(PyOpenCLCASTBuilder, self).function_manglers() + [ - pyopencl_function_mangler, - random123_function_mangler - ]) + [pyopencl_function_scoper, random123_function_scoper] + super( + PyOpenCLCASTBuilder, self).function_scopers()) def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index ce04986d3..2804b0fb9 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -82,47 +82,35 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.kernel.scoped_functions[expr.function.name].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + from loopy.kernel.function_interface import ManglerCallable + in_knl_callable = self.kernel.scoped_functions[expr.function.name] + if isinstance(in_knl_callable, ManglerCallable): + from loopy.codegen import SeenFunction + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) str_parameters = None + number_of_assignees = len([key for key in + in_knl_callable.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" 
- % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -189,11 +177,11 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_manglers(self): + def function_scopers(self): + from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_manglers() + [ - _numpy_single_arg_function_mangler, - ]) + super(PythonASTBuilderBase, self).function_scopers() + + [scope_c_math_functions]) def preamble_generators(self): return ( diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d4dcb3701..d0edcfd78 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -398,7 +398,14 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", # }}} - return diff_context.get_new_kernel(), result + # Differentiation lead to addition of new functions to the kernel. + # For example differentiating `sin(x)` -> `cos(x)`. Hence we would need to + # scope `cos(x)`. + from loopy.kernel.creation import scope_functions + differentiated_scoped_kernel = scope_functions( + diff_context.get_new_kernel()) + + return differentiated_scoped_kernel, result # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 010a0658f..a68520525 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -44,6 +44,19 @@ def _debug(kernel, s, *args): logger.debug("%s: %s" % (kernel.name, logstr)) +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :param arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. 
+ """ + return_arg_id_to_dtype = dict((id, dtype) for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): @@ -60,6 +73,8 @@ class TypeInferenceMapper(CombineMapper): new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() + self.scoped_functions = kernel.scoped_functions + self.specialized_functions = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -250,15 +265,18 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable + from pymbolic.primitives import Variable, CallWithKwargs + from loopy.symbolic import ScopedFunction + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + kw_parameters = {} identifier = expr.function - if isinstance(identifier, Variable): + if isinstance(identifier, (Variable, ScopedFunction)): identifier = identifier.name - if identifier in ["indexof", "indexof_vec"]: - return [self.kernel.index_dtype] - def none_if_empty(d): if d: d, = d @@ -266,25 +284,121 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) - if None in arg_dtypes: - return [] + arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) + + # specializing the known function wrt type + if isinstance(expr.function, ScopedFunction): + in_knl_callable = self.scoped_functions[expr.function.name] + + # {{{ checking that there is no overwriting of types of in_knl_callable + + if in_knl_callable.arg_id_to_dtype is not None: + + # specializing an already specialized function. 
+ for id, dtype in arg_id_to_dtype.items(): + if in_knl_callable.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + + # {{{ ignoring the the cases when there is a discrepancy + # between np.uint and np.int + + import numpy as np + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint32) and ( + arg_id_to_dtype[id].dtype.type == np.int32): + continue + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint64) and ( + arg_id_to_dtype[id].dtype.type == + np.int64): + continue + + # }}} + + raise LoopyError("Overwriting a specialized function " + "is illegal--maybe start with new instance of " + "InKernelCallable?") + + # }}} + + in_knl_callable = in_knl_callable.with_types( + arg_id_to_dtype, self.kernel) + + # storing the type specialized function so that it can be used for + # later use + self.specialized_functions[expr] = in_knl_callable.with_target( + self.kernel.target) + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if new_arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(new_arg_id_to_dtype)] + else: + return [new_arg_id_to_dtype[-1]] + + elif isinstance(expr.function, Variable): + # Since, the function is not "scoped", attempt to infer using + # kernel.function_manglers + + # {{{ trying to infer using function manglers + + arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in + expr.parameters) + + # finding the function_mangler which would be associated with the + # realized function. + mangle_result = None + for function_mangler in self.kernel.function_manglers: + mangle_result = function_mangler(self.kernel, identifier, + arg_dtypes) + if mangle_result: + # found a match. + break - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] - else: if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct assignments") + from loopy.kernel.function_interface import (ManglerCallable, + ValueArgDescriptor) + + # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes + arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) + for i, dt in enumerate(mangle_result.arg_dtypes)) + arg_id_to_dtype.update(dict((-i-1, + dtype.with_target(self.kernel.target)) for i, dtype in enumerate( + mangle_result.result_dtypes))) + arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.arg_dtypes)) + res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.result_dtypes)) + arg_id_to_descr = dict(arg_descrs+res_descrs) + + # creating the ManglerCallable object corresponding to the + # function. + self.specialized_functions[expr] = ManglerCallable( + identifier, function_mangler, arg_id_to_dtype, + arg_id_to_descr, mangle_result.target_name) + + # Returning the type. 
+ if return_tuple: + if mangle_result is not None: + return [mangle_result.result_dtypes] + else: + if mangle_result is not None: + if len(mangle_result.result_dtypes) != 1 and not return_tuple: + raise LoopyError("functions with more or fewer than one " + "return value may only be used in direct " + "assignments") + + return [mangle_result.result_dtypes[0]] + # }}} - return [mangle_result.result_dtypes[0]] + return [] - raise RuntimeError("unable to resolve " - "function '%s' with %d given arguments" - % (identifier, len(arg_dtypes))) + map_call_with_kwargs = map_call def map_variable(self, expr): if expr.name in self.kernel.all_inames(): @@ -406,7 +520,7 @@ class TypeInferenceMapper(CombineMapper): def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): - return [kernel.index_dtype], [] + return [kernel.index_dtype], [], {} from functools import partial debug = partial(_debug, kernel) @@ -451,11 +565,12 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types + return None, type_inf_mapper.symbols_with_unknown_types, None result = type_inf_mapper.combine(dtype_sets) - return result, type_inf_mapper.symbols_with_unknown_types + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.specialized_functions) # }}} @@ -553,6 +668,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + specialized_functions = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -576,7 +693,7 @@ def infer_unknown_types(kernel, expect_completion=False): debug("inferring type for %s %s", type(item).__name__, item.name) - result, symbols_with_unavailable_types = ( + result, symbols_with_unavailable_types, new_specialized_functions = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) @@ -597,6 +714,10 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + # TODO: I dont like in-place updates. Change this to something + # else. Perhaps add a function for doing this, which does it + # using a bunch of copies? + specialized_functions.update(new_specialized_functions) else: debug(" failure") @@ -639,11 +760,23 @@ def infer_unknown_types(kernel, expect_completion=False): logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) - return unexpanded_kernel.copy( + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + type_specialized_kernel = register_pymbolic_calls_to_knl_callables( + pre_type_specialized_knl, specialized_functions) + if expect_completion: + # if completion is expected, then it is important that all the + # callables are scoped. 
+ from loopy.check import check_functions_are_scoped + check_functions_are_scoped(type_specialized_kernel) + + return type_specialized_kernel + # }}} diff --git a/test/testlib.py b/test/testlib.py index ad290ee7c..a22988ec8 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -132,4 +133,43 @@ class SeparateTemporariesPreambleTestPreambleGenerator( # }}} + +# {{{ test_register_function_lookup + +class Log2Callable(lp.ScalarCallable): + + def with_types(self, arg_id_to_dtype, kernel): + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + dtype = arg_id_to_dtype[0].numpy_dtype + + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + + from loopy.target.opencl import OpenCLTarget + name_in_target = "log2" + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified + if dtype == np.float32: + name_in_target = "log2f" + elif dtype == np.float128: + name_in_target = "log2l" + + from loopy.types import NumpyType + return self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + + +def register_log2_lookup(target, identifier): + if identifier == 'log2': + return Log2Callable(name='log2') + return None + +# }}} + # vim: foldmethod=marker -- GitLab From f08921f4239a273c3a214d901aa27b195fd3bcc1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Jul 2018 20:27:02 -0500 Subject: [PATCH 224/916] New files from the function interface. --- doc/ref_call.rst | 165 ++++++ examples/python/call-external.py | 105 ++++ loopy/kernel/function_interface.py | 921 +++++++++++++++++++++++++++++ loopy/transform/callable.py | 631 ++++++++++++++++++++ 4 files changed, 1822 insertions(+) create mode 100644 doc/ref_call.rst create mode 100644 examples/python/call-external.py create mode 100644 loopy/kernel/function_interface.py create mode 100644 loopy/transform/callable.py diff --git a/doc/ref_call.rst b/doc/ref_call.rst new file mode 100644 index 000000000..46edc533c --- /dev/null +++ b/doc/ref_call.rst @@ -0,0 +1,165 @@ +Calling Loopy Kernels and External Functions +============================================ + +``ScopedFunctions`` are pymbolic nodes within expressions in a +``Loo.py`` kernel, whose name has been resolved by the kernel. + +A pymbolic ``Call`` node can be converted to a ``ScopedFunction`` if it +is resolved by one of the ``function_scoper`` in a :attr:`LoopKernel.scoped_functions` + +- Functions already registered by the target. Some examples include -- + ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C-Targets.) +- Functions that are defined in ``Loo.py`` and are realized into + different set of instructions during code generation. Some examples + include ``make_tuple``, ``ArgExtOp``, ``index_of``, ... +- Functions registered as ``CallableKernels`` using + ``lp.register_callable_kernel(...)``. +- Functions that have been provided through + ``lp.register_function_scoper(...)`` +- Functions that can be made known from the user through + ``lp.register_function_mangler``. This is planned to be deprecated, + as its functionality is superseded by + ``lp.register_function_scoper(...)``. + +Expressions after a function is scoped +-------------------------------------- + +Consider the following expression. 
+
+::
+
+    sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i])
+
+During the kernel creation phase, ``sin`` is recognized as a function known
+to the target and is therefore scoped. Accordingly, after ``make_kernel``
+has been called, the above expression is converted to:
+
+::
+
+    ScopedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    callable_knl_func(c[i])*mangler_call(d[i])
+
+This also creates an entry in the kernel's ``scoped_functions`` dictionary:
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None)}
+
+Note that at this step functions are scoped purely by name, without any
+information about their types.
+
+Once the user calls the transformation
+``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``, the
+expression is converted to:
+
+::
+
+    ScopedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    ScopedFunction('callable_knl_func')(c[i])*mangler_call(d[i])
+
+and the ``scoped_functions`` dictionary becomes:
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)}
+
+Now, if the user calls ``register_function_mangler(knl, 'mangler_call')``,
+one might expect the mangler call to get scoped as well, but that does
+**not** happen: the "old" ``function_manglers`` only return a match if all
+the parameters of the function match, viz. name, argument arity and argument
+types. Hence, the ``scoped_functions`` dictionary remains unchanged.
+
+``ScopedFunctions`` and specializations
+----------------------------------------
+
+Consider the same ``ScopedFunction('sin')`` as above. Although scoped, this
+function does not yet know its types, i.e. for a C target it does not yet
+know whether it should emit ``sin``, ``sinf`` or ``sinl``. Hence, at this
+point the function is still "type-generic": further down the pipeline it may
+take on any one of the above definitions. The functions go through a
+"specialization" process at various points in the pipeline, where the
+attributes of the callables are resolved.
+
+- During type inference, the functions go through type specialization,
+  wherein their ``arg_id_to_dtype`` is realized.
+- During descriptor inference, the functions go through descriptor
+  specialization, wherein ``arg_id_to_descr`` is populated.
+  ``arg_id_to_descr`` records the shape, strides and scope of the arguments,
+  which is particularly important for ``CallableKernel``: this information
+  is used to generate the function signature and to adapt the data access
+  pattern of the variables in the callee kernel.
+- Whenever a ``ScopedFunction`` goes through a specialization, this is
+  indicated by changing the name in the ``pymbolic`` node.
+
+If, during type inference, the type of ``a[i]`` is inferred to be
+``np.float32``, the new ``pymbolic`` node would be:
+
+::
+
+    ScopedFunction('sin_0')(a[i]) + ...
+
+The name change indicates that the node now points to a different
+``ScalarCallable`` in the dictionary.
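+
+As an illustration of what this specialization looks like from the user's
+side, a scalar callable opts into it by overriding ``with_types``. The
+sketch below is hypothetical (the ``Log10Callable`` name is invented for
+this example), but it follows the same pattern as the callables added in
+this patch, e.g. ``Log2Callable`` in the test library:
+
+::
+
+    import numpy as np
+    import loopy as lp
+
+    class Log10Callable(lp.ScalarCallable):
+        def with_types(self, arg_id_to_dtype, kernel):
+            if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+                # types are not known yet: return the callable unspecialized
+                return self.copy(arg_id_to_dtype=arg_id_to_dtype)
+
+            dtype = arg_id_to_dtype[0]
+            name_in_target = (
+                    "log10f" if dtype.numpy_dtype == np.float32 else "log10")
+
+            # record argument/result types and the name emitted in target code
+            return self.copy(name_in_target=name_in_target,
+                    arg_id_to_dtype={0: dtype, -1: dtype})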
+After this type specialization, a new entry is added to the
+``scoped_functions`` dictionary:
+
+::
+
+    {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None),
+    'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0: np.float32,
+    -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')}
+
+Description Inference
+---------------------
+
+Although this step has no significance for a ``ScalarCallable``, it is a
+very important part of handling a ``CallableKernel``: it is here that the
+``dim_tags``, ``shape`` and ``address_space`` of the callee kernel's
+arguments are altered.
+
+- The ``dim_tags`` attribute helps to ensure that the memory layout between
+  the caller and the callee kernel is coherent.
+- The ``address_space`` attribute ensures that the appropriate scope
+  qualifiers are emitted for the function declaration arguments when the
+  device code is written.
+- The ``shape`` attribute helps with:
+
+  - storage allocation,
+  - memory layout,
+  - catching out-of-bounds accesses in ``Loo.py``.
+
+Hence, in the ``Loo.py`` pipeline, one might expect the following evolution
+of the ``sin`` pymbolic call expression node:
+
+::
+
+    sin -> (Kernel creation) -> ScopedFunction(Variable('sin')) ->
+    (Type Inference) -> ScopedFunction(Variable('sin_0')) ->
+    (Descriptor Inference) -> ScopedFunction(Variable('sin_1'))
+
+Changes on the target side to accommodate the new function interface
+----------------------------------------------------------------------
+
+The earlier ``function_manglers`` member method of ``lp.ASTBuilderBase`` is
+replaced by ``function_scopers``, which returns a list of scoper functions,
+each with the signature ``(target, identifier) -> lp.InKernelCallable``.
+
+An example: Calling BLAS
+------------------------
+
+..
literalinclude:: ../examples/python/external-call.py + diff --git a/examples/python/call-external.py b/examples/python/call-external.py new file mode 100644 index 000000000..904270472 --- /dev/null +++ b/examples/python/call-external.py @@ -0,0 +1,105 @@ +import loopy as lp +import numpy as np +from loopy.diagnostic import LoopyError +from loopy.target.c import CTarget + + +# {{{ blas callable + +class BLASCallable(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel): + for i in range(0, 2): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return self.copy(arg_id_to_dtype=arg_id_to_dtype) + + mat_dtype = arg_id_to_dtype[0].numpy_dtype + vec_dtype = arg_id_to_dtype[1].numpy_dtype + + if mat_dtype != vec_dtype: + raise LoopyError("DGEMV should have same dtype for matrix and " + "vector") + + if vec_dtype == np.float32: + name_in_target = "cblas_sgemv" + elif vec_dtype == np.float64: + name_in_target = "cblas_dgemv" + else: + raise LoopyError("GEMV only supported for float32 and float64 " + "types") + + from loopy.types import NumpyType + return self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), + -1: NumpyType(vec_dtype)}) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + + parameters.append(insn.assignees[0]) + par_dtypes.append(self.arg_id_to_dtype[-1]) + + # no type casting in array calls. 
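+ # The code below renders the A, x and y arguments positionally and then
+ # splices in the CBLAS layout/transpose flags, the matrix dimensions from
+ # the argument descriptor, and literal 1s for the remaining stride/scale
+ # slots expected by cblas_{s,d}gemv.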
+ from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + + mat_descr = self.arg_id_to_descr[0] + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + c_parameters.insert(0, var('CblasRowMajor')) + c_parameters.insert(1, var('CblasNoTrans')) + c_parameters.insert(2, mat_descr.shape[0]) + c_parameters.insert(3, mat_descr.shape[1]) + c_parameters.insert(4, 1) + c_parameters.insert(6, 1) + c_parameters.insert(8, 1) + c_parameters.insert(10, 1) + return var(self.name_in_target)(*c_parameters), False + + def generate_preambles(self, target): + assert isinstance(target, CTarget) + yield("99_cblas", "#include ") + return + + +def blas_fn_lookup(target, identifier): + if identifier == 'gemv': + return BLASCallable(name='gemv') + return None + +# }}} + + +n = 10 + +knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[:] = gemv(A[:, :], x[:]) + """, [ + lp.ArrayArg('A', dtype=np.float64, shape=(n, n)), + lp.ArrayArg('x', dtype=np.float64, shape=(n, )), + lp.ArrayArg('y', shape=(n, )), ...], + target=CTarget()) + +knl = lp.register_function_lookup(knl, blas_fn_lookup) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 000000000..edb222ec2 --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,921 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Andreas Klöckner, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + + +import re +import six + +from six.moves import zip + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from pymbolic.primitives import Variable +from loopy.symbolic import parse_tagged_name + +from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, + RuleAwareIdentityMapper, SubstitutionRuleExpander) + + +# {{{ argument descriptors + +class ValueArgDescriptor(ImmutableRecord): + pass + + +class ArrayArgDescriptor(ImmutableRecord): + """ + Records information about an array argument to an in-kernel callable, to be + passed to and returned from + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used + for matching shape and scope of caller and callee kernels. + + ..attribute:: shape + + Shape of the array. + + .. attribute:: address_space + + An attribute of :class:`loopy.kernel.data.AddressSpace`. + + .. attribute:: dim_tags + + A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` + """ + + fields = set(['shape', 'address_space', 'dim_tags']) + + def __init__(self, shape, address_space, dim_tags): + + # {{{ sanity checks + + from loopy.kernel.array import FixedStrideArrayDimTag + + assert isinstance(shape, tuple) + assert isinstance(dim_tags, tuple) + + # FIXME at least vector dim tags should be supported + assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in + dim_tags) + + # }}} + + super(ArrayArgDescriptor, self).__init__( + shape=shape, + address_space=address_space, + dim_tags=dim_tags) + +# }}} + + +# {{{ helper function for in-kernel callables + +def get_kw_pos_association(kernel): + """ + Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in + *kernel*. + """ + from loopy.kernel.tools import infer_arg_is_output_only + kernel = infer_arg_is_output_only(kernel) + kw_to_pos = {} + pos_to_kw = {} + + read_count = 0 + write_count = -1 + + for arg in kernel.args: + if not arg.is_output_only: + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + else: + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 + + return kw_to_pos, pos_to_kw + + +class GridOverrideForCalleeKernel(ImmutableRecord): + """ + Helper class to set the + :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the + callee kernels. Refer + :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, + :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. + + .. attribute:: local_size + + The local work group size that has to be set in the callee kernel. + + .. attribute:: global_size + + The global work group size that to be set in the callee kernel. + + .. note:: + + This class acts as a pseduo-callable and its significance lies in + solving picklability issues. + """ + fields = set(["local_size", "global_size"]) + + def __init__(self, local_size, global_size): + self.local_size = local_size + self.global_size = global_size + + def __call__(self, insn_ids, ignore_auto=True): + return self.local_size, self.global_size + +# }}} + + +# {{{ template class + +class InKernelCallable(ImmutableRecord): + """ + An abstract interface to define a callable encountered in a kernel. + + .. attribute:: name + + The name of the callable which can be encountered within a kernel. + + .. attribute:: arg_id_to_dtype + + A mapping which indicates the arguments types and result types it would + be handling. This would be set once the callable is type specialized. + + .. 
attribute:: arg_id_to_descr + + A mapping which gives indicates the argument shape and ``dim_tags`` it + would be responsible for generating code. These parameters would be set, + once it is shape and stride(``dim_tags``) specialized. + + .. note:: + + Negative "id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. + + .. automethod:: __init__ + .. automethod:: with_types + .. automethod:: with_descrs + .. automethod:: with_target + .. automethod:: with_hw_axes_sizes + .. automethod:: generate_preambles + .. automethod:: emit_call + .. automethod:: emit_call_insn + .. automethod:: is_ready_for_codegen + """ + + fields = set(["arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): + + super(InKernelCallable, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr) + + def with_types(self, arg_id_to_dtype, kernel): + """ + :arg arg_id_to_type: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.types.LoopyType` instances. + Unspecified/unknown types are not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types, + and *arg_id_to_type* is a mapping of the same form as the + argument above, however it may have more information present. + Any argument information exists both by its positional and + its keyword identifier. + """ + + raise NotImplementedError() + + def with_descrs(self, arg_id_to_descr): + """ + :arg arg_id_to_descr: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.ArrayArgDescriptor` instances. + Unspecified/unknown types are not represented in *arg_id_to_descr*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a copy of *self* which is a new instance of + :class:`InKernelCallable` specialized for the given types, and + *arg_id_to_descr* is a mapping of the same form as the argument above, + however it may have more information present. Any argument information + exists both by its positional and its keyword identifier. + """ + + raise NotImplementedError() + + def with_target(self, target): + """ + Returns a copy of *self* with all the ``dtypes`` in + ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. Refer + :meth:`loopy.types.LoopyType.with_target`. + + :arg target: An instance of :class:`loopy.target.TargetBase`. + """ + + if target is None: + raise LoopyError("target cannot be None for with_target") + + def with_target_if_not_None(dtype): + """ + Returns a copy of :arg:`dtype` associated with the target. If + ``dtype`` is *None* returns *None*. 
+ """ + if dtype: + return dtype.with_target(target) + else: + return None + + new_arg_id_to_dtype = None + if self.arg_id_to_dtype: + new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, + dtype in self.arg_id_to_dtype.items()) + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + + def with_hw_axes_sizes(self, local_size, global_size): + """ + Returns a copy of *self* with modifications to comply with the grid + sizes ``(local_size, global_size)`` of the kernel in which it is + supposed to be called. + + :arg local_size: An instance of :class:`islpy.PwAff`. + :arg global_size: An instance of :class:`islpy.PwAff`. + """ + raise NotImplementedError() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def generate_preambles(self, target): + """ Yields the target specific preamble. + """ + raise NotImplementedError() + + def emit_call(self, expression_to_code_mapper, expression, target): + + raise NotImplementedError() + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a tuple of ``(call, assignee_is_returned)`` which is the target + facing function call that would be seen in the generated code. ``call`` + is an instance of ``pymbolic.primitives.Call`` ``assignee_is_returned`` + is an instance of :class:`bool` to indicate if the assignee is returned + by value of C-type targets. + + :Example: If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is + interpreted in the target as ``a = f(c, d, &b)``. If + ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted + in the target as the statement ``f(c, d, &a, &b)``. + """ + + raise NotImplementedError() + + def __hash__(self): + + return hash(tuple(self.fields)) + +# }}} + + +# {{{ scalar callable + +class ScalarCallable(InKernelCallable): + """ + An abstranct interface the to a scalar callable encountered in a kernel. + + .. note:: + + The :meth:`ScalarCallable.with_types` is intended to assist with type + specialization of the funciton and is expected to be supplemented in the + derived subclasses. + """ + + fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") + + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super(ScalarCallable, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.name = name + self.name_in_target = name_in_target + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr, + self.name_in_target) + + def with_types(self, arg_id_to_dtype, kernel): + raise LoopyError("No type inference information present for " + "the function %s." 
% (self.name)) + + def with_descrs(self, arg_id_to_descr): + + arg_id_to_descr[-1] = ValueArgDescriptor() + return self.copy(arg_id_to_descr=arg_id_to_descr) + + def with_hw_axes_sizes(self, global_size, local_size): + return self.copy() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + # {{{ code generation + + def emit_call(self, expression_to_code_mapper, expression, target): + + assert self.is_ready_for_codegen() + + # must have single assignee + assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 + arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in + range(len(self.arg_id_to_dtype)-1)) + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + expression.parameters) + + from loopy.expression import dtype_to_type_context + # processing the parameters with the required dtypes + processed_parameters = tuple( + expression_to_code_mapper.rec(par, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expression.parameters, par_dtypes, arg_dtypes)) + + from pymbolic import var + return var(self.name_in_target)(*processed_parameters) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a pymbolic call for C-based targets, when the instructions + involve multiple return values along with the required type casting. + The first assignee is returned, but the rest of them are appended to + the parameters and passed by reference. + + :Example: ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` + + :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`. + :arg target: An instance of :class:`loopy.target.TargetBase`. + :arg expression_to_code_mapper: An instance of :class:`IdentityMapper` + responsible for code mapping from :mod:`loopy` syntax to the + **target syntax**. + """ + + # FIXME: needs to get information about whether the callable has should + # do pass by reference by all values or should return one value for + # pass by value return. + + # For example: The code generation of `sincos` would be different for + # C-Target and OpenCL-target. + + # Currently this is formulated such that the first argument is returned + # and rest all are passed by reference as arguments to the function. + + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + assignees = insn.assignees[1:] + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + parameters) + arg_dtypes = tuple(self.arg_id_to_dtype[i] for i, _ in + enumerate(parameters)) + + assignee_dtypes = tuple(self.arg_id_to_dtype[-i-2] for i, _ in + enumerate(assignees)) + + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic import var + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr + for par, par_dtype, tgt_dtype in zip( + parameters, par_dtypes, arg_dtypes)] + + for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)): + if tgt_dtype != expression_to_code_mapper.infer_type(a): + raise LoopyError("Type Mismatch in function %s. 
Expected: %s" + "Got: %s" % (self.name, tgt_dtype, + expression_to_code_mapper.infer_type(a))) + c_parameters.append( + var("&")( + expression_to_code_mapper(a, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr)) + + # assignee is returned whenever the size of assignees is non zero. + assignee_is_returned = len(assignees) > 0 + + return var(self.name_in_target)(*c_parameters), assignee_is_returned + + def generate_preambles(self, target): + return + yield + + # }}} + +# }}} + + +# {{{ callable kernel + +class CallableKernel(InKernelCallable): + """ + Records informations about a callee kernel. Also provides interface through + member methods to make the callee kernel compatible to be called from a + caller kernel. The :meth:`loopy.register_callable_kernel` should be called + in order to initiate association between a function in caller kernel and + the callee kernel. + + :meth:`CallableKernel.with_types` should be called in order to match + the ``dtypes`` of the arguments that are shared between the caller and the + callee kernel. + + :meth:`CallableKernel.with_descrs` should be called in order to match + :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape`, + :attr:`ArrayArgDescriptor.address_space`` of the arguments shared between the + caller and the callee kernel. + + :meth:`CallableKernel.with_hw_axes` should be called to set the grid + sizes for the :attr:`subkernel` of the callable. + """ + + fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") + + def __init__(self, subkernel, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super(CallableKernel, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.name_in_target = name_in_target + self.subkernel = subkernel.copy( + args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) + if arg.dtype is not None else arg for arg in subkernel.args]) + + def __getinitargs__(self): + return (self.subkernel, self.arg_id_to_dtype, + self.arg_id_to_descr, self.name_in_target) + + @property + def name(self): + return self.subkernel.name + + def with_types(self, arg_id_to_dtype, kernel): + + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + new_args = [] + for arg in self.subkernel.args: + kw = arg.name + if kw in arg_id_to_dtype: + # id exists as kw + new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) + elif kw_to_pos[kw] in arg_id_to_dtype: + # id exists as positional argument + new_args.append(arg.copy( + dtype=arg_id_to_dtype[kw_to_pos[kw]])) + else: + new_args.append(arg) + + from loopy.type_inference import infer_unknown_types + pre_specialized_subkernel = self.subkernel.copy( + args=new_args) + + # infer the types of the written variables based on the knowledge + # of the types of the arguments supplied + specialized_kernel = infer_unknown_types(pre_specialized_subkernel, + expect_completion=True) + + new_arg_id_to_dtype = {} + for arg in specialized_kernel.args: + # associate the updated_arg_id_to_dtype with keyword as well as + # positional id + new_arg_id_to_dtype[arg.name] = arg.dtype + new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype + + # Return the kernel call with specialized subkernel and the corresponding + # new arg_id_to_dtype + return self.copy(subkernel=specialized_kernel, + arg_id_to_dtype=new_arg_id_to_dtype) + + def with_descrs(self, arg_id_to_descr): + + # tune the subkernel so 
that we have the matching shapes and + # dim_tags + + new_args = self.subkernel.args[:] + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + for arg_id, descr in arg_id_to_descr.items(): + if isinstance(arg_id, int): + arg_id = pos_to_kw[arg_id] + assert isinstance(arg_id, str) + + if isinstance(descr, ArrayArgDescriptor): + new_arg = self.subkernel.arg_dict[arg_id].copy( + shape=descr.shape, + dim_tags=descr.dim_tags, + address_space=descr.address_space) + # replacing the new arg with the arg of the same name + new_args = [new_arg if arg.name == arg_id else arg for arg in + new_args] + elif isinstance(descr, ValueArgDescriptor): + pass + else: + raise LoopyError("Descriptor must be either an instance of " + "ArrayArgDescriptor or ValueArgDescriptor -- got %s." % + type(descr)) + descriptor_specialized_knl = self.subkernel.copy(args=new_args) + + return self.copy(subkernel=descriptor_specialized_knl, + arg_id_to_descr=arg_id_to_descr) + + def with_packing_for_args(self): + from loopy.kernel.data import AddressSpace + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + arg_id_to_descr = {} + + for pos, kw in pos_to_kw.items(): + arg = self.subkernel.arg_dict[kw] + arg_id_to_descr[pos] = ArrayArgDescriptor( + shape=arg.shape, + dim_tags=arg.dim_tags, + address_space=AddressSpace.GLOBAL) + + return self.copy(subkernel=self.subkernel, + arg_id_to_descr=arg_id_to_descr) + + def with_hw_axes_sizes(self, gsize, lsize): + return self.copy( + subkernel=self.subkernel.copy( + overridden_get_grid_sizes_for_insn_ids=( + GridOverrideForCalleeKernel(lsize, gsize)))) + + def is_ready_for_codegen(self): + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None and + self.name_in_target is not None) + + def generate_preambles(self, target): + """ Yields the *target* specific preambles. + """ + # FIXME TODO: This is not correct, as the code code preamble generated + # during the code generationg of the child kernel, does not guarantee + # that this thing would be updated. 
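+        # For now, simply forward whatever preambles the callee kernel already
+        # carries; preambles that only materialize during the callee's own
+        # code generation are not picked up here (see the FIXME above).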
+ for preamble in self.subkernel.preambles: + yield preamble + + return + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + from pymbolic.primitives import CallWithKwargs + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + kw_parameters = {} + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + + assignees = insn.assignees + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + for i in range(len(parameters), len(parameters)+len(kw_parameters)): + parameters.append(kw_parameters[pos_to_kw[i]]) + par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) + + # insert the assigness at the required positions + assignee_write_count = -1 + for i, arg in enumerate(self.subkernel.args): + if arg.is_output_only: + assignee = assignees[-assignee_write_count-1] + parameters.insert(i, assignee) + par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) + assignee_write_count -= 1 + + # no type casting in array calls + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + + return var(self.name_in_target)(*c_parameters), False + +# }}} + + +# {{{ mangler callable + +class ManglerCallable(ScalarCallable): + """ + A callable whose characateristic is defined by a function mangler. + + .. attribute:: function_mangler + + A function of signature ``(kernel, name , arg_dtypes)`` and returns an + instance of ``loopy.CallMangleInfo``. + """ + fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", + "arg_id_to_descr", "name_in_target") + + def __init__(self, name, function_mangler, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + self.function_mangler = function_mangler + + super(ManglerCallable, self).__init__( + name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) + + def __getinitargs__(self): + return (self.name, self.function_mangler, self.arg_id_to_dtype, + self.arg_id_to_descr, self.name_in_target) + + def with_types(self, arg_id_to_dtype, kernel): + if self.arg_id_to_dtype is not None: + # specializing an already specialized function. + for arg_id, dtype in arg_id_to_dtype.items(): + # only checking for the ones which have been provided + # if does not match, returns an error. 
+                if self.arg_id_to_dtype[arg_id] != arg_id_to_dtype[arg_id]:
+                    raise LoopyError("Overwriting a specialized"
+                            " function is illegal--maybe start with new instance of"
+                            " ManglerCallable?")
+
+        sorted_keys = sorted(arg_id_to_dtype.keys())
+        arg_dtypes = tuple(arg_id_to_dtype[key] for key in sorted_keys if
+                key >= 0)
+
+        mangle_result = self.function_mangler(kernel, self.name,
+                arg_dtypes)
+        if mangle_result:
+            new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes))
+            new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in
+                enumerate(mangle_result.result_dtypes)))
+            return self.copy(name_in_target=mangle_result.target_name,
+                    arg_id_to_dtype=new_arg_id_to_dtype)
+        else:
+            # The function mangler does not agree with the arg_id_to_dtype
+            # provided, which is illegal.
+            raise LoopyError("Function %s (target %s) not coherent with the "
+                    "provided types." % (self.name, kernel.target))
+
+    def mangle_result(self, kernel):
+        """
+        Returns an instance of :class:`loopy.kernel.data.CallMangleInfo` for
+        the given pair :attr:`function_mangler` and :attr:`arg_id_to_dtype`.
+        """
+        sorted_keys = sorted(self.arg_id_to_dtype.keys())
+        arg_dtypes = tuple(self.arg_id_to_dtype[key] for key in sorted_keys if
+                key >= 0)
+
+        return self.function_mangler(kernel, self.name, arg_dtypes)
+
+# }}}
+
+
+# {{{ new pymbolic calls to scoped functions
+
+# FIXME Are these identifiers guaranteed to be available? Is there a var name
+# generator somewhere ensuring that that's the case?
+def next_indexed_variable(function):
+    """
+    Returns an instance of :class:`str` with the next indexed-name in the
+    sequence for the name of *function*.
+
+    **Example:** ``Variable('sin_0')`` will return ``'sin_1'``.
+
+    :arg function: Either an instance of :class:`pymbolic.primitives.Variable`
+        or :class:`loopy.reduction.ArgExtOp` or
+        :class:`loopy.reduction.SegmentedOp`.
+ """ + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(function, (ArgExtOp, SegmentedOp)): + return function.copy() + func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") + + match = func_name.match(function.name) + + if match is None: + if function.name[-1] == '_': + return "{old_name}0".format(old_name=function.name) + else: + return "{old_name}_0".format(old_name=function.name) + + return "{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1) + + +class ScopedFunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``expr_to_new_names`` + """ + + def __init__(self, rule_mapping_context, expr_to_new_names, subst_expander): + super(ScopedFunctionNameChanger, self).__init__(rule_mapping_context) + self.expr_to_new_names = expr_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.expr_to_new_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.expr_to_new_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + else: + return super(ScopedFunctionNameChanger, self).map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.expr_to_new_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + elif expanded_expr in self.expr_to_new_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return super(ScopedFunctionNameChanger, self).map_call_with_kwargs( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + +def register_pymbolic_calls_to_knl_callables(kernel, + pymbolic_exprs_to_knl_callables): + # FIXME This could use an example. I have no idea what this does. + # Surely I can't associate arbitrary pymbolic expresions (3+a?) + # with callables? + """ + Returns a copy of :arg:`kernel` which includes an association with the given + pymbolic expressions to the instances of :class:`InKernelCallable` for the + mapping given by :arg:`pymbolic_exprs_to_knl_calllables`. + + :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`. + + :arg pymbolic_exprs_to_knl_callables: A mapping from :mod:`pymbolic` expressions + to the instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. 
+ """ + + scoped_names_to_functions = kernel.scoped_functions.copy() + + # A dict containing the new scoped functions to the names which have been + # assigned to them + scoped_functions_to_names = {} + + # A dict containing the new name that need to be assigned to the + # corresponding pymbolic call + pymbolic_calls_to_new_names = {} + + for pymbolic_call, in_knl_callable in pymbolic_exprs_to_knl_callables.items(): + # checking if such a in-kernel callable already exists. + if in_knl_callable not in scoped_functions_to_names: + # No matching in_knl_callable found => make a new one with a new + # name. + if isinstance(pymbolic_call.function, Variable): + pymbolic_call_function = pymbolic_call.function + elif isinstance(pymbolic_call.function, ScopedFunction): + pymbolic_call_function = pymbolic_call.function.function + else: + raise NotImplementedError("Unknown type %s for pymbolic call " + "function" % type(pymbolic_call).__name__) + + unique_var = next_indexed_variable(pymbolic_call_function) + from loopy.library.reduction import ArgExtOp, SegmentedOp + while unique_var in scoped_names_to_functions and not isinstance( + unique_var, (ArgExtOp, SegmentedOp)): + # keep on finding new names till one a unique one is found. + unique_var = next_indexed_variable(Variable(unique_var)) + + # book-keeping of the functions and names mappings for later use + if isinstance(in_knl_callable, CallableKernel): + # for array calls the name in the target is the name of the + # scoped funciton + in_knl_callable = in_knl_callable.copy( + name_in_target=unique_var) + scoped_names_to_functions[unique_var] = in_knl_callable + scoped_functions_to_names[in_knl_callable] = unique_var + + pymbolic_calls_to_new_names[pymbolic_call] = ( + scoped_functions_to_names[in_knl_callable]) + + # Use the data populated in pymbolic_calls_to_new_names to change the + # names of the scoped functions of all the calls in the kernel. + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + scope_changer = ScopedFunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + scoped_kernel = scope_changer.map_kernel(kernel) + + return scoped_kernel.copy(scoped_functions=scoped_names_to_functions) + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py new file mode 100644 index 000000000..092cef887 --- /dev/null +++ b/loopy/transform/callable.py @@ -0,0 +1,631 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six + +import islpy as isl +from pymbolic.primitives import CallWithKwargs + +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, + CInstruction, _DataObliviousInstruction) +from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper +from loopy.isl_helpers import simplify_via_aff +from loopy.kernel.function_interface import (get_kw_pos_association, + register_pymbolic_calls_to_knl_callables) + + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_function_lookup + +.. autofunction:: register_callable_kernel +""" + + +# {{{ register function lookup + +def register_function_lookup(kernel, function_lookup): + """ + Returns a copy of *kernel* with the *function_lookup* registered. + + :arg function_lookup: A function of signature ``(target, identifier)`` + returning a :class:`loopy.kernel.function_interface.InKernelCallable`. + """ + + # adding the function lookup to the set of function lookers in the kernel. + if function_lookup not in kernel.function_scopers: + from loopy.tools import unpickles_equally + if not unpickles_equally(function_lookup): + raise LoopyError("function '%s' does not " + "compare equally after being upickled " + "and would disrupt loopy's caches" + % function_lookup) + new_function_scopers = kernel.function_scopers + [function_lookup] + registered_kernel = kernel.copy(function_scopers=new_function_scopers) + from loopy.kernel.creation import scope_functions + + # returning the scoped_version of the kernel, as new functions maybe + # resolved. + return scope_functions(registered_kernel) + +# }}} + + +# {{{ register_callable_kernel + +class _RegisterCalleeKernel(ImmutableRecord): + """ + Helper class to make the function scoper from + :func:`loopy.transform.register_callable_kernel` picklable. As python + cannot pickle lexical closures. + """ + fields = set(['function_name', 'callable_kernel']) + + def __init__(self, function_name, callable_kernel): + self.function_name = function_name + self.callable_kernel = callable_kernel + + def __call__(self, target, identifier): + if identifier == self.function_name: + return self.callable_kernel + return None + + +def register_callable_kernel(caller_kernel, function_name, callee_kernel): + """Returns a copy of *caller_kernel*, which would resolve *function_name* in an + expression as a call to *callee_kernel*. + + :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. + :arg function_name: An instance of :class:`str`. + :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. + """ + + # {{{ sanity checks + + assert isinstance(caller_kernel, LoopKernel) + assert isinstance(callee_kernel, LoopKernel) + assert isinstance(function_name, str) + + # check to make sure that the variables with 'out' direction is equal to + # the number of assigness in the callee kernel intructions. 
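+    # For instance, a callee with arguments (a: in, b: out, c: out) invoked
+    # as ``b, c = callee(a)`` is expected to appear with exactly two
+    # assignees and one parameter at the call site.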
+ from loopy.kernel.tools import infer_arg_is_output_only + callee_kernel = infer_arg_is_output_only(callee_kernel) + expected_num_assignees = len([arg for arg in callee_kernel.args if + arg.is_output_only]) + expected_num_parameters = len(callee_kernel.args) - expected_num_assignees + for insn in caller_kernel.instructions: + if isinstance(insn, CallInstruction) and ( + insn.expression.function.name == 'function_name'): + if insn.assignees != expected_num_assignees: + raise LoopyError("The number of arguments with 'out' direction " + "in callee kernel %s and the number of assignees in " + "instruction %s do not match." % ( + callee_kernel.name, insn.id)) + if insn.expression.prameters != expected_num_parameters: + raise LoopyError("The number of expected arguments " + "for the callee kernel %s and the number of parameters in " + "instruction %s do not match." % ( + callee_kernel.name, insn.id)) + + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("unknown instruction %s" % type(insn)) + + # }}} + + # making the target of the child kernel to be same as the target of parent + # kernel. + callable_kernel = CallableKernel(subkernel=callee_kernel.copy( + target=caller_kernel.target, + name=function_name, + is_called_from_host=False)) + + # FIXME disabling global barriers for callee kernel (for now) + from loopy import set_options + callee_kernel = set_options(callee_kernel, "disable_global_barriers") + + return register_function_lookup(caller_kernel, + _RegisterCalleeKernel(function_name, callable_kernel)) + +# }}} + + +# {{{ callee scoped calls collector (to support inlining) + +class CalleeScopedCallsCollector(CombineMapper): + """ + Collects the scoped functions which are a part of the callee kernel and + must be transferred to the caller kernel before inlining. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the caller kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline. + """ + + def __init__(self, callee_scoped_functions): + self.callee_scoped_functions = callee_scoped_functions + + def combine(self, values): + import operator + from functools import reduce + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters))) + else: + return self.combine((self.rec(child) for child in expr.parameters)) + + def map_call_with_kwargs(self, expr): + if expr.function.name in self.callee_scoped_functions: + return (frozenset([(expr, + self.callee_scoped_functions[expr.function.name])]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + +# }}} + + +# {{{ kernel inliner mapper + +class KernelInliner(SubstitutionMapper): + """Mapper to replace variables (indices, temporaries, arguments) in the + callee kernel with variables in the caller kernel. 
+ + :arg caller: the caller kernel + :arg arg_map: dict of argument name to variables in caller + :arg arg_dict: dict of argument name to arguments in callee + """ + + def __init__(self, subst_func, caller, arg_map, arg_dict): + super(KernelInliner, self).__init__(subst_func) + self.caller = caller + self.arg_map = arg_map + self.arg_dict = arg_dict + + def map_subscript(self, expr): + if expr.aggregate.name in self.arg_map: + + aggregate = self.subst_func(expr.aggregate) + sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller + callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee + if aggregate.name in self.caller.arg_dict: + caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller + else: + caller_arg = self.caller.temporary_variables[aggregate.name] + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g. A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in callee_arg.shape): + raise LoopyError( + "Argument: {0} in callee kernel: {1} does not have " + "constant shape.".format(callee_arg)) + + flatten_index = 0 + for i, idx in enumerate(sar.get_begin_subscript().index_tuple): + flatten_index += idx*caller_arg.dim_tags[i].stride + + flatten_index += sum( + idx * tag.stride + for idx, tag in zip(outer_indices, callee_arg.dim_tags)) + + from loopy.isl_helpers import simplify_via_aff + flatten_index = simplify_via_aff(flatten_index) + + new_indices = [] + for dim_tag in caller_arg.dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + +# }}} + + +# {{{ inlining of a single call instruction + +def _inline_call_instruction(kernel, callee_knl, instruction): + """ + Returns a copy of *kernel* with the *instruction* in the *kernel* + replaced by inlining :attr:`subkernel` within it. + """ + callee_label = callee_knl.name[:4] + "_" + + # {{{ duplicate and rename inames + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + dim_type = isl.dim_type.set + + iname_map = {} + for iname in callee_knl.all_inames(): + iname_map[iname] = vng(callee_label+iname) + + new_domains = [] + new_iname_to_tags = kernel.iname_to_tags.copy() + + # transferring iname tags info from the callee to the caller kernel + for domain in callee_knl.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + + if iname in callee_knl.iname_to_tags: + new_iname_to_tags[iname_map[iname]] = ( + callee_knl.iname_to_tags[iname]) + new_domain = new_domain.set_dim_name( + dim_type, i, iname_map[iname]) + new_domains.append(new_domain) + + kernel = kernel.copy(domains=kernel.domains + new_domains, + iname_to_tags=new_iname_to_tags) + + # }}} + + # {{{ rename temporaries + + temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(callee_knl.temporary_variables): + new_name = vng(callee_label+name) + temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + arg_map = {} # callee arg name -> caller symbols (e.g. 
SubArrayRef) + + assignees = instruction.assignees # writes + parameters = instruction.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + if isinstance(instruction.expression, CallWithKwargs): + from loopy.kernel.function_interface import get_kw_pos_association + + _, pos_to_kw = get_kw_pos_association(callee_knl) + kw_parameters = instruction.expression.kw_parameters + for i in range(len(parameters), len(parameters) + len(kw_parameters)): + parameters = parameters + (kw_parameters[pos_to_kw[i]],) + + assignee_pos = 0 + parameter_pos = 0 + for i, arg in enumerate(callee_knl.args): + if arg.is_output_only: + arg_map[arg.name] = assignees[assignee_pos] + assignee_pos += 1 + else: + arg_map[arg.name] = parameters[parameter_pos] + parameter_pos += 1 + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner( + make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) + + insn_id = {} + for insn in callee_knl.instructions: + insn_id[insn.id] = ing(callee_label+insn.id) + + # {{{ root and leave instructions in callee kernel + + dep_map = callee_knl.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=instruction.within_inames, + depends_on=instruction.depends_on + ) + noop_end = NoOpInstruction( + id=instruction.id, + within_inames=instruction.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for insn in callee_knl.instructions: + insn = insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(iname_map.get, insn.within_inames)) + within_inames = within_inames | instruction.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( + instruction.depends_on) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == instruction: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + # {{{ transferring the scoped functions from callee to caller + + callee_scoped_calls_collector = CalleeScopedCallsCollector( + callee_knl.scoped_functions) + callee_scoped_calls_dict = {} + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( + insn.expression))) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + 
pass + else: + raise NotImplementedError("Unknown type of instruction %s." % type( + insn)) + + from loopy.kernel.function_interface import ( + register_pymbolic_calls_to_knl_callables) + kernel = register_pymbolic_calls_to_knl_callables(kernel, + callee_scoped_calls_dict) + + # }}} + + return kernel + +# }}} + + +# {{{ inline callable kernel + +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. +def inline_callable_kernel(kernel, function_name): + """ + Returns a copy of *kernel* with the callable kernel addressed by + (scoped) name *function_name* inlined. + """ + from loopy.preprocess import infer_arg_descr + kernel = infer_arg_descr(kernel) + + old_insns = kernel.instructions + for insn in old_insns: + if isinstance(insn, CallInstruction): + # FIXME This seems to use identifiers across namespaces. Why not + # check whether the function is a scoped function first? + if insn.expression.function.name in kernel.scoped_functions: + in_knl_callable = kernel.scoped_functions[ + insn.expression.function.name] + from loopy.kernel.function_interface import CallableKernel + if isinstance(in_knl_callable, CallableKernel) and ( + in_knl_callable.subkernel.name == function_name): + kernel = _inline_call_instruction( + kernel, in_knl_callable.subkernel, insn) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown instruction type %s" + % type(insn).__name__) + + return kernel + +# }}} + + +# {{{ tools to match caller to callee args by (guessed) automatic reshaping + +# (This is undocumented and not recommended, but it is currently needed +# to support Firedrake.) + +class DimChanger(IdentityMapper): + """ + Mapper to change the dimensions of an argument. + + .. attribute:: callee_arg_dict + + A mapping from the argument name (:class:`str`) to instances of + :class:`loopy.kernel.array.ArrayBase`. + + .. attribute:: desried_shape + + A mapping from argument name (:class:`str`) to an instance of + :class:`tuple`. + """ + def __init__(self, callee_arg_dict, desired_shape): + self.callee_arg_dict = callee_arg_dict + self.desired_shape = desired_shape + + def map_subscript(self, expr): + callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags + flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in + zip(callee_arg_dim_tags, expr.index_tuple)) + new_indices = [] + + from operator import mul + from functools import reduce + stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) + + for length in self.desired_shape[expr.aggregate.name]: + stride /= length + ind = flattened_index // int(stride) + flattened_index -= (int(stride) * ind) + new_indices.append(simplify_via_aff(ind)) + + return expr.aggregate.index(tuple(new_indices)) + + +def _match_caller_callee_argument_dimension(caller_knl, callee_function_name): + """ + Returns a copy of *caller_knl* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *caller_knl* aligned with the argument + dimesnsions required by *caller_knl*. + """ + pymbolic_calls_to_new_callables = {} + for insn in caller_knl.instructions: + if not isinstance(insn, CallInstruction) or ( + insn.expression.function.name not in + caller_knl.scoped_functions): + # Call to a callable kernel can only occur through a + # CallInstruction. 
+ continue + + in_knl_callable = caller_knl.scoped_functions[ + insn.expression.function.name] + + if in_knl_callable.subkernel.name != callee_function_name: + # Not the callable we're looking for. + continue + + # getting the caller->callee arg association + + parameters = insn.expression.parameters[:] + kw_parameters = {} + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + + assignees = insn.assignees + + parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape + for par in parameters] + kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) + for i in range(len(parameters), len(parameters)+len(kw_parameters)): + parameter_shapes.append(kw_parameters[pos_to_kw[i]] + .get_array_arg_descriptor(caller_knl).shape) + + # inserting the assigness at the required positions. + assignee_write_count = -1 + for i, arg in enumerate(in_knl_callable.subkernel.args): + if arg.is_output_only: + assignee = assignees[-assignee_write_count-1] + parameter_shapes.insert(i, assignee + .get_array_arg_descriptor(caller_knl).shape) + assignee_write_count -= 1 + + callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in + in_knl_callable.subkernel.args], parameter_shapes)) + dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, + callee_arg_to_desired_dim_tag) + new_callee_insns = [] + for callee_insn in in_knl_callable.subkernel.instructions: + if isinstance(callee_insn, MultiAssignmentBase): + new_callee_insns.append(callee_insn.copy(expression=dim_changer( + callee_insn.expression), + assignee=dim_changer(callee_insn.assignee))) + elif isinstance(callee_insn, (CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknwon instruction %s." % + type(insn)) + + # subkernel with instructions adjusted according to the new dimensions. + new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) + + new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) + + pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable + + if not pymbolic_calls_to_new_callables: + # complain if no matching function found. + raise LoopyError("No CallableKernel with the name %s found in %s." % ( + callee_function_name, caller_knl.name)) + + return register_pymbolic_calls_to_knl_callables(caller_knl, + pymbolic_calls_to_new_callables) + +# }}} + + +# vim: foldmethod=marker -- GitLab From 2240fda99160a8deac0d62bd10e05d181522d066 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Jul 2018 20:30:00 -0500 Subject: [PATCH 225/916] removes conflict in constant arg is_output_onlt --- loopy/kernel/tools.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 1d79a86d7..95c3c336c 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1877,15 +1877,7 @@ def infer_arg_is_output_only(kernel): else: new_args.append(arg.copy(is_output_only=False)) elif isinstance(arg, ConstantArg): -<<<<<<< HEAD - if arg.is_output_only: - raise LoopyError("Constant Argument %s cannot have " - "is_output_only True" % arg.name) - else: - new_args.append(arg.copy(is_output_only=False)) -======= new_args.append(arg) ->>>>>>> master else: raise NotImplementedError("Unkonwn argument type %s." % type(arg)) -- GitLab From 359c9ebc78ab42152e0918bd7ca78ca2db9ff224 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Jul 2018 20:32:40 -0500 Subject: [PATCH 226/916] no callable kernel till now. 
--- loopy/check.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index dd96c1ba6..dd1cbf3d1 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -213,7 +213,6 @@ def check_multiple_tags_allowed(kernel): def check_for_double_use_of_hw_axes(kernel): from loopy.kernel.data import UniqueTag - from loopy.kernel.instruction import CallInstruction for insn in kernel.instructions: insn_tag_keys = set() @@ -226,21 +225,6 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) - # check usage of iname tags in the callee kernel - if isinstance(insn, CallInstruction): - in_knl_callable = kernel.scoped_functions[ - insn.expression.function.name] - if isinstance(in_knl_callable, CallableKernel): - # check for collision in iname_tag keys in the instruction - # due to the callee kernel - common_iname_tags = [tag for tag in - _get_all_unique_iname_tags(in_knl_callable.subkernel) - if tag.key in insn_tag_keys] - if common_iname_tags: - raise LoopyError("instruction '%s' has multiple " - "inames tagged '%s'" % (insn.id, - common_iname_tags.pop())) - def check_for_inactive_iname_access(kernel): for insn in kernel.instructions: -- GitLab From 76dd368a1669e87a6a2894fd139e4423cc49dfcd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Jul 2018 20:35:57 -0500 Subject: [PATCH 227/916] no callable kernel --- loopy/transform/callable.py | 554 ------------------------------------ 1 file changed, 554 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 092cef887..44f994e9e 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -43,8 +43,6 @@ __doc__ = """ .. currentmodule:: loopy .. autofunction:: register_function_lookup - -.. autofunction:: register_callable_kernel """ @@ -76,556 +74,4 @@ def register_function_lookup(kernel, function_lookup): # }}} - -# {{{ register_callable_kernel - -class _RegisterCalleeKernel(ImmutableRecord): - """ - Helper class to make the function scoper from - :func:`loopy.transform.register_callable_kernel` picklable. As python - cannot pickle lexical closures. - """ - fields = set(['function_name', 'callable_kernel']) - - def __init__(self, function_name, callable_kernel): - self.function_name = function_name - self.callable_kernel = callable_kernel - - def __call__(self, target, identifier): - if identifier == self.function_name: - return self.callable_kernel - return None - - -def register_callable_kernel(caller_kernel, function_name, callee_kernel): - """Returns a copy of *caller_kernel*, which would resolve *function_name* in an - expression as a call to *callee_kernel*. - - :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - :arg function_name: An instance of :class:`str`. - :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - """ - - # {{{ sanity checks - - assert isinstance(caller_kernel, LoopKernel) - assert isinstance(callee_kernel, LoopKernel) - assert isinstance(function_name, str) - - # check to make sure that the variables with 'out' direction is equal to - # the number of assigness in the callee kernel intructions. 
- from loopy.kernel.tools import infer_arg_is_output_only - callee_kernel = infer_arg_is_output_only(callee_kernel) - expected_num_assignees = len([arg for arg in callee_kernel.args if - arg.is_output_only]) - expected_num_parameters = len(callee_kernel.args) - expected_num_assignees - for insn in caller_kernel.instructions: - if isinstance(insn, CallInstruction) and ( - insn.expression.function.name == 'function_name'): - if insn.assignees != expected_num_assignees: - raise LoopyError("The number of arguments with 'out' direction " - "in callee kernel %s and the number of assignees in " - "instruction %s do not match." % ( - callee_kernel.name, insn.id)) - if insn.expression.prameters != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of parameters in " - "instruction %s do not match." % ( - callee_kernel.name, insn.id)) - - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("unknown instruction %s" % type(insn)) - - # }}} - - # making the target of the child kernel to be same as the target of parent - # kernel. - callable_kernel = CallableKernel(subkernel=callee_kernel.copy( - target=caller_kernel.target, - name=function_name, - is_called_from_host=False)) - - # FIXME disabling global barriers for callee kernel (for now) - from loopy import set_options - callee_kernel = set_options(callee_kernel, "disable_global_barriers") - - return register_function_lookup(caller_kernel, - _RegisterCalleeKernel(function_name, callable_kernel)) - -# }}} - - -# {{{ callee scoped calls collector (to support inlining) - -class CalleeScopedCallsCollector(CombineMapper): - """ - Collects the scoped functions which are a part of the callee kernel and - must be transferred to the caller kernel before inlining. - - :returns: - An :class:`frozenset` of function names that are not scoped in - the caller kernel. - - .. note:: - :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are - never scoped in the pipeline. - """ - - def __init__(self, callee_scoped_functions): - self.callee_scoped_functions = callee_scoped_functions - - def combine(self, values): - import operator - from functools import reduce - return reduce(operator.or_, values, frozenset()) - - def map_call(self, expr): - if expr.function.name in self.callee_scoped_functions: - return (frozenset([(expr, - self.callee_scoped_functions[expr.function.name])]) | - self.combine((self.rec(child) for child in expr.parameters))) - else: - return self.combine((self.rec(child) for child in expr.parameters)) - - def map_call_with_kwargs(self, expr): - if expr.function.name in self.callee_scoped_functions: - return (frozenset([(expr, - self.callee_scoped_functions[expr.function.name])]) | - self.combine((self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values())))) - else: - return self.combine((self.rec(child) for child in - expr.parameters+tuple(expr.kw_parameters.values()))) - - def map_constant(self, expr): - return frozenset() - - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant - -# }}} - - -# {{{ kernel inliner mapper - -class KernelInliner(SubstitutionMapper): - """Mapper to replace variables (indices, temporaries, arguments) in the - callee kernel with variables in the caller kernel. 
- - :arg caller: the caller kernel - :arg arg_map: dict of argument name to variables in caller - :arg arg_dict: dict of argument name to arguments in callee - """ - - def __init__(self, subst_func, caller, arg_map, arg_dict): - super(KernelInliner, self).__init__(subst_func) - self.caller = caller - self.arg_map = arg_map - self.arg_dict = arg_dict - - def map_subscript(self, expr): - if expr.aggregate.name in self.arg_map: - - aggregate = self.subst_func(expr.aggregate) - sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller - callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee - if aggregate.name in self.caller.arg_dict: - caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller - else: - caller_arg = self.caller.temporary_variables[aggregate.name] - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in callee_arg.shape): - raise LoopyError( - "Argument: {0} in callee kernel: {1} does not have " - "constant shape.".format(callee_arg)) - - flatten_index = 0 - for i, idx in enumerate(sar.get_begin_subscript().index_tuple): - flatten_index += idx*caller_arg.dim_tags[i].stride - - flatten_index += sum( - idx * tag.stride - for idx, tag in zip(outer_indices, callee_arg.dim_tags)) - - from loopy.isl_helpers import simplify_via_aff - flatten_index = simplify_via_aff(flatten_index) - - new_indices = [] - for dim_tag in caller_arg.dim_tags: - ind = flatten_index // dim_tag.stride - flatten_index -= (dim_tag.stride * ind) - new_indices.append(ind) - - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - -# }}} - - -# {{{ inlining of a single call instruction - -def _inline_call_instruction(kernel, callee_knl, instruction): - """ - Returns a copy of *kernel* with the *instruction* in the *kernel* - replaced by inlining :attr:`subkernel` within it. - """ - callee_label = callee_knl.name[:4] + "_" - - # {{{ duplicate and rename inames - - vng = kernel.get_var_name_generator() - ing = kernel.get_instruction_id_generator() - dim_type = isl.dim_type.set - - iname_map = {} - for iname in callee_knl.all_inames(): - iname_map[iname] = vng(callee_label+iname) - - new_domains = [] - new_iname_to_tags = kernel.iname_to_tags.copy() - - # transferring iname tags info from the callee to the caller kernel - for domain in callee_knl.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - - if iname in callee_knl.iname_to_tags: - new_iname_to_tags[iname_map[iname]] = ( - callee_knl.iname_to_tags[iname]) - new_domain = new_domain.set_dim_name( - dim_type, i, iname_map[iname]) - new_domains.append(new_domain) - - kernel = kernel.copy(domains=kernel.domains + new_domains, - iname_to_tags=new_iname_to_tags) - - # }}} - - # {{{ rename temporaries - - temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(callee_knl.temporary_variables): - new_name = vng(callee_label+name) - temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - arg_map = {} # callee arg name -> caller symbols (e.g. 
SubArrayRef) - - assignees = instruction.assignees # writes - parameters = instruction.expression.parameters # reads - - # add keyword parameters - from pymbolic.primitives import CallWithKwargs - - if isinstance(instruction.expression, CallWithKwargs): - from loopy.kernel.function_interface import get_kw_pos_association - - _, pos_to_kw = get_kw_pos_association(callee_knl) - kw_parameters = instruction.expression.kw_parameters - for i in range(len(parameters), len(parameters) + len(kw_parameters)): - parameters = parameters + (kw_parameters[pos_to_kw[i]],) - - assignee_pos = 0 - parameter_pos = 0 - for i, arg in enumerate(callee_knl.args): - if arg.is_output_only: - arg_map[arg.name] = assignees[assignee_pos] - assignee_pos += 1 - else: - arg_map[arg.name] = parameters[parameter_pos] - parameter_pos += 1 - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(arg_map))) - subst_mapper = KernelInliner( - make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) - - insn_id = {} - for insn in callee_knl.instructions: - insn_id[insn.id] = ing(callee_label+insn.id) - - # {{{ root and leave instructions in callee kernel - - dep_map = callee_knl.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of callee kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing(callee_label+"_start"), - within_inames=instruction.within_inames, - depends_on=instruction.depends_on - ) - noop_end = NoOpInstruction( - id=instruction.id, - within_inames=instruction.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for insn in callee_knl.instructions: - insn = insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(iname_map.get, insn.within_inames)) - within_inames = within_inames | instruction.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( - instruction.depends_on) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == instruction: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} - - # {{{ transferring the scoped functions from callee to caller - - callee_scoped_calls_collector = CalleeScopedCallsCollector( - callee_knl.scoped_functions) - callee_scoped_calls_dict = {} - - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( - insn.expression))) - elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): - 
pass - else: - raise NotImplementedError("Unknown type of instruction %s." % type( - insn)) - - from loopy.kernel.function_interface import ( - register_pymbolic_calls_to_knl_callables) - kernel = register_pymbolic_calls_to_knl_callables(kernel, - callee_scoped_calls_dict) - - # }}} - - return kernel - -# }}} - - -# {{{ inline callable kernel - -# FIXME This should take a 'within' parameter to be able to only inline -# *some* calls to a kernel, but not others. -def inline_callable_kernel(kernel, function_name): - """ - Returns a copy of *kernel* with the callable kernel addressed by - (scoped) name *function_name* inlined. - """ - from loopy.preprocess import infer_arg_descr - kernel = infer_arg_descr(kernel) - - old_insns = kernel.instructions - for insn in old_insns: - if isinstance(insn, CallInstruction): - # FIXME This seems to use identifiers across namespaces. Why not - # check whether the function is a scoped function first? - if insn.expression.function.name in kernel.scoped_functions: - in_knl_callable = kernel.scoped_functions[ - insn.expression.function.name] - from loopy.kernel.function_interface import CallableKernel - if isinstance(in_knl_callable, CallableKernel) and ( - in_knl_callable.subkernel.name == function_name): - kernel = _inline_call_instruction( - kernel, in_knl_callable.subkernel, insn) - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError( - "Unknown instruction type %s" - % type(insn).__name__) - - return kernel - -# }}} - - -# {{{ tools to match caller to callee args by (guessed) automatic reshaping - -# (This is undocumented and not recommended, but it is currently needed -# to support Firedrake.) - -class DimChanger(IdentityMapper): - """ - Mapper to change the dimensions of an argument. - - .. attribute:: callee_arg_dict - - A mapping from the argument name (:class:`str`) to instances of - :class:`loopy.kernel.array.ArrayBase`. - - .. attribute:: desried_shape - - A mapping from argument name (:class:`str`) to an instance of - :class:`tuple`. - """ - def __init__(self, callee_arg_dict, desired_shape): - self.callee_arg_dict = callee_arg_dict - self.desired_shape = desired_shape - - def map_subscript(self, expr): - callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags - flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in - zip(callee_arg_dim_tags, expr.index_tuple)) - new_indices = [] - - from operator import mul - from functools import reduce - stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) - - for length in self.desired_shape[expr.aggregate.name]: - stride /= length - ind = flattened_index // int(stride) - flattened_index -= (int(stride) * ind) - new_indices.append(simplify_via_aff(ind)) - - return expr.aggregate.index(tuple(new_indices)) - - -def _match_caller_callee_argument_dimension(caller_knl, callee_function_name): - """ - Returns a copy of *caller_knl* with the instance of - :class:`loopy.kernel.function_interface.CallableKernel` addressed by - *callee_function_name* in the *caller_knl* aligned with the argument - dimesnsions required by *caller_knl*. - """ - pymbolic_calls_to_new_callables = {} - for insn in caller_knl.instructions: - if not isinstance(insn, CallInstruction) or ( - insn.expression.function.name not in - caller_knl.scoped_functions): - # Call to a callable kernel can only occur through a - # CallInstruction. 
- continue - - in_knl_callable = caller_knl.scoped_functions[ - insn.expression.function.name] - - if in_knl_callable.subkernel.name != callee_function_name: - # Not the callable we're looking for. - continue - - # getting the caller->callee arg association - - parameters = insn.expression.parameters[:] - kw_parameters = {} - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - - assignees = insn.assignees - - parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape - for par in parameters] - kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) - for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameter_shapes.append(kw_parameters[pos_to_kw[i]] - .get_array_arg_descriptor(caller_knl).shape) - - # inserting the assigness at the required positions. - assignee_write_count = -1 - for i, arg in enumerate(in_knl_callable.subkernel.args): - if arg.is_output_only: - assignee = assignees[-assignee_write_count-1] - parameter_shapes.insert(i, assignee - .get_array_arg_descriptor(caller_knl).shape) - assignee_write_count -= 1 - - callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in - in_knl_callable.subkernel.args], parameter_shapes)) - dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, - callee_arg_to_desired_dim_tag) - new_callee_insns = [] - for callee_insn in in_knl_callable.subkernel.instructions: - if isinstance(callee_insn, MultiAssignmentBase): - new_callee_insns.append(callee_insn.copy(expression=dim_changer( - callee_insn.expression), - assignee=dim_changer(callee_insn.assignee))) - elif isinstance(callee_insn, (CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknwon instruction %s." % - type(insn)) - - # subkernel with instructions adjusted according to the new dimensions. - new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) - - new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) - - pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable - - if not pymbolic_calls_to_new_callables: - # complain if no matching function found. - raise LoopyError("No CallableKernel with the name %s found in %s." % ( - callee_function_name, caller_knl.name)) - - return register_pymbolic_calls_to_knl_callables(caller_knl, - pymbolic_calls_to_new_callables) - -# }}} - - # vim: foldmethod=marker -- GitLab From 91a42f59b006b2b310b1ba661a9428052e9516ff Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Jul 2018 20:40:43 -0500 Subject: [PATCH 228/916] Minor hunk editing again. --- loopy/kernel/function_interface.py | 215 ----------------------------- loopy/transform/callable.py | 14 -- test/test_callables.py | 68 +++++++++ 3 files changed, 68 insertions(+), 229 deletions(-) create mode 100644 test/test_callables.py diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index edb222ec2..ddfe9b73e 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -468,215 +468,6 @@ class ScalarCallable(InKernelCallable): # }}} -# {{{ callable kernel - -class CallableKernel(InKernelCallable): - """ - Records informations about a callee kernel. Also provides interface through - member methods to make the callee kernel compatible to be called from a - caller kernel. The :meth:`loopy.register_callable_kernel` should be called - in order to initiate association between a function in caller kernel and - the callee kernel. 
- - :meth:`CallableKernel.with_types` should be called in order to match - the ``dtypes`` of the arguments that are shared between the caller and the - callee kernel. - - :meth:`CallableKernel.with_descrs` should be called in order to match - :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape`, - :attr:`ArrayArgDescriptor.address_space`` of the arguments shared between the - caller and the callee kernel. - - :meth:`CallableKernel.with_hw_axes` should be called to set the grid - sizes for the :attr:`subkernel` of the callable. - """ - - fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) - init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target") - - def __init__(self, subkernel, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): - - super(CallableKernel, self).__init__( - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr) - - self.name_in_target = name_in_target - self.subkernel = subkernel.copy( - args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) - if arg.dtype is not None else arg for arg in subkernel.args]) - - def __getinitargs__(self): - return (self.subkernel, self.arg_id_to_dtype, - self.arg_id_to_descr, self.name_in_target) - - @property - def name(self): - return self.subkernel.name - - def with_types(self, arg_id_to_dtype, kernel): - - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - new_args = [] - for arg in self.subkernel.args: - kw = arg.name - if kw in arg_id_to_dtype: - # id exists as kw - new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) - elif kw_to_pos[kw] in arg_id_to_dtype: - # id exists as positional argument - new_args.append(arg.copy( - dtype=arg_id_to_dtype[kw_to_pos[kw]])) - else: - new_args.append(arg) - - from loopy.type_inference import infer_unknown_types - pre_specialized_subkernel = self.subkernel.copy( - args=new_args) - - # infer the types of the written variables based on the knowledge - # of the types of the arguments supplied - specialized_kernel = infer_unknown_types(pre_specialized_subkernel, - expect_completion=True) - - new_arg_id_to_dtype = {} - for arg in specialized_kernel.args: - # associate the updated_arg_id_to_dtype with keyword as well as - # positional id - new_arg_id_to_dtype[arg.name] = arg.dtype - new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype - - # Return the kernel call with specialized subkernel and the corresponding - # new arg_id_to_dtype - return self.copy(subkernel=specialized_kernel, - arg_id_to_dtype=new_arg_id_to_dtype) - - def with_descrs(self, arg_id_to_descr): - - # tune the subkernel so that we have the matching shapes and - # dim_tags - - new_args = self.subkernel.args[:] - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - for arg_id, descr in arg_id_to_descr.items(): - if isinstance(arg_id, int): - arg_id = pos_to_kw[arg_id] - assert isinstance(arg_id, str) - - if isinstance(descr, ArrayArgDescriptor): - new_arg = self.subkernel.arg_dict[arg_id].copy( - shape=descr.shape, - dim_tags=descr.dim_tags, - address_space=descr.address_space) - # replacing the new arg with the arg of the same name - new_args = [new_arg if arg.name == arg_id else arg for arg in - new_args] - elif isinstance(descr, ValueArgDescriptor): - pass - else: - raise LoopyError("Descriptor must be either an instance of " - "ArrayArgDescriptor or ValueArgDescriptor -- got %s." 
% - type(descr)) - descriptor_specialized_knl = self.subkernel.copy(args=new_args) - - return self.copy(subkernel=descriptor_specialized_knl, - arg_id_to_descr=arg_id_to_descr) - - def with_packing_for_args(self): - from loopy.kernel.data import AddressSpace - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - arg_id_to_descr = {} - - for pos, kw in pos_to_kw.items(): - arg = self.subkernel.arg_dict[kw] - arg_id_to_descr[pos] = ArrayArgDescriptor( - shape=arg.shape, - dim_tags=arg.dim_tags, - address_space=AddressSpace.GLOBAL) - - return self.copy(subkernel=self.subkernel, - arg_id_to_descr=arg_id_to_descr) - - def with_hw_axes_sizes(self, gsize, lsize): - return self.copy( - subkernel=self.subkernel.copy( - overridden_get_grid_sizes_for_insn_ids=( - GridOverrideForCalleeKernel(lsize, gsize)))) - - def is_ready_for_codegen(self): - return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None and - self.name_in_target is not None) - - def generate_preambles(self, target): - """ Yields the *target* specific preambles. - """ - # FIXME TODO: This is not correct, as the code code preamble generated - # during the code generationg of the child kernel, does not guarantee - # that this thing would be updated. - for preamble in self.subkernel.preambles: - yield preamble - - return - - def emit_call_insn(self, insn, target, expression_to_code_mapper): - - assert self.is_ready_for_codegen() - - from loopy.kernel.instruction import CallInstruction - from pymbolic.primitives import CallWithKwargs - - assert isinstance(insn, CallInstruction) - - parameters = insn.expression.parameters - kw_parameters = {} - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - - assignees = insn.assignees - - parameters = list(parameters) - par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameters.append(kw_parameters[pos_to_kw[i]]) - par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) - - # insert the assigness at the required positions - assignee_write_count = -1 - for i, arg in enumerate(self.subkernel.args): - if arg.is_output_only: - assignee = assignees[-assignee_write_count-1] - parameters.insert(i, assignee) - par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) - assignee_write_count -= 1 - - # no type casting in array calls - from loopy.expression import dtype_to_type_context - from pymbolic.mapper.stringifier import PREC_NONE - from loopy.symbolic import SubArrayRef - from pymbolic import var - - c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr if isinstance(par, SubArrayRef) else - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr - for par, par_dtype in zip( - parameters, par_dtypes)] - - return var(self.name_in_target)(*c_parameters), False - -# }}} - - # {{{ mangler callable class ManglerCallable(ScalarCallable): @@ -892,12 +683,6 @@ def register_pymbolic_calls_to_knl_callables(kernel, # keep on finding new names till one a unique one is found. 
unique_var = next_indexed_variable(Variable(unique_var)) - # book-keeping of the functions and names mappings for later use - if isinstance(in_knl_callable, CallableKernel): - # for array calls the name in the target is the name of the - # scoped funciton - in_knl_callable = in_knl_callable.copy( - name_in_target=unique_var) scoped_names_to_functions[unique_var] = in_knl_callable scoped_functions_to_names[in_knl_callable] = unique_var diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 44f994e9e..789dff2eb 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -22,21 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six - -import islpy as isl -from pymbolic.primitives import CallWithKwargs - -from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel -from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, - CInstruction, _DataObliviousInstruction) -from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper -from loopy.isl_helpers import simplify_via_aff -from loopy.kernel.function_interface import (get_kw_pos_association, - register_pymbolic_calls_to_knl_callables) __doc__ = """ diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 000000000..735f16514 --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,68 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom # noqa: F401 +import loopy as lp +import sys + + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import register_log2_lookup + + x = np.random.rand(10) + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + knl = lp.register_function_lookup(knl, register_log2_lookup) + + evt, (out, ) = knl(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: foldmethod=marker -- GitLab From 96791efeff9475be562c1268e40fa770fd7610ef Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Jul 2018 20:58:51 -0500 Subject: [PATCH 229/916] Flake8 fixes. --- loopy/codegen/__init__.py | 8 +++----- loopy/kernel/creation.py | 6 +----- loopy/symbolic.py | 8 -------- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 16fef45b5..f93031a97 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -32,12 +32,10 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION -from cgen import Collection -from loopy.symbolic import CombineMapper +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) -from loopy.kernel.instruction import ( - Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, - CInstruction, _DataObliviousInstruction, MultiAssignmentBase) +from loopy.symbolic import CombineMapper from functools import reduce diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 8b371b47d..3fa952133 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -27,17 +27,13 @@ THE SOFTWARE. 
import numpy as np from pymbolic.mapper import CSECachingMapperMixin -from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids -from loopy.symbolic import ( - IdentityMapper, WalkMapper, SubArrayRef, +from loopy.symbolic import (IdentityMapper, WalkMapper, RuleAwareIdentityMapper) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, SubstitutionRule, AddressSpace) -from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, - CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 770e1128a..f060bf8b7 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1172,14 +1172,6 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: return IdentityMapper.map_call(self, expr) - def map_call_with_kwargs(self, expr): - for par in expr.kw_parameters.values(): - if not isinstance(par, SubArrayRef): - raise LoopyError("Keyword Arguments is only supported for" - " array arguments--use positional order to specify" - " the order of the arguments in the call.") - return IdentityMapper.map_call_with_kwargs(self, expr) - # {{{ customization to pymbolic parser -- GitLab From 335153b471d81bf30829a8461c6a4bc7a2f97416 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Jul 2018 22:34:19 -0500 Subject: [PATCH 230/916] Isolating just eh function interface for now. --- examples/python/call-external.py | 105 ------------------------------- loopy/preprocess.py | 21 ++----- 2 files changed, 5 insertions(+), 121 deletions(-) delete mode 100644 examples/python/call-external.py diff --git a/examples/python/call-external.py b/examples/python/call-external.py deleted file mode 100644 index 904270472..000000000 --- a/examples/python/call-external.py +++ /dev/null @@ -1,105 +0,0 @@ -import loopy as lp -import numpy as np -from loopy.diagnostic import LoopyError -from loopy.target.c import CTarget - - -# {{{ blas callable - -class BLASCallable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel): - for i in range(0, 2): - if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: - # the types provided aren't mature enough to specialize the - # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) - - mat_dtype = arg_id_to_dtype[0].numpy_dtype - vec_dtype = arg_id_to_dtype[1].numpy_dtype - - if mat_dtype != vec_dtype: - raise LoopyError("DGEMV should have same dtype for matrix and " - "vector") - - if vec_dtype == np.float32: - name_in_target = "cblas_sgemv" - elif vec_dtype == np.float64: - name_in_target = "cblas_dgemv" - else: - raise LoopyError("GEMV only supported for float32 and float64 " - "types") - - from loopy.types import NumpyType - return self.copy(name_in_target=name_in_target, - arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), - -1: NumpyType(vec_dtype)}) - - def emit_call_insn(self, insn, target, expression_to_code_mapper): - assert self.is_ready_for_codegen() - - from loopy.kernel.instruction import CallInstruction - - assert isinstance(insn, CallInstruction) - - parameters = insn.expression.parameters - - parameters = list(parameters) - par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] - - parameters.append(insn.assignees[0]) - par_dtypes.append(self.arg_id_to_dtype[-1]) - - # no type casting in array calls. 
- from loopy.expression import dtype_to_type_context - from pymbolic.mapper.stringifier import PREC_NONE - from loopy.symbolic import SubArrayRef - from pymbolic import var - - mat_descr = self.arg_id_to_descr[0] - - c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr if isinstance(par, SubArrayRef) else - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr - for par, par_dtype in zip( - parameters, par_dtypes)] - c_parameters.insert(0, var('CblasRowMajor')) - c_parameters.insert(1, var('CblasNoTrans')) - c_parameters.insert(2, mat_descr.shape[0]) - c_parameters.insert(3, mat_descr.shape[1]) - c_parameters.insert(4, 1) - c_parameters.insert(6, 1) - c_parameters.insert(8, 1) - c_parameters.insert(10, 1) - return var(self.name_in_target)(*c_parameters), False - - def generate_preambles(self, target): - assert isinstance(target, CTarget) - yield("99_cblas", "#include ") - return - - -def blas_fn_lookup(target, identifier): - if identifier == 'gemv': - return BLASCallable(name='gemv') - return None - -# }}} - - -n = 10 - -knl = lp.make_kernel( - "{[i]: 0<=i<10}", - """ - y[:] = gemv(A[:, :], x[:]) - """, [ - lp.ArrayArg('A', dtype=np.float64, shape=(n, n)), - lp.ArrayArg('x', dtype=np.float64, shape=(n, )), - lp.ArrayArg('y', shape=(n, )), ...], - target=CTarget()) - -knl = lp.register_function_lookup(knl, blas_fn_lookup) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 6beadb3de..2e4d07974 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2133,15 +2133,14 @@ class ArgDescrInferenceMapper(CombineMapper): # FIXME logic duplication between map_call and map_call_with_kwargs def map_call(self, expr, **kwargs): from loopy.kernel.function_interface import ValueArgDescriptor - from loopy.symbolic import SubArrayRef, ScopedFunction + from loopy.symbolic import ScopedFunction # ignoring if the call is not to a ScopedFunction if not isinstance(expr.function, ScopedFunction): return self.combine((self.rec(child) for child in expr.parameters)) # descriptors for the args - arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel)) - if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) + arg_id_to_descr = dict((i, ValueArgDescriptor()) for i, par in enumerate(expr.parameters)) assignee_id_to_descr = {} @@ -2152,11 +2151,7 @@ class ArgDescrInferenceMapper(CombineMapper): assignees = kwargs['assignees'] assert isinstance(assignees, tuple) for i, par in enumerate(assignees): - if isinstance(par, SubArrayRef): - assignee_id_to_descr[-i-1] = ( - par.get_array_arg_descriptor(self.kernel)) - else: - assignee_id_to_descr[-i-1] = ValueArgDescriptor() + assignee_id_to_descr[-i-1] = ValueArgDescriptor() # gathering all the descriptors # TODO: I dont like in place updates. Change this to somthing else. 
@@ -2175,11 +2170,9 @@ class ArgDescrInferenceMapper(CombineMapper): def map_call_with_kwargs(self, expr, **kwargs): from loopy.kernel.function_interface import ValueArgDescriptor - from loopy.symbolic import SubArrayRef # descriptors for the args and kwargs: - arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.kernel)) - if isinstance(par, SubArrayRef) else ValueArgDescriptor() + arg_id_to_descr = dict((i, ValueArgDescriptor()) for i, par in tuple(enumerate(expr.parameters)) + tuple(expr.kw_parameters.items())) @@ -2190,11 +2183,7 @@ class ArgDescrInferenceMapper(CombineMapper): assignees = kwargs['assignees'] assert isinstance(assignees, tuple) for i, par in enumerate(assignees): - if isinstance(par, SubArrayRef): - assignee_id_to_descr[-i-1] = ( - par.get_array_arg_descriptor(self.kernel)) - else: - assignee_id_to_descr[-i-1] = ValueArgDescriptor() + assignee_id_to_descr[-i-1] = ValueArgDescriptor() # gathering all the descriptors # TODO: I dont like in place updates. Change this to somthing else. -- GitLab From d844cfd8115bbcf464c7fae14fe6e663f0841f5e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 Jul 2018 08:28:38 -0500 Subject: [PATCH 231/916] removes logic duplication between map_call and map_call_with_kwargs. --- loopy/check.py | 13 +++-- loopy/kernel/creation.py | 26 +++------ loopy/preprocess.py | 113 +++++++++++---------------------------- loopy/type_inference.py | 3 +- 4 files changed, 44 insertions(+), 111 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index dd1cbf3d1..307c9c001 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -78,15 +78,14 @@ class UnscopedCallCollector(CombineMapper): return reduce(operator.or_, values, frozenset()) def map_call(self, expr): - from loopy.library.reduction import ArgExtOp - if not isinstance(expr.function, (ScopedFunction, ArgExtOp)): - return (frozenset([expr.function.name]) | - self.combine((self.rec(child) for child in expr.parameters))) - else: - return self.combine((self.rec(child) for child in expr.parameters)) + from pymbolic.primitives import CallWithKwargs + return self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={})) def map_call_with_kwargs(self, expr): - if not isinstance(expr.function, ScopedFunction): + from loopy.library.reduction import ArgExtOp + if not isinstance(expr.function, (ScopedFunction, ArgExtOp)): return (frozenset([expr.function.name]) | self.combine((self.rec(child) for child in expr.parameters + tuple(expr.kw_parameters.values())))) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 3fa952133..8f25d2421 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1861,28 +1861,14 @@ class FunctionScoper(RuleAwareIdentityMapper): self.scoped_functions = {} def map_call(self, expr, expn_state): - from loopy.symbolic import ScopedFunction - if not isinstance(expr.function, ScopedFunction): - - # search the kernel for the function - in_knl_callable = self.kernel.find_scoped_function_identifier( - expr.function.name) - if in_knl_callable: - # associate the newly created ScopedFunction with the - # resolved in-kernel callable - self.scoped_functions[expr.function.name] = in_knl_callable - - return type(expr)( - ScopedFunction(expr.function.name), - tuple(self.rec(child, expn_state) - for child in expr.parameters)) - - # this is an unknown function as of yet, do not modify it - return super(FunctionScoper, self).map_call(expr, expn_state) + from pymbolic.primitives import Call, CallWithKwargs + 
new_call_with_kwargs = self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={}), expn_state) + return Call(new_call_with_kwargs.function, + new_call_with_kwargs.parameters) def map_call_with_kwargs(self, expr, expn_state): - # FIXME duplicated logic with map_call - from loopy.symbolic import ScopedFunction if not isinstance(expr.function, ScopedFunction): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 2e4d07974..92f245fab 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2130,51 +2130,20 @@ class ArgDescrInferenceMapper(CombineMapper): import operator return reduce(operator.or_, values, frozenset()) - # FIXME logic duplication between map_call and map_call_with_kwargs def map_call(self, expr, **kwargs): + from pymbolic.primitives import Call, CallWithKwargs from loopy.kernel.function_interface import ValueArgDescriptor - from loopy.symbolic import ScopedFunction - - # ignoring if the call is not to a ScopedFunction - if not isinstance(expr.function, ScopedFunction): - return self.combine((self.rec(child) for child in expr.parameters)) - - # descriptors for the args - arg_id_to_descr = dict((i, ValueArgDescriptor()) - for i, par in enumerate(expr.parameters)) - - assignee_id_to_descr = {} - - # assignee descriptor - if 'assignees' in kwargs: - # If supplied with assignees then this is a CallInstruction - assignees = kwargs['assignees'] - assert isinstance(assignees, tuple) - for i, par in enumerate(assignees): - assignee_id_to_descr[-i-1] = ValueArgDescriptor() - - # gathering all the descriptors - # TODO: I dont like in place updates. Change this to somthing else. - # Perhaps make a function? - combined_arg_id_to_descr = arg_id_to_descr.copy() - combined_arg_id_to_descr.update(assignee_id_to_descr) - - # specializing the function according to the parameter description - new_scoped_function = ( - self.kernel.scoped_functions[expr.function.name].with_descrs( - combined_arg_id_to_descr)) - - # collecting the descriptors for args, kwargs, assignees - return (frozenset(((expr, new_scoped_function), )) | - self.combine((self.rec(child) for child in expr.parameters))) - def map_call_with_kwargs(self, expr, **kwargs): - from loopy.kernel.function_interface import ValueArgDescriptor + if isinstance(expr, Call): + kw_parameters = {} + else: + assert isinstance(expr, CallWithKwargs) + kw_parameters = expr.kw_parameters # descriptors for the args and kwargs: arg_id_to_descr = dict((i, ValueArgDescriptor()) for i, par in tuple(enumerate(expr.parameters)) + - tuple(expr.kw_parameters.items())) + tuple(kw_parameters.items())) assignee_id_to_descr = {} @@ -2186,8 +2155,6 @@ class ArgDescrInferenceMapper(CombineMapper): assignee_id_to_descr[-i-1] = ValueArgDescriptor() # gathering all the descriptors - # TODO: I dont like in place updates. Change this to somthing else. - # Perhaps make a function? 
combined_arg_id_to_descr = arg_id_to_descr.copy() combined_arg_id_to_descr.update(assignee_id_to_descr) @@ -2199,7 +2166,10 @@ class ArgDescrInferenceMapper(CombineMapper): # collecting the descriptors for args, kwargs, assignees return ( frozenset(((expr, new_scoped_function), )) | - self.combine((self.rec(child) for child in expr.parameters))) + self.combine((self.rec(child) for child in + expr.parameters+tuple(kw_parameters)))) + + map_call_with_kwargs = map_call def map_constant(self, expr, **kwargs): return frozenset() @@ -2269,23 +2239,18 @@ class HWAxesInferenceMapper(CombineMapper): return reduce(operator.or_, values, frozenset()) def map_call(self, expr, **kwargs): - # ignoring if the call is not to a ScopedFunction - from loopy.symbolic import ScopedFunction - if not isinstance(expr.function, ScopedFunction): - return self.combine((self.rec(child) for child in expr.parameters)) - - new_scoped_function = ( - self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes( - self.local_size, self.global_size)) - - return (frozenset(((expr, new_scoped_function), )) | - self.combine((self.rec(child) for child in expr.parameters))) + from pymbolic.primitives import CallWithKwargs, Call + if isinstance(expr, Call): + kw_parameters = {} + else: + assert isinstance(expr, CallWithKwargs) + kw_parameters = expr.kw_parameters - def map_call_with_kwargs(self, expr, **kwargs): from loopy.symbolic import ScopedFunction # ignoring if the call is not to a ScopedFunction if not isinstance(expr.function, ScopedFunction): - return self.combine((self.rec(child) for child in expr.parameters)) + return self.combine((self.rec(child) for child in + expr.parameters+tuple(kw_parameters.values()))) new_scoped_function = ( self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes( @@ -2293,7 +2258,9 @@ class HWAxesInferenceMapper(CombineMapper): return (frozenset(((expr, new_scoped_function), )) | self.combine((self.rec(child) for child in - expr.parameters+tuple(expr.kw_parameters.values())))) + expr.parameters+tuple(kw_parameters.values())))) + + map_call_with_kwargs = map_call def map_constant(self, expr, **kwargs): return frozenset() @@ -2349,35 +2316,13 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): def combine(self, values): return all(values) - # FIXME logic duplication between map_call and map_call_with_kwargs def map_call(self, expr, *args, **kwargs): - from loopy.library.reduction import ArgExtOp, SegmentedOp - from pymbolic.primitives import Variable - from loopy.symbolic import ScopedFunction - - if isinstance(expr.function, (ArgExtOp, SegmentedOp)): - return self.combine( - tuple( - self.rec(child, *args, **kwargs) for child in - expr.parameters)) - elif isinstance(expr.function, Variable): - # UnScopedFunction obtained and hence clearly not ready for - # codegen. 
- return False - - elif isinstance(expr.function, ScopedFunction): - is_ready_for_codegen = self.kernel.scoped_functions[ - expr.function.name].is_ready_for_codegen() - return self.combine( - (is_ready_for_codegen,) + - tuple( - self.rec(child, *args, **kwargs) - for child in expr.parameters)) + from pymbolic.primitives import CallWithKwargs, Call + if isinstance(expr, Call): + kw_parameters = {} else: - raise LoopyError("Unexpected function type %s obtained in %s" - % (type(expr.function), expr)) - - def map_call_with_kwargs(self, expr, *args, **kwargs): + assert isinstance(expr, CallWithKwargs) + kw_parameters = expr.kw_parameters is_ready_for_codegen = self.kernel.scoped_functions[ expr.function.name].is_ready_for_codegen() return self.combine( @@ -2387,9 +2332,11 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): for child in expr.parameters) + tuple( self.rec(child, *args, **kwargs) - for child in expr.kw_parameters.values()) + for child in kw_parameters.values()) ) + map_call_with_kwargs = map_call + def map_constant(self, expr): return True diff --git a/loopy/type_inference.py b/loopy/type_inference.py index a68520525..e869ae62b 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -265,12 +265,13 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable, CallWithKwargs + from pymbolic.primitives import Variable, CallWithKwargs, Call from loopy.symbolic import ScopedFunction if isinstance(expr, CallWithKwargs): kw_parameters = expr.kw_parameters else: + assert isinstance(expr, Call) kw_parameters = {} identifier = expr.function -- GitLab From c211fb2c2164d9def11cf05909a117c9b1b66c51 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 Jul 2018 08:47:09 -0500 Subject: [PATCH 232/916] streamlines reuction scoped function generator. --- loopy/kernel/creation.py | 40 ++------------------------------------ loopy/library/reduction.py | 21 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 38 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 8f25d2421..e90d3823f 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1894,44 +1894,8 @@ class FunctionScoper(RuleAwareIdentityMapper): expn_state) def map_reduction(self, expr, expn_state): - from loopy.library.reduction import (MaxReductionOperation, - MinReductionOperation, ArgMinReductionOperation, - ArgMaxReductionOperation, _SegmentedScalarReductionOperation, - SegmentedOp) - from loopy.library.reduction import ArgExtOp - - # note down the extra functions arising due to certain reductions - - # FIXME Discuss this. It cannot stay the way it is, because non-built-in - # reductions cannot add themselves to this list. We may need to change - # the reduction interface. Why don't reductions generate scoped functions - # in the first place? 
- if isinstance(expr.operation, MaxReductionOperation): - self.scoped_functions["max"] = ( - self.kernel.find_scoped_function_identifier("max")) - elif isinstance(expr.operation, MinReductionOperation): - self.scoped_functions["min"] = ( - self.kernel.find_scoped_function_identifier("min")) - elif isinstance(expr.operation, ArgMaxReductionOperation): - self.scoped_functions["max"] = ( - self.kernel.find_scoped_function_identifier("max")) - self.scoped_functions["make_tuple"] = ( - self.kernel.find_scoped_function_identifier("make_tuple")) - self.scoped_functions[ArgExtOp(expr.operation)] = ( - self.kernel.find_scoped_function_identifier(expr.operation)) - elif isinstance(expr.operation, ArgMinReductionOperation): - self.scoped_functions["min"] = ( - self.kernel.find_scoped_function_identifier("min")) - self.scoped_functions["make_tuple"] = ( - self.kernel.find_scoped_function_identifier("make_tuple")) - self.scoped_functions[ArgExtOp(expr.operation)] = ( - self.kernel.find_scoped_function_identifier(expr.operation)) - elif isinstance(expr.operation, _SegmentedScalarReductionOperation): - self.scoped_functions["make_tuple"] = ( - self.kernel.find_scoped_function_identifier("make_tuple")) - self.scoped_functions[SegmentedOp(expr.operation)] = ( - self.kernel.find_scoped_function_identifier(expr.operation)) - + self.scoped_functions.update( + expr.operation.get_scalar_callables(self.kernel)) return super(FunctionScoper, self).map_reduction(expr, expn_state) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index ca2f02347..5fa6d75ce 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -83,6 +83,9 @@ class ReductionOperation(object): raise LoopyError("unable to parse reduction type: '%s'" % op_type) + def get_scalar_callables(self, kernel): + return {} + class ScalarReductionOperation(ReductionOperation): def __init__(self, forced_result_type=None): @@ -184,6 +187,10 @@ class MaxReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, operand2): return ScopedFunction("max")(operand1, operand2) + def get_scalar_callables(self, kernel): + return { + "max": kernel.find_scoped_function_identifier("max")} + class MinReductionOperation(ScalarReductionOperation): def neutral_element(self, dtype): @@ -192,6 +199,9 @@ class MinReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, operand2): return ScopedFunction("min")(operand1, operand2) + def get_scalar_callables(self, kernel): + return { + "min": kernel.find_scoped_function_identifier("min")} # {{{ base class for symbolic reduction ops @@ -258,6 +268,11 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def __call__(self, dtypes, operand1, operand2): return ScopedFunction(SegmentedOp(self))(*(operand1 + operand2)) + def get_scalar_callables(self, kernel): + return { + "make_tuple": kernel.find_scoped_function_identifier("make_tuple"), + SegmentedOp(self): kernel.find_scoped_function_identifier(self)} + class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): base_reduction_class = SumReductionOperation @@ -311,6 +326,12 @@ class _ArgExtremumReductionOperation(ReductionOperation): def __call__(self, dtypes, operand1, operand2): return ScopedFunction(ArgExtOp(self))(*(operand1 + operand2)) + def get_scalar_callables(self, kernel): + return { + self.which: kernel.find_scoped_function_identifier(self.which), + "make_tuple": kernel.find_scoped_function_identifier("make_tuple"), + ArgExtOp(self): 
kernel.find_scoped_function_identifier(self)} + class ArgMaxReductionOperation(_ArgExtremumReductionOperation): which = "max" -- GitLab From 20c1c379c0a42e0528714fb22d4338aa01f97ef6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 Jul 2018 10:48:02 -0500 Subject: [PATCH 233/916] Flake8 --- loopy/library/reduction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 5fa6d75ce..a05c630e7 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -203,6 +203,7 @@ class MinReductionOperation(ScalarReductionOperation): return { "min": kernel.find_scoped_function_identifier("min")} + # {{{ base class for symbolic reduction ops class ReductionOpFunction(FunctionIdentifier): -- GitLab From e423522df9eeb46cb7014d9a447863dd0bfad5af Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 Jul 2018 10:48:18 -0500 Subject: [PATCH 234/916] fixes minor error in map_call. --- loopy/preprocess.py | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 92f245fab..098549def 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2133,6 +2133,11 @@ class ArgDescrInferenceMapper(CombineMapper): def map_call(self, expr, **kwargs): from pymbolic.primitives import Call, CallWithKwargs from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import ScopedFunction + + # ignore if the call is not to a ScopedFunction + if not isinstance(expr.function, ScopedFunction): + return self.combine((self.rec(child) for child in expr.parameters)) if isinstance(expr, Call): kw_parameters = {} @@ -2318,22 +2323,38 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): def map_call(self, expr, *args, **kwargs): from pymbolic.primitives import CallWithKwargs, Call + from loopy.library.reduction import ArgExtOp, SegmentedOp + from pymbolic.primitives import Variable + from loopy.symbolic import ScopedFunction + if isinstance(expr, Call): kw_parameters = {} else: assert isinstance(expr, CallWithKwargs) kw_parameters = expr.kw_parameters - is_ready_for_codegen = self.kernel.scoped_functions[ - expr.function.name].is_ready_for_codegen() - return self.combine( - (is_ready_for_codegen,) - + tuple( - self.rec(child, *args, **kwargs) - for child in expr.parameters) - + tuple( - self.rec(child, *args, **kwargs) - for child in kw_parameters.values()) - ) + + if isinstance(expr.function, (ArgExtOp, SegmentedOp)): + return self.combine( + tuple( + self.rec(child, *args, **kwargs) for child in + expr.parameters + tuple(kw_parameters))) + elif isinstance(expr.function, Variable): + # UnScopedFunction obtained and hence clearly not ready for + # codegen. + return False + + elif isinstance(expr.function, ScopedFunction): + is_ready_for_codegen = self.kernel.scoped_functions[ + expr.function.name].is_ready_for_codegen() + return self.combine( + (is_ready_for_codegen,) + + tuple( + self.rec(child, *args, **kwargs) + for child in + expr.parameters+tuple(kw_parameters.values()))) + else: + raise LoopyError("Unexpected function type %s obtained in %s" + % (type(expr.function), expr)) map_call_with_kwargs = map_call -- GitLab From dafcfba59195e9354edabcac086e0461fe84a034 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 Jul 2018 11:11:57 -0500 Subject: [PATCH 235/916] errors in resolving logic duplication. 
--- loopy/kernel/creation.py | 17 +++++++++---- loopy/kernel/function_interface.py | 40 ++++++++++-------------------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e90d3823f..f67f1028c 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1862,14 +1862,21 @@ class FunctionScoper(RuleAwareIdentityMapper): def map_call(self, expr, expn_state): from pymbolic.primitives import Call, CallWithKwargs - new_call_with_kwargs = self.rec(CallWithKwargs( - function=expr.function, parameters=expr.parameters, - kw_parameters={}), expn_state) - return Call(new_call_with_kwargs.function, - new_call_with_kwargs.parameters) + from loopy.symbolic import parse_tagged_name + + name, tag = parse_tagged_name(expr.function) + if name not in self.rule_mapping_context.old_subst_rules: + new_call_with_kwargs = self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={}), expn_state) + return Call(new_call_with_kwargs.function, + new_call_with_kwargs.parameters) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) def map_call_with_kwargs(self, expr, expn_state): from loopy.symbolic import ScopedFunction + if not isinstance(expr.function, ScopedFunction): # search the kernel for the function. diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index ddfe9b73e..c6c87f35b 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -607,33 +607,19 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): return self.map_substitution(name, tag, expr.parameters, expn_state) def map_call_with_kwargs(self, expr, expn_state): - name, tag = parse_tagged_name(expr.function) - if name not in self.rule_mapping_context.old_subst_rules: - expanded_expr = self.subst_expander(expr) - if expr in self.expr_to_new_names: - return type(expr)( - ScopedFunction(self.expr_to_new_names[expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - elif expanded_expr in self.expr_to_new_names: - return type(expr)( - ScopedFunction(self.expr_to_new_names[expanded_expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - else: - return super(ScopedFunctionNameChanger, self).map_call_with_kwargs( - expr, expn_state) + if expr in self.expr_to_new_names: + return type(expr)( + ScopedFunction(self.expr_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) else: - return self.map_substitution(name, tag, expr.parameters, expn_state) + return super(ScopedFunctionNameChanger, self).map_call_with_kwargs( + expr, expn_state) def register_pymbolic_calls_to_knl_callables(kernel, @@ -664,9 +650,9 @@ def register_pymbolic_calls_to_knl_callables(kernel, pymbolic_calls_to_new_names = {} for pymbolic_call, in_knl_callable in pymbolic_exprs_to_knl_callables.items(): - # checking if such a in-kernel callable already exists. + # check if such a in-kernel callable already exists. 
if in_knl_callable not in scoped_functions_to_names: - # No matching in_knl_callable found => make a new one with a new + # No matching in_knl_callable found, implies make a new one with a new # name. if isinstance(pymbolic_call.function, Variable): pymbolic_call_function = pymbolic_call.function -- GitLab From 86b76919582f9a01207af7789cfca4be9cf0bf49 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 5 Jul 2018 17:32:02 +0100 Subject: [PATCH 236/916] minor (temp) changes --- loopy/check.py | 2 +- loopy/target/c/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 60d2fd698..ab7f430ef 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -730,7 +730,7 @@ def pre_schedule_checks(kernel): check_bounds(kernel) check_write_destinations(kernel) # check_has_schedulable_iname_nesting(kernel) - check_variable_access_ordered(kernel) + # check_variable_access_ordered(kernel) logger.debug("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 6a8befa95..681914986 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -455,7 +455,7 @@ def scope_c_math_functions(target, identifier): if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", - "fabs"]: + "fabs", "tan"]: return CMathCallable(name=identifier) return None -- GitLab From 4ab87c223d888950db30e3efca9b12afa3bc552f Mon Sep 17 00:00:00 2001 From: tj-sun Date: Tue, 10 Jul 2018 13:06:15 +0100 Subject: [PATCH 237/916] hash builder for opaque type --- loopy/types.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/loopy/types.py b/loopy/types.py index 59d605c85..0a08b8a81 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -199,6 +199,9 @@ class OpaqueType(LoopyType): def involves_complex(self): return False + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, self.name) + # }}} -- GitLab From d3e24b4a602538f1b004a69068972a079e31aa8a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 10 Jul 2018 18:16:02 -0500 Subject: [PATCH 238/916] added example for register_calls_to_callables. --- loopy/kernel/function_interface.py | 44 ++++++++++++++++++------------ 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index c6c87f35b..fa103b178 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -37,6 +37,8 @@ from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, SubstitutionRuleExpander) +from pymbolic.primitives import Call + # {{{ argument descriptors @@ -300,7 +302,7 @@ class InKernelCallable(ImmutableRecord): is an instance of :class:`bool` to indicate if the assignee is returned by value of C-type targets. - :Example: If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is + *Example:* If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is interpreted in the target as ``a = f(c, d, &b)``. If ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted in the target as the statement ``f(c, d, &a, &b)``. @@ -396,7 +398,7 @@ class ScalarCallable(InKernelCallable): The first assignee is returned, but the rest of them are appended to the parameters and passed by reference. 
-    :Example: ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)``
+        *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)``
 
         :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`.
         :arg target: An instance of :class:`loopy.target.TargetBase`.
@@ -405,13 +407,6 @@ class ScalarCallable(InKernelCallable):
             **target syntax**.
         """
 
-        # FIXME: needs to get information about whether the callable has should
-        # do pass by reference by all values or should return one value for
-        # pass by value return.
-
-        # For example: The code generation of `sincos` would be different for
-        # C-Target and OpenCL-target.
-
         # Currently this is formulated such that the first argument is returned
         # and rest all are passed by reference as arguments to the function.
@@ -544,14 +539,12 @@ class ManglerCallable(ScalarCallable):
 
 # {{{ new pymbolic calls to scoped functions
 
-# FIXME Are these identifiers guaranteed to be available? Is there a var name
-# generator somewhere ensuring that that's the case?
 def next_indexed_variable(function):
     """
     Returns an instance of :class:`str` with the next indexed-name in the
     sequence for the name of *function*.
 
-    **Example:** ``Variable('sin_0')`` will return ``'sin_1'``.
+    *Example:* ``Variable('sin_0')`` will return ``'sin_1'``.
 
     :arg function: Either an instance of :class:`pymbolic.primitives.Variable`
         or :class:`loopy.reduction.ArgExtOp` or
@@ -623,20 +616,36 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper):
 
 
 def register_pymbolic_calls_to_knl_callables(kernel,
-        pymbolic_exprs_to_knl_callables):
-    # FIXME This could use an example. I have no idea what this does.
-    # Surely I can't associate arbitrary pymbolic expresions (3+a?)
-    # with callables?
+        pymbolic_calls_to_knl_callables):
     """
     Returns a copy of :arg:`kernel` which includes an association with the given
-    pymbolic expressions to the instances of :class:`InKernelCallable` for the
-    mapping given by :arg:`pymbolic_exprs_to_knl_calllables`.
+    pymbolic calls to the instances of :class:`InKernelCallable` for the
+    mapping given by :arg:`pymbolic_calls_to_knl_callables`.
 
     :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`.
 
-    :arg pymbolic_exprs_to_knl_callables: A mapping from :mod:`pymbolic` expressions
+    :arg pymbolic_calls_to_knl_callables: A mapping from :mod:`pymbolic` expressions
         to the instances of
         :class:`loopy.kernel.function_interface.InKernelCallable`.
+
+    *Example:* Consider the expression of an instruction in the kernel as
+        ``Call(ScopedFunction('sin_0'), Variable('x'))``, with the
+        ``scoped_functions`` of the *kernel* being ``{'sin_0':
+        ScalarCallable(name='sin')}`` and the argument
+        ``pymbolic_calls_to_callables = {Call(ScopedFunction('sin_0'),
+        Variable('x')): ScalarCallable(name='sin', arg_id_to_dtype={0: np.float64,
+        -1: np.float64})}``. After applying the transformation the expression
+        would rename its function name and hence would become
+        ``Call(ScopedFunction('sin_1'), Variable('x'))`` and the transformed
+        kernel would have ``scoped_functions={'sin_0':
+        ScalarCallable(name='sin'), 'sin_1':
+        ScalarCallable(name='sin', arg_id_to_dtype={0: np.float64, -1:
+        np.float64})}``. Hence, the expression would rename the function
+        pymbolic node and the scoped functions dictionary would register the
+        new callable corresponding to the new pymbolic node.
""" scoped_names_to_functions = kernel.scoped_functions.copy() @@ -649,8 +658,9 @@ def register_pymbolic_calls_to_knl_callables(kernel, # corresponding pymbolic call pymbolic_calls_to_new_names = {} - for pymbolic_call, in_knl_callable in pymbolic_exprs_to_knl_callables.items(): + for pymbolic_call, in_knl_callable in pymbolic_calls_to_knl_callables.items(): # check if such a in-kernel callable already exists. + assert isinstance(pymbolic_call, Call) if in_knl_callable not in scoped_functions_to_names: # No matching in_knl_callable found, implies make a new one with a new # name. -- GitLab From c1489c23331e2d615dc1144df58c06a44cec9416 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 12 Jul 2018 11:16:32 -0500 Subject: [PATCH 239/916] revamped ref_call --- doc/ref_call.rst | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 46edc533c..f5178cbee 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -1,11 +1,37 @@ Calling Loopy Kernels and External Functions ============================================ -``ScopedFunctions`` are pymbolic nodes within expressions in a -``Loo.py`` kernel, whose name has been resolved by the kernel. +Goals of a function interface +----------------------------- + +- Must be able to have complete information of the function just through the + epxression node. +- Must adhere to :mod:`loopy` semantics of immutability. +- Must have a class instance linked with the expression node which would record + the properties of the function. +- Must indicate in the expression if the function is known to the kernel. (This + is intended to be done by making the function expression node an instance of + ``ScopedFunction`` as soon as the function definition is resolved by the + kernel) +- Function overloading is not encouraged in :mod:`loopy` as it gives rise to + contention while debugging with the help of the kernel intermediate + representation and hence if the expression nodes point to different function + instances they must differ in their representation. For example: ``float + sin(float )`` and ``double sin(double )`` should diverge by having different + identifiers as soon as data type of the argument is inferred. +- Must have an interface to register external functions. + + +Scoped Function and resolving +----------------------------- + +``ScopedFunctions`` are pymbolic nodes within expressions in a ``Loo.py`` +kernel, whose name has been resolved by the kernel. The process of matching a +function idenitifier with the function definition is called "resolving". A pymbolic ``Call`` node can be converted to a ``ScopedFunction`` if it -is resolved by one of the ``function_scoper`` in a :attr:`LoopKernel.scoped_functions` +is "resolved" by one of the ``function_scoper`` in a +:attr:`LoopKernel.scoped_functions` - Functions already registered by the target. Some examples include -- ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C-Targets.) -- GitLab From d96488eb413af670dcb20992cdf458b620f30efd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 25 Jul 2018 22:08:15 -0500 Subject: [PATCH 240/916] beginnings towards a better design. 
--- loopy/program.py | 382 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) create mode 100644 loopy/program.py diff --git a/loopy/program.py b/loopy/program.py new file mode 100644 index 000000000..a2326e6ba --- /dev/null +++ b/loopy/program.py @@ -0,0 +1,382 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six +import re + +from pytools import ImmutableRecord +from pymbolic.primitives import Variable + +from loopy.symbolic import RuleAwareIdentityMapper +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) + + +class FunctionResolver(RuleAwareIdentityMapper): + """ + Mapper to convert the ``function`` attribute of a + :class:`pymbolic.primitives.Call` known in the kernel as instances of + :class:`loopy.symbolic.ScopedFunction`. A function is known in the + *kernel*, :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` + returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + + log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + + unknown_function(y) + ScopedFunction('log')(z)``. + + :arg rule_mapping_context: An instance of + :class:`loopy.symbolic.RuleMappingContext`. + :arg function_ids: A container with instances of :class:`str` indicating + the function identifiers to look for while scoping functions. + """ + def __init__(self, rule_mapping_context, kernel, program_callables_info, + function_resolvers): + super(FunctionResolver, self).__init__(rule_mapping_context) + self.kernel = kernel + self.program_callables_info = program_callables_info + # FIXME: function_resolvesrs looks like a very bad name change it + self.function_resolvers = function_resolvers + + def find_resolved_function_from_identifier(self, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if the + :arg:`identifier` is known to any kernel function scoper, otherwise returns + *None*. 
+ """ + # FIXME change docs + for scoper in self.function_resolvers: + # fixme: do we really need to given target for the function + in_knl_callable = scoper(self.kernel.target, identifier) + if in_knl_callable is not None: + return in_knl_callable + + return None + + def map_call(self, expr, expn_state): + from pymbolic.primitives import Call, CallWithKwargs + from loopy.symbolic import parse_tagged_name + + name, tag = parse_tagged_name(expr.function) + if name not in self.rule_mapping_context.old_subst_rules: + new_call_with_kwargs = self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={}), expn_state) + return Call(new_call_with_kwargs.function, + new_call_with_kwargs.parameters) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + from loopy.symbolic import ScopedFunction + + if not isinstance(expr.function, ScopedFunction): + + # search the kernel for the function. + in_knl_callable = self.find_scoped_function_identifier( + expr.function.name) + + if in_knl_callable: + # associate the newly created ScopedFunction with the + # resolved in-kernel callable + self.scoped_functions[expr.function.name] = in_knl_callable + return type(expr)( + ScopedFunction(expr.function.name), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + + # this is an unknown function as of yet, do not modify it + return super(FunctionResolver, self).map_call_with_kwargs(expr, + expn_state) + + def map_reduction(self, expr, expn_state): + self.scoped_functions.update( + expr.operation.get_scalar_callables(self.kernel)) + return super(FunctionResolver, self).map_reduction(expr, expn_state) + + +def resolve_callables(name, resolved_functions, function_resolvers): + + kernel = resolved_functions[name].subkernel + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + function_scoper = FunctionResolver(rule_mapping_context, kernel) + + # scoping fucntions and collecting the scoped functions + kernel_with_scoped_functions = rule_mapping_context.finish_kernel( + function_scoper.map_kernel(kernel)) + + # updating the functions collected during the scoped functions + updated_scoped_functions = kernel.scoped_functions.copy() + updated_scoped_functions.update(function_scoper.scoped_functions) + + return kernel_with_scoped_functions.copy( + scoped_functions=updated_scoped_functions) + +# {{{ program definition + +class Program(ImmutableRecord): + def __init__(self, + root_kernel_name, + program_callables_info, + target=None, + function_resolvers=None): + + # fixme: check if all sanity checks have been covered? 
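        # (Overview of the flow below: pick the target from the root kernel if
        # none is given, build the default function resolvers, switch the
        # callables table into edit mode, resolve the callables inside every
        # CallableKernel, and finally leave edit mode, expecting that no
        # renames are required at this stage.)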
+ assert root_kernel_name in program_callables_info + + if target is None: + target = program_callables_info[root_kernel_name].subkernel.target + + if function_resolvers is None: + # populate the function scopers from the target and the loopy + # specific callable scopers + + assert len(program_callables_info.resolved_functons) == 1 + + from loopy.library.function import loopy_specific_callable_scopers + function_resolvers = [loopy_specific_callable_scopers] + ( + target.get_device_ast_builder().function_scopers()) + + # new function resolvers have arrived, implies we need to resolve + # the callables identified by this set of resolvers + program_callables_info = ( + program_callables_info.with_edit_callables_mode()) + + for name, in_knl_callable in program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + # resolve the callables in the subkernel + resolved_functions = resolve_callables(name, + program_callables_info, function_resolvers) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable %s." % + type(in_knl_callable).__name__) + + program_callables_info, renames_needed = ( + program_callables_info.with_exit_edit_mode()) + assert not renames_needed + + super(Program, self).__init__( + root_kernel_name=root_kernel_name, + resolved_functions=resolved_functions, + target=target, + function_resolvers=function_resolvers) + +# }}} + + +def next_indexed_function_identifier(function): + """ + Returns an instance of :class:`str` with the next indexed-name in the + sequence for the name of *function*. + + *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. + + :arg function: Either an instance of :class:`pymbolic.primitives.Variable` + or :class:`loopy.reduction.ArgExtOp` or + :class:`loopy.reduction.SegmentedOp`. 
+ """ + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(function, (ArgExtOp, SegmentedOp)): + return function.copy() + elif isinstance(function, str): + function = Variable(function) + + assert isinstance(function, Variable) + func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") + + match = func_name.match(function.name) + + if match is None: + if function.name[-1] == '_': + return "{old_name}0".format(old_name=function.name) + else: + return "{old_name}_0".format(old_name=function.name) + + return "{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1) + + +class ProgramCallablesInfo(ImmutableRecord): + def __init__(self, resolved_functions, num_times_callables_called=None, + history_of_callable_names=None, is_being_edited=False, + old_resolved_functions={}, num_times_hit_during_editing={}, + renames_needed_after_editing={}): + + if num_times_callables_called is None: + num_times_callables_called = dict((func_id, 1) for func_id in + resolved_functions) + if history_of_callable_names is None: + history_of_callable_names = dict((func_id, [func_id]) for func_id in + resolved_functions) + + super(ProgramCallablesInfo, self).__init__( + resolved_functions=resolved_functions, + num_times_callables_called=num_times_callables_called, + history_of_callables_callable_names=history_of_callable_names, + old_resolved_functions=old_resolved_functions, + is_being_edited=is_being_edited, + num_times_hit_during_editing=num_times_hit_during_editing, + renames_needed_after_editing=renames_needed_after_editing) + + def with_edit_callables_mode(self): + return self.copy(is_being_edited=True, + old_resolved_functions=self.resolved_functions.copy(), + num_times_hit_during_editring=dict((func_id, 0) for func_id in + self.resolved_functions)) + + def with_callable(self, function, in_kernel_callable): + """ + :arg function: An instance of :class:`pymbolic.primitives.Variable` or + :class:`loopy.library.reduction.ReductionOpFunction`. + + :arg in_kernel_callables: An instance of + :class:`loopy.InKernelCallable`. + """ + assert self.is_being_edited + + from loopy.library.reduction import ArgExtOp, SegmentedOp + + # {{{ sanity checks + + assert isinstance(function, (Variable, ArgExtOp, SegmentedOp)) + + # }}} + + renames_needed_after_editing = self.renames_needed_after_editing.copy() + num_times_hit_during_editing = self.num_times_hit_during_editing.copy() + num_times_callable_being_called = self.num_times_being_called.copy() + num_times_hit_during_editing[function.name] += 1 + + if in_kernel_callable in self.resolved_functions.values(): + for func_id, in_knl_callable in self.scoped_functions.items(): + if in_knl_callable == in_kernel_callable: + num_times_callable_being_called[func_id] += 1 + num_times_callable_being_called[function] -= 1 + if num_times_callable_being_called[function] == 0: + renames_needed_after_editing[func_id] = function + + return self, func_id + else: + + # {{{ ingoring this for now + + if False and isinstance(function, (ArgExtOp, SegmentedOp)): + # ignoring this casse for now + # FIXME: If a kernel has two flavors of ArgExtOp then they are + # overwritten and hence not supported.(for now). + updated_scoped_functions = self.scoped_functions.copy() + updated_scoped_functions[function] = in_kernel_callable + + return self.copy(updated_scoped_functions), function.copy() + # }}} + + #fixme: deal with the history over here. 
+ unique_function_identifier = function.name + if self.num_times[function.name] > 1: + while unique_function_identifier in self.scoped_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + num_times_callable_being_called[function] -= 1 + num_times_callable_being_called[unique_function_identifier] = 1 + + updated_scoped_functions = self.scoped_functions.copy() + updated_scoped_functions[unique_function_identifier] = in_kernel_callable + + return (self.copy(scoped_functions=updated_scoped_functions), + Variable(unique_function_identifier)) + + def with_exit_edit_mode(self): + assert self.is_being_edited + + num_times_callable_being_called = self.num_times_callable_being_called.copy() + + for func_id in self.old_resolved_functions: + + if self.num_times_hit_during_editing[func_id] > 0 and ( + self.num_times_hit_during_editing[func_id] < + num_times_callable_being_called[func_id]): + unique_function_identifier = func_id + + while unique_function_identifier in self.scoped_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + (num_times_callable_being_called[func_id], + num_times_callable_being_called[unique_function_identifier]) = ( + self.num_times_hit_while_editing[func_id], + num_times_callable_being_called[func_id] - + self.num_times_being_hit_while_editing[func_id]) + + if self.num_times_hit_during_edition[func_id] > 0 and ( + self.num_times_hit_during_editing[func_id] > + num_times_callable_being_called[func_id]): + raise RuntimeError("Should not traverse more number of times than " + "it is called.") + + return ( + self.copy( + is_begin_edited=False, + num_times_callable_being_called=num_times_callable_being_called, + num_times_hit_during_editing={}, + renames_needed_while_editing={}), + self.renames_needed_while_editing) + + def __getitem__(self, item): + return self.reoslved_functions[item] + + def __contains__(self, item): + return item in self.resolved_functions + + def items(self): + return self.resolved_functions.items() + + +def make_program_from_kernel(kernel): + callable_knl = CallableKernel(subkernel=kernel) + resolved_functions = {kernel.name: callable_knl} + program_callables_info = ProgramCallablesInfo(resolved_functions) + + program = Program( + root_kernel_name=kernel.name, + program_callables_info=program_callables_info) + + return program + + +# vim: foldmethod=marker -- GitLab From fcbb611f0193bd97dcd79c0d05f112a1d6ecc61c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 25 Jul 2018 22:14:16 -0500 Subject: [PATCH 241/916] ScopedFunction -> ResolvedFunction --- doc/ref_call.rst | 26 +++++++++++++------------- loopy/check.py | 6 +++--- loopy/codegen/__init__.py | 2 +- loopy/kernel/creation.py | 16 ++++++++-------- loopy/kernel/function_interface.py | 26 +++++++++++++------------- loopy/library/reduction.py | 14 +++++++------- loopy/preprocess.py | 18 +++++++++--------- loopy/program.py | 14 +++++++------- loopy/statistics.py | 4 ++-- loopy/symbolic.py | 24 ++++++++++++------------ loopy/type_inference.py | 6 +++--- 11 files changed, 78 insertions(+), 78 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index f5178cbee..4ff1ef2fc 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -11,7 +11,7 @@ Goals of a function interface the properties of the function. - Must indicate in the expression if the function is known to the kernel. 
(This is intended to be done by making the function expression node an instance of - ``ScopedFunction`` as soon as the function definition is resolved by the + ``ResolvedFunction`` as soon as the function definition is resolved by the kernel) - Function overloading is not encouraged in :mod:`loopy` as it gives rise to contention while debugging with the help of the kernel intermediate @@ -25,11 +25,11 @@ Goals of a function interface Scoped Function and resolving ----------------------------- -``ScopedFunctions`` are pymbolic nodes within expressions in a ``Loo.py`` +``ResolvedFunctions`` are pymbolic nodes within expressions in a ``Loo.py`` kernel, whose name has been resolved by the kernel. The process of matching a function idenitifier with the function definition is called "resolving". -A pymbolic ``Call`` node can be converted to a ``ScopedFunction`` if it +A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it is "resolved" by one of the ``function_scoper`` in a :attr:`LoopKernel.scoped_functions` @@ -63,7 +63,7 @@ would get converted to: :: - ScopedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i]) This would also make an entry in the kernel's ``scoped_functions`` @@ -84,8 +84,8 @@ the expression gets converted to: :: - ScopedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + - ScopedFunction('callable_knl_func')(c[i])*mangler_call(d[i]) + ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + + ResolvedFunction('callable_knl_func')(c[i])*mangler_call(d[i]) This also makes an entry in the ``scoped_functions`` dictionary as -- @@ -104,10 +104,10 @@ only if all the parameters of the function match viz. name, argument arity and argument types. Hence, the ``scoped_functions`` dictionary would remain unchanged. -``ScopedFunctions`` and specializations +``ResolvedFunctions`` and specializations --------------------------------------- -Consider the same ``ScopedFunction('sin')`` as above. This function +Consider the same ``ResolvedFunction('sin')`` as above. This function although scoped does not the know the types i.e. it does yet know that for a ``C-Target``, whether it should emit ``sin`` or ``sinf`` or ``sinl``. Hence, right now the function can be called as a @@ -125,7 +125,7 @@ callables are resolved. ``CallableKernel`` as this information would be helpful to to generate the function signature and make changes to the data access pattern of the variables in the callee kernel. -- Whenever a ``ScopedFunction`` goes through a specialization, this is +- Whenever a ``ResolvedFunction`` goes through a specialization, this is indicated by changing the name in the ``pymbolic`` node. If during type inference, it is inferred that the type of ``a[i]`` is @@ -133,7 +133,7 @@ If during type inference, it is inferred that the type of ``a[i]`` is :: - ScopedFunction('sin_0')(a[i]) + ... + ResolvedFunction('sin_0')(a[i]) + ... This name change is done so that it indicates that the node points to a different ``ScalarCallable`` in the dictionary. And hence a new entry is @@ -172,9 +172,9 @@ developments of the ``sin`` pymbolic call expression node. 
:: - sin -> (Kernel creation) -> ScopedFunction(Variable('sin')) -> - (Type Inference) -> ScopedFunction(Variable('sin_0')) -> - (Descriptor Inference) -> ScopedFunction(Variable('sin_1')) + sin -> (Kernel creation) -> ResolvedFunction(Variable('sin')) -> + (Type Inference) -> ResolvedFunction(Variable('sin_0')) -> + (Descriptor Inference) -> ResolvedFunction(Variable('sin_1')) Changes on the target side to accommodate the new function interface -------------------------------------------------------------------- diff --git a/loopy/check.py b/loopy/check.py index 4ad080332..586b94351 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -27,7 +27,7 @@ from six.moves import range from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper, CombineMapper, ScopedFunction +from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, @@ -85,7 +85,7 @@ class UnscopedCallCollector(CombineMapper): def map_call_with_kwargs(self, expr): from loopy.library.reduction import ArgExtOp - if not isinstance(expr.function, (ScopedFunction, ArgExtOp)): + if not isinstance(expr.function, (ResolvedFunction, ArgExtOp)): return (frozenset([expr.function.name]) | self.combine((self.rec(child) for child in expr.parameters + tuple(expr.kw_parameters.values())))) @@ -105,7 +105,7 @@ class UnscopedCallCollector(CombineMapper): def check_functions_are_scoped(kernel): """ Checks if all the calls in the instruction expression have been scoped, otherwise indicates to what all calls we await signature. Refer - :class:`loopy.symbolic.ScopedFunction` for a detailed explanation of a + :class:`loopy.symbolic.ResolvedFunction` for a detailed explanation of a scoped function. """ diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e9d30d013..eacd53886 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -385,7 +385,7 @@ class InKernelCallablesCollector(CombineMapper): import operator return reduce(operator.or_, values, frozenset()) - def map_scoped_function(self, expr): + def map_resolved_function(self, expr): return frozenset([self.kernel.scoped_functions[ expr.name]]) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 391b64f43..68f10b463 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1847,14 +1847,14 @@ class FunctionScoper(RuleAwareIdentityMapper): """ Mapper to convert the ``function`` attribute of a :class:`pymbolic.primitives.Call` known in the kernel as instances of - :class:`loopy.symbolic.ScopedFunction`. A function is known in the + :class:`loopy.symbolic.ResolvedFunction`. A function is known in the *kernel*, :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable`. **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + - log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + - unknown_function(y) + ScopedFunction('log')(z)``. + log(z)``, then the mapper would return ``ResolvedFunction('sin')(x) + + unknown_function(y) + ResolvedFunction('log')(z)``. :arg rule_mapping_context: An instance of :class:`loopy.symbolic.RuleMappingContext`. 
@@ -1881,20 +1881,20 @@ class FunctionScoper(RuleAwareIdentityMapper): return self.map_substitution(name, tag, expr.parameters, expn_state) def map_call_with_kwargs(self, expr, expn_state): - from loopy.symbolic import ScopedFunction + from loopy.symbolic import ResolvedFunction - if not isinstance(expr.function, ScopedFunction): + if not isinstance(expr.function, ResolvedFunction): # search the kernel for the function. in_knl_callable = self.kernel.find_scoped_function_identifier( expr.function.name) if in_knl_callable: - # associate the newly created ScopedFunction with the + # associate the newly created ResolvedFunction with the # resolved in-kernel callable self.scoped_functions[expr.function.name] = in_knl_callable return type(expr)( - ScopedFunction(expr.function.name), + ResolvedFunction(expr.function.name), tuple(self.rec(child, expn_state) for child in expr.parameters), dict( @@ -1915,7 +1915,7 @@ class FunctionScoper(RuleAwareIdentityMapper): def scope_functions(kernel): """ Returns a kernel with the pymbolic nodes involving known functions realized - as instances of :class:`loopy.symbolic.ScopedFunction`, along with the + as instances of :class:`loopy.symbolic.ResolvedFunction`, along with the resolved functions being added to the ``scoped_functions`` dictionary of the kernel. """ diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 919552ccc..3db4c082b 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -34,7 +34,7 @@ from loopy.diagnostic import LoopyError from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name -from loopy.symbolic import (ScopedFunction, SubstitutionRuleMappingContext, +from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, SubstitutionRuleExpander) from pymbolic.primitives import Call @@ -776,14 +776,14 @@ def next_indexed_variable(function): num=int(match.group('num'))+1) -class ScopedFunctionNameChanger(RuleAwareIdentityMapper): +class ResolvedFunctionNameChanger(RuleAwareIdentityMapper): """ Changes the names of scoped functions in calls of expressions according to the mapping ``expr_to_new_names`` """ def __init__(self, rule_mapping_context, expr_to_new_names, subst_expander): - super(ScopedFunctionNameChanger, self).__init__(rule_mapping_context) + super(ResolvedFunctionNameChanger, self).__init__(rule_mapping_context) self.expr_to_new_names = expr_to_new_names self.subst_expander = subst_expander @@ -794,16 +794,16 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): expanded_expr = self.subst_expander(expr) if expr in self.expr_to_new_names: return type(expr)( - ScopedFunction(self.expr_to_new_names[expr]), + ResolvedFunction(self.expr_to_new_names[expr]), tuple(self.rec(child, expn_state) for child in expr.parameters)) elif expanded_expr in self.expr_to_new_names: return type(expr)( - ScopedFunction(self.expr_to_new_names[expanded_expr]), + ResolvedFunction(self.expr_to_new_names[expanded_expr]), tuple(self.rec(child, expn_state) for child in expr.parameters)) else: - return super(ScopedFunctionNameChanger, self).map_call( + return super(ResolvedFunctionNameChanger, self).map_call( expr, expn_state) else: return self.map_substitution(name, tag, expr.parameters, expn_state) @@ -812,7 +812,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): if expr in self.expr_to_new_names: return type(expr)( - ScopedFunction(self.expr_to_new_names[expr]), + 
ResolvedFunction(self.expr_to_new_names[expr]), tuple(self.rec(child, expn_state) for child in expr.parameters), dict( @@ -820,7 +820,7 @@ class ScopedFunctionNameChanger(RuleAwareIdentityMapper): for key, val in six.iteritems(expr.kw_parameters)) ) else: - return super(ScopedFunctionNameChanger, self).map_call_with_kwargs( + return super(ResolvedFunctionNameChanger, self).map_call_with_kwargs( expr, expn_state) @@ -841,14 +841,14 @@ def register_pymbolic_calls_to_knl_callables(kernel, :class:`loopy.kernel.function_interface.InKernelCallable`. *Example:* Conisder the expression of an instruction in the kernel as - ``Call(ScopedFunction('sin_0'), Variable('x'))``, with the + ``Call(ResolvedFunction('sin_0'), Variable('x'))``, with the ``scoped_functions`` of the *kernel* being ``{'sin_0': ScalarCallable(name='sin')}`` and the argument - ``pymbolic_calls_to_callables = {Call(ScopedFunction('sin_0'), + ``pymbolic_calls_to_callables = {Call(ResolvedFunction('sin_0'), Variable('x')): ScalarCallable(name='sin', arg_id_to_dtype={0: float64, -1: np.float64})}``. After applying the transformation the expression would rename its function name and hence would become - ``Call(ScopedFunction('sin_1'), Variable('x'))`` and the transformed + ``Call(ResolvedFunction('sin_1'), Variable('x'))`` and the transformed kernel would have ``scoped_functions={'sin_0': ScalarCallable(name='sin'), 'sin_1': Variable('x')): ScalarCallable(name='sin', arg_id_to_dtype={0: np.float64, -1: @@ -875,7 +875,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, # name. if isinstance(pymbolic_call.function, Variable): pymbolic_call_function = pymbolic_call.function - elif isinstance(pymbolic_call.function, ScopedFunction): + elif isinstance(pymbolic_call.function, ResolvedFunction): pymbolic_call_function = pymbolic_call.function.function else: raise NotImplementedError("Unknown type %s for pymbolic call " @@ -905,7 +905,7 @@ def register_pymbolic_calls_to_knl_callables(kernel, rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) subst_expander = SubstitutionRuleExpander(kernel.substitutions) - scope_changer = ScopedFunctionNameChanger(rule_mapping_context, + scope_changer = ResolvedFunctionNameChanger(rule_mapping_context, pymbolic_calls_to_new_names, subst_expander) scoped_kernel = scope_changer.map_kernel(kernel) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index a05c630e7..d2d4ea4db 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -24,7 +24,7 @@ THE SOFTWARE. 
from pymbolic import var -from loopy.symbolic import ScopedFunction +from loopy.symbolic import ResolvedFunction from loopy.kernel.function_interface import ScalarCallable import numpy as np @@ -185,7 +185,7 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return ScopedFunction("max")(operand1, operand2) + return ResolvedFunction("max")(operand1, operand2) def get_scalar_callables(self, kernel): return { @@ -197,7 +197,7 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return ScopedFunction("min")(operand1, operand2) + return ResolvedFunction("min")(operand1, operand2) def get_scalar_callables(self, kernel): return { @@ -250,7 +250,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype): scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return ScopedFunction("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): @@ -267,7 +267,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return ScopedFunction(SegmentedOp(self))(*(operand1 + operand2)) + return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2)) def get_scalar_callables(self, kernel): return { @@ -308,7 +308,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return ScopedFunction("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)) def __str__(self): @@ -325,7 +325,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ScopedFunction(ArgExtOp(self))(*(operand1 + operand2)) + return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2)) def get_scalar_callables(self, kernel): return { diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 5f9fe7535..1779ec692 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2136,10 +2136,10 @@ class ArgDescrInferenceMapper(CombineMapper): def map_call(self, expr, **kwargs): from pymbolic.primitives import Call, CallWithKwargs from loopy.kernel.function_interface import ValueArgDescriptor - from loopy.symbolic import ScopedFunction, SubArrayRef + from loopy.symbolic import ResolvedFunction, SubArrayRef - # ignore if the call is not to a ScopedFunction - if not isinstance(expr.function, ScopedFunction): + # ignore if the call is not to a ResolvedFunction + if not isinstance(expr.function, ResolvedFunction): return self.combine((self.rec(child) for child in expr.parameters)) if isinstance(expr, Call): @@ -2258,9 +2258,9 @@ class HWAxesInferenceMapper(CombineMapper): assert isinstance(expr, CallWithKwargs) kw_parameters = expr.kw_parameters - from loopy.symbolic import ScopedFunction - # ignoring if the call is not to a ScopedFunction - if not isinstance(expr.function, ScopedFunction): + from loopy.symbolic import ResolvedFunction + # ignoring if the call is not to a ResolvedFunction + if not isinstance(expr.function, ResolvedFunction): return 
self.combine((self.rec(child) for child in expr.parameters+tuple(kw_parameters.values()))) @@ -2332,7 +2332,7 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): from pymbolic.primitives import CallWithKwargs, Call from loopy.library.reduction import ArgExtOp, SegmentedOp from pymbolic.primitives import Variable - from loopy.symbolic import ScopedFunction + from loopy.symbolic import ResolvedFunction if isinstance(expr, Call): kw_parameters = {} @@ -2347,11 +2347,11 @@ class FunctionsNotReadyForCodegenCollector(CombineMapper): expr.parameters + tuple(kw_parameters))) elif isinstance(expr.function, Variable): - # UnScopedFunction obtained and hence clearly not ready for + # UnResolvedFunction obtained and hence clearly not ready for # codegen. return False - elif isinstance(expr.function, ScopedFunction): + elif isinstance(expr.function, ResolvedFunction): is_ready_for_codegen = self.kernel.scoped_functions[ expr.function.name].is_ready_for_codegen() return self.combine( diff --git a/loopy/program.py b/loopy/program.py index a2326e6ba..0ff2d41a2 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -37,14 +37,14 @@ class FunctionResolver(RuleAwareIdentityMapper): """ Mapper to convert the ``function`` attribute of a :class:`pymbolic.primitives.Call` known in the kernel as instances of - :class:`loopy.symbolic.ScopedFunction`. A function is known in the + :class:`loopy.symbolic.ResolvedFunction`. A function is known in the *kernel*, :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable`. **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + - log(z)``, then the mapper would return ``ScopedFunction('sin')(x) + - unknown_function(y) + ScopedFunction('log')(z)``. + log(z)``, then the mapper would return ``ResolvedFunction('sin')(x) + + unknown_function(y) + ResolvedFunction('log')(z)``. :arg rule_mapping_context: An instance of :class:`loopy.symbolic.RuleMappingContext`. @@ -90,20 +90,20 @@ class FunctionResolver(RuleAwareIdentityMapper): return self.map_substitution(name, tag, expr.parameters, expn_state) def map_call_with_kwargs(self, expr, expn_state): - from loopy.symbolic import ScopedFunction + from loopy.symbolic import ResolvedFunction - if not isinstance(expr.function, ScopedFunction): + if not isinstance(expr.function, ResolvedFunction): # search the kernel for the function. 
in_knl_callable = self.find_scoped_function_identifier( expr.function.name) if in_knl_callable: - # associate the newly created ScopedFunction with the + # associate the newly created ResolvedFunction with the # resolved in-kernel callable self.scoped_functions[expr.function.name] = in_knl_callable return type(expr)( - ScopedFunction(expr.function.name), + ResolvedFunction(expr.function.name), tuple(self.rec(child, expn_state) for child in expr.parameters), dict( diff --git a/loopy/statistics.py b/loopy/statistics.py index 6c012ca21..72f73f56a 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -712,8 +712,8 @@ class ExpressionOpCounter(CounterBase): map_variable = map_constant def map_call(self, expr): - from loopy.symbolic import ScopedFunction - if isinstance(expr.function, ScopedFunction): + from loopy.symbolic import ResolvedFunction + if isinstance(expr.function, ResolvedFunction): function_identifier = self.knl.scoped_functions[ expr.function.name].name else: diff --git a/loopy/symbolic.py b/loopy/symbolic.py index e02d5995e..9f336f565 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -112,8 +112,8 @@ class IdentityMapperMixin(object): return SubArrayRef(self.rec(expr.swept_inames, *args), self.rec(expr.subscript, *args)) - def map_scoped_function(self, expr, *args): - return ScopedFunction(self.rec(expr.function, *args)) + def map_resolved_function(self, expr, *args): + return ResolvedFunction(self.rec(expr.function, *args)) map_type_cast = map_type_annotation @@ -179,7 +179,7 @@ class WalkMapper(WalkMapperBase): self.rec(expr.swept_inames, *args) self.rec(expr.subscript, *args) - def map_scoped_function(self, expr, *args): + def map_resolved_function(self, expr, *args): if not self.visit(expr): return @@ -188,7 +188,7 @@ class WalkMapper(WalkMapperBase): class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant - map_scoped_function = CallbackMapperBase.map_constant + map_resolved_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): @@ -256,8 +256,8 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) - def map_scoped_function(self, expr, prec): - return "ScopedFunction('%s')" % expr.name + def map_resolved_function(self, expr, prec): + return "ResolvedFunction('%s')" % expr.name def map_sub_array_ref(self, expr, prec): return "SubArrayRef({inames}, ({subscr}))".format( @@ -332,7 +332,7 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) - def map_scoped_function(self, expr): + def map_resolved_function(self, expr): return self.rec(expr.function) @@ -684,10 +684,10 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") -class ScopedFunction(p.Expression): +class ResolvedFunction(p.Expression): """ A function invocation whose definition is known in a :mod:`loopy` kernel. - Each instance of :class:`loopy.symbolic.ScopedFunction` in an expression + Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression points to an instance of :class:`loopy.kernel.function_interface.InKernelCallable` through the mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer @@ -717,7 +717,7 @@ class ScopedFunction(p.Expression): elif isinstance(self.function, (ArgExtOp, SegmentedOp)): return self.function else: - raise LoopyError("Unexpected function type %s in ScopedFunction." 
% + raise LoopyError("Unexpected function type %s in ResolvedFunction." % type(self.function)) def __getinitargs__(self): @@ -726,7 +726,7 @@ class ScopedFunction(p.Expression): def stringifier(self): return StringifyMapper - mapper_method = intern("map_scoped_function") + mapper_method = intern("map_resolved_function") class EvaluatorWithDeficientContext(PartialEvaluationMapper): @@ -898,7 +898,7 @@ def parse_tagged_name(expr): from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, ScopedFunction): + elif isinstance(expr, ResolvedFunction): return parse_tagged_name(expr.function) elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 166634534..a5b3003d4 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -266,7 +266,7 @@ class TypeInferenceMapper(CombineMapper): def map_call(self, expr, return_tuple=False): from pymbolic.primitives import Variable, CallWithKwargs, Call - from loopy.symbolic import ScopedFunction + from loopy.symbolic import ResolvedFunction if isinstance(expr, CallWithKwargs): kw_parameters = expr.kw_parameters @@ -275,7 +275,7 @@ class TypeInferenceMapper(CombineMapper): kw_parameters = {} identifier = expr.function - if isinstance(identifier, (Variable, ScopedFunction)): + if isinstance(identifier, (Variable, ResolvedFunction)): identifier = identifier.name def none_if_empty(d): @@ -289,7 +289,7 @@ class TypeInferenceMapper(CombineMapper): tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) # specializing the known function wrt type - if isinstance(expr.function, ScopedFunction): + if isinstance(expr.function, ResolvedFunction): in_knl_callable = self.scoped_functions[expr.function.name] # {{{ checking that there is no overwriting of types of in_knl_callable -- GitLab From 1c25bbf3c9910ba75ac410553ca5e9207af74689 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 26 Jul 2018 00:12:16 -0500 Subject: [PATCH 242/916] Naive resolving works. --- loopy/kernel/__init__.py | 35 -------- loopy/kernel/creation.py | 108 +----------------------- loopy/program.py | 175 +++++++++++++++++++++++++-------------- 3 files changed, 117 insertions(+), 201 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index a42b2892c..48a77c425 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -182,11 +182,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: function_manglers .. attribute:: symbol_manglers - .. attribute:: function_scopers - - A list of functions of signature ``(target, name)`` returning a - :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. - .. 
attribute:: substitutions a mapping from substitution names to @@ -245,8 +240,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): iname_to_tags=None, substitutions=None, function_manglers=None, - function_scopers=None, - scoped_functions={}, symbol_manglers=[], iname_slab_increments=None, @@ -259,7 +252,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=None, state=KernelState.INITIAL, - is_called_from_host=True, target=None, overridden_get_grid_sizes_for_insn_ids=None, @@ -350,14 +342,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains) assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT - if function_scopers is None: - # populate the function scopers from the target and the loopy - # specific callable scopers - - from loopy.library.function import loopy_specific_callable_scopers - function_scopers = [loopy_specific_callable_scopers] + ( - target.get_device_ast_builder().function_scopers()) - ImmutableRecordWithoutPickling.__init__(self, domains=domains, instructions=instructions, @@ -377,13 +361,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, function_manglers=function_manglers, - function_scopers=function_scopers, - scoped_functions=scoped_functions, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, state=state, - is_called_from_host=is_called_from_host, target=target, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), @@ -436,20 +417,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): return None - def find_scoped_function_identifier(self, identifier): - """ - Returns an instance of - :class:`loopy.kernel.function_interface.InKernelCallable` if the - :arg:`identifier` is known to any kernel function scoper, otherwise returns - *None*. - """ - for scoper in self.function_scopers: - in_knl_callable = scoper(self.target, identifier) - if in_knl_callable: - return in_knl_callable - - return None - # }}} # {{{ symbol mangling @@ -1568,9 +1535,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "preamble_generators", "function_manglers", - "function_scopers", "symbol_manglers", - "scoped_functions", ) def update_persistent_hash(self, key_hash, key_builder): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 68f10b463..fa27bc5b6 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -30,8 +30,7 @@ from pymbolic.mapper import CSECachingMapperMixin from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids from loopy.symbolic import ( - IdentityMapper, WalkMapper, SubArrayRef, - RuleAwareIdentityMapper) + IdentityMapper, WalkMapper, SubArrayRef) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, @@ -1841,105 +1840,6 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # }}} -# {{{ scope functions - -class FunctionScoper(RuleAwareIdentityMapper): - """ - Mapper to convert the ``function`` attribute of a - :class:`pymbolic.primitives.Call` known in the kernel as instances of - :class:`loopy.symbolic.ResolvedFunction`. A function is known in the - *kernel*, :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` - returns an instance of - :class:`loopy.kernel.function_interface.InKernelCallable`. 
- - **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + - log(z)``, then the mapper would return ``ResolvedFunction('sin')(x) + - unknown_function(y) + ResolvedFunction('log')(z)``. - - :arg rule_mapping_context: An instance of - :class:`loopy.symbolic.RuleMappingContext`. - :arg function_ids: A container with instances of :class:`str` indicating - the function identifiers to look for while scoping functions. - """ - def __init__(self, rule_mapping_context, kernel): - super(FunctionScoper, self).__init__(rule_mapping_context) - self.kernel = kernel - self.scoped_functions = {} - - def map_call(self, expr, expn_state): - from pymbolic.primitives import Call, CallWithKwargs - from loopy.symbolic import parse_tagged_name - - name, tag = parse_tagged_name(expr.function) - if name not in self.rule_mapping_context.old_subst_rules: - new_call_with_kwargs = self.rec(CallWithKwargs( - function=expr.function, parameters=expr.parameters, - kw_parameters={}), expn_state) - return Call(new_call_with_kwargs.function, - new_call_with_kwargs.parameters) - else: - return self.map_substitution(name, tag, expr.parameters, expn_state) - - def map_call_with_kwargs(self, expr, expn_state): - from loopy.symbolic import ResolvedFunction - - if not isinstance(expr.function, ResolvedFunction): - - # search the kernel for the function. - in_knl_callable = self.kernel.find_scoped_function_identifier( - expr.function.name) - - if in_knl_callable: - # associate the newly created ResolvedFunction with the - # resolved in-kernel callable - self.scoped_functions[expr.function.name] = in_knl_callable - return type(expr)( - ResolvedFunction(expr.function.name), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - - # this is an unknown function as of yet, do not modify it - return super(FunctionScoper, self).map_call_with_kwargs(expr, - expn_state) - - def map_reduction(self, expr, expn_state): - self.scoped_functions.update( - expr.operation.get_scalar_callables(self.kernel)) - return super(FunctionScoper, self).map_reduction(expr, expn_state) - - -def scope_functions(kernel): - """ - Returns a kernel with the pymbolic nodes involving known functions realized - as instances of :class:`loopy.symbolic.ResolvedFunction`, along with the - resolved functions being added to the ``scoped_functions`` dictionary of - the kernel. 
- """ - - from loopy.symbolic import SubstitutionRuleMappingContext - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, kernel.get_var_name_generator()) - - function_scoper = FunctionScoper(rule_mapping_context, kernel) - - # scoping fucntions and collecting the scoped functions - kernel_with_scoped_functions = rule_mapping_context.finish_kernel( - function_scoper.map_kernel(kernel)) - - # updating the functions collected during the scoped functions - updated_scoped_functions = kernel.scoped_functions.copy() - updated_scoped_functions.update(function_scoper.scoped_functions) - - return kernel_with_scoped_functions.copy( - scoped_functions=updated_scoped_functions) - -# }}} - - # {{{ slice to sub array ref def get_slice_params(slice, dimension_length): @@ -2444,16 +2344,14 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) - knl = scope_functions(knl) + from loopy.kernel.tools import infer_arg_is_output_only + knl = infer_arg_is_output_only(knl) from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) creation_plog.done() - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) - return knl # }}} diff --git a/loopy/program.py b/loopy/program.py index 0ff2d41a2..cf6068451 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -95,15 +95,18 @@ class FunctionResolver(RuleAwareIdentityMapper): if not isinstance(expr.function, ResolvedFunction): # search the kernel for the function. - in_knl_callable = self.find_scoped_function_identifier( + in_knl_callable = self.find_resolved_function_from_identifier( expr.function.name) if in_knl_callable: # associate the newly created ResolvedFunction with the # resolved in-kernel callable - self.scoped_functions[expr.function.name] = in_knl_callable + + self.program_callables_info, new_func_id = ( + self.program_callables_info.with_callable(expr.function, + in_knl_callable, True)) return type(expr)( - ResolvedFunction(expr.function.name), + ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) for child in expr.parameters), dict( @@ -121,26 +124,29 @@ class FunctionResolver(RuleAwareIdentityMapper): return super(FunctionResolver, self).map_reduction(expr, expn_state) -def resolve_callables(name, resolved_functions, function_resolvers): +def resolve_callables(name, program_callables_info, function_resolvers): - kernel = resolved_functions[name].subkernel + kernel = program_callables_info[name].subkernel from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) - function_scoper = FunctionResolver(rule_mapping_context, kernel) + function_resolver = FunctionResolver(rule_mapping_context, kernel, + program_callables_info, function_resolvers) # scoping fucntions and collecting the scoped functions - kernel_with_scoped_functions = rule_mapping_context.finish_kernel( - function_scoper.map_kernel(kernel)) + kernel_with_functions_resolved = rule_mapping_context.finish_kernel( + function_resolver.map_kernel(kernel)) + program_callables_info = function_resolver.program_callables_info + + new_in_knl_callable = program_callables_info[name].copy( + subkernel=kernel_with_functions_resolved) + program_callables_info, _ = program_callables_info.with_callable( + Variable(name), new_in_knl_callable) - # updating the functions collected during the scoped functions - 
updated_scoped_functions = kernel.scoped_functions.copy() - updated_scoped_functions.update(function_scoper.scoped_functions) + return program_callables_info - return kernel_with_scoped_functions.copy( - scoped_functions=updated_scoped_functions) # {{{ program definition @@ -151,7 +157,8 @@ class Program(ImmutableRecord): target=None, function_resolvers=None): - # fixme: check if all sanity checks have been covered? + # FIXME: check if all sanity checks have been covered? + # FIXME: The comments over here may need some attention. assert root_kernel_name in program_callables_info if target is None: @@ -161,7 +168,9 @@ class Program(ImmutableRecord): # populate the function scopers from the target and the loopy # specific callable scopers - assert len(program_callables_info.resolved_functons) == 1 + # at this point only the root kernel can be present in the + # callables. + assert len(program_callables_info.resolved_functions) == 1 from loopy.library.function import loopy_specific_callable_scopers function_resolvers = [loopy_specific_callable_scopers] + ( @@ -175,9 +184,9 @@ class Program(ImmutableRecord): for name, in_knl_callable in program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): # resolve the callables in the subkernel - resolved_functions = resolve_callables(name, - program_callables_info, function_resolvers) - + program_callables_info = ( + resolve_callables(name, program_callables_info, + function_resolvers)) elif isinstance(in_knl_callable, ScalarCallable): pass else: @@ -186,14 +195,26 @@ class Program(ImmutableRecord): program_callables_info, renames_needed = ( program_callables_info.with_exit_edit_mode()) + + # at this point no renames must be needed assert not renames_needed super(Program, self).__init__( root_kernel_name=root_kernel_name, - resolved_functions=resolved_functions, + program_callables_info=program_callables_info, target=target, function_resolvers=function_resolvers) + def __str__(self): + # FIXME: make this better + print(self.program_callables_info.num_times_callables_called) + return ( + (self.program_callables_info[ + self.root_kernel_name].subkernel).__str__() + + '\nResolved Functions: ' + + (self.program_callables_info.resolved_functions.keys()).__str__() + + '\n' + 75*'-' + '\n') + # }}} @@ -245,7 +266,7 @@ class ProgramCallablesInfo(ImmutableRecord): super(ProgramCallablesInfo, self).__init__( resolved_functions=resolved_functions, num_times_callables_called=num_times_callables_called, - history_of_callables_callable_names=history_of_callable_names, + history_of_callable_names=history_of_callable_names, old_resolved_functions=old_resolved_functions, is_being_edited=is_being_edited, num_times_hit_during_editing=num_times_hit_during_editing, @@ -254,17 +275,25 @@ class ProgramCallablesInfo(ImmutableRecord): def with_edit_callables_mode(self): return self.copy(is_being_edited=True, old_resolved_functions=self.resolved_functions.copy(), - num_times_hit_during_editring=dict((func_id, 0) for func_id in + num_times_hit_during_editing=dict((func_id, 0) for func_id in self.resolved_functions)) - def with_callable(self, function, in_kernel_callable): + def with_callable(self, function, in_kernel_callable, + resolved_for_the_first_time=False): """ :arg function: An instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.library.reduction.ReductionOpFunction`. :arg in_kernel_callables: An instance of :class:`loopy.InKernelCallable`. + + .. 
note:: + + Assumes that each callable is touched atmost once, the internal + working of this function fails if that is violated and raises a + *RuntimeError*. """ + # FIXME: add a note about using enter and exit assert self.is_being_edited from loopy.library.reduction import ArgExtOp, SegmentedOp @@ -277,59 +306,83 @@ class ProgramCallablesInfo(ImmutableRecord): renames_needed_after_editing = self.renames_needed_after_editing.copy() num_times_hit_during_editing = self.num_times_hit_during_editing.copy() - num_times_callable_being_called = self.num_times_being_called.copy() - num_times_hit_during_editing[function.name] += 1 + num_times_callables_called = ( + self.num_times_callables_called.copy()) + + if function.name in self.old_resolved_functions: + num_times_hit_during_editing[function.name] += 1 if in_kernel_callable in self.resolved_functions.values(): - for func_id, in_knl_callable in self.scoped_functions.items(): + for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: - num_times_callable_being_called[func_id] += 1 - num_times_callable_being_called[function] -= 1 - if num_times_callable_being_called[function] == 0: - renames_needed_after_editing[func_id] = function - - return self, func_id + num_times_callables_called[func_id] += 1 + if not resolved_for_the_first_time: + num_times_callables_called[function.name] -= 1 + if num_times_callables_called[function.name] == 0: + renames_needed_after_editing[func_id] = function.name + + return ( + self.copy( + num_times_hit_during_editing=( + num_times_hit_during_editing), + num_times_callables_called=( + num_times_callables_called), + renames_needed_after_editing=( + renames_needed_after_editing)), + func_id) else: # {{{ ingoring this for now if False and isinstance(function, (ArgExtOp, SegmentedOp)): - # ignoring this casse for now + # FIXME: ignoring this casse for now # FIXME: If a kernel has two flavors of ArgExtOp then they are # overwritten and hence not supported.(for now). - updated_scoped_functions = self.scoped_functions.copy() - updated_scoped_functions[function] = in_kernel_callable + updated_resolved_functions = self.scoped_functions.copy() + updated_resolved_functions[function] = in_kernel_callable - return self.copy(updated_scoped_functions), function.copy() + return self.copy(updated_resolved_functions), function.copy() # }}} - #fixme: deal with the history over here. + # FIXME: deal with the history over here. + # FIXME: once the code logic is running beautify this part. 
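The enter/edit/exit protocol this method belongs to is easiest to see in a small usage sketch; the names ``callables_info`` and ``sin_scalar_callable`` below are placeholders, not part of this code:

    from pymbolic.primitives import Variable

    # all edits are bracketed by the enter/exit calls
    callables_info = callables_info.with_edit_callables_mode()

    # registering a callable may hand back a new unique identifier
    # (e.g. "sin_0") if the requested name is already in use
    callables_info, func_id = callables_info.with_callable(
            Variable("sin"), sin_scalar_callable,
            resolved_for_the_first_time=True)

    # leaving edit mode also reports any renames the edits made necessary
    callables_info, renames_needed = callables_info.with_exit_edit_mode()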
+ # many "ifs" can be avoided unique_function_identifier = function.name - if self.num_times[function.name] > 1: - while unique_function_identifier in self.scoped_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) - - num_times_callable_being_called[function] -= 1 - num_times_callable_being_called[unique_function_identifier] = 1 - - updated_scoped_functions = self.scoped_functions.copy() - updated_scoped_functions[unique_function_identifier] = in_kernel_callable - - return (self.copy(scoped_functions=updated_scoped_functions), + if function.name in self.old_resolved_functions: + if self.num_times_callables_called[function.name] > 1: + while unique_function_identifier in self.scoped_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + if not resolved_for_the_first_time: + num_times_callables_called[function.name] -= 1 + num_times_callables_called[unique_function_identifier] = 1 + else: + num_times_callables_called[unique_function_identifier] = 1 + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + resolved_functions=updated_resolved_functions, + num_times_callables_called=num_times_callables_called, + num_times_hit_during_editing=num_times_hit_during_editing, + renames_needed_after_editing=renames_needed_after_editing), Variable(unique_function_identifier)) def with_exit_edit_mode(self): assert self.is_being_edited - num_times_callable_being_called = self.num_times_callable_being_called.copy() + num_times_callables_called = self.num_times_callables_called.copy() for func_id in self.old_resolved_functions: if self.num_times_hit_during_editing[func_id] > 0 and ( self.num_times_hit_during_editing[func_id] < - num_times_callable_being_called[func_id]): + num_times_callables_called[func_id]): unique_function_identifier = func_id while unique_function_identifier in self.scoped_functions: @@ -337,28 +390,28 @@ class ProgramCallablesInfo(ImmutableRecord): next_indexed_function_identifier( unique_function_identifier)) - (num_times_callable_being_called[func_id], - num_times_callable_being_called[unique_function_identifier]) = ( + (num_times_callables_called[func_id], + num_times_callables_called[unique_function_identifier]) = ( self.num_times_hit_while_editing[func_id], - num_times_callable_being_called[func_id] - + num_times_callables_called[func_id] - self.num_times_being_hit_while_editing[func_id]) - if self.num_times_hit_during_edition[func_id] > 0 and ( + if self.num_times_hit_during_editing[func_id] > 0 and ( self.num_times_hit_during_editing[func_id] > - num_times_callable_being_called[func_id]): + num_times_callables_called[func_id]): raise RuntimeError("Should not traverse more number of times than " "it is called.") return ( self.copy( - is_begin_edited=False, - num_times_callable_being_called=num_times_callable_being_called, + is_being_edited=False, + num_times_callables_called=num_times_callables_called, num_times_hit_during_editing={}, - renames_needed_while_editing={}), - self.renames_needed_while_editing) + renames_needed_after_editing={}), + self.renames_needed_after_editing) def __getitem__(self, item): - return self.reoslved_functions[item] + return self.resolved_functions[item] def __contains__(self, item): return item in self.resolved_functions -- GitLab From e2ea68351fcfc34d9242964450b09af11d662626 Mon Sep 17 00:00:00 2001 From: Kaushik 
Kulkarni Date: Thu, 26 Jul 2018 02:41:35 -0500 Subject: [PATCH 243/916] proceed towards type inference. --- loopy/codegen/__init__.py | 6 ++- loopy/kernel/__init__.py | 32 +------------ loopy/kernel/creation.py | 4 +- loopy/kernel/tools.py | 12 +++-- loopy/preprocess.py | 18 +++++++- loopy/program.py | 73 +++++++++++++++++++++++++++++- loopy/target/execution.py | 2 +- loopy/target/pyopencl_execution.py | 8 ++-- 8 files changed, 112 insertions(+), 43 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index eacd53886..00e95b17d 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -410,7 +410,7 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel): +def generate_code_for_a_single_kernel(kernel, is_root_kernel=True): """ :returns: a :class:`CodeGenerationResult` """ @@ -619,6 +619,10 @@ def generate_code_v2(kernel): return codegen_result +def generate_code_v2(program): + pass + + def generate_code(kernel, device=None): if device is not None: from warnings import warn diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 48a77c425..374b88a38 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1394,39 +1394,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ implementation arguments - - @property - @memoize_method - def impl_arg_to_arg(self): - from loopy.kernel.array import ArrayBase - - result = {} - - for arg in self.args: - if not isinstance(arg, ArrayBase): - result[arg.name] = arg - continue - - if arg.shape is None or arg.dim_tags is None: - result[arg.name] = arg - continue - - subscripts_and_names = arg.subscripts_and_names() - if subscripts_and_names is None: - result[arg.name] = arg - continue - - for index, sub_arg_name in subscripts_and_names: - result[sub_arg_name] = arg - - return result - - # }}} - # {{{ direct execution def __call__(self, *args, **kwargs): + # FIXME: scream and then convert to a program + 1/0 key = self.target.get_kernel_executor_cache_key(*args, **kwargs) try: kex = self._kernel_executor_cache[key] diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index fa27bc5b6..22bdf5f84 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2347,8 +2347,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): from loopy.kernel.tools import infer_arg_is_output_only knl = infer_arg_is_output_only(knl) - from loopy.preprocess import prepare_for_caching - knl = prepare_for_caching(knl) + from loopy.preprocess import prepare_single_kernel_for_caching + knl = prepare_single_kernel_for_caching(knl) creation_plog.done() diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 54e30fa7a..5492b091c 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -43,19 +43,25 @@ logger = logging.getLogger(__name__) # {{{ add and infer argument dtypes -def add_dtypes(knl, dtype_dict): +def add_dtypes(program, dtype_dict): """Specify remaining unspecified argument/temporary variable types. 
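    A minimal usage sketch (the argument names and dtypes are purely
    illustrative): the returned object is a new program whose root kernel
    carries the added types, e.g.::

        import numpy as np
        program = add_dtypes(program, {"x": np.float32, "n": np.int32})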
:arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ - dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(knl, dtype_dict) + root_kernel = program.root_kernel + dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes( + root_kernel, dtype_dict) if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) + root_kernel - return knl.copy(args=new_args, temporary_variables=new_temp_vars) + root_kernel_with_added_dtypes = ( + root_kernel.copy(args=new_args, temporary_variables=new_temp_vars)) + + return program.with_root_kernel(root_kernel_with_added_dtypes) def _add_dtypes_overdetermined(knl, dtype_dict): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1779ec692..d763833d0 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -42,6 +42,7 @@ from loopy.symbolic import CombineMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -49,7 +50,7 @@ logger = logging.getLogger(__name__) # {{{ prepare for caching -def prepare_for_caching(kernel): +def prepare_single_kernel_for_caching(kernel): import loopy as lp new_args = [] @@ -76,6 +77,21 @@ def prepare_for_caching(kernel): return kernel + +def prepare_for_caching(program): + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = prepare_single_kernel_for_caching( + in_knl_callable.subkernel) + new_resolved_functions[func_id] = ( + in_knl_callable.copy(subkernel=new_subkernel)) + elif isinstance(in_knl_callable, ScalarCallable): + new_resolved_functions[func_id] = in_knl_callable + else: + raise NotImplementedError("Unknown InKernelCallable %s." % + type(in_knl_callable).__name__) + # }}} diff --git a/loopy/program.py b/loopy/program.py index cf6068451..70956ab0b 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -25,7 +25,7 @@ THE SOFTWARE. 
import six import re -from pytools import ImmutableRecord +from pytools import ImmutableRecord, memoize_method from pymbolic.primitives import Variable from loopy.symbolic import RuleAwareIdentityMapper @@ -205,6 +205,73 @@ class Program(ImmutableRecord): target=target, function_resolvers=function_resolvers) + self._program_executor_cache = {} + + @property + def name(self): + #FIXME: discuss with @inducer if we use "name" instead of + # "root_kernel_name" + return self.root_kernel_name + + @property + def root_kernel(self): + return self.program_callables_info[self.root_kernel_name].subkernel + + def with_root_kernel(self, root_kernel): + new_in_knl_callable = self.program_callables_info[ + self.root_kernel_name].copy(subkernel=root_kernel) + new_resolved_functions = ( + self.program_callables_info.resolved_functions.copy()) + new_resolved_functions[self.root_kernel_name] = new_in_knl_callable + + return self.copy( + program_callables_info=self.program_callables_info.copy( + resolved_functions=new_resolved_functions)) + + @property + def args(self): + return self.root_kernel.args[:] + + # {{{ implementation arguments + + @property + @memoize_method + def impl_arg_to_arg(self): + from loopy.kernel.array import ArrayBase + + result = {} + + for arg in self.args: + if not isinstance(arg, ArrayBase): + result[arg.name] = arg + continue + + if arg.shape is None or arg.dim_tags is None: + result[arg.name] = arg + continue + + subscripts_and_names = arg.subscripts_and_names() + if subscripts_and_names is None: + result[arg.name] = arg + continue + + for index, sub_arg_name in subscripts_and_names: + result[sub_arg_name] = arg + + return result + + # }}} + + def __call__(self, *args, **kwargs): + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) + try: + pex = self._program_executor_cache[key] + except KeyError: + pex = self.target.get_kernel_executor(self, *args, **kwargs) + self._program_executor_cache[key] = pex + + return pex(*args, **kwargs) + def __str__(self): # FIXME: make this better print(self.program_callables_info.num_times_callables_called) @@ -250,6 +317,8 @@ def next_indexed_function_identifier(function): num=int(match.group('num'))+1) +# {{{ program callables info + class ProgramCallablesInfo(ImmutableRecord): def __init__(self, resolved_functions, num_times_callables_called=None, history_of_callable_names=None, is_being_edited=False, @@ -419,6 +488,8 @@ class ProgramCallablesInfo(ImmutableRecord): def items(self): return self.resolved_functions.items() +# }}} + def make_program_from_kernel(kernel): callable_knl = CallableKernel(subkernel=kernel) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3cdf20577..8f0f8edda 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -723,7 +723,7 @@ class KernelExecutorBase(object): self.packing_controller = SeparateArrayPackingController(kernel) self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) + if arg.is_output_only) self.has_runtime_typed_args = any( arg.dtype is None diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 27be61987..73e722af5 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -252,7 +252,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. 
automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, program): """ :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels @@ -261,13 +261,13 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. """ - super(PyOpenCLKernelExecutor, self).__init__(kernel) + super(PyOpenCLKernelExecutor, self).__init__(program) self.context = context from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(kernel.target, PyOpenCLTarget): - self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0])) + if isinstance(program.target, PyOpenCLTarget): + self.kernel = program.copy(target=PyOpenCLTarget(context.devices[0])) def get_invoker_uncached(self, kernel, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() -- GitLab From fa0e5e5f664656a85c1a017ef0aa22d9be428614 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 26 Jul 2018 21:48:32 -0500 Subject: [PATCH 244/916] work on type inference. --- loopy/kernel/function_interface.py | 26 ++++---- loopy/type_inference.py | 96 +++++++++++++++++++++++++----- 2 files changed, 96 insertions(+), 26 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 3db4c082b..d051d8c65 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -201,7 +201,7 @@ class InKernelCallable(ImmutableRecord): def __getinitargs__(self): return (self.arg_id_to_dtype, self.arg_id_to_descr) - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): """ :arg arg_id_to_type: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -218,10 +218,12 @@ class InKernelCallable(ImmutableRecord): Any argument information exists both by its positional and its keyword identifier. """ + # FIXME: In all these with_** functions add that also passes a + # program_callables_info raise NotImplementedError() - def with_descrs(self, arg_id_to_descr): + def with_descrs(self, arg_id_to_descr, program_callables_info): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -348,7 +350,7 @@ class ScalarCallable(InKernelCallable): return (self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): raise LoopyError("No type inference information present for " "the function %s." 
% (self.name)) @@ -511,8 +513,8 @@ class CallableKernel(InKernelCallable): def name(self): return self.subkernel.name - def with_types(self, arg_id_to_dtype, kernel): - + def with_types(self, arg_id_to_dtype, caller_kernel, + program_callables_info): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) new_args = [] @@ -528,26 +530,30 @@ class CallableKernel(InKernelCallable): else: new_args.append(arg) - from loopy.type_inference import infer_unknown_types + from loopy.type_inference import ( + infer_unknown_types_for_a_single_kernel) pre_specialized_subkernel = self.subkernel.copy( args=new_args) # infer the types of the written variables based on the knowledge # of the types of the arguments supplied - specialized_kernel = infer_unknown_types(pre_specialized_subkernel, - expect_completion=True) + specialized_kernel, program_callables_info = ( + infer_unknown_types_for_a_single_kernel( + pre_specialized_subkernel, + program_callables_info, + expect_completion=True)) new_arg_id_to_dtype = {} for arg in specialized_kernel.args: # associate the updated_arg_id_to_dtype with keyword as well as - # positional id + # positional id. new_arg_id_to_dtype[arg.name] = arg.dtype new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype # Return the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype return self.copy(subkernel=specialized_kernel, - arg_id_to_dtype=new_arg_id_to_dtype) + arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info def with_descrs(self, arg_id_to_descr): diff --git a/loopy/type_inference.py b/loopy/type_inference.py index a5b3003d4..6225e4c11 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -60,7 +60,7 @@ def get_return_types_as_tuple(arg_id_to_dtype): # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, new_assignments=None): + def __init__(self, kernel, program_callables_info, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -73,8 +73,8 @@ class TypeInferenceMapper(CombineMapper): new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() - self.scoped_functions = kernel.scoped_functions - self.specialized_functions = {} + self.program_callables_info = program_callables_info + self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -108,7 +108,8 @@ class TypeInferenceMapper(CombineMapper): # are Python-equal (for many common constants such as integers). 
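A rough sketch of how the reworked mapper is meant to be driven (``kernel``, ``program_callables_info`` and ``expr`` are placeholders): dtypes come back as before, while the side effects accumulate on the mapper itself, both in ``program_callables_info`` and in ``old_calls_to_new_calls``, which maps each original call expression to its renamed function identifier.

    type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info)
    dtype_sets = type_inf_mapper(expr, return_dtype_set=True)

    # e.g. {Call(ResolvedFunction("sin"), (Variable("x"),)): "sin_0"}
    old_calls_to_new_calls = type_inf_mapper.old_calls_to_new_calls
    program_callables_info = type_inf_mapper.program_callables_info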
def copy(self): - return type(self)(self.kernel, self.new_assignments) + return type(self)(self.kernel, self.program_callables_info, + self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() @@ -322,13 +323,31 @@ class TypeInferenceMapper(CombineMapper): # }}} - in_knl_callable = in_knl_callable.with_types( - arg_id_to_dtype, self.kernel) + in_knl_callable, self.program_callables_info = ( + in_knl_callable.with_types( + arg_id_to_dtype, self.kernel, + self.program_callables_info)) + + in_knl_callable = in_knl_callable.with_target(self.kernel.target) # storing the type specialized function so that it can be used for # later use - self.specialized_functions[expr] = in_knl_callable.with_target( - self.kernel.target) + self.program_callables_info, new_function_id = ( + self.program_callables_info.with_callable( + expr.function, + in_knl_callable.with_target(self.kernel.target))) + + if isinstance(expr, Call): + self.old_calls_to_new_calls = Call( + ResolvedFunction(new_function_id), + expr.parameters) + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls = CallWithKwargs( + ResolvedFunction(new_function_id), + expr.parameters, kw_parameters) + + self.old_calls_to_new_calls = Call new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype @@ -353,6 +372,7 @@ class TypeInferenceMapper(CombineMapper): # finding the function_mangler which would be associated with the # realized function. + mangle_result = None for function_mangler in self.kernel.function_manglers: mangle_result = function_mangler(self.kernel, identifier, @@ -379,9 +399,22 @@ class TypeInferenceMapper(CombineMapper): # creating the ManglerCallable object corresponding to the # function. - self.specialized_functions[expr] = ManglerCallable( + in_knl_callable = ManglerCallable( identifier, function_mangler, arg_id_to_dtype, arg_id_to_descr, mangle_result.target_name) + self.program_callables_info, new_function_id = ( + self.program_callables_info.with_callable( + expr.function, in_knl_callable)) + + if isinstance(expr, Call): + self.old_calls_to_new_calls = Call( + ResolvedFunction(new_function_id), + expr.parameters) + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls = CallWithKwargs( + ResolvedFunction(new_function_id), + expr.parameters, kw_parameters) # Returning the type. 
if return_tuple: @@ -575,7 +608,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): result = type_inf_mapper.combine(dtype_sets) return (result, type_inf_mapper.symbols_with_unknown_types, - type_inf_mapper.specialized_functions) + type_inf_mapper.old_calls_to_new_calls) # }}} @@ -602,7 +635,8 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types(kernel, expect_completion=False): +def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, + expect_completion=False): """Infer types on temporaries and arguments.""" logger.debug("%s: infer types" % kernel.name) @@ -664,7 +698,8 @@ def infer_unknown_types(kernel, expect_completion=False): new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, item_lookup) + type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info, + item_lookup) from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) @@ -673,7 +708,7 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument - specialized_functions = {} + old_calls_to_new_calls = {} for var_chain in sccs: changed_during_last_queue_run = False @@ -698,7 +733,7 @@ def infer_unknown_types(kernel, expect_completion=False): debug("inferring type for %s %s", type(item).__name__, item.name) - result, symbols_with_unavailable_types, new_specialized_functions = ( + result, symbols_with_unavailable_types, new_old_calls_to_new_calls = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) @@ -722,7 +757,7 @@ def infer_unknown_types(kernel, expect_completion=False): # TODO: I dont like in-place updates. Change this to something # else. Perhaps add a function for doing this, which does it # using a bunch of copies? 
- specialized_functions.update(new_specialized_functions) + old_calls_to_new_calls.update(new_old_calls_to_new_calls) else: debug(" failure") @@ -770,6 +805,7 @@ def infer_unknown_types(kernel, expect_completion=False): args=[new_arg_dict[arg.name] for arg in kernel.args], ) + # this has to be subsitutition from loopy.kernel.function_interface import ( register_pymbolic_calls_to_knl_callables) type_specialized_kernel = register_pymbolic_calls_to_knl_callables( @@ -780,7 +816,35 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.check import check_functions_are_scoped check_functions_are_scoped(type_specialized_kernel) - return type_specialized_kernel + return program_callables_info, type_specialized_kernel + + +def infer_unknown_types(program, expect_completion=False): + """Infer types on temporaries and arguments.""" + + program_callables_info = program.progra_callables_info + + type_uninferred_knl_callable = ( + program_callables_info[program.root_kernel_name]) + type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel + + program_callables_info = program.program_calllables_info.with_edit_mode() + root_kernel, program_callables_info = ( + infer_unknown_types_for_a_single_kernel( + type_uninferred_root_kernel, + program_callables_info, expect_completion)) + + type_inferred_knl_callable = type_uninferred_knl_callable.copy( + subkernel=root_kernel) + + program_callables_info.with_callable(program.root_kernel_name, + type_inferred_knl_callable) + + program_callables_info, renames_needed = ( + program_callables_info.with_exit_mode()) + + return program.with_renamed_callables( + program_callables_info, renames_needed) # }}} -- GitLab From 682ab6229fd67455ee91d4b6973b65ec1b3356d6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 27 Jul 2018 10:57:13 -0500 Subject: [PATCH 245/916] type inference works for simple cases. 
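Stripped of the intermediate bookkeeping, the program-level entry point introduced above is intended to work roughly as in the following simplified sketch (helper and attribute names follow this series, but the sketch is not a verbatim copy of the code):

    def infer_unknown_types(program, expect_completion=False):
        callables = program.program_callables_info
        root_clbl = callables[program.root_kernel_name]

        callables = callables.with_edit_callables_mode()

        # per-kernel inference threads the callables table through
        root_kernel, callables = infer_unknown_types_for_a_single_kernel(
                root_clbl.subkernel, callables, expect_completion)

        # re-register the specialized root kernel and leave edit mode
        callables, _ = callables.with_callable(
                program.root_kernel_name,
                root_clbl.copy(subkernel=root_kernel))
        callables = callables.with_exit_edit_callables_mode()

        return program.copy(program_callables_info=callables)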
--- loopy/kernel/function_interface.py | 121 ++++++----------------------- loopy/program.py | 7 +- loopy/target/c/__init__.py | 31 +++++--- loopy/target/cuda.py | 29 ++++--- loopy/target/opencl.py | 46 +++++++---- loopy/target/pyopencl.py | 22 ++++-- loopy/transform/callable.py | 8 +- loopy/type_inference.py | 46 +++++------ 8 files changed, 138 insertions(+), 172 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index d051d8c65..aac793efb 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -31,14 +31,11 @@ from six.moves import zip from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from pymbolic.primitives import Variable from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, SubstitutionRuleExpander) -from pymbolic.primitives import Call - # {{{ argument descriptors @@ -782,15 +779,16 @@ def next_indexed_variable(function): num=int(match.group('num'))+1) -class ResolvedFunctionNameChanger(RuleAwareIdentityMapper): +class FunctionNameChanger(RuleAwareIdentityMapper): """ Changes the names of scoped functions in calls of expressions according to - the mapping ``expr_to_new_names`` + the mapping ``calls_to_new_functions`` """ - def __init__(self, rule_mapping_context, expr_to_new_names, subst_expander): - super(ResolvedFunctionNameChanger, self).__init__(rule_mapping_context) - self.expr_to_new_names = expr_to_new_names + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super(FunctionNameChanger, self).__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names self.subst_expander = subst_expander def map_call(self, expr, expn_state): @@ -798,27 +796,29 @@ class ResolvedFunctionNameChanger(RuleAwareIdentityMapper): if name not in self.rule_mapping_context.old_subst_rules: expanded_expr = self.subst_expander(expr) - if expr in self.expr_to_new_names: + if expr in self.calls_to_new_names: return type(expr)( - ResolvedFunction(self.expr_to_new_names[expr]), + ResolvedFunction(self.calls_to_new_names[expr]), tuple(self.rec(child, expn_state) for child in expr.parameters)) - elif expanded_expr in self.expr_to_new_names: + elif expanded_expr in self.calls_to_new_names: + # FIXME: this is horribly wrong logic. 
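In effect, this mapper rewrites the ResolvedFunction inside a call according to ``calls_to_new_names``; ``change_names_of_pymbolic_calls`` (defined further down) drives it over a whole kernel. A minimal illustration with a made-up kernel and renaming dict:

    from pymbolic.primitives import Call, Variable
    from loopy.symbolic import ResolvedFunction

    # map an existing call to its new, type-specialized function name
    renames = {Call(ResolvedFunction("sin"), (Variable("x"),)): "sin_0"}
    knl = change_names_of_pymbolic_calls(knl, renames)
    # the matching instructions now reference ResolvedFunction("sin_0")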
+ # investigate how to make edits to a substitution rule return type(expr)( - ResolvedFunction(self.expr_to_new_names[expanded_expr]), + ResolvedFunction(self.calls_to_new_names[expanded_expr]), tuple(self.rec(child, expn_state) - for child in expr.parameters)) + for child in expanded_expr.parameters)) else: - return super(ResolvedFunctionNameChanger, self).map_call( + return super(FunctionNameChanger, self).map_call( expr, expn_state) else: return self.map_substitution(name, tag, expr.parameters, expn_state) def map_call_with_kwargs(self, expr, expn_state): - if expr in self.expr_to_new_names: + if expr in self.calls_to_new_names: return type(expr)( - ResolvedFunction(self.expr_to_new_names[expr]), + ResolvedFunction(self.calls_to_new_names[expr]), tuple(self.rec(child, expn_state) for child in expr.parameters), dict( @@ -826,96 +826,19 @@ class ResolvedFunctionNameChanger(RuleAwareIdentityMapper): for key, val in six.iteritems(expr.kw_parameters)) ) else: - return super(ResolvedFunctionNameChanger, self).map_call_with_kwargs( + return super(FunctionNameChanger, self).map_call_with_kwargs( expr, expn_state) -def register_pymbolic_calls_to_knl_callables(kernel, - pymbolic_calls_to_knl_callables): - # FIXME This could use an example. I have no idea what this does. - # Surely I can't associate arbitrary pymbolic expresions (3+a?) - # with callables? - """ - Returns a copy of :arg:`kernel` which includes an association with the given - pymbolic calls to the instances of :class:`InKernelCallable` for the - mapping given by :arg:`pymbolic_calls_to_knl_calllables`. - - :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`. - - :arg pymbolic_calls_to_knl_callables: A mapping from :mod:`pymbolic` expressions - to the instances of - :class:`loopy.kernel.function_interface.InKernelCallable`. - - *Example:* Conisder the expression of an instruction in the kernel as - ``Call(ResolvedFunction('sin_0'), Variable('x'))``, with the - ``scoped_functions`` of the *kernel* being ``{'sin_0': - ScalarCallable(name='sin')}`` and the argument - ``pymbolic_calls_to_callables = {Call(ResolvedFunction('sin_0'), - Variable('x')): ScalarCallable(name='sin', arg_id_to_dtype={0: float64, - -1: np.float64})}``. After applying the transformation the expression - would rename its function name and hence would become - ``Call(ResolvedFunction('sin_1'), Variable('x'))`` and the transformed - kernel would have ``scoped_functions={'sin_0': - ScalarCallable(name='sin'), 'sin_1': Variable('x')): - ScalarCallable(name='sin', arg_id_to_dtype={0: np.float64, -1: - np.float64})}``. Hence, the expression would rename the function - pymbolic node and the scoped functions dictionary would register the - new callable corresponding to the new pymbolic node. - """ - - scoped_names_to_functions = kernel.scoped_functions.copy() - - # A dict containing the new scoped functions to the names which have been - # assigned to them - scoped_functions_to_names = {} - - # A dict containing the new name that need to be assigned to the - # corresponding pymbolic call - pymbolic_calls_to_new_names = {} - - for pymbolic_call, in_knl_callable in pymbolic_calls_to_knl_callables.items(): - # check if such a in-kernel callable already exists. - assert isinstance(pymbolic_call, Call) - if in_knl_callable not in scoped_functions_to_names: - # No matching in_knl_callable found, implies make a new one with a new - # name. 
- if isinstance(pymbolic_call.function, Variable): - pymbolic_call_function = pymbolic_call.function - elif isinstance(pymbolic_call.function, ResolvedFunction): - pymbolic_call_function = pymbolic_call.function.function - else: - raise NotImplementedError("Unknown type %s for pymbolic call " - "function" % type(pymbolic_call).__name__) - - unique_var = next_indexed_variable(pymbolic_call_function) - from loopy.library.reduction import ArgExtOp, SegmentedOp - while unique_var in scoped_names_to_functions and not isinstance( - unique_var, (ArgExtOp, SegmentedOp)): - # keep on finding new names till one a unique one is found. - unique_var = next_indexed_variable(Variable(unique_var)) - - # book-keeping of the functions and names mappings for later use - if isinstance(in_knl_callable, CallableKernel): - # for array calls the name in the target is the name of the - # scoped funciton - in_knl_callable = in_knl_callable.copy( - name_in_target=unique_var) - scoped_names_to_functions[unique_var] = in_knl_callable - scoped_functions_to_names[in_knl_callable] = unique_var - - pymbolic_calls_to_new_names[pymbolic_call] = ( - scoped_functions_to_names[in_knl_callable]) - - # Use the data populated in pymbolic_calls_to_new_names to change the - # names of the scoped functions of all the calls in the kernel. +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, kernel.get_var_name_generator()) + kernel.substitutions, kernel.get_var_name_generator()) subst_expander = SubstitutionRuleExpander(kernel.substitutions) - scope_changer = ResolvedFunctionNameChanger(rule_mapping_context, + name_changer = FunctionNameChanger(rule_mapping_context, pymbolic_calls_to_new_names, subst_expander) - scoped_kernel = scope_changer.map_kernel(kernel) - return scoped_kernel.copy(scoped_functions=scoped_names_to_functions) + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) # }}} diff --git a/loopy/program.py b/loopy/program.py index 70956ab0b..75e00616c 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -194,7 +194,7 @@ class Program(ImmutableRecord): type(in_knl_callable).__name__) program_callables_info, renames_needed = ( - program_callables_info.with_exit_edit_mode()) + program_callables_info.with_exit_edit_callables_mode()) # at this point no renames must be needed assert not renames_needed @@ -369,6 +369,9 @@ class ProgramCallablesInfo(ImmutableRecord): # {{{ sanity checks + if isinstance(function, str): + function = Variable(function) + assert isinstance(function, (Variable, ArgExtOp, SegmentedOp)) # }}} @@ -442,7 +445,7 @@ class ProgramCallablesInfo(ImmutableRecord): renames_needed_after_editing=renames_needed_after_editing), Variable(unique_function_identifier)) - def with_exit_edit_mode(self): + def with_exit_edit_callables_mode(self): assert self.is_being_edited num_times_callables_called = self.num_times_callables_called.copy() diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index eab1e6afc..eb7f43a37 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -362,7 +362,7 @@ class CMathCallable(ScalarCallable): C-Target. 
""" - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): name = self.name if name in ["abs", "min", "max"]: @@ -379,7 +379,9 @@ class CMathCallable(ScalarCallable): if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) dtype = arg_id_to_dtype[0] dtype = dtype.numpy_dtype @@ -391,7 +393,7 @@ class CMathCallable(ScalarCallable): raise LoopyTypeError("%s does not support type %s" % (name, dtype)) from loopy.target.opencl import OpenCLTarget - if not isinstance(kernel.target, OpenCLTarget): + if not isinstance(caller_kernel.target, OpenCLTarget): # for CUDA, C Targets the name must be modified if dtype == np.float64: pass # fabs @@ -403,8 +405,11 @@ class CMathCallable(ScalarCallable): raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - return self.copy(name_in_target=name, - arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + program_callables_info) # binary functions if name in ["fmax", "fmin"]: @@ -417,7 +422,9 @@ class CMathCallable(ScalarCallable): arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() @@ -428,7 +435,7 @@ class CMathCallable(ScalarCallable): elif dtype.kind == "f": from loopy.target.opencl import OpenCLTarget - if not isinstance(kernel.target, OpenCLTarget): + if not isinstance(caller_kernel.target, OpenCLTarget): if dtype == np.float64: pass # fmin elif dtype == np.float32: @@ -439,10 +446,14 @@ class CMathCallable(ScalarCallable): raise LoopyTypeError("%s does not support type %s" % (name, dtype)) dtype = NumpyType(dtype) - return self.copy(name_in_target=name, - arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + program_callables_info) - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) def scope_c_math_functions(target, identifier): diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index b2e4118d2..fe576cdca 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -122,7 +122,8 @@ _CUDA_SPECIFIC_FUNCTIONS = { class CudaCallable(ScalarCallable): - def cuda_with_types(self, arg_id_to_dtype, kernel): + def cuda_with_types(self, arg_id_to_dtype, caller_kernel, + program_callables_info): name = self.name @@ -135,13 +136,17 @@ class CudaCallable(ScalarCallable): arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) dtype = arg_id_to_dtype[0] scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] - return self.copy(name_in_target=name, - arg_id_to_dtype={-1: NumpyType(scalar_dtype), - 0: dtype, 1: dtype}) + return ( + self.copy(name_in_target=name, 
arg_id_to_dtype={-1: + NumpyType(scalar_dtype), + 0: dtype, 1: dtype}), + program_callables_info) if name in _CUDA_SPECIFIC_FUNCTIONS: num_args = _CUDA_SPECIFIC_FUNCTIONS[name] @@ -154,7 +159,9 @@ class CudaCallable(ScalarCallable): if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in @@ -167,10 +174,14 @@ class CudaCallable(ScalarCallable): updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, num_args)) - return self.copy(name_in_target=name, - arg_id_to_dtype=updated_arg_id_to_dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + program_callables_info) - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) def scope_cuda_functions(target, identifier): diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 6ee5969b3..81b6770c1 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -172,7 +172,7 @@ class OpenCLCallable(ScalarCallable): :class:`loopy.target.c.CMathCallable`. """ - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): name = self.name if name in ["max", "min"]: @@ -180,7 +180,9 @@ class OpenCLCallable(ScalarCallable): if not -1 <= id <= 1: raise LoopyError("%s can take only 2 arguments." % name) if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() @@ -190,8 +192,10 @@ class OpenCLCallable(ScalarCallable): if dtype.kind == 'f': name = 'f'+name dtype = NumpyType(dtype) - return self.copy(name_in_target=name, - arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + program_callables_info) else: # Unsupported type. 
raise LoopyError("%s function not supported for the types %s" % @@ -206,12 +210,16 @@ class OpenCLCallable(ScalarCallable): arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) dtype = arg_id_to_dtype[0] scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] - return self.copy(name_in_target=name, arg_id_to_dtype={-1: - NumpyType(scalar_dtype), 0: dtype, 1: dtype}) + return ( + self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), 0: dtype, 1: dtype}), + program_callables_info) if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] @@ -224,7 +232,9 @@ class OpenCLCallable(ScalarCallable): if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in @@ -237,8 +247,10 @@ class OpenCLCallable(ScalarCallable): updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, num_args)) - return self.copy(name_in_target=name, - arg_id_to_dtype=updated_arg_id_to_dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + program_callables_info) if name in VECTOR_LITERAL_FUNCS: base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] @@ -252,19 +264,25 @@ class OpenCLCallable(ScalarCallable): if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(count)) updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( NumpyType(dtype), count) - return self.copy(name_in_target="(%s%d) " % (base_tp_name, count), - arg_id_to_dtype=updated_arg_id_to_dtype) + return ( + self.copy(name_in_target="(%s%d) " % (base_tp_name, count), + arg_id_to_dtype=updated_arg_id_to_dtype), + program_callables_info) # does not satisfy any of the conditions needed for specialization. # hence just returning a copy of the callable. 
- return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) def scope_opencl_functions(target, identifier): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 27c4f4ab4..2ee70d65e 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -206,7 +206,7 @@ class PyOpenCLCallable(ScalarCallable): Records information about the callables which are not covered by :class:`loopy.target.opencl.OpenCLCallable` """ - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): name = self.name @@ -218,7 +218,9 @@ class PyOpenCLCallable(ScalarCallable): if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) dtype = arg_id_to_dtype[0] @@ -248,8 +250,10 @@ class PyOpenCLCallable(ScalarCallable): else: raise LoopyTypeError("unexpected complex type '%s'" % dtype) - return self.copy(name_in_target="%s_%s" % (tpname, name), - arg_id_to_dtype={0: dtype, -1: dtype}) + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: dtype}), + program_callables_info) else: # function calls for floating parameters. numpy_dtype = dtype.numpy_dtype @@ -257,10 +261,14 @@ class PyOpenCLCallable(ScalarCallable): dtype = dtype.copy(numpy_dtype=np.float32) if name == 'abs': name = 'fabs' - return self.copy(name_in_target=name, - arg_id_to_dtype={0: dtype, -1: dtype}) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}), + program_callables_info) - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) def pyopencl_function_scoper(target, identifier): diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 092cef887..3c0caa9e5 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -36,7 +36,7 @@ from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import (get_kw_pos_association, - register_pymbolic_calls_to_knl_callables) + change_names_of_pymbolic_calls) __doc__ = """ @@ -453,9 +453,7 @@ def _inline_call_instruction(kernel, callee_knl, instruction): raise NotImplementedError("Unknown type of instruction %s." % type( insn)) - from loopy.kernel.function_interface import ( - register_pymbolic_calls_to_knl_callables) - kernel = register_pymbolic_calls_to_knl_callables(kernel, + kernel = change_names_of_pymbolic_calls(kernel, callee_scoped_calls_dict) # }}} @@ -622,7 +620,7 @@ def _match_caller_callee_argument_dimension(caller_knl, callee_function_name): raise LoopyError("No CallableKernel with the name %s found in %s." 
% ( callee_function_name, caller_knl.name)) - return register_pymbolic_calls_to_knl_callables(caller_knl, + return change_names_of_pymbolic_calls(caller_knl, pymbolic_calls_to_new_callables) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 6225e4c11..30d7aa0a0 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -291,7 +291,7 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type if isinstance(expr.function, ResolvedFunction): - in_knl_callable = self.scoped_functions[expr.function.name] + in_knl_callable = self.program_callables_info[expr.function.name] # {{{ checking that there is no overwriting of types of in_knl_callable @@ -334,20 +334,15 @@ class TypeInferenceMapper(CombineMapper): # later use self.program_callables_info, new_function_id = ( self.program_callables_info.with_callable( - expr.function, - in_knl_callable.with_target(self.kernel.target))) + expr.function.function, + in_knl_callable)) + print(self.program_callables_info['sin']) if isinstance(expr, Call): - self.old_calls_to_new_calls = Call( - ResolvedFunction(new_function_id), - expr.parameters) + self.old_calls_to_new_calls[expr] = new_function_id else: assert isinstance(expr, CallWithKwargs) - self.old_calls_to_new_calls = CallWithKwargs( - ResolvedFunction(new_function_id), - expr.parameters, kw_parameters) - - self.old_calls_to_new_calls = Call + self.old_calls_to_new_calls[expr] = new_function_id new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype @@ -407,14 +402,10 @@ class TypeInferenceMapper(CombineMapper): expr.function, in_knl_callable)) if isinstance(expr, Call): - self.old_calls_to_new_calls = Call( - ResolvedFunction(new_function_id), - expr.parameters) + self.old_calls_to_new_calls[expr] = new_function_id else: assert isinstance(expr, CallWithKwargs) - self.old_calls_to_new_calls = CallWithKwargs( - ResolvedFunction(new_function_id), - expr.parameters, kw_parameters) + self.old_calls_to_new_calls = new_function_id # Returning the type. if return_tuple: @@ -608,7 +599,8 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): result = type_inf_mapper.combine(dtype_sets) return (result, type_inf_mapper.symbols_with_unknown_types, - type_inf_mapper.old_calls_to_new_calls) + type_inf_mapper.old_calls_to_new_calls, + type_inf_mapper.program_callables_info) # }}} @@ -733,7 +725,8 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, debug("inferring type for %s %s", type(item).__name__, item.name) - result, symbols_with_unavailable_types, new_old_calls_to_new_calls = ( + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, program_callables_info) = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) @@ -807,28 +800,29 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, # this has to be subsitutition from loopy.kernel.function_interface import ( - register_pymbolic_calls_to_knl_callables) - type_specialized_kernel = register_pymbolic_calls_to_knl_callables( - pre_type_specialized_knl, specialized_functions) + change_names_of_pymbolic_calls) + type_specialized_kernel = change_names_of_pymbolic_calls( + pre_type_specialized_knl, old_calls_to_new_calls) if expect_completion: # if completion is expected, then it is important that all the # callables are scoped. 
from loopy.check import check_functions_are_scoped check_functions_are_scoped(type_specialized_kernel) - return program_callables_info, type_specialized_kernel + return type_specialized_kernel, program_callables_info def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" - program_callables_info = program.progra_callables_info + program_callables_info = program.program_callables_info type_uninferred_knl_callable = ( program_callables_info[program.root_kernel_name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - program_callables_info = program.program_calllables_info.with_edit_mode() + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) root_kernel, program_callables_info = ( infer_unknown_types_for_a_single_kernel( type_uninferred_root_kernel, @@ -841,7 +835,7 @@ def infer_unknown_types(program, expect_completion=False): type_inferred_knl_callable) program_callables_info, renames_needed = ( - program_callables_info.with_exit_mode()) + program_callables_info.with_exit_edit_callables_mode()) return program.with_renamed_callables( program_callables_info, renames_needed) -- GitLab From 8ebcc22cfbd7b895c9d0b9584e77b5e9a9ca457f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 27 Jul 2018 14:47:07 -0500 Subject: [PATCH 246/916] Finalized the design of with_exit_edit_callables_mode --- loopy/program.py | 150 +++++++++++++++++++++++----------------- loopy/type_inference.py | 13 ++-- 2 files changed, 92 insertions(+), 71 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 75e00616c..c668c69df 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -28,7 +28,7 @@ import re from pytools import ImmutableRecord, memoize_method from pymbolic.primitives import Variable -from loopy.symbolic import RuleAwareIdentityMapper +from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) @@ -90,7 +90,6 @@ class FunctionResolver(RuleAwareIdentityMapper): return self.map_substitution(name, tag, expr.parameters, expn_state) def map_call_with_kwargs(self, expr, expn_state): - from loopy.symbolic import ResolvedFunction if not isinstance(expr.function, ResolvedFunction): @@ -156,6 +155,7 @@ class Program(ImmutableRecord): program_callables_info, target=None, function_resolvers=None): + assert isinstance(program_callables_info, ProgramCallablesInfo) # FIXME: check if all sanity checks have been covered? # FIXME: The comments over here may need some attention. @@ -193,12 +193,9 @@ class Program(ImmutableRecord): raise NotImplementedError("Unknown callable %s." 
% type(in_knl_callable).__name__) - program_callables_info, renames_needed = ( + program_callables_info = ( program_callables_info.with_exit_edit_callables_mode()) - # at this point no renames must be needed - assert not renames_needed - super(Program, self).__init__( root_kernel_name=root_kernel_name, program_callables_info=program_callables_info, @@ -317,6 +314,31 @@ def next_indexed_function_identifier(function): num=int(match.group('num'))+1) +class ResolvedFunctionRenamer(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, renaming_dict): + super(ResolvedFunctionRenamer, self).__init__( + rule_mapping_context) + self.renaming_dict = renaming_dict + + def map_resolved_functions(self, expr, expn_state): + if expr.name in self.renaming_dict: + return ResolvedFunction(self.renaming_dict[expr.name]) + else: + return super(ResolvedFunctionRenamer, self).rec(expr, expn_state) + + +def rename_resolved_functions_in_a_single_kernel(kernel, + renaming_dict): + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + resolved_function_renamer = ResolvedFunctionRenamer(rule_mapping_context, + renaming_dict) + return ( + rule_mapping_context.finish_kernel( + resolved_function_renamer.map_kernel(kernel))) + + # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): @@ -378,10 +400,9 @@ class ProgramCallablesInfo(ImmutableRecord): renames_needed_after_editing = self.renames_needed_after_editing.copy() num_times_hit_during_editing = self.num_times_hit_during_editing.copy() - num_times_callables_called = ( - self.num_times_callables_called.copy()) + num_times_callables_called = self.num_times_callables_called.copy() - if function.name in self.old_resolved_functions: + if not resolved_for_the_first_time: num_times_hit_during_editing[function.name] += 1 if in_kernel_callable in self.resolved_functions.values(): @@ -404,34 +425,21 @@ class ProgramCallablesInfo(ImmutableRecord): func_id) else: - # {{{ ingoring this for now - - if False and isinstance(function, (ArgExtOp, SegmentedOp)): - # FIXME: ignoring this casse for now - # FIXME: If a kernel has two flavors of ArgExtOp then they are - # overwritten and hence not supported.(for now). - updated_resolved_functions = self.scoped_functions.copy() - updated_resolved_functions[function] = in_kernel_callable - - return self.copy(updated_resolved_functions), function.copy() - # }}} - - # FIXME: deal with the history over here. + # FIXME: maybe deal with the history over here? # FIXME: once the code logic is running beautify this part. 
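When a fresh identifier is needed, the loop below keeps asking next_indexed_function_identifier for candidates until an unused one turns up; the scheme is simply an appended, incrementing counter, along the lines of (outputs shown are what the helper is intended to produce):

    next_indexed_function_identifier("sin")    # -> "sin_0"
    next_indexed_function_identifier("sin_0")  # -> "sin_1"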
# many "ifs" can be avoided unique_function_identifier = function.name - if function.name in self.old_resolved_functions: - if self.num_times_callables_called[function.name] > 1: - while unique_function_identifier in self.scoped_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) - - if not resolved_for_the_first_time: - num_times_callables_called[function.name] -= 1 - num_times_callables_called[unique_function_identifier] = 1 - else: - num_times_callables_called[unique_function_identifier] = 1 + if (resolved_for_the_first_time or + self.num_times_callables_called[function.name] > 1): + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + if not resolved_for_the_first_time: + num_times_callables_called[function.name] -= 1 + + num_times_callables_called[unique_function_identifier] = 1 updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( @@ -448,39 +456,40 @@ class ProgramCallablesInfo(ImmutableRecord): def with_exit_edit_callables_mode(self): assert self.is_being_edited - num_times_callables_called = self.num_times_callables_called.copy() - - for func_id in self.old_resolved_functions: - - if self.num_times_hit_during_editing[func_id] > 0 and ( - self.num_times_hit_during_editing[func_id] < - num_times_callables_called[func_id]): - unique_function_identifier = func_id - - while unique_function_identifier in self.scoped_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + num_times_callables_called = {} + resolved_functions = {} + + for func_id, in_knl_callable in self.resolved_functions.items(): + if isinstance(in_knl_callable, CallableKernel): + old_subkernel = in_knl_callable.subkernel + new_subkernel = rename_resolved_functions_in_a_single_kernel( + old_subkernel, self.renames_needed_after_editing) + in_knl_callable = ( + in_knl_callable.copy(subkernel=new_subkernel)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% + type(in_knl_callable).__name__) - (num_times_callables_called[func_id], - num_times_callables_called[unique_function_identifier]) = ( - self.num_times_hit_while_editing[func_id], - num_times_callables_called[func_id] - - self.num_times_being_hit_while_editing[func_id]) + if func_id in self.renames_needed_after_editing: + new_func_id = self.renames_needed_after_editing[func_id] + resolved_functions[new_func_id] = ( + in_knl_callable) + num_times_callables_called[new_func_id] = ( + self.num_times_callables_called[func_id]) - if self.num_times_hit_during_editing[func_id] > 0 and ( - self.num_times_hit_during_editing[func_id] > - num_times_callables_called[func_id]): - raise RuntimeError("Should not traverse more number of times than " - "it is called.") + else: + resolved_functions[func_id] = in_knl_callable + num_times_callables_called[func_id] = ( + self.num_times_callables_called[func_id]) - return ( - self.copy( - is_being_edited=False, - num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing={}, - renames_needed_after_editing={}), - self.renames_needed_after_editing) + return self.copy( + is_being_edited=False, + resolved_functions=resolved_functions, + num_times_callables_called=num_times_callables_called, + num_times_hit_during_editing={}, + renames_needed_after_editing={}) def __getitem__(self, item): return self.resolved_functions[item] @@ -506,4 +515,17 @@ def make_program_from_kernel(kernel): return program +# {{{ ingoring this for now + +# if False and isinstance(function, (ArgExtOp, SegmentedOp)): +# FIXME: ignoring this casse for now +# FIXME: If a kernel has two flavors of ArgExtOp then they are +# overwritten and hence not supported.(for now). +# updated_resolved_functions = self.scoped_functions.copy() +# updated_resolved_functions[function] = in_kernel_callable +# return self.copy(updated_resolved_functions), function.copy() + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 30d7aa0a0..cf63bf288 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -336,7 +336,6 @@ class TypeInferenceMapper(CombineMapper): self.program_callables_info.with_callable( expr.function.function, in_knl_callable)) - print(self.program_callables_info['sin']) if isinstance(expr, Call): self.old_calls_to_new_calls[expr] = new_function_id @@ -831,14 +830,14 @@ def infer_unknown_types(program, expect_completion=False): type_inferred_knl_callable = type_uninferred_knl_callable.copy( subkernel=root_kernel) - program_callables_info.with_callable(program.root_kernel_name, - type_inferred_knl_callable) + program_callables_info, _ = ( + program_callables_info.with_callable( + program.root_kernel_name, + type_inferred_knl_callable)) - program_callables_info, renames_needed = ( + program_callables_info = ( program_callables_info.with_exit_edit_callables_mode()) - - return program.with_renamed_callables( - program_callables_info, renames_needed) + return program.copy(program_callables_info=program_callables_info) # }}} -- GitLab From 1deaaed4494ece88b6b9164d48bfd8d7adf9feec Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 27 Jul 2018 15:33:46 -0500 Subject: [PATCH 247/916] Still in process of realizing should there be a kernel or should there be a program :( --- loopy/kernel/__init__.py | 31 +++++++++++++++++++++++++++++ loopy/program.py | 32 +----------------------------- loopy/target/execution.py | 14 ++++++------- loopy/target/pyopencl_execution.py | 2 +- 4 files changed, 40 insertions(+), 39 
deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 374b88a38..fba06720c 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1394,6 +1394,37 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} + + # {{{ implementation arguments + + @property + @memoize_method + def impl_arg_to_arg(self): + from loopy.kernel.array import ArrayBase + + result = {} + + for arg in self.args: + if not isinstance(arg, ArrayBase): + result[arg.name] = arg + continue + + if arg.shape is None or arg.dim_tags is None: + result[arg.name] = arg + continue + + subscripts_and_names = arg.subscripts_and_names() + if subscripts_and_names is None: + result[arg.name] = arg + continue + + for index, sub_arg_name in subscripts_and_names: + result[sub_arg_name] = arg + + return result + + # }}} + # {{{ direct execution def __call__(self, *args, **kwargs): diff --git a/loopy/program.py b/loopy/program.py index c668c69df..06c87f241 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -25,7 +25,7 @@ THE SOFTWARE. import six import re -from pytools import ImmutableRecord, memoize_method +from pytools import ImmutableRecord from pymbolic.primitives import Variable from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction @@ -229,36 +229,6 @@ class Program(ImmutableRecord): def args(self): return self.root_kernel.args[:] - # {{{ implementation arguments - - @property - @memoize_method - def impl_arg_to_arg(self): - from loopy.kernel.array import ArrayBase - - result = {} - - for arg in self.args: - if not isinstance(arg, ArrayBase): - result[arg.name] = arg - continue - - if arg.shape is None or arg.dim_tags is None: - result[arg.name] = arg - continue - - subscripts_and_names = arg.subscripts_and_names() - if subscripts_and_names is None: - result[arg.name] = arg - continue - - for index, sub_arg_name in subscripts_and_names: - result[sub_arg_name] = arg - - return result - - # }}} - def __call__(self, *args, **kwargs): key = self.target.get_kernel_executor_cache_key(*args, **kwargs) try: diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 8f0f8edda..55295045f 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -713,21 +713,21 @@ class KernelExecutorBase(object): .. automethod:: __call__ """ - def __init__(self, kernel): + def __init__(self, program): """ :arg kernel: a loopy.LoopKernel """ - self.kernel = kernel + self.program = program - self.packing_controller = SeparateArrayPackingController(kernel) + self.packing_controller = SeparateArrayPackingController(program) - self.output_names = tuple(arg.name for arg in self.kernel.args + self.output_names = tuple(arg.name for arg in self.program.args if arg.is_output_only) self.has_runtime_typed_args = any( arg.dtype is None - for arg in kernel.args) + for arg in program.args) def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes @@ -769,8 +769,8 @@ class KernelExecutorBase(object): from loopy.preprocess import prepare_for_caching # prepare_for_caching() gets run by preprocess, but the kernel at this # stage is not guaranteed to be preprocessed. 
- cacheable_kernel = prepare_for_caching(self.kernel) - cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + cacheable_program = prepare_for_caching(self.program) + cache_key = (type(self).__name__, cacheable_program, arg_to_dtype_set) if CACHING_ENABLED: try: diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 73e722af5..a1ccc91ff 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -267,7 +267,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): from loopy.target.pyopencl import PyOpenCLTarget if isinstance(program.target, PyOpenCLTarget): - self.kernel = program.copy(target=PyOpenCLTarget(context.devices[0])) + self.program = program.copy(target=PyOpenCLTarget(context.devices[0])) def get_invoker_uncached(self, kernel, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() -- GitLab From f1cecff6476357140f6e7a896eb4b0f324e89842 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 27 Jul 2018 17:35:22 -0500 Subject: [PATCH 248/916] Preprocessing works(for the most.) --- loopy/kernel/__init__.py | 31 ------ loopy/kernel/function_interface.py | 2 +- loopy/preprocess.py | 155 +++++++++++++++++------------ loopy/program.py | 32 +++++- loopy/target/execution.py | 29 +++--- loopy/target/pyopencl_execution.py | 20 ++-- 6 files changed, 149 insertions(+), 120 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index fba06720c..374b88a38 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1394,37 +1394,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - - # {{{ implementation arguments - - @property - @memoize_method - def impl_arg_to_arg(self): - from loopy.kernel.array import ArrayBase - - result = {} - - for arg in self.args: - if not isinstance(arg, ArrayBase): - result[arg.name] = arg - continue - - if arg.shape is None or arg.dim_tags is None: - result[arg.name] = arg - continue - - subscripts_and_names = arg.subscripts_and_names() - if subscripts_and_names is None: - result[arg.name] = arg - continue - - for index, sub_arg_name in subscripts_and_names: - result[sub_arg_name] = arg - - return result - - # }}} - # {{{ direct execution def __call__(self, *args, **kwargs): diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index aac793efb..2aa14b3d3 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -220,7 +220,7 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for positional arguments, names for keyword diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d763833d0..cece73f24 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -37,8 +37,8 @@ from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now -from loopy.type_inference import infer_unknown_types -from loopy.symbolic import CombineMapper +from loopy.type_inference import infer_unknown_types_for_a_single_kernel +from loopy.symbolic import CombineMapper, RuleAwareIdentityMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -2134,7 +2134,7 @@ def 
check_atomic_loads(kernel): # {{{ arg_descr_inference -class ArgDescrInferenceMapper(CombineMapper): +class ArgDescrInferenceMapper(RuleAwareIdentityMapper): """ Returns a set of instances of :class:`tuple` (expr, in_kernel_callable). The mapped `in_kernel_callable` of the @@ -2142,21 +2142,21 @@ class ArgDescrInferenceMapper(CombineMapper): arguments. """ - def __init__(self, kernel): - self.kernel = kernel - - def combine(self, values): - import operator - return reduce(operator.or_, values, frozenset()) + def __init__(self, rule_mapping_context, caller_kernel, + program_callables_info): + super(ArgDescrInferenceMapper, self).__init__( + rule_mapping_context) + self.caller_kernel = caller_kernel + self.program_callables_info = program_callables_info - def map_call(self, expr, **kwargs): + def map_call(self, expr, expn_state, **kwargs): from pymbolic.primitives import Call, CallWithKwargs from loopy.kernel.function_interface import ValueArgDescriptor from loopy.symbolic import ResolvedFunction, SubArrayRef - # ignore if the call is not to a ResolvedFunction if not isinstance(expr.function, ResolvedFunction): - return self.combine((self.rec(child) for child in expr.parameters)) + # ignore if the call is not to a ResolvedFunction + return super(ArgDescrInferenceMapper, self).rec(expr) if isinstance(expr, Call): kw_parameters = {} @@ -2178,7 +2178,7 @@ class ArgDescrInferenceMapper(CombineMapper): for i, par in enumerate(assignees): if isinstance(par, SubArrayRef): assignee_id_to_descr[-i-1] = ( - par.get_array_arg_descriptor(self.kernel)) + par.get_array_arg_descriptor(self.caller_kernel)) else: assignee_id_to_descr[-i-1] = ValueArgDescriptor() @@ -2187,63 +2187,74 @@ class ArgDescrInferenceMapper(CombineMapper): combined_arg_id_to_descr.update(assignee_id_to_descr) # specializing the function according to the parameter description - new_scoped_function = ( - self.kernel.scoped_functions[expr.function.name].with_descrs( + new_in_knl_callable = ( + self.program_callables_info[expr.function.name].with_descrs( combined_arg_id_to_descr)) + self.program_callables_info, new_func_id = ( + self.program_callables_info.with_callable( + expr.function.function, + new_in_knl_callable)) - # collecting the descriptors for args, kwargs, assignees - return ( - frozenset(((expr, new_scoped_function), )) | - self.combine((self.rec(child) for child in - expr.parameters+tuple(kw_parameters)))) + if isinstance(expr, Call): + return Call( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + else: + assert isinstance(expr, CallWithKwargs) + return CallWithKwargs( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) map_call_with_kwargs = map_call - def map_constant(self, expr, **kwargs): - return frozenset() + def map_kernel(self, kernel): + + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important in + # determining the arg_id_to_dtype + new_insns.append(insn.with_transformed_expressions( + self, kernel, insn)) + elif isinstance(insn, MultiAssignmentBase): + new_insns.append(insn.with_transformed_expressions( + self, kernel, insn)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) - 
map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant + return kernel.copy(instructions=new_insns) -def infer_arg_descr(kernel): +def infer_arg_descr(kernel, program_callables_info): """ Returns a copy of *kernel* with the argument shapes and strides matching for scoped functions in the *kernel*. Refer :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. """ + # FIXME: update this docs, once the design is finalized - arg_description_modifier = ArgDescrInferenceMapper(kernel) - pymbolic_calls_to_functions = set() + from loopy.symbolic import SubstitutionRuleMappingContext - for insn in kernel.instructions: + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) - if isinstance(insn, CallInstruction): - # In call instructions the assignees play an important in - # determining the arg_id_to_dtype - pymbolic_calls_to_functions.update( - arg_description_modifier(insn.expression, - assignees=insn.assignees)) - elif isinstance(insn, MultiAssignmentBase): - pymbolic_calls_to_functions.update(arg_description_modifier( - insn.expression)) - elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): - pass - else: - raise NotImplementedError("arg_descr_inference for %s instruction" % - type(insn)) + arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, + kernel, program_callables_info) - # making it the set of tuples a dict - pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions) + descr_inferred_kernel = rule_mapping_context.finish_kernel( + arg_descr_inf_mapper.map_kernel(kernel)) - # Now do the similar treatment as done for type inference. - from loopy.kernel.function_interface import ( - register_pymbolic_calls_to_knl_callables) - - return register_pymbolic_calls_to_knl_callables(kernel, - pymbolic_calls_to_functions) + return descr_inferred_kernel, arg_descr_inf_mapper.program_callables_info # }}} @@ -2443,12 +2454,35 @@ preprocess_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def preprocess_kernel(kernel, device=None): +def preprocess_program(program, device=None): + if device is not None: from warnings import warn warn("passing 'device' to preprocess_kernel() is deprecated", DeprecationWarning, stacklevel=2) + root_kernel_callable = program.program_callables_info[program.name] + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + root_kernel, program_callables_info = preprocess_kernel( + program.root_kernel, program_callables_info, device) + processed_root_knl_callable = root_kernel_callable.copy(subkernel=root_kernel) + program_callables_info, _ = ( + program_callables_info.with_callable( + program.root_kernel_name, + processed_root_knl_callable)) + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + + # FIXME: some version of the below funtion run should occur + # FIXME:type specialize functions that were missed during the type inference. + # program_callables_info = make_callables_ready_for_codegen( + # program_callables_info) + + return program.copy(program_callables_info=program_callables_info) + + +def preprocess_kernel(kernel, program_callables_info, device=None): from loopy.kernel import KernelState if kernel.state >= KernelState.PREPROCESSED: return kernel @@ -2491,7 +2525,8 @@ def preprocess_kernel(kernel, device=None): # Type inference and reduction iname uniqueness don't handle substitutions. 
# Get them out of the way. - kernel = infer_unknown_types(kernel, expect_completion=False) + kernel, program_callables_info = infer_unknown_types_for_a_single_kernel( + kernel, program_callables_info, expect_completion=False) check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) @@ -2519,13 +2554,7 @@ def preprocess_kernel(kernel, device=None): # inferring the shape and dim_tags of the arguments involved in a function # call. - kernel = infer_arg_descr(kernel) - - # type specialize functions that were missed during the type inference. - kernel = make_functions_ready_for_codegen(kernel) - - # tuning the functions in the kernel to align with the grid sizes. - kernel = infer_hw_axes_sizes(kernel) + kernel, program_callables_info = infer_arg_descr(kernel, program_callables_info) # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) @@ -2552,13 +2581,13 @@ def preprocess_kernel(kernel, device=None): if CACHING_ENABLED: input_kernel = prepare_for_caching(input_kernel) - kernel = prepare_for_caching(kernel) + kernel = prepare_single_kernel_for_caching(kernel) # }}} if CACHING_ENABLED: preprocess_cache.store_if_not_present(input_kernel, kernel) - return kernel + return kernel, program_callables_info # vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py index 06c87f241..f2ea40506 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -25,7 +25,7 @@ THE SOFTWARE. import six import re -from pytools import ImmutableRecord +from pytools import ImmutableRecord, memoize_method from pymbolic.primitives import Variable from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction @@ -210,6 +210,36 @@ class Program(ImmutableRecord): # "root_kernel_name" return self.root_kernel_name + # {{{ implementation arguments + + @property + @memoize_method + def impl_arg_to_arg(self): + from loopy.kernel.array import ArrayBase + + result = {} + + for arg in self.args: + if not isinstance(arg, ArrayBase): + result[arg.name] = arg + continue + + if arg.shape is None or arg.dim_tags is None: + result[arg.name] = arg + continue + + subscripts_and_names = arg.subscripts_and_names() + if subscripts_and_names is None: + result[arg.name] = arg + continue + + for index, sub_arg_name in subscripts_and_names: + result[sub_arg_name] = arg + + return result + + # }}} + @property def root_kernel(self): return self.program_callables_info[self.root_kernel_name].subkernel diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 55295045f..423246842 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -729,16 +729,16 @@ class KernelExecutorBase(object): arg.dtype is None for arg in program.args) - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + def get_typed_and_scheduled_program_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes - kernel = self.kernel + program = self.program if arg_to_dtype_set: var_to_dtype = {} for var, dtype in arg_to_dtype_set: try: - dest_name = kernel.impl_arg_to_arg[var].name + dest_name = program.impl_arg_to_arg[var].name except KeyError: dest_name = var @@ -749,21 +749,22 @@ class KernelExecutorBase(object): "no known variable/argument with that name" % var) - kernel = add_dtypes(kernel, var_to_dtype) + program = add_dtypes(program, var_to_dtype) from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + program = infer_unknown_types(program, expect_completion=True) - if kernel.schedule is 
None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) + if program.root_kernel.schedule is None: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + program = program.with_root_kernel( + get_one_scheduled_kernel(program.root_kernel)) - return kernel + return program - def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + def get_typed_and_scheduled_program(self, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching @@ -778,9 +779,9 @@ class KernelExecutorBase(object): except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + logger.debug("%s: typed-and-scheduled cache miss" % self.program.name) - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_scheduled_program_uncached(arg_to_dtype_set) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -791,7 +792,7 @@ class KernelExecutorBase(object): if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.kernel.impl_arg_to_arg + impl_arg_to_arg = self.program.impl_arg_to_arg arg_to_dtype = {} for arg_name, val in six.iteritems(kwargs): arg = impl_arg_to_arg.get(arg_name, None) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index a1ccc91ff..8d577bb01 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -274,16 +274,16 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() - if self.kernel.options.write_cl: + if self.program.options.write_cl: output = dev_code if self.kernel.options.highlight_cl: output = get_highlighted_code(output) @@ -302,17 +302,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): cl_program = ( cl.Program(self.context, dev_code) - .build(options=kernel.options.cl_build_options)) + .build(options=program.options.cl_build_options)) cl_kernels = _Kernels() for dp in codegen_result.device_programs: setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) return _KernelInfo( - kernel=kernel, + program=program, cl_kernels=cl_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -347,10 +347,10 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, + return program_info.invoker( + program_info.cl_kernels, queue, allocator, wait_for, out_host, **kwargs) # }}} -- GitLab From c3c9d16ac5f14a8ffedf0419ead8bd33ff6eab18 Mon Sep 17 
00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 12:54:02 -0500 Subject: [PATCH 249/916] work for the hw axes iname tags --- loopy/preprocess.py | 106 ++++++++++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 34 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index cece73f24..9b9c555c8 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2261,53 +2261,78 @@ def infer_arg_descr(kernel, program_callables_info): # {{{ -class HWAxesInferenceMapper(CombineMapper): +class HWAxesInferenceMapper(RuleAwareIdentityMapper): """ Returns a set of instances of :class:`tuple` (expr, in_kernel_callable). The mapped `in_kernel_callable` of the :class:`InKernelCallable` are specialized for the the grid sizes of :attr:`kernel`. """ + # FIXME: docs after the design is final. - def __init__(self, kernel): - self.kernel = kernel - self.local_size, self.global_size = kernel.get_grid_size_upper_bounds() - - def combine(self, values): - import operator - return reduce(operator.or_, values, frozenset()) + def __init__(self, rule_mapping_context, caller_kernel, + program_callables_info): + super(ArgDescrInferenceMapper, self).__init__( + rule_mapping_context) + self.caller_kernel = caller_kernel + self.program_callables_info = program_callables_info + self.local_size, self.global_size = ( + caller_kernel.get_grid_size_upper_bounds()) - def map_call(self, expr, **kwargs): + def map_call(self, expr, expn_state): from pymbolic.primitives import CallWithKwargs, Call - if isinstance(expr, Call): - kw_parameters = {} - else: - assert isinstance(expr, CallWithKwargs) - kw_parameters = expr.kw_parameters - from loopy.symbolic import ResolvedFunction - # ignoring if the call is not to a ResolvedFunction + if not isinstance(expr.function, ResolvedFunction): - return self.combine((self.rec(child) for child in - expr.parameters+tuple(kw_parameters.values()))) + # ignore if the call is not to a ResolvedFunction + return super(ArgDescrInferenceMapper, self).rec(expr) - new_scoped_function = ( - self.kernel.scoped_functions[expr.function.name].with_hw_axes_sizes( + new_in_knl_callable = ( + self.program_callables_info[expr.function.name].with_hw_axes_sizes( self.local_size, self.global_size)) + self.program_callables_info, new_func_id = ( + self.program_callables_info.with_callable( + expr.function.function, + new_in_knl_callable)) - return (frozenset(((expr, new_scoped_function), )) | - self.combine((self.rec(child) for child in - expr.parameters+tuple(kw_parameters.values())))) + if isinstance(expr, Call): + return Call( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + else: + assert isinstance(expr, CallWithKwargs) + return CallWithKwargs( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) map_call_with_kwargs = map_call - def map_constant(self, expr, **kwargs): - return frozenset() + def map_kernel(self, kernel): - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important in + # determining the arg_id_to_dtype + new_insns.append(insn.with_transformed_expressions( + self, kernel, insn)) + elif isinstance(insn, MultiAssignmentBase): + 
new_insns.append(insn.with_transformed_expressions( + self, kernel, insn)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + return kernel.copy(instructions=new_insns) def infer_hw_axes_sizes(kernel): @@ -2474,12 +2499,25 @@ def preprocess_program(program, device=None): program_callables_info = ( program_callables_info.with_exit_edit_callables_mode()) - # FIXME: some version of the below funtion run should occur - # FIXME:type specialize functions that were missed during the type inference. - # program_callables_info = make_callables_ready_for_codegen( - # program_callables_info) + semi_preprocessed_program = ( + program.copy(program_callables_info=program_callables_info)) + + # FIXME: need to make function ready for codegen here + + # overriding the hw axes sizes of all the callable kernel. + local_size, global_size = semi_preprocessed_program.get_grid_size_upper_bounds() + + resolved_function_with_hw_axes_sizes_set = {} + + for func_id, in_knl_callable in semi_preprocessed_program.program_callables_info: + resolved_function_with_hw_axes_sizes_set[func_id] = ( + in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + + new_program_callables_info = ( + semi_preprocessed_program.program_callables_info.copy( + resolved_functions=resolved_function_with_hw_axes_sizes_set)) - return program.copy(program_callables_info=program_callables_info) + return program.copy(program_callables_info=new_program_callables_info) def preprocess_kernel(kernel, program_callables_info, device=None): -- GitLab From 1e2b3f6f048b99d39cd0cc7a19e6d3c71bc5791e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 16:33:13 -0500 Subject: [PATCH 250/916] bajillions of renaming frorom kernel->program --- loopy/check.py | 18 +++-- loopy/codegen/__init__.py | 101 ++++++++---------------- loopy/codegen/control.py | 3 +- loopy/kernel/__init__.py | 33 +++++--- loopy/kernel/tools.py | 5 +- loopy/preprocess.py | 114 +-------------------------- loopy/program.py | 35 +++++++- loopy/schedule/__init__.py | 19 +++-- loopy/target/c/codegen/expression.py | 18 +++-- loopy/target/execution.py | 59 +++++++------- loopy/target/opencl.py | 3 +- loopy/target/pyopencl_execution.py | 36 +++++---- loopy/target/python.py | 3 +- 13 files changed, 179 insertions(+), 268 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 586b94351..53275d2a2 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -749,7 +749,8 @@ def pre_schedule_checks(kernel): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, + sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at, gather_schedule_block) @@ -764,7 +765,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): assert isinstance(kernel.schedule[sched_index], CallKernel) _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + program_callables_info) group_axes = set(ax for ax, length in enumerate(group_size)) local_axes = set(ax for ax, length in enumerate(local_size)) @@ -781,7 +783,8 @@ def 
_check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): while i < loop_end_i: sched_item = kernel.schedule[i] if isinstance(sched_item, CallKernel): - i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i) + i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, + program_callables_info, i) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -832,9 +835,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): return past_end_i -def check_for_unused_hw_axes_in_insns(kernel): +def check_for_unused_hw_axes_in_insns(kernel, program_callables_info): if kernel.schedule: - _check_for_unused_hw_axes_in_kernel_chunk(kernel) + _check_for_unused_hw_axes_in_kernel_chunk(kernel, + program_callables_info) # }}} @@ -988,11 +992,11 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel): +def pre_codegen_checks(kernel, program_callables_info): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel) + check_for_unused_hw_axes_in_insns(kernel, program_callables_info) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 00e95b17d..d3c6ebe87 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -197,12 +197,15 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. attribute:: program_callables_info """ def __init__(self, kernel, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + program_callables_info, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, @@ -216,6 +219,7 @@ class CodeGenerationState(object): self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex + self.program_callables_info = program_callables_info self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -263,6 +267,7 @@ class CodeGenerationState(object): seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + program_callables_info=self.program_callables_info, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -410,16 +415,12 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_for_a_single_kernel(kernel, is_root_kernel=True): +def generate_code_for_a_single_kernel(kernel, program_callables_info): """ :returns: a :class:`CodeGenerationResult` """ from loopy.kernel import KernelState - if kernel.state == KernelState.INITIAL: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel kernel = get_one_scheduled_kernel(kernel) @@ -443,11 +444,8 @@ def generate_code_for_a_single_kernel(kernel, is_root_kernel=True): # }}} - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) + 
pre_codegen_checks(kernel, program_callables_info) logger.info("%s: generate code: start" % kernel.name) @@ -506,54 +504,15 @@ def generate_code_for_a_single_kernel(kernel, is_root_kernel=True): kernel.target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule)) + schedule_index_end=len(kernel.schedule), + program_callables_info=program_callables_info) from loopy.codegen.result import generate_host_or_device_program - # {{{ collect ASTs of auxiliary kernels - - auxiliary_dev_progs = [] - - # scan through all the call instructions if there is any instance of - # CallableKernel, whose code is to be generated. - from loopy.kernel.function_interface import CallableKernel - - for insn in kernel.instructions: - if isinstance(insn, CallInstruction): - in_knl_callable = kernel.scoped_functions[ - insn.expression.function.name] - if isinstance(in_knl_callable, CallableKernel): - auxiliary_dev_prog = generate_code_v2( - in_knl_callable.subkernel.copy( - name=in_knl_callable.name_in_target, - target=kernel.target) - ).device_programs[0].ast - auxiliary_dev_progs.append(auxiliary_dev_prog) - - elif isinstance(insn, (Assignment, NoOpInstruction, Assignment, - BarrierInstruction, CInstruction, - _DataObliviousInstruction)): - pass - - else: - raise NotImplementedError("Unknown type of instruction %s" % ( - type(insn).__name__)) - codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) - # Modify the first device program to add the auxiliary kernels - # as functions - new_dev_prog = codegen_result.device_programs[0] - for auxiliary_dev_prog in auxiliary_dev_progs: - new_dev_prog = new_dev_prog.copy( - ast=Collection([auxiliary_dev_prog, new_dev_prog.ast])) - new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:] - codegen_result = codegen_result.copy(device_programs=new_device_programs) - - # }}} - device_code_str = codegen_result.device_code() from loopy.check import check_implemented_domains @@ -583,24 +542,6 @@ def generate_code_for_a_single_kernel(kernel, is_root_kernel=True): for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) - # {{{ collect preambles from all the in kernel callables. 
- - in_knl_callable_collector = InKernelCallablesCollector(kernel) - - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - for in_knl_callable in in_knl_callable_collector(insn.expression): - preambles.extend(in_knl_callable.generate_preambles(kernel.target)) - - elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): - pass - else: - raise NotImplementedError( - "Unknown instruction type '%s'" - % type(insn).__name__) - - # }}} - codegen_result = codegen_result.copy(device_preambles=preambles) # }}} @@ -620,7 +561,29 @@ def generate_code_for_a_single_kernel(kernel, is_root_kernel=True): def generate_code_v2(program): - pass + + from loopy.kernel import KernelState + if program.root_kernel.state == KernelState.INITIAL: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) + + # collect preambles + for callable_knl in program.program_callables_info.values(): + pass + + # collect func decls + for callable_knl in program.program_callables_info.values(): + pass + + # collect func defs + for callable_knl in program.program_callables_info.values(): + pass + + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + + return generate_code_for_a_single_kernel(program.root_kernel, + program.program_callables_info) def generate_code(kernel, device=None): diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 45e2a18c4..90bdbda31 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -115,7 +115,8 @@ def generate_code_for_sched_index(codegen_state, sched_index): new_codegen_state, sched_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + codegen_state.program_callables_info) return merge_codegen_results(codegen_state, [ codegen_result, diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 374b88a38..ce7bdac42 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -254,6 +254,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, + is_called_from_host=True, + overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): """ @@ -366,6 +368,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, + is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) @@ -1033,8 +1036,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) - @memoize_method - def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, + program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. 
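
The hunk above drops the @memoize_method decorator and threads program_callables_info into get_grid_sizes_for_insn_ids_as_dicts: a kernel can no longer work out its grid requirements in isolation, since a call instruction may refer to a callee kernel that now lives only in the program-wide callables table (and memoizing a result that depends on that external table would be unsafe, which is presumably why the decorator goes away). The standalone sketch below is not loopy API; the callables-table layout and the grid_size_dicts helper are illustrative assumptions. It only shows the aggregation pattern used here: walk the caller's calls, resolve each callee through the callables table, and merge the callee's per-axis size requirements into the caller's dict.

# Illustrative stand-in for the aggregation in this hunk; this is not
# loopy's real data model.  A "kernel" is reduced to a dict holding its
# own per-axis size requirements and the names of the kernels it calls.

def grid_size_dicts(kernel, callables):
    """Return this kernel's per-axis sizes merged with those of every
    callee reachable through *callables* (the program-wide table)."""
    sizes = dict(kernel["own_sizes"])
    for callee_name in kernel["calls"]:
        callee = callables[callee_name]      # needs the program-level table
        sizes.update(grid_size_dicts(callee, callables))
    return sizes


if __name__ == "__main__":
    callables = {
        "helper": {"own_sizes": {1: 16}, "calls": []},
    }
    root = {"own_sizes": {0: 128}, "calls": ["helper"]}
    print(grid_size_dicts(root, callables))   # {0: 128, 1: 16}
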
@@ -1047,8 +1050,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ collecting the callee kernels in insn_ids - from loopy.kernel.tools import get_callee_kernels - callee_kernels = get_callee_kernels(self, insn_ids) + from loopy.kernel.tools import get_direct_callee_kernels + callee_kernels = get_direct_callee_kernels(self, + program_callables_info, insn_ids) # }}} @@ -1068,7 +1072,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): # updating the grid sizes from the callee_kernels. for callee_kernel in callee_kernels: gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts( - frozenset(insn.id for insn in callee_kernel.instructions)) + frozenset(insn.id for insn in callee_kernel.instructions), + program_callables_info, ignore_auto) global_sizes.update(gsize) local_sizes.update(lsize) @@ -1115,8 +1120,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return global_sizes, local_sizes - @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, + ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. @@ -1135,7 +1140,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "information to compute grid sizes.") global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( - insn_ids, ignore_auto=ignore_auto) + insn_ids, program_callables_info, ignore_auto=ignore_auto) def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1166,7 +1171,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, + program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. @@ -1177,7 +1183,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, ignore_auto) + insn_ids, program_callables_info, ignore_auto) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1185,7 +1191,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, ignore_auto=False): + def get_grid_size_upper_bounds(self, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1193,9 +1199,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), + program_callables_info, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + def get_grid_size_upper_bounds_as_exprs(self, program_callables_info, + ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. 
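
Callers outside loopy's internals are not expected to pass program_callables_info by hand: the Program wrapper introduced in this series (its get_grid_size_upper_bounds* convenience methods appear in the loopy/program.py hunks later in this patch) holds its own program_callables_info and forwards it to these LoopKernel methods. The sketch below is a hedged usage illustration only: the kernel is a throwaway example, the entry points (make_kernel, split_iname, make_program_from_kernel, and the Program-level grid-size query) are the names used on this branch, and there is no guarantee that this exact sequence runs at this intermediate commit.

# Hedged usage sketch; assumes a checkout of this development branch and
# that make_kernel still returns a plain LoopKernel at this point.
import loopy as lp
from loopy.program import make_program_from_kernel

knl = lp.make_kernel(
        "{[i]: 0 <= i < n}",
        "out[i] = 2*a[i]")
knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")

prog = make_program_from_kernel(knl)

# Program supplies its own program_callables_info when delegating to
# LoopKernel.get_grid_size_upper_bounds_as_exprs, so no extra argument
# is needed here.
gsize, lsize = prog.get_grid_size_upper_bounds_as_exprs()
print(gsize, lsize)
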
@@ -1204,6 +1212,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), + program_callables_info, ignore_auto=ignore_auto) # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 5492b091c..3395e876f 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1860,7 +1860,7 @@ def find_aliasing_equivalence_classes(kernel): # {{{ callee kernel tools -def get_callee_kernels(kernel, insn_ids=None): +def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): """ Returns an instance of :class:`frozenset` of all the callee kernels called in instructions in the *kernel* whose IDs are given in *insn_ids*. @@ -1870,6 +1870,7 @@ def get_callee_kernels(kernel, insn_ids=None): If *insn_ids* is *None* returns all the callee kernels called by *kernel*. """ + #FIXME: explain what "direct" means if insn_ids is None: insn_ids = frozenset(insn.id for insn in kernel.instructions) @@ -1886,7 +1887,7 @@ def get_callee_kernels(kernel, insn_ids=None): MultiAssignmentBase, CInstruction, _DataObliviousInstruction) if isinstance(insn, CallInstruction): if insn.expression.function.name in kernel.scoped_functions: - in_knl_callable = kernel.scoped_functions[ + in_knl_callable = program_callables_info[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): return in_knl_callable.subkernel diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 9b9c555c8..fe3e79a20 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -27,8 +27,6 @@ import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) -from functools import reduce - import islpy as isl from pytools.persistent_dict import WriteOncePersistentDict @@ -2259,114 +2257,6 @@ def infer_arg_descr(kernel, program_callables_info): # }}} -# {{{ - -class HWAxesInferenceMapper(RuleAwareIdentityMapper): - """ - Returns a set of instances of :class:`tuple` (expr, - in_kernel_callable). The mapped `in_kernel_callable` of the - :class:`InKernelCallable` are specialized for the the grid sizes of - :attr:`kernel`. - """ - # FIXME: docs after the design is final. 
- - def __init__(self, rule_mapping_context, caller_kernel, - program_callables_info): - super(ArgDescrInferenceMapper, self).__init__( - rule_mapping_context) - self.caller_kernel = caller_kernel - self.program_callables_info = program_callables_info - self.local_size, self.global_size = ( - caller_kernel.get_grid_size_upper_bounds()) - - def map_call(self, expr, expn_state): - from pymbolic.primitives import CallWithKwargs, Call - from loopy.symbolic import ResolvedFunction - - if not isinstance(expr.function, ResolvedFunction): - # ignore if the call is not to a ResolvedFunction - return super(ArgDescrInferenceMapper, self).rec(expr) - - new_in_knl_callable = ( - self.program_callables_info[expr.function.name].with_hw_axes_sizes( - self.local_size, self.global_size)) - self.program_callables_info, new_func_id = ( - self.program_callables_info.with_callable( - expr.function.function, - new_in_knl_callable)) - - if isinstance(expr, Call): - return Call( - ResolvedFunction(new_func_id), - tuple(self.rec(child, expn_state) - for child in expr.parameters)) - else: - assert isinstance(expr, CallWithKwargs) - return CallWithKwargs( - ResolvedFunction(new_func_id), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - - map_call_with_kwargs = map_call - - def map_kernel(self, kernel): - - new_insns = [] - - for insn in kernel.instructions: - if isinstance(insn, CallInstruction): - # In call instructions the assignees play an important in - # determining the arg_id_to_dtype - new_insns.append(insn.with_transformed_expressions( - self, kernel, insn)) - elif isinstance(insn, MultiAssignmentBase): - new_insns.append(insn.with_transformed_expressions( - self, kernel, insn)) - elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): - new_insns.append(insn) - else: - raise NotImplementedError("arg_descr_inference for %s instruction" % - type(insn)) - - return kernel.copy(instructions=new_insns) - - -def infer_hw_axes_sizes(kernel): - """ - Returns a copy of *kernel* with the hardware axes matching for - scoped functions in the *kernel*. Refer - :meth:`loopy.kernel.function_interface.InKernelCallable.with_hw_axes_sizes`. - """ - hw_axes_modifier = HWAxesInferenceMapper(kernel) - pymbolic_calls_to_functions = set() - - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - pymbolic_calls_to_functions.update(hw_axes_modifier( - insn.expression)) - elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): - pass - else: - raise NotImplementedError("unknown type of instruction %s." % - type(insn)) - - # making it the set of tuples a dict - pymbolic_calls_to_functions = dict(pymbolic_calls_to_functions) - - # Now do the similar treatment as done for type inference. - from loopy.kernel.function_interface import ( - register_pymbolic_calls_to_knl_callables) - - return register_pymbolic_calls_to_knl_callables(kernel, - pymbolic_calls_to_functions) - -# }}} - - # {{{ catching functions that are not ready for codegen class FunctionsNotReadyForCodegenCollector(CombineMapper): @@ -2505,11 +2395,13 @@ def preprocess_program(program, device=None): # FIXME: need to make function ready for codegen here # overriding the hw axes sizes of all the callable kernel. + # FIXME: maybe need to wrap this within a function? 
local_size, global_size = semi_preprocessed_program.get_grid_size_upper_bounds() resolved_function_with_hw_axes_sizes_set = {} - for func_id, in_knl_callable in semi_preprocessed_program.program_callables_info: + for func_id, in_knl_callable in ( + semi_preprocessed_program.program_callables_info.items()): resolved_function_with_hw_axes_sizes_set[func_id] = ( in_knl_callable.with_hw_axes_sizes(local_size, global_size)) diff --git a/loopy/program.py b/loopy/program.py index f2ea40506..342f8ba78 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -31,6 +31,7 @@ from pymbolic.primitives import Variable from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) +from loopy.diagnostic import LoopyError class FunctionResolver(RuleAwareIdentityMapper): @@ -204,6 +205,26 @@ class Program(ImmutableRecord): self._program_executor_cache = {} + def get_grid_size_upper_bounds(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. + """ + return self.root_kernel.get_grid_size_upper_bounds( + self.program_callables_info, + ignore_auto=ignore_auto) + + def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :mod:`pymbolic` expressions + """ + return self.root_kernel.get_grid_size_upper_bounds_as_exprs( + self.program_callables_info, + ignore_auto=ignore_auto) + @property def name(self): #FIXME: discuss with @inducer if we use "name" instead of @@ -381,11 +402,15 @@ class ProgramCallablesInfo(ImmutableRecord): .. note:: Assumes that each callable is touched atmost once, the internal - working of this function fails if that is violated and raises a - *RuntimeError*. + working of this function fails if that is violated. """ # FIXME: add a note about using enter and exit - assert self.is_being_edited + if not self.is_being_edited: + if function.name in self.resolved_functions and ( + self.resolved_functions[function.name] == in_kernel_callable): + return self, function + else: + raise LoopyError("Use 'enter_edit_callables_mode' first.") from loopy.library.reduction import ArgExtOp, SegmentedOp @@ -500,6 +525,10 @@ class ProgramCallablesInfo(ImmutableRecord): def items(self): return self.resolved_functions.items() + def values(self): + return self.resolved_functions.values() + + # }}} diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 652f8b893..eb631c130 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1832,7 +1832,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, debug_args={}): +def generate_loop_schedules(kernel, program_callables_info, debug_args={}): """ .. 
warning:: @@ -1845,11 +1845,12 @@ def generate_loop_schedules(kernel, debug_args={}): """ with MinRecursionLimitForScheduling(kernel): - for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args): + for sched in generate_loop_schedules_inner(kernel, + program_callables_info, debug_args=debug_args): yield sched -def generate_loop_schedules_inner(kernel, debug_args={}): +def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " @@ -1969,7 +1970,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, lsize = kernel.get_grid_size_upper_bounds() + gsize, lsize = ( + kernel.get_grid_size_upper_bounds(program_callables_info)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2026,7 +2028,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel): +def _get_one_scheduled_kernel_inner(kernel, program_callables_info): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2036,10 +2038,10 @@ def _get_one_scheduled_kernel_inner(kernel): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. - return next(iter(generate_loop_schedules(kernel))) + return next(iter(generate_loop_schedules(kernel, program_callables_info))) -def get_one_scheduled_kernel(kernel): +def get_one_scheduled_kernel(kernel, program_callables_info): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2057,7 +2059,8 @@ def get_one_scheduled_kernel(kernel): if not from_cache: with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): - result = _get_one_scheduled_kernel_inner(kernel) + result = _get_one_scheduled_kernel_inner(kernel, + program_callables_info) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 108360b4b..defc643f6 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -54,7 +54,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + self.codegen_state.program_callables_info) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -391,7 +392,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): # {{{ implement indexof, indexof_vec - identifier_name = self.kernel.scoped_functions[expr.function.name].name + identifier_name = ( + self.codegen_state.program_callables_info[expr.function.name].name) if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: raise LoopyError("%s takes exactly one argument" % identifier_name) @@ -434,7 +436,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} from loopy.kernel.function_interface import ManglerCallable - if isinstance(self.kernel.scoped_functions[expr.function.name], + if isinstance(self.codegen_state.program_callables_info[expr.function.name], ManglerCallable): from 
loopy.codegen import SeenFunction in_knl_callable = self.kernel.scoped_functions[expr.function.name] @@ -444,10 +446,12 @@ class ExpressionToCExpressionMapper(IdentityMapper): mangle_result.target_name, mangle_result.arg_dtypes)) - return self.kernel.scoped_functions[expr.function.name].emit_call( - expression_to_code_mapper=self, - expression=expr, - target=self.kernel.target) + return ( + self.codegen_state.program_callables_info[ + expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target)) # {{{ deal with complex-valued variables diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 423246842..e68d14a21 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -214,9 +214,9 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from offsets - def generate_integer_arg_finding_from_offsets(self, gen, kernel, + def generate_integer_arg_finding_from_offsets(self, gen, program, implemented_data_info): - options = kernel.options + options = program.root_kernel.options gen("# {{{ find integer arguments from offsets") gen("") @@ -239,7 +239,7 @@ class ExecutionWrapperGeneratorBase(object): else: gen("_lpy_offset = %s.offset" % impl_array_name) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" @@ -264,8 +264,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from strides def generate_integer_arg_finding_from_strides( - self, gen, kernel, implemented_data_info): - options = kernel.options + self, gen, program, implemented_data_info): + options = program.root_kernel.options gen("# {{{ find integer arguments from strides") gen("") @@ -284,7 +284,7 @@ class ExecutionWrapperGeneratorBase(object): "passed array\")" % (arg.name, impl_array_name)) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" @@ -307,8 +307,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ check that value args are present def generate_value_arg_check( - self, gen, kernel, implemented_data_info): - if kernel.options.skip_arg_checks: + self, gen, program, implemented_data_info): + if program.root_kernel.options.skip_arg_checks: return from loopy.kernel.data import ValueArg @@ -361,7 +361,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ arg setup def generate_arg_setup( - self, gen, kernel, implemented_data_info, options): + self, gen, program, implemented_data_info, options): import loopy as lp from loopy.kernel.data import KernelArgument @@ -384,8 +384,8 @@ class ExecutionWrapperGeneratorBase(object): expect_no_more_arguments = False for arg_idx, arg in enumerate(implemented_data_info): - is_written = arg.base_name in kernel.get_written_variables() - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) + is_written = arg.base_name in program.root_kernel.get_written_variables() + program_arg = program.impl_arg_to_arg.get(arg.name) if not issubclass(arg.arg_class, KernelArgument): expect_no_more_arguments = True @@ -447,7 +447,7 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): self.handle_alloc( - gen, arg, kernel_arg, strify, options.skip_arg_checks) + gen, arg, program_arg, strify, options.skip_arg_checks) gen("_lpy_made_by_loopy = True") gen("") @@ -465,7 +465,7 @@ 
class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("if %s.dtype != %s:" % (arg.name, self.python_dtype_str( - kernel_arg.dtype.numpy_dtype))) + program_arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " "(got: %%s, expected: %s)\" %% %s.dtype)" @@ -493,10 +493,10 @@ class ExecutionWrapperGeneratorBase(object): "%% (%s.shape, %s))" % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - if kernel_arg.shape is None: + if program_arg.shape is None: pass - elif any(shape_axis is None for shape_axis in kernel_arg.shape): + elif any(shape_axis is None for shape_axis in program_arg.shape): gen("if len(%s.shape) != %s:" % (arg.name, len(arg.unvec_shape))) with Indentation(gen): @@ -519,8 +519,8 @@ class ExecutionWrapperGeneratorBase(object): # }}} - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize + if arg.unvec_strides and program_arg.dim_tags: + itemsize = program_arg.dtype.numpy_dtype.itemsize sym_strides = tuple( itemsize*s_i for s_i in arg.unvec_strides) @@ -558,7 +558,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise ValueError(\"Argument '%s' does not " "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_kernel()." + "default_offset=loopy.auto to make_program()." "\")" % arg.name) gen("") @@ -617,7 +617,7 @@ class ExecutionWrapperGeneratorBase(object): def generate_host_code(self, gen, codegen_result): raise NotImplementedError - def __call__(self, kernel, codegen_result): + def __call__(self, program, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -629,12 +629,12 @@ class ExecutionWrapperGeneratorBase(object): kernel """ - options = kernel.options + options = program.root_kernel.options implemented_data_info = codegen_result.implemented_data_info from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, + "invoke_%s_loopy_kernel" % program.name, self.system_args + [ "%s=None" % idi.name for idi in implemented_data_info @@ -651,21 +651,21 @@ class ExecutionWrapperGeneratorBase(object): self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_offsets( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_strides( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_value_arg_check( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) args = self.generate_arg_setup( - gen, kernel, implemented_data_info, options) + gen, program, implemented_data_info, options) self.generate_invocation(gen, codegen_result.host_program.name, args, - kernel, implemented_data_info) + program, implemented_data_info) - self.generate_output_handler(gen, options, kernel, implemented_data_info) + self.generate_output_handler(gen, options, program, implemented_data_info) if options.write_wrapper: output = gen.get() @@ -760,7 +760,8 @@ class KernelExecutorBase(object): from loopy.schedule import get_one_scheduled_kernel program = program.with_root_kernel( - get_one_scheduled_kernel(program.root_kernel)) + get_one_scheduled_kernel(program.root_kernel, + program.program_callables_info)) return program diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 
81b6770c1..2b501c872 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -482,7 +482,8 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.schedule import get_insn_ids_for_block_at _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.program_callables_info) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 8d577bb01..890208bf6 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -151,9 +151,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args, - kernel, implemented_data_info): - if kernel.options.cl_exec_manage_array_events: + def generate_invocation(self, gen, program_name, args, + program, implemented_data_info): + if program.root_kernel.options.cl_exec_manage_array_events: gen(""" if wait_for is None: wait_for = [] @@ -169,20 +169,21 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("") - gen("_lpy_evt = {kernel_name}({args})" + gen("_lpy_evt = {program_name}({args})" .format( - kernel_name=kernel_name, + program_name=program_name, args=", ".join( ["_lpy_cl_kernels", "queue"] + args + ["wait_for=wait_for"]))) - if kernel.options.cl_exec_manage_array_events: + if program.root_kernel.options.cl_exec_manage_array_events: gen("") from loopy.kernel.data import ArrayArg for arg in implemented_data_info: if (issubclass(arg.arg_class, ArrayArg) - and arg.base_name in kernel.get_written_variables()): + and arg.base_name in ( + program.root_kernel.get_written_variables())): gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) # }}} @@ -190,7 +191,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -207,7 +208,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if not issubclass(arg.arg_class, KernelArgument): continue - is_written = arg.base_name in kernel.get_written_variables() + is_written = arg.base_name in ( + program.root_kernel.get_written_variables()) if is_written: gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) @@ -218,12 +220,12 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in program.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return _lpy_evt, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -283,18 +285,18 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): dev_code = codegen_result.device_code() - if self.program.options.write_cl: + if self.program.root_kernel.options.write_cl: output = dev_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = 
get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") @@ -302,7 +304,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): cl_program = ( cl.Program(self.context, dev_code) - .build(options=program.options.cl_build_options)) + .build(options=program.root_kernel.options.cl_build_options)) cl_kernels = _Kernels() for dp in codegen_result.device_programs: diff --git a/loopy/target/python.py b/loopy/target/python.py index 2804b0fb9..b7a83d25b 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -44,7 +44,8 @@ class ExpressionToPythonMapper(StringifyMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + self.codegen_state.program_callables_info) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): -- GitLab From c5a60f0a059eaffb9ec253da05b74d94c0be2673 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 17:07:11 -0500 Subject: [PATCH 251/916] minor error while renaming --- loopy/program.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 342f8ba78..d4966218e 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -341,11 +341,12 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): rule_mapping_context) self.renaming_dict = renaming_dict - def map_resolved_functions(self, expr, expn_state): + def map_resolved_function(self, expr, expn_state): if expr.name in self.renaming_dict: return ResolvedFunction(self.renaming_dict[expr.name]) else: - return super(ResolvedFunctionRenamer, self).rec(expr, expn_state) + return super(ResolvedFunctionRenamer, self).map_resolved_function( + expr, expn_state) def rename_resolved_functions_in_a_single_kernel(kernel, -- GitLab From 7d1a1459e39a9c9b91f83114497cf1cc78dd0de0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 17:10:36 -0500 Subject: [PATCH 252/916] flake 8 --- loopy/codegen/__init__.py | 5 ----- loopy/target/c/__init__.py | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d3c6ebe87..d80dec27e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -32,13 +32,8 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION -from cgen import Collection from loopy.symbolic import CombineMapper -from loopy.kernel.instruction import ( - Assignment, NoOpInstruction, BarrierInstruction, CallInstruction, - CInstruction, _DataObliviousInstruction, MultiAssignmentBase) - from functools import reduce diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index eb7f43a37..db2780ba5 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -409,7 +409,7 @@ class CMathCallable(ScalarCallable): self.copy(name_in_target=name, arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}), - program_callables_info) + program_callables_info) # binary functions if name in 
["fmax", "fmin"]: -- GitLab From 06ac2972b3cd10f4c3e804c535619585166ad0e0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 21:14:11 -0500 Subject: [PATCH 253/916] minor changes --- loopy/kernel/creation.py | 4 +++- loopy/library/reduction.py | 4 ++-- loopy/program.py | 7 +++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 22bdf5f84..f3e09db3b 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2352,7 +2352,9 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): creation_plog.done() - return knl + from loopy.program import make_program_from_kernel + # FIXME: warn to not use this? + return make_program_from_kernel(knl) # }}} diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index d2d4ea4db..503b76988 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -189,7 +189,7 @@ class MaxReductionOperation(ScalarReductionOperation): def get_scalar_callables(self, kernel): return { - "max": kernel.find_scoped_function_identifier("max")} + var("max"): kernel.find_scoped_function_identifier("max")} class MinReductionOperation(ScalarReductionOperation): @@ -201,7 +201,7 @@ class MinReductionOperation(ScalarReductionOperation): def get_scalar_callables(self, kernel): return { - "min": kernel.find_scoped_function_identifier("min")} + var("min"): kernel.find_scoped_function_identifier("min")} # {{{ base class for symbolic reduction ops diff --git a/loopy/program.py b/loopy/program.py index d4966218e..96c3e58ac 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -119,8 +119,11 @@ class FunctionResolver(RuleAwareIdentityMapper): expn_state) def map_reduction(self, expr, expn_state): - self.scoped_functions.update( - expr.operation.get_scalar_callables(self.kernel)) + for func_id, in_knl_callable in ( + expr.operation.get_scalar_callables(self.kernel)).items(): + self.program_callables_info, _ = ( + self.program_callables_info.with_callable(func_id, + in_knl_callable)) return super(FunctionResolver, self).map_reduction(expr, expn_state) -- GitLab From 0887998b16ca4caba99a9bdb19eb17189e1920fa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 22:15:51 -0500 Subject: [PATCH 254/916] somewhat suboptimal design choice for options. --- loopy/__init__.py | 6 ++- loopy/preprocess.py | 97 +++++++++++++++++++++++++-------------------- 2 files changed, 57 insertions(+), 46 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a552e498e..088b259d3 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -306,13 +306,14 @@ __all__ = [ # {{{ set_options -def set_options(kernel, *args, **kwargs): +def set_options(program, *args, **kwargs): """Return a new kernel with the options given as keyword arguments, or from a string representation passed in as the first (and only) positional argument. See also :class:`Options`. 
""" + kernel = program.root_kernel if args and kwargs: raise TypeError("cannot pass both positional and keyword arguments") @@ -337,7 +338,8 @@ def set_options(kernel, *args, **kwargs): from loopy.options import make_options new_opt.update(make_options(arg)) - return kernel.copy(options=new_opt) + return program.with_root_kernel( + kernel.copy(options=new_opt)) # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fe3e79a20..88609ee99 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2369,50 +2369,7 @@ preprocess_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def preprocess_program(program, device=None): - - if device is not None: - from warnings import warn - warn("passing 'device' to preprocess_kernel() is deprecated", - DeprecationWarning, stacklevel=2) - - root_kernel_callable = program.program_callables_info[program.name] - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) - root_kernel, program_callables_info = preprocess_kernel( - program.root_kernel, program_callables_info, device) - processed_root_knl_callable = root_kernel_callable.copy(subkernel=root_kernel) - program_callables_info, _ = ( - program_callables_info.with_callable( - program.root_kernel_name, - processed_root_knl_callable)) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - - semi_preprocessed_program = ( - program.copy(program_callables_info=program_callables_info)) - - # FIXME: need to make function ready for codegen here - - # overriding the hw axes sizes of all the callable kernel. - # FIXME: maybe need to wrap this within a function? - local_size, global_size = semi_preprocessed_program.get_grid_size_upper_bounds() - - resolved_function_with_hw_axes_sizes_set = {} - - for func_id, in_knl_callable in ( - semi_preprocessed_program.program_callables_info.items()): - resolved_function_with_hw_axes_sizes_set[func_id] = ( - in_knl_callable.with_hw_axes_sizes(local_size, global_size)) - - new_program_callables_info = ( - semi_preprocessed_program.program_callables_info.copy( - resolved_functions=resolved_function_with_hw_axes_sizes_set)) - - return program.copy(program_callables_info=new_program_callables_info) - - -def preprocess_kernel(kernel, program_callables_info, device=None): +def preprocess_single_kernel(kernel, program_callables_info, device=None): from loopy.kernel import KernelState if kernel.state >= KernelState.PREPROCESSED: return kernel @@ -2520,4 +2477,56 @@ def preprocess_kernel(kernel, program_callables_info, device=None): return kernel, program_callables_info + +def preprocess_kernel(kernel, device=None): + # FIXME: better error message + from loopy.program import Program + if not isinstance(kernel, Program): + raise LoopyError("Not supported") + return preprocess_program(kernel, device) + + +def preprocess_program(program, device=None): + + if device is not None: + from warnings import warn + warn("passing 'device' to preprocess_kernel() is deprecated", + DeprecationWarning, stacklevel=2) + + root_kernel_callable = program.program_callables_info[program.name] + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + root_kernel, program_callables_info = preprocess_single_kernel( + program.root_kernel, program_callables_info, device) + processed_root_knl_callable = root_kernel_callable.copy(subkernel=root_kernel) + program_callables_info, _ = ( + program_callables_info.with_callable( + program.root_kernel_name, + processed_root_knl_callable)) + 
program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + + semi_preprocessed_program = ( + program.copy(program_callables_info=program_callables_info)) + + # FIXME: need to make function ready for codegen here + + # overriding the hw axes sizes of all the callable kernel. + # FIXME: maybe need to wrap this within a function? + local_size, global_size = semi_preprocessed_program.get_grid_size_upper_bounds() + + resolved_function_with_hw_axes_sizes_set = {} + + for func_id, in_knl_callable in ( + semi_preprocessed_program.program_callables_info.items()): + resolved_function_with_hw_axes_sizes_set[func_id] = ( + in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + + new_program_callables_info = ( + semi_preprocessed_program.program_callables_info.copy( + resolved_functions=resolved_function_with_hw_axes_sizes_set)) + + return program.copy(program_callables_info=new_program_callables_info) + + # vim: foldmethod=marker -- GitLab From 0ead3f61ab32d3f14a7d26778f6f9a4995884412 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 22:36:29 -0500 Subject: [PATCH 255/916] good design? --- loopy/__init__.py | 13 +++++++++---- loopy/kernel/__init__.py | 12 +++--------- loopy/kernel/creation.py | 4 +--- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 088b259d3..a3d5f0e58 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -306,15 +306,13 @@ __all__ = [ # {{{ set_options -def set_options(program, *args, **kwargs): +def set_options_for_single_kernel(kernel, *args, **kwargs): """Return a new kernel with the options given as keyword arguments, or from a string representation passed in as the first (and only) positional argument. See also :class:`Options`. """ - kernel = program.root_kernel - if args and kwargs: raise TypeError("cannot pass both positional and keyword arguments") @@ -338,8 +336,15 @@ def set_options(program, *args, **kwargs): from loopy.options import make_options new_opt.update(make_options(arg)) + return kernel.copy(options=new_opt) + + +def set_options(program, *args, **kwargs): + if isinstance(program, LoopKernel): + return set_options_for_single_kernel(program, *args, **kwargs) + kernel = program.root_kernel return program.with_root_kernel( - kernel.copy(options=new_opt)) + set_options_for_single_kernel(kernel, *args, **kwargs)) # }}} diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index ce7bdac42..5afdf39ac 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1407,15 +1407,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): def __call__(self, *args, **kwargs): # FIXME: scream and then convert to a program - 1/0 - key = self.target.get_kernel_executor_cache_key(*args, **kwargs) - try: - kex = self._kernel_executor_cache[key] - except KeyError: - kex = self.target.get_kernel_executor(self, *args, **kwargs) - self._kernel_executor_cache[key] = kex - - return kex(*args, **kwargs) + from loopy.program import make_program_from_kernel + program = make_program_from_kernel(self) + return program(*args, **kwargs) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f3e09db3b..22bdf5f84 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2352,9 +2352,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): creation_plog.done() - from loopy.program import make_program_from_kernel - # FIXME: warn to not use this? 
- return make_program_from_kernel(knl) + return knl # }}} -- GitLab From f59edc4f4ddbbba2a024907c7133de3747f71bf6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 22:45:13 -0500 Subject: [PATCH 256/916] some more back compatibility --- loopy/preprocess.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 88609ee99..13b6decc4 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2479,11 +2479,10 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): def preprocess_kernel(kernel, device=None): - # FIXME: better error message - from loopy.program import Program - if not isinstance(kernel, Program): - raise LoopyError("Not supported") - return preprocess_program(kernel, device) + # FIXME: error message? + from loopy.program import make_program_from_kernel + program = make_program_from_kernel(kernel) + return preprocess_program(program, device) def preprocess_program(program, device=None): -- GitLab From 6f1e2f70d78d40d824f3b7390b4bc36b240715a2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 23:38:49 -0500 Subject: [PATCH 257/916] passes one test. --- loopy/codegen/loop.py | 2 +- loopy/kernel/__init__.py | 2 ++ loopy/preprocess.py | 8 ++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index ebddf3153..39cf20c7d 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -248,7 +248,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block) + insn_ids_for_block, codegen_state.program_callables_info) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 5afdf39ac..800ba36c0 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1132,8 +1132,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ if self.overridden_get_grid_sizes_for_insn_ids: + print(self.overridden_get_grid_sizes_for_insn_ids) return self.overridden_get_grid_sizes_for_insn_ids( insn_ids, + program_callables_info=program_callables_info, ignore_auto=ignore_auto) assert self.is_called_from_host, ("Callee kernels do not have sufficient " diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 13b6decc4..8f347b22e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2518,8 +2518,12 @@ def preprocess_program(program, device=None): for func_id, in_knl_callable in ( semi_preprocessed_program.program_callables_info.items()): - resolved_function_with_hw_axes_sizes_set[func_id] = ( - in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + if func_id == semi_preprocessed_program.name: + resolved_function_with_hw_axes_sizes_set[func_id] = ( + in_knl_callable) + else: + resolved_function_with_hw_axes_sizes_set[func_id] = ( + in_knl_callable.with_hw_axes_sizes(local_size, global_size)) new_program_callables_info = ( semi_preprocessed_program.program_callables_info.copy( -- GitLab From 0b1477804acad701acbe0d2b1766356c1721f6b3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 28 Jul 2018 23:56:27 -0500 Subject: [PATCH 258/916] successful_tests++ --- loopy/__init__.py | 4 ++++ loopy/kernel/function_interface.py | 2 +- test/test_loopy.py | 5 ++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 
a3d5f0e58..49611d55f 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -53,6 +53,8 @@ from loopy.kernel.data import ( CallMangleInfo) from loopy.kernel.function_interface import ( ScalarCallable) +from loopy.program import ( + Program, make_program_from_kernel) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -176,6 +178,8 @@ __all__ = [ "ScalarCallable", + "Program", "make_program_from_kernel", + "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", "AddressSpace", "temp_var_scope", # temp_var_scope is deprecated diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 2aa14b3d3..b66b865e8 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -143,7 +143,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): self.local_size = local_size self.global_size = global_size - def __call__(self, insn_ids, ignore_auto=True): + def __call__(self, insn_ids, program_callables_info, ignore_auto=True): return self.local_size, self.global_size # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index accf9c1df..1e60ca07f 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -143,7 +143,10 @@ def test_type_inference_with_type_dependencies(): <>d = b + 2 + 1j """, "...") - knl = lp.infer_unknown_types(knl) + + prog = lp.make_program_from_kernel(knl) + prog = lp.infer_unknown_types(prog) + knl = prog.root_kernel from loopy.types import to_loopy_type assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32) -- GitLab From 2c56087669326fbe23c9bd7f60811f77f3d52366 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 29 Jul 2018 14:21:12 -0500 Subject: [PATCH 259/916] successful_tests++ --- loopy/type_inference.py | 13 ++++++++++++- test/test_loopy.py | 5 +---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index cf63bf288..07eb1c9c9 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -813,6 +813,13 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" + from loopy.kernel import LoopKernel + input_was_kernel = False + if isinstance(program, LoopKernel): + # FIXME: warning + input_was_kernel = True + from loopy.program import make_program_from_kernel + program = make_program_from_kernel(program) program_callables_info = program.program_callables_info @@ -837,7 +844,11 @@ def infer_unknown_types(program, expect_completion=False): program_callables_info = ( program_callables_info.with_exit_edit_callables_mode()) - return program.copy(program_callables_info=program_callables_info) + if input_was_kernel: + return (program.copy( + program_callables_info=program_callables_info)).root_kernel + else: + return program.copy(program_callables_info=program_callables_info) # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 1e60ca07f..accf9c1df 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -143,10 +143,7 @@ def test_type_inference_with_type_dependencies(): <>d = b + 2 + 1j """, "...") - - prog = lp.make_program_from_kernel(knl) - prog = lp.infer_unknown_types(prog) - knl = prog.root_kernel + knl = lp.infer_unknown_types(knl) from loopy.types import to_loopy_type assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32) -- GitLab From 53e2b875c12d9f21be461272f44ef147df1d98d3 Mon Sep 17 
00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 29 Jul 2018 23:26:49 -0500 Subject: [PATCH 260/916] completed type inference after making the functions inferring the functions. --- loopy/preprocess.py | 4 +--- loopy/program.py | 2 ++ loopy/target/pyopencl.py | 8 +++++--- loopy/type_inference.py | 39 +++++++++++++++++++++++++++++++++------ 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 8f347b22e..972c5019f 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2508,10 +2508,8 @@ def preprocess_program(program, device=None): semi_preprocessed_program = ( program.copy(program_callables_info=program_callables_info)) - # FIXME: need to make function ready for codegen here + # FIXME: think of wrapping this in a function? - # overriding the hw axes sizes of all the callable kernel. - # FIXME: maybe need to wrap this within a function? local_size, global_size = semi_preprocessed_program.get_grid_size_upper_bounds() resolved_function_with_hw_axes_sizes_set = {} diff --git a/loopy/program.py b/loopy/program.py index 96c3e58ac..8fec476bb 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -414,6 +414,8 @@ class ProgramCallablesInfo(ImmutableRecord): self.resolved_functions[function.name] == in_kernel_callable): return self, function else: + print('Old: ', self.resolved_functions[function.name]) + print('New: ', in_kernel_callable) raise LoopyError("Use 'enter_edit_callables_mode' first.") from loopy.library.reduction import ArgExtOp, SegmentedOp diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 2ee70d65e..ab37665d0 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -233,9 +233,11 @@ class PyOpenCLCallable(ScalarCallable): else: raise LoopyTypeError("unexpected complex type '%s'" % dtype) - return self.copy(name_in_target="%s_%s" % (tpname, name), - arg_id_to_dtype={0: dtype, -1: NumpyType( - np.dtype(dtype.numpy_dtype.type(0).real))}) + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}), + program_callables_info) if name in ["sqrt", "exp", "log", "sin", "cos", "tan", diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 07eb1c9c9..aa8222553 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -34,6 +34,7 @@ from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) + import logging logger = logging.getLogger(__name__) @@ -266,6 +267,7 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): + from pymbolic.primitives import Variable, CallWithKwargs, Call from loopy.symbolic import ResolvedFunction @@ -788,6 +790,25 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, # }}} + if expect_completion: + # FIXME: copy the explanation from make_function_ready_for_codegen + # here. + for insn in kernel.instructions: + if isinstance(insn, lp.MultiAssignmentBase): + # just a dummy run over the expression, to pass over all the + # functions + type_inf_mapper(insn.expression, return_tuple=isinstance(insn, + lp.CallInstruction), return_dtype_set=True) + elif isinstance(insn, (lp._DatObliviousInstruction, + lp.CInstruction)): + pass + else: + raise NotImplementedError("Unknown instructions type %s." 
% ( + type(insn).__name__)) + + program_callables_info = type_inf_mapper.program_callables_info + old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) + end_time = time.time() logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) @@ -802,11 +823,14 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, change_names_of_pymbolic_calls) type_specialized_kernel = change_names_of_pymbolic_calls( pre_type_specialized_knl, old_calls_to_new_calls) - if expect_completion: - # if completion is expected, then it is important that all the - # callables are scoped. - from loopy.check import check_functions_are_scoped - check_functions_are_scoped(type_specialized_kernel) + + # this code is dead, move it up after mangler callables are made + # illegal. + # if expect_completion: + # # if completion is expected, then it is important that all the + # # callables are scoped. + # from loopy.check import check_functions_are_scoped + # check_functions_are_scoped(type_specialized_kernel) return type_specialized_kernel, program_callables_info @@ -816,7 +840,7 @@ def infer_unknown_types(program, expect_completion=False): from loopy.kernel import LoopKernel input_was_kernel = False if isinstance(program, LoopKernel): - # FIXME: warning + # FIXME: deprecate warning needed here input_was_kernel = True from loopy.program import make_program_from_kernel program = make_program_from_kernel(program) -- GitLab From 429616185422ae1a2c0e6e09c3d4c18c8591bd76 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 01:14:08 -0500 Subject: [PATCH 261/916] Modernize auto_test --- loopy/auto_test.py | 282 ++++++++++++++++++++------------------- 1 file changed, 127 insertions(+), 155 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 015c82dd1..fce9c6492 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -75,7 +75,7 @@ class TestArgInfo(Record): # {{{ "reference" arguments -def make_ref_args(kernel, impl_arg_info, queue, parameters): +def make_ref_args(program, impl_arg_info, queue, parameters): import pyopencl as cl import pyopencl.array as cl_array @@ -88,7 +88,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): ref_arg_data = [] for arg in impl_arg_info: - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) + kernel_arg = program.impl_arg_to_arg.get(arg.name) if arg.arg_class is ValueArg: if arg.offset_for_name: @@ -117,7 +117,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): shape = evaluate_shape(arg.unvec_shape, parameters) dtype = kernel_arg.dtype - is_output = arg.base_name in kernel.get_written_variables() + is_output = arg.base_name in program.root_kernel.get_written_variables() if arg.arg_class is ImageArg: storage_array = ary = cl_array.empty( @@ -387,20 +387,22 @@ def auto_test_vs_ref( test_knl = ref_knl do_check = False - if len(ref_knl.args) != len(test_knl.args): - raise LoopyError("ref_knl and test_knl do not have the same number " + ref_prog = lp.make_program_from_kernel(ref_knl) + test_prog = lp.make_program_from_kernel(test_knl) + + if len(ref_prog.args) != len(test_prog.args): + raise 
LoopyError("ref_prog and test_prog do not have the same number " "of arguments") - for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)): + for i, (ref_arg, test_arg) in enumerate(zip(ref_prog.args, test_prog.args)): if ref_arg.name != test_arg.name: - raise LoopyError("ref_knl and test_knl argument lists disagree at index " - "%d (1-based)" % (i+1)) + raise LoopyError("ref_prog and test_prog argument lists disagree at " + "index %d (1-based)" % (i+1)) if ref_arg.dtype != test_arg.dtype: - raise LoopyError("ref_knl and test_knl argument lists disagree at index " - "%d (1-based)" % (i+1)) + raise LoopyError("ref_prog and test_prog argument lists disagree at " + "index %d (1-based)" % (i+1)) - from loopy.compiled import CompiledKernel from loopy.target.execution import get_highlighted_code if isinstance(op_count, (int, float)): @@ -421,7 +423,7 @@ def auto_test_vs_ref( # {{{ compile and run reference code from loopy.type_inference import infer_unknown_types - ref_knl = infer_unknown_types(ref_knl, expect_completion=True) + ref_prog = infer_unknown_types(ref_prog, expect_completion=True) found_ref_device = False @@ -431,30 +433,25 @@ def auto_test_vs_ref( ref_ctx = cl.Context([dev]) ref_queue = cl.CommandQueue(ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) + ref_codegen_result = lp.generate_code_v2(ref_prog) - pp_ref_knl = lp.preprocess_kernel(ref_knl) - - for knl in lp.generate_loop_schedules(pp_ref_knl): - ref_sched_kernel = knl - break + ref_implemented_data_info = ref_codegen_result.implemented_data_info logger.info("%s (ref): trying %s for the reference calculation" % ( - ref_knl.name, dev)) + ref_prog.name, dev)) - ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel) if not quiet and print_ref_code: print(75*"-") print("Reference Code:") print(75*"-") - print(get_highlighted_code(ref_compiled.get_code())) + print(get_highlighted_code( + ref_codegen_result.device_code())) print(75*"-") - ref_kernel_info = ref_compiled.kernel_info(frozenset()) - try: ref_args, ref_arg_data = \ - make_ref_args(ref_sched_kernel, - ref_kernel_info.implemented_data_info, + make_ref_args(ref_prog, + ref_implemented_data_info, ref_queue, parameters) ref_args["out_host"] = False except cl.RuntimeError as e: @@ -479,13 +476,13 @@ def auto_test_vs_ref( ref_queue.finish() logger.info("%s (ref): using %s for the reference calculation" % ( - ref_knl.name, dev)) - logger.info("%s (ref): run" % ref_knl.name) + ref_prog.name, dev)) + logger.info("%s (ref): run" % ref_prog.name) ref_start = time() if not AUTO_TEST_SKIP_RUN: - ref_evt, _ = ref_compiled(ref_queue, **ref_args) + ref_evt, _ = ref_prog(ref_queue, **ref_args) else: ref_evt = cl.enqueue_marker(ref_queue) @@ -493,7 +490,7 @@ def auto_test_vs_ref( ref_stop = time() ref_elapsed_wall = ref_stop-ref_start - logger.info("%s (ref): run done" % ref_knl.name) + logger.info("%s (ref): run done" % ref_prog.name) ref_evt.wait() ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START) @@ -514,161 +511,136 @@ def auto_test_vs_ref( queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - args = None - from loopy.kernel import KernelState - from loopy.target.pyopencl import PyOpenCLTarget - if test_knl.state not in [ - KernelState.PREPROCESSED, - KernelState.SCHEDULED]: - if isinstance(test_knl.target, PyOpenCLTarget): - test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) + from loopy.type_inference import infer_unknown_types - test_knl = lp.preprocess_kernel(test_knl) + test_prog 
= infer_unknown_types(test_prog, expect_completion=True) + test_prog_codegen_result = lp.generate_code_v2(test_prog) + + args = make_args(test_prog, + test_prog_codegen_result.implemented_data_info, + queue, ref_arg_data, parameters) + args["out_host"] = False + + if not quiet: + print(75*"-") + print("Kernel #%d:" % i) + print(75*"-") + if print_code: + print(get_highlighted_code( + test_prog_codegen_result.device_code())) + print(75*"-") + if dump_binary: + print(type(test_prog_codegen_result.cl_program)) + print(test_prog_codegen_result.cl_program.binaries[0]) + print(75*"-") - if not test_knl.schedule: - test_kernels = lp.generate_loop_schedules(test_knl) - else: - test_kernels = [test_knl] + logger.info("%s: run warmup" % (test_prog.name)) - test_kernel_count = 0 + for i in range(warmup_rounds): + if not AUTO_TEST_SKIP_RUN: + test_prog(queue, **args) - from loopy.type_inference import infer_unknown_types - for i, kernel in enumerate(test_kernels): - test_kernel_count += 1 - if test_kernel_count > max_test_kernel_count: - break + if need_check and not AUTO_TEST_SKIP_RUN: + for arg_desc in ref_arg_data: + if arg_desc is None: + continue + if not arg_desc.needs_checking: + continue - kernel = infer_unknown_types(kernel, expect_completion=True) + from pyopencl.compyte.array import as_strided + ref_ary = as_strided( + arg_desc.ref_storage_array.get(), + shape=arg_desc.ref_shape, + strides=arg_desc.ref_numpy_strides).flatten() + test_ary = as_strided( + arg_desc.test_storage_array.get(), + shape=arg_desc.test_shape, + strides=arg_desc.test_numpy_strides).flatten() + common_len = min(len(ref_ary), len(test_ary)) + ref_ary = ref_ary[:common_len] + test_ary = test_ary[:common_len] - compiled = CompiledKernel(ctx, kernel) + error_is_small, error = check_result(test_ary, ref_ary) + if not error_is_small: + raise AutomaticTestFailure(error) - if args is None: - kernel_info = compiled.kernel_info(frozenset()) + need_check = False - args = make_args(kernel, - kernel_info.implemented_data_info, - queue, ref_arg_data, parameters) - args["out_host"] = False + events = [] + queue.finish() - if not quiet: - print(75*"-") - print("Kernel #%d:" % i) - print(75*"-") - if print_code: - print(compiled.get_highlighted_code()) - print(75*"-") - if dump_binary: - print(type(compiled.cl_program)) - print(compiled.cl_program.binaries[0]) - print(75*"-") + logger.info("%s: warmup done" % (test_prog.name)) - logger.info("%s: run warmup" % (knl.name)) + logger.info("%s: timing run" % (test_prog.name)) - for i in range(warmup_rounds): - if not AUTO_TEST_SKIP_RUN: - compiled(queue, **args) - - if need_check and not AUTO_TEST_SKIP_RUN: - for arg_desc in ref_arg_data: - if arg_desc is None: - continue - if not arg_desc.needs_checking: - continue - - from pyopencl.compyte.array import as_strided - ref_ary = as_strided( - arg_desc.ref_storage_array.get(), - shape=arg_desc.ref_shape, - strides=arg_desc.ref_numpy_strides).flatten() - test_ary = as_strided( - arg_desc.test_storage_array.get(), - shape=arg_desc.test_shape, - strides=arg_desc.test_numpy_strides).flatten() - common_len = min(len(ref_ary), len(test_ary)) - ref_ary = ref_ary[:common_len] - test_ary = test_ary[:common_len] - - error_is_small, error = check_result(test_ary, ref_ary) - if not error_is_small: - raise AutomaticTestFailure(error) - - need_check = False - - events = [] - queue.finish() + timing_rounds = warmup_rounds - logger.info("%s: warmup done" % (knl.name)) + while True: + from time import time + start_time = time() - logger.info("%s: timing run" % 
(knl.name)) + evt_start = cl.enqueue_marker(queue) - timing_rounds = warmup_rounds + for i in range(timing_rounds): + if not AUTO_TEST_SKIP_RUN: + evt, _ = test_prog(queue, **args) + events.append(evt) + else: + events.append(cl.enqueue_marker(queue)) - while True: - from time import time - start_time = time() + evt_end = cl.enqueue_marker(queue) - evt_start = cl.enqueue_marker(queue) + queue.finish() + stop_time = time() - for i in range(timing_rounds): - if not AUTO_TEST_SKIP_RUN: - evt, _ = compiled(queue, **args) - events.append(evt) - else: - events.append(cl.enqueue_marker(queue)) + for evt in events: + evt.wait() + evt_start.wait() + evt_end.wait() - evt_end = cl.enqueue_marker(queue) + elapsed_event = (1e-9*events[-1].profile.END + - 1e-9*events[0].profile.START) \ + / timing_rounds + try: + elapsed_event_marker = ((1e-9*evt_end.profile.START + - 1e-9*evt_start.profile.START) + / timing_rounds) + except cl.RuntimeError: + elapsed_event_marker = None - queue.finish() - stop_time = time() + elapsed_wall = (stop_time-start_time)/timing_rounds - for evt in events: - evt.wait() - evt_start.wait() - evt_end.wait() + if elapsed_wall * timing_rounds < 0.3: + timing_rounds *= 4 + else: + break - elapsed_event = (1e-9*events[-1].profile.END - - 1e-9*events[0].profile.START) \ - / timing_rounds - try: - elapsed_event_marker = ((1e-9*evt_end.profile.START - - 1e-9*evt_start.profile.START) - / timing_rounds) - except cl.RuntimeError: - elapsed_event_marker = None + logger.info("%s: timing run done" % (test_prog.name)) - elapsed_wall = (stop_time-start_time)/timing_rounds + rates = "" + for cnt, lbl in zip(op_count, op_label): + rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - if elapsed_wall * timing_rounds < 0.3: - timing_rounds *= 4 + if not quiet: + def format_float_or_none(v): + if v is None: + return "" else: - break + return "%g" % v - logger.info("%s: timing run done" % (knl.name)) + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) - rates = "" + if do_check: + ref_rates = "" for cnt, lbl in zip(op_count, op_label): - rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - + ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) if not quiet: - def format_float_or_none(v): - if v is None: - return "" - else: - return "%g" % v - - print("elapsed: %s s event, %s s marker-event %s s wall " - "(%d rounds)%s" % ( - format_float_or_none(elapsed_event), - format_float_or_none(elapsed_event_marker), - format_float_or_none(elapsed_wall), timing_rounds, rates)) - - if do_check: - ref_rates = "" - for cnt, lbl in zip(op_count, op_label): - ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) - if not quiet: - print("ref: elapsed: %g s event, %g s wall%s" % ( - ref_elapsed_event, ref_elapsed_wall, ref_rates)) + print("ref: elapsed: %g s event, %g s wall%s" % ( + ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} -- GitLab From b9391c6e13201c8d969349525b0201c85cbbff36 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 01:16:54 -0500 Subject: [PATCH 262/916] successful_tests++ --- loopy/__init__.py | 5 +++-- loopy/codegen/__init__.py | 2 +- loopy/preprocess.py | 7 +++---- loopy/program.py | 16 ++++++++++------ loopy/target/execution.py | 5 +++-- loopy/type_inference.py | 8 +------- 6 files changed, 21 insertions(+), 22 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 49611d55f..057657101 
100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -130,7 +130,8 @@ from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call # }}} from loopy.type_inference import infer_unknown_types -from loopy.preprocess import preprocess_kernel, realize_reduction +from loopy.preprocess import (preprocess_kernel, realize_reduction, + preprocess_program) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, @@ -262,7 +263,7 @@ __all__ = [ "infer_unknown_types", - "preprocess_kernel", "realize_reduction", + "preprocess_kernel", "realize_reduction", "preprocess_program", "generate_loop_schedules", "get_one_scheduled_kernel", "GeneratedProgram", "CodeGenerationResult", "PreambleInfo", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d80dec27e..3c58b2564 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -418,7 +418,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): from loopy.kernel import KernelState if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, program_callables_info) if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 972c5019f..3409080dd 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -35,7 +35,7 @@ from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now -from loopy.type_inference import infer_unknown_types_for_a_single_kernel +from loopy.type_inference import infer_unknown_types from loopy.symbolic import CombineMapper, RuleAwareIdentityMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, @@ -2412,9 +2412,6 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. 
- kernel, program_callables_info = infer_unknown_types_for_a_single_kernel( - kernel, program_callables_info, expect_completion=False) - check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) @@ -2492,6 +2489,8 @@ def preprocess_program(program, device=None): warn("passing 'device' to preprocess_kernel() is deprecated", DeprecationWarning, stacklevel=2) + program = infer_unknown_types(program, expect_completion=False) + root_kernel_callable = program.program_callables_info[program.name] program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) diff --git a/loopy/program.py b/loopy/program.py index 8fec476bb..08efc0e89 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -228,12 +228,6 @@ class Program(ImmutableRecord): self.program_callables_info, ignore_auto=ignore_auto) - @property - def name(self): - #FIXME: discuss with @inducer if we use "name" instead of - # "root_kernel_name" - return self.root_kernel_name - # {{{ implementation arguments @property @@ -268,6 +262,16 @@ class Program(ImmutableRecord): def root_kernel(self): return self.program_callables_info[self.root_kernel_name].subkernel + @property + def name(self): + #FIXME: discuss with @inducer if we use "name" instead of + # "root_kernel_name" + return self.root_kernel_name + + @property + def arg_dict(self): + return self.root_kernel.arg_dict + def with_root_kernel(self, root_kernel): new_in_knl_callable = self.program_callables_info[ self.root_kernel_name].copy(subkernel=root_kernel) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index e68d14a21..b61c29a51 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -143,7 +143,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from shapes def generate_integer_arg_finding_from_shapes( - self, gen, kernel, implemented_data_info): + self, gen, program, implemented_data_info): # a mapping from integer argument names to a list of tuples # (arg_name, expression), where expression is a # unary function of kernel.arg_dict[arg_name] @@ -168,7 +168,8 @@ class ExecutionWrapperGeneratorBase(object): if len(deps) == 1: integer_arg_var, = deps - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + if program.arg_dict[ + integer_arg_var.name].dtype.is_integral(): from pymbolic.algorithm import solve_affine_equations_for try: # friggin' overkill :) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index aa8222553..e0517a71f 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -838,10 +838,8 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" from loopy.kernel import LoopKernel - input_was_kernel = False if isinstance(program, LoopKernel): # FIXME: deprecate warning needed here - input_was_kernel = True from loopy.program import make_program_from_kernel program = make_program_from_kernel(program) @@ -871,11 +869,7 @@ def infer_unknown_types(program, expect_completion=False): # FIXME: maybe put all of this in a function? 
# need to infer functions that were left out during inference - if input_was_kernel: - return (program.copy( - program_callables_info=program_callables_info)).root_kernel - else: - return program.copy(program_callables_info=program_callables_info) + return program.copy(program_callables_info=program_callables_info) # }}} -- GitLab From 6d9d105f2cdbff28bc2c40c8b8d725547d82a2cd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 01:30:42 -0500 Subject: [PATCH 263/916] successful_tests++ --- loopy/type_inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index e0517a71f..8f31c9d57 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -33,6 +33,7 @@ from loopy.types import NumpyType from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.kernel.instruction import _DataObliviousInstruction import logging @@ -799,7 +800,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, # functions type_inf_mapper(insn.expression, return_tuple=isinstance(insn, lp.CallInstruction), return_dtype_set=True) - elif isinstance(insn, (lp._DatObliviousInstruction, + elif isinstance(insn, (_DataObliviousInstruction, lp.CInstruction)): pass else: -- GitLab From e0b5a51a99d1e81c4537e883fd2bb40eb66d069d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 01:33:05 -0500 Subject: [PATCH 264/916] successful_tests++ --- test/test_loopy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index accf9c1df..6b4c05114 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -125,9 +125,8 @@ def test_type_inference_no_artificial_doubles(ctx_factory): assumptions="n>=1") knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "double" not in code + code = lp.generate_code(knl) + assert "double" not in code def test_type_inference_with_type_dependencies(): -- GitLab From b789912e23feebdd964106e471e415e1434b56e1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 01:34:53 -0500 Subject: [PATCH 265/916] successful_tests++ --- test/test_loopy.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 6b4c05114..21ddc778c 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -208,11 +208,7 @@ def test_owed_barriers(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(lp.generate_code_v2(knl).device_code()) def test_wg_too_small(ctx_factory): -- GitLab From 6c3ad7e0bfe1c6b2405a97049bf60b8ae1af7100 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 01:36:34 -0500 Subject: [PATCH 266/916] successful_tests++ --- test/test_loopy.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 21ddc778c..15fc7b286 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -225,12 +225,10 @@ def test_wg_too_small(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) import pytest - for gen_knl in kernel_gen: - with 
pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, gen_knl).get_code() + with pytest.raises(RuntimeError): + lp.generate_code_v2(knl) def test_multi_cse(ctx_factory): -- GitLab From 0ce3eecba78640096b9adb3a2fbcd285fa214bf4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 01:45:19 -0500 Subject: [PATCH 267/916] successful_tests++ --- test/test_loopy.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 15fc7b286..869f9981b 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -212,8 +212,6 @@ def test_owed_barriers(ctx_factory): def test_wg_too_small(ctx_factory): - ctx = ctx_factory() - knl = lp.make_kernel( "{[i]: 0<=i<100}", [ @@ -224,15 +222,13 @@ def test_wg_too_small(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - import pytest with pytest.raises(RuntimeError): - lp.generate_code_v2(knl) + prog = lp.make_program_from_kernel(knl) + lp.generate_code_v2(prog) def test_multi_cse(ctx_factory): - ctx = ctx_factory() knl = lp.make_kernel( "{[i]: 0<=i<100}", @@ -245,12 +241,7 @@ def test_multi_cse(ctx_factory): knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") knl = lp.add_prefetch(knl, "a", []) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + lp.generate_code_v2(knl) # {{{ code generator fuzzing @@ -344,8 +335,7 @@ def test_fuzz_code_generator(ctx_factory): lp.ValueArg(name, get_dtype(val)) for name, val in six.iteritems(var_values) ]) - ck = lp.CompiledKernel(ctx, knl) - evt, (lp_value,) = ck(queue, out_host=True, **var_values) + evt, (lp_value,) = knl(queue, out_host=True, **var_values) err = abs(true_value-lp_value)/abs(true_value) if abs(err) > 1e-10: print(80*"-") @@ -353,7 +343,8 @@ def test_fuzz_code_generator(ctx_factory): print("true=%r" % true_value) print("loopy=%r" % lp_value) print(80*"-") - print(ck.get_code()) + print(lp.generate_code_v2(lp.make_program_from_kernel( + knl).device_code())) print(80*"-") print(var_values) print(80*"-") -- GitLab From 9d79590288ad4e760dd3a74eca73df82c4f8c0a2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 01:51:17 -0500 Subject: [PATCH 268/916] successful_tests++ --- loopy/type_inference.py | 3 ++- test/test_loopy.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 8f31c9d57..dcbb168fe 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -551,7 +551,8 @@ class TypeInferenceMapper(CombineMapper): def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): - return [kernel.index_dtype], [], {} + return [kernel.index_dtype], [], {}, ( + type_inf_mapper.program_callables_info) from functools import partial debug = partial(_debug, kernel) diff --git a/test/test_loopy.py b/test/test_loopy.py index 869f9981b..1015b00a0 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -375,9 +375,8 @@ def test_bare_data_dependency(ctx_factory): lp.ValueArg("n", np.int32), ]) - cknl = lp.CompiledKernel(ctx, knl) n = 20000 - evt, (a,) = cknl(queue, n=n, out_host=True) + evt, (a,) = knl(queue, n=n, out_host=True) assert a.shape == (n,) assert (a == 1).all() -- GitLab From 3822ac6d9c815984a7fd19cb89b44dc0e0c1d9a0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 
2018 01:58:30 -0500 Subject: [PATCH 269/916] successful_tests++ --- test/test_loopy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1015b00a0..469cb3da1 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -424,10 +424,10 @@ def test_ilp_write_race_avoidance_local(ctx_factory): []) knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) + prog = lp.make_program_from_kernel(knl) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16, 17) + prog = lp.preprocess_program(prog, ctx.devices[0]) + assert prog.root_kernel.temporary_variables['a'].shape == (16, 17) def test_ilp_write_race_avoidance_private(ctx_factory): -- GitLab From 31bd5e214042cdae61872935c83e6dbd8a6ceae6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 01:59:42 -0500 Subject: [PATCH 270/916] successful_tests++ --- test/test_loopy.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 469cb3da1..0140ed041 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -442,9 +442,10 @@ def test_ilp_write_race_avoidance_private(ctx_factory): knl = lp.tag_inames(knl, dict(j="ilp")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16,) + prog = lp.make_program_from_kernel(knl) + + prog = lp.preprocess_program(prog, ctx.devices[0]) + assert prog.root_kernel.temporary_variables['a'].shape == (16,) # }}} -- GitLab From 7f311185a21945ad07f69b600c2e2e98fcba9f66 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 03:48:43 -0500 Subject: [PATCH 271/916] successful_tests+=4 --- loopy/codegen/__init__.py | 5 +++++ test/test_loopy.py | 32 ++++++++++---------------------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 3c58b2564..14211acb9 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -556,6 +556,11 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): def generate_code_v2(program): + from loopy.kernel import LoopKernel + from loopy.program import make_program_from_kernel + + if isinstance(program, LoopKernel): + program = make_program_from_kernel(program) from loopy.kernel import KernelState if program.root_kernel.state == KernelState.INITIAL: diff --git a/test/test_loopy.py b/test/test_loopy.py index 0140ed041..21722b885 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -142,7 +142,9 @@ def test_type_inference_with_type_dependencies(): <>d = b + 2 + 1j """, "...") - knl = lp.infer_unknown_types(knl) + prog = lp.make_program_from_kernel(knl) + prog = lp.infer_unknown_types(prog) + knl = prog.root_kernel from loopy.types import to_loopy_type assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32) @@ -175,7 +177,6 @@ def test_sized_and_complex_literals(ctx_factory): def test_simple_side_effect(ctx_factory): - ctx = ctx_factory() knl = lp.make_kernel( "{[i,j]: 0<=i,j<100}", @@ -185,13 +186,8 @@ def test_simple_side_effect(ctx_factory): [lp.GlobalArg("a", np.float32, shape=(100,))] ) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + prog = lp.make_program_from_kernel(knl) + 
print(lp.generate_code_v2(prog)) def test_owed_barriers(ctx_factory): @@ -224,8 +220,7 @@ def test_wg_too_small(ctx_factory): import pytest with pytest.raises(RuntimeError): - prog = lp.make_program_from_kernel(knl) - lp.generate_code_v2(prog) + lp.generate_code_v2(knl) def test_multi_cse(ctx_factory): @@ -386,7 +381,6 @@ def test_bare_data_dependency(ctx_factory): @pytest.mark.skipif("sys.version_info < (2,6)") def test_ilp_write_race_detection_global(ctx_factory): - ctx = ctx_factory() knl = lp.make_kernel( "[n] -> {[i,j]: 0<=i,j Date: Mon, 30 Jul 2018 16:43:53 -0500 Subject: [PATCH 272/916] handles realize_reduction according to the new model (finally!) --- loopy/preprocess.py | 209 ++++++++++++---------------------------- loopy/type_inference.py | 7 +- 2 files changed, 66 insertions(+), 150 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 3409080dd..6db16d110 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -36,7 +36,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import CombineMapper, RuleAwareIdentityMapper +from loopy.symbolic import RuleAwareIdentityMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -907,9 +907,9 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, - automagic_scans_ok=False, force_scan=False, - force_outer_iname_for_scan=None): +def realize_reduction(kernel, program_callables_info, insn_id_filter=None, + unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, + force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. 
@@ -1029,7 +1029,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, program_callables_info, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1147,7 +1147,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtypes, + def map_reduction_local(expr, rec, program_callables_info, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1476,17 +1476,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ local-parallel scan - def map_scan_local(expr, rec, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, - sweep_min_value, scan_min_value, stride): + def map_scan_local(expr, rec, program_callables_info, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 if scan_size == 1: - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, program_callables_info, + nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1685,15 +1685,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ seq/par dispatch - def map_reduction(expr, rec, nresults=1): + def map_reduction(expr, rec, program_callables_info, nresults=1): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes = ( + arg_dtypes, reduction_dtypes, program_callables_info = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, unknown_types_ok)) + temp_kernel, expr, program_callables_info, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1829,12 +1829,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if n_sequential: assert n_local_par == 0 - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, program_callables_info, + nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + expr, rec, program_callables_info, nresults, arg_dtypes, + reduction_dtypes) # }}} @@ -1867,9 +1868,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Run reduction expansion. 
from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, nresults=nresults) + new_expressions = cb_mapper(insn.expression, + program_callables_info=program_callables_info, + nresults=nresults) else: - new_expressions = (cb_mapper(insn.expression),) + new_expressions = ( + cb_mapper(insn.expression, + program_callables_info=program_callables_info),) if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -2233,7 +2238,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): return kernel.copy(instructions=new_insns) -def infer_arg_descr(kernel, program_callables_info): +def infer_arg_descr_from_root_kernel(kernel, program_callables_info): """ Returns a copy of *kernel* with the argument shapes and strides matching for scoped functions in the *kernel*. Refer @@ -2254,112 +2259,23 @@ def infer_arg_descr(kernel, program_callables_info): return descr_inferred_kernel, arg_descr_inf_mapper.program_callables_info -# }}} - - -# {{{ catching functions that are not ready for codegen - -class FunctionsNotReadyForCodegenCollector(CombineMapper): - """ - Returns all instances of function calls in an expression which are - not ready for code generation. - """ - def __init__(self, kernel): - self.kernel = kernel - - def combine(self, values): - return all(values) - - def map_call(self, expr, *args, **kwargs): - from pymbolic.primitives import CallWithKwargs, Call - from loopy.library.reduction import ArgExtOp, SegmentedOp - from pymbolic.primitives import Variable - from loopy.symbolic import ResolvedFunction - - if isinstance(expr, Call): - kw_parameters = {} - else: - assert isinstance(expr, CallWithKwargs) - kw_parameters = expr.kw_parameters - - if isinstance(expr.function, (ArgExtOp, SegmentedOp)): - return self.combine( - tuple( - self.rec(child, *args, **kwargs) for child in - expr.parameters + tuple(kw_parameters))) - - elif isinstance(expr.function, Variable): - # UnResolvedFunction obtained and hence clearly not ready for - # codegen. - return False - - elif isinstance(expr.function, ResolvedFunction): - is_ready_for_codegen = self.kernel.scoped_functions[ - expr.function.name].is_ready_for_codegen() - return self.combine( - (is_ready_for_codegen,) + - tuple( - self.rec(child, *args, **kwargs) - for child in - expr.parameters+tuple(kw_parameters.values()))) - else: - raise LoopyError("Unexpected function type %s obtained in %s" - % (type(expr.function), expr)) - - map_call_with_kwargs = map_call - - def map_constant(self, expr): - return True - - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant - - -def make_functions_ready_for_codegen(kernel): - """ - Specializes the functions in the kernel that are missed during type - inference. - - .. code:: python - - knl = lp.make_kernel( - "{[i]: 0<=i<16}", - "a[i] = sin(b[i])", - [lp.ArrayArg('a', dtype=np.float64), - lp.ArrayArg('b', dtype=np.float64)]) - In the above case, none of the instructions undergo type-specialization, as - all the arguments' types have been realized. But, this would be a problem - during the code generation phase as ``sin`` did not undergo type - specialization, and hence must be fixed through this function. 
- """ - from loopy.type_inference import TypeInferenceMapper - from loopy.symbolic import SubstitutionRuleExpander - from loopy.kernel.function_interface import ( - register_pymbolic_calls_to_knl_callables) - - unready_functions_collector = FunctionsNotReadyForCodegenCollector(kernel) - subst_expander = SubstitutionRuleExpander(kernel.substitutions) - type_inf_mapper = TypeInferenceMapper(kernel) +def infer_arg_descr(program): + root_kernel_callable = program.program_callables_info[program.name] + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + root_kernel = program.root_kernel - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - expr = subst_expander(insn.expression) - if not unready_functions_collector(expr): - # Infer the type of the functions that are not type specialized. - type_inf_mapper(expr, return_tuple=isinstance(insn, - CallInstruction), return_dtype_set=True) - - elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): - pass + new_root_kernel, program_callables_info = infer_arg_descr_from_root_kernel( + root_kernel, program_callables_info) + new_root_kernel_callable = root_kernel_callable.copy( + subkernel=new_root_kernel) + program_callables_info.with_callable(program.name, + new_root_kernel_callable) - else: - NotImplementedError("Unknown Instruction") + program_callables_info = program_callables_info.with_exit_edit_callables_mode() - return register_pymbolic_calls_to_knl_callables(kernel, - type_inf_mapper.specialized_functions) + return program.copy(program_callables_info=program_callables_info) # }}} @@ -2426,7 +2342,8 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): # - realize_reduction must happen after default dependencies are added # because it manipulates the depends_on field, which could prevent # defaults from being applied. - kernel = realize_reduction(kernel, unknown_types_ok=False) + kernel = realize_reduction(kernel, program_callables_info, + unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators @@ -2436,10 +2353,6 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): kernel = find_temporary_address_space(kernel) - # inferring the shape and dim_tags of the arguments involved in a function - # call. - kernel, program_callables_info = infer_arg_descr(kernel, program_callables_info) - # boostability should be removed in 2017.x. kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) @@ -2472,11 +2385,12 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): if CACHING_ENABLED: preprocess_cache.store_if_not_present(input_kernel, kernel) - return kernel, program_callables_info + return kernel def preprocess_kernel(kernel, device=None): # FIXME: error message? 
+ # FIXME: do we assume that we should give out a program or a kernel from loopy.program import make_program_from_kernel program = make_program_from_kernel(kernel) return preprocess_program(program, device) @@ -2491,31 +2405,28 @@ def preprocess_program(program, device=None): program = infer_unknown_types(program, expect_completion=False) - root_kernel_callable = program.program_callables_info[program.name] - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) - root_kernel, program_callables_info = preprocess_single_kernel( - program.root_kernel, program_callables_info, device) - processed_root_knl_callable = root_kernel_callable.copy(subkernel=root_kernel) - program_callables_info, _ = ( - program_callables_info.with_callable( - program.root_kernel_name, - processed_root_knl_callable)) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) + # {{{ preprocess the root kernel + + root_kernel = preprocess_single_kernel( + program.root_kernel, program.program_callables_info, device) + program = program.with_root_kernel(root_kernel) - semi_preprocessed_program = ( - program.copy(program_callables_info=program_callables_info)) + # }}} + + # infer arg descrs of the callables + program = infer_arg_descr(program) + + # {{{ hw axes inference # FIXME: think of wrapping this in a function? - local_size, global_size = semi_preprocessed_program.get_grid_size_upper_bounds() + local_size, global_size = program.get_grid_size_upper_bounds() resolved_function_with_hw_axes_sizes_set = {} for func_id, in_knl_callable in ( - semi_preprocessed_program.program_callables_info.items()): - if func_id == semi_preprocessed_program.name: + program.program_callables_info.items()): + if func_id == program.name: resolved_function_with_hw_axes_sizes_set[func_id] = ( in_knl_callable) else: @@ -2523,10 +2434,14 @@ def preprocess_program(program, device=None): in_knl_callable.with_hw_axes_sizes(local_size, global_size)) new_program_callables_info = ( - semi_preprocessed_program.program_callables_info.copy( + program.program_callables_info.copy( resolved_functions=resolved_function_with_hw_axes_sizes_set)) - return program.copy(program_callables_info=new_program_callables_info) + program = program.copy(program_callables_info=new_program_callables_info) + + # }}} + + return program # vim: foldmethod=marker diff --git a/loopy/type_inference.py b/loopy/type_inference.py index dcbb168fe..51af1d7b0 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -879,8 +879,8 @@ def infer_unknown_types(program, expect_completion=False): # {{{ reduction expression helper def infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel) + kernel, expr, program_callables_info, unknown_types_ok): + type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info) import loopy as lp if expr.is_tuple_typed: @@ -911,7 +911,8 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( if dt is not lp.auto else dt for dt in reduction_dtypes) - return tuple(arg_dtypes), reduction_dtypes + return tuple(arg_dtypes), reduction_dtypes, ( + type_inf_mapper.program_callables_info) # }}} -- GitLab From d1b33354f725bad1641967b662f18b7214d496d3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 16:54:39 -0500 Subject: [PATCH 273/916] adds kwargs option to map_resolved_function --- loopy/symbolic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/loopy/symbolic.py b/loopy/symbolic.py index 9f336f565..e800599d1 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -108,12 +108,12 @@ class IdentityMapperMixin(object): def map_type_annotation(self, expr, *args): return type(expr)(expr.type, self.rec(expr.child, *args)) - def map_sub_array_ref(self, expr, *args): - return SubArrayRef(self.rec(expr.swept_inames, *args), - self.rec(expr.subscript, *args)) + def map_sub_array_ref(self, expr, *args, **kwargs): + return SubArrayRef(self.rec(expr.swept_inames, *args, **kwargs), + self.rec(expr.subscript, *args, **kwargs)) - def map_resolved_function(self, expr, *args): - return ResolvedFunction(self.rec(expr.function, *args)) + def map_resolved_function(self, expr, *args, **kwargs): + return ResolvedFunction(self.rec(expr.function, *args, **kwargs)) map_type_cast = map_type_annotation -- GitLab From 4e840cdbfd74193012d6458b5aa26474e1d02c73 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 17:04:48 -0500 Subject: [PATCH 274/916] successful_tests+=3 --- test/test_loopy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 21722b885..ac5ebc2af 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -484,7 +484,7 @@ def test_arg_shape_guessing(ctx_factory): assumptions="n>=1") print(knl) - print(lp.generate_code_2(knl)) + print(lp.generate_code_v2(knl)) def test_arg_guessing(ctx_factory): @@ -503,7 +503,6 @@ def test_arg_guessing(ctx_factory): def test_arg_guessing_with_reduction(ctx_factory): #logging.basicConfig(level=logging.DEBUG) - ctx = ctx_factory() knl = lp.make_kernel( "{[i,j]: 0<=i,j Date: Mon, 30 Jul 2018 17:19:36 -0500 Subject: [PATCH 275/916] correction to include program_callables_info in pre_codegen_checks. 
--- loopy/check.py | 2 +- loopy/target/pyopencl.py | 9 +++++---- test/test_loopy.py | 9 +++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 53275d2a2..8e41e6976 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1000,7 +1000,7 @@ def pre_codegen_checks(kernel, program_callables_info): check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel) + kernel.target.pre_codegen_check(kernel, program_callables_info) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index ab37665d0..03ba26930 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -134,7 +134,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, device): +def check_sizes(kernel, program_callables_info, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -151,7 +151,8 @@ def check_sizes(kernel, device): if isinstance(arg, lp.ValueArg) and arg.approximately is not None: parameters[arg.name] = arg.approximately - glens, llens = kernel.get_grid_size_upper_bounds_as_exprs() + glens, llens = ( + kernel.get_grid_size_upper_bounds_as_exprs(program_callables_info)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -396,8 +397,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel): - check_sizes(kernel, self.device) + def pre_codegen_check(self, kernel, program_callables_info): + check_sizes(kernel, program_callables_info, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) diff --git a/test/test_loopy.py b/test/test_loopy.py index ac5ebc2af..1acf53681 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -521,7 +521,6 @@ def test_arg_guessing_with_reduction(ctx_factory): def test_unknown_arg_shape(ctx_factory): ctx = ctx_factory() from loopy.target.pyopencl import PyOpenCLTarget - from loopy.compiled import CompiledKernel bsize = [256, 0] knl = lp.make_kernel( @@ -537,11 +536,13 @@ def test_unknown_arg_shape(ctx_factory): """, seq_dependencies=True, name="uniform_l", - target=PyOpenCLTarget(), + target=PyOpenCLTarget(ctx.devices[0]), assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) - knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset()) # noqa + prog = lp.make_program_from_kernel(knl) + prog = lp.add_and_infer_dtypes(prog, dict(a=np.float32)) + + lp.generate_code_v2(prog) # }}} -- GitLab From d886ce6a31d2d3aea609d93ff69eaa5b8222abdd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 17:30:58 -0500 Subject: [PATCH 276/916] successful_tests++ --- test/test_loopy.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1acf53681..25c91c010 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -581,8 +581,6 @@ def test_offsets_and_slicing(ctx_factory): knl = lp.tag_data_axes(knl, "a,b", "stride:auto,stride:1") - cknl = lp.CompiledKernel(ctx, knl) - a_full = cl.clrandom.rand(queue, (n, n), np.float64) a_full_h = a_full.get() b_full = cl.clrandom.rand(queue, (n, n), 
np.float64) @@ -596,8 +594,7 @@ def test_offsets_and_slicing(ctx_factory): b_full_h[b_sub] = 2*a_full_h[a_sub] - print(cknl.get_highlighted_code({"a": a.dtype})) - cknl(queue, a=a, b=b) + evt, (out, ) = knl(queue, a=a, b=b) import numpy.linalg as la assert la.norm(b_full.get() - b_full_h) < 1e-13 -- GitLab From 4cf2042d5cbac6a495858950bb9776df484cbc7d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Jul 2018 22:29:37 -0500 Subject: [PATCH 277/916] pass more tests. --- loopy/kernel/tools.py | 13 ++++++++----- loopy/transform/data.py | 36 ++++++++++++++++++++++++++++++++--- loopy/transform/iname.py | 32 ++++++++++++++++++++++++++++++- loopy/transform/precompute.py | 8 ++++---- test/test_loopy.py | 15 ++++++--------- 5 files changed, 82 insertions(+), 22 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 3395e876f..bb9703e9c 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -753,7 +753,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, axis=0, local_size=None): +def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -767,7 +767,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - ignore_auto=True) + program_callables_info, ignore_auto=True) # {{{ axis assignment helper function @@ -834,17 +834,19 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: - from loopy import split_iname, untag_inames + from loopy import untag_inames + from loopy.transform.iname import split_iname_for_single_kernel # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. return assign_automatic_axes( - split_iname( + split_iname_for_single_kernel( untag_inames(kernel, iname, AutoLocalIndexTagBase), iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), + program_callables_info=program_callables_info, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -934,7 +936,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if axis >= len(local_size): return kernel else: - return assign_automatic_axes(kernel, axis=axis+1, + return assign_automatic_axes(kernel, + program_callables_info=program_callables_info, axis=axis+1, local_size=local_size) # }}} diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 5b1ee6cca..8ed4cbc91 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -30,6 +30,9 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable # {{{ convenience: add_prefetch @@ -140,7 +143,8 @@ class _not_provided: # noqa: N801 pass -def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, +def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, + sweep_inames=[], dim_arg_names=None, # "None" is a valid value here, distinct from the default. 
default_tag=_not_provided, @@ -239,6 +243,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, This function internally uses :func:`extract_subst` and :func:`precompute`. """ + assert isinstance(kernel, LoopKernel) # {{{ fish indexing out of var_name and into footprint_subscripts @@ -329,8 +334,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, # warning message. from loopy.transform.precompute import precompute - new_kernel = precompute(kernel, subst_use, sweep_inames, - precompute_inames=dim_arg_names, + new_kernel = precompute(kernel, program_callables_info, subst_use, + sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, @@ -363,6 +368,31 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, else: return new_kernel + +def add_prefetch(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = add_prefetch_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 2b618a464..72330c2df 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -34,6 +34,10 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + __doc__ = """ .. currentmodule:: loopy @@ -306,7 +310,7 @@ def _split_iname_backend(kernel, split_iname, # {{{ split iname -def split_iname(kernel, split_iname, inner_length, +def split_iname_for_single_kernel(kernel, split_iname, inner_length, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, slabs=(0, 0), do_tagged_check=True, @@ -331,6 +335,8 @@ def split_iname(kernel, split_iname, inner_length, :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. """ + assert isinstance(kernel, LoopKernel) + def make_new_loop_index(inner, outer): return inner + outer*inner_length @@ -342,6 +348,30 @@ def split_iname(kernel, split_iname, inner_length, slabs=slabs, do_tagged_check=do_tagged_check, within=within) + +def split_iname(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = split_iname_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 52d568975..e3153fe24 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -258,9 +258,9 @@ class _not_provided(object): # noqa: N801 pass -def precompute(kernel, subst_use, sweep_inames=[], within=None, - storage_axes=None, temporary_name=None, precompute_inames=None, - precompute_outer_inames=None, +def precompute(kernel, program_callables_info, subst_use, sweep_inames=[], + within=None, storage_axes=None, temporary_name=None, + precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, # "None" is a valid value here, distinct from the default. @@ -1044,7 +1044,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, program_callables_info) return kernel diff --git a/test/test_loopy.py b/test/test_loopy.py index 25c91c010..0849eba9b 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -601,8 +601,6 @@ def test_offsets_and_slicing(ctx_factory): def test_vector_ilp_with_prefetch(ctx_factory): - ctx = ctx_factory() - knl = lp.make_kernel( "{ [i]: 0<=i Date: Tue, 31 Jul 2018 12:40:50 -0500 Subject: [PATCH 278/916] the hunt restarts :) --- loopy/preprocess.py | 3 ++- loopy/transform/iname.py | 31 +++++++++++++++++++++++++++++-- loopy/transform/precompute.py | 4 ++-- test/test_loopy.py | 16 ++++++++++------ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 6db16d110..0bd3076c5 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1952,7 +1952,8 @@ def realize_reduction(kernel, program_callables_info, insn_id_filter=None, kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - kernel = lp.tag_inames(kernel, new_iname_tags) + from loopy.transform.iname import tag_inames_for_single_kernel + kernel = tag_inames_for_single_kernel(kernel, new_iname_tags) # TODO: remove unused inames... diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 72330c2df..f4d1fdedb 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -303,7 +303,8 @@ def _split_iname_backend(kernel, split_iname, kernel = tag_inames(kernel, {outer_iname: existing_tag, inner_iname: existing_tag}) - return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag}) + return tag_inames_for_single_kernel(kernel, {outer_iname: outer_tag, + inner_iname: inner_tag}) # }}} @@ -655,7 +656,8 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): +def tag_inames_for_single_kernel(kernel, iname_to_tag, force=False, + ignore_nonexistent=False): """Tag an iname :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. 
*new_tag* is given @@ -777,6 +779,31 @@ def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): return kernel.copy(iname_to_tags=knl_iname_to_tags) + +def tag_inames(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = tag_inames_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index e3153fe24..2af3c04b7 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -1037,8 +1037,8 @@ def precompute(kernel, program_callables_info, subst_use, sweep_inames=[], # }}} - from loopy import tag_inames - kernel = tag_inames(kernel, new_iname_to_tag) + from loopy.transform.iname import tag_inames_for_single_kernel + kernel = tag_inames_for_single_kernel(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type diff --git a/test/test_loopy.py b/test/test_loopy.py index 0849eba9b..e4cff5b7f 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -74,9 +74,11 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory): knl = lp.fix_parameters(knl, n=16) knl = lp.add_barrier(knl, "id:first", "id:second") - knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.0") - knl = lp.split_iname(knl, "ii", 2, outer_tag="g.0", inner_tag="l.0") - evt, (out,) = knl(queue, a=a) + prog = lp.make_program_from_kernel(knl) + + prog = lp.split_iname(prog, "i", 2, outer_tag="g.0", inner_tag="l.0") + prog = lp.split_iname(prog, "ii", 2, outer_tag="g.0", inner_tag="l.0") + evt, (out,) = prog(queue, a=a) assert np.linalg.norm(out-((2*(a+cnst)+cnst))) <= 1e-15 @@ -233,10 +235,12 @@ def test_multi_cse(ctx_factory): [lp.GlobalArg("a", np.float32, shape=(100,))], local_sizes={0: 16}) - knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") - knl = lp.add_prefetch(knl, "a", []) + prog = lp.make_program_from_kernel(knl) + + prog = lp.split_iname(prog, "i", 16, inner_tag="l.0") + prog = lp.add_prefetch(prog, "a", []) - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) # {{{ code generator fuzzing -- GitLab From ffbac0d804d2cb79f48c3c7566cce2be73364fbc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 31 Jul 2018 15:51:36 -0500 Subject: [PATCH 279/916] more test passes. 
--- loopy/__init__.py | 5 -- loopy/auto_test.py | 13 ++- loopy/kernel/creation.py | 4 +- loopy/kernel/function_interface.py | 3 + loopy/kernel/tools.py | 8 +- loopy/library/function.py | 44 ++------- loopy/library/random123.py | 18 ++-- loopy/library/reduction.py | 4 +- loopy/transform/add_barrier.py | 34 ++++++- loopy/transform/data.py | 30 ++++++- loopy/transform/iname.py | 31 ++++++- loopy/transform/parameter.py | 31 ++++++- loopy/type_inference.py | 4 +- test/test_loopy.py | 137 +++++++++++++++++------------ 14 files changed, 240 insertions(+), 126 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 057657101..bfc616400 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -33,9 +33,6 @@ from loopy.diagnostic import LoopyError, LoopyWarning # {{{ imported user interface -from loopy.library.function import ( - default_function_mangler, single_arg_function_mangler) - from loopy.kernel.instruction import ( MemoryOrdering, memory_ordering, MemoryScope, memory_scope, @@ -188,8 +185,6 @@ __all__ = [ "SubstitutionRule", "CallMangleInfo", - "default_function_mangler", "single_arg_function_mangler", - "make_kernel", "UniqueName", "register_reduction_parser", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index fce9c6492..884bd946b 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -29,7 +29,9 @@ from warnings import warn import numpy as np import loopy as lp + from loopy.diagnostic import LoopyError, AutomaticTestFailure +from loopy.kernel import LoopKernel AUTO_TEST_SKIP_RUN = False @@ -387,8 +389,15 @@ def auto_test_vs_ref( test_knl = ref_knl do_check = False - ref_prog = lp.make_program_from_kernel(ref_knl) - test_prog = lp.make_program_from_kernel(test_knl) + if isinstance(ref_knl, LoopKernel): + ref_prog = lp.make_program_from_kernel(ref_knl) + else: + ref_prog = ref_knl + + if isinstance(test_knl, LoopKernel): + test_prog = lp.make_program_from_kernel(test_knl) + else: + test_prog = test_knl if len(ref_prog.args) != len(test_prog.args): raise LoopyError("ref_prog and test_prog do not have the same number " diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 22bdf5f84..f0e73bee9 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2318,8 +2318,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # NOTE: add_inferred_inames will be phased out and throws warnings if it # does something. 
knl = add_inferred_inames(knl) - from loopy.transform.parameter import fix_parameters - knl = fix_parameters(knl, **fixed_parameters) + from loopy.transform.parameter import fix_parameters_for_single_kernel + knl = fix_parameters_for_single_kernel(knl, **fixed_parameters) # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index b66b865e8..71324c85d 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -36,6 +36,8 @@ from loopy.symbolic import parse_tagged_name from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext, RuleAwareIdentityMapper, SubstitutionRuleExpander) +from loopy.kernel import LoopKernel + # {{{ argument descriptors @@ -492,6 +494,7 @@ class CallableKernel(InKernelCallable): def __init__(self, subkernel, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): + assert isinstance(subkernel, LoopKernel) super(CallableKernel, self).__init__( arg_id_to_dtype=arg_id_to_dtype, diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index bb9703e9c..4420dbe4a 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -36,6 +36,7 @@ from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg from loopy.tools import natsorted +from loopy.program import Program import logging logger = logging.getLogger(__name__) @@ -113,7 +114,8 @@ def get_arguments_with_incomplete_dtype(knl): if arg.dtype is None] -def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False): +def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): + assert isinstance(prog, Program) processed_dtype_dict = {} for k, v in six.iteritems(dtype_dict): @@ -122,10 +124,10 @@ def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False): if subkey: processed_dtype_dict[subkey] = v - knl = add_dtypes(knl, processed_dtype_dict) + prog = add_dtypes(prog, processed_dtype_dict) from loopy.type_inference import infer_unknown_types - return infer_unknown_types(knl, expect_completion=expect_completion) + return infer_unknown_types(prog, expect_completion=expect_completion) def _add_and_infer_dtypes_overdetermined(knl, dtype_dict): diff --git a/loopy/library/function.py b/loopy/library/function.py index 4873eca91..50bde1744 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -25,48 +25,15 @@ THE SOFTWARE. 
from loopy.kernel.function_interface import ScalarCallable -def default_function_mangler(kernel, name, arg_dtypes): - from loopy.library.reduction import reduction_function_mangler - - manglers = [reduction_function_mangler, tuple_function_mangler] - for mangler in manglers: - result = mangler(kernel, name, arg_dtypes) - if result is not None: - return result - - return None - - -def single_arg_function_mangler(kernel, name, arg_dtypes): - if len(arg_dtypes) == 1: - dtype, = arg_dtypes - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo(name, (dtype,), (dtype,)) - - return None - - -def tuple_function_mangler(kernel, name, arg_dtypes): - if name == "make_tuple": - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="loopy_make_tuple", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - - return None - - class MakeTupleCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): new_arg_id_to_dtype = arg_id_to_dtype.copy() for i in range(len(arg_id_to_dtype)): if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] - return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target="loopy_make_tuple") + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple"), program_callables_info) def with_descrs(self, arg_id_to_descr): from loopy.kernel.function_interface import ValueArgDescriptor @@ -77,11 +44,12 @@ class MakeTupleCallable(ScalarCallable): class IndexOfCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): new_arg_id_to_dtype = arg_id_to_dtype.copy() new_arg_id_to_dtype[-1] = kernel.index_dtype - return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), + program_callables_info) def loopy_specific_callable_scopers(target, identifier): diff --git a/loopy/library/random123.py b/loopy/library/random123.py index a2880bfb8..d172408d8 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -169,13 +169,14 @@ class Random123Callable(ScalarCallable): Records information about for the random123 functions. 
""" - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return (self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) name = self.name target = kernel.target @@ -191,8 +192,10 @@ class Random123Callable(ScalarCallable): if name == fn: new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: key_dtype} - return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=fn+"_gen") + return ( + self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=fn+"_gen"), + program_callables_info) elif name == fn + "_f32": new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), @@ -200,7 +203,7 @@ class Random123Callable(ScalarCallable): -2: ctr_dtype, 0: ctr_dtype, 1: key_dtype} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name) + name_in_target=name), program_callables_info elif name == fn + "_f64": new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), @@ -208,9 +211,10 @@ class Random123Callable(ScalarCallable): -2: ctr_dtype, 0: ctr_dtype, 1: key_dtype} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name) + name_in_target=name), program_callables_info - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return (self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) def generate_preambles(self, target): rng_variant = FUNC_NAMES_TO_RNG[self.name] diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 503b76988..538125af1 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -401,7 +401,7 @@ def parse_reduction_op(name): # {{{ reduction specific callables class ReductionCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, @@ -412,7 +412,7 @@ class ReductionCallable(ScalarCallable): name_in_target = self.name.prefix(scalar_dtype, index_dtype) + "_op" return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name_in_target) + name_in_target=name_in_target), program_callables_info def with_descr(self, arg_id_to_descr): from loopy.library.kernel.function_interface import ValueArgDescriptor diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index cfbbd56e9..b6dddad38 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -26,6 +26,9 @@ THE SOFTWARE. from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable __doc__ = """ .. 
currentmodule:: loopy @@ -36,8 +39,9 @@ __doc__ = """ # {{{ add_barrier -def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, - tags=None, synchronization_kind="global", mem_kind=None): +def add_barrier_for_single_kernel(knl, insn_before="", insn_after="", + id_based_on=None, tags=None, synchronization_kind="global", + mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel which has a barrier inserted into it. It takes input of 2 instructions and then adds a barrier in between those 2 instructions. The expressions can @@ -55,6 +59,8 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, for "global" bariers. If not supplied, defaults to :arg:`synchronization_kind` """ + assert isinstance(knl, LoopKernel) + if mem_kind is None: mem_kind = synchronization_kind @@ -82,6 +88,30 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, return new_knl + +def add_barrier(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = add_barrier_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} # vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 8ed4cbc91..596daf3ee 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -415,7 +415,7 @@ def change_arg_to_image(knl, name): # {{{ tag array axes -def tag_array_axes(knl, ary_names, dim_tags): +def tag_array_axes_for_single_kernel(knl, ary_names, dim_tags): """ .. versionchanged:: 2016.2 @@ -444,7 +444,33 @@ def tag_array_axes(knl, ary_names, dim_tags): return knl -tag_data_axes = MovedFunctionDeprecationWrapper(tag_array_axes) +tag_data_axes = ( + MovedFunctionDeprecationWrapper(tag_array_axes_for_single_kernel)) + + +def tag_array_axes(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = tag_array_axes_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index f4d1fdedb..6d69a8a1d 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -97,7 +97,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) -def prioritize_loops(kernel, loop_priority): +def prioritize_loops_for_single_kernel(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. If the kernel logically requires a different nesting, priority is ignored. @@ -111,6 +111,8 @@ def prioritize_loops(kernel, loop_priority): :arg: an iterable of inames, or, for brevity, a comma-separated string of inames """ + + assert isinstance(kernel, LoopKernel) if isinstance(loop_priority, str): loop_priority = tuple(s.strip() for s in loop_priority.split(",") if s.strip()) @@ -118,6 +120,30 @@ def prioritize_loops(kernel, loop_priority): return kernel.copy(loop_priority=kernel.loop_priority.union([loop_priority])) + +def prioritize_loops(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = prioritize_loops_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} @@ -787,8 +813,7 @@ def tag_inames(program, *args, **kwargs): for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = tag_inames_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, - *args, **kwargs) + in_knl_callable.subkernel, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index fc5dad91d..4b95d2a7b 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -28,6 +28,10 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + __doc__ = """ .. currentmodule:: loopy @@ -134,19 +138,44 @@ def _fix_parameter(kernel, name, value): )) -def fix_parameters(kernel, **value_dict): +def fix_parameters_for_single_kernel(kernel, **value_dict): """Fix the values of the arguments to specific constants. *value_dict* consists of *name*/*value* pairs, where *name* will be fixed to be *value*. *name* may refer to :ref:`domain-parameters` or :ref:`arguments`. 
""" + assert isinstance(kernel, LoopKernel) for name, value in six.iteritems(value_dict): kernel = _fix_parameter(kernel, name, value) return kernel + +def fix_parameters(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = fix_parameters_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} # vim: foldmethod=marker diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 51af1d7b0..c899f9f6c 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -35,6 +35,7 @@ from loopy.diagnostic import ( TypeInferenceFailure, DependencyTypeInferenceFailure) from loopy.kernel.instruction import _DataObliviousInstruction +from loopy.program import ProgramCallablesInfo import logging logger = logging.getLogger(__name__) @@ -71,6 +72,7 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel + assert isinstance(program_callables_info, ProgramCallablesInfo) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments @@ -116,7 +118,7 @@ class TypeInferenceMapper(CombineMapper): def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, new_ass) + return type(self)(self.kernel, self.program_callables_info, new_ass) @staticmethod def combine(dtype_sets): diff --git a/test/test_loopy.py b/test/test_loopy.py index e4cff5b7f..5a92e7dbe 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -71,11 +71,12 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory): 'cnst', shape=('n'), initializer=cnst, scope=lp.AddressSpace.GLOBAL, read_only=True), '...']) - knl = lp.fix_parameters(knl, n=16) - knl = lp.add_barrier(knl, "id:first", "id:second") prog = lp.make_program_from_kernel(knl) + prog = lp.fix_parameters(prog, n=16) + prog = lp.add_barrier(prog, "id:first", "id:second") + prog = lp.split_iname(prog, "i", 2, outer_tag="g.0", inner_tag="l.0") prog = lp.split_iname(prog, "ii", 2, outer_tag="g.0", inner_tag="l.0") evt, (out,) = prog(queue, a=a) @@ -200,13 +201,15 @@ def test_owed_barriers(ctx_factory): [ " z[i] = a[i]" ], - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) - knl = lp.tag_inames(knl, dict(i="l.0")) + prog = lp.make_program_from_kernel(knl) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - print(lp.generate_code_v2(knl).device_code()) + prog = lp.tag_inames(prog, dict(i="l.0")) + + print(lp.generate_code_v2(prog).device_code()) def test_wg_too_small(ctx_factory): @@ -218,11 +221,13 @@ def test_wg_too_small(ctx_factory): [lp.GlobalArg("a", np.float32, shape=(100,))], local_sizes={0: 16}) - knl = lp.tag_inames(knl, dict(i="l.0")) + prog = lp.make_program_from_kernel(knl) + + prog = lp.tag_inames(prog, dict(i="l.0")) import pytest with pytest.raises(RuntimeError): - 
lp.generate_code_v2(knl) + lp.generate_code_v2(prog) def test_multi_cse(ctx_factory): @@ -397,13 +402,15 @@ def test_ilp_write_race_detection_global(ctx_factory): ], assumptions="n>=1") - knl = lp.tag_inames(knl, dict(j="ilp")) + prog = lp.make_program_from_kernel(knl) + + prog = lp.tag_inames(prog, dict(j="ilp")) with lp.CacheMode(False): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) @@ -417,10 +424,11 @@ def test_ilp_write_race_avoidance_local(ctx_factory): [ "<> a[i] = 5+i+j", ], + []) - knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) prog = lp.make_program_from_kernel(knl) + prog = lp.tag_inames(prog, dict(i="l.0", j="ilp")) prog = lp.preprocess_program(prog, ctx.devices[0]) assert prog.root_kernel.temporary_variables['a'].shape == (16, 17) @@ -436,9 +444,8 @@ def test_ilp_write_race_avoidance_private(ctx_factory): ], []) - knl = lp.tag_inames(knl, dict(j="ilp")) - prog = lp.make_program_from_kernel(knl) + prog = lp.tag_inames(prog, dict(j="ilp")) prog = lp.preprocess_program(prog, ctx.devices[0]) assert prog.root_kernel.temporary_variables['a'].shape == (16,) @@ -563,10 +570,11 @@ def test_nonlinear_index(ctx_factory): lp.GlobalArg("a", shape="n"), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_offsets_and_slicing(ctx_factory): @@ -644,12 +652,14 @@ def test_c_instruction(ctx_factory): lp.TemporaryVariable("x", np.float32), "...", ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) + prog = lp.make_program_from_kernel(knl) - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + prog = lp.split_iname(prog, "i", 128, outer_tag="g.0", inner_tag="l.0") - print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(prog) + print(lp.generate_code_v2(prog)) def test_dependent_domain_insn_iname_finding(ctx_factory): @@ -670,19 +680,21 @@ def test_dependent_domain_insn_iname_finding(ctx_factory): None, shape=None), lp.GlobalArg("strengths", None, shape="nsources"), - "..."]) + "..."], + target=lp.PyOpenCLTarget(ctx.devices[0])) - print(knl) assert "isrc_box" in knl.insn_inames("set_strength") - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + prog = lp.make_program_from_kernel(knl) + prog = lp.add_dtypes(prog, dict( source_boxes=np.int32, box_source_starts=np.int32, box_source_counts_nonchild=np.int32, strengths=np.float64, nsources=np.int32, - ))) + )) + + print(prog) + print(lp.generate_code_v2(prog).device_code()) def test_inames_deps_from_write_subscript(ctx_factory): @@ -713,14 +725,15 @@ def test_modulo_indexing(ctx_factory): [ lp.GlobalArg("a", None, shape="n"), "..." - ] + ], target=lp.PyOpenCLTarget(ctx.devices[0]) ) - print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + prog = lp.make_program_from_kernel(knl) + print(prog) + prog = lp.add_dtypes(prog, dict( a=np.float32, - ))) + )) + print(lp.generate_code_v2(prog).device_code()) @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) @@ -735,17 +748,18 @@ def test_vector_types(ctx_factory, vec_len): lp.GlobalArg("out", np.float32, shape=lp.auto), "..." 
]) + prog = lp.make_program_from_kernel(knl) - knl = lp.fix_parameters(knl, vec_len=vec_len) + prog = lp.fix_parameters(prog, vec_len=vec_len) - ref_knl = knl + ref_prog = prog - knl = lp.tag_data_axes(knl, "out", "c,vec") - knl = lp.tag_inames(knl, dict(j="unr")) + prog = lp.tag_array_axes(prog, "out", "c,vec") + prog = lp.tag_inames(prog, dict(j="unr")) - knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + prog = lp.split_iname(prog, "i", 128, outer_tag="g.0", inner_tag="l.0") - lp.auto_test_vs_ref(ref_knl, ctx, knl, + lp.auto_test_vs_ref(ref_prog, ctx, prog, parameters=dict( n=20000 )) @@ -798,10 +812,11 @@ def test_ilp_loop_bound(ctx_factory): ref_knl = knl - knl = lp.prioritize_loops(knl, "j,i,k") - knl = lp.split_iname(knl, "k", 4, inner_tag="ilp") + prog = lp.make_program_from_kernel(knl) + prog = lp.prioritize_loops(prog, "j,i,k") + prog = lp.split_iname(prog, "k", 4, inner_tag="ilp") - lp.auto_test_vs_ref(ref_knl, ctx, knl, + lp.auto_test_vs_ref(ref_knl, ctx, prog, parameters=dict( n=200 )) @@ -829,13 +844,15 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory): "a[i] = 2*a[i]", assumptions="n>=1") - ref_knl = knl + prog = lp.make_program_from_kernel(knl) + + ref_prog = prog for outer_tag in ["for", "g.0"]: - knl = ref_knl - knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr", + prog = ref_prog + prog = lp.split_iname(prog, "i", 4, slabs=(0, 1), inner_tag="unr", outer_tag=outer_tag) - knl = lp.prioritize_loops(knl, "i_outer") + prog = lp.prioritize_loops(prog, "i_outer") a = cl.array.empty(queue, 20, np.float32) a.fill(17) @@ -844,10 +861,10 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory): knl = lp.set_options(knl, write_cl=True) print("TEST-----------------------------------------") - knl(queue, a=a_knl) + prog(queue, a=a_knl) print("REF-----------------------------------------") - ref_knl(queue, a=a_ref) - print("DONE-----------------------------------------") + ref_prog(queue, a=a_ref) + print("DONE---------------------------l--------------") print("REF", a_ref) print("KNL", a_knl) @@ -867,12 +884,11 @@ def test_multiple_writes_to_local_temporary(): <> temp[i, 0] = 17 temp[i, 1] = 15 """) - knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) + prog = lp.make_program_from_kernel(knl) + prog = lp.tag_inames(prog, dict(i="l.0")) + + print(lp.generate_code_v2(prog).device_code()) def test_make_copy_kernel(ctx_factory): @@ -907,19 +923,23 @@ def test_auto_test_can_detect_problems(ctx_factory): a[i,j] = 25 """) + ref_prog = lp.make_program_from_kernel(ref_knl) + knl = lp.make_kernel( "{[i]: 0<=i Date: Wed, 1 Aug 2018 12:14:27 -0500 Subject: [PATCH 280/916] more changes to the interface. 
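This commit moves a number of kernel-level transformations behind Program-level entry points: each transform keeps a *_for_single_kernel variant that operates on a LoopKernel, while the public function walks program_callables_info, applies the single-kernel variant to the subkernel of every CallableKernel, and rebuilds the Program. A minimal sketch of that shared pattern follows; the helper name is hypothetical (the diffs below inline this loop into each wrapper), and it assumes Program, CallableKernel and ScalarCallable are imported from loopy.program and loopy.kernel.function_interface as in the diffs:

    def _transform_each_callable_kernel(program, transform, *args, **kwargs):
        # Apply `transform` to the subkernel of every CallableKernel,
        # leave ScalarCallables untouched, and rebuild the Program.
        assert isinstance(program, Program)

        new_resolved_functions = {}
        for func_id, in_knl_callable in program.program_callables_info.items():
            if isinstance(in_knl_callable, CallableKernel):
                new_subkernel = transform(
                        in_knl_callable.subkernel, *args, **kwargs)
                in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel)
            elif isinstance(in_knl_callable, ScalarCallable):
                pass
            else:
                raise NotImplementedError("Unknown type of callable %s." % (
                    type(in_knl_callable).__name__))

            new_resolved_functions[func_id] = in_knl_callable

        new_program_callables_info = program.program_callables_info.copy(
                resolved_functions=new_resolved_functions)
        return program.copy(program_callables_info=new_program_callables_info)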
--- loopy/__init__.py | 33 +++- loopy/kernel/creation.py | 7 +- loopy/kernel/tools.py | 6 +- loopy/preprocess.py | 4 +- loopy/program.py | 70 ++++++++ loopy/target/__init__.py | 2 +- loopy/target/execution.py | 7 +- loopy/target/ispc.py | 5 +- loopy/transform/data.py | 52 +++++- loopy/transform/fusion.py | 8 + loopy/transform/iname.py | 29 +++- loopy/transform/save.py | 27 +++- loopy/transform/subst.py | 30 +++- test/test_loopy.py | 331 ++++++++++++++++++-------------------- 14 files changed, 399 insertions(+), 212 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index bfc616400..a93ca0400 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -30,7 +30,6 @@ from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning - # {{{ imported user interface from loopy.kernel.instruction import ( @@ -49,7 +48,7 @@ from loopy.kernel.data import ( SubstitutionRule, CallMangleInfo) from loopy.kernel.function_interface import ( - ScalarCallable) + CallableKernel, ScalarCallable) from loopy.program import ( Program, make_program_from_kernel) @@ -313,6 +312,8 @@ def set_options_for_single_kernel(kernel, *args, **kwargs): See also :class:`Options`. """ + assert isinstance(kernel, LoopKernel) + if args and kwargs: raise TypeError("cannot pass both positional and keyword arguments") @@ -340,11 +341,27 @@ def set_options_for_single_kernel(kernel, *args, **kwargs): def set_options(program, *args, **kwargs): - if isinstance(program, LoopKernel): - return set_options_for_single_kernel(program, *args, **kwargs) - kernel = program.root_kernel - return program.with_root_kernel( - set_options_for_single_kernel(kernel, *args, **kwargs)) + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = set_options_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) # }}} @@ -457,7 +474,7 @@ class CacheMode(object): # {{{ make copy kernel def make_copy_kernel(new_dim_tags, old_dim_tags=None): - """Returns a :class:`LoopKernel` that changes the data layout + """Returns a :class:`loopy.Program` that changes the data layout of a variable (called "input") to the new layout specified by *new_dim_tags* from the one specified by *old_dim_tags*. 
*old_dim_tags* defaults to an all-C layout of the same rank diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f0e73bee9..60473cf1b 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1775,8 +1775,8 @@ def add_inferred_inames(knl): def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) - from loopy.transform.subst import expand_subst - expanded_kernel = expand_subst(kernel) + from loopy.transform.subst import expand_subst_for_single_kernel + expanded_kernel = expand_subst_for_single_kernel(kernel) writer_map = kernel.writer_map() @@ -2352,7 +2352,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): creation_plog.done() - return knl + from loopy.program import make_program_from_kernel + return make_program_from_kernel(knl) # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 4420dbe4a..cd2604227 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -186,8 +186,8 @@ def find_all_insn_inames(kernel): all_read_deps = {} all_write_deps = {} - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) + from loopy.transform.subst import expand_subst_for_single_kernel + kernel = expand_subst_for_single_kernel(kernel) for insn in kernel.instructions: all_read_deps[insn.id] = read_deps = insn.read_dependency_names() @@ -870,7 +870,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non del new_iname_to_tags[iname] return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), - axis=recursion_axis, local_size=local_size) + program_callables_info, axis=recursion_axis, local_size=local_size) # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 0bd3076c5..6d01469af 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2322,8 +2322,8 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): # }}} - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) + from loopy.transform.subst import expand_subst_for_single_kernel + kernel = expand_subst_for_single_kernel(kernel) # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. diff --git a/loopy/program.py b/loopy/program.py index 08efc0e89..23697e365 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -32,6 +32,7 @@ from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.diagnostic import LoopyError +from pymbolic import var class FunctionResolver(RuleAwareIdentityMapper): @@ -526,6 +527,75 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_hit_during_editing={}, renames_needed_after_editing={}) + def merge_program(self, program2): + # FIXME: this is not correct and should not be touched till then. + 1/0 + # rename the callables in program2 to see no clash between the 2. 
+ renames_needed_in_program2 = {} + + for old_func_id in program2.program_callables_info: + if old_func_id == program2.name: + # dont rename the root kernel + renames_needed_in_program2[old_func_id] = ( + old_func_id) + continue + unique_function_identifier = old_func_id + while unique_function_identifier in self.resolved_functions or ( + unique_function_identifier in + renames_needed_in_program2.values()): + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + renames_needed_in_program2[old_func_id] = ( + unique_function_identifier) + + # rename ALL the callables in program2 + new_prog2_resolved_functions = {} + new_prog2_num_times_callables_called = {} + + for func_id, in_knl_callable in program2.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + old_subkernel = in_knl_callable.subkernel + new_subkernel = rename_resolved_functions_in_a_single_kernel( + old_subkernel, renames_needed_in_program2) + in_knl_callable = ( + in_knl_callable.copy(subkernel=new_subkernel)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." % + type(in_knl_callable).__name__) + + new_func_id = renames_needed_in_program2[func_id] + new_prog2_resolved_functions[new_func_id] = ( + in_knl_callable) + new_prog2_num_times_callables_called[new_func_id] = ( + program2.program_callables_info.num_times_callables_called[ + func_id]) + + new_prog1_callables_info = self.with_edit_callables_mode() + # TODO: there maybe a case of trouble when merging the kernel being + # called from *self*, that's improbable, but can be fixed with a + # condition. + for old_func_id, in_knl_callable_in_prog2 in ( + new_prog2_resolved_functions.items()): + for i in range( + new_prog2_num_times_callables_called[old_func_id]): + new_prog1_callables_info, new_func_id = ( + new_prog1_callables_info.with_callable( + var(old_func_id), in_knl_callable_in_prog2)) + + # FIXME: perform all the edits on + merged_prog_callables_info = ( + new_prog1_callables_info.with_exit_edit_callables_mode()) + new_merged_resolved_functions = ( + merged_prog_callables_info.resolved_functions.copy()) + new_subkernel = new_merged_resolved_functions.pop( + program2.name).subkernel + new_merged_prog_callables_info = merged_prog_callables_info.copy( + resolved_functions=new_merged_resolved_functions) + return new_merged_prog_callables_info, new_subkernel + def __getitem__(self, item): return self.resolved_functions[item] diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 9733fa446..e3b4853c3 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -80,7 +80,7 @@ class TargetBase(object): def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel): + def pre_codegen_check(self, kernel, program_callables_info): pass # }}} diff --git a/loopy/target/execution.py b/loopy/target/execution.py index b61c29a51..7eda33fa5 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -61,12 +61,12 @@ class SeparateArrayPackingController(object): It also repacks outgoing arrays of this type back into an object array. 
""" - def __init__(self, kernel): + def __init__(self, program): # map from arg name self.packing_info = {} from loopy.kernel.array import ArrayBase - for arg in kernel.args: + for arg in program.args: if not isinstance(arg, ArrayBase): continue @@ -82,7 +82,8 @@ class SeparateArrayPackingController(object): name=arg.name, sep_shape=arg.sep_shape(), subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) + is_written=arg.name in + program.root_kernel.get_written_variables()) def unpack(self, kernel_kwargs): if not self.packing_info: diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 0464270a3..539631833 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -172,8 +172,9 @@ class ISPCTarget(CTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel): - gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + def pre_codegen_check(self, kernel, program_callables_info): + gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( + program_callables_info) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 596daf3ee..95e2fec8e 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -549,7 +549,7 @@ def remove_unused_arguments(knl): # {{{ alias_temporaries -def alias_temporaries(knl, names, base_name_prefix=None, +def alias_temporaries_for_single_kernel(knl, names, base_name_prefix=None, synchronize_for_exclusive_use=True): """Sets all temporaries given by *names* to be backed by a single piece of storage. @@ -628,6 +628,30 @@ def alias_temporaries(knl, names, base_name_prefix=None, instructions=new_insns, temporary_variables=new_temporary_variables) + +def alias_temporaries(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = alias_temporaries_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} @@ -711,7 +735,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): # {{{ set temporary scope -def set_temporary_scope(kernel, temp_var_names, scope): +def set_temporary_scope_for_single_kernel(kernel, temp_var_names, scope): """ :arg temp_var_names: a container with membership checking, or a comma-separated string of variables for which the @@ -747,6 +771,30 @@ def set_temporary_scope(kernel, temp_var_names, scope): return kernel.copy(temporary_variables=new_temp_vars) + +def set_temporary_scope(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = set_temporary_scope_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 49e30a751..7bd03c1de 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -31,6 +31,8 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from pymbolic import var +from loopy.kernel import LoopKernel + def _apply_renames_in_exprs(kernel, var_renames): from loopy.symbolic import ( @@ -331,6 +333,8 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + + assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) if data_flow is None: @@ -411,4 +415,8 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): return result + +def fuse_programs(programs, suffixes=None, data_flow=None): + 1/0 + # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 6d69a8a1d..67a44e89f 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -886,7 +886,8 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) -def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, +def duplicate_inames_for_single_kernel(knl, inames, within, new_inames=None, + suffix=None, tags={}): """ :arg within: a stack match as understood by @@ -965,12 +966,36 @@ def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, for old_iname, new_iname in zip(inames, new_inames): new_tag = tags.get(old_iname) if new_tag is not None: - knl = tag_inames(knl, {new_iname: new_tag}) + knl = tag_inames_for_single_kernel(knl, {new_iname: new_tag}) # }}} return knl + +def duplicate_inames(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = duplicate_inames_for_single_kernel( 
+ in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/transform/save.py b/loopy/transform/save.py index cca62bc52..4b957b033 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -64,7 +64,7 @@ class LivenessAnalysis(object): def __init__(self, kernel): self.kernel = kernel - self.schedule = self.kernel.schedule + self.schedule = kernel.schedule @memoize_method def get_successor_relation(self): @@ -235,8 +235,9 @@ class TemporarySaver(object): def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel): + def __init__(self, kernel, program_callables_info): self.kernel = kernel + self.program_callables_info = program_callables_info self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -439,7 +440,8 @@ class TemporarySaver(object): return (), () group_sizes, local_sizes = ( - self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, + self.program_callables_info)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. @@ -628,7 +630,7 @@ class TemporarySaver(object): kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel) + return assign_automatic_axes(kernel, self.program_callables_info) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -722,7 +724,7 @@ class TemporarySaver(object): # {{{ auto save and reload across kernel calls -def save_and_reload_temporaries(knl): +def save_and_reload_temporaries(program): """ Add instructions to save and reload temporary variables that are live across kernel calls. 
@@ -745,8 +747,19 @@ def save_and_reload_temporaries(knl): :returns: The resulting kernel """ + + knl = program.root_kernel + + if not knl.schedule: + program = lp.preprocess_program(program) + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(program.root_kernel, + program.program_callables_info) + + assert knl.schedule is not None + liveness = LivenessAnalysis(knl) - saver = TemporarySaver(knl) + saver = TemporarySaver(knl, program.program_callables_info) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) @@ -784,7 +797,7 @@ def save_and_reload_temporaries(knl): .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) - return saver.finish() + return program.with_root_kernel(saver.finish()) # }}} diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index a681afe06..f7b5081ce 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -33,6 +33,9 @@ from pymbolic.mapper.substitutor import make_subst_func from pytools import ImmutableRecord from pymbolic import var +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -468,7 +471,8 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst -def expand_subst(kernel, within=None): +def expand_subst_for_single_kernel(kernel, within=None): + assert isinstance(kernel, LoopKernel) if not kernel.substitutions: return kernel @@ -485,6 +489,30 @@ def expand_subst(kernel, within=None): return rule_mapping_context.finish_kernel(submap.map_kernel(kernel)) + +def expand_subst(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = expand_subst_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 5a92e7dbe..d69119f91 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -71,15 +71,12 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory): 'cnst', shape=('n'), initializer=cnst, scope=lp.AddressSpace.GLOBAL, read_only=True), '...']) + knl = lp.fix_parameters(knl, n=16) + knl = lp.add_barrier(knl, "id:first", "id:second") - prog = lp.make_program_from_kernel(knl) - - prog = lp.fix_parameters(prog, n=16) - prog = lp.add_barrier(prog, "id:first", "id:second") - - prog = lp.split_iname(prog, "i", 2, outer_tag="g.0", inner_tag="l.0") - prog = lp.split_iname(prog, "ii", 2, outer_tag="g.0", inner_tag="l.0") - evt, (out,) = prog(queue, a=a) + knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "ii", 2, outer_tag="g.0", inner_tag="l.0") + evt, (out,) = knl(queue, a=a) assert np.linalg.norm(out-((2*(a+cnst)+cnst))) <= 1e-15 @@ -100,7 +97,7 @@ def test_complicated_subst(ctx_factory): print(knl) - sr_keys = list(knl.substitutions.keys()) + sr_keys = list(knl.root_kernel.substitutions.keys()) for letter, how_many in [ ("f", 1), ("g", 1), @@ -113,7 +110,7 @@ def test_complicated_subst(ctx_factory): def test_type_inference_no_artificial_doubles(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i bb = a[i] - b[i] @@ -125,15 +122,15 @@ def test_type_inference_no_artificial_doubles(ctx_factory): lp.GlobalArg("c", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - code = lp.generate_code(knl) + code = lp.generate_code_v2(prog).device_code() assert "double" not in code def test_type_inference_with_type_dependencies(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: i=0}", """ <>a = 99 @@ -145,15 +142,17 @@ def test_type_inference_with_type_dependencies(): <>d = b + 2 + 1j """, "...") - prog = lp.make_program_from_kernel(knl) prog = lp.infer_unknown_types(prog) - knl = prog.root_kernel from loopy.types import to_loopy_type - assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32) - assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128) + assert prog.root_kernel.temporary_variables["a"].dtype == to_loopy_type( + np.int32) + assert prog.root_kernel.temporary_variables["b"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["c"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["d"].dtype == to_loopy_type( + np.complex128) def test_sized_and_complex_literals(ctx_factory): @@ -180,17 +179,19 @@ def test_sized_and_complex_literals(ctx_factory): def test_simple_side_effect(ctx_factory): + ctx = ctx_factory() knl = lp.make_kernel( "{[i,j]: 0<=i,j<100}", """ a[i] = a[i] + 1 """, - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) - prog = lp.make_program_from_kernel(knl) - 
print(lp.generate_code_v2(prog)) + print(knl) + print(lp.generate_code_v2(knl)) def test_owed_barriers(ctx_factory): @@ -205,32 +206,33 @@ def test_owed_barriers(ctx_factory): target=lp.PyOpenCLTarget(ctx.devices[0]) ) - prog = lp.make_program_from_kernel(knl) - - prog = lp.tag_inames(prog, dict(i="l.0")) + knl = lp.tag_inames(knl, dict(i="l.0")) - print(lp.generate_code_v2(prog).device_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_wg_too_small(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel( "{[i]: 0<=i<100}", [ " z[i] = a[i] {id=copy}" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) - prog = lp.make_program_from_kernel(knl) - - prog = lp.tag_inames(prog, dict(i="l.0")) + knl = lp.tag_inames(knl, dict(i="l.0")) - import pytest + print(knl) with pytest.raises(RuntimeError): - lp.generate_code_v2(prog) + print(lp.generate_code_v2(knl)) def test_multi_cse(ctx_factory): + ctx = ctx_factory() knl = lp.make_kernel( "{[i]: 0<=i<100}", @@ -238,14 +240,14 @@ def test_multi_cse(ctx_factory): " z[i] = a[i] + a[i]**2" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) - prog = lp.make_program_from_kernel(knl) + knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") + knl = lp.add_prefetch(knl, "a", []) - prog = lp.split_iname(prog, "i", 16, inner_tag="l.0") - prog = lp.add_prefetch(prog, "a", []) - - lp.generate_code_v2(prog) + print(knl) + print(lp.generate_code_v2(knl)) # {{{ code generator fuzzing @@ -339,7 +341,8 @@ def test_fuzz_code_generator(ctx_factory): lp.ValueArg(name, get_dtype(val)) for name, val in six.iteritems(var_values) ]) - evt, (lp_value,) = knl(queue, out_host=True, **var_values) + ck = lp.CompiledKernel(ctx, knl) + evt, (lp_value,) = ck(queue, out_host=True, **var_values) err = abs(true_value-lp_value)/abs(true_value) if abs(err) > 1e-10: print(80*"-") @@ -347,8 +350,7 @@ def test_fuzz_code_generator(ctx_factory): print("true=%r" % true_value) print("loopy=%r" % lp_value) print(80*"-") - print(lp.generate_code_v2(lp.make_program_from_kernel( - knl).device_code())) + print(ck.get_code()) print(80*"-") print(var_values) print(80*"-") @@ -379,8 +381,9 @@ def test_bare_data_dependency(ctx_factory): lp.ValueArg("n", np.int32), ]) + cknl = lp.CompiledKernel(ctx, knl) n = 20000 - evt, (a,) = knl(queue, n=n, out_host=True) + evt, (a,) = cknl(queue, n=n, out_host=True) assert a.shape == (n,) assert (a == 1).all() @@ -388,8 +391,10 @@ def test_bare_data_dependency(ctx_factory): # {{{ test race detection -@pytest.mark.skipif("sys.version_info < (2,6)") +# FIXME: not intended just for local testing purposes. 
~KK +@pytest.mark.skip def test_ilp_write_race_detection_global(ctx_factory): + ctx = ctx_factory() knl = lp.make_kernel( "[n] -> {[i,j]: 0<=i,j a[i] = 5+i+j", ], + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) - []) - - prog = lp.make_program_from_kernel(knl) - prog = lp.tag_inames(prog, dict(i="l.0", j="ilp")) + knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) - prog = lp.preprocess_program(prog, ctx.devices[0]) - assert prog.root_kernel.temporary_variables['a'].shape == (16, 17) + knl = lp.preprocess_program(knl, ctx.devices[0]) + assert knl.root_kernel.temporary_variables["a"].shape == (16, 17) def test_ilp_write_race_avoidance_private(ctx_factory): @@ -442,19 +445,20 @@ def test_ilp_write_race_avoidance_private(ctx_factory): [ "<> a = 5+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) - prog = lp.make_program_from_kernel(knl) - prog = lp.tag_inames(prog, dict(j="ilp")) + knl = lp.tag_inames(knl, dict(j="ilp")) - prog = lp.preprocess_program(prog, ctx.devices[0]) - assert prog.root_kernel.temporary_variables['a'].shape == (16,) + knl = lp.preprocess_program(knl) + assert knl.root_kernel.temporary_variables["a"].shape == (16,) # }}} def test_write_parameter(ctx_factory): dtype = np.float32 + ctx = ctx_factory() knl = lp.make_kernel( "{[i,j]: 0<=i,j src_ibox = source_boxes[i] @@ -710,8 +721,8 @@ def test_inames_deps_from_write_subscript(ctx_factory): None, shape=None), "..."]) - print(knl) - assert "i" in knl.insn_inames("myred") + print(prog) + assert "i" in prog.root_kernel.insn_inames("myred") def test_modulo_indexing(ctx_factory): @@ -728,12 +739,9 @@ def test_modulo_indexing(ctx_factory): ], target=lp.PyOpenCLTarget(ctx.devices[0]) ) - prog = lp.make_program_from_kernel(knl) - print(prog) - prog = lp.add_dtypes(prog, dict( - a=np.float32, - )) - print(lp.generate_code_v2(prog).device_code()) + print(knl) + knl = lp.add_dtypes(knl, {"a": np.float32}) + print(lp.generate_code_v2(knl).device_code()) @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) @@ -748,18 +756,17 @@ def test_vector_types(ctx_factory, vec_len): lp.GlobalArg("out", np.float32, shape=lp.auto), "..." 
]) - prog = lp.make_program_from_kernel(knl) - prog = lp.fix_parameters(prog, vec_len=vec_len) + knl = lp.fix_parameters(knl, vec_len=vec_len) - ref_prog = prog + ref_knl = knl - prog = lp.tag_array_axes(prog, "out", "c,vec") - prog = lp.tag_inames(prog, dict(j="unr")) + knl = lp.tag_array_axes(knl, "out", "c,vec") + knl = lp.tag_inames(knl, dict(j="unr")) - prog = lp.split_iname(prog, "i", 128, outer_tag="g.0", inner_tag="l.0") + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - lp.auto_test_vs_ref(ref_prog, ctx, prog, + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict( n=20000 )) @@ -812,11 +819,10 @@ def test_ilp_loop_bound(ctx_factory): ref_knl = knl - prog = lp.make_program_from_kernel(knl) - prog = lp.prioritize_loops(prog, "j,i,k") - prog = lp.split_iname(prog, "k", 4, inner_tag="ilp") + knl = lp.prioritize_loops(knl, "j,i,k") + knl = lp.split_iname(knl, "k", 4, inner_tag="ilp") - lp.auto_test_vs_ref(ref_knl, ctx, prog, + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict( n=200 )) @@ -844,15 +850,13 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory): "a[i] = 2*a[i]", assumptions="n>=1") - prog = lp.make_program_from_kernel(knl) - - ref_prog = prog + ref_knl = knl for outer_tag in ["for", "g.0"]: - prog = ref_prog - prog = lp.split_iname(prog, "i", 4, slabs=(0, 1), inner_tag="unr", + knl = ref_knl + knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr", outer_tag=outer_tag) - prog = lp.prioritize_loops(prog, "i_outer") + knl = lp.prioritize_loops(knl, "i_outer") a = cl.array.empty(queue, 20, np.float32) a.fill(17) @@ -861,10 +865,10 @@ def test_slab_decomposition_does_not_double_execute(ctx_factory): knl = lp.set_options(knl, write_cl=True) print("TEST-----------------------------------------") - prog(queue, a=a_knl) + knl(queue, a=a_knl) print("REF-----------------------------------------") - ref_prog(queue, a=a_ref) - print("DONE---------------------------l--------------") + ref_knl(queue, a=a_ref) + print("DONE-----------------------------------------") print("REF", a_ref) print("KNL", a_knl) @@ -884,11 +888,8 @@ def test_multiple_writes_to_local_temporary(): <> temp[i, 0] = 17 temp[i, 1] = 15 """) - - prog = lp.make_program_from_kernel(knl) - prog = lp.tag_inames(prog, dict(i="l.0")) - - print(lp.generate_code_v2(prog).device_code()) + knl = lp.tag_inames(knl, dict(i="l.0")) + print(lp.generate_code_v2(knl).device_code()) def test_make_copy_kernel(ctx_factory): @@ -923,23 +924,19 @@ def test_auto_test_can_detect_problems(ctx_factory): a[i,j] = 25 """) - ref_prog = lp.make_program_from_kernel(ref_knl) - knl = lp.make_kernel( "{[i]: 0<=i Date: Wed, 1 Aug 2018 13:09:17 -0500 Subject: [PATCH 281/916] changes to incorporate function with no return value. 
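Besides threading program_callables_info through the pre-schedule checks and code generation, this commit turns register_preamble_generators and register_function_manglers into Program-level functions that apply the registration to every callable kernel the program contains. A hypothetical usage sketch, only meant to show the new call shape (the mangler and the kernel below are made up, not part of this patch):

    import loopy as lp
    from loopy.kernel.data import CallMangleInfo

    def my_mangler(kernel, name, arg_dtypes):
        # made-up mangler mapping calls to "f" onto a target-level "my_f"
        if name == "f" and len(arg_dtypes) == 1:
            return CallMangleInfo(
                    target_name="my_f",
                    result_dtypes=(arg_dtypes[0],),
                    arg_dtypes=arg_dtypes)
        return None

    prog = lp.make_kernel("{[i]: 0<=i<n}", "out[i] = f(a[i])")  # a Program now
    prog = lp.register_function_manglers(prog, [my_mangler])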
--- loopy/__init__.py | 52 ++++++++++++++++++++++++++++-- loopy/check.py | 8 ++--- loopy/kernel/function_interface.py | 11 ++++--- loopy/kernel/tools.py | 2 +- loopy/preprocess.py | 2 +- loopy/schedule/__init__.py | 2 +- loopy/target/c/__init__.py | 2 +- loopy/target/execution.py | 4 +-- loopy/type_inference.py | 2 +- 9 files changed, 67 insertions(+), 18 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a93ca0400..f3cd4f831 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -368,7 +368,7 @@ def set_options(program, *args, **kwargs): # {{{ library registration -def register_preamble_generators(kernel, preamble_generators): +def register_preamble_generators_for_single_kernel(kernel, preamble_generators): """ :arg manglers: list of functions of signature ``(preamble_info)`` generating tuples ``(sortable_str_identifier, code)``, @@ -392,6 +392,30 @@ def register_preamble_generators(kernel, preamble_generators): return kernel.copy(preamble_generators=new_pgens) +def register_preamble_generators(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = register_preamble_generators_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + + def register_symbol_manglers(kernel, manglers): from loopy.tools import unpickles_equally @@ -409,7 +433,7 @@ def register_symbol_manglers(kernel, manglers): return kernel.copy(symbol_manglers=new_manglers) -def register_function_manglers(kernel, manglers): +def register_function_manglers_for_single_kernel(kernel, manglers): """ :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)`` returning a :class:`loopy.CallMangleInfo`. @@ -430,6 +454,30 @@ def register_function_manglers(kernel, manglers): return kernel.copy(function_manglers=new_manglers) + +def register_function_manglers(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = register_function_manglers_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/check.py b/loopy/check.py index 8e41e6976..727b02a85 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -210,7 +210,7 @@ def check_multiple_tags_allowed(kernel): "tags: {1}".format(iname, tags)) -def check_for_double_use_of_hw_axes(kernel): +def check_for_double_use_of_hw_axes(kernel, program_callables_info): from loopy.kernel.data import UniqueTag from loopy.kernel.instruction import CallInstruction from loopy.kernel.function_interface import CallableKernel @@ -228,7 +228,7 @@ def check_for_double_use_of_hw_axes(kernel): # check usage of iname tags in the callee kernel if isinstance(insn, CallInstruction): - in_knl_callable = kernel.scoped_functions[ + in_knl_callable = program_callables_info[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): # check for collision in iname_tag keys in the instruction @@ -715,13 +715,13 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel): +def pre_schedule_checks(kernel, program_callables_info): try: logger.debug("%s: pre-schedule check: start" % kernel.name) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel) + check_for_double_use_of_hw_axes(kernel, program_callables_info) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 71324c85d..09362fb20 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -264,7 +264,7 @@ class InKernelCallable(ImmutableRecord): return None new_arg_id_to_dtype = None - if self.arg_id_to_dtype: + if self.arg_id_to_dtype is not None: new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, dtype in self.arg_id_to_dtype.items()) @@ -410,7 +410,6 @@ class ScalarCallable(InKernelCallable): # Currently this is formulated such that the first argument is returned # and rest all are passed by reference as arguments to the function. - assert self.is_ready_for_codegen() from loopy.kernel.instruction import CallInstruction @@ -709,7 +708,7 @@ class ManglerCallable(ScalarCallable): return (self.name, self.function_mangler, self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): if self.arg_id_to_dtype is not None: # specializing an already specialized function. for arg_id, dtype in arg_id_to_dtype.items(): @@ -730,8 +729,10 @@ class ManglerCallable(ScalarCallable): new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in enumerate(mangle_result.result_dtypes))) - return self.copy(name_in_target=mangle_result.target_name, - arg_id_to_dtype=new_arg_id_to_dtype) + return ( + self.copy(name_in_target=mangle_result.target_name, + arg_id_to_dtype=new_arg_id_to_dtype), + program_callables_info) else: # The function mangler does not agree with the arg id to dtypes # provided. Indicating that is illegal. 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index cd2604227..dcb0350ad 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1891,7 +1891,7 @@ def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) if isinstance(insn, CallInstruction): - if insn.expression.function.name in kernel.scoped_functions: + if insn.expression.function.name in program_callables_info: in_knl_callable = program_callables_info[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 6d01469af..82d96777d 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2160,7 +2160,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): if not isinstance(expr.function, ResolvedFunction): # ignore if the call is not to a ResolvedFunction - return super(ArgDescrInferenceMapper, self).rec(expr) + return super(ArgDescrInferenceMapper, self).map_call(expr, expn_state) if isinstance(expr, Call): kw_parameters = {} diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index eb631c130..201bcc256 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1857,7 +1857,7 @@ def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}) "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel) + pre_schedule_checks(kernel, program_callables_info) schedule_count = 0 diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index db2780ba5..1db14c84a 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -893,7 +893,7 @@ class CASTBuilder(ASTBuilderBase): ecm = codegen_state.expression_to_code_mapper func_id = insn.expression.function.name - in_knl_callable = codegen_state.kernel.scoped_functions[func_id] + in_knl_callable = codegen_state.program_callables_info[func_id] if in_knl_callable.name_in_target == 'loopy_make_tuple': return self.emit_tuple_assignment(codegen_state, insn) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 7eda33fa5..43963ddb2 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -753,8 +753,8 @@ class KernelExecutorBase(object): program = add_dtypes(program, var_to_dtype) - from loopy.type_inference import infer_unknown_types - program = infer_unknown_types(program, expect_completion=True) + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) if program.root_kernel.schedule is None: from loopy.preprocess import preprocess_program diff --git a/loopy/type_inference.py b/loopy/type_inference.py index c899f9f6c..50fef41f0 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -403,7 +403,7 @@ class TypeInferenceMapper(CombineMapper): arg_id_to_descr, mangle_result.target_name) self.program_callables_info, new_function_id = ( self.program_callables_info.with_callable( - expr.function, in_knl_callable)) + expr.function, in_knl_callable, True)) if isinstance(expr, Call): self.old_calls_to_new_calls[expr] = new_function_id -- GitLab From 81f7c8dd5d32a4282eb4b5630c8f13c48218c269 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 1 Aug 2018 16:24:07 -0500 Subject: [PATCH 282/916] Program now supports persistent_hashing --- loopy/kernel/function_interface.py | 6 +++++ loopy/preprocess.py | 5 +--- loopy/program.py | 43 
+++++++++++++++++++----------- loopy/type_inference.py | 4 +-- test/test_loopy.py | 19 +++++++------ 5 files changed, 48 insertions(+), 29 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 09362fb20..99d952fd5 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -200,6 +200,8 @@ class InKernelCallable(ImmutableRecord): def __getinitargs__(self): return (self.arg_id_to_dtype, self.arg_id_to_descr) + update_persistent_hash = LoopKernel.update_persistent_hash + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): """ :arg arg_id_to_type: a mapping from argument identifiers @@ -334,6 +336,7 @@ class ScalarCallable(InKernelCallable): fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") + hash_fields = fields def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): @@ -490,6 +493,7 @@ class CallableKernel(InKernelCallable): "name_in_target"]) init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") + hash_fields = fields def __init__(self, subkernel, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): @@ -692,6 +696,8 @@ class ManglerCallable(ScalarCallable): "name_in_target"]) init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") + hash_fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) def __init__(self, name, function_mangler, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 82d96777d..8b6a1c4b3 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2391,10 +2391,7 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): def preprocess_kernel(kernel, device=None): # FIXME: error message? - # FIXME: do we assume that we should give out a program or a kernel - from loopy.program import make_program_from_kernel - program = make_program_from_kernel(kernel) - return preprocess_program(program, device) + return preprocess_program(kernel, device) def preprocess_program(program, device=None): diff --git a/loopy/program.py b/loopy/program.py index 23697e365..716145251 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -34,6 +34,8 @@ from loopy.kernel.function_interface import ( from loopy.diagnostic import LoopyError from pymbolic import var +from loopy.kernel import LoopKernel + class FunctionResolver(RuleAwareIdentityMapper): """ @@ -156,7 +158,7 @@ def resolve_callables(name, program_callables_info, function_resolvers): class Program(ImmutableRecord): def __init__(self, - root_kernel_name, + name, program_callables_info, target=None, function_resolvers=None): @@ -164,10 +166,10 @@ class Program(ImmutableRecord): # FIXME: check if all sanity checks have been covered? # FIXME: The comments over here may need some attention. 
- assert root_kernel_name in program_callables_info + assert name in program_callables_info if target is None: - target = program_callables_info[root_kernel_name].subkernel.target + target = program_callables_info[name].subkernel.target if function_resolvers is None: # populate the function scopers from the target and the loopy @@ -202,13 +204,20 @@ class Program(ImmutableRecord): program_callables_info.with_exit_edit_callables_mode()) super(Program, self).__init__( - root_kernel_name=root_kernel_name, + name=name, program_callables_info=program_callables_info, target=target, function_resolvers=function_resolvers) self._program_executor_cache = {} + hash_fields = ( + "name", + "program_callables_info", + "target",) + + update_persistent_hash = LoopKernel.update_persistent_hash + def get_grid_size_upper_bounds(self, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -261,13 +270,7 @@ class Program(ImmutableRecord): @property def root_kernel(self): - return self.program_callables_info[self.root_kernel_name].subkernel - - @property - def name(self): - #FIXME: discuss with @inducer if we use "name" instead of - # "root_kernel_name" - return self.root_kernel_name + return self.program_callables_info[self.name].subkernel @property def arg_dict(self): @@ -275,10 +278,10 @@ class Program(ImmutableRecord): def with_root_kernel(self, root_kernel): new_in_knl_callable = self.program_callables_info[ - self.root_kernel_name].copy(subkernel=root_kernel) + self.name].copy(subkernel=root_kernel) new_resolved_functions = ( self.program_callables_info.resolved_functions.copy()) - new_resolved_functions[self.root_kernel_name] = new_in_knl_callable + new_resolved_functions[self.name] = new_in_knl_callable return self.copy( program_callables_info=self.program_callables_info.copy( @@ -303,7 +306,7 @@ class Program(ImmutableRecord): print(self.program_callables_info.num_times_callables_called) return ( (self.program_callables_info[ - self.root_kernel_name].subkernel).__str__() + + self.name].subkernel).__str__() + '\nResolved Functions: ' + (self.program_callables_info.resolved_functions.keys()).__str__() + '\n' + 75*'-' + '\n') @@ -393,6 +396,16 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_hit_during_editing=num_times_hit_during_editing, renames_needed_after_editing=renames_needed_after_editing) + hash_fields = ( + "resolved_functions", + "num_times_callables_called", + "is_being_edited", + "num_times_hit_during_editing", + "old_resolved_functions", + "renames_needed_after_editing",) + + update_persistent_hash = LoopKernel.update_persistent_hash + def with_edit_callables_mode(self): return self.copy(is_being_edited=True, old_resolved_functions=self.resolved_functions.copy(), @@ -618,7 +631,7 @@ def make_program_from_kernel(kernel): program_callables_info = ProgramCallablesInfo(resolved_functions) program = Program( - root_kernel_name=kernel.name, + name=kernel.name, program_callables_info=program_callables_info) return program diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 50fef41f0..98c8b7d18 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -850,7 +850,7 @@ def infer_unknown_types(program, expect_completion=False): program_callables_info = program.program_callables_info type_uninferred_knl_callable = ( - program_callables_info[program.root_kernel_name]) + program_callables_info[program.name]) type_uninferred_root_kernel = 
type_uninferred_knl_callable.subkernel program_callables_info = ( @@ -865,7 +865,7 @@ def infer_unknown_types(program, expect_completion=False): program_callables_info, _ = ( program_callables_info.with_callable( - program.root_kernel_name, + program.name, type_inferred_knl_callable)) program_callables_info = ( diff --git a/test/test_loopy.py b/test/test_loopy.py index d69119f91..f306ad21f 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -433,7 +433,7 @@ def test_ilp_write_race_avoidance_local(ctx_factory): knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) - knl = lp.preprocess_program(knl, ctx.devices[0]) + knl = lp.preprocess_kernel(knl, ctx.devices[0]) assert knl.root_kernel.temporary_variables["a"].shape == (16, 17) @@ -450,7 +450,7 @@ def test_ilp_write_race_avoidance_private(ctx_factory): knl = lp.tag_inames(knl, dict(j="ilp")) - knl = lp.preprocess_program(knl) + knl = lp.preprocess_kernel(knl) assert knl.root_kernel.temporary_variables["a"].shape == (16,) # }}} @@ -1151,7 +1151,7 @@ def test_within_inames_and_reduction(): target=lp.CTarget(), ) - prog = lp.preprocess_program(prog) + prog = lp.preprocess_kernel(prog) assert 'i' not in prog.root_kernel.insn_inames("insn_0_j_update") print(prog.root_kernel.stringify(with_dependencies=True)) @@ -1736,6 +1736,8 @@ def test_call_with_options(): def test_unschedulable_kernel_detection(): + # FIXME: does not work + # Reason for multiple calllable kernels, not sure how this will go. knl = lp.make_kernel(["{[i,j]:0<=i,j Date: Wed, 1 Aug 2018 18:09:16 -0500 Subject: [PATCH 283/916] =?UTF-8?q?successful=5Ftests+=3D=3F?= --- loopy/kernel/data.py | 3 +++ loopy/preprocess.py | 4 +++- loopy/transform/instruction.py | 22 ++++++++++++++++++- loopy/type_inference.py | 4 +++- test/test_loopy.py | 40 ++++++++++++++++++++-------------- 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 165e59ba9..417212b33 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -403,6 +403,9 @@ class ConstantArg(ArrayBase, KernelArgument): min_target_axes = 0 max_target_axes = 1 + # Constant Arg cannot be an output + is_output_only = False + def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape, dtype, is_written) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 8b6a1c4b3..74fb28cca 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -80,8 +80,10 @@ def prepare_for_caching(program): new_resolved_functions = {} for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): + # FIXME: this is an easy fix. remove the target attribute from + # kernel new_subkernel = prepare_single_kernel_for_caching( - in_knl_callable.subkernel) + in_knl_callable.subkernel.copy(target=program.target)) new_resolved_functions[func_id] = ( in_knl_callable.copy(subkernel=new_subkernel)) elif isinstance(in_knl_callable, ScalarCallable): diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index e6ecb4093..982f84ab4 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -25,15 +25,35 @@ THE SOFTWARE. 
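The test updates above capture the calling convention this series is converging on: make_kernel and preprocess_kernel now traffic in a Program, and kernel-level state such as temporary_variables is reached through root_kernel. A small usage sketch on this branch; the kernel itself is a throwaway example:

    import numpy as np
    import loopy as lp

    prog = lp.make_kernel(
            "{[i]: 0<=i<16}",
            "out[i] = 2*a[i]",
            [lp.GlobalArg("a", np.float64, shape=(16,)), "..."])

    prog = lp.preprocess_kernel(prog)   # takes and returns a Program here
    knl = prog.root_kernel              # the underlying LoopKernel
    print(knl.temporary_variables)      # per-kernel state lives on root_kernel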
import six # noqa from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) +from loopy.program import Program # {{{ find_instructions -def find_instructions(kernel, insn_match): +def find_instructions_in_single_kernel(kernel, insn_match): + assert isinstance(kernel, LoopKernel) from loopy.match import parse_match match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] + +def find_instructions(program, insn_match): + assert isinstance(program, Program) + insns = [] + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + insns += (find_instructions_in_single_kernel( + in_knl_callable.subkernel, insn_match)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." % ( + type(in_knl_callable))) + + return insns + # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 98c8b7d18..fcb2c7d22 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -599,7 +599,9 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types, None + return ( + None, type_inf_mapper.symbols_with_unknown_types, None, + type_inf_mapper.program_callables_info) result = type_inf_mapper.combine(dtype_sets) diff --git a/test/test_loopy.py b/test/test_loopy.py index f306ad21f..538217094 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1795,7 +1795,7 @@ def test_regression_persistent_hash(): def test_sequential_dependencies(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i Date: Wed, 1 Aug 2018 22:20:11 -0500 Subject: [PATCH 284/916] support for reduction op function. --- loopy/kernel/function_interface.py | 2 - loopy/library/reduction.py | 36 +++++++---------- loopy/program.py | 65 +++++++++++++++++++++--------- loopy/symbolic.py | 2 +- 4 files changed, 60 insertions(+), 45 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 99d952fd5..4f295e115 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -623,8 +623,6 @@ class CallableKernel(InKernelCallable): # FIXME TODO: This is not correct, as the code code preamble generated # during the code generationg of the child kernel, does not guarantee # that this thing would be updated. 
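The find_instructions split earlier in this patch is the recurring dispatch used throughout the series: the *_in_single_kernel variant keeps the old LoopKernel logic, while the public entry point takes a Program and searches every CallableKernel it contains. A small usage sketch; the kernel and instruction id are invented for illustration:

    import loopy as lp

    prog = lp.make_kernel(
            "{[i]: 0<=i<4}",
            "out[i] = i {id=write_out}")

    # same match syntax as before; the search now spans every CallableKernel
    # in the program instead of a single LoopKernel
    insns = lp.find_instructions(prog, "id:write_out")
    assert [insn.id for insn in insns] == ["write_out"]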
- for preamble in self.subkernel.preambles: - yield preamble return diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 538125af1..df98d4549 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -83,8 +83,8 @@ class ReductionOperation(object): raise LoopyError("unable to parse reduction type: '%s'" % op_type) - def get_scalar_callables(self, kernel): - return {} + def get_scalar_callables(self): + return frozenset() class ScalarReductionOperation(ReductionOperation): @@ -187,9 +187,8 @@ class MaxReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, operand2): return ResolvedFunction("max")(operand1, operand2) - def get_scalar_callables(self, kernel): - return { - var("max"): kernel.find_scoped_function_identifier("max")} + def get_scalar_callables(self): + return frozenset(["max"]) class MinReductionOperation(ScalarReductionOperation): @@ -199,10 +198,8 @@ class MinReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, operand2): return ResolvedFunction("min")(operand1, operand2) - def get_scalar_callables(self, kernel): - return { - var("min"): kernel.find_scoped_function_identifier("min")} - + def get_scalar_callables(self): + return frozenset(["min"]) # {{{ base class for symbolic reduction ops @@ -269,10 +266,8 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def __call__(self, dtypes, operand1, operand2): return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2)) - def get_scalar_callables(self, kernel): - return { - "make_tuple": kernel.find_scoped_function_identifier("make_tuple"), - SegmentedOp(self): kernel.find_scoped_function_identifier(self)} + def get_scalar_callables(self): + return frozenset(["make_tuple", SegmentedOp(self)]) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -327,11 +322,8 @@ class _ArgExtremumReductionOperation(ReductionOperation): def __call__(self, dtypes, operand1, operand2): return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2)) - def get_scalar_callables(self, kernel): - return { - self.which: kernel.find_scoped_function_identifier(self.which), - "make_tuple": kernel.find_scoped_function_identifier("make_tuple"), - ArgExtOp(self): kernel.find_scoped_function_identifier(self)} + def get_scalar_callables(self): + return frozenset([self.which, "make_tuple", ArgExtOp(self)]) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -404,12 +396,13 @@ class ReductionCallable(ScalarCallable): def with_types(self, arg_id_to_dtype, kernel, program_callables_info): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] - result_dtypes = self.name.result_dtypes(kernel, scalar_dtype, + result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype, index_dtype) new_arg_id_to_dtype = arg_id_to_dtype.copy() new_arg_id_to_dtype[-1] = result_dtypes[0] new_arg_id_to_dtype[-2] = result_dtypes[1] - name_in_target = self.name.prefix(scalar_dtype, index_dtype) + "_op" + name_in_target = self.name.reduction_op.prefix(scalar_dtype, + index_dtype) + "_op" return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=name_in_target), program_callables_info @@ -477,8 +470,7 @@ class ReductionCallable(ScalarCallable): def reduction_scoper(target, identifier): - if isinstance(identifier, (_ArgExtremumReductionOperation, - _SegmentedScalarReductionOperation)): + if isinstance(identifier, (ArgExtOp, SegmentedOp)): return ReductionCallable(name=identifier) return None diff --git 
a/loopy/program.py b/loopy/program.py index 716145251..d60725e44 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -122,11 +122,13 @@ class FunctionResolver(RuleAwareIdentityMapper): expn_state) def map_reduction(self, expr, expn_state): - for func_id, in_knl_callable in ( - expr.operation.get_scalar_callables(self.kernel)).items(): + for func_id in ( + expr.operation.get_scalar_callables()): + in_knl_callable = self.find_resolved_function_from_identifier(func_id) + assert in_knl_callable is not None self.program_callables_info, _ = ( self.program_callables_info.with_callable(func_id, - in_knl_callable)) + in_knl_callable, True)) return super(FunctionResolver, self).map_reduction(expr, expn_state) @@ -452,9 +454,14 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_callables_called = self.num_times_callables_called.copy() if not resolved_for_the_first_time: - num_times_hit_during_editing[function.name] += 1 + if isinstance(function, (ArgExtOp, SegmentedOp)): + num_times_hit_during_editing[function] += 1 + else: + num_times_hit_during_editing[function.name] += 1 if in_kernel_callable in self.resolved_functions.values(): + # the callable already exists, implies return the function + # identifier corresposing to that callable. for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: num_times_callables_called[func_id] += 1 @@ -473,22 +480,40 @@ class ProgramCallablesInfo(ImmutableRecord): renames_needed_after_editing)), func_id) else: - - # FIXME: maybe deal with the history over here? - # FIXME: once the code logic is running beautify this part. - # many "ifs" can be avoided - unique_function_identifier = function.name - if (resolved_for_the_first_time or - self.num_times_callables_called[function.name] > 1): - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) - - if not resolved_for_the_first_time: - num_times_callables_called[function.name] -= 1 - - num_times_callables_called[unique_function_identifier] = 1 + if isinstance(function, (ArgExtOp, SegmentedOp)): + unique_function_identifier = function.copy() + if not resolved_for_the_first_time: + num_times_callables_called[function] -= 1 + + num_times_callables_called[unique_function_identifier] = 1 + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + resolved_functions=updated_resolved_functions, + num_times_callables_called=num_times_callables_called, + num_times_hit_during_editing=num_times_hit_during_editing, + renames_needed_after_editing=renames_needed_after_editing), + unique_function_identifier) + else: + # FIXME: maybe deal with the history over here? + # FIXME: once the code logic is running beautify this part. 
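The reduction hunks above change what get_scalar_callables returns: no kernel argument and no ready-made callables, just a frozenset of identifiers that FunctionResolver.map_reduction then resolves and registers one by one. A short check of that contract on this branch, assuming the class names in loopy.library.reduction:

    from loopy.library.reduction import (
            MaxReductionOperation, ArgMaxReductionOperation)

    # a plain scalar reduction simply names its helper function
    assert MaxReductionOperation().get_scalar_callables() == frozenset(["max"])

    # argmax-style reductions mix strings with ReductionOpFunction wrappers
    # (ArgExtOp), which is why with_callable above special-cases those types
    helpers = ArgMaxReductionOperation().get_scalar_callables()
    assert "make_tuple" in helpers and "max" in helpers  # plus an ArgExtOp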
+ # many "ifs" can be avoided + unique_function_identifier = function.name + if (resolved_for_the_first_time or + self.num_times_callables_called[function.name] > 1): + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + if not resolved_for_the_first_time: + num_times_callables_called[function.name] -= 1 + + num_times_callables_called[unique_function_identifier] = 1 updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( diff --git a/loopy/symbolic.py b/loopy/symbolic.py index e800599d1..7bc2c792a 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -113,7 +113,7 @@ class IdentityMapperMixin(object): self.rec(expr.subscript, *args, **kwargs)) def map_resolved_function(self, expr, *args, **kwargs): - return ResolvedFunction(self.rec(expr.function, *args, **kwargs)) + return ResolvedFunction(expr.function) map_type_cast = map_type_annotation -- GitLab From fea5660dd3a7ef2801507fb0b07c45093233d137 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 1 Aug 2018 23:30:10 -0500 Subject: [PATCH 285/916] New codegen pipeline, reduction works. --- loopy/codegen/__init__.py | 48 ++++++++++++++++++++++-------- loopy/kernel/function_interface.py | 1 + loopy/library/reduction.py | 9 +++--- loopy/target/opencl.py | 1 + 4 files changed, 42 insertions(+), 17 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 14211acb9..ed1e7a5bc 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -36,6 +36,9 @@ from loopy.symbolic import CombineMapper from functools import reduce +from loopy.kernel.function_interface import CallableKernel +from cgen import Collection + import logging logger = logging.getLogger(__name__) @@ -567,23 +570,42 @@ def generate_code_v2(program): from loopy.preprocess import preprocess_program program = preprocess_program(program) - # collect preambles - for callable_knl in program.program_callables_info.values(): - pass + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) - # collect func decls - for callable_knl in program.program_callables_info.values(): - pass + codegen_results = {} - # collect func defs - for callable_knl in program.program_callables_info.values(): - pass + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + codegen_results[func_id] = ( + generate_code_for_a_single_kernel(in_knl_callable.subkernel, + program.program_callables_info)) - from loopy.type_inference import infer_unknown_types - program = infer_unknown_types(program, expect_completion=True) + device_preambles = set() + for cgr in codegen_results.values(): + device_preambles.update(cgr.device_preambles) + + for in_knl_callable in program.program_callables_info.values(): + for preamble in in_knl_callable.generate_preambles(program.target): + device_preambles.update([preamble]) + + collective_device_program = codegen_results[program.name].device_programs[0] + for func_id, callee_cgr in codegen_results.items(): + if func_id != program.name: + assert len(callee_cgr.device_programs) == 1 + callee_prog_ast = callee_cgr.device_programs[0].ast + collective_device_program = collective_device_program.copy( + ast=Collection([callee_prog_ast, collective_device_program.ast])) + + device_preambles.update([('98_%s' % func_id, + 
str(callee_prog_ast.fdecl)), ]) + + collective_device_programs = [collective_device_program] + ( + codegen_results[program.name].device_programs[1:]) - return generate_code_for_a_single_kernel(program.root_kernel, - program.program_callables_info) + return codegen_results[program.name].copy( + device_programs=collective_device_programs, + device_preambles=device_preambles) def generate_code(kernel, device=None): diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 4f295e115..799be7763 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -625,6 +625,7 @@ class CallableKernel(InKernelCallable): # that this thing would be updated. return + yield def emit_call_insn(self, insn, target, expression_to_code_mapper): diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index df98d4549..ad72bc19d 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -201,6 +201,7 @@ class MinReductionOperation(ScalarReductionOperation): def get_scalar_callables(self): return frozenset(["min"]) + # {{{ base class for symbolic reduction ops class ReductionOpFunction(FunctionIdentifier): @@ -414,8 +415,8 @@ class ReductionCallable(ScalarCallable): return self.copy(arg_id_to_descr=arg_id_to_descr) def generate_preambles(self, target): - if isinstance(self.name, _ArgExtremumReductionOperation): - op = self.name + if isinstance(self.name, ArgExtOp): + op = self.name.reduction_op scalar_dtype = self.arg_id_to_dtype[-1] index_dtype = self.arg_id_to_dtype[-2] @@ -444,8 +445,8 @@ class ReductionCallable(ScalarCallable): index_t=target.dtype_to_typename(index_dtype), comp=op.update_comparison, )) - elif isinstance(self.name, _SegmentedScalarReductionOperation): - op = self.name + elif isinstance(self.name, SegmentedOp): + op = self.name.reduction_op scalar_dtype = self.arg_id_to_dtype[-1] segment_flag_dtype = self.arg_id_to_dtype[-2] prefix = op.prefix(scalar_dtype, segment_flag_dtype) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 2b501c872..44f782a72 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -356,6 +356,7 @@ def opencl_preamble_generator(preamble_info): from loopy.tools import remove_common_indentation kernel = preamble_info.kernel + yield ("00_declare_gid_lid", remove_common_indentation(""" #define lid(N) ((%(idx_ctype)s) get_local_id(N)) -- GitLab From fac6c73cd3db2e9e526d194e6781c2cab949b719 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 1 Aug 2018 23:40:27 -0500 Subject: [PATCH 286/916] forgot to commit changes in tests. 
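The generate_code_v2 rewrite in the previous commit is the user-visible payoff: code generation now preprocesses and type-infers the whole Program, emits one device program per CallableKernel, and splices callee ASTs plus "98_<name>" forward-declaration preambles ahead of the root kernel. A sketch of exercising it on this branch with a reduction, which also runs the ReductionCallable preamble path; the kernel is invented for illustration:

    import numpy as np
    import loopy as lp

    prog = lp.make_kernel(
            "{[i]: 0<=i<100}",
            "out = sum(i, a[i])",
            [lp.GlobalArg("a", np.float64, shape=(100,)), "..."])

    # preprocesses the Program, infers types, generates one device program
    # per CallableKernel, and concatenates callee ASTs into the root program
    print(lp.generate_code_v2(prog).device_code())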
--- loopy/kernel/creation.py | 4 ++-- test/test_loopy.py | 36 +++++++++++++++++++++--------------- test/testlib.py | 5 +++-- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 60473cf1b..d83dbd1c0 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1678,7 +1678,7 @@ def _is_wildcard(s): def _resolve_dependencies(what, knl, insn, deps): - from loopy import find_instructions + from loopy.transform.instruction import find_instructions_in_single_kernel from loopy.match import MatchExpressionBase new_deps = [] @@ -1687,7 +1687,7 @@ def _resolve_dependencies(what, knl, insn, deps): found_any = False if isinstance(dep, MatchExpressionBase): - for new_dep in find_instructions(knl, dep): + for new_dep in find_instructions_in_single_kernel(knl, dep): if new_dep.id != insn.id: new_deps.append(new_dep.id) found_any = True diff --git a/test/test_loopy.py b/test/test_loopy.py index 538217094..89b74482c 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2277,6 +2277,7 @@ def test_integer_reduction(ctx_factory): knl = lp.make_kernel('{[k]: 0<=k {[j]: 0 <= j < jmax}"], """ @@ -2417,10 +2419,11 @@ def test_barrier_insertion_near_bottom_of_loop(): end """, seq_dependencies=True) - knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.set_temporary_scope(knl, "a", "local") - knl = lp.set_temporary_scope(knl, "b", "local") - knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + prog = lp.tag_inames(prog, dict(i="l.0")) + prog = lp.set_temporary_scope(prog, "a", "local") + prog = lp.set_temporary_scope(prog, "b", "local") + prog = lp.preprocess_kernel(prog) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) print(knl) @@ -2430,7 +2433,7 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # make simple barrier'd kernel - knl = lp.make_kernel('{[i]: 0 <= i < 10}', + prog = lp.make_kernel('{[i]: 0 <= i < 10}', """ for i a[i] = i {id=a} @@ -2445,15 +2448,17 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # split into kernel w/ vesize larger than iname domain vecsize = 16 - knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0') + prog = lp.split_iname(prog, 'i', vecsize, inner_tag='l.0') from testlib import GridOverride # artifically expand via overridden_get_grid_sizes_for_insn_ids + knl = prog.root_kernel knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) + prog = prog.with_root_kernel(knl) # make sure we can generate the code - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) def test_multi_argument_reduction_type_inference(): @@ -2462,7 +2467,7 @@ def test_multi_argument_reduction_type_inference(): from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j Date: Thu, 2 Aug 2018 08:11:15 -0500 Subject: [PATCH 287/916] update the program_callables_info of the type inference mapper. 
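The fix below threads the ProgramCallablesInfo returned by _infer_var_type back into the TypeInferenceMapper; without the rebinding, later inferences would consult a stale table and miss callables registered while specializing earlier calls. A condensed sketch of the shape of the change, not the full mapper:

    from loopy.symbolic import CombineMapper

    class TypeInferenceMapper(CombineMapper):
        def copy(self, program_callables_info=None):
            # keep the current table unless an updated one is supplied
            if program_callables_info is None:
                program_callables_info = self.program_callables_info
            return type(self)(self.kernel, program_callables_info,
                    self.new_assignments)

    # caller side, once _infer_var_type hands back an updated table:
    type_inf_mapper = type_inf_mapper.copy(
            program_callables_info=program_callables_info)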
--- loopy/target/c/codegen/expression.py | 4 +++- loopy/type_inference.py | 9 ++++++-- test/test_loopy.py | 31 ++++++++++++++++------------ 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index defc643f6..2908c4efa 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -439,7 +439,9 @@ class ExpressionToCExpressionMapper(IdentityMapper): if isinstance(self.codegen_state.program_callables_info[expr.function.name], ManglerCallable): from loopy.codegen import SeenFunction - in_knl_callable = self.kernel.scoped_functions[expr.function.name] + in_knl_callable = ( + self.codegen_state.program_callables_info[ + expr.function.name]) mangle_result = in_knl_callable.mangle_result(self.kernel) self.codegen_state.seen_functions.add( SeenFunction(identifier_name, diff --git a/loopy/type_inference.py b/loopy/type_inference.py index fcb2c7d22..01ffd5e33 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -111,8 +111,10 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). - def copy(self): - return type(self)(self.kernel, self.program_callables_info, + def copy(self, program_callables_info=None): + if program_callables_info is None: + program_callables_info = self.program_callables_info + return type(self)(self.kernel, program_callables_info, self.new_assignments) def with_assignments(self, names_to_vars): @@ -552,6 +554,7 @@ class TypeInferenceMapper(CombineMapper): # {{{ infer single variable def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + if var_name in kernel.all_params(): return [kernel.index_dtype], [], {}, ( type_inf_mapper.program_callables_info) @@ -736,6 +739,8 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, new_old_calls_to_new_calls, program_callables_info) = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) + type_inf_mapper = type_inf_mapper.copy( + program_callables_info=program_callables_info) failed = not result if not failed: diff --git a/test/test_loopy.py b/test/test_loopy.py index 89b74482c..8b4f10afa 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2498,7 +2498,7 @@ def test_multi_argument_reduction_parsing(): def test_global_barrier_order_finding(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i,itrip]: 0<=ia = 0 <>b_s0 = 0 """) - vng = knl.get_var_name_generator() + vng = prog.root_kernel.get_var_name_generator() assert vng("a_s0") != "a_s0" assert vng("b") != "b" @@ -2649,7 +2650,7 @@ def test_execution_backend_can_cache_dtypes(ctx_factory): def test_wildcard_dep_matching(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a = 0 {id=insn1} @@ -2662,11 +2663,15 @@ def test_wildcard_dep_matching(): all_insns = set("insn%d" % i for i in range(1, 6)) - assert knl.id_to_insn["insn1"].depends_on == set() - assert knl.id_to_insn["insn2"].depends_on == all_insns - set(["insn2"]) - assert knl.id_to_insn["insn3"].depends_on == all_insns - set(["insn3"]) - assert knl.id_to_insn["insn4"].depends_on == set(["insn1", "insn2"]) - assert knl.id_to_insn["insn5"].depends_on == all_insns - set(["insn1", "insn5"]) + assert prog.root_kernel.id_to_insn["insn1"].depends_on == set() + assert (prog.root_kernel.id_to_insn["insn2"].depends_on == all_insns - + set(["insn2"])) + assert 
(prog.root_kernel.id_to_insn["insn3"].depends_on == all_insns - + set(["insn3"])) + assert (prog.root_kernel.id_to_insn["insn4"].depends_on == set(["insn1", + "insn2"])) + assert (prog.root_kernel.id_to_insn["insn5"].depends_on == all_insns - + set(["insn1", "insn5"])) def test_preamble_with_separate_temporaries(ctx_factory): -- GitLab From bb3e8125c1b04d5931955088140e9e9bfb83ece1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 08:19:19 -0500 Subject: [PATCH 288/916] completed one traversal over test_loopy --- loopy/transform/padding.py | 32 +++++++++++++++++++++++++++++++- test/test_loopy.py | 25 +++++++++++-------------- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index d695e3595..6cdf8e4b5 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -28,6 +28,10 @@ THE SOFTWARE. from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + class ArrayAxisSplitHelper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, arg_names, handler): @@ -370,7 +374,8 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -def split_array_axis(kernel, array_names, axis_nr, count, order="C"): +def split_array_axis_for_single_kernel(kernel, array_names, axis_nr, count, + order="C"): """ :arg array: a list of names of temporary variables or arguments. May also be a comma-separated string of these. @@ -387,6 +392,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): There was a more complicated, dumber function called :func:`split_array_dim` that had the role of this function in versions prior to 2016.2. """ + assert isinstance(kernel, LoopKernel) if isinstance(array_names, str): array_names = [i.strip() for i in array_names.split(",") if i.strip()] @@ -396,6 +402,30 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): return kernel + +def split_array_axis(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = split_array_axis_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 8b4f10afa..10701cee5 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2765,7 +2765,7 @@ def test_relaxed_stride_checks(ctx_factory): def test_add_prefetch_works_in_lhs_index(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " "start<=n Date: Thu, 2 Aug 2018 09:19:24 -0500 Subject: [PATCH 289/916] Planning to move changes to a decorator! 
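The commit below repeats one wrapper, nearly verbatim, around each transform it touches. Its shared shape, with transform_for_single_kernel standing in for any of the *_for_single_kernel functions, is what the following commit folds into a decorator:

    from loopy.kernel.function_interface import CallableKernel, ScalarCallable

    def transform_wrapper(program, *args, **kwargs):
        new_resolved_functions = {}
        for func_id, clbl in program.program_callables_info.items():
            if isinstance(clbl, CallableKernel):
                # run the single-kernel transform on the subkernel, rewrap it
                clbl = clbl.copy(subkernel=transform_for_single_kernel(
                        clbl.subkernel, *args, **kwargs))
            elif not isinstance(clbl, ScalarCallable):
                raise NotImplementedError("unknown callable type %s"
                        % type(clbl).__name__)
            new_resolved_functions[func_id] = clbl

        return program.copy(
                program_callables_info=program.program_callables_info.copy(
                    resolved_functions=new_resolved_functions))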
--- loopy/transform/arithmetic.py | 32 +++++++++++++++++++- loopy/transform/batch.py | 33 ++++++++++++++++++-- loopy/transform/data.py | 55 ++++++++++++++++++++++++++++++++-- loopy/transform/iname.py | 26 +++++++++++++++- loopy/transform/instruction.py | 3 +- loopy/transform/padding.py | 34 ++++++++++++++++++--- loopy/transform/precompute.py | 32 ++++++++++++++++++-- loopy/transform/subst.py | 30 +++++++++++++++++++ loopy/type_inference.py | 4 +-- test/test_transform.py | 29 +++++++++--------- 10 files changed, 247 insertions(+), 31 deletions(-) diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index b7f47c38a..d26782778 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -27,6 +27,10 @@ import six from loopy.diagnostic import LoopyError +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + # {{{ fold constants @@ -53,7 +57,9 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented -def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): +def collect_common_factors_on_increment_in_single_kernel(kernel, var_name, + vary_by_axes=()): + assert isinstance(kernel, LoopKernel) # FIXME: Does not understand subst rules for now if kernel.substitutions: from loopy.transform.subst import expand_subst @@ -330,6 +336,30 @@ def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): return kernel.copy(instructions=new_insns) + +def collect_common_factors_on_increment(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = collect_common_factors_on_increment_in_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index f0b9814c4..52cae60a2 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -29,6 +29,10 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + + __doc__ = """ .. currentmodule:: loopy @@ -102,8 +106,8 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) -def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", - sequential=False): +def to_batched_for_single_kernel(knl, nbatches, batch_varying_args, + batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. .. 
note:: @@ -195,6 +199,31 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", return kernel + +def to_batched(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = to_batched_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + + # }}} # vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 95e2fec8e..e09e44d6e 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -477,7 +477,7 @@ def tag_array_axes(program, *args, **kwargs): # {{{ set_array_axis_names -def set_array_axis_names(kernel, ary_names, dim_names): +def set_array_axis_names_for_single_kernel(kernel, ary_names, dim_names): """ .. versionchanged:: 2016.2 @@ -501,7 +501,32 @@ def set_array_axis_names(kernel, ary_names, dim_names): return kernel -set_array_dim_names = MovedFunctionDeprecationWrapper(set_array_axis_names) +set_array_dim_names = (MovedFunctionDeprecationWrapper( + set_array_axis_names_for_single_kernel)) + + +def set_array_axis_names(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = set_array_axis_names_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) # }}} @@ -690,7 +715,7 @@ def set_argument_order(kernel, arg_names): # {{{ rename argument -def rename_argument(kernel, old_name, new_name, existing_ok=False): +def rename_argument_in_single_kernel(kernel, old_name, new_name, existing_ok=False): """ .. versionadded:: 2016.2 """ @@ -730,6 +755,30 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): return kernel.copy(args=new_args) + +def rename_argument(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = rename_argument_in_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 67a44e89f..a058862a5 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -404,7 +404,7 @@ def split_iname(program, *args, **kwargs): # {{{ chunk iname -def chunk_iname(kernel, split_iname, num_chunks, +def chunk_iname_for_single_kernel(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, slabs=(0, 0), do_tagged_check=True, @@ -494,6 +494,30 @@ def chunk_iname(kernel, split_iname, num_chunks, slabs=slabs, do_tagged_check=do_tagged_check, within=within) + +def chunk_iname(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = chunk_iname_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} # }}} diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 982f84ab4..72a3f118f 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -112,7 +112,8 @@ def add_dependency(kernel, insn_match, depends_on): added_deps = frozenset([depends_on]) else: added_deps = frozenset( - dep.id for dep in find_instructions(kernel, depends_on)) + dep.id for dep in find_instructions_in_single_kernel(kernel, + depends_on)) if not added_deps: raise LoopyError("no instructions found matching '%s' " diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 6cdf8e4b5..a745a3948 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -48,7 +48,8 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): # {{{ split_array_dim (deprecated since June 2016) -def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, +def split_array_dim_for_single_kernel(kernel, arrays_and_axes, count, + auto_split_inames=True, split_kwargs=None): """ :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating @@ -241,16 +242,41 @@ def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) if auto_split_inames: - from loopy import split_iname + from loopy.transform.iname import split_iname_for_single_kernel for iname, (outer_iname, inner_iname) in six.iteritems(split_vars): - kernel = split_iname(kernel, iname, count, + kernel = split_iname_for_single_kernel(kernel, iname, count, outer_iname=outer_iname, inner_iname=inner_iname, **split_kwargs) return kernel -split_arg_axis = MovedFunctionDeprecationWrapper(split_array_dim) +split_arg_axis = (MovedFunctionDeprecationWrapper( + split_array_dim_for_single_kernel)) + + +def split_array_dim(program, *args, 
**kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = split_array_dim_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) # }}} diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 2af3c04b7..fe61dfa23 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -38,6 +38,9 @@ from pymbolic import var from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + class RuleAccessDescriptor(AccessDescriptor): __slots__ = ["args", "expansion_stack"] @@ -258,8 +261,8 @@ class _not_provided(object): # noqa: N801 pass -def precompute(kernel, program_callables_info, subst_use, sweep_inames=[], - within=None, storage_axes=None, temporary_name=None, +def precompute_for_single_kernel(kernel, program_callables_info, subst_use, + sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, @@ -1048,4 +1051,29 @@ def precompute(kernel, program_callables_info, subst_use, sweep_inames=[], return kernel + +def precompute(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = precompute_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index f7b5081ce..aae25f580 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -31,6 +31,7 @@ from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func from pytools import ImmutableRecord +from functools import wraps from pymbolic import var from loopy.program import Program @@ -47,6 +48,34 @@ class ExprDescriptor(ImmutableRecord): # {{{ extract_subst +def iterate_over_kernel_if_given_program(transform_for_single_kernel): + def _collective_transform(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = transform_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + + return wraps(transform_for_single_kernel)(_collective_transform) + + +@iterate_over_kernel_if_given_program def extract_subst(kernel, subst_name, template, parameters=()): """ :arg subst_name: The name of the substitution rule to be created. @@ -201,6 +230,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): instructions=new_insns, substitutions=new_substs) + # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 01ffd5e33..faebe94de 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -651,8 +651,8 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, unexpanded_kernel = kernel if kernel.substitutions: - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) + from loopy.transform.subst import expand_subst_for_single_kernel + kernel = expand_subst_for_single_kernel(kernel) new_temp_vars = kernel.temporary_variables.copy() new_arg_dict = kernel.arg_dict.copy() diff --git a/test/test_transform.py b/test/test_transform.py index ed184fb50..8cd29f998 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -127,7 +127,7 @@ def test_to_batched(ctx_factory): def test_to_batched_temp(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel( + prog = lp.make_kernel( ''' { [i,j]: 0<=i,j Date: Thu, 2 Aug 2018 09:58:24 -0500 Subject: [PATCH 290/916] made transforms over a program a decorator. 
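With iterate_over_kernels_if_given_program in place, a transform is written once against a LoopKernel and applied to every callable kernel when handed a Program. A usage sketch; unroll_i is a made-up transform, not part of this patch:

    import loopy as lp
    from loopy.program import iterate_over_kernels_if_given_program

    @iterate_over_kernels_if_given_program
    def unroll_i(kernel, iname):
        # written against a single LoopKernel; the decorator maps it over
        # every CallableKernel when a Program is passed instead
        return lp.tag_inames(kernel, {iname: "unr"})

    prog = lp.make_kernel("{[i,j]: 0<=i,j<4}", "out[i, j] = i + j")
    prog = unroll_i(prog, "i")               # Program in, Program out
    knl = unroll_i(prog.root_kernel, "j")    # a bare LoopKernel still works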
--- loopy/__init__.py | 85 +++----------------- loopy/kernel/creation.py | 12 +-- loopy/kernel/tools.py | 8 +- loopy/preprocess.py | 33 ++------ loopy/program.py | 38 ++++++++- loopy/transform/add_barrier.py | 30 +------ loopy/transform/arithmetic.py | 31 +------ loopy/transform/batch.py | 31 +------ loopy/transform/data.py | 142 +++------------------------------ loopy/transform/iname.py | 142 +++------------------------------ loopy/transform/instruction.py | 6 +- loopy/transform/padding.py | 64 ++------------- loopy/transform/parameter.py | 30 +------ loopy/transform/precompute.py | 4 +- loopy/transform/subst.py | 60 +------------- 15 files changed, 119 insertions(+), 597 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index f3cd4f831..5a2487f17 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -29,6 +29,7 @@ from six.moves import range, zip from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.program import iterate_over_kernels_if_given_program # {{{ imported user interface @@ -173,7 +174,7 @@ __all__ = [ "CallInstruction", "CInstruction", "NoOpInstruction", "BarrierInstruction", - "ScalarCallable", + "ScalarCallable", "CallableKernel", "Program", "make_program_from_kernel", @@ -305,7 +306,8 @@ __all__ = [ # {{{ set_options -def set_options_for_single_kernel(kernel, *args, **kwargs): +@iterate_over_kernels_if_given_program +def set_options(kernel, *args, **kwargs): """Return a new kernel with the options given as keyword arguments, or from a string representation passed in as the first (and only) positional argument. @@ -339,36 +341,13 @@ def set_options_for_single_kernel(kernel, *args, **kwargs): return kernel.copy(options=new_opt) - -def set_options(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = set_options_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} # {{{ library registration -def register_preamble_generators_for_single_kernel(kernel, preamble_generators): +@iterate_over_kernels_if_given_program +def register_preamble_generators(kernel, preamble_generators): """ :arg manglers: list of functions of signature ``(preamble_info)`` generating tuples ``(sortable_str_identifier, code)``, @@ -392,30 +371,7 @@ def register_preamble_generators_for_single_kernel(kernel, preamble_generators): return kernel.copy(preamble_generators=new_pgens) -def register_preamble_generators(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = register_preamble_generators_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - - +@iterate_over_kernels_if_given_program def register_symbol_manglers(kernel, manglers): from loopy.tools import unpickles_equally @@ -433,7 +389,8 @@ def register_symbol_manglers(kernel, manglers): return kernel.copy(symbol_manglers=new_manglers) -def register_function_manglers_for_single_kernel(kernel, manglers): +@iterate_over_kernels_if_given_program +def register_function_manglers(kernel, manglers): """ :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)`` returning a :class:`loopy.CallMangleInfo`. @@ -454,30 +411,6 @@ def register_function_manglers_for_single_kernel(kernel, manglers): return kernel.copy(function_manglers=new_manglers) - -def register_function_manglers(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = register_function_manglers_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index d83dbd1c0..54bd5b219 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1775,8 +1775,8 @@ def add_inferred_inames(knl): def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) - from loopy.transform.subst import expand_subst_for_single_kernel - expanded_kernel = expand_subst_for_single_kernel(kernel) + from loopy.transform.subst import expand_subst + expanded_kernel = expand_subst(kernel) writer_map = kernel.writer_map() @@ -2318,8 +2318,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # NOTE: add_inferred_inames will be phased out and throws warnings if it # does something. knl = add_inferred_inames(knl) - from loopy.transform.parameter import fix_parameters_for_single_kernel - knl = fix_parameters_for_single_kernel(knl, **fixed_parameters) + from loopy.transform.parameter import fix_parameters + knl = fix_parameters(knl, **fixed_parameters) # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- @@ -2347,8 +2347,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): from loopy.kernel.tools import infer_arg_is_output_only knl = infer_arg_is_output_only(knl) - from loopy.preprocess import prepare_single_kernel_for_caching - knl = prepare_single_kernel_for_caching(knl) + from loopy.preprocess import prepare_for_caching + knl = prepare_for_caching(knl) creation_plog.done() diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index dcb0350ad..09369c1a3 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -186,8 +186,8 @@ def find_all_insn_inames(kernel): all_read_deps = {} all_write_deps = {} - from loopy.transform.subst import expand_subst_for_single_kernel - kernel = expand_subst_for_single_kernel(kernel) + from loopy.transform.subst import expand_subst + kernel = expand_subst(kernel) for insn in kernel.instructions: all_read_deps[insn.id] = read_deps = insn.read_dependency_names() @@ -837,13 +837,13 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: from loopy import untag_inames - from loopy.transform.iname import split_iname_for_single_kernel + from loopy.transform.iname import split_iname # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. 
return assign_automatic_axes( - split_iname_for_single_kernel( + split_iname( untag_inames(kernel, iname, AutoLocalIndexTagBase), iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 74fb28cca..f19c4d33f 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -40,15 +40,15 @@ from loopy.symbolic import RuleAwareIdentityMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) -from loopy.kernel.function_interface import CallableKernel, ScalarCallable - +from loopy.program import iterate_over_kernels_if_given_program import logging logger = logging.getLogger(__name__) # {{{ prepare for caching -def prepare_single_kernel_for_caching(kernel): +@iterate_over_kernels_if_given_program +def prepare_for_caching(kernel): import loopy as lp new_args = [] @@ -75,23 +75,6 @@ def prepare_single_kernel_for_caching(kernel): return kernel - -def prepare_for_caching(program): - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - # FIXME: this is an easy fix. remove the target attribute from - # kernel - new_subkernel = prepare_single_kernel_for_caching( - in_knl_callable.subkernel.copy(target=program.target)) - new_resolved_functions[func_id] = ( - in_knl_callable.copy(subkernel=new_subkernel)) - elif isinstance(in_knl_callable, ScalarCallable): - new_resolved_functions[func_id] = in_knl_callable - else: - raise NotImplementedError("Unknown InKernelCallable %s." % - type(in_knl_callable).__name__) - # }}} @@ -1954,8 +1937,8 @@ def realize_reduction(kernel, program_callables_info, insn_id_filter=None, kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - from loopy.transform.iname import tag_inames_for_single_kernel - kernel = tag_inames_for_single_kernel(kernel, new_iname_tags) + from loopy.transform.iname import tag_inames + kernel = tag_inames(kernel, new_iname_tags) # TODO: remove unused inames... @@ -2324,8 +2307,8 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): # }}} - from loopy.transform.subst import expand_subst_for_single_kernel - kernel = expand_subst_for_single_kernel(kernel) + from loopy.transform.subst import expand_subst + kernel = expand_subst(kernel) # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. 
@@ -2381,7 +2364,7 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): if CACHING_ENABLED: input_kernel = prepare_for_caching(input_kernel) - kernel = prepare_single_kernel_for_caching(kernel) + kernel = prepare_for_caching(kernel) # }}} diff --git a/loopy/program.py b/loopy/program.py index d60725e44..691aa9830 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -27,6 +27,7 @@ import re from pytools import ImmutableRecord, memoize_method from pymbolic.primitives import Variable +from functools import wraps from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction from loopy.kernel.function_interface import ( @@ -495,8 +496,10 @@ class ProgramCallablesInfo(ImmutableRecord): self.copy( resolved_functions=updated_resolved_functions, num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing=num_times_hit_during_editing, - renames_needed_after_editing=renames_needed_after_editing), + num_times_hit_during_editing=( + num_times_hit_during_editing), + renames_needed_after_editing=( + renames_needed_after_editing)), unique_function_identifier) else: # FIXME: maybe deal with the history over here? @@ -662,6 +665,37 @@ def make_program_from_kernel(kernel): return program +def iterate_over_kernels_if_given_program(transform_for_single_kernel): + def _collective_transform(program_or_kernel, *args, **kwargs): + if isinstance(program_or_kernel, Program): + program = program_or_kernel + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = transform_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + else: + assert isinstance(program_or_kernel, LoopKernel) + kernel = program_or_kernel + return transform_for_single_kernel(kernel) + + return wraps(transform_for_single_kernel)(_collective_transform) + + # {{{ ingoring this for now # if False and isinstance(function, (ArgExtOp, SegmentedOp)): diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index b6dddad38..4af0c9c54 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -26,9 +26,8 @@ THE SOFTWARE. from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency -from loopy.program import Program +from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel, ScalarCallable __doc__ = """ .. 
currentmodule:: loopy @@ -39,7 +38,8 @@ __doc__ = """ # {{{ add_barrier -def add_barrier_for_single_kernel(knl, insn_before="", insn_after="", +@iterate_over_kernels_if_given_program +def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, tags=None, synchronization_kind="global", mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel @@ -88,30 +88,6 @@ def add_barrier_for_single_kernel(knl, insn_before="", insn_after="", return new_knl - -def add_barrier(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = add_barrier_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} # vim: foldmethod=marker diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index d26782778..acf075deb 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -27,9 +27,8 @@ import six from loopy.diagnostic import LoopyError -from loopy.program import Program +from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel, ScalarCallable # {{{ fold constants @@ -57,8 +56,8 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented -def collect_common_factors_on_increment_in_single_kernel(kernel, var_name, - vary_by_axes=()): +@iterate_over_kernels_if_given_program +def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): assert isinstance(kernel, LoopKernel) # FIXME: Does not understand subst rules for now if kernel.substitutions: @@ -336,30 +335,6 @@ def collect_common_factors_on_increment_in_single_kernel(kernel, var_name, return kernel.copy(instructions=new_insns) - -def collect_common_factors_on_increment(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = collect_common_factors_on_increment_in_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 52cae60a2..970547003 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -29,8 +29,7 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl -from loopy.program import Program -from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.program import iterate_over_kernels_if_given_program __doc__ = """ @@ -106,7 +105,8 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) -def to_batched_for_single_kernel(knl, nbatches, batch_varying_args, +@iterate_over_kernels_if_given_program +def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. @@ -199,31 +199,6 @@ def to_batched_for_single_kernel(knl, nbatches, batch_varying_args, return kernel - -def to_batched(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = to_batched_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - - # }}} # vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index e09e44d6e..4eae36373 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -30,7 +30,7 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper -from loopy.program import Program +from loopy.program import Program, iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel, ScalarCallable @@ -415,7 +415,8 @@ def change_arg_to_image(knl, name): # {{{ tag array axes -def tag_array_axes_for_single_kernel(knl, ary_names, dim_tags): +@iterate_over_kernels_if_given_program +def tag_array_axes(knl, ary_names, dim_tags): """ .. 
versionchanged:: 2016.2 @@ -445,39 +446,15 @@ def tag_array_axes_for_single_kernel(knl, ary_names, dim_tags): tag_data_axes = ( - MovedFunctionDeprecationWrapper(tag_array_axes_for_single_kernel)) - - -def tag_array_axes(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = tag_array_axes_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - + MovedFunctionDeprecationWrapper(tag_array_axes)) # }}} # {{{ set_array_axis_names -def set_array_axis_names_for_single_kernel(kernel, ary_names, dim_names): +@iterate_over_kernels_if_given_program +def set_array_axis_names(kernel, ary_names, dim_names): """ .. versionchanged:: 2016.2 @@ -502,31 +479,7 @@ def set_array_axis_names_for_single_kernel(kernel, ary_names, dim_names): set_array_dim_names = (MovedFunctionDeprecationWrapper( - set_array_axis_names_for_single_kernel)) - - -def set_array_axis_names(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = set_array_axis_names_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + set_array_axis_names)) # }}} @@ -574,7 +527,8 @@ def remove_unused_arguments(knl): # {{{ alias_temporaries -def alias_temporaries_for_single_kernel(knl, names, base_name_prefix=None, +@iterate_over_kernels_if_given_program +def alias_temporaries(knl, names, base_name_prefix=None, synchronize_for_exclusive_use=True): """Sets all temporaries given by *names* to be backed by a single piece of storage. @@ -653,30 +607,6 @@ def alias_temporaries_for_single_kernel(knl, names, base_name_prefix=None, instructions=new_insns, temporary_variables=new_temporary_variables) - -def alias_temporaries(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = alias_temporaries_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} @@ -715,7 +645,8 @@ def set_argument_order(kernel, arg_names): # {{{ rename argument -def rename_argument_in_single_kernel(kernel, old_name, new_name, existing_ok=False): +@iterate_over_kernels_if_given_program +def rename_argument(kernel, old_name, new_name, existing_ok=False): """ .. versionadded:: 2016.2 """ @@ -755,36 +686,13 @@ def rename_argument_in_single_kernel(kernel, old_name, new_name, existing_ok=Fal return kernel.copy(args=new_args) - -def rename_argument(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = rename_argument_in_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} # {{{ set temporary scope -def set_temporary_scope_for_single_kernel(kernel, temp_var_names, scope): +@iterate_over_kernels_if_given_program +def set_temporary_scope(kernel, temp_var_names, scope): """ :arg temp_var_names: a container with membership checking, or a comma-separated string of variables for which the @@ -820,30 +728,6 @@ def set_temporary_scope_for_single_kernel(kernel, temp_var_names, scope): return kernel.copy(temporary_variables=new_temp_vars) - -def set_temporary_scope(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = set_temporary_scope_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index a058862a5..e68ed1381 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -34,9 +34,8 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError -from loopy.program import Program +from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel, ScalarCallable __doc__ = """ @@ -97,7 +96,8 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) -def prioritize_loops_for_single_kernel(kernel, loop_priority): +@iterate_over_kernels_if_given_program +def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. If the kernel logically requires a different nesting, priority is ignored. @@ -120,30 +120,6 @@ def prioritize_loops_for_single_kernel(kernel, loop_priority): return kernel.copy(loop_priority=kernel.loop_priority.union([loop_priority])) - -def prioritize_loops(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = prioritize_loops_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} @@ -329,7 +305,7 @@ def _split_iname_backend(kernel, split_iname, kernel = tag_inames(kernel, {outer_iname: existing_tag, inner_iname: existing_tag}) - return tag_inames_for_single_kernel(kernel, {outer_iname: outer_tag, + return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag}) # }}} @@ -337,7 +313,8 @@ def _split_iname_backend(kernel, split_iname, # {{{ split iname -def split_iname_for_single_kernel(kernel, split_iname, inner_length, +@iterate_over_kernels_if_given_program +def split_iname(kernel, split_iname, inner_length, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, slabs=(0, 0), do_tagged_check=True, @@ -375,36 +352,13 @@ def split_iname_for_single_kernel(kernel, split_iname, inner_length, slabs=slabs, do_tagged_check=do_tagged_check, within=within) - -def split_iname(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = split_iname_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} # {{{ chunk iname -def chunk_iname_for_single_kernel(kernel, split_iname, num_chunks, +@iterate_over_kernels_if_given_program +def chunk_iname(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, slabs=(0, 0), do_tagged_check=True, @@ -494,30 +448,6 @@ def chunk_iname_for_single_kernel(kernel, split_iname, num_chunks, slabs=slabs, do_tagged_check=do_tagged_check, within=within) - -def chunk_iname(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = chunk_iname_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} # }}} @@ -706,7 +636,8 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -def tag_inames_for_single_kernel(kernel, iname_to_tag, force=False, +@iterate_over_kernels_if_given_program +def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): """Tag an iname @@ -829,30 +760,6 @@ def tag_inames_for_single_kernel(kernel, iname_to_tag, force=False, return kernel.copy(iname_to_tags=knl_iname_to_tags) - -def tag_inames(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = tag_inames_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} @@ -910,7 +817,8 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) -def duplicate_inames_for_single_kernel(knl, inames, within, new_inames=None, +@iterate_over_kernels_if_given_program +def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, tags={}): """ @@ -990,36 +898,12 @@ def duplicate_inames_for_single_kernel(knl, inames, within, new_inames=None, for old_iname, new_iname in zip(inames, new_inames): new_tag = tags.get(old_iname) if new_tag is not None: - knl = tag_inames_for_single_kernel(knl, {new_iname: new_tag}) + knl = tag_inames(knl, {new_iname: new_tag}) # }}} return knl - -def duplicate_inames(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = duplicate_inames_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 72a3f118f..d09ac1515 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -286,13 +286,15 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, sources = frozenset([source]) else: sources = frozenset( - source.id for source in find_instructions(kernel, source)) + source.id for source in find_instructions_in_single_kernel( + kernel, source)) if isinstance(sink, str) and sink in kernel.id_to_insn: sinks = frozenset([sink]) else: sinks = frozenset( - sink.id for sink in find_instructions(kernel, sink)) + sink.id for sink in find_instructions_in_single_kernel( + kernel, sink)) if not sources and not empty_ok: raise LoopyError("No match found for source specification '%s'." % source) diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index a745a3948..4d8c81b43 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -28,9 +28,8 @@ THE SOFTWARE. from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext -from loopy.program import Program +from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel, ScalarCallable class ArrayAxisSplitHelper(RuleAwareIdentityMapper): @@ -48,7 +47,8 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): # {{{ split_array_dim (deprecated since June 2016) -def split_array_dim_for_single_kernel(kernel, arrays_and_axes, count, +@iterate_over_kernels_if_given_program +def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, split_kwargs=None): """ @@ -242,41 +242,16 @@ def split_array_dim_for_single_kernel(kernel, arrays_and_axes, count, kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) if auto_split_inames: - from loopy.transform.iname import split_iname_for_single_kernel + from loopy.transform.iname import split_iname for iname, (outer_iname, inner_iname) in six.iteritems(split_vars): - kernel = split_iname_for_single_kernel(kernel, iname, count, + kernel = split_iname(kernel, iname, count, outer_iname=outer_iname, inner_iname=inner_iname, **split_kwargs) return kernel -split_arg_axis = (MovedFunctionDeprecationWrapper( - split_array_dim_for_single_kernel)) - - -def split_array_dim(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = split_array_dim_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) +split_arg_axis = (MovedFunctionDeprecationWrapper(split_array_dim)) # }}} @@ -400,7 +375,8 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -def split_array_axis_for_single_kernel(kernel, array_names, axis_nr, count, +@iterate_over_kernels_if_given_program +def split_array_axis(kernel, array_names, axis_nr, count, order="C"): """ :arg array: a list of names of temporary variables or arguments. May @@ -428,30 +404,6 @@ def split_array_axis_for_single_kernel(kernel, array_names, axis_nr, count, return kernel - -def split_array_axis(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = split_array_axis_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 4b95d2a7b..0720a312b 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -28,9 +28,8 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl -from loopy.program import Program +from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel, ScalarCallable __doc__ = """ @@ -138,7 +137,8 @@ def _fix_parameter(kernel, name, value): )) -def fix_parameters_for_single_kernel(kernel, **value_dict): +@iterate_over_kernels_if_given_program +def fix_parameters(kernel, **value_dict): """Fix the values of the arguments to specific constants. *value_dict* consists of *name*/*value* pairs, where *name* will be fixed @@ -152,30 +152,6 @@ def fix_parameters_for_single_kernel(kernel, **value_dict): return kernel - -def fix_parameters(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = fix_parameters_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index fe61dfa23..66c7114ae 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -1040,8 +1040,8 @@ def precompute_for_single_kernel(kernel, program_callables_info, subst_use, # }}} - from loopy.transform.iname import tag_inames_for_single_kernel - kernel = tag_inames_for_single_kernel(kernel, new_iname_to_tag) + from loopy.transform.iname import tag_inames + kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index aae25f580..6d6f034f3 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -31,12 +31,10 @@ from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func from pytools import ImmutableRecord -from functools import wraps from pymbolic import var -from loopy.program import Program +from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -48,34 +46,7 @@ class ExprDescriptor(ImmutableRecord): # {{{ extract_subst -def iterate_over_kernel_if_given_program(transform_for_single_kernel): - def _collective_transform(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = transform_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - - return wraps(transform_for_single_kernel)(_collective_transform) - - -@iterate_over_kernel_if_given_program +@iterate_over_kernels_if_given_program def extract_subst(kernel, subst_name, template, parameters=()): """ :arg subst_name: The name of the substitution rule to be created. 
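With subst.py's private copy of the decorator removed in favor of the shared one, the transforms touched above (``fix_parameters``, ``extract_subst`` and friends) keep their familiar single-kernel signatures. A hedged usage sketch follows; ``make_kernel``, ``add_and_infer_dtypes``, ``fix_parameters`` and ``split_iname`` are existing public loopy entry points, and whether they hand back a Program or a bare LoopKernel on this branch is precisely what the decorator is meant to paper over::

    import numpy as np
    import loopy as lp

    prog = lp.make_kernel(
            "{[i]: 0 <= i < n}",
            "out[i] = 2*x[i]")
    prog = lp.add_and_infer_dtypes(prog, {"x": np.float64})

    # both transforms are decorated, so the same calls work whether `prog`
    # is a Program or a plain LoopKernel
    prog = lp.fix_parameters(prog, n=128)
    prog = lp.split_iname(prog, "i", 16, outer_tag="g.0", inner_tag="l.0")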
@@ -501,7 +472,8 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst -def expand_subst_for_single_kernel(kernel, within=None): +@iterate_over_kernels_if_given_program +def expand_subst(kernel, within=None): assert isinstance(kernel, LoopKernel) if not kernel.substitutions: return kernel @@ -519,30 +491,6 @@ def expand_subst_for_single_kernel(kernel, within=None): return rule_mapping_context.finish_kernel(submap.map_kernel(kernel)) - -def expand_subst(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = expand_subst_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - # }}} -- GitLab From efad0dea37cadda3042d3a9c11d6057fe1886266 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 10:27:45 -0500 Subject: [PATCH 291/916] minor error in decorator. --- loopy/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 691aa9830..131dd15c6 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -691,7 +691,7 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): else: assert isinstance(program_or_kernel, LoopKernel) kernel = program_or_kernel - return transform_for_single_kernel(kernel) + return transform_for_single_kernel(kernel, *args, **kwargs) return wraps(transform_for_single_kernel)(_collective_transform) -- GitLab From 2851298d75cd1dbd526463f6ebda4b33554d1234 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 10:48:52 -0500 Subject: [PATCH 292/916] fixes test_transform --- loopy/transform/data.py | 9 ++-- loopy/transform/iname.py | 2 + loopy/transform/instruction.py | 5 ++- loopy/type_inference.py | 4 +- test/test_transform.py | 81 +++++++++++++++++++--------------- 5 files changed, 60 insertions(+), 41 deletions(-) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 4eae36373..61da070fe 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -333,9 +333,9 @@ def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, # precompute module, but precompute acutally uses that to adjust its # warning message. 
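The one-line change in the "minor error in decorator" patch a few hunks above is easy to overlook: the kernel branch of ``_collective_transform`` used to call the wrapped transform without forwarding its own arguments. A toy reproduction of that failure mode in plain Python (no loopy names involved)::

    def broken(transform):
        def wrapper(knl, *args, **kwargs):
            return transform(knl)                   # bug: extra arguments dropped
        return wrapper

    def fixed(transform):
        def wrapper(knl, *args, **kwargs):
            return transform(knl, *args, **kwargs)  # arguments forwarded
        return wrapper

    def scale(value, factor):
        return value * factor

    try:
        broken(scale)(2, 3)
    except TypeError as err:
        print("broken:", err)   # scale() missing 1 required positional argument

    print("fixed:", fixed(scale)(2, 3))             # fixed: 6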
- from loopy.transform.precompute import precompute - new_kernel = precompute(kernel, program_callables_info, subst_use, - sweep_inames, precompute_inames=dim_arg_names, + from loopy.transform.precompute import precompute_for_single_kernel + new_kernel = precompute_for_single_kernel(kernel, program_callables_info, + subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, @@ -612,11 +612,14 @@ def alias_temporaries(knl, names, base_name_prefix=None, # {{{ set argument order +@iterate_over_kernels_if_given_program def set_argument_order(kernel, arg_names): """ :arg arg_names: A list (or comma-separated string) or argument names. All arguments must be in this list. """ + #FIXME: @inducer -- shoulld this only affect the root kernel, or should it + # take a within? if isinstance(arg_names, str): arg_names = arg_names.split(",") diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index e68ed1381..579b918ad 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -492,6 +492,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): return super(_InameJoiner, self).map_reduction(expr, expn_state) +@iterate_over_kernels_if_given_program def join_inames(kernel, inames, new_iname=None, tag=None, within=None): """ :arg inames: fastest varying last @@ -1335,6 +1336,7 @@ def split_reduction_outward(kernel, inames, within=None): # {{{ affine map inames +@iterate_over_kernels_if_given_program def affine_map_inames(kernel, old_inames, new_inames, equations): """Return a new *kernel* where the affine transform specified by *equations* has been applied to the inames. diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index d09ac1515..f98c0bcae 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -27,7 +27,7 @@ import six # noqa from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) -from loopy.program import Program +from loopy.program import Program, iterate_over_kernels_if_given_program # {{{ find_instructions @@ -249,6 +249,7 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync +@iterate_over_kernels_if_given_program def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. @@ -281,6 +282,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, This used to silently pass. This behavior can be restored using *empty_ok*. """ + assert isinstance(kernel, LoopKernel) if isinstance(source, str) and source in kernel.id_to_insn: sources = frozenset([source]) @@ -347,6 +349,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, # {{{ uniquify_instruction_ids +@iterate_over_kernels_if_given_program def uniquify_instruction_ids(kernel): """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique strings. 
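``add_nosync`` and ``uniquify_instruction_ids`` above gain the decorator as well, which is why the test updates further down inspect their results through the program's root kernel. A hedged sketch of that call pattern, modeled on the modified ``test_add_nosync``; ``root_kernel`` and ``id_to_insn`` are the attribute names those tests use on this branch and may differ elsewhere::

    import loopy as lp

    prog = lp.make_kernel(
            "{[i]: 0 <= i < 10}",
            """
            <>tmp[i] = 10 {id=insn1}
            <>tmp2[i] = 10 {id=insn2}
            """)

    # no dependency exists between the matched instructions, so with
    # empty_ok=True the program comes back unchanged
    prog = lp.add_nosync(prog, "any", "writes:tmp", "writes:tmp2",
            empty_ok=True)

    assert prog.root_kernel.id_to_insn["insn2"].no_sync_with == frozenset()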
diff --git a/loopy/type_inference.py b/loopy/type_inference.py index faebe94de..01ffd5e33 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -651,8 +651,8 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, unexpanded_kernel = kernel if kernel.substitutions: - from loopy.transform.subst import expand_subst_for_single_kernel - kernel = expand_subst_for_single_kernel(kernel) + from loopy.transform.subst import expand_subst + kernel = expand_subst(kernel) new_temp_vars = kernel.temporary_variables.copy() new_arg_dict = kernel.arg_dict.copy() diff --git a/test/test_transform.py b/test/test_transform.py index 8cd29f998..6c9d07a01 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -357,33 +357,34 @@ def test_affine_map_inames(): def test_precompute_confusing_subst_arguments(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i,j]: 0<=itmp[i] = 10 {id=insn1} <>tmp2[i] = 10 {id=insn2} @@ -491,28 +494,34 @@ def test_add_nosync(): tmp5[i] = 1 {id=insn6,conflicts=g1} """) - orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local") # No dependency present - don't add nosync - knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2", + prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) - assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + assert frozenset() == ( + prog.root_kernel.id_to_insn["insn2"].no_sync_with) # Dependency present - knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") - assert frozenset() == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Bidirectional - knl = lp.add_nosync( - orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) - assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync( + orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Groups - knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") - assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == ( + prog.root_kernel.id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): @@ -521,12 +530,14 @@ def test_uniquify_instruction_ids(): i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) - knl = lp.make_kernel("{[i]: i = 1}", []).copy(instructions=[i1, i2, i3, i4]) + prog = lp.make_kernel("{[i]: i = 1}", []) + new_root_kernel = prog.root_kernel.copy(instructions=[i1, i2, i3, i4]) + prog = prog.with_root_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids - knl = 
uniquify_instruction_ids(knl) + prog = uniquify_instruction_ids(prog) - insn_ids = set(insn.id for insn in knl.instructions) + insn_ids = set(insn.id for insn in prog.root_kernel.instructions) assert len(insn_ids) == 4 assert all(isinstance(id, str) for id in insn_ids) -- GitLab From fdd2f15c311c84db1241427485817f9b5c52cce9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 12:48:28 -0500 Subject: [PATCH 293/916] address more tests. --- loopy/auto_test.py | 18 ++++--------- loopy/kernel/tools.py | 1 + loopy/library/random123.py | 2 +- loopy/transform/data.py | 1 + loopy/transform/iname.py | 3 +++ loopy/transform/instruction.py | 1 + test/test_reduction.py | 47 +++++++++++++++++----------------- test/test_transform.py | 6 ++--- 8 files changed, 38 insertions(+), 41 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 884bd946b..1fc46ffd7 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -31,7 +31,6 @@ import numpy as np import loopy as lp from loopy.diagnostic import LoopyError, AutomaticTestFailure -from loopy.kernel import LoopKernel AUTO_TEST_SKIP_RUN = False @@ -368,7 +367,7 @@ def _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors): # {{{ main automatic testing entrypoint def auto_test_vs_ref( - ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={}, + ref_prog, ctx, test_prog=None, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, dump_binary=False, fills_entire_output=None, do_check=True, check_result=None, @@ -385,19 +384,12 @@ def auto_test_vs_ref( import pyopencl as cl - if test_knl is None: - test_knl = ref_knl + if test_prog is None: + test_prog = ref_prog do_check = False - if isinstance(ref_knl, LoopKernel): - ref_prog = lp.make_program_from_kernel(ref_knl) - else: - ref_prog = ref_knl - - if isinstance(test_knl, LoopKernel): - test_prog = lp.make_program_from_kernel(test_knl) - else: - test_prog = test_knl + ref_prog = lp.preprocess_kernel(ref_prog) + test_prog = lp.preprocess_kernel(test_prog) if len(ref_prog.args) != len(test_prog.args): raise LoopyError("ref_prog and test_prog do not have the same number " diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 09369c1a3..1c37ae407 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -797,6 +797,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non return assign_automatic_axes( kernel.copy(iname_to_tags=new_iname_to_tags), + program_callables_info, axis=recursion_axis) if axis is None: diff --git a/loopy/library/random123.py b/loopy/library/random123.py index d172408d8..59ca72df1 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -175,7 +175,7 @@ class Random123Callable(ScalarCallable): arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): # the types provided aren't mature enough to specialize the # callable - return (self.copy(arg_id_to_dtype=arg_id_to_dtype), + return (self.copy(), program_callables_info) name = self.name diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 61da070fe..9534279d4 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -736,6 +736,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule +@iterate_over_kernels_if_given_program def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): inames = [s.strip() for s in inames.split(",")] diff --git a/loopy/transform/iname.py 
b/loopy/transform/iname.py index 579b918ad..0d5f2015e 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1294,6 +1294,7 @@ def _split_reduction(kernel, inames, direction, within=None): rsplit.map_kernel(kernel)) +@iterate_over_kernels_if_given_program def split_reduction_inward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1313,6 +1314,7 @@ def split_reduction_inward(kernel, inames, within=None): return _split_reduction(kernel, inames, "in", within) +@iterate_over_kernels_if_given_program def split_reduction_outward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1668,6 +1670,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expr, expn_state) +@iterate_over_kernels_if_given_program def make_reduction_inames_unique(kernel, inames=None, within=None): """ :arg inames: if not *None*, only apply to these inames diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index f98c0bcae..eaf6d3021 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -95,6 +95,7 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency +@iterate_over_kernels_if_given_program def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. diff --git a/test/test_reduction.py b/test/test_reduction.py index 78eca4d0c..6ed618f4f 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -80,7 +80,7 @@ def test_empty_reduction(ctx_factory): "a[i] = sum(j, j)", ) - knl = lp.realize_reduction(knl) + knl = lp.preprocess_kernel(knl) print(knl) knl = lp.set_options(knl, write_cl=True) @@ -109,11 +109,9 @@ def test_nested_dependent_reduction(ctx_factory): lp.GlobalArg("ell", np.int32, ("n",)), ]) - cknl = lp.CompiledKernel(ctx, knl) - n = 330 ell = np.arange(n, dtype=np.int32) - evt, (a,) = cknl(queue, ell=ell, n=n, out_host=True) + evt, (a,) = knl(queue, ell=ell, n=n, out_host=True) tgt_result = (2*ell-1)*2*ell/2 assert (a == tgt_result).all() @@ -144,10 +142,10 @@ def test_multi_nested_dependent_reduction(ctx_factory): lp.ValueArg("ntgts", np.int32), lp.ValueArg("nboxes", np.int32), ], - assumptions="ntgts>=1") + assumptions="ntgts>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - cknl = lp.CompiledKernel(ctx, knl) - print(cknl.get_code()) + print(lp.generate_code_v2(knl).device_code()) # FIXME: Actually test functionality. @@ -177,10 +175,10 @@ def test_recursive_nested_dependent_reduction(ctx_factory): lp.ValueArg("ntgts", np.int32), lp.ValueArg("nboxes", np.int32), ], - assumptions="ntgts>=1") + assumptions="ntgts>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - cknl = lp.CompiledKernel(ctx, knl) - print(cknl.get_code()) + print(lp.generate_code_v2(knl).device_code()) # FIXME: Actually test functionality. @@ -221,32 +219,33 @@ def test_local_parallel_reduction(ctx_factory, size): def test_global_parallel_reduction(ctx_factory, size): ctx = ctx_factory() - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0 <= i < n }", """ # Using z[0] instead of z works around a bug in ancient PyOpenCL. 
z[0] = sum(i, i/13) """) - ref_knl = knl + ref_prog = prog gsize = 128 - knl = lp.split_iname(knl, "i", gsize * 20) - knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") - knl = lp.split_reduction_inward(knl, "i_inner_inner") - knl = lp.split_reduction_inward(knl, "i_inner_outer") + prog = lp.split_iname(prog, "i", gsize * 20) + prog = lp.split_iname(prog, "i_inner", gsize, outer_tag="l.0") + prog = lp.split_reduction_inward(prog, "i_inner_inner") + prog = lp.split_reduction_inward(prog, "i_inner_outer") from loopy.transform.data import reduction_arg_to_subst_rule - knl = reduction_arg_to_subst_rule(knl, "i_outer") - knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", + prog = reduction_arg_to_subst_rule(prog, "i_outer") + prog = lp.precompute(prog, "red_i_outer_arg", "i_outer", temporary_scope=lp.temp_var_scope.GLOBAL, default_tag="l.auto") - knl = lp.realize_reduction(knl) - knl = lp.add_dependency( - knl, "writes:acc_i_outer", + knl = lp.realize_reduction(prog.root_kernel, prog.program_callables_info) + prog = prog.with_root_kernel(knl) + prog = lp.add_dependency( + prog, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") lp.auto_test_vs_ref( - ref_knl, ctx, knl, parameters={"n": size}, + ref_prog, ctx, prog, parameters={"n": size}, print_ref_code=True) @@ -270,6 +269,7 @@ def test_global_mc_parallel_reduction(ctx_factory, size): """) ref_knl = knl + ref_knl = lp.add_dtypes(ref_knl, {"n": np.int32}) gsize = 128 knl = lp.split_iname(knl, "i", gsize * 20) @@ -281,7 +281,7 @@ def test_global_mc_parallel_reduction(ctx_factory, size): knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", temporary_scope=lp.temp_var_scope.GLOBAL, default_tag="l.auto") - knl = lp.realize_reduction(knl) + knl = lp.preprocess_kernel(knl) knl = lp.add_dependency( knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") @@ -406,7 +406,6 @@ def test_parallel_multi_output_reduction(ctx_factory): """) knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.add_dtypes(knl, dict(a=np.float64)) - knl = lp.realize_reduction(knl) ctx = ctx_factory() diff --git a/test/test_transform.py b/test/test_transform.py index 6c9d07a01..d54a820a8 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -322,12 +322,12 @@ def test_tag_data_axes(ctx_factory): ref_knl = knl with pytest.raises(lp.LoopyError): - lp.tag_data_axes(knl, "out", "N1,N0,N5") + lp.tag_array_axes(knl, "out", "N1,N0,N5") with pytest.raises(lp.LoopyError): - lp.tag_data_axes(knl, "out", "N1,N0,c") + lp.tag_array_axes(knl, "out", "N1,N0,c") - knl = lp.tag_data_axes(knl, "out", "N1,N0,N2") + knl = lp.tag_array_axes(knl, "out", "N1,N0,N2") knl = lp.tag_inames(knl, dict(j="g.0", i="g.1")) lp.auto_test_vs_ref(ref_knl, ctx, knl, -- GitLab From 2bdacabc9fa8a138f9a92dbe486499d5840672fa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 12:54:24 -0500 Subject: [PATCH 294/916] changes to ArgExtOp in with_calllable --- loopy/program.py | 94 ++++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 131dd15c6..8e1e13b78 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -460,6 +460,27 @@ class ProgramCallablesInfo(ImmutableRecord): else: num_times_hit_during_editing[function.name] += 1 + if isinstance(function, (ArgExtOp, SegmentedOp)): + unique_function_identifier = function.copy() + if not resolved_for_the_first_time: + num_times_callables_called[function] -= 1 + + num_times_callables_called[unique_function_identifier] = 1 + + 
updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + resolved_functions=updated_resolved_functions, + num_times_callables_called=num_times_callables_called, + num_times_hit_during_editing=( + num_times_hit_during_editing), + renames_needed_after_editing=( + renames_needed_after_editing)), + unique_function_identifier) + if in_kernel_callable in self.resolved_functions.values(): # the callable already exists, implies return the function # identifier corresposing to that callable. @@ -481,54 +502,33 @@ class ProgramCallablesInfo(ImmutableRecord): renames_needed_after_editing)), func_id) else: - if isinstance(function, (ArgExtOp, SegmentedOp)): - unique_function_identifier = function.copy() - if not resolved_for_the_first_time: - num_times_callables_called[function] -= 1 - - num_times_callables_called[unique_function_identifier] = 1 - - updated_resolved_functions = self.resolved_functions.copy() - updated_resolved_functions[unique_function_identifier] = ( - in_kernel_callable) - - return ( - self.copy( - resolved_functions=updated_resolved_functions, - num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing=( - num_times_hit_during_editing), - renames_needed_after_editing=( - renames_needed_after_editing)), - unique_function_identifier) - else: - # FIXME: maybe deal with the history over here? - # FIXME: once the code logic is running beautify this part. - # many "ifs" can be avoided - unique_function_identifier = function.name - if (resolved_for_the_first_time or - self.num_times_callables_called[function.name] > 1): - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) - - if not resolved_for_the_first_time: - num_times_callables_called[function.name] -= 1 - - num_times_callables_called[unique_function_identifier] = 1 - - updated_resolved_functions = self.resolved_functions.copy() - updated_resolved_functions[unique_function_identifier] = ( - in_kernel_callable) + # FIXME: maybe deal with the history over here? + # FIXME: once the code logic is running beautify this part. 
+ # many "ifs" can be avoided + unique_function_identifier = function.name + if (resolved_for_the_first_time or + self.num_times_callables_called[function.name] > 1): + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + if not resolved_for_the_first_time: + num_times_callables_called[function.name] -= 1 + + num_times_callables_called[unique_function_identifier] = 1 + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) - return ( - self.copy( - resolved_functions=updated_resolved_functions, - num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing=num_times_hit_during_editing, - renames_needed_after_editing=renames_needed_after_editing), - Variable(unique_function_identifier)) + return ( + self.copy( + resolved_functions=updated_resolved_functions, + num_times_callables_called=num_times_callables_called, + num_times_hit_during_editing=num_times_hit_during_editing, + renames_needed_after_editing=renames_needed_after_editing), + Variable(unique_function_identifier)) def with_exit_edit_callables_mode(self): assert self.is_being_edited -- GitLab From 2b56cf190d7e85131f15904545535265ec3679ec Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 13:20:37 -0500 Subject: [PATCH 295/916] passes all scan tests --- loopy/preprocess.py | 48 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f19c4d33f..2d1ef2b81 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -40,7 +40,8 @@ from loopy.symbolic import RuleAwareIdentityMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) -from loopy.program import iterate_over_kernels_if_given_program +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -892,9 +893,9 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -def realize_reduction(kernel, program_callables_info, insn_id_filter=None, - unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, - force_outer_iname_for_scan=None): +def realize_reduction_for_single_kernel(kernel, program_callables_info, + insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, + force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. 
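``realize_reduction_for_single_kernel`` above now takes ``program_callables_info`` next to the kernel, which is presumably why it cannot simply reuse the generic decorator: the per-kernel transform needs program-wide state, so the hunks that follow add a hand-rolled loop over the resolved callables instead. Reduced to a self-contained toy (``Prog`` and ``KernelCallable`` below are stand-ins, not loopy classes), the shape of that loop is::

    class KernelCallable:                      # stand-in for CallableKernel
        def __init__(self, subkernel):
            self.subkernel = subkernel

    class Prog:                                # stand-in for Program
        def __init__(self, callables):
            self.callables = callables         # function name -> callable

    def transform_each_subkernel(prog, per_kernel_fn, *args, **kwargs):
        new_callables = {}
        for name, clbl in prog.callables.items():
            if isinstance(clbl, KernelCallable):
                # the program-wide callables table travels with each subkernel
                clbl = KernelCallable(per_kernel_fn(
                        clbl.subkernel, prog.callables, *args, **kwargs))
            new_callables[name] = clbl
        return Prog(new_callables)

    prog = Prog({"loopy_kernel": KernelCallable("fake kernel")})
    prog = transform_each_subkernel(prog, lambda knl, callables: knl.upper())
    print(prog.callables["loopy_kernel"].subkernel)    # FAKE KERNEL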
@@ -1372,7 +1373,7 @@ def realize_reduction(kernel, program_callables_info, insn_id_filter=None, # {{{ sequential scan - def map_scan_seq(expr, rec, nresults, arg_dtypes, + def map_scan_seq(expr, rec, program_callables_info, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1787,15 +1788,17 @@ def realize_reduction(kernel, program_callables_info, insn_id_filter=None, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes, + expr, rec, program_callables_info, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, + expr, rec, program_callables_info, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, + scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) @@ -1948,6 +1951,31 @@ def realize_reduction(kernel, program_callables_info, insn_id_filter=None, return kernel + +def realize_reduction(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = realize_reduction_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} @@ -2328,8 +2356,8 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): # - realize_reduction must happen after default dependencies are added # because it manipulates the depends_on field, which could prevent # defaults from being applied. - kernel = realize_reduction(kernel, program_callables_info, - unknown_types_ok=False) + kernel = realize_reduction_for_single_kernel(kernel, + program_callables_info, unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators -- GitLab From 6a2249936240b0210f18a0a04f8ba11d4b5265b3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 15:21:47 -0500 Subject: [PATCH 296/916] mediocre work in statistics. --- loopy/statistics.py | 434 ++++++++++++++++++++++++++++---------------- 1 file changed, 278 insertions(+), 156 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 72f73f56a..3b926cc61 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -33,6 +33,7 @@ from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from pytools import Record +from loopy.kernel.function_interface import ScalarCallable, CallableKernel __doc__ = """ @@ -59,6 +60,10 @@ __doc__ = """ """ +# FIXME: this is broken for the callable kernel design. 
+# the information of variable being referenced by different names must be taken +# into consideration. + # {{{ GuardedPwQPolynomial class GuardedPwQPolynomial(object): @@ -639,10 +644,11 @@ class MemAccess(Record): # {{{ counter base class CounterBase(CombineMapper): - def __init__(self, knl): + def __init__(self, knl, program_callables_info): self.knl = knl + self.program_callables_info = program_callables_info from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.type_inf = TypeInferenceMapper(knl, program_callables_info) def combine(self, values): return sum(values) @@ -697,10 +703,11 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl): + def __init__(self, knl, program_callables_info): self.knl = knl + self.program_callables_info = program_callables_info from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.type_inf = TypeInferenceMapper(knl, program_callables_info) def combine(self, values): return sum(values) @@ -714,7 +721,7 @@ class ExpressionOpCounter(CounterBase): def map_call(self, expr): from loopy.symbolic import ResolvedFunction if isinstance(expr.function, ResolvedFunction): - function_identifier = self.knl.scoped_functions[ + function_identifier = self.program_callables_info[ expr.function.name].name else: function_identifier = expr.function.name @@ -1195,9 +1202,10 @@ def count(kernel, set, space=None): return add_assumptions_guard(kernel, count) -def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): +def get_unused_hw_axes_factor(knl, program_callables_info, insn, + disregard_local_axes, space=None): # FIXME: Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds() + gsize, lsize = knl.get_grid_size_upper_bounds(program_callables_info) g_used = set() l_used = set() @@ -1235,7 +1243,8 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): return add_assumptions_guard(knl, result) -def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): +def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, + disregard_local_axes=False): insn_inames = knl.insn_inames(insn) @@ -1255,9 +1264,8 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) c = count(knl, domain, space=space) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes, - space=space) + unused_fac = get_unused_hw_axes_factor(knl, program_callables_info, + insn, disregard_local_axes=disregard_local_axes, space=space) return c * unused_fac else: return c @@ -1267,7 +1275,50 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False, + +def get_op_map_for_single_kernel(knl, program_callables_info, + numpy_types=True, count_redundant_work=False, + subgroup_size=None): + + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." 
% knl.name) + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + op_map = ToCountMap() + op_counter = ExpressionOpCounter(knl, + program_callables_info=program_callables_info) + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + ops = op_counter(insn.assignee) + op_counter(insn.expression) + op_map = op_map + ops*count_insn_runs( + knl, program_callables_info, insn, + count_redundant_work=count_redundant_work) + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + if numpy_types: + return ToCountMap( + init_dict=dict( + (Op( + dtype=op.dtype.numpy_dtype, + name=op.name, + count_granularity=op.count_granularity), + ct) + for op, ct in six.iteritems(op_map.count_map)), + val_type=op_map.val_type + ) + else: + return op_map + + +def get_op_map(program, numpy_types=True, count_redundant_work=False, subgroup_size=None): """Count the number of operations in a loopy kernel. @@ -1325,44 +1376,31 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, """ - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + from loopy.preprocess import preprocess_program, infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl) - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_map = op_map + ops*count_insn_runs( - knl, insn, - count_redundant_work=count_redundant_work) - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_op_map = get_op_map_for_single_kernel(knl, + program.program_callables_info, numpy_types, + count_redundant_work, subgroup_size) + + for i in range(num_times_called): + op_map += knl_op_map + elif isinstance(in_knl_callable, ScalarCallable): pass else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + raise NotImplementedError("Unknown callabke types %s." % ( + type(in_knl_callable).__name__)) - if numpy_types: - return ToCountMap( - init_dict=dict( - (Op( - dtype=op.dtype.numpy_dtype, - name=op.name, - count_granularity=op.count_granularity), - ct) - for op, ct in six.iteritems(op_map.count_map)), - val_type=op_map.val_type - ) - else: - return op_map + return op_map # }}} @@ -1383,93 +1421,9 @@ def _find_subgroup_size_for_knl(knl): # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, - subgroup_size=None): - """Count the number of memory accesses in a loopy kernel. 
- - :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be - counted. - - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.LoopyType`. - - :arg count_redundant_work: Based on usage of hardware axes or other - specifics, a kernel may perform work redundantly. This :class:`bool` - flag indicates whether this work should be included in the count. - (Likely desirable for performance modeling, but undesirable for - code optimization.) - - :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or - *None* that specifies the sub-group size. An OpenCL sub-group is an - implementation-dependent grouping of work-items within a work-group, - analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when - counting a :class:`MemAccess` whose count_granularity specifies that it - should only be counted once per sub-group. If set to *None* an attempt - to find the sub-group size using the device will be made, if this fails - an error will be raised. If a :class:`str` ``'guess'`` is passed as - the subgroup_size, get_mem_access_map will attempt to find the - sub-group size using the device and, if unsuccessful, will make a wild - guess. - - :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** - :class:`islpy.PwQPolynomial` **}**. - - - The :class:`MemAccess` specifies the characteristics of the memory - access. - - - The :class:`islpy.PwQPolynomial` holds the number of memory accesses - with the characteristics specified in the key (in terms of the - :class:`loopy.LoopKernel` *inames*). - - Example usage:: - - # (first create loopy kernel and specify array data types) - - params = {'n': 512, 'm': 256, 'l': 128} - mem_map = get_mem_access_map(knl) - - f32_s1_g_ld_a = mem_map[MemAccess( - mtype='global', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='load', - variable='a', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - f32_s1_g_st_a = mem_map[MemAccess( - mtype='global', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='store', - variable='a', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - f32_s1_l_ld_x = mem_map[MemAccess( - mtype='local', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='load', - variable='x', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - f32_s1_l_st_x = mem_map[MemAccess( - mtype='local', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='store', - variable='x', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - - # (now use these counts to, e.g., predict performance) - """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types +def get_access_map_for_single_kernel(knl, program_callables_info, + numpy_types=True, count_redundant_work=False, subgroup_size=None): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " @@ -1525,11 +1479,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, program_callables_info, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, program_callables_info, 
insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: @@ -1537,7 +1492,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(program_callables_info) workgroup_size = 1 if local_size: for size in local_size: @@ -1563,12 +1518,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, "not allowed. count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl) - access_counter_l = LocalMemAccessCounter(knl) + access_counter_g = GlobalMemAccessCounter(knl, program_callables_info) + access_counter_l = LocalMemAccessCounter(knl, program_callables_info) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1624,12 +1576,129 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, else: return access_map + +def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, + subgroup_size=None): + """Count the number of memory accesses in a loopy kernel. + + :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be + counted. + + :arg numpy_types: A :class:`bool` specifying whether the types in the + returned mapping should be numpy types instead of + :class:`loopy.LoopyType`. + + :arg count_redundant_work: Based on usage of hardware axes or other + specifics, a kernel may perform work redundantly. This :class:`bool` + flag indicates whether this work should be included in the count. + (Likely desirable for performance modeling, but undesirable for + code optimization.) + + :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or + *None* that specifies the sub-group size. An OpenCL sub-group is an + implementation-dependent grouping of work-items within a work-group, + analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when + counting a :class:`MemAccess` whose count_granularity specifies that it + should only be counted once per sub-group. If set to *None* an attempt + to find the sub-group size using the device will be made, if this fails + an error will be raised. If a :class:`str` ``'guess'`` is passed as + the subgroup_size, get_mem_access_map will attempt to find the + sub-group size using the device and, if unsuccessful, will make a wild + guess. + + :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** + :class:`islpy.PwQPolynomial` **}**. + + - The :class:`MemAccess` specifies the characteristics of the memory + access. + + - The :class:`islpy.PwQPolynomial` holds the number of memory accesses + with the characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = get_mem_access_map(knl) + + f32_s1_g_ld_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='load', + variable='a', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + f32_s1_g_st_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='store', + variable='a', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + f32_s1_l_ld_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='load', + variable='x', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + f32_s1_l_st_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='store', + variable='x', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + + # (now use these counts to, e.g., predict performance) + + """ + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + access_map = ToCountMap() + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_access_map = get_access_map_for_single_kernel(knl, + program.program_callables_info, numpy_types, + count_redundant_work, subgroup_size) + + # FIXME: didn't see any easy way to multiply + for i in range(num_times_called): + access_map += knl_access_map + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types %s." % ( + type(in_knl_callable).__name__)) + + return access_map + + # }}} # {{{ get_synchronization_map -def get_synchronization_map(knl, subgroup_size=None): +def get_synchronization_map_for_single_kernel(knl, program_callables_info, + subgroup_size=None): """Count the number of synchronization events each work-item encounters in a loopy kernel. @@ -1671,13 +1740,10 @@ def get_synchronization_map(knl, subgroup_size=None): raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." 
% knl.name) - from loopy.preprocess import preprocess_kernel, infer_unknown_types from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) from operator import mul - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl, program_callables_info) iname_list = [] result = ToCountMap() @@ -1720,12 +1786,42 @@ def get_synchronization_map(knl, subgroup_size=None): return result + +def get_synchronization_map(program, subgroup_size=None): + + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + sync_map = ToCountMap() + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_sync_map = get_synchronization_map_for_single_kernel(knl, + program.program_callables_info, subgroup_size) + + # FIXME: didn't see any easy way to multiply + for i in range(num_times_called): + sync_map += knl_sync_map + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types %s." % ( + type(in_knl_callable).__name__)) + + return sync_map + # }}} # {{{ gather_access_footprints -def gather_access_footprints(kernel, ignore_uncountable=False): +def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.Set` instances capturing which indices of each the array *var_name* are read/written (where *direction* is either ``read`` or @@ -1736,13 +1832,6 @@ def gather_access_footprints(kernel, ignore_uncountable=False): nonlinear indices) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - from loopy.kernel import KernelState - if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) - write_footprints = [] read_footprints = [] @@ -1765,6 +1854,39 @@ def gather_access_footprints(kernel, ignore_uncountable=False): write_footprints.append(afg(insn.assignees)) read_footprints.append(afg(insn.expression)) + return write_footprints, read_footprints + + +def gather_access_footprints(program, ignore_uncountable=False): + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + write_footprints = [] + read_footprints = [] + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_write_footprints, knl_read_footprints = ( + gather_access_footprints_for_single_kernel(knl, + ignore_uncountable)) + + # FIXME: didn't see any easy way to multiply + for i in range(num_times_called): + write_footprints.extend(knl_write_footprints) + read_footprints.extend(knl_read_footprints) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types %s." 
% ( + type(in_knl_callable).__name__)) + write_footprints = AccessFootprintGatherer.combine(write_footprints) read_footprints = AccessFootprintGatherer.combine(read_footprints) @@ -1779,7 +1901,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False): return result -def gather_access_footprint_bytes(kernel, ignore_uncountable=False): +def gather_access_footprint_bytes(program, ignore_uncountable=False): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.PwQPolynomial` instances capturing the number of bytes are read/written (where *direction* is either ``read`` or ``write`` on array @@ -1790,12 +1912,12 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): nonlinear indices) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.preprocess import preprocess_program, infer_unknown_types + kernel = infer_unknown_types(program, expect_completion=True) from loopy.kernel import KernelState if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) + kernel = preprocess_program(program) result = {} fp = gather_access_footprints(kernel, -- GitLab From ca5fe4d788615e256be054d6503aba30f1183c3e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 19:24:08 -0500 Subject: [PATCH 297/916] infer functions missed during type inference. --- loopy/; | 929 +++++++++++++++++++++++++++++++++++++ loopy/preprocess.py | 29 +- loopy/statistics.py | 6 +- loopy/transform/padding.py | 1 + loopy/type_inference.py | 90 +++- 5 files changed, 1028 insertions(+), 27 deletions(-) create mode 100644 loopy/; diff --git a/loopy/; b/loopy/; new file mode 100644 index 000000000..4dc55578f --- /dev/null +++ b/loopy/; @@ -0,0 +1,929 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import six + +from pymbolic.mapper import CombineMapper +import numpy as np + +from loopy.tools import is_integer +from loopy.types import NumpyType + +from loopy.diagnostic import ( + LoopyError, + TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.kernel.instruction import _DataObliviousInstruction + +from loopy.program import ProgramCallablesInfo + +import logging +logger = logging.getLogger(__name__) + + +def _debug(kernel, s, *args): + if logger.isEnabledFor(logging.DEBUG): + logstr = s % args + logger.debug("%s: %s" % (kernel.name, logstr)) + + +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :param arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. + """ + return_arg_id_to_dtype = dict((id, dtype) for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + +# {{{ type inference mapper + +class TypeInferenceMapper(CombineMapper): + def __init__(self, kernel, program_callables_info, new_assignments=None): + """ + :arg new_assignments: mapping from names to either + :class:`loopy.kernel.data.TemporaryVariable` + or + :class:`loopy.kernel.data.KernelArgument` + instances + """ + self.kernel = kernel + assert isinstance(program_callables_info, ProgramCallablesInfo) + if new_assignments is None: + new_assignments = {} + self.new_assignments = new_assignments + self.symbols_with_unknown_types = set() + self.program_callables_info = program_callables_info + self.old_calls_to_new_calls = {} + + def __call__(self, expr, return_tuple=False, return_dtype_set=False): + kwargs = {} + if return_tuple: + kwargs["return_tuple"] = True + + result = super(TypeInferenceMapper, self).__call__( + expr, **kwargs) + + assert isinstance(result, list) + + if return_tuple: + for result_i in result: + assert isinstance(result_i, tuple) + + assert return_dtype_set + return result + + else: + if return_dtype_set: + return result + else: + if not result: + raise DependencyTypeInferenceFailure( + ", ".join(sorted(self.symbols_with_unknown_types))) + + result, = result + return result + + # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) + # are Python-equal (for many common constants such as integers). + + def copy(self, program_callables_info=None): + if program_callables_info is None: + program_callables_info = self.program_callables_info + return type(self)(self.kernel, program_callables_info, + self.new_assignments) + + def with_assignments(self, names_to_vars): + new_ass = self.new_assignments.copy() + new_ass.update(names_to_vars) + return type(self)(self.kernel, self.program_callables_info, new_ass) + + @staticmethod + def combine(dtype_sets): + """ + :arg dtype_sets: A list of lists, where each of the inner lists + consists of either zero or one type. An empty list is + consistent with any type. A list with a type requires + that an operation be valid in conjunction with that type. 
+ """ + dtype_sets = list(dtype_sets) + + from loopy.types import LoopyType, NumpyType + assert all( + all(isinstance(dtype, LoopyType) for dtype in dtype_set) + for dtype_set in dtype_sets) + assert all( + 0 <= len(dtype_set) <= 1 + for dtype_set in dtype_sets) + + from pytools import is_single_valued + + dtypes = [dtype + for dtype_set in dtype_sets + for dtype in dtype_set] + + if not all(isinstance(dtype, NumpyType) for dtype in dtypes): + if not is_single_valued(dtypes): + raise TypeInferenceFailure( + "Nothing known about operations between '%s'" + % ", ".join(str(dtype) for dtype in dtypes)) + + return [dtypes[0]] + + numpy_dtypes = [dtype.dtype for dtype in dtypes] + + if not numpy_dtypes: + return [] + + if is_single_valued(numpy_dtypes): + return [dtypes[0]] + + result = numpy_dtypes.pop() + while numpy_dtypes: + other = numpy_dtypes.pop() + + if result.fields is None and other.fields is None: + if (result, other) in [ + (np.int32, np.float32), (np.float32, np.int32)]: + # numpy makes this a double. I disagree. + result = np.dtype(np.float32) + else: + result = ( + np.empty(0, dtype=result) + + np.empty(0, dtype=other) + ).dtype + + elif result.fields is None and other.fields is not None: + # assume the non-native type takes over + # (This is used for vector types.) + result = other + elif result.fields is not None and other.fields is None: + # assume the non-native type takes over + # (This is used for vector types.) + pass + else: + if result is not other: + raise TypeInferenceFailure( + "nothing known about result of operation on " + "'%s' and '%s'" % (result, other)) + + return [NumpyType(result)] + + def map_sum(self, expr): + dtype_sets = [] + small_integer_dtype_sets = [] + for child in expr.children: + dtype_set = self.rec(child) + if is_integer(child) and abs(child) < 1024: + small_integer_dtype_sets.append(dtype_set) + else: + dtype_sets.append(dtype_set) + + if all(dtype.is_integral() + for dtype_set in dtype_sets + for dtype in dtype_set): + dtype_sets.extend(small_integer_dtype_sets) + + return self.combine(dtype_sets) + + map_product = map_sum + + def map_quotient(self, expr): + n_dtype_set = self.rec(expr.numerator) + d_dtype_set = self.rec(expr.denominator) + + dtypes = n_dtype_set + d_dtype_set + + if all(dtype.is_integral() for dtype in dtypes): + # both integers + return [NumpyType(np.dtype(np.float64))] + + else: + return self.combine([n_dtype_set, d_dtype_set]) + + def map_constant(self, expr): + if is_integer(expr): + for tp in [np.int32, np.int64]: + iinfo = np.iinfo(tp) + if iinfo.min <= expr <= iinfo.max: + return [NumpyType(np.dtype(tp))] + + else: + raise TypeInferenceFailure("integer constant '%s' too large" % expr) + + dt = np.asarray(expr).dtype + if hasattr(expr, "dtype"): + return [NumpyType(expr.dtype)] + elif isinstance(expr, np.number): + # Numpy types are sized + return [NumpyType(np.dtype(type(expr)))] + elif dt.kind == "f": + # deduce the smaller type by default + return [NumpyType(np.dtype(np.float32))] + elif dt.kind == "c": + if np.complex64(expr) == np.complex128(expr): + # (COMPLEX_GUESS_LOGIC) + # No precision is lost by 'guessing' single precision, use that. + # This at least covers simple cases like '1j'. + return [NumpyType(np.dtype(np.complex64))] + + # Codegen for complex types depends on exactly correct types. + # Refuse temptation to guess. + raise TypeInferenceFailure("Complex constant '%s' needs to " + "be sized (i.e. 
as numpy.complex64/128) for type inference " + % expr) + else: + raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr) + + def map_type_cast(self, expr): + subtype, = self.rec(expr.child) + if not issubclass(subtype.dtype.type, np.number): + raise LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type)) + return [expr.type] + + def map_subscript(self, expr): + return self.rec(expr.aggregate) + + def map_linear_subscript(self, expr): + return self.rec(expr.aggregate) + + def map_call(self, expr, return_tuple=False): + + from pymbolic.primitives import Variable, CallWithKwargs, Call + from loopy.symbolic import ResolvedFunction + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} + + identifier = expr.function + if isinstance(identifier, (Variable, ResolvedFunction)): + identifier = identifier.name + + def none_if_empty(d): + if d: + d, = d + return d + else: + return None + + arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) + + # specializing the known function wrt type + if isinstance(expr.function, ResolvedFunction): + in_knl_callable = self.program_callables_info[expr.function.name] + + # {{{ checking that there is no overwriting of types of in_knl_callable + + if in_knl_callable.arg_id_to_dtype is not None: + + # specializing an already specialized function. + for id, dtype in arg_id_to_dtype.items(): + if in_knl_callable.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + + # {{{ ignoring the the cases when there is a discrepancy + # between np.uint and np.int + + import numpy as np + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint32) and ( + arg_id_to_dtype[id].dtype.type == np.int32): + continue + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint64) and ( + arg_id_to_dtype[id].dtype.type == + np.int64): + continue + + # }}} + + raise LoopyError("Overwriting a specialized function " + "is illegal--maybe start with new instance of " + "InKernelCallable?") + + # }}} + + in_knl_callable, self.program_callables_info = ( + in_knl_callable.with_types( + arg_id_to_dtype, self.kernel, + self.program_callables_info)) + + in_knl_callable = in_knl_callable.with_target(self.kernel.target) + + # storing the type specialized function so that it can be used for + # later use + self.program_callables_info, new_function_id = ( + self.program_callables_info.with_callable( + expr.function.function, + in_knl_callable)) + + if isinstance(expr, Call): + self.old_calls_to_new_calls[expr] = new_function_id + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls[expr] = new_function_id + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if new_arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(new_arg_id_to_dtype)] + else: + return [new_arg_id_to_dtype[-1]] + + elif isinstance(expr.function, Variable): + # Since, the function is not "scoped", attempt to infer using + # kernel.function_manglers + + # {{{ trying to infer using function manglers + + arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in + expr.parameters) + + # finding the function_mangler which would be associated with the + # realized function. 
+ + mangle_result = None + for function_mangler in self.kernel.function_manglers: + mangle_result = function_mangler(self.kernel, identifier, + arg_dtypes) + if mangle_result: + # found a match. + break + + if mangle_result is not None: + from loopy.kernel.function_interface import (ManglerCallable, + ValueArgDescriptor) + + # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes + arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) + for i, dt in enumerate(mangle_result.arg_dtypes)) + arg_id_to_dtype.update(dict((-i-1, + dtype.with_target(self.kernel.target)) for i, dtype in enumerate( + mangle_result.result_dtypes))) + arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.arg_dtypes)) + res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.result_dtypes)) + arg_id_to_descr = dict(arg_descrs+res_descrs) + + # creating the ManglerCallable object corresponding to the + # function. + in_knl_callable = ManglerCallable( + identifier, function_mangler, arg_id_to_dtype, + arg_id_to_descr, mangle_result.target_name) + self.program_callables_info, new_function_id = ( + self.program_callables_info.with_callable( + expr.function, in_knl_callable, True)) + + if isinstance(expr, Call): + self.old_calls_to_new_calls[expr] = new_function_id + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls = new_function_id + + # Returning the type. + if return_tuple: + if mangle_result is not None: + return [mangle_result.result_dtypes] + else: + if mangle_result is not None: + if len(mangle_result.result_dtypes) != 1 and not return_tuple: + raise LoopyError("functions with more or fewer than one " + "return value may only be used in direct " + "assignments") + + return [mangle_result.result_dtypes[0]] + # }}} + + return [] + + map_call_with_kwargs = map_call + + def map_variable(self, expr): + if expr.name in self.kernel.all_inames(): + return [self.kernel.index_dtype] + + result = self.kernel.mangle_symbol( + self.kernel.target.get_device_ast_builder(), + expr.name) + + if result is not None: + result_dtype, _ = result + return [result_dtype] + + obj = self.new_assignments.get(expr.name) + + if obj is None: + obj = self.kernel.arg_dict.get(expr.name) + + if obj is None: + obj = self.kernel.temporary_variables.get(expr.name) + + if obj is None: + raise TypeInferenceFailure("name not known in type inference: %s" + % expr.name) + + from loopy.kernel.data import TemporaryVariable, KernelArgument + import loopy as lp + if isinstance(obj, (KernelArgument, TemporaryVariable)): + assert obj.dtype is not lp.auto + result = [obj.dtype] + if result[0] is None: + self.symbols_with_unknown_types.add(expr.name) + return [] + else: + return result + + else: + raise RuntimeError("unexpected type inference " + "object type for '%s'" % expr.name) + + map_tagged_variable = map_variable + + def map_lookup(self, expr): + agg_result = self.rec(expr.aggregate) + if not agg_result: + return agg_result + + numpy_dtype = agg_result[0].numpy_dtype + fields = numpy_dtype.fields + if fields is None: + raise LoopyError("cannot look up attribute '%s' in " + "non-aggregate expression '%s'" + % (expr.name, expr.aggregate)) + + try: + field = fields[expr.name] + except KeyError: + raise LoopyError("cannot look up attribute '%s' in " + "aggregate expression '%s' of dtype '%s'" + % (expr.aggregate, expr.name, numpy_dtype)) + + dtype = field[0] + return [NumpyType(dtype)] + + def map_comparison(self, expr): + # "bool" is unusable because OpenCL's 
bool has indeterminate memory + # format. + return [NumpyType(np.dtype(np.int32))] + + map_logical_not = map_comparison + map_logical_and = map_comparison + map_logical_or = map_comparison + + def map_group_hw_index(self, expr, *args): + return [self.kernel.index_dtype] + + def map_local_hw_index(self, expr, *args): + return [self.kernel.index_dtype] + + def map_reduction(self, expr, return_tuple=False): + """ + :arg return_tuple: If *True*, treat the reduction as having tuple type. + Otherwise, if *False*, the reduction must have scalar type. + """ + from loopy.symbolic import Reduction + from pymbolic.primitives import Call + + if not return_tuple and expr.is_tuple_typed: + raise LoopyError("reductions with more or fewer than one " + "return value may only be used in direct " + "assignments") + + if isinstance(expr.expr, tuple): + rec_results = [self.rec(sub_expr) for sub_expr in expr.expr] + from itertools import product + rec_results = product(*rec_results) + elif isinstance(expr.expr, Reduction): + rec_results = self.rec(expr.expr, return_tuple=return_tuple) + elif isinstance(expr.expr, Call): + rec_results = self.map_call(expr.expr, return_tuple=return_tuple) + else: + if return_tuple: + raise LoopyError("unknown reduction type for tuple reduction: '%s'" + % type(expr.expr).__name__) + else: + rec_results = self.rec(expr.expr) + + if return_tuple: + return [expr.operation.result_dtypes(self.kernel, *rec_result) + for rec_result in rec_results] + else: + return [expr.operation.result_dtypes(self.kernel, rec_result)[0] + for rec_result in rec_results] + + def map_sub_array_ref(self, expr): + return self.rec(expr.get_begin_subscript()) + + +# }}} + + +# {{{ infer single variable + +def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + + if var_name in kernel.all_params(): + return [kernel.index_dtype], [], {}, ( + type_inf_mapper.program_callables_info) + + from functools import partial + debug = partial(_debug, kernel) + + dtype_sets = [] + + import loopy as lp + + type_inf_mapper = type_inf_mapper.copy() + + for writer_insn_id in kernel.writer_map().get(var_name, []): + writer_insn = kernel.id_to_insn[writer_insn_id] + if not isinstance(writer_insn, lp.MultiAssignmentBase): + continue + + expr = subst_expander(writer_insn.expression) + + debug(" via expr %s", expr) + if isinstance(writer_insn, lp.Assignment): + result = type_inf_mapper(expr, return_dtype_set=True) + elif isinstance(writer_insn, lp.CallInstruction): + return_dtype_set = type_inf_mapper(expr, return_tuple=True, + return_dtype_set=True) + + result = [] + for return_dtype_set in return_dtype_set: + result_i = None + found = False + for assignee, comp_dtype_set in zip( + writer_insn.assignee_var_names(), return_dtype_set): + if assignee == var_name: + found = True + result_i = comp_dtype_set + break + + assert found + if result_i is not None: + result.append(result_i) + + debug(" result: %s", result) + + dtype_sets.append(result) + + if not dtype_sets: + return ( + None, type_inf_mapper.symbols_with_unknown_types, None, + type_inf_mapper.program_callables_info) + + result = type_inf_mapper.combine(dtype_sets) + + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.old_calls_to_new_calls, + type_inf_mapper.program_callables_info) + +# }}} + + +class _DictUnionView: + def __init__(self, children): + self.children = children + + def get(self, key): + try: + return self[key] + except KeyError: + return None + + def __getitem__(self, key): + for ch in self.children: + try: + return 
ch[key] + except KeyError: + pass + + raise KeyError(key) + + +# {{{ infer_unknown_types + +def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, + expect_completion=False): + """Infer types on temporaries and arguments.""" + + logger.debug("%s: infer types" % kernel.name) + + from functools import partial + debug = partial(_debug, kernel) + + import time + start_time = time.time() + + unexpanded_kernel = kernel + if kernel.substitutions: + from loopy.transform.subst import expand_subst + kernel = expand_subst(kernel) + + new_temp_vars = kernel.temporary_variables.copy() + new_arg_dict = kernel.arg_dict.copy() + + # {{{ find names_with_unknown_types + + # contains both arguments and temporaries + names_for_type_inference = [] + + import loopy as lp + for tv in six.itervalues(kernel.temporary_variables): + assert tv.dtype is not lp.auto + if tv.dtype is None: + names_for_type_inference.append(tv.name) + + for arg in kernel.args: + assert arg.dtype is not lp.auto + if arg.dtype is None: + names_for_type_inference.append(arg.name) + + # }}} + + logger.debug("finding types for {count:d} names".format( + count=len(names_for_type_inference))) + + writer_map = kernel.writer_map() + + dep_graph = dict( + (written_var, set( + read_var + for insn_id in writer_map.get(written_var, []) + for read_var in kernel.id_to_insn[insn_id].read_dependency_names() + if read_var in names_for_type_inference)) + for written_var in names_for_type_inference) + + from loopy.tools import compute_sccs + + # To speed up processing, we sort the variables by computing the SCCs of the + # type dependency graph. Each SCC represents a set of variables whose types + # mutually depend on themselves. The SCCs are returned and processed in + # topological order. + sccs = compute_sccs(dep_graph) + + item_lookup = _DictUnionView([ + new_temp_vars, + new_arg_dict + ]) + type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info, + item_lookup) + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + # {{{ work on type inference queue + + from loopy.kernel.data import TemporaryVariable, KernelArgument + + old_calls_to_new_calls = {} + + for var_chain in sccs: + changed_during_last_queue_run = False + queue = var_chain[:] + failed_names = set() + + while queue or changed_during_last_queue_run: + if not queue and changed_during_last_queue_run: + changed_during_last_queue_run = False + # Optimization: If there's a single variable in the SCC without + # a self-referential dependency, then the type is known after a + # single iteration (we don't need to look at the expressions + # again). 
+ if len(var_chain) == 1: + single_var, = var_chain + if single_var not in dep_graph[single_var]: + break + queue = var_chain[:] + + name = queue.pop(0) + item = item_lookup[name] + + debug("inferring type for %s %s", type(item).__name__, item.name) + + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, program_callables_info) = ( + _infer_var_type( + kernel, item.name, type_inf_mapper, subst_expander)) + type_inf_mapper = type_inf_mapper.copy( + program_callables_info=program_callables_info) + + failed = not result + if not failed: + new_dtype, = result + if new_dtype.target is None: + new_dtype = new_dtype.with_target(kernel.target) + + debug(" success: %s", new_dtype) + if new_dtype != item.dtype: + debug(" changed from: %s", item.dtype) + changed_during_last_queue_run = True + + if isinstance(item, TemporaryVariable): + new_temp_vars[name] = item.copy(dtype=new_dtype) + elif isinstance(item, KernelArgument): + new_arg_dict[name] = item.copy(dtype=new_dtype) + else: + raise LoopyError("unexpected item type in type inference") + # TODO: I dont like in-place updates. Change this to something + # else. Perhaps add a function for doing this, which does it + # using a bunch of copies? + old_calls_to_new_calls.update(new_old_calls_to_new_calls) + else: + debug(" failure") + + if failed: + if item.name in failed_names: + # this item has failed before, give up. + advice = "" + if symbols_with_unavailable_types: + advice += ( + " (need type of '%s'--check for missing arguments)" + % ", ".join(symbols_with_unavailable_types)) + + if expect_completion: + raise LoopyError( + "could not determine type of '%s'%s" + % (item.name, advice)) + + else: + # We're done here. + break + + # remember that this item failed + failed_names.add(item.name) + + if set(queue) == failed_names: + # We did what we could... + print(queue, failed_names, item.name) + assert not expect_completion + break + + # can't infer type yet, put back into queue + queue.append(name) + else: + # we've made progress, reset failure markers + failed_names = set() + + # }}} + + # FIXME: copy the explanation from make_function_ready_for_codegen + # here. + for insn in kernel.instructions: + if isinstance(insn, lp.MultiAssignmentBase): + # just a dummy run over the expression, to pass over all the + # functions + # FIXME: need a check over here which checks the instruction for + # unseen cases + type_inf_mapper(insn.expression, return_tuple=isinstance(insn, + lp.CallInstruction), return_dtype_set=True) + elif isinstance(insn, (_DataObliviousInstruction, + lp.CInstruction)): + pass + else: + raise NotImplementedError("Unknown instructions type %s." % ( + type(insn).__name__)) + + program_callables_info = type_inf_mapper.program_callables_info + old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) + + end_time = time.time() + logger.debug("type inference took {dur:.2f} seconds".format( + dur=end_time - start_time)) + + pre_type_specialized_knl = unexpanded_kernel.copy( + temporary_variables=new_temp_vars, + args=[new_arg_dict[arg.name] for arg in kernel.args], + ) + + # this has to be subsitutition + from loopy.kernel.function_interface import ( + change_names_of_pymbolic_calls) + type_specialized_kernel = change_names_of_pymbolic_calls( + pre_type_specialized_knl, old_calls_to_new_calls) + + # the check is unnecessary as we would first get TypeInfereceFailure before + # encountering this. 
Move this at the start once ManglerCallable is + if expect_completion: + # if completion is expected, then it is important that all the + # callables are scoped. + from loopy.check import check_functions_are_scoped + check_functions_are_scoped(type_specialized_kernel) + + return type_specialized_kernel, program_callables_info + + +def infer_unknown_types(program, expect_completion=False): + """Infer types on temporaries and arguments.""" + from loopy.kernel import LoopKernel + if isinstance(program, LoopKernel): + # FIXME: deprecate warning needed here + from loopy.program import make_program_from_kernel + program = make_program_from_kernel(program) + + program_callables_info = program.program_callables_info + + type_uninferred_knl_callable = ( + program_callables_info[program.name]) + type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel + + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + root_kernel, program_callables_info = ( + infer_unknown_types_for_a_single_kernel( + type_uninferred_root_kernel, + program_callables_info, expect_completion)) + + type_inferred_knl_callable = type_uninferred_knl_callable.copy( + subkernel=root_kernel) + + program_callables_info, _ = ( + program_callables_info.with_callable( + program.name, + type_inferred_knl_callable)) + + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + + # FIXME: maybe put all of this in a function? + # need to infer functions that were left out during inference + return program.copy(program_callables_info=program_callables_info) + +# }}} + + +# {{{ reduction expression helper + +def infer_arg_and_reduction_dtypes_for_reduction_expression( + kernel, expr, program_callables_info, unknown_types_ok): + type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info) + import loopy as lp + + if expr.is_tuple_typed: + arg_dtypes_result = type_inf_mapper( + expr, return_tuple=True, return_dtype_set=True) + + if len(arg_dtypes_result) == 1: + arg_dtypes = arg_dtypes_result[0] + else: + if unknown_types_ok: + arg_dtypes = [lp.auto] * expr.operation.arg_count + else: + raise LoopyError("failed to determine types of accumulators for " + "reduction '%s'" % expr) + else: + try: + arg_dtypes = [type_inf_mapper(expr)] + except DependencyTypeInferenceFailure: + if unknown_types_ok: + arg_dtypes = [lp.auto] + else: + raise LoopyError("failed to determine type of accumulator for " + "reduction '%s'" % expr) + + reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes) + reduction_dtypes = tuple( + dt.with_target(kernel.target) + if dt is not lp.auto else dt + for dt in reduction_dtypes) + + return tuple(arg_dtypes), reduction_dtypes, ( + type_inf_mapper.program_callables_info) + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 2d1ef2b81..0b65559b0 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2418,9 +2418,32 @@ def preprocess_program(program, device=None): # {{{ preprocess the root kernel - root_kernel = preprocess_single_kernel( - program.root_kernel, program.program_callables_info, device) - program = program.with_root_kernel(root_kernel) + # Callable editing restrictions: + # + # - cannot edit program_callables_info in :meth:`preprocess_single_kernel` + # as we are iterating over it. 
+ # + # Refer: https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = preprocess_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + device) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + program = program.copy(program_callables_info=new_program_callables_info) # }}} diff --git a/loopy/statistics.py b/loopy/statistics.py index 3b926cc61..6a9744a06 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -61,8 +61,10 @@ __doc__ = """ # FIXME: this is broken for the callable kernel design. -# the information of variable being referenced by different names must be taken -# into consideration. +# Qns: +# - The variable name, what if multiple kernels use the same name? +# - We should also add the cumulative effect on the arguments of callee kernels +# into the caller kernel. # {{{ GuardedPwQPolynomial diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 4d8c81b43..2ee3bd9b1 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -447,6 +447,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 # {{{ add_padding +@iterate_over_kernels_if_given_program def add_padding(kernel, variable, axis, align_bytes): arg_to_idx = dict((arg.name, i) for i, arg in enumerate(kernel.args)) arg_idx = arg_to_idx[variable] diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 01ffd5e33..13d9c722e 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -36,6 +36,8 @@ from loopy.diagnostic import ( from loopy.kernel.instruction import _DataObliviousInstruction from loopy.program import ProgramCallablesInfo +from loopy.symbolic import SubArrayRef +from pymbolic.primitives import Variable, Subscript import logging logger = logging.getLogger(__name__) @@ -801,24 +803,67 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, # }}} - if expect_completion: - # FIXME: copy the explanation from make_function_ready_for_codegen - # here. - for insn in kernel.instructions: - if isinstance(insn, lp.MultiAssignmentBase): - # just a dummy run over the expression, to pass over all the - # functions + # FIXME: copy the explanation from make_function_ready_for_codegen + # here. 
+ + # {{{ check if insn missed during type inference + + def _instruction_missed_during_inference(insn): + for assignee in insn.assignees: + if isinstance(assignee, Variable): + if assignee.name in kernel.arg_dict: + if kernel.arg_dict[assignee.name].dtype is None: + return False + else: + assert assignee.name in kernel.temporary_variables + if kernel.temporary_variables[assignee.name].dtype is None: + return False + + elif isinstance(assignee, Subscript): + if assignee.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[assignee.aggregate.name].dtype is None: + return False + else: + assert assignee.aggregate.name in kernel.temporary_variables + if kernel.temporary_variables[ + assignee.aggregate.name].dtype is None: + return False + + else: + assert isinstance(assignee, SubArrayRef) + if assignee.subscript.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[ + assignee.subscript.aggregate.name].dtype is None: + return False + else: + assert assignee.subscript.aggregate.name in ( + kernel.temporary_variables) + if kernel.temporary_variables[ + assignee.subscript.aggregate.name] is None: + return False + + return True + + # }}} + + for insn in kernel.instructions: + if isinstance(insn, lp.MultiAssignmentBase): + # just a dummy run over the expression, to pass over all the + # functions + # FIXME: need a check over here which checks the instruction for + # unseen cases + if _instruction_missed_during_inference(insn): type_inf_mapper(insn.expression, return_tuple=isinstance(insn, lp.CallInstruction), return_dtype_set=True) - elif isinstance(insn, (_DataObliviousInstruction, - lp.CInstruction)): - pass - else: - raise NotImplementedError("Unknown instructions type %s." % ( - type(insn).__name__)) + elif isinstance(insn, (_DataObliviousInstruction, + lp.CInstruction)): + pass + else: + raise NotImplementedError("Unknown instructions type %s." % ( + type(insn).__name__)) - program_callables_info = type_inf_mapper.program_callables_info - old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) + program_callables_info = type_inf_mapper.program_callables_info + old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) end_time = time.time() logger.debug("type inference took {dur:.2f} seconds".format( @@ -835,13 +880,14 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, type_specialized_kernel = change_names_of_pymbolic_calls( pre_type_specialized_knl, old_calls_to_new_calls) - # this code is dead, move it up after mangler callables are made - # illegal. - # if expect_completion: - # # if completion is expected, then it is important that all the - # # callables are scoped. - # from loopy.check import check_functions_are_scoped - # check_functions_are_scoped(type_specialized_kernel) + # the check is unnecessary as we would first get TypeInfereceFailure before + # encountering this. Move this at the start once ManglerCallable is + # deprecated. + if expect_completion: + # if completion is expected, then it is important that all the + # callables are scoped. + from loopy.check import check_functions_are_scoped + check_functions_are_scoped(type_specialized_kernel) return type_specialized_kernel, program_callables_info -- GitLab From 73015a8be3ee4fd6fe980ddd7cb31e9cba2e88c0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 19:46:31 -0500 Subject: [PATCH 298/916] Pro Tip: If the tests dont work, just change the tests. 
:P --- loopy/loop.py | 2 ++ loopy/transform/arithmetic.py | 1 + loopy/transform/buffer.py | 43 ++++++++++++++++++++++++++++++----- loopy/transform/parameter.py | 1 + loopy/transform/subst.py | 1 + test/test_fortran.py | 4 ++-- 6 files changed, 44 insertions(+), 8 deletions(-) diff --git a/loopy/loop.py b/loopy/loop.py index 459246382..66d413987 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -25,6 +25,7 @@ THE SOFTWARE. import islpy as isl import six +from loopy.program import iterate_over_kernels_if_given_program def potential_loop_nest_map(kernel): @@ -55,6 +56,7 @@ def potential_loop_nest_map(kernel): return result +@iterate_over_kernels_if_given_program def fuse_loop_domains(kernel): from loopy.kernel.tools import is_domain_dependent_on_inames diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index acf075deb..3df86e7ae 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -33,6 +33,7 @@ from loopy.kernel import LoopKernel # {{{ fold constants +@iterate_over_kernels_if_given_program def fold_constants(kernel): from loopy.symbolic import ConstantFoldingMapper cfm = ConstantFoldingMapper() diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 801da4c13..b848a6f98 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -33,6 +33,9 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import ScalarCallable, CallableKernel from pymbolic import var @@ -130,10 +133,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array(kernel, var_name, buffer_inames, init_expression=None, - store_expression=None, within=None, default_tag="l.auto", - temporary_scope=None, temporary_is_local=None, - fetch_bounding_box=False): +def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, + buffer_inames, init_expression=None, store_expression=None, + within=None, default_tag="l.auto", temporary_scope=None, + temporary_is_local=None, fetch_bounding_box=False): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -169,6 +172,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, fetched. 
""" + assert isinstance(kernel, LoopKernel) + # {{{ unify temporary_scope / temporary_is_local from loopy.kernel.data import AddressSpace @@ -240,7 +245,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, from loopy.preprocess import prepare_for_caching key_kernel = prepare_for_caching(kernel) - cache_key = (key_kernel, var_name, tuple(buffer_inames), + cache_key = (key_kernel, program_callables_info, var_name, + tuple(buffer_inames), PymbolicExpressionHashWrapper(init_expression), PymbolicExpressionHashWrapper(store_expression), within, default_tag, temporary_scope, fetch_bounding_box) @@ -528,7 +534,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, program_callables_info) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -537,4 +543,29 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, return kernel + +def buffer_array(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = buffer_array_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 0720a312b..b7d017ec8 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -43,6 +43,7 @@ __doc__ = """ # {{{ assume +@iterate_over_kernels_if_given_program def assume(kernel, assumptions): """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. `n mod 4 = 0`. 
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 6d6f034f3..0dbc7939e 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -289,6 +289,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) +@iterate_over_kernels_if_given_program def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) diff --git a/test/test_fortran.py b/test/test_fortran.py index e08033360..deca4d42e 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -71,7 +71,7 @@ def test_fill(ctx_factory): knl, = lp.parse_transformed_fortran(fortran_src, pre_transform_code="split_amount = 128") - assert "i_inner" in knl.all_inames() + assert "i_inner" in knl.root_kernel.all_inames() ctx = ctx_factory() @@ -295,7 +295,7 @@ def test_matmul(ctx_factory, buffer_inames): knl, = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert len(knl.root_kernel.domains) == 1 ref_knl = knl -- GitLab From 56217afbd15bdf86f5b9a92fb317dccd65de641d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 20:16:43 -0500 Subject: [PATCH 299/916] modernize tests. --- test/test_domain.py | 74 +++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/test/test_domain.py b/test/test_domain.py index ebfde8509..dd789d2cd 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -61,20 +61,15 @@ def test_assume(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i 10") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) - assert "if" not in compiled.get_code() + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code def test_divisibility_assumption(ctx_factory): @@ -90,16 +85,14 @@ def test_divisibility_assumption(ctx_factory): lp.GlobalArg("b", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and (exists zz: n = 16*zz)") + assumptions="n>=1 and (exists zz: n = 16*zz)", + target=lp.PyOpenCLTarget(ctx.devices[0])) ref_knl = knl knl = lp.split_iname(knl, "i", 16) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "if" not in code + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 16**3}) @@ -118,16 +111,12 @@ def test_eq_constraint(ctx_factory): [ lp.GlobalArg("a", np.float32, shape=(1000,)), lp.GlobalArg("b", np.float32, shape=(1000,)) - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for knl in kernel_gen: - print(lp.generate_code(knl)) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds(ctx_factory): @@ -150,12 +139,10 @@ def test_dependent_loop_bounds(ctx_factory): lp.GlobalArg("a_sum", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - 
print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_2(ctx_factory): @@ -179,14 +166,13 @@ def test_dependent_loop_bounds_2(ctx_factory): lp.GlobalArg("ax", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_3(ctx_factory): @@ -211,25 +197,21 @@ def test_dependent_loop_bounds_3(ctx_factory): lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) - assert knl.parents_per_domain()[1] == 0 + assert knl.root_kernel.parents_per_domain()[1] == 0 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) + list(lp.generate_code_v2(knl_bad)) def test_dependent_loop_bounds_4(): @@ -291,11 +273,10 @@ def test_independent_multi_domain(ctx_factory): inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] + assert knl.root_kernel.parents_per_domain() == 2*[None] n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) + evt, (a, b) = knl(queue, n=n, out_host=True) assert a.shape == (50,) assert b.shape == (50,) @@ -396,10 +377,11 @@ def test_triangle_domain(ctx_factory): knl = lp.make_kernel( "{[i,j]: 0<=i,j Date: Thu, 2 Aug 2018 22:59:13 -0500 Subject: [PATCH 300/916] changed the c-execution pipeline. 
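The test updates in the "modernize tests" patch above all follow the same recipe: build the kernel with an explicit target, query kernel-level attributes through root_kernel, and generate code with generate_code_v2 instead of going through CompiledKernel and generate_loop_schedules. An illustrative fragment (the kernel itself is made up; only the idiom is taken from the diff):

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i]",
            [lp.GlobalArg("a", np.float32, shape=("n",)),
             lp.GlobalArg("out", np.float32, shape=("n",)),
             lp.ValueArg("n", np.int32)],
            target=lp.PyOpenCLTarget())

    # kernel-level queries now go through the root kernel of the program
    assert "i" in knl.root_kernel.all_inames()

    # code generation goes through generate_code_v2
    print(lp.generate_code_v2(knl).device_code())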
--- loopy/target/c/c_execution.py | 10 +++++----- loopy/transform/instruction.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 6b80bae20..58a252ca2 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -389,11 +389,11 @@ class CKernelExecutor(KernelExecutorBase): return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() host_code = codegen_result.host_code() @@ -423,10 +423,10 @@ class CKernelExecutor(KernelExecutorBase): self.compiler)) return _KernelInfo( - kernel=kernel, + program=program, c_kernels=c_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) # }}} diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index eaf6d3021..910a6b2d3 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -231,6 +231,7 @@ def replace_instruction_ids(kernel, replacements): # {{{ tag_instructions +@iterate_over_kernels_if_given_program def tag_instructions(kernel, new_tag, within=None): from loopy.match import parse_match within = parse_match(within) -- GitLab From 8692e15863773a560871949c3bc03b79034c538a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 23:10:17 -0500 Subject: [PATCH 301/916] minor error in c execution. 
--- loopy/target/c/c_execution.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 58a252ca2..dad760229 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -443,7 +443,7 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info_info = self.program_info_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + return program_info_info.invoker( + program_info_info.c_program_infos, *args, **kwargs) -- GitLab From 16bd941905497f080a2e2ca0f238c50ed3cbd753 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 23:25:38 -0500 Subject: [PATCH 302/916] rename to `program_info` --- loopy/target/c/c_execution.py | 6 +++--- test/test_c_execution.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index dad760229..bb6710187 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -443,7 +443,7 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - program_info_info = self.program_info_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return program_info_info.invoker( - program_info_info.c_program_infos, *args, **kwargs) + return program_info.invoker( + program_info.c_kernels, *args, **kwargs) diff --git a/test/test_c_execution.py b/test/test_c_execution.py index c355893e4..7c7df2557 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -76,6 +76,7 @@ def test_c_target_strides(): # test with C-order knl = __get_kernel('C') + lp.generate_code_v2(knl) a_np = np.reshape(np.arange(16 * 16, dtype=np.float32), (16, -1), order='C') -- GitLab From 6ce566a181f3e3bc0be9432d0dd797c0d6f27727 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 2 Aug 2018 23:44:59 -0500 Subject: [PATCH 303/916] test_c_execution --- loopy/target/c/c_execution.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index bb6710187..feafb8dcd 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -373,7 +373,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, program, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). 
If the @@ -382,7 +382,7 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel) + super(CKernelExecutor, self).__init__(program) def get_invoker_uncached(self, kernel, codegen_result): generator = CExecutionWrapperGenerator() @@ -399,18 +399,18 @@ class CKernelExecutor(KernelExecutorBase): host_code = codegen_result.host_code() all_code = '\n'.join([dev_code, '', host_code]) - if self.kernel.options.write_cl: + if self.program.root_kernel.options.write_cl: output = all_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(code=output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor @@ -419,7 +419,7 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_info, all_code, self.kernel.target, + codegen_result.implemented_data_info, all_code, self.program.target, self.compiler)) return _KernelInfo( -- GitLab From 34ccd115c347addf59ff5662a0b39d3ceb5c4478 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 3 Aug 2018 00:03:28 -0500 Subject: [PATCH 304/916] test_c_execution correciton --- loopy/target/c/c_execution.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index feafb8dcd..300fb3295 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -157,7 +157,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -166,12 +166,12 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.is_output_only)) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.is_output_only] if out_args: gen("return None, (%s,)" % ", ".join(arg.name for arg in out_args)) -- GitLab From 3cc5d49841cdd8780116f28aa78645a15698b9a6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 3 Aug 2018 00:19:01 -0500 Subject: [PATCH 305/916] test_c_execution correciton --- loopy/target/c/c_execution.py | 5 +++-- loopy/target/pyopencl_execution.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 300fb3295..b3c304d58 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -166,12 +166,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.is_output_only)) + if 
arg.base_name in + program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.is_output_only] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return None, (%s,)" % ", ".join(arg.name for arg in out_args)) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 890208bf6..380ab1d9f 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -220,7 +220,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in program.get_written_variables())) + if arg.base_name in + program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info -- GitLab From cc15754f92b21f4ad8df00b38e8689026c5f4b07 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 3 Aug 2018 11:16:00 -0500 Subject: [PATCH 306/916] pass one fuse_kernels test --- loopy/program.py | 70 --------------------------------------- loopy/transform/fusion.py | 52 +++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 73 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 8e1e13b78..394e9806f 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -33,7 +33,6 @@ from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.diagnostic import LoopyError -from pymbolic import var from loopy.kernel import LoopKernel @@ -568,75 +567,6 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_hit_during_editing={}, renames_needed_after_editing={}) - def merge_program(self, program2): - # FIXME: this is not correct and should not be touched till then. - 1/0 - # rename the callables in program2 to see no clash between the 2. - renames_needed_in_program2 = {} - - for old_func_id in program2.program_callables_info: - if old_func_id == program2.name: - # dont rename the root kernel - renames_needed_in_program2[old_func_id] = ( - old_func_id) - continue - unique_function_identifier = old_func_id - while unique_function_identifier in self.resolved_functions or ( - unique_function_identifier in - renames_needed_in_program2.values()): - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) - renames_needed_in_program2[old_func_id] = ( - unique_function_identifier) - - # rename ALL the callables in program2 - new_prog2_resolved_functions = {} - new_prog2_num_times_callables_called = {} - - for func_id, in_knl_callable in program2.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - old_subkernel = in_knl_callable.subkernel - new_subkernel = rename_resolved_functions_in_a_single_kernel( - old_subkernel, renames_needed_in_program2) - in_knl_callable = ( - in_knl_callable.copy(subkernel=new_subkernel)) - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callable type %s." 
% - type(in_knl_callable).__name__) - - new_func_id = renames_needed_in_program2[func_id] - new_prog2_resolved_functions[new_func_id] = ( - in_knl_callable) - new_prog2_num_times_callables_called[new_func_id] = ( - program2.program_callables_info.num_times_callables_called[ - func_id]) - - new_prog1_callables_info = self.with_edit_callables_mode() - # TODO: there maybe a case of trouble when merging the kernel being - # called from *self*, that's improbable, but can be fixed with a - # condition. - for old_func_id, in_knl_callable_in_prog2 in ( - new_prog2_resolved_functions.items()): - for i in range( - new_prog2_num_times_callables_called[old_func_id]): - new_prog1_callables_info, new_func_id = ( - new_prog1_callables_info.with_callable( - var(old_func_id), in_knl_callable_in_prog2)) - - # FIXME: perform all the edits on - merged_prog_callables_info = ( - new_prog1_callables_info.with_exit_edit_callables_mode()) - new_merged_resolved_functions = ( - merged_prog_callables_info.resolved_functions.copy()) - new_subkernel = new_merged_resolved_functions.pop( - program2.name).subkernel - new_merged_prog_callables_info = merged_prog_callables_info.copy( - resolved_functions=new_merged_resolved_functions) - return new_merged_prog_callables_info, new_subkernel - def __getitem__(self, item): return self.resolved_functions[item] diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 7bd03c1de..d43ce025b 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -32,6 +32,8 @@ from loopy.diagnostic import LoopyError from pymbolic import var from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from loopy.program import rename_resolved_functions_in_a_single_kernel def _apply_renames_in_exprs(kernel, var_renames): @@ -289,7 +291,7 @@ def _fuse_two_kernels(knla, knlb): # }}} -def fuse_kernels(kernels, suffixes=None, data_flow=None): +def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): """Return a kernel that performs all the operations in all entries of *kernels*. 
@@ -416,7 +418,51 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): return result -def fuse_programs(programs, suffixes=None, data_flow=None): - 1/0 +def fuse_kernels(programs, suffixes=None, data_flow=None): + main_prog_callables_info = ( + programs[0].program_callables_info.with_edit_callables_mode()) + old_root_kernel_callable = ( + programs[0].program_callables_info[programs[0].name]) + kernels = [programs[0].root_kernel] + + # removing the callable collisions that maybe present + for prog in programs[1:]: + root_kernel = prog.root_kernel + renames_needed = {} + for old_func_id, in_knl_callable in prog.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + if in_knl_callable.name != prog.name: + raise LoopyError("fuse_kernels cannot fuse programs with " + "multiple callable kernels.") + continue + num_times_called = ( + prog.program_callables_info.num_times_callables_called[ + old_func_id]) + for i in range(num_times_called): + main_prog_callables_info, new_func_id = ( + main_prog_callables_info.with_callables(var(old_func_id), + in_knl_callable, True)) + + if old_func_id != new_func_id: + renames_needed[old_func_id] = new_func_id + + if renames_needed: + root_kernel = rename_resolved_functions_in_a_single_kernel( + root_kernel, renames_needed) + + kernels.append(root_kernel) + + new_root_kernel = fuse_loop_kernels(kernels, suffixes, data_flow) + new_root_kernel_callable = old_root_kernel_callable.copy( + subkernel=new_root_kernel.copy(name=programs[0].name)) + + main_prog_callables_info, _ = main_prog_callables_info.with_callable( + var(programs[0].name), new_root_kernel_callable) + + main_prog_callables_info = ( + main_prog_callables_info.with_exit_edit_callables_mode()) + + return programs[0].copy( + program_callables_info=main_prog_callables_info) # vim: foldmethod=marker -- GitLab From 777fea57b5f0a9464c8e07e5c0ca2b16e73f26f4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 3 Aug 2018 11:58:04 -0500 Subject: [PATCH 307/916] test_numa_diff should now work. 
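The program-level fuse_kernels defined above keeps the old call signature: callers still pass the kernels (now Programs) and a data_flow specification, and get back a fused Program whose root kernel is the result of fuse_loop_kernels. A minimal usage sketch, with the two kernels and the data_flow entry invented for illustration:

    import loopy as lp

    knl_a = lp.make_kernel("{[i]: 0<=i<10}", "tmp[i] = a[i]")
    knl_b = lp.make_kernel("{[j]: 0<=j<10}", "out[j] = 2*tmp[j]")

    # data_flow says: 'tmp' written by kernel 0 is read by kernel 1
    fused = lp.fuse_kernels((knl_a, knl_b), data_flow=[("tmp", 0, 1)])

    # the result is a Program; kernel-level attributes live on root_kernel
    print(fused.root_kernel.temporary_variables)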
--- loopy/transform/buffer.py | 2 +- loopy/transform/iname.py | 1 + loopy/transform/subst.py | 14 ++++++++++++-- test/test_fortran.py | 2 +- test/test_numa_diff.py | 4 +++- 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index b848a6f98..57c4397f9 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -245,7 +245,7 @@ def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, from loopy.preprocess import prepare_for_caching key_kernel = prepare_for_caching(kernel) - cache_key = (key_kernel, program_callables_info, var_name, + cache_key = (key_kernel, var_name, tuple(buffer_inames), PymbolicExpressionHashWrapper(init_expression), PymbolicExpressionHashWrapper(store_expression), within, diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 0d5f2015e..20dc9a99b 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1088,6 +1088,7 @@ def has_schedulable_iname_nesting(knl): # {{{ rename_inames +@iterate_over_kernels_if_given_program def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 0dbc7939e..6a93e0bd9 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -35,6 +35,7 @@ from pymbolic import var from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -508,8 +509,17 @@ def find_rules_matching(knl, pattern): return [r for r in knl.substitutions if pattern.match(r)] -def find_one_rule_matching(knl, pattern): - rules = find_rules_matching(knl, pattern) +def find_one_rule_matching(program, pattern): + rules = [] + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + rules.extend(find_rules_matching(knl, pattern)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable types %s." % ( + type(in_knl_callable).__name__)) if len(rules) > 1: raise ValueError("more than one substitution rule matched '%s'" diff --git a/test/test_fortran.py b/test/test_fortran.py index deca4d42e..1a5a0c383 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -410,7 +410,7 @@ def test_fuse_kernels(ctx_factory): knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)]) knl = lp.prioritize_loops(knl, "e,i,j,k") - assert len(knl.temporary_variables) == 2 + assert len(knl.root_kernel.temporary_variables) == 2 ctx = ctx_factory() lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 6b578838d..4f802f8bf 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -246,7 +246,9 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa "-cl-no-signed-zeros", ]) - hsv = hsv.copy(name="horizontalStrongVolumeKernel") + # FIXME: renaming's a bit tricky in this program model. 
+ # add a simple transformation for it + # hsv = hsv.copy(name="horizontalStrongVolumeKernel") results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300), quiet=True) -- GitLab From 0c531301d90092372401b5a7f794d00fb3b25ac5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 3 Aug 2018 18:25:43 -0500 Subject: [PATCH 308/916] started towards making register_callables work --- loopy/__init__.py | 5 +- loopy/kernel/function_interface.py | 3 + loopy/program.py | 107 ++++++++++++----------------- loopy/transform/callable.py | 84 ++++++++++++++++++---- test/test_callables.py | 6 +- 5 files changed, 124 insertions(+), 81 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 5a2487f17..8b5026032 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -121,7 +121,7 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.callable import (register_callable_kernel, - register_function_lookup, inline_callable_kernel) + register_function_id_to_in_knl_callable_mapper, inline_callable_kernel) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call # }}} @@ -238,7 +238,8 @@ __all__ = [ "add_barrier", - "register_callable_kernel", "register_function_lookup", + "register_callable_kernel", + "register_function_id_to_in_knl_callable_mapper", "inline_callable_kernel", "pack_and_unpack_args_for_call", diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 799be7763..095d5ff0e 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -42,6 +42,9 @@ from loopy.kernel import LoopKernel # {{{ argument descriptors class ValueArgDescriptor(ImmutableRecord): + hash_fields = () + + update_persistent_hash = LoopKernel.update_persistent_hash pass diff --git a/loopy/program.py b/loopy/program.py index 394e9806f..5d4bae1c0 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -37,7 +37,7 @@ from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel -class FunctionResolver(RuleAwareIdentityMapper): +class ResolvedFunctionMarker(RuleAwareIdentityMapper): """ Mapper to convert the ``function`` attribute of a :class:`pymbolic.primitives.Call` known in the kernel as instances of @@ -56,14 +56,15 @@ class FunctionResolver(RuleAwareIdentityMapper): the function identifiers to look for while scoping functions. """ def __init__(self, rule_mapping_context, kernel, program_callables_info, - function_resolvers): - super(FunctionResolver, self).__init__(rule_mapping_context) + function_id_to_in_knl_callable_mappers): + super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) self.kernel = kernel self.program_callables_info = program_callables_info # FIXME: function_resolvesrs looks like a very bad name change it - self.function_resolvers = function_resolvers + self.function_id_to_in_knl_callable_mappers = ( + function_id_to_in_knl_callable_mappers) - def find_resolved_function_from_identifier(self, identifier): + def find_in_knl_callable_from_identifier(self, identifier): """ Returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable` if the @@ -71,9 +72,11 @@ class FunctionResolver(RuleAwareIdentityMapper): *None*. 
""" # FIXME change docs - for scoper in self.function_resolvers: + for func_id_to_in_knl_callable_mapper in ( + self.function_id_to_in_knl_callable_mappers): # fixme: do we really need to given target for the function - in_knl_callable = scoper(self.kernel.target, identifier) + in_knl_callable = func_id_to_in_knl_callable_mapper( + self.kernel.target, identifier) if in_knl_callable is not None: return in_knl_callable @@ -98,7 +101,7 @@ class FunctionResolver(RuleAwareIdentityMapper): if not isinstance(expr.function, ResolvedFunction): # search the kernel for the function. - in_knl_callable = self.find_resolved_function_from_identifier( + in_knl_callable = self.find_in_knl_callable_from_identifier( expr.function.name) if in_knl_callable: @@ -118,7 +121,7 @@ class FunctionResolver(RuleAwareIdentityMapper): ) # this is an unknown function as of yet, do not modify it - return super(FunctionResolver, self).map_call_with_kwargs(expr, + return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, expn_state) def map_reduction(self, expr, expn_state): @@ -129,29 +132,32 @@ class FunctionResolver(RuleAwareIdentityMapper): self.program_callables_info, _ = ( self.program_callables_info.with_callable(func_id, in_knl_callable, True)) - return super(FunctionResolver, self).map_reduction(expr, expn_state) + return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) -def resolve_callables(name, program_callables_info, function_resolvers): - - kernel = program_callables_info[name].subkernel +def initialize_program_callables_info_from_kernel( + kernel, func_id_to_kernel_callable_mappers): + program_callables_info = ProgramCallablesInfo({}) + program_callables_info = program_callables_info.with_edit_callables_mode() from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) - function_resolver = FunctionResolver(rule_mapping_context, kernel, - program_callables_info, function_resolvers) + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, kernel, program_callables_info, + func_id_to_kernel_callable_mappers) # scoping fucntions and collecting the scoped functions kernel_with_functions_resolved = rule_mapping_context.finish_kernel( - function_resolver.map_kernel(kernel)) - program_callables_info = function_resolver.program_callables_info + resolved_function_marker.map_kernel(kernel)) + program_callables_info = resolved_function_marker.program_callables_info - new_in_knl_callable = program_callables_info[name].copy( - subkernel=kernel_with_functions_resolved) + callable_kernel = CallableKernel(kernel_with_functions_resolved) program_callables_info, _ = program_callables_info.with_callable( - Variable(name), new_in_knl_callable) + Variable(kernel.name), callable_kernel, True) + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) return program_callables_info @@ -162,54 +168,20 @@ class Program(ImmutableRecord): def __init__(self, name, program_callables_info, - target=None, - function_resolvers=None): + target, + func_id_to_in_knl_callable_mappers): assert isinstance(program_callables_info, ProgramCallablesInfo) # FIXME: check if all sanity checks have been covered? # FIXME: The comments over here may need some attention. 
assert name in program_callables_info - if target is None: - target = program_callables_info[name].subkernel.target - - if function_resolvers is None: - # populate the function scopers from the target and the loopy - # specific callable scopers - - # at this point only the root kernel can be present in the - # callables. - assert len(program_callables_info.resolved_functions) == 1 - - from loopy.library.function import loopy_specific_callable_scopers - function_resolvers = [loopy_specific_callable_scopers] + ( - target.get_device_ast_builder().function_scopers()) - - # new function resolvers have arrived, implies we need to resolve - # the callables identified by this set of resolvers - program_callables_info = ( - program_callables_info.with_edit_callables_mode()) - - for name, in_knl_callable in program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - # resolve the callables in the subkernel - program_callables_info = ( - resolve_callables(name, program_callables_info, - function_resolvers)) - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callable %s." % - type(in_knl_callable).__name__) - - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - super(Program, self).__init__( name=name, program_callables_info=program_callables_info, target=target, - function_resolvers=function_resolvers) + func_id_to_in_knl_callable_mappers=( + func_id_to_in_knl_callable_mappers)) self._program_executor_cache = {} @@ -583,14 +555,25 @@ class ProgramCallablesInfo(ImmutableRecord): # }}} +def default_func_id_to_kernel_callable_mappers(target): + + from loopy.library.function import loopy_specific_callable_scopers + return ( + [loopy_specific_callable_scopers] + ( + target.get_device_ast_builder().function_scopers())) + + def make_program_from_kernel(kernel): - callable_knl = CallableKernel(subkernel=kernel) - resolved_functions = {kernel.name: callable_knl} - program_callables_info = ProgramCallablesInfo(resolved_functions) + + program_callables_info = initialize_program_callables_info_from_kernel(kernel, + default_func_id_to_kernel_callable_mappers(kernel.target)) program = Program( name=kernel.name, - program_callables_info=program_callables_info) + program_callables_info=program_callables_info, + func_id_to_in_knl_callable_mappers=( + default_func_id_to_kernel_callable_mappers(kernel.target)), + target=kernel.target) return program diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 3c0caa9e5..c67b307fe 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -42,7 +42,7 @@ from loopy.kernel.function_interface import (get_kw_pos_association, __doc__ = """ .. currentmodule:: loopy -.. autofunction:: register_function_lookup +.. autofunction:: register_function_resolver .. 
autofunction:: register_callable_kernel """ @@ -50,29 +50,84 @@ __doc__ = """ # {{{ register function lookup -def register_function_lookup(kernel, function_lookup): +def resolved_callables_from_function_lookup(program, + func_id_to_kernel_callable_mapper): + from loopy.program import ResolvedFunctionMarker + program_callables_info = program.program_callables_info + program_callables_info = program_callables_info.with_edit_callables_mode() + + callable_knls = dict( + (func_id, in_knl_callable) for func_id, in_knl_callable in + program_callables_info.items() if isinstance(in_knl_callable, + CallableKernel)) + edited_callable_knls = {} + + for func_id, in_knl_callable in callable_knls.items(): + kernel = in_knl_callable.subkernel + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, kernel, program_callables_info, + [func_id_to_kernel_callable_mapper]) + + # scoping fucntions and collecting the scoped functions + new_subkernel = rule_mapping_context.finish_kernel( + resolved_function_marker.map_kernel(kernel)) + program_callables_info = resolved_function_marker.program_callables_info + + edited_callable_knls[func_id] = in_knl_callable.copy( + subkernel=new_subkernel) + + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + + new_resolved_functions = {} + + for func_id, in_knl_callable in program_callables_info.items(): + if func_id in edited_callable_knls: + new_resolved_functions[func_id] = edited_callable_knls[func_id] + else: + new_resolved_functions[func_id] = in_knl_callable + + program_callables_info = program_callables_info.copy( + resolved_functions=new_resolved_functions) + + return program.copy(program_callables_info=program_callables_info) + + +def register_function_id_to_in_knl_callable_mapper(program, + func_id_to_in_knl_callable_mapper): """ Returns a copy of *kernel* with the *function_lookup* registered. - :arg function_lookup: A function of signature ``(target, identifier)`` - returning a :class:`loopy.kernel.function_interface.InKernelCallable`. + :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target, + identifier)`` returning a + :class:`loopy.kernel.function_interface.InKernelCallable` or *None* if + the *function_identifier* is not known. """ # adding the function lookup to the set of function lookers in the kernel. 
- if function_lookup not in kernel.function_scopers: + if func_id_to_in_knl_callable_mapper not in ( + program.func_id_to_in_knl_callable_mappers): from loopy.tools import unpickles_equally - if not unpickles_equally(function_lookup): + if not unpickles_equally(func_id_to_in_knl_callable_mapper): raise LoopyError("function '%s' does not " "compare equally after being upickled " "and would disrupt loopy's caches" - % function_lookup) - new_function_scopers = kernel.function_scopers + [function_lookup] - registered_kernel = kernel.copy(function_scopers=new_function_scopers) - from loopy.kernel.creation import scope_functions + % func_id_to_in_knl_callable_mapper) + new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( + [func_id_to_in_knl_callable_mapper]) + + program = resolved_callables_from_function_lookup(program, + func_id_to_in_knl_callable_mapper) + + new_program = program.copy( + func_id_to_in_knl_callable_mappers=new_func_id_mappers) - # returning the scoped_version of the kernel, as new functions maybe - # resolved. - return scope_functions(registered_kernel) + return new_program # }}} @@ -152,7 +207,8 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): from loopy import set_options callee_kernel = set_options(callee_kernel, "disable_global_barriers") - return register_function_lookup(caller_kernel, + return register_function_id_to_in_knl_callable_mapper( + caller_kernel, _RegisterCalleeKernel(function_name, callable_kernel)) # }}} diff --git a/test/test_callables.py b/test/test_callables.py index 3b27b2d5b..9dce5a84a 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -47,14 +47,14 @@ def test_register_function_lookup(ctx_factory): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i<10}", """ y[i] = log2(x[i]) """) - knl = lp.register_function_lookup(knl, register_log2_lookup) + prog = lp.register_function_lookup(prog, register_log2_lookup) - evt, (out, ) = knl(queue, x=x) + evt, (out, ) = prog(queue, x=x) assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 -- GitLab From cff8646adca929e52ed5ed5ec1e22e676f27feba Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 4 Aug 2018 15:44:18 -0500 Subject: [PATCH 309/916] new design of resolving functions. --- loopy/; | 929 -------------------------------------------------------- 1 file changed, 929 deletions(-) delete mode 100644 loopy/; diff --git a/loopy/; b/loopy/; deleted file mode 100644 index 4dc55578f..000000000 --- a/loopy/; +++ /dev/null @@ -1,929 +0,0 @@ -from __future__ import division, absolute_import - -__copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -""" - -import six - -from pymbolic.mapper import CombineMapper -import numpy as np - -from loopy.tools import is_integer -from loopy.types import NumpyType - -from loopy.diagnostic import ( - LoopyError, - TypeInferenceFailure, DependencyTypeInferenceFailure) -from loopy.kernel.instruction import _DataObliviousInstruction - -from loopy.program import ProgramCallablesInfo - -import logging -logger = logging.getLogger(__name__) - - -def _debug(kernel, s, *args): - if logger.isEnabledFor(logging.DEBUG): - logstr = s % args - logger.debug("%s: %s" % (kernel.name, logstr)) - - -def get_return_types_as_tuple(arg_id_to_dtype): - """Returns the types of arguments in a tuple format. - - :param arg_id_to_dtype: An instance of :class:`dict` which denotes a - mapping from the arguments to their inferred types. - """ - return_arg_id_to_dtype = dict((id, dtype) for id, dtype in - arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) - return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) - - return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) - - -# {{{ type inference mapper - -class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, program_callables_info, new_assignments=None): - """ - :arg new_assignments: mapping from names to either - :class:`loopy.kernel.data.TemporaryVariable` - or - :class:`loopy.kernel.data.KernelArgument` - instances - """ - self.kernel = kernel - assert isinstance(program_callables_info, ProgramCallablesInfo) - if new_assignments is None: - new_assignments = {} - self.new_assignments = new_assignments - self.symbols_with_unknown_types = set() - self.program_callables_info = program_callables_info - self.old_calls_to_new_calls = {} - - def __call__(self, expr, return_tuple=False, return_dtype_set=False): - kwargs = {} - if return_tuple: - kwargs["return_tuple"] = True - - result = super(TypeInferenceMapper, self).__call__( - expr, **kwargs) - - assert isinstance(result, list) - - if return_tuple: - for result_i in result: - assert isinstance(result_i, tuple) - - assert return_dtype_set - return result - - else: - if return_dtype_set: - return result - else: - if not result: - raise DependencyTypeInferenceFailure( - ", ".join(sorted(self.symbols_with_unknown_types))) - - result, = result - return result - - # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) - # are Python-equal (for many common constants such as integers). - - def copy(self, program_callables_info=None): - if program_callables_info is None: - program_callables_info = self.program_callables_info - return type(self)(self.kernel, program_callables_info, - self.new_assignments) - - def with_assignments(self, names_to_vars): - new_ass = self.new_assignments.copy() - new_ass.update(names_to_vars) - return type(self)(self.kernel, self.program_callables_info, new_ass) - - @staticmethod - def combine(dtype_sets): - """ - :arg dtype_sets: A list of lists, where each of the inner lists - consists of either zero or one type. An empty list is - consistent with any type. A list with a type requires - that an operation be valid in conjunction with that type. 
- """ - dtype_sets = list(dtype_sets) - - from loopy.types import LoopyType, NumpyType - assert all( - all(isinstance(dtype, LoopyType) for dtype in dtype_set) - for dtype_set in dtype_sets) - assert all( - 0 <= len(dtype_set) <= 1 - for dtype_set in dtype_sets) - - from pytools import is_single_valued - - dtypes = [dtype - for dtype_set in dtype_sets - for dtype in dtype_set] - - if not all(isinstance(dtype, NumpyType) for dtype in dtypes): - if not is_single_valued(dtypes): - raise TypeInferenceFailure( - "Nothing known about operations between '%s'" - % ", ".join(str(dtype) for dtype in dtypes)) - - return [dtypes[0]] - - numpy_dtypes = [dtype.dtype for dtype in dtypes] - - if not numpy_dtypes: - return [] - - if is_single_valued(numpy_dtypes): - return [dtypes[0]] - - result = numpy_dtypes.pop() - while numpy_dtypes: - other = numpy_dtypes.pop() - - if result.fields is None and other.fields is None: - if (result, other) in [ - (np.int32, np.float32), (np.float32, np.int32)]: - # numpy makes this a double. I disagree. - result = np.dtype(np.float32) - else: - result = ( - np.empty(0, dtype=result) - + np.empty(0, dtype=other) - ).dtype - - elif result.fields is None and other.fields is not None: - # assume the non-native type takes over - # (This is used for vector types.) - result = other - elif result.fields is not None and other.fields is None: - # assume the non-native type takes over - # (This is used for vector types.) - pass - else: - if result is not other: - raise TypeInferenceFailure( - "nothing known about result of operation on " - "'%s' and '%s'" % (result, other)) - - return [NumpyType(result)] - - def map_sum(self, expr): - dtype_sets = [] - small_integer_dtype_sets = [] - for child in expr.children: - dtype_set = self.rec(child) - if is_integer(child) and abs(child) < 1024: - small_integer_dtype_sets.append(dtype_set) - else: - dtype_sets.append(dtype_set) - - if all(dtype.is_integral() - for dtype_set in dtype_sets - for dtype in dtype_set): - dtype_sets.extend(small_integer_dtype_sets) - - return self.combine(dtype_sets) - - map_product = map_sum - - def map_quotient(self, expr): - n_dtype_set = self.rec(expr.numerator) - d_dtype_set = self.rec(expr.denominator) - - dtypes = n_dtype_set + d_dtype_set - - if all(dtype.is_integral() for dtype in dtypes): - # both integers - return [NumpyType(np.dtype(np.float64))] - - else: - return self.combine([n_dtype_set, d_dtype_set]) - - def map_constant(self, expr): - if is_integer(expr): - for tp in [np.int32, np.int64]: - iinfo = np.iinfo(tp) - if iinfo.min <= expr <= iinfo.max: - return [NumpyType(np.dtype(tp))] - - else: - raise TypeInferenceFailure("integer constant '%s' too large" % expr) - - dt = np.asarray(expr).dtype - if hasattr(expr, "dtype"): - return [NumpyType(expr.dtype)] - elif isinstance(expr, np.number): - # Numpy types are sized - return [NumpyType(np.dtype(type(expr)))] - elif dt.kind == "f": - # deduce the smaller type by default - return [NumpyType(np.dtype(np.float32))] - elif dt.kind == "c": - if np.complex64(expr) == np.complex128(expr): - # (COMPLEX_GUESS_LOGIC) - # No precision is lost by 'guessing' single precision, use that. - # This at least covers simple cases like '1j'. - return [NumpyType(np.dtype(np.complex64))] - - # Codegen for complex types depends on exactly correct types. - # Refuse temptation to guess. - raise TypeInferenceFailure("Complex constant '%s' needs to " - "be sized (i.e. 
as numpy.complex64/128) for type inference " - % expr) - else: - raise TypeInferenceFailure("Cannot deduce type of constant '%s'" % expr) - - def map_type_cast(self, expr): - subtype, = self.rec(expr.child) - if not issubclass(subtype.dtype.type, np.number): - raise LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type)) - return [expr.type] - - def map_subscript(self, expr): - return self.rec(expr.aggregate) - - def map_linear_subscript(self, expr): - return self.rec(expr.aggregate) - - def map_call(self, expr, return_tuple=False): - - from pymbolic.primitives import Variable, CallWithKwargs, Call - from loopy.symbolic import ResolvedFunction - - if isinstance(expr, CallWithKwargs): - kw_parameters = expr.kw_parameters - else: - assert isinstance(expr, Call) - kw_parameters = {} - - identifier = expr.function - if isinstance(identifier, (Variable, ResolvedFunction)): - identifier = identifier.name - - def none_if_empty(d): - if d: - d, = d - return d - else: - return None - - arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in - tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) - - # specializing the known function wrt type - if isinstance(expr.function, ResolvedFunction): - in_knl_callable = self.program_callables_info[expr.function.name] - - # {{{ checking that there is no overwriting of types of in_knl_callable - - if in_knl_callable.arg_id_to_dtype is not None: - - # specializing an already specialized function. - for id, dtype in arg_id_to_dtype.items(): - if in_knl_callable.arg_id_to_dtype[id] != arg_id_to_dtype[id]: - - # {{{ ignoring the the cases when there is a discrepancy - # between np.uint and np.int - - import numpy as np - if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( - np.uint32) and ( - arg_id_to_dtype[id].dtype.type == np.int32): - continue - if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( - np.uint64) and ( - arg_id_to_dtype[id].dtype.type == - np.int64): - continue - - # }}} - - raise LoopyError("Overwriting a specialized function " - "is illegal--maybe start with new instance of " - "InKernelCallable?") - - # }}} - - in_knl_callable, self.program_callables_info = ( - in_knl_callable.with_types( - arg_id_to_dtype, self.kernel, - self.program_callables_info)) - - in_knl_callable = in_knl_callable.with_target(self.kernel.target) - - # storing the type specialized function so that it can be used for - # later use - self.program_callables_info, new_function_id = ( - self.program_callables_info.with_callable( - expr.function.function, - in_knl_callable)) - - if isinstance(expr, Call): - self.old_calls_to_new_calls[expr] = new_function_id - else: - assert isinstance(expr, CallWithKwargs) - self.old_calls_to_new_calls[expr] = new_function_id - - new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype - - if new_arg_id_to_dtype is None: - return [] - - # collecting result dtypes in order of the assignees - if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: - if return_tuple: - return [get_return_types_as_tuple(new_arg_id_to_dtype)] - else: - return [new_arg_id_to_dtype[-1]] - - elif isinstance(expr.function, Variable): - # Since, the function is not "scoped", attempt to infer using - # kernel.function_manglers - - # {{{ trying to infer using function manglers - - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in - expr.parameters) - - # finding the function_mangler which would be associated with the - # realized function. 
- - mangle_result = None - for function_mangler in self.kernel.function_manglers: - mangle_result = function_mangler(self.kernel, identifier, - arg_dtypes) - if mangle_result: - # found a match. - break - - if mangle_result is not None: - from loopy.kernel.function_interface import (ManglerCallable, - ValueArgDescriptor) - - # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes - arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) - for i, dt in enumerate(mangle_result.arg_dtypes)) - arg_id_to_dtype.update(dict((-i-1, - dtype.with_target(self.kernel.target)) for i, dtype in enumerate( - mangle_result.result_dtypes))) - arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in - enumerate(mangle_result.arg_dtypes)) - res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in - enumerate(mangle_result.result_dtypes)) - arg_id_to_descr = dict(arg_descrs+res_descrs) - - # creating the ManglerCallable object corresponding to the - # function. - in_knl_callable = ManglerCallable( - identifier, function_mangler, arg_id_to_dtype, - arg_id_to_descr, mangle_result.target_name) - self.program_callables_info, new_function_id = ( - self.program_callables_info.with_callable( - expr.function, in_knl_callable, True)) - - if isinstance(expr, Call): - self.old_calls_to_new_calls[expr] = new_function_id - else: - assert isinstance(expr, CallWithKwargs) - self.old_calls_to_new_calls = new_function_id - - # Returning the type. - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] - else: - if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct " - "assignments") - - return [mangle_result.result_dtypes[0]] - # }}} - - return [] - - map_call_with_kwargs = map_call - - def map_variable(self, expr): - if expr.name in self.kernel.all_inames(): - return [self.kernel.index_dtype] - - result = self.kernel.mangle_symbol( - self.kernel.target.get_device_ast_builder(), - expr.name) - - if result is not None: - result_dtype, _ = result - return [result_dtype] - - obj = self.new_assignments.get(expr.name) - - if obj is None: - obj = self.kernel.arg_dict.get(expr.name) - - if obj is None: - obj = self.kernel.temporary_variables.get(expr.name) - - if obj is None: - raise TypeInferenceFailure("name not known in type inference: %s" - % expr.name) - - from loopy.kernel.data import TemporaryVariable, KernelArgument - import loopy as lp - if isinstance(obj, (KernelArgument, TemporaryVariable)): - assert obj.dtype is not lp.auto - result = [obj.dtype] - if result[0] is None: - self.symbols_with_unknown_types.add(expr.name) - return [] - else: - return result - - else: - raise RuntimeError("unexpected type inference " - "object type for '%s'" % expr.name) - - map_tagged_variable = map_variable - - def map_lookup(self, expr): - agg_result = self.rec(expr.aggregate) - if not agg_result: - return agg_result - - numpy_dtype = agg_result[0].numpy_dtype - fields = numpy_dtype.fields - if fields is None: - raise LoopyError("cannot look up attribute '%s' in " - "non-aggregate expression '%s'" - % (expr.name, expr.aggregate)) - - try: - field = fields[expr.name] - except KeyError: - raise LoopyError("cannot look up attribute '%s' in " - "aggregate expression '%s' of dtype '%s'" - % (expr.aggregate, expr.name, numpy_dtype)) - - dtype = field[0] - return [NumpyType(dtype)] - - def map_comparison(self, expr): - # "bool" is unusable because OpenCL's 
bool has indeterminate memory - # format. - return [NumpyType(np.dtype(np.int32))] - - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison - - def map_group_hw_index(self, expr, *args): - return [self.kernel.index_dtype] - - def map_local_hw_index(self, expr, *args): - return [self.kernel.index_dtype] - - def map_reduction(self, expr, return_tuple=False): - """ - :arg return_tuple: If *True*, treat the reduction as having tuple type. - Otherwise, if *False*, the reduction must have scalar type. - """ - from loopy.symbolic import Reduction - from pymbolic.primitives import Call - - if not return_tuple and expr.is_tuple_typed: - raise LoopyError("reductions with more or fewer than one " - "return value may only be used in direct " - "assignments") - - if isinstance(expr.expr, tuple): - rec_results = [self.rec(sub_expr) for sub_expr in expr.expr] - from itertools import product - rec_results = product(*rec_results) - elif isinstance(expr.expr, Reduction): - rec_results = self.rec(expr.expr, return_tuple=return_tuple) - elif isinstance(expr.expr, Call): - rec_results = self.map_call(expr.expr, return_tuple=return_tuple) - else: - if return_tuple: - raise LoopyError("unknown reduction type for tuple reduction: '%s'" - % type(expr.expr).__name__) - else: - rec_results = self.rec(expr.expr) - - if return_tuple: - return [expr.operation.result_dtypes(self.kernel, *rec_result) - for rec_result in rec_results] - else: - return [expr.operation.result_dtypes(self.kernel, rec_result)[0] - for rec_result in rec_results] - - def map_sub_array_ref(self, expr): - return self.rec(expr.get_begin_subscript()) - - -# }}} - - -# {{{ infer single variable - -def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): - - if var_name in kernel.all_params(): - return [kernel.index_dtype], [], {}, ( - type_inf_mapper.program_callables_info) - - from functools import partial - debug = partial(_debug, kernel) - - dtype_sets = [] - - import loopy as lp - - type_inf_mapper = type_inf_mapper.copy() - - for writer_insn_id in kernel.writer_map().get(var_name, []): - writer_insn = kernel.id_to_insn[writer_insn_id] - if not isinstance(writer_insn, lp.MultiAssignmentBase): - continue - - expr = subst_expander(writer_insn.expression) - - debug(" via expr %s", expr) - if isinstance(writer_insn, lp.Assignment): - result = type_inf_mapper(expr, return_dtype_set=True) - elif isinstance(writer_insn, lp.CallInstruction): - return_dtype_set = type_inf_mapper(expr, return_tuple=True, - return_dtype_set=True) - - result = [] - for return_dtype_set in return_dtype_set: - result_i = None - found = False - for assignee, comp_dtype_set in zip( - writer_insn.assignee_var_names(), return_dtype_set): - if assignee == var_name: - found = True - result_i = comp_dtype_set - break - - assert found - if result_i is not None: - result.append(result_i) - - debug(" result: %s", result) - - dtype_sets.append(result) - - if not dtype_sets: - return ( - None, type_inf_mapper.symbols_with_unknown_types, None, - type_inf_mapper.program_callables_info) - - result = type_inf_mapper.combine(dtype_sets) - - return (result, type_inf_mapper.symbols_with_unknown_types, - type_inf_mapper.old_calls_to_new_calls, - type_inf_mapper.program_callables_info) - -# }}} - - -class _DictUnionView: - def __init__(self, children): - self.children = children - - def get(self, key): - try: - return self[key] - except KeyError: - return None - - def __getitem__(self, key): - for ch in self.children: - try: - return 
ch[key] - except KeyError: - pass - - raise KeyError(key) - - -# {{{ infer_unknown_types - -def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, - expect_completion=False): - """Infer types on temporaries and arguments.""" - - logger.debug("%s: infer types" % kernel.name) - - from functools import partial - debug = partial(_debug, kernel) - - import time - start_time = time.time() - - unexpanded_kernel = kernel - if kernel.substitutions: - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) - - new_temp_vars = kernel.temporary_variables.copy() - new_arg_dict = kernel.arg_dict.copy() - - # {{{ find names_with_unknown_types - - # contains both arguments and temporaries - names_for_type_inference = [] - - import loopy as lp - for tv in six.itervalues(kernel.temporary_variables): - assert tv.dtype is not lp.auto - if tv.dtype is None: - names_for_type_inference.append(tv.name) - - for arg in kernel.args: - assert arg.dtype is not lp.auto - if arg.dtype is None: - names_for_type_inference.append(arg.name) - - # }}} - - logger.debug("finding types for {count:d} names".format( - count=len(names_for_type_inference))) - - writer_map = kernel.writer_map() - - dep_graph = dict( - (written_var, set( - read_var - for insn_id in writer_map.get(written_var, []) - for read_var in kernel.id_to_insn[insn_id].read_dependency_names() - if read_var in names_for_type_inference)) - for written_var in names_for_type_inference) - - from loopy.tools import compute_sccs - - # To speed up processing, we sort the variables by computing the SCCs of the - # type dependency graph. Each SCC represents a set of variables whose types - # mutually depend on themselves. The SCCs are returned and processed in - # topological order. - sccs = compute_sccs(dep_graph) - - item_lookup = _DictUnionView([ - new_temp_vars, - new_arg_dict - ]) - type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info, - item_lookup) - - from loopy.symbolic import SubstitutionRuleExpander - subst_expander = SubstitutionRuleExpander(kernel.substitutions) - - # {{{ work on type inference queue - - from loopy.kernel.data import TemporaryVariable, KernelArgument - - old_calls_to_new_calls = {} - - for var_chain in sccs: - changed_during_last_queue_run = False - queue = var_chain[:] - failed_names = set() - - while queue or changed_during_last_queue_run: - if not queue and changed_during_last_queue_run: - changed_during_last_queue_run = False - # Optimization: If there's a single variable in the SCC without - # a self-referential dependency, then the type is known after a - # single iteration (we don't need to look at the expressions - # again). 
- if len(var_chain) == 1: - single_var, = var_chain - if single_var not in dep_graph[single_var]: - break - queue = var_chain[:] - - name = queue.pop(0) - item = item_lookup[name] - - debug("inferring type for %s %s", type(item).__name__, item.name) - - (result, symbols_with_unavailable_types, - new_old_calls_to_new_calls, program_callables_info) = ( - _infer_var_type( - kernel, item.name, type_inf_mapper, subst_expander)) - type_inf_mapper = type_inf_mapper.copy( - program_callables_info=program_callables_info) - - failed = not result - if not failed: - new_dtype, = result - if new_dtype.target is None: - new_dtype = new_dtype.with_target(kernel.target) - - debug(" success: %s", new_dtype) - if new_dtype != item.dtype: - debug(" changed from: %s", item.dtype) - changed_during_last_queue_run = True - - if isinstance(item, TemporaryVariable): - new_temp_vars[name] = item.copy(dtype=new_dtype) - elif isinstance(item, KernelArgument): - new_arg_dict[name] = item.copy(dtype=new_dtype) - else: - raise LoopyError("unexpected item type in type inference") - # TODO: I dont like in-place updates. Change this to something - # else. Perhaps add a function for doing this, which does it - # using a bunch of copies? - old_calls_to_new_calls.update(new_old_calls_to_new_calls) - else: - debug(" failure") - - if failed: - if item.name in failed_names: - # this item has failed before, give up. - advice = "" - if symbols_with_unavailable_types: - advice += ( - " (need type of '%s'--check for missing arguments)" - % ", ".join(symbols_with_unavailable_types)) - - if expect_completion: - raise LoopyError( - "could not determine type of '%s'%s" - % (item.name, advice)) - - else: - # We're done here. - break - - # remember that this item failed - failed_names.add(item.name) - - if set(queue) == failed_names: - # We did what we could... - print(queue, failed_names, item.name) - assert not expect_completion - break - - # can't infer type yet, put back into queue - queue.append(name) - else: - # we've made progress, reset failure markers - failed_names = set() - - # }}} - - # FIXME: copy the explanation from make_function_ready_for_codegen - # here. - for insn in kernel.instructions: - if isinstance(insn, lp.MultiAssignmentBase): - # just a dummy run over the expression, to pass over all the - # functions - # FIXME: need a check over here which checks the instruction for - # unseen cases - type_inf_mapper(insn.expression, return_tuple=isinstance(insn, - lp.CallInstruction), return_dtype_set=True) - elif isinstance(insn, (_DataObliviousInstruction, - lp.CInstruction)): - pass - else: - raise NotImplementedError("Unknown instructions type %s." % ( - type(insn).__name__)) - - program_callables_info = type_inf_mapper.program_callables_info - old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) - - end_time = time.time() - logger.debug("type inference took {dur:.2f} seconds".format( - dur=end_time - start_time)) - - pre_type_specialized_knl = unexpanded_kernel.copy( - temporary_variables=new_temp_vars, - args=[new_arg_dict[arg.name] for arg in kernel.args], - ) - - # this has to be subsitutition - from loopy.kernel.function_interface import ( - change_names_of_pymbolic_calls) - type_specialized_kernel = change_names_of_pymbolic_calls( - pre_type_specialized_knl, old_calls_to_new_calls) - - # the check is unnecessary as we would first get TypeInfereceFailure before - # encountering this. 
Move this at the start once ManglerCallable is - if expect_completion: - # if completion is expected, then it is important that all the - # callables are scoped. - from loopy.check import check_functions_are_scoped - check_functions_are_scoped(type_specialized_kernel) - - return type_specialized_kernel, program_callables_info - - -def infer_unknown_types(program, expect_completion=False): - """Infer types on temporaries and arguments.""" - from loopy.kernel import LoopKernel - if isinstance(program, LoopKernel): - # FIXME: deprecate warning needed here - from loopy.program import make_program_from_kernel - program = make_program_from_kernel(program) - - program_callables_info = program.program_callables_info - - type_uninferred_knl_callable = ( - program_callables_info[program.name]) - type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) - root_kernel, program_callables_info = ( - infer_unknown_types_for_a_single_kernel( - type_uninferred_root_kernel, - program_callables_info, expect_completion)) - - type_inferred_knl_callable = type_uninferred_knl_callable.copy( - subkernel=root_kernel) - - program_callables_info, _ = ( - program_callables_info.with_callable( - program.name, - type_inferred_knl_callable)) - - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - - # FIXME: maybe put all of this in a function? - # need to infer functions that were left out during inference - return program.copy(program_callables_info=program_callables_info) - -# }}} - - -# {{{ reduction expression helper - -def infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, program_callables_info, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info) - import loopy as lp - - if expr.is_tuple_typed: - arg_dtypes_result = type_inf_mapper( - expr, return_tuple=True, return_dtype_set=True) - - if len(arg_dtypes_result) == 1: - arg_dtypes = arg_dtypes_result[0] - else: - if unknown_types_ok: - arg_dtypes = [lp.auto] * expr.operation.arg_count - else: - raise LoopyError("failed to determine types of accumulators for " - "reduction '%s'" % expr) - else: - try: - arg_dtypes = [type_inf_mapper(expr)] - except DependencyTypeInferenceFailure: - if unknown_types_ok: - arg_dtypes = [lp.auto] - else: - raise LoopyError("failed to determine type of accumulator for " - "reduction '%s'" % expr) - - reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes) - reduction_dtypes = tuple( - dt.with_target(kernel.target) - if dt is not lp.auto else dt - for dt in reduction_dtypes) - - return tuple(arg_dtypes), reduction_dtypes, ( - type_inf_mapper.program_callables_info) - -# }}} - -# vim: foldmethod=marker -- GitLab From 2254169cf2e6972f3832afd0fe57691aed8e82fe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 4 Aug 2018 16:28:12 -0500 Subject: [PATCH 310/916] fixes infer_arg_descr. 
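
infer_arg_descr walks each call site and attaches an argument descriptor per
parameter: an array descriptor (via get_array_arg_descriptor) for SubArrayRef
arguments and a ValueArgDescriptor otherwise.  To make that possible,
with_transformed_expressions and the identity-mapper mixins now forward
**kwargs, so that a call instruction's assignees can be threaded through to
map_call.  A rough sketch of the intended invocation, using only names that
appear in the diff below (arg_descr_mapper stands in for the
ArgDescrInferenceMapper instance, i.e. "self" in the hunk):

    # illustration only; this is how the mapper is applied to a
    # CallInstruction of the caller kernel
    new_insn = insn.with_transformed_expressions(
            arg_descr_mapper, kernel, insn, assignees=insn.assignees)

Instruction types other than CallInstruction simply ignore the extra keyword
argument.
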
--- loopy/kernel/instruction.py | 16 ++++++++-------- loopy/preprocess.py | 16 ++++++++++------ loopy/symbolic.py | 35 +++++++++++++++++++---------------- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 3eb08c50a..18618d785 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -951,12 +951,12 @@ class Assignment(MultiAssignmentBase): def assignee_subscript_deps(self): return (_get_assignee_subscript_deps(self.assignee),) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, *args, **kwargs): return self.copy( - assignee=f(self.assignee, *args), - expression=f(self.expression, *args), + assignee=f(self.assignee, *args, **kwargs), + expression=f(self.expression, *args, **kwargs), predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred, *args, **kwargs) for pred in self.predicates)) # }}} @@ -1105,12 +1105,12 @@ class CallInstruction(MultiAssignmentBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, *args, **kwargs): return self.copy( - assignees=f(self.assignees, *args), - expression=f(self.expression, *args), + assignees=f(self.assignees, *args, **kwargs), + expression=f(self.expression, *args, **kwargs), predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred, *args, **kwargs) for pred in self.predicates)) # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 0b65559b0..c2ae40583 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2182,7 +2182,8 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): kw_parameters = expr.kw_parameters # descriptors for the args and kwargs: - arg_id_to_descr = dict((i, ValueArgDescriptor()) + arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.caller_kernel)) + if isinstance(par, SubArrayRef) else ValueArgDescriptor() for i, par in tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) @@ -2225,7 +2226,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): for child in expr.parameters), dict( (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) + for key, val in six.iteritems(kw_parameters)) ) map_call_with_kwargs = map_call @@ -2237,9 +2238,12 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): for insn in kernel.instructions: if isinstance(insn, CallInstruction): # In call instructions the assignees play an important in - # determining the arg_id_to_dtype new_insns.append(insn.with_transformed_expressions( - self, kernel, insn)) + self, kernel, insn, assignees=insn.assignees)) + # determining the arg_id_to_dtype + # new_expr = self.map_call(insn.expression, kernel, insn, + # assignees=insn.assignees) + # new_insns.append(insn.copy(expression=new_expr)) elif isinstance(insn, MultiAssignmentBase): new_insns.append(insn.with_transformed_expressions( self, kernel, insn)) @@ -2252,7 +2256,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): return kernel.copy(instructions=new_insns) -def infer_arg_descr_from_root_kernel(kernel, program_callables_info): +def traverse_to_infer_arg_descr(kernel, program_callables_info): """ Returns a copy of *kernel* with the argument shapes and strides matching for scoped functions in the *kernel*. 
Refer @@ -2280,7 +2284,7 @@ def infer_arg_descr(program): program.program_callables_info.with_edit_callables_mode()) root_kernel = program.root_kernel - new_root_kernel, program_callables_info = infer_arg_descr_from_root_kernel( + new_root_kernel, program_callables_info = traverse_to_infer_arg_descr( root_kernel, program_callables_info) new_root_kernel_callable = root_kernel_callable.copy( subkernel=new_root_kernel) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 7bc2c792a..54dd61966 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -69,22 +69,23 @@ import numpy as np # {{{ mappers with support for loopy-specific primitives class IdentityMapperMixin(object): - def map_literal(self, expr, *args): + def map_literal(self, expr, *args, **kwargs): return expr - def map_array_literal(self, expr, *args): - return type(expr)(tuple(self.rec(ch, *args) for ch in expr.children)) + def map_array_literal(self, expr, *args, **kwargs): + return type(expr)(tuple(self.rec(ch, *args, **kwargs) for ch in + expr.children)) - def map_group_hw_index(self, expr, *args): + def map_group_hw_index(self, expr, *args, **kwargs): return expr - def map_local_hw_index(self, expr, *args): + def map_local_hw_index(self, expr, *args, **kwargs): return expr - def map_loopy_function_identifier(self, expr, *args): + def map_loopy_function_identifier(self, expr, *args, **kwargs): return expr - def map_reduction(self, expr, *args): + def map_reduction(self, expr, *args, **kwargs): mapped_inames = [self.rec(p.Variable(iname), *args) for iname in expr.inames] new_inames = [] @@ -98,15 +99,15 @@ class IdentityMapperMixin(object): return Reduction( expr.operation, tuple(new_inames), - self.rec(expr.expr, *args), + self.rec(expr.expr, *args, **kwargs), allow_simultaneous=expr.allow_simultaneous) - def map_tagged_variable(self, expr, *args): + def map_tagged_variable(self, expr, *args, **kwargs): # leaf, doesn't change return expr - def map_type_annotation(self, expr, *args): - return type(expr)(expr.type, self.rec(expr.child, *args)) + def map_type_annotation(self, expr, *args, **kwargs): + return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs)) def map_sub_array_ref(self, expr, *args, **kwargs): return SubArrayRef(self.rec(expr.swept_inames, *args, **kwargs), @@ -1098,12 +1099,14 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state): + def map_variable(self, expr, expn_state, *args, **kwargs): name, tag = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - return IdentityMapper.map_variable(self, expr, expn_state) + return IdentityMapper.map_variable(self, expr, expn_state, *args, + **kwargs) else: - return self.map_substitution(name, tag, (), expn_state) + return self.map_substitution(name, tag, (), expn_state, *args, + **kwargs) def map_call(self, expr, expn_state): if not isinstance(expr.function, p.Variable): @@ -1158,7 +1161,7 @@ class RuleAwareIdentityMapper(IdentityMapper): else: return sym - def __call__(self, expr, kernel, insn): + def __call__(self, expr, kernel, insn, *args, **kwargs): from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -1167,7 +1170,7 @@ class RuleAwareIdentityMapper(IdentityMapper): kernel=kernel, instruction=insn, stack=(), - arg_context={})) + arg_context={}), *args, **kwargs) def map_instruction(self, kernel, insn): return insn -- GitLab From 
b3327cf50219f4e130763d835954cf748254bc92 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 5 Aug 2018 19:59:18 -0500 Subject: [PATCH 311/916] basic calling kernel from kernel works. --- loopy/__init__.py | 4 +- loopy/kernel/creation.py | 13 ++++- loopy/kernel/data.py | 2 +- loopy/kernel/function_interface.py | 41 +++++++------- loopy/kernel/tools.py | 1 + loopy/preprocess.py | 16 +++--- loopy/target/c/__init__.py | 3 +- loopy/transform/callable.py | 89 ++++++++++++++++++------------ 8 files changed, 101 insertions(+), 68 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 8b5026032..a62d30497 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -64,7 +64,7 @@ from loopy.kernel.tools import ( get_subkernels, get_subkernel_to_insn_id_map) from loopy.types import to_loopy_type -from loopy.kernel.creation import make_kernel, UniqueName +from loopy.kernel.creation import make_kernel, UniqueName, make_kernel_function from loopy.library.reduction import register_reduction_parser # {{{ import transforms @@ -185,7 +185,7 @@ __all__ = [ "SubstitutionRule", "CallMangleInfo", - "make_kernel", "UniqueName", + "make_kernel", "UniqueName", "make_kernel_function", "register_reduction_parser", diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 54bd5b219..62c268e62 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2129,6 +2129,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) fixed_parameters = kwargs.pop("fixed_parameters", {}) + make_program = kwargs.pop("make_program", True) if defines: from warnings import warn @@ -2352,8 +2353,16 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): creation_plog.done() - from loopy.program import make_program_from_kernel - return make_program_from_kernel(knl) + if make_program: + from loopy.program import make_program_from_kernel + return make_program_from_kernel(knl) + else: + return knl + + +def make_kernel_function(*args, **kwargs): + kwargs['make_program'] = False + return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 417212b33..9ba288961 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -363,7 +363,7 @@ class ArrayArg(ArrayBase, KernelArgument): def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - kwargs["is_output_only"] = kwargs.pop("is_output_only", False) + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) super(ArrayArg, self).__init__(*args, **kwargs) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 095d5ff0e..cbc0e641b 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -227,7 +227,7 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - def with_descrs(self, arg_id_to_descr): + def with_descrs(self, arg_id_to_descr, program_callables_info): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -359,10 +359,12 @@ class ScalarCallable(InKernelCallable): raise LoopyError("No type inference information present for " "the function %s." 
% (self.name)) - def with_descrs(self, arg_id_to_descr): + def with_descrs(self, arg_id_to_descr, program_callables_info): arg_id_to_descr[-1] = ValueArgDescriptor() - return self.copy(arg_id_to_descr=arg_id_to_descr) + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + program_callables_info) def with_hw_axes_sizes(self, global_size, local_size): return self.copy() @@ -492,28 +494,25 @@ class CallableKernel(InKernelCallable): sizes for the :attr:`subkernel` of the callable. """ - fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) - init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target") + fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") hash_fields = fields def __init__(self, subkernel, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): + arg_id_to_descr=None): assert isinstance(subkernel, LoopKernel) super(CallableKernel, self).__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) - self.name_in_target = name_in_target self.subkernel = subkernel.copy( args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) if arg.dtype is not None else arg for arg in subkernel.args]) def __getinitargs__(self): return (self.subkernel, self.arg_id_to_dtype, - self.arg_id_to_descr, self.name_in_target) + self.arg_id_to_descr) @property def name(self): @@ -561,7 +560,7 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=specialized_kernel, arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info - def with_descrs(self, arg_id_to_descr): + def with_descrs(self, arg_id_to_descr, program_callables_info): # tune the subkernel so that we have the matching shapes and # dim_tags @@ -589,9 +588,16 @@ class CallableKernel(InKernelCallable): "ArrayArgDescriptor or ValueArgDescriptor -- got %s." % type(descr)) descriptor_specialized_knl = self.subkernel.copy(args=new_args) + from loopy.preprocess import traverse_to_infer_arg_descr + descriptor_specialized_knl, program_callables_info = ( + traverse_to_infer_arg_descr(descriptor_specialized_knl, + program_callables_info)) - return self.copy(subkernel=descriptor_specialized_knl, - arg_id_to_descr=arg_id_to_descr) + return ( + self.copy( + subkernel=descriptor_specialized_knl, + arg_id_to_descr=arg_id_to_descr), + program_callables_info) def with_packing_for_args(self): from loopy.kernel.data import AddressSpace @@ -617,15 +623,12 @@ class CallableKernel(InKernelCallable): def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None and - self.name_in_target is not None) + self.arg_id_to_descr is not None) def generate_preambles(self, target): """ Yields the *target* specific preambles. """ - # FIXME TODO: This is not correct, as the code code preamble generated - # during the code generationg of the child kernel, does not guarantee - # that this thing would be updated. + # FIXME Check that this is correct. 
return yield @@ -678,7 +681,7 @@ class CallableKernel(InKernelCallable): for par, par_dtype in zip( parameters, par_dtypes)] - return var(self.name_in_target)(*c_parameters), False + return var(self.subkernel.name)(*c_parameters), False # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 1c37ae407..c866c9c6a 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1926,6 +1926,7 @@ def infer_arg_is_output_only(kernel): """ from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] + for arg in kernel.args: if isinstance(arg, (ArrayArg, ImageArg, ValueArg)): if arg.is_output_only is not None: diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c2ae40583..d559ca2bb 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2181,9 +2181,9 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): assert isinstance(expr, CallWithKwargs) kw_parameters = expr.kw_parameters - # descriptors for the args and kwargs: + # descriptors for the args and kwargs of the Call arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.caller_kernel)) - if isinstance(par, SubArrayRef) else ValueArgDescriptor() + if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) for i, par in tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) @@ -2205,9 +2205,10 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): combined_arg_id_to_descr.update(assignee_id_to_descr) # specializing the function according to the parameter description - new_in_knl_callable = ( - self.program_callables_info[expr.function.name].with_descrs( - combined_arg_id_to_descr)) + in_knl_callable = self.program_callables_info[expr.function.name] + new_in_knl_callable, self.program_callables_info = ( + in_knl_callable.with_descrs( + combined_arg_id_to_descr, self.program_callables_info)) self.program_callables_info, new_func_id = ( self.program_callables_info.with_callable( expr.function.function, @@ -2238,12 +2239,9 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): for insn in kernel.instructions: if isinstance(insn, CallInstruction): # In call instructions the assignees play an important in + # determining the arg_id_to_descr new_insns.append(insn.with_transformed_expressions( self, kernel, insn, assignees=insn.assignees)) - # determining the arg_id_to_dtype - # new_expr = self.map_call(insn.expression, kernel, insn, - # assignees=insn.assignees) - # new_insns.append(insn.copy(expression=new_expr)) elif isinstance(insn, MultiAssignmentBase): new_insns.append(insn.with_transformed_expressions( self, kernel, insn)) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 1db14c84a..1579bb313 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -895,7 +895,8 @@ class CASTBuilder(ASTBuilderBase): func_id = insn.expression.function.name in_knl_callable = codegen_state.program_callables_info[func_id] - if in_knl_callable.name_in_target == 'loopy_make_tuple': + if isinstance(in_knl_callable, ScalarCallable) and ( + in_knl_callable.name_in_target == 'loopy_make_tuple'): return self.emit_tuple_assignment(codegen_state, insn) in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index c67b307fe..9de150299 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -37,7 +37,7 @@ from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff from 
loopy.kernel.function_interface import (get_kw_pos_association, change_names_of_pymbolic_calls) - +from loopy.program import Program, ResolvedFunctionMarker __doc__ = """ .. currentmodule:: loopy @@ -52,7 +52,6 @@ __doc__ = """ def resolved_callables_from_function_lookup(program, func_id_to_kernel_callable_mapper): - from loopy.program import ResolvedFunctionMarker program_callables_info = program.program_callables_info program_callables_info = program_callables_info.with_edit_callables_mode() @@ -140,19 +139,18 @@ class _RegisterCalleeKernel(ImmutableRecord): :func:`loopy.transform.register_callable_kernel` picklable. As python cannot pickle lexical closures. """ - fields = set(['function_name', 'callable_kernel']) + fields = set(['callable_kernel']) - def __init__(self, function_name, callable_kernel): - self.function_name = function_name + def __init__(self, callable_kernel): self.callable_kernel = callable_kernel def __call__(self, target, identifier): - if identifier == self.function_name: + if identifier == self.callable_kernel.subkernel.name: return self.callable_kernel return None -def register_callable_kernel(caller_kernel, function_name, callee_kernel): +def register_callable_kernel(program, callee_kernel): """Returns a copy of *caller_kernel*, which would resolve *function_name* in an expression as a call to *callee_kernel*. @@ -163,53 +161,76 @@ def register_callable_kernel(caller_kernel, function_name, callee_kernel): # {{{ sanity checks - assert isinstance(caller_kernel, LoopKernel) + assert isinstance(program, Program) assert isinstance(callee_kernel, LoopKernel) - assert isinstance(function_name, str) # check to make sure that the variables with 'out' direction is equal to # the number of assigness in the callee kernel intructions. - from loopy.kernel.tools import infer_arg_is_output_only - callee_kernel = infer_arg_is_output_only(callee_kernel) expected_num_assignees = len([arg for arg in callee_kernel.args if arg.is_output_only]) expected_num_parameters = len(callee_kernel.args) - expected_num_assignees - for insn in caller_kernel.instructions: - if isinstance(insn, CallInstruction) and ( - insn.expression.function.name == 'function_name'): - if insn.assignees != expected_num_assignees: - raise LoopyError("The number of arguments with 'out' direction " - "in callee kernel %s and the number of assignees in " - "instruction %s do not match." % ( - callee_kernel.name, insn.id)) - if insn.expression.prameters != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of parameters in " - "instruction %s do not match." % ( - callee_kernel.name, insn.id)) - - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("unknown instruction %s" % type(insn)) + for in_knl_callable in program.program_callables_info.values(): + caller_kernel = in_knl_callable.subkernel + for insn in caller_kernel.instructions: + if isinstance(insn, CallInstruction) and ( + insn.expression.function.name == callee_kernel.name): + if len(insn.assignees) != expected_num_assignees: + raise LoopyError("The number of arguments with 'out' direction " + "in callee kernel %s and the number of assignees in " + "instruction %s do not match." 
% ( + callee_kernel.name, insn.id)) + if len(insn.expression.parameters) != expected_num_parameters: + raise LoopyError("The number of expected arguments " + "for the callee kernel %s and the number of parameters " + "in instruction %s do not match." % ( + callee_kernel.name, insn.id)) + + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("unknown instruction %s" % type(insn)) # }}} + # take the function resolvers from the Program and resolve the functions in + # the callee kernel + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + callee_kernel.substitutions, + callee_kernel.get_var_name_generator()) + + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, callee_kernel, program_callables_info, + program.func_id_to_in_knl_callable_mappers) + + callee_kernel = rule_mapping_context.finish_kernel( + resolved_function_marker.map_kernel(callee_kernel)) + program_callables_info = resolved_function_marker.program_callables_info + + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + program = program.copy(program_callables_info=program_callables_info) + # making the target of the child kernel to be same as the target of parent # kernel. callable_kernel = CallableKernel(subkernel=callee_kernel.copy( - target=caller_kernel.target, - name=function_name, + target=program.target, is_called_from_host=False)) # FIXME disabling global barriers for callee kernel (for now) from loopy import set_options callee_kernel = set_options(callee_kernel, "disable_global_barriers") + # FIXME: the number of callables is wrong. This is horrible please + # compensate. + return register_function_id_to_in_knl_callable_mapper( - caller_kernel, - _RegisterCalleeKernel(function_name, callable_kernel)) + program, + _RegisterCalleeKernel(callable_kernel)) # }}} -- GitLab From 94d7eac3d505b0c41f678dc8b2788b4915f24112 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 5 Aug 2018 20:11:19 -0500 Subject: [PATCH 312/916] no more debug print statement. --- loopy/kernel/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 800ba36c0..d2723c57f 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1132,7 +1132,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ if self.overridden_get_grid_sizes_for_insn_ids: - print(self.overridden_get_grid_sizes_for_insn_ids) return self.overridden_get_grid_sizes_for_insn_ids( insn_ids, program_callables_info=program_callables_info, -- GitLab From 406278a73c90e4d92b03e95eab9617872977fe41 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 5 Aug 2018 21:48:33 -0500 Subject: [PATCH 313/916] moderaten callable kernel works. 
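
register_callable_kernel's sanity check on the number of call-site arguments
now also counts keyword arguments: when the call expression is a
CallWithKwargs, its kw_parameters are added to the positional parameters
before comparing against the callee's expected argument count.  Sketch of the
check, restating the logic added in the hunk below:

    # illustration only; mirrors loopy/transform/callable.py
    if isinstance(insn.expression, CallWithKwargs):
        kw_parameters = insn.expression.kw_parameters
    else:
        kw_parameters = {}
    if len(insn.expression.parameters
            + tuple(kw_parameters.values())) != expected_num_parameters:
        raise LoopyError(...)
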
--- loopy/transform/callable.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 9de150299..cef164242 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -174,12 +174,17 @@ def register_callable_kernel(program, callee_kernel): for insn in caller_kernel.instructions: if isinstance(insn, CallInstruction) and ( insn.expression.function.name == callee_kernel.name): + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + else: + kw_parameters = {} if len(insn.assignees) != expected_num_assignees: raise LoopyError("The number of arguments with 'out' direction " "in callee kernel %s and the number of assignees in " "instruction %s do not match." % ( callee_kernel.name, insn.id)) - if len(insn.expression.parameters) != expected_num_parameters: + if len(insn.expression.parameters+tuple( + kw_parameters.values())) != expected_num_parameters: raise LoopyError("The number of expected arguments " "for the callee kernel %s and the number of parameters " "in instruction %s do not match." % ( -- GitLab From 1fa894318f46dc1adb315f59fcf00925470b8a45 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 04:24:36 -0500 Subject: [PATCH 314/916] changes to inline callable --- loopy/program.py | 54 +++++++++++++++---- loopy/transform/callable.py | 104 ++++++++++++++++++++---------------- 2 files changed, 103 insertions(+), 55 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 5d4bae1c0..510f9ec86 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -350,22 +350,21 @@ def rename_resolved_functions_in_a_single_kernel(kernel, class ProgramCallablesInfo(ImmutableRecord): def __init__(self, resolved_functions, num_times_callables_called=None, - history_of_callable_names=None, is_being_edited=False, - old_resolved_functions={}, num_times_hit_during_editing={}, + history=None, is_being_edited=False, + num_times_hit_during_editing={}, renames_needed_after_editing={}): if num_times_callables_called is None: num_times_callables_called = dict((func_id, 1) for func_id in resolved_functions) - if history_of_callable_names is None: - history_of_callable_names = dict((func_id, [func_id]) for func_id in + if history is None: + history = dict((func_id, [func_id]) for func_id in resolved_functions) super(ProgramCallablesInfo, self).__init__( resolved_functions=resolved_functions, num_times_callables_called=num_times_callables_called, - history_of_callable_names=history_of_callable_names, - old_resolved_functions=old_resolved_functions, + history=history, is_being_edited=is_being_edited, num_times_hit_during_editing=num_times_hit_during_editing, renames_needed_after_editing=renames_needed_after_editing) @@ -375,14 +374,13 @@ class ProgramCallablesInfo(ImmutableRecord): "num_times_callables_called", "is_being_edited", "num_times_hit_during_editing", - "old_resolved_functions", - "renames_needed_after_editing",) + "renames_needed_after_editing", + "history") update_persistent_hash = LoopKernel.update_persistent_hash def with_edit_callables_mode(self): return self.copy(is_being_edited=True, - old_resolved_functions=self.resolved_functions.copy(), num_times_hit_during_editing=dict((func_id, 0) for func_id in self.resolved_functions)) @@ -400,7 +398,10 @@ class ProgramCallablesInfo(ImmutableRecord): Assumes that each callable is touched atmost once, the internal working of this function fails if that is violated. 
""" - # FIXME: add a note about using enter and exit + # FIXME: add a note about using enter and exit. ~KK + # FIXME: think about a better idea of "with_added_callable" this would + # be more convenient for developer-faced usage. ~KK + if not self.is_being_edited: if function.name in self.resolved_functions and ( self.resolved_functions[function.name] == in_kernel_callable): @@ -424,6 +425,7 @@ class ProgramCallablesInfo(ImmutableRecord): renames_needed_after_editing = self.renames_needed_after_editing.copy() num_times_hit_during_editing = self.num_times_hit_during_editing.copy() num_times_callables_called = self.num_times_callables_called.copy() + history = self.history.copy() if not resolved_for_the_first_time: if isinstance(function, (ArgExtOp, SegmentedOp)): @@ -463,8 +465,11 @@ class ProgramCallablesInfo(ImmutableRecord): if num_times_callables_called[function.name] == 0: renames_needed_after_editing[func_id] = function.name + if func_id not in history[function.name]: + history[function.name].append(func_id) return ( self.copy( + history=history, num_times_hit_during_editing=( num_times_hit_during_editing), num_times_callables_called=( @@ -493,8 +498,15 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) + if not resolved_for_the_first_time: + if unique_function_identifier not in history[function.name]: + history[function.name].append(func_id) + else: + history[unique_function_identifier] = [unique_function_identifier] + return ( self.copy( + history=history, resolved_functions=updated_resolved_functions, num_times_callables_called=num_times_callables_called, num_times_hit_during_editing=num_times_hit_during_editing, @@ -506,6 +518,7 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_callables_called = {} resolved_functions = {} + history = self.history.copy() for func_id, in_knl_callable in self.resolved_functions.items(): if isinstance(in_knl_callable, CallableKernel): @@ -521,6 +534,8 @@ class ProgramCallablesInfo(ImmutableRecord): type(in_knl_callable).__name__) if func_id in self.renames_needed_after_editing: + history.pop(func_id) + new_func_id = self.renames_needed_after_editing[func_id] resolved_functions[new_func_id] = ( in_knl_callable) @@ -539,6 +554,25 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_hit_during_editing={}, renames_needed_after_editing={}) + def with_deleted_callable(self, func_id, instances=1): + num_times_callables_called = self.num_times_callables_called.copy() + history = self.history.copy() + resolved_functions = self.resolved_functions.copy() + + assert instances <= num_times_callables_called[func_id] + + num_times_callables_called[func_id] -= instances + + if num_times_callables_called == 0: + num_times_callables_called.pop(func_id) + history.pop(func_id) + resolved_functions.pop(func_id) + + return self.copy( + resolved_functions=resolved_functions, + num_times_callables_called=num_times_callables_called, + history=history) + def __getitem__(self, item): return self.resolved_functions[item] diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index cef164242..0edf5697a 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -360,7 +360,7 @@ class KernelInliner(SubstitutionMapper): # {{{ inlining of a single call instruction -def _inline_call_instruction(kernel, callee_knl, instruction): +def _inline_call_instruction(caller_kernel, callee_knl, instruction): """ Returns a copy of *kernel* with the *instruction* in the 
*kernel* replaced by inlining :attr:`subkernel` within it. @@ -369,8 +369,8 @@ def _inline_call_instruction(kernel, callee_knl, instruction): # {{{ duplicate and rename inames - vng = kernel.get_var_name_generator() - ing = kernel.get_instruction_id_generator() + vng = caller_kernel.get_var_name_generator() + ing = caller_kernel.get_instruction_id_generator() dim_type = isl.dim_type.set iname_map = {} @@ -378,7 +378,7 @@ def _inline_call_instruction(kernel, callee_knl, instruction): iname_map[iname] = vng(callee_label+iname) new_domains = [] - new_iname_to_tags = kernel.iname_to_tags.copy() + new_iname_to_tags = caller_kernel.iname_to_tags.copy() # transferring iname tags info from the callee to the caller kernel for domain in callee_knl.domains: @@ -393,7 +393,7 @@ def _inline_call_instruction(kernel, callee_knl, instruction): dim_type, i, iname_map[iname]) new_domains.append(new_domain) - kernel = kernel.copy(domains=kernel.domains + new_domains, + kernel = caller_kernel.copy(domains=caller_kernel.domains + new_domains, iname_to_tags=new_iname_to_tags) # }}} @@ -519,27 +519,6 @@ def _inline_call_instruction(kernel, callee_knl, instruction): # }}} - # {{{ transferring the scoped functions from callee to caller - - callee_scoped_calls_collector = CalleeScopedCallsCollector( - callee_knl.scoped_functions) - callee_scoped_calls_dict = {} - - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - callee_scoped_calls_dict.update(dict(callee_scoped_calls_collector( - insn.expression))) - elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknown type of instruction %s." % type( - insn)) - - kernel = change_names_of_pymbolic_calls(kernel, - callee_scoped_calls_dict) - - # }}} - return kernel # }}} @@ -547,29 +526,29 @@ def _inline_call_instruction(kernel, callee_knl, instruction): # {{{ inline callable kernel -# FIXME This should take a 'within' parameter to be able to only inline -# *some* calls to a kernel, but not others. -def inline_callable_kernel(kernel, function_name): - """ - Returns a copy of *kernel* with the callable kernel addressed by - (scoped) name *function_name* inlined. - """ - from loopy.preprocess import infer_arg_descr - kernel = infer_arg_descr(kernel) - - old_insns = kernel.instructions +def _inline_single_callable_kernel(caller_kernel, function_name, + program_callables_info): + old_insns = caller_kernel.instructions for insn in old_insns: if isinstance(insn, CallInstruction): # FIXME This seems to use identifiers across namespaces. Why not - # check whether the function is a scoped function first? - if insn.expression.function.name in kernel.scoped_functions: - in_knl_callable = kernel.scoped_functions[ + # check whether the function is a scoped function first? 
~AK + if insn.expression.function.name in program_callables_info: + history_of_identifier = program_callables_info.history[ insn.expression.function.name] + from loopy.kernel.function_interface import CallableKernel - if isinstance(in_knl_callable, CallableKernel) and ( - in_knl_callable.subkernel.name == function_name): - kernel = _inline_call_instruction( - kernel, in_knl_callable.subkernel, insn) + if function_name in history_of_identifier: + in_knl_callable = program_callables_info[ + insn.expression.function.name] + assert isinstance(in_knl_callable, CallableKernel) + new_caller_kernel = _inline_call_instruction( + caller_kernel, in_knl_callable.subkernel, insn) + program_callables_info = ( + program_callables_info.with_deleted_callable( + insn.expression.function.name, + program_callables_info.num_times_callables_called[ + caller_kernel.name])) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass @@ -578,7 +557,42 @@ def inline_callable_kernel(kernel, function_name): "Unknown instruction type %s" % type(insn).__name__) - return kernel + return new_caller_kernel, program_callables_info + + +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. +def inline_callable_kernel(program, function_name): + """ + Returns a copy of *kernel* with the callable kernel addressed by + (scoped) name *function_name* inlined. + """ + from loopy.preprocess import infer_arg_descr + program = infer_arg_descr(program) + program_callables_info = program.program_callables_info + + edited_callable_kernels = {} + + for func_id, in_knl_callable in program.program_callables_info.items(): + if function_name not in program_callables_info.history[func_id] and ( + isinstance(in_knl_callable, CallableKernel)): + caller_kernel = in_knl_callable.subkernel + caller_kernel, program_callables_info = ( + _inline_single_callable_kernel(caller_kernel, + function_name, + program.program_callables_info)) + edited_callable_kernels[func_id] = in_knl_callable.copy( + subkernel=caller_kernel) + + new_resolved_functions = {} + for func_id, in_knl_callable in program_callables_info.items(): + if func_id in edited_callable_kernels: + new_resolved_functions[func_id] = edited_callable_kernels[func_id] + else: + new_resolved_functions[func_id] = in_knl_callable + + program_callables_info = program_callables_info.copy( + resolved_functions=new_resolved_functions) # }}} -- GitLab From d29e870a5d3db3909bc1fcc6ac087cbd24d7a253 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 05:05:41 -0500 Subject: [PATCH 315/916] basic inlining works. 
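
With the counting fix in ProgramCallablesInfo.with_deleted_callable (compare
num_times_callables_called[func_id] against zero, not the whole dict) and
inline_callable_kernel returning the updated Program, the user-facing flow
works end to end.  A minimal usage sketch, pieced together from the tests
updated later in this series; the domains and kernel bodies are placeholders:

    import loopy as lp

    callee = lp.make_kernel_function(
            "{[i]: 0<=i<6}",
            "a[i] = 2*b[i]",
            name="callee_fn1")
    knl = lp.make_kernel(
            "{[j]: 0<=j<6}",
            "[j]: y[j] = callee_fn1([j]: x[j])")
    knl = lp.register_callable_kernel(knl, callee)
    knl = lp.inline_callable_kernel(knl, "callee_fn1")
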
--- loopy/program.py | 2 +- loopy/transform/callable.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 510f9ec86..4428e9823 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -563,7 +563,7 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_callables_called[func_id] -= instances - if num_times_callables_called == 0: + if num_times_callables_called[func_id] == 0: num_times_callables_called.pop(func_id) history.pop(func_id) resolved_functions.pop(func_id) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 0edf5697a..3549d1b75 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -570,11 +570,12 @@ def inline_callable_kernel(program, function_name): from loopy.preprocess import infer_arg_descr program = infer_arg_descr(program) program_callables_info = program.program_callables_info + old_program_callables_info = program_callables_info.copy() edited_callable_kernels = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if function_name not in program_callables_info.history[func_id] and ( + for func_id, in_knl_callable in old_program_callables_info.items(): + if function_name not in old_program_callables_info.history[func_id] and ( isinstance(in_knl_callable, CallableKernel)): caller_kernel = in_knl_callable.subkernel caller_kernel, program_callables_info = ( @@ -594,6 +595,8 @@ def inline_callable_kernel(program, function_name): program_callables_info = program_callables_info.copy( resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=program_callables_info) + # }}} -- GitLab From 1e28c40a3cdc8b44ba2b05631e6942cfd79444cf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 06:18:06 -0500 Subject: [PATCH 316/916] passes test_callables --- loopy/transform/callable.py | 96 ++++++++++++++++--------- loopy/transform/pack_and_unpack_args.py | 36 +++++++++- test/test_callables.py | 77 ++++++++++---------- test/testlib.py | 13 ++-- 4 files changed, 144 insertions(+), 78 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 3549d1b75..f73fb9003 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -28,7 +28,6 @@ import islpy as isl from pymbolic.primitives import CallWithKwargs from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, @@ -36,13 +35,13 @@ from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import (get_kw_pos_association, - change_names_of_pymbolic_calls) + change_names_of_pymbolic_calls, CallableKernel, ScalarCallable) from loopy.program import Program, ResolvedFunctionMarker __doc__ = """ .. currentmodule:: loopy -.. autofunction:: register_function_resolver +.. autofunction:: register_function_id_to_in_knl_callable_mapper .. 
autofunction:: register_callable_kernel """ @@ -170,31 +169,38 @@ def register_callable_kernel(program, callee_kernel): arg.is_output_only]) expected_num_parameters = len(callee_kernel.args) - expected_num_assignees for in_knl_callable in program.program_callables_info.values(): - caller_kernel = in_knl_callable.subkernel - for insn in caller_kernel.instructions: - if isinstance(insn, CallInstruction) and ( - insn.expression.function.name == callee_kernel.name): - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters + if isinstance(in_knl_callable, CallableKernel): + caller_kernel = in_knl_callable.subkernel + for insn in caller_kernel.instructions: + if isinstance(insn, CallInstruction) and ( + insn.expression.function.name == callee_kernel.name): + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + else: + kw_parameters = {} + if len(insn.assignees) != expected_num_assignees: + raise LoopyError("The number of arguments with 'out' " + "direction " "in callee kernel %s and the number " + "of assignees in " "instruction %s do not " + "match." % ( + callee_kernel.name, insn.id)) + if len(insn.expression.parameters+tuple( + kw_parameters.values())) != expected_num_parameters: + raise LoopyError("The number of expected arguments " + "for the callee kernel %s and the number of " + "parameters in instruction %s do not match." + % (callee_kernel.name, insn.id)) + + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass else: - kw_parameters = {} - if len(insn.assignees) != expected_num_assignees: - raise LoopyError("The number of arguments with 'out' direction " - "in callee kernel %s and the number of assignees in " - "instruction %s do not match." % ( - callee_kernel.name, insn.id)) - if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of parameters " - "in instruction %s do not match." % ( - callee_kernel.name, insn.id)) - - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("unknown instruction %s" % type(insn)) + raise NotImplementedError("unknown instruction %s" % type(insn)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% + type(in_knl_callable).__name__) # }}} @@ -537,12 +543,11 @@ def _inline_single_callable_kernel(caller_kernel, function_name, history_of_identifier = program_callables_info.history[ insn.expression.function.name] - from loopy.kernel.function_interface import CallableKernel if function_name in history_of_identifier: in_knl_callable = program_callables_info[ insn.expression.function.name] assert isinstance(in_knl_callable, CallableKernel) - new_caller_kernel = _inline_call_instruction( + caller_kernel = _inline_call_instruction( caller_kernel, in_knl_callable.subkernel, insn) program_callables_info = ( program_callables_info.with_deleted_callable( @@ -557,7 +562,7 @@ def _inline_single_callable_kernel(caller_kernel, function_name, "Unknown instruction type %s" % type(insn).__name__) - return new_caller_kernel, program_callables_info + return caller_kernel, program_callables_info # FIXME This should take a 'within' parameter to be able to only inline @@ -581,7 +586,7 @@ def inline_callable_kernel(program, function_name): caller_kernel, program_callables_info = ( _inline_single_callable_kernel(caller_kernel, function_name, - program.program_callables_info)) + program_callables_info)) edited_callable_kernels[func_id] = in_knl_callable.copy( subkernel=caller_kernel) @@ -642,7 +647,8 @@ class DimChanger(IdentityMapper): return expr.aggregate.index(tuple(new_indices)) -def _match_caller_callee_argument_dimension(caller_knl, callee_function_name): +def _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, callee_function_name): """ Returns a copy of *caller_knl* with the instance of :class:`loopy.kernel.function_interface.CallableKernel` addressed by @@ -722,6 +728,32 @@ def _match_caller_callee_argument_dimension(caller_knl, callee_function_name): return change_names_of_pymbolic_calls(caller_knl, pymbolic_calls_to_new_callables) + +def _match_caller_callee_argument_dimension_(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = ( + _match_caller_callee_argument_dimension_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs)) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 87136d017..734072574 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -24,6 +24,9 @@ THE SOFTWARE. 
from loopy.diagnostic import LoopyError from loopy.kernel.instruction import CallInstruction +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable from loopy.symbolic import SubArrayRef __doc__ = """ @@ -33,7 +36,8 @@ __doc__ = """ """ -def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, +def pack_and_unpack_args_for_call_for_single_kernel(kernel, + program_callables_info, call_name, args_to_pack=None, args_to_unpack=None): """ Returns a a copy of *kernel* with instructions appended to copy the @@ -50,6 +54,7 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, which must be unpacked. If set *None*, it is interpreted that all the array arguments should be unpacked. """ + assert isinstance(kernel, LoopKernel) new_domains = [] new_tmps = kernel.temporary_variables.copy() old_insn_to_new_insns = {} @@ -58,10 +63,10 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, if not isinstance(insn, CallInstruction): # pack and unpack call only be done for CallInstructions. continue - if insn.expression.function.name not in kernel.scoped_functions: + if insn.expression.function.name not in program_callables_info: continue - in_knl_callable = kernel.scoped_functions[ + in_knl_callable = program_callables_info[ insn.expression.function.name] if in_knl_callable.name != call_name: @@ -314,4 +319,29 @@ def pack_and_unpack_args_for_call(kernel, call_name, args_to_pack=None, return kernel + +def pack_and_unpack_args_for_call(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = pack_and_unpack_args_for_call_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # vim: foldmethod=marker diff --git a/test/test_callables.py b/test/test_callables.py index 9dce5a84a..f25bbbe6f 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -52,7 +52,8 @@ def test_register_function_lookup(ctx_factory): """ y[i] = log2(x[i]) """) - prog = lp.register_function_lookup(prog, register_log2_lookup) + prog = lp.register_function_id_to_in_knl_callable_mapper(prog, + register_log2_lookup) evt, (out, ) = prog(queue, x=x) @@ -68,17 +69,17 @@ def test_register_knl(ctx_factory, inline): x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) - grandchild_knl = lp.make_kernel( + grandchild_knl = lp.make_kernel_function( "{[i, j]:0<= i, j< 16}", """ c[i, j] = 2*a[i, j] + 3*b[i, j] - """) + """, name='linear_combo1') - child_knl = lp.make_kernel( + child_knl = lp.make_kernel_function( "{[i, j]:0<=i, j < 16}", """ [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) - """) + """, name='linear_combo2') parent_knl = lp.make_kernel( "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", @@ -97,10 +98,10 @@ def test_register_knl(ctx_factory, inline): shape=(16, 16, 16, 16, 16)), '...'], ) - child_knl = lp.register_callable_kernel( - child_knl, 'linear_combo1', grandchild_knl) knl = lp.register_callable_kernel( - parent_knl, 'linear_combo2', child_knl) + parent_knl, child_knl) + knl = lp.register_callable_kernel( + knl, grandchild_knl) if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo2') knl = lp.inline_callable_kernel(knl, 'linear_combo1') @@ -120,11 +121,11 @@ def test_slices_with_negative_step(ctx_factory, inline): x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) - child_knl = lp.make_kernel( + child_knl = lp.make_kernel_function( "{[i, j]:0<=i, j < 16}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] - """) + """, name="linear_combo") parent_knl = lp.make_kernel( "{[i, k, m]: 0<=i, k, m<16}", @@ -148,7 +149,7 @@ def test_slices_with_negative_step(ctx_factory, inline): ) knl = lp.register_callable_kernel( - parent_knl, 'linear_combo', child_knl) + parent_knl, child_knl) if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo') @@ -169,7 +170,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - callee_knl = lp.make_kernel( + callee_knl = lp.make_kernel_function( "{[i, j]:0<=i, j < %d}" % n, """ h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] @@ -177,11 +178,8 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] """, [ - lp.GlobalArg('f'), - lp.GlobalArg('e'), - lp.GlobalArg('h'), - lp.GlobalArg('g'), - '...']) + lp.GlobalArg('f, e, h, g'), '...'], + name='linear_combo') caller_knl = lp.make_kernel( "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, @@ -194,7 +192,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): """) knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl) + caller_knl, callee_knl) if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo') @@ -223,11 +221,11 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) y_dev = 
cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - callee_knl = lp.make_kernel( + callee_knl = lp.make_kernel_function( "{[i, j]:0<=i, j < 16}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] - """) + """, name='linear_combo') callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") @@ -241,7 +239,7 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") knl = lp.register_callable_kernel( - caller_knl, 'linear_combo', callee_knl) + caller_knl, callee_knl) if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo') @@ -264,23 +262,23 @@ def test_shape_translation_through_sub_array_ref(ctx_factory, inline): x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) - callee1 = lp.make_kernel( + callee1 = lp.make_kernel_function( "{[i]: 0<=i<6}", """ a[i] = 2*abs(b[i]) - """) + """, name="callee_fn1") - callee2 = lp.make_kernel( + callee2 = lp.make_kernel_function( "{[i, j]: 0<=i<3 and 0 <= j < 2}", """ a[i, j] = 3*b[i, j] - """) + """, name="callee_fn2") - callee3 = lp.make_kernel( + callee3 = lp.make_kernel_function( "{[i]: 0<=i<6}", """ a[i] = 5*b[i] - """) + """, name="callee_fn3") knl = lp.make_kernel( "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", @@ -290,9 +288,9 @@ def test_shape_translation_through_sub_array_ref(ctx_factory, inline): [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) """) - knl = lp.register_callable_kernel(knl, 'callee_fn1', callee1) - knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2) - knl = lp.register_callable_kernel(knl, 'callee_fn3', callee3) + knl = lp.register_callable_kernel(knl, callee1) + knl = lp.register_callable_kernel(knl, callee2) + knl = lp.register_callable_kernel(knl, callee3) if inline: knl = lp.inline_callable_kernel(knl, 'callee_fn1') @@ -321,7 +319,7 @@ def test_multi_arg_array_call(ctx_factory): i = p.Variable("i") index = p.Variable("index") a_i = p.Subscript(p.Variable("a"), p.Variable("i")) - argmin_kernel = lp.make_kernel( + argmin_kernel = lp.make_kernel_function( "{[i]: 0 <= i < n}", [ lp.Assignment(id="init2", assignee=index, @@ -333,7 +331,8 @@ def test_multi_arg_array_call(ctx_factory): depends_on="update"), lp.Assignment(id="update", assignee=acc_i, expression=p.Variable("min")(acc_i, a_i), - depends_on="init1,init2")]) + depends_on="init1,init2")], + name="custom_argmin") argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) @@ -346,7 +345,7 @@ def test_multi_arg_array_call(ctx_factory): knl = lp.fix_parameters(knl, n=n) knl = lp.set_options(knl, return_dict=True) - knl = lp.register_callable_kernel(knl, "custom_argmin", argmin_kernel) + knl = lp.register_callable_kernel(knl, argmin_kernel) b = np.random.randn(n) evt, out_dict = knl(queue, b=b) tol = 1e-15 @@ -363,17 +362,17 @@ def test_packing_unpacking(ctx_factory, inline): x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) - callee1 = lp.make_kernel( + callee1 = lp.make_kernel_function( "{[i]: 0<=i<6}", """ a[i] = 2*b[i] - """) + """, name="callee_fn1") - callee2 = lp.make_kernel( + callee2 = lp.make_kernel_function( "{[i, j]: 0<=i<2 and 0 <= j < 3}", """ a[i, j] = 3*b[i, j] - """) + """, name="callee_fn2") knl = lp.make_kernel( "{[i, j, k]: 0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}", @@ -382,8 +381,8 @@ def test_packing_unpacking(ctx_factory, inline): [k]: y2[k] = callee_fn2([k]: x2[k]) """) - knl = lp.register_callable_kernel(knl, 
'callee_fn1', callee1) - knl = lp.register_callable_kernel(knl, 'callee_fn2', callee2) + knl = lp.register_callable_kernel(knl, callee1) + knl = lp.register_callable_kernel(knl, callee2) knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn1') knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn2') diff --git a/test/testlib.py b/test/testlib.py index 106a07aeb..eebc792d0 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -139,12 +139,14 @@ class SeparateTemporariesPreambleTestPreambleGenerator( class Log2Callable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) dtype = arg_id_to_dtype[0].numpy_dtype @@ -162,8 +164,11 @@ class Log2Callable(lp.ScalarCallable): name_in_target = "log2l" from loopy.types import NumpyType - return self.copy(name_in_target=name_in_target, - arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) + return ( + self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + program_callables_info) def register_log2_lookup(target, identifier): -- GitLab From 96c8ee2734d8e7ab69dd7cf4e52c828687c4f207 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 11:46:25 -0500 Subject: [PATCH 317/916] minor bug in with_descr of ReductionCallables. --- loopy/library/function.py | 6 ++-- loopy/library/reduction.py | 6 ++-- loopy/program.py | 2 +- loopy/transform/callable.py | 61 +++---------------------------------- 4 files changed, 13 insertions(+), 62 deletions(-) diff --git a/loopy/library/function.py b/loopy/library/function.py index 50bde1744..8fcdcd6da 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -35,12 +35,14 @@ class MakeTupleCallable(ScalarCallable): return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target="loopy_make_tuple"), program_callables_info) - def with_descrs(self, arg_id_to_descr): + def with_descrs(self, arg_id_to_descr, program_callables_info): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) - return self.copy(arg_id_to_descr=new_arg_id_to_descr) + return ( + self.copy(arg_id_to_descr=new_arg_id_to_descr), + program_callables_info) class IndexOfCallable(ScalarCallable): diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index ad72bc19d..383337b2f 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -408,11 +408,13 @@ class ReductionCallable(ScalarCallable): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=name_in_target), program_callables_info - def with_descr(self, arg_id_to_descr): + def with_descr(self, arg_id_to_descr, program_callables_info): from loopy.library.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() - return self.copy(arg_id_to_descr=arg_id_to_descr) + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + program_callables_info) def generate_preambles(self, target): if isinstance(self.name, ArgExtOp): diff --git a/loopy/program.py b/loopy/program.py index 4428e9823..ff68ae4e0 100644 --- 
a/loopy/program.py +++ b/loopy/program.py @@ -127,7 +127,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): def map_reduction(self, expr, expn_state): for func_id in ( expr.operation.get_scalar_callables()): - in_knl_callable = self.find_resolved_function_from_identifier(func_id) + in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) assert in_knl_callable is not None self.program_callables_info, _ = ( self.program_callables_info.with_callable(func_id, diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index f73fb9003..b5b80ad89 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -32,7 +32,7 @@ from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) -from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper +from loopy.symbolic import IdentityMapper, SubstitutionMapper from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import (get_kw_pos_association, change_names_of_pymbolic_calls, CallableKernel, ScalarCallable) @@ -246,59 +246,6 @@ def register_callable_kernel(program, callee_kernel): # }}} -# {{{ callee scoped calls collector (to support inlining) - -class CalleeScopedCallsCollector(CombineMapper): - """ - Collects the scoped functions which are a part of the callee kernel and - must be transferred to the caller kernel before inlining. - - :returns: - An :class:`frozenset` of function names that are not scoped in - the caller kernel. - - .. note:: - :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are - never scoped in the pipeline. - """ - - def __init__(self, callee_scoped_functions): - self.callee_scoped_functions = callee_scoped_functions - - def combine(self, values): - import operator - from functools import reduce - return reduce(operator.or_, values, frozenset()) - - def map_call(self, expr): - if expr.function.name in self.callee_scoped_functions: - return (frozenset([(expr, - self.callee_scoped_functions[expr.function.name])]) | - self.combine((self.rec(child) for child in expr.parameters))) - else: - return self.combine((self.rec(child) for child in expr.parameters)) - - def map_call_with_kwargs(self, expr): - if expr.function.name in self.callee_scoped_functions: - return (frozenset([(expr, - self.callee_scoped_functions[expr.function.name])]) | - self.combine((self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values())))) - else: - return self.combine((self.rec(child) for child in - expr.parameters+tuple(expr.kw_parameters.values()))) - - def map_constant(self, expr): - return frozenset() - - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant - -# }}} - - # {{{ kernel inliner mapper class KernelInliner(SubstitutionMapper): @@ -648,7 +595,7 @@ class DimChanger(IdentityMapper): def _match_caller_callee_argument_dimension_for_single_kernel( - caller_knl, callee_function_name): + caller_knl, program_callables_info, callee_function_name): """ Returns a copy of *caller_knl* with the instance of :class:`loopy.kernel.function_interface.CallableKernel` addressed by @@ -659,12 +606,12 @@ def _match_caller_callee_argument_dimension_for_single_kernel( for insn in caller_knl.instructions: if not isinstance(insn, CallInstruction) or ( insn.expression.function.name not in - caller_knl.scoped_functions): + 
program_callables_info): # Call to a callable kernel can only occur through a # CallInstruction. continue - in_knl_callable = caller_knl.scoped_functions[ + in_knl_callable = program_callables_info[ insn.expression.function.name] if in_knl_callable.subkernel.name != callee_function_name: -- GitLab From ca5a6b58286fbddb347db0c5807ee6e8d058e1e0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 12:00:31 -0500 Subject: [PATCH 318/916] Mordernize test_apps --- test/test_apps.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/test/test_apps.py b/test/test_apps.py index e7f4004fa..a9c3bf2a7 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -216,7 +216,8 @@ def test_rob_stroud_bernstein(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." ], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -224,13 +225,12 @@ def test_rob_stroud_bernstein(ctx_factory): knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, - ))) + )) + print(lp.generate_code_v2(knl)) def test_rob_stroud_bernstein_full(ctx_factory): @@ -296,7 +296,8 @@ def test_rob_stroud_bernstein_full(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." ], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -310,14 +311,14 @@ def test_rob_stroud_bernstein_full(ctx_factory): from pickle import dumps, loads knl = loads(dumps(knl)) - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, tmp=np.float32, coeffs=np.float32, result=np.float32, )) - print(knl) + print(lp.generate_code_v2(knl)) def test_stencil(ctx_factory): @@ -660,7 +661,7 @@ def test_domain_tree_nesting(): lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) - parents_per_domain = knl.parents_per_domain() + parents_per_domain = knl.root_kernel.parents_per_domain() def depth(i): if parents_per_domain[i] is None: -- GitLab From 95b78c0681ec5da4444a1de0a03c3e95c5dc68ad Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 12:19:05 -0500 Subject: [PATCH 319/916] corrections in noting the history. 
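
The per-callable history is now kept as a set of the identifiers a resolved
function has been known by, rather than a list that was appended to. A rough
sketch of the intended bookkeeping, with illustrative names only (not taken
from any real kernel):

    # re-resolving a function already known as "sin" under the unique
    # name "sin_0": the new entry carries the union of the old history
    # and the new identifier, so duplicates and ordering no longer matter
    history["sin_0"] = history["sin"] | set(["sin_0"])
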
--- loopy/program.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index ff68ae4e0..e41d3830e 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -358,7 +358,7 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_callables_called = dict((func_id, 1) for func_id in resolved_functions) if history is None: - history = dict((func_id, [func_id]) for func_id in + history = dict((func_id, set([func_id])) for func_id in resolved_functions) super(ProgramCallablesInfo, self).__init__( @@ -465,8 +465,7 @@ class ProgramCallablesInfo(ImmutableRecord): if num_times_callables_called[function.name] == 0: renames_needed_after_editing[func_id] = function.name - if func_id not in history[function.name]: - history[function.name].append(func_id) + history[func_id] = history[func_id] | set([function.name]) return ( self.copy( history=history, @@ -499,10 +498,11 @@ class ProgramCallablesInfo(ImmutableRecord): in_kernel_callable) if not resolved_for_the_first_time: - if unique_function_identifier not in history[function.name]: - history[function.name].append(func_id) + history[unique_function_identifier] = ( + history[function.name] | set([unique_function_identifier])) else: - history[unique_function_identifier] = [unique_function_identifier] + history[unique_function_identifier] = set( + [unique_function_identifier]) return ( self.copy( -- GitLab From 16f16a22b2cc1a714324879ce4ed9c7f8183628a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 12:37:30 -0500 Subject: [PATCH 320/916] started work towards test_target. --- loopy/codegen/result.py | 2 +- loopy/kernel/tools.py | 4 ++-- loopy/target/cuda.py | 3 ++- loopy/target/python.py | 6 ++++-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 4318ad71c..00f19d99a 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -133,7 +133,7 @@ class CodeGenerationResult(ImmutableRecord): preamble_codes = process_preambles( getattr(self, "host_preambles", []) + - getattr(self, "device_preambles", []) + list(getattr(self, "device_preambles", [])) ) return ( diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index c866c9c6a..8e238badb 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1906,8 +1906,8 @@ def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): return None - return frozenset([_get_callee_kernel_if_insn_has_callable_kernel(id) - for id in insn_ids]) - frozenset([None]) + return frozenset([_get_callee_kernel_if_insn_has_callable_kernel(insn_id) + for insn_id in insn_ids]) - frozenset([None]) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index fe576cdca..89cbfd034 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -302,7 +302,8 @@ class CUDACASTBuilder(CASTBuilder): _, local_grid_size = \ codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.program_callables_info) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): diff --git a/loopy/target/python.py b/loopy/target/python.py index b7a83d25b..cd6e61167 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -85,14 +85,16 @@ class ExpressionToPythonMapper(StringifyMapper): def map_call(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - 
identifier_name = self.kernel.scoped_functions[expr.function.name].name + identifier_name = self.codegen_state.program_callables_info[ + expr.function.name].name if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") from loopy.kernel.function_interface import ManglerCallable - in_knl_callable = self.kernel.scoped_functions[expr.function.name] + in_knl_callable = self.codegen_state.program_callables_info[ + expr.function.name] if isinstance(in_knl_callable, ManglerCallable): from loopy.codegen import SeenFunction mangle_result = in_knl_callable.mangle_result(self.kernel) -- GitLab From 0e458716ff05beb68743e72005c7f59be3b971a6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 12:53:44 -0500 Subject: [PATCH 321/916] crucial error fix in arg_id_to_descr --- loopy/preprocess.py | 2 +- test/test_target.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d559ca2bb..affe96812 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2286,7 +2286,7 @@ def infer_arg_descr(program): root_kernel, program_callables_info) new_root_kernel_callable = root_kernel_callable.copy( subkernel=new_root_kernel) - program_callables_info.with_callable(program.name, + program_callables_info, _ = program_callables_info.with_callable(program.name, new_root_kernel_callable) program_callables_info = program_callables_info.with_exit_edit_callables_mode() diff --git a/test/test_target.py b/test/test_target.py index 7c0d003ee..7b9d4f40a 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -72,9 +72,7 @@ def test_ispc_target(occa_mode=False): knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - codegen_result = lp.generate_code_v2( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl))) + codegen_result = lp.generate_code_v2(knl) print(codegen_result.device_code()) print(codegen_result.host_code()) @@ -98,9 +96,8 @@ def test_cuda_target(): default_tag="l.auto") print( - lp.generate_code( - lp.get_one_scheduled_kernel( - lp.preprocess_kernel(knl)))[0]) + lp.generate_code_v2( + knl).device_code()) def test_generate_c_snippet(): @@ -140,10 +137,7 @@ def test_generate_c_snippet(): knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1)) knl = lp.prioritize_loops(knl, "I,k_outer,k_inner") - - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - print(lp.generate_body(knl)) + print(lp.generate_code_v2(knl)) @pytest.mark.parametrize("target", [CTarget, OpenCLTarget]) -- GitLab From 00db249f09e5412ed891e6c9dd2416d660d29c60 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 12:59:25 -0500 Subject: [PATCH 322/916] dont use kwargs while giving input to add_dependency. 
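
The first argument of add_dependency is now passed positionally rather than
as the keyword `kernel=` (presumably because the transform is now reachable
through the program/kernel dispatch layer, where the first parameter need not
keep that name). The call in add_barrier, shown for illustration:

    # before
    new_knl = add_dependency(kernel=new_knl, insn_match=insn_after,
            depends_on="id:" + id)
    # after
    new_knl = add_dependency(new_knl, insn_match=insn_after,
            depends_on="id:" + id)
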
--- loopy/transform/add_barrier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index 4af0c9c54..38bb21850 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -82,7 +82,7 @@ def add_barrier(knl, insn_before="", insn_after="", mem_kind=mem_kind) new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add]) - new_knl = add_dependency(kernel=new_knl, + new_knl = add_dependency(new_knl, insn_match=insn_after, depends_on="id:"+id) -- GitLab From fcad92735ffeae472621fa7339200eab56b59780 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 13:24:09 -0500 Subject: [PATCH 323/916] minor wrinkle in test_fortran. --- test/test_fortran.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index 1a5a0c383..6a6c51975 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -472,7 +472,7 @@ def test_precompute_some_exist(ctx_factory): knl, = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert len(knl.root_kernel.domains) == 1 knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.1") -- GitLab From 026dade5370e6279d874824fb9c8e934137f1189 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 13:27:42 -0500 Subject: [PATCH 324/916] changes the definition of realize_reduction --- test/test_reduction.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_reduction.py b/test/test_reduction.py index 6ed618f4f..96dab405a 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -238,8 +238,7 @@ def test_global_parallel_reduction(ctx_factory, size): prog = lp.precompute(prog, "red_i_outer_arg", "i_outer", temporary_scope=lp.temp_var_scope.GLOBAL, default_tag="l.auto") - knl = lp.realize_reduction(prog.root_kernel, prog.program_callables_info) - prog = prog.with_root_kernel(knl) + prog = lp.realize_reduction(prog) prog = lp.add_dependency( prog, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") -- GitLab From 7642209198dc34e5fd5efb2c96a06475da26c19e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 13:39:34 -0500 Subject: [PATCH 325/916] mordernize test. --- test/test_fortran.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index 6a6c51975..5d5f7f0b1 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -200,9 +200,9 @@ def test_assignment_to_subst_indices(ctx_factory): ref_knl = knl - assert "a" in knl.temporary_variables + assert "a" in knl.root_kernel.temporary_variables knl = lp.assignment_to_subst(knl, "a") - assert "a" not in knl.temporary_variables + assert "a" not in knl.root_kernel.temporary_variables ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl) -- GitLab From 175c79358e3297400c49a802b8ca2a0ef72578c8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 13:43:51 -0500 Subject: [PATCH 326/916] ported moren transformations to program. 
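
add_inames_to_insn and set_instruction_priority gain the
iterate_over_kernels_if_given_program decorator, so they accept either a
LoopKernel or a Program. A simplified sketch of the pattern this decorator
implements (the real code lives in loopy/program.py and may differ in
detail):

    from loopy.kernel import LoopKernel
    from loopy.kernel.function_interface import CallableKernel

    def iterate_over_kernels_if_given_program(transform):
        def wrapper(program_or_kernel, *args, **kwargs):
            if isinstance(program_or_kernel, LoopKernel):
                # plain kernel: apply the transform directly
                return transform(program_or_kernel, *args, **kwargs)

            # otherwise it is a Program: transform every callable kernel
            # and reassemble the resolved-function table
            program = program_or_kernel
            new_resolved_functions = {}
            for func_id, clbl in program.program_callables_info.items():
                if isinstance(clbl, CallableKernel):
                    clbl = clbl.copy(
                            subkernel=transform(clbl.subkernel,
                                *args, **kwargs))
                new_resolved_functions[func_id] = clbl

            new_callables = program.program_callables_info.copy(
                    resolved_functions=new_resolved_functions)
            return program.copy(program_callables_info=new_callables)

        return wrapper

This is the same per-callable iteration already used by
pack_and_unpack_args_for_call earlier in the series.
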
--- loopy/transform/iname.py | 1 + loopy/transform/instruction.py | 1 + 2 files changed, 2 insertions(+) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 20dc9a99b..caa02c17a 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1718,6 +1718,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn +@iterate_over_kernels_if_given_program def add_inames_to_insn(knl, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 910a6b2d3..93cf932b1 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -78,6 +78,7 @@ def map_instructions(kernel, insn_match, f): # {{{ set_instruction_priority +@iterate_over_kernels_if_given_program def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. -- GitLab From 59efd1c407ff4d907d1e06b86bd26a947be56fe3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Aug 2018 13:50:49 -0500 Subject: [PATCH 327/916] some more test modernization. --- loopy/auto_test.py | 2 +- test/test_loopy.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 1fc46ffd7..5ce80ed88 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -524,7 +524,7 @@ def auto_test_vs_ref( if not quiet: print(75*"-") - print("Kernel #%d:" % i) + print("Kernel:") print(75*"-") if print_code: print(get_highlighted_code( diff --git a/test/test_loopy.py b/test/test_loopy.py index 10701cee5..5baead833 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -391,8 +391,6 @@ def test_bare_data_dependency(ctx_factory): # {{{ test race detection -# FIXME: not intended just for local testing purposes. ~KK -@pytest.mark.skip def test_ilp_write_race_detection_global(ctx_factory): ctx = ctx_factory() @@ -1531,9 +1529,6 @@ def test_save_ambiguous_storage_requirements(): knl = lp.duplicate_inames(knl, "j", within="writes:out", tags={"j": "l.0"}) knl = lp.set_temporary_scope(knl, "a", "local") - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - from loopy.diagnostic import LoopyError with pytest.raises(LoopyError): lp.save_and_reload_temporaries(knl) -- GitLab From 2278ef90231c963b750924a30a28114ca6089ffc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 8 Aug 2018 00:22:45 -0500 Subject: [PATCH 328/916] [ci skip] Added fixmes from yesterday's discussion. --- loopy/program.py | 3 +++ loopy/statistics.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/loopy/program.py b/loopy/program.py index e41d3830e..bb5b9b1ac 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -349,6 +349,9 @@ def rename_resolved_functions_in_a_single_kernel(kernel, # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): + # FIXME: dont evalutate num_times_called, rahter compute it from the + # resolved_functions + # FIXME: make the edit callables thing a ContextManager. def __init__(self, resolved_functions, num_times_callables_called=None, history=None, is_being_edited=False, num_times_hit_during_editing={}, diff --git a/loopy/statistics.py b/loopy/statistics.py index 6a9744a06..74cd1bc71 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -65,6 +65,8 @@ __doc__ = """ # - The variable name, what if multiple kernels use the same name? 
# - We should also add the cumulative effect on the arguments of callee kernels # into the caller kernel. +# FIXME: add an error that there is only one callable kernel. disable for +# multiple callable kernels. # {{{ GuardedPwQPolynomial -- GitLab From aeb633804cb6fe6642b67e83b00e50e3330c2dc4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 10 Aug 2018 15:11:20 +0530 Subject: [PATCH 329/916] adjustment to pass statistics test. --- loopy/statistics.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 74cd1bc71..08b7f89e9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1108,6 +1108,16 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): + from loopy.program import Program + if isinstance(kernel, Program): + if len([in_knl_callable for in_knl_callable in + kernel.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only one CallableKernel.") + + kernel = kernel.root_kernel + try: if space is not None: set = set.align_params(space) @@ -1862,6 +1872,13 @@ def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False) def gather_access_footprints(program, ignore_uncountable=False): + # FIMXE: works only for one callable kernel till now. + if len([in_knl_callable for in_knl_callable in + program.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only one CallableKernel.") + from loopy.preprocess import preprocess_program, infer_unknown_types program = infer_unknown_types(program, expect_completion=True) -- GitLab From 40aea2d176847e1fb800ee58008012d575f18cd0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 10 Aug 2018 16:26:25 +0530 Subject: [PATCH 330/916] more test fixes. --- loopy/check.py | 9 +++++---- loopy/codegen/__init__.py | 25 ++++++++++++++++--------- loopy/program.py | 22 ++++++++++++++++++++++ loopy/transform/iname.py | 31 +++++++++++++++++++++++++++---- loopy/type_inference.py | 5 ++--- 5 files changed, 72 insertions(+), 20 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 727b02a85..f50ee5cfa 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -486,11 +486,12 @@ def check_write_destinations(kernel): # {{{ check_has_schedulable_iname_nesting def check_has_schedulable_iname_nesting(kernel): - from loopy.transform.iname import (has_schedulable_iname_nesting, - get_iname_duplication_options) - if not has_schedulable_iname_nesting(kernel): + from loopy.transform.iname import ( + has_schedulable_iname_nesting_for_single_kernel, + get_iname_duplication_options_for_single_kernel) + if not has_schedulable_iname_nesting_for_single_kernel(kernel): import itertools as it - opt = get_iname_duplication_options(kernel) + opt = get_iname_duplication_options_for_single_kernel(kernel) opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w) for i, w in it.islice(opt, 3)) raise LoopyError("Kernel does not have a schedulable iname nesting. " diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index ed1e7a5bc..e9e7c9a44 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -154,6 +154,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState(object): """ .. attribute:: kernel + .. attribute:: target .. 
attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -199,7 +200,7 @@ class CodeGenerationState(object): .. attribute:: program_callables_info """ - def __init__(self, kernel, + def __init__(self, kernel, target, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, @@ -209,6 +210,7 @@ class CodeGenerationState(object): gen_program_name=None, schedule_index_end=None): self.kernel = kernel + self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -226,7 +228,7 @@ class CodeGenerationState(object): # {{{ copy helpers - def copy(self, kernel=None, implemented_data_info=None, + def copy(self, kernel=None, target=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), var_subst_map=None, vectorization_info=None, is_generating_device_code=None, @@ -236,6 +238,9 @@ class CodeGenerationState(object): if kernel is None: kernel = self.kernel + if target is None: + target = self.target + if implemented_data_info is None: implemented_data_info = self.implemented_data_info @@ -256,6 +261,7 @@ class CodeGenerationState(object): return CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( @@ -413,7 +419,7 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_for_a_single_kernel(kernel, program_callables_info): +def generate_code_for_a_single_kernel(kernel, program_callables_info, target): """ :returns: a :class:`CodeGenerationResult` """ @@ -459,13 +465,13 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): if isinstance(arg, ArrayBase): implemented_data_info.extend( arg.decl_info( - kernel.target, + target, is_written=is_written, index_dtype=kernel.index_dtype)) elif isinstance(arg, ValueArg): implemented_data_info.append(ImplementedDataInfo( - target=kernel.target, + target=target, name=arg.name, dtype=arg.dtype, arg_class=ValueArg, @@ -488,6 +494,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions) codegen_state = CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), @@ -499,9 +506,9 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): var_name_generator=kernel.get_var_name_generator(), is_generating_device_code=False, gen_program_name=( - kernel.target.host_program_name_prefix + target.host_program_name_prefix + kernel.name - + kernel.target.host_program_name_suffix), + + target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), program_callables_info=program_callables_info) @@ -536,7 +543,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): ) preamble_generators = (kernel.preamble_generators - + kernel.target.get_device_ast_builder().preamble_generators()) + + target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -579,7 +586,7 @@ def generate_code_v2(program): if isinstance(in_knl_callable, CallableKernel): codegen_results[func_id] = ( 
generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.program_callables_info)) + program.program_callables_info, program.target)) device_preambles = set() for cgr in codegen_results.values(): diff --git a/loopy/program.py b/loopy/program.py index bb5b9b1ac..df7bd1bdd 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -192,6 +192,28 @@ class Program(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash + def copy(self, **kwargs): + if 'target' in kwargs: + target = kwargs['target'] + new_self = super(Program, self).copy(**kwargs) + new_resolved_functions = {} + for func_id, in_knl_callable in ( + new_self.program_callables_info.items()): + if isinstance(in_knl_callable, CallableKernel): + subkernel = in_knl_callable.subkernel + new_resolved_functions[func_id] = in_knl_callable.copy( + subkernel=subkernel.copy(target=target)) + else: + new_resolved_functions[func_id] = in_knl_callable + + program_callables_info = new_self.program_callables_info.copy( + resolved_functions=new_resolved_functions) + + return new_self.copy( + program_callables_info=program_callables_info) + else: + return super(Program, self).copy(**kwargs) + def get_grid_size_upper_bounds(self, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index caa02c17a..75aa62467 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -36,6 +36,7 @@ from loopy.diagnostic import LoopyError from loopy.program import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable __doc__ = """ @@ -982,7 +983,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(knl, use_boostable_into=False): +def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1048,7 +1049,7 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # If we find a duplication option and to not use boostable_into # information, we restart this generator with use_boostable_into=True if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options(knl, True): + for option in get_iname_duplication_options_for_single_kernel(knl, True): yield option # Emit a warning that we needed boostable_into @@ -1076,12 +1077,34 @@ def get_iname_duplication_options(knl, use_boostable_into=False): yield iname, within -def has_schedulable_iname_nesting(knl): +def get_iname_duplication_options(program, use_boostable_into=False): + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + yield from get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of in kernel callable %s." + % (type(in_knl_callable))) + + return + + +def has_schedulable_iname_nesting_for_single_kernel(knl): """ :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. 
""" - return not bool(next(get_iname_duplication_options(knl), False)) + return not bool(next(get_iname_duplication_options_for_single_kernel(knl), + False)) + + +def has_schedulable_iname_nesting(program): + return all(has_schedulable_iname_nesting_for_single_kernel( + in_knl_callable.subkernel) for in_knl_callable in + program.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 13d9c722e..65c91871a 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -36,7 +36,7 @@ from loopy.diagnostic import ( from loopy.kernel.instruction import _DataObliviousInstruction from loopy.program import ProgramCallablesInfo -from loopy.symbolic import SubArrayRef +from loopy.symbolic import SubArrayRef, LinearSubscript from pymbolic.primitives import Variable, Subscript import logging @@ -819,7 +819,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, if kernel.temporary_variables[assignee.name].dtype is None: return False - elif isinstance(assignee, Subscript): + elif isinstance(assignee, (Subscript, LinearSubscript)): if assignee.aggregate.name in kernel.arg_dict: if kernel.arg_dict[assignee.aggregate.name].dtype is None: return False @@ -828,7 +828,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, if kernel.temporary_variables[ assignee.aggregate.name].dtype is None: return False - else: assert isinstance(assignee, SubArrayRef) if assignee.subscript.aggregate.name in kernel.arg_dict: -- GitLab From c63411ae74ccb3430cb9753763fca2a4e6e1e162 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 10 Aug 2018 16:45:03 +0530 Subject: [PATCH 331/916] yield from not supported in python 2. --- loopy/transform/iname.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 75aa62467..93f6c53e8 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1080,8 +1080,9 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals def get_iname_duplication_options(program, use_boostable_into=False): for in_knl_callable in program.program_callables_info.values(): if isinstance(in_knl_callable, CallableKernel): - yield from get_iname_duplication_options_for_single_kernel( - in_knl_callable.subkernel, use_boostable_into) + for option in get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into): + yield option elif isinstance(in_knl_callable, ScalarCallable): pass else: -- GitLab From 3a4db12729a84f8a6269725cecfd0754d6a2a532 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 10 Aug 2018 20:22:22 +0530 Subject: [PATCH 332/916] minor error in program copy. 
--- loopy/program.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index df7bd1bdd..096bd1eca 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -206,11 +206,11 @@ class Program(ImmutableRecord): else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = new_self.program_callables_info.copy( - resolved_functions=new_resolved_functions) + program_callables_info = new_self.program_callables_info.copy( + resolved_functions=new_resolved_functions) - return new_self.copy( - program_callables_info=program_callables_info) + return super(Program, new_self).copy( + program_callables_info=program_callables_info) else: return super(Program, self).copy(**kwargs) -- GitLab From 541978651f12cd6a943293a6f8f86cf4ebce377c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 11 Aug 2018 05:36:38 +0530 Subject: [PATCH 333/916] small changes in tests to pass test_diff --- loopy/transform/data.py | 1 + loopy/transform/diff.py | 12 ++++-------- test/test_diff.py | 3 ++- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 9534279d4..5f4f2f2a7 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -486,6 +486,7 @@ set_array_dim_names = (MovedFunctionDeprecationWrapper( # {{{ remove_unused_arguments +@iterate_over_kernels_if_given_program def remove_unused_arguments(knl): new_args = [] diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index d0edcfd78..54d06605a 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -33,6 +33,7 @@ import loopy as lp from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext from loopy.isl_helpers import make_slab from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel # {{{ diff mapper @@ -370,6 +371,8 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. """ + assert isinstance(knl, LoopKernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=True) @@ -398,14 +401,7 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i", # }}} - # Differentiation lead to addition of new functions to the kernel. - # For example differentiating `sin(x)` -> `cos(x)`. Hence we would need to - # scope `cos(x)`. 
- from loopy.kernel.creation import scope_functions - differentiated_scoped_kernel = scope_functions( - diff_context.get_new_kernel()) - - return differentiated_scoped_kernel, result + return diff_context.get_new_kernel(), result # }}} diff --git a/test/test_diff.py b/test/test_diff.py index b735ab17a..a7fd92987 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -55,7 +55,7 @@ def test_diff(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel( + knl = lp.make_kernel_function( """{ [i,j]: 0<=i,j a = 1/(1+sinh(x[i] + y[j])**2) @@ -66,6 +66,7 @@ def test_diff(ctx_factory): from loopy.transform.diff import diff_kernel dknl, diff_map = diff_kernel(knl, "z", "x") + dknl = lp.make_program_from_kernel(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") -- GitLab From 1bcda9a1764492790b40dd7d7a0dacef92d12915 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 11 Aug 2018 06:45:23 +0530 Subject: [PATCH 334/916] minor error fixes to pass test_loopy --- loopy/library/function.py | 3 ++- loopy/type_inference.py | 9 +++++++-- test/test_loopy.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/loopy/library/function.py b/loopy/library/function.py index 8fcdcd6da..8338875d0 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -47,7 +47,8 @@ class MakeTupleCallable(ScalarCallable): class IndexOfCallable(ScalarCallable): def with_types(self, arg_id_to_dtype, kernel, program_callables_info): - new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype = dict((i, dtype) for i, dtype in + arg_id_to_dtype.items() if dtype is not None) new_arg_id_to_dtype[-1] = kernel.index_dtype return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 65c91871a..cf956f68f 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -37,7 +37,7 @@ from loopy.kernel.instruction import _DataObliviousInstruction from loopy.program import ProgramCallablesInfo from loopy.symbolic import SubArrayRef, LinearSubscript -from pymbolic.primitives import Variable, Subscript +from pymbolic.primitives import Variable, Subscript, Lookup import logging logger = logging.getLogger(__name__) @@ -308,7 +308,9 @@ class TypeInferenceMapper(CombineMapper): # specializing an already specialized function. 
for id, dtype in arg_id_to_dtype.items(): - if in_knl_callable.arg_id_to_dtype[id] != arg_id_to_dtype[id]: + if id in in_knl_callable.arg_id_to_dtype and ( + in_knl_callable.arg_id_to_dtype[id] != + arg_id_to_dtype[id]): # {{{ ignoring the the cases when there is a discrepancy # between np.uint and np.int @@ -810,6 +812,9 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, def _instruction_missed_during_inference(insn): for assignee in insn.assignees: + if isinstance(assignee, Lookup): + assignee = assignee.aggregate + if isinstance(assignee, Variable): if assignee.name in kernel.arg_dict: if kernel.arg_dict[assignee.name].dtype is None: diff --git a/test/test_loopy.py b/test/test_loopy.py index 5baead833..9dc74b94f 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2626,7 +2626,7 @@ def test_fixed_parameters(ctx_factory): def test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") - assert knl.all_params() == set(["n"]) + assert knl.root_kernel.all_params() == set(["n"]) def test_execution_backend_can_cache_dtypes(ctx_factory): -- GitLab From 6b620ac9abf80785e2b121bdcf7dae63675898ab Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 11 Aug 2018 10:23:26 +0530 Subject: [PATCH 335/916] update persistent hash for various classes. --- loopy/kernel/function_interface.py | 8 +++++++- loopy/library/reduction.py | 31 ++++++++++++++++++++++++++++++ loopy/tools.py | 3 ++- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index cbc0e641b..2ea260656 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -45,7 +45,6 @@ class ValueArgDescriptor(ImmutableRecord): hash_fields = () update_persistent_hash = LoopKernel.update_persistent_hash - pass class ArrayArgDescriptor(ImmutableRecord): @@ -90,6 +89,13 @@ class ArrayArgDescriptor(ImmutableRecord): address_space=address_space, dim_tags=dim_tags) + hash_fields = ( + "shape", + "address_space", + "dim_tags") + + update_persistent_hash = LoopKernel.update_persistent_hash + # }}} diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 383337b2f..6ec8e4b21 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -31,6 +31,7 @@ import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.kernel import LoopKernel class ReductionOperation(object): @@ -223,6 +224,11 @@ class ReductionOpFunction(FunctionIdentifier): return type(self)(reduction_op) + hash_fields = ( + "reduction_op",) + + update_persistent_hash = LoopKernel.update_persistent_hash + # }}} @@ -276,12 +282,25 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): which = "sum" op = "((%s) + (%s))" + hash_fields = ( + "which", + "op",) + + update_persistent_hash = LoopKernel.update_persistent_hash + class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): base_reduction_class = ProductReductionOperation op = "((%s) * (%s))" which = "product" + hash_fields = ( + "which", + "op", + "base_reduction_class",) + + update_persistent_hash = LoopKernel.update_persistent_hash + # }}} @@ -332,12 +351,24 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) + + update_persistent_hash = LoopKernel.update_persistent_hash + class 
ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) + + update_persistent_hash = LoopKernel.update_persistent_hash + # }}} diff --git a/loopy/tools.py b/loopy/tools.py index 8c5d36390..b243a7949 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -73,7 +73,8 @@ class LoopyKeyBuilder(KeyBuilderBase): def update_for_dict(self, key_hash, key): # Order matters for the hash--insert in sorted order. - for dict_key in sorted(six.iterkeys(key)): + for dict_key in sorted(six.iterkeys(key), key=lambda obj: + type(obj).__name__ + str(obj)): self.rec(key_hash, (dict_key, key[dict_key])) update_for_defaultdict = update_for_dict -- GitLab From f311a1a43d73be8d31c047f49be08071923fdcdd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 11 Aug 2018 19:40:45 +0530 Subject: [PATCH 336/916] pass the examples? --- examples/python/call-external.py | 22 ++++++++++++++-------- examples/python/global_barrier_removal.py | 2 +- examples/python/hello-loopy.py | 3 ++- examples/python/ispc-stream-harness.py | 2 -- examples/python/sparse.py | 4 ++-- 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/examples/python/call-external.py b/examples/python/call-external.py index 904270472..68618a7ec 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -7,12 +7,14 @@ from loopy.target.c import CTarget # {{{ blas callable class BLASCallable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): for i in range(0, 2): if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: # the types provided aren't mature enough to specialize the # callable - return self.copy(arg_id_to_dtype=arg_id_to_dtype) + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) mat_dtype = arg_id_to_dtype[0].numpy_dtype vec_dtype = arg_id_to_dtype[1].numpy_dtype @@ -32,7 +34,7 @@ class BLASCallable(lp.ScalarCallable): from loopy.types import NumpyType return self.copy(name_in_target=name_in_target, arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), - -1: NumpyType(vec_dtype)}) + -1: NumpyType(vec_dtype)}), program_callables_info def emit_call_insn(self, insn, target, expression_to_code_mapper): assert self.is_ready_for_codegen() @@ -97,9 +99,13 @@ knl = lp.make_kernel( """ y[:] = gemv(A[:, :], x[:]) """, [ - lp.ArrayArg('A', dtype=np.float64, shape=(n, n)), - lp.ArrayArg('x', dtype=np.float64, shape=(n, )), - lp.ArrayArg('y', shape=(n, )), ...], - target=CTarget()) + lp.GlobalArg('A', dtype=np.float64, shape=(n, n)), + lp.GlobalArg('x', dtype=np.float64, shape=(n, )), + lp.GlobalArg('y', shape=(n, )), ...], + target=CTarget(), + lang_version=(2018, 2)) -knl = lp.register_function_lookup(knl, blas_fn_lookup) +knl = lp.register_function_id_to_in_knl_callable_mapper( + knl, blas_fn_lookup) + +print(lp.generate_code_v2(knl).device_code()) diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index 7ab049cd1..cc4926fee 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -24,7 +24,7 @@ from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel -knl = get_one_scheduled_kernel(knl) +knl = get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) # map schedule onto host or 
device print(knl) diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 9098c5444..764cea0e6 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -16,7 +16,8 @@ a = cl.array.arange(queue, n, dtype=np.float32) # ------ knl = lp.make_kernel( "{ [i]: 0<=i Date: Sun, 12 Aug 2018 16:38:04 +0530 Subject: [PATCH 337/916] those were a lot of changes :o --- doc/index.rst | 1 + examples/python/global_barrier_removal.py | 2 +- examples/python/hello-loopy.py | 3 +- examples/python/ispc-stream-harness.py | 2 - examples/python/sparse.py | 4 +- loopy/__init__.py | 36 +- loopy/auto_test.py | 289 ++++++-------- loopy/check.py | 137 ++++++- loopy/cli.py | 2 +- loopy/codegen/__init__.py | 90 ++++- loopy/codegen/control.py | 3 +- loopy/codegen/loop.py | 2 +- loopy/codegen/result.py | 2 +- loopy/isl_helpers.py | 2 +- loopy/kernel/__init__.py | 132 ++++--- loopy/kernel/creation.py | 35 +- loopy/kernel/data.py | 6 +- loopy/kernel/instruction.py | 34 +- loopy/kernel/tools.py | 35 +- loopy/library/function.py | 54 +-- loopy/library/random123.py | 108 ++--- loopy/library/reduction.py | 256 ++++++------ loopy/loop.py | 2 + loopy/preprocess.py | 320 +++++++++++++-- loopy/schedule/__init__.py | 21 +- loopy/statistics.py | 462 ++++++++++++++-------- loopy/symbolic.py | 105 ++++- loopy/target/__init__.py | 9 +- loopy/target/c/__init__.py | 245 ++++++------ loopy/target/c/c_execution.py | 39 +- loopy/target/c/codegen/expression.py | 92 ++--- loopy/target/cuda.py | 98 +++-- loopy/target/execution.py | 116 +++--- loopy/target/ispc.py | 5 +- loopy/target/opencl.py | 209 ++++++---- loopy/target/pyopencl.py | 129 ++++-- loopy/target/pyopencl_execution.py | 61 +-- loopy/target/python.py | 57 ++- loopy/tools.py | 3 +- loopy/transform/add_barrier.py | 12 +- loopy/transform/arithmetic.py | 6 + loopy/transform/batch.py | 8 +- loopy/transform/buffer.py | 43 +- loopy/transform/data.py | 54 ++- loopy/transform/diff.py | 3 + loopy/transform/fusion.py | 56 ++- loopy/transform/iname.py | 60 ++- loopy/transform/instruction.py | 37 +- loopy/transform/padding.py | 15 +- loopy/transform/parameter.py | 6 + loopy/transform/precompute.py | 38 +- loopy/transform/save.py | 27 +- loopy/transform/subst.py | 20 +- loopy/type_inference.py | 354 +++++++++++++++-- test/test_apps.py | 19 +- test/test_c_execution.py | 1 + test/test_diff.py | 3 +- test/test_domain.py | 74 ++-- test/test_fortran.py | 12 +- test/test_loopy.py | 393 +++++++++--------- test/test_numa_diff.py | 4 +- test/test_reduction.py | 46 ++- test/test_target.py | 14 +- test/test_transform.py | 116 +++--- test/testlib.py | 50 ++- 65 files changed, 3071 insertions(+), 1608 deletions(-) diff --git a/doc/index.rst b/doc/index.rst index d862a8acd..0644b34c4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -68,6 +68,7 @@ Please check :ref:`installation` to get started. 
ref_creation ref_kernel ref_transform + ref_call ref_other misc diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index 7ab049cd1..cc4926fee 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -24,7 +24,7 @@ from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel -knl = get_one_scheduled_kernel(knl) +knl = get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) # map schedule onto host or device print(knl) diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 9098c5444..764cea0e6 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -16,7 +16,8 @@ a = cl.array.arange(queue, n, dtype=np.float32) # ------ knl = lp.make_kernel( "{ [i]: 0<=i max_test_kernel_count: - break + if need_check and not AUTO_TEST_SKIP_RUN: + for arg_desc in ref_arg_data: + if arg_desc is None: + continue + if not arg_desc.needs_checking: + continue - kernel = infer_unknown_types(kernel, expect_completion=True) + from pyopencl.compyte.array import as_strided + ref_ary = as_strided( + arg_desc.ref_storage_array.get(), + shape=arg_desc.ref_shape, + strides=arg_desc.ref_numpy_strides).flatten() + test_ary = as_strided( + arg_desc.test_storage_array.get(), + shape=arg_desc.test_shape, + strides=arg_desc.test_numpy_strides).flatten() + common_len = min(len(ref_ary), len(test_ary)) + ref_ary = ref_ary[:common_len] + test_ary = test_ary[:common_len] - compiled = CompiledKernel(ctx, kernel) + error_is_small, error = check_result(test_ary, ref_ary) + if not error_is_small: + raise AutomaticTestFailure(error) - if args is None: - kernel_info = compiled.kernel_info(frozenset()) + need_check = False - args = make_args(kernel, - kernel_info.implemented_data_info, - queue, ref_arg_data, parameters) - args["out_host"] = False + events = [] + queue.finish() - if not quiet: - print(75*"-") - print("Kernel #%d:" % i) - print(75*"-") - if print_code: - print(compiled.get_highlighted_code()) - print(75*"-") - if dump_binary: - print(type(compiled.cl_program)) - print(compiled.cl_program.binaries[0]) - print(75*"-") + logger.info("%s: warmup done" % (test_prog.name)) - logger.info("%s: run warmup" % (knl.name)) + logger.info("%s: timing run" % (test_prog.name)) - for i in range(warmup_rounds): - if not AUTO_TEST_SKIP_RUN: - compiled(queue, **args) - - if need_check and not AUTO_TEST_SKIP_RUN: - for arg_desc in ref_arg_data: - if arg_desc is None: - continue - if not arg_desc.needs_checking: - continue - - from pyopencl.compyte.array import as_strided - ref_ary = as_strided( - arg_desc.ref_storage_array.get(), - shape=arg_desc.ref_shape, - strides=arg_desc.ref_numpy_strides).flatten() - test_ary = as_strided( - arg_desc.test_storage_array.get(), - shape=arg_desc.test_shape, - strides=arg_desc.test_numpy_strides).flatten() - common_len = min(len(ref_ary), len(test_ary)) - ref_ary = ref_ary[:common_len] - test_ary = test_ary[:common_len] - - error_is_small, error = check_result(test_ary, ref_ary) - if not error_is_small: - raise AutomaticTestFailure(error) - - need_check = False - - events = [] - queue.finish() + timing_rounds = warmup_rounds - logger.info("%s: warmup done" % (knl.name)) + while True: + from time import time + start_time = time() - logger.info("%s: timing run" % (knl.name)) + evt_start = cl.enqueue_marker(queue) - timing_rounds = warmup_rounds + for i in range(timing_rounds): + if 
not AUTO_TEST_SKIP_RUN: + evt, _ = test_prog(queue, **args) + events.append(evt) + else: + events.append(cl.enqueue_marker(queue)) - while True: - from time import time - start_time = time() + evt_end = cl.enqueue_marker(queue) - evt_start = cl.enqueue_marker(queue) + queue.finish() + stop_time = time() - for i in range(timing_rounds): - if not AUTO_TEST_SKIP_RUN: - evt, _ = compiled(queue, **args) - events.append(evt) - else: - events.append(cl.enqueue_marker(queue)) + for evt in events: + evt.wait() + evt_start.wait() + evt_end.wait() - evt_end = cl.enqueue_marker(queue) + elapsed_event = (1e-9*events[-1].profile.END + - 1e-9*events[0].profile.START) \ + / timing_rounds + try: + elapsed_event_marker = ((1e-9*evt_end.profile.START + - 1e-9*evt_start.profile.START) + / timing_rounds) + except cl.RuntimeError: + elapsed_event_marker = None - queue.finish() - stop_time = time() + elapsed_wall = (stop_time-start_time)/timing_rounds - for evt in events: - evt.wait() - evt_start.wait() - evt_end.wait() + if elapsed_wall * timing_rounds < 0.3: + timing_rounds *= 4 + else: + break - elapsed_event = (1e-9*events[-1].profile.END - - 1e-9*events[0].profile.START) \ - / timing_rounds - try: - elapsed_event_marker = ((1e-9*evt_end.profile.START - - 1e-9*evt_start.profile.START) - / timing_rounds) - except cl.RuntimeError: - elapsed_event_marker = None + logger.info("%s: timing run done" % (test_prog.name)) - elapsed_wall = (stop_time-start_time)/timing_rounds + rates = "" + for cnt, lbl in zip(op_count, op_label): + rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - if elapsed_wall * timing_rounds < 0.3: - timing_rounds *= 4 + if not quiet: + def format_float_or_none(v): + if v is None: + return "" else: - break + return "%g" % v - logger.info("%s: timing run done" % (knl.name)) + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) - rates = "" + if do_check: + ref_rates = "" for cnt, lbl in zip(op_count, op_label): - rates += " %g %s/s" % (cnt/elapsed_wall, lbl) - + ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) if not quiet: - def format_float_or_none(v): - if v is None: - return "" - else: - return "%g" % v - - print("elapsed: %s s event, %s s marker-event %s s wall " - "(%d rounds)%s" % ( - format_float_or_none(elapsed_event), - format_float_or_none(elapsed_event_marker), - format_float_or_none(elapsed_wall), timing_rounds, rates)) - - if do_check: - ref_rates = "" - for cnt, lbl in zip(op_count, op_label): - ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl) - if not quiet: - print("ref: elapsed: %g s event, %g s wall%s" % ( - ref_elapsed_event, ref_elapsed_wall, ref_rates)) + print("ref: elapsed: %g s event, %g s wall%s" % ( + ref_elapsed_event, ref_elapsed_wall, ref_rates)) # }}} diff --git a/loopy/check.py b/loopy/check.py index c31304d87..ae5599bc4 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -27,9 +27,13 @@ from six.moves import range from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper +from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction) +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -56,6 +60,73 @@ def 
check_identifiers_in_subst_rules(knl): % (knl.name, rule.name, ", ".join(deps-rule_allowed_identifiers))) + +class UnscopedCallCollector(CombineMapper): + """ + Collects all the unscoped calls within a kernel. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the kernel. + + .. note:: + :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are + never scoped in the pipeline. + """ + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from pymbolic.primitives import CallWithKwargs + return self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={})) + + def map_call_with_kwargs(self, expr): + from loopy.library.reduction import ArgExtOp + if not isinstance(expr.function, (ResolvedFunction, ArgExtOp)): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + else: + return self.combine((self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values()))) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_scoped(kernel): + """ Checks if all the calls in the instruction expression have been scoped, + otherwise indicates to what all calls we await signature. Refer + :class:`loopy.symbolic.ResolvedFunction` for a detailed explanation of a + scoped function. + """ + + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnscopedCallCollector()(subst_expander( + insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' obtained -- register a " + "function or a kernel corresponding to it." % + set(unscoped_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown type of instruction %s" % type(insn).__name__) + # }}} @@ -114,6 +185,18 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def _get_all_unique_iname_tags(kernel): + """Returns a set of all the iname tags used in *kernel* that + inherit from :class:`loopy.kernel.data.UniqueTag`. 
+ """ + from loopy.kernel.data import UniqueTag + iname_tags = [kernel.iname_to_tag.get(iname) for iname in + kernel.all_inames()] + return set( + tag for tag in iname_tags if + isinstance(tag, UniqueTag)) + + def check_multiple_tags_allowed(kernel): from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag, UnrollTag, ForceSequentialTag, IlpBaseTag, filter_iname_tags_by_type) @@ -128,8 +211,10 @@ def check_multiple_tags_allowed(kernel): "tags: {1}".format(iname, tags)) -def check_for_double_use_of_hw_axes(kernel): +def check_for_double_use_of_hw_axes(kernel, program_callables_info): from loopy.kernel.data import UniqueTag + from loopy.kernel.instruction import CallInstruction + from loopy.kernel.function_interface import CallableKernel for insn in kernel.instructions: insn_tag_keys = set() @@ -142,6 +227,21 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) + # check usage of iname tags in the callee kernel + if isinstance(insn, CallInstruction): + in_knl_callable = program_callables_info[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + # check for collision in iname_tag keys in the instruction + # due to the callee kernel + common_iname_tags = [tag for tag in + _get_all_unique_iname_tags(in_knl_callable.subkernel) + if tag.key in insn_tag_keys] + if common_iname_tags: + raise LoopyError("instruction '%s' has multiple " + "inames tagged '%s'" % (insn.id, + common_iname_tags.pop())) + def check_for_inactive_iname_access(kernel): for insn in kernel.instructions: @@ -387,11 +487,12 @@ def check_write_destinations(kernel): # {{{ check_has_schedulable_iname_nesting def check_has_schedulable_iname_nesting(kernel): - from loopy.transform.iname import (has_schedulable_iname_nesting, - get_iname_duplication_options) - if not has_schedulable_iname_nesting(kernel): + from loopy.transform.iname import ( + has_schedulable_iname_nesting_for_single_kernel, + get_iname_duplication_options_for_single_kernel) + if not has_schedulable_iname_nesting_for_single_kernel(kernel): import itertools as it - opt = get_iname_duplication_options(kernel) + opt = get_iname_duplication_options_for_single_kernel(kernel) opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w) for i, w in it.islice(opt, 3)) raise LoopyError("Kernel does not have a schedulable iname nesting. 
" @@ -616,13 +717,13 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel): +def pre_schedule_checks(kernel, program_callables_info): try: logger.debug("%s: pre-schedule check: start" % kernel.name) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel) + check_for_double_use_of_hw_axes(kernel, program_callables_info) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) @@ -650,7 +751,8 @@ def pre_schedule_checks(kernel): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, + sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at, gather_schedule_block) @@ -665,7 +767,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): assert isinstance(kernel.schedule[sched_index], CallKernel) _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + program_callables_info) group_axes = set(ax for ax, length in enumerate(group_size)) local_axes = set(ax for ax, length in enumerate(local_size)) @@ -682,7 +785,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): while i < loop_end_i: sched_item = kernel.schedule[i] if isinstance(sched_item, CallKernel): - i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i) + i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, + program_callables_info, i) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -733,9 +837,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): return past_end_i -def check_for_unused_hw_axes_in_insns(kernel): +def check_for_unused_hw_axes_in_insns(kernel, program_callables_info): if kernel.schedule: - _check_for_unused_hw_axes_in_kernel_chunk(kernel) + _check_for_unused_hw_axes_in_kernel_chunk(kernel, + program_callables_info) # }}} @@ -889,15 +994,15 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel): +def pre_codegen_checks(kernel, program_callables_info): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel) + check_for_unused_hw_axes_in_insns(kernel, program_callables_info) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel) + kernel.target.pre_codegen_check(kernel, program_callables_info) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/cli.py b/loopy/cli.py index a92922b18..060340d59 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -205,7 +205,7 @@ def main(): new_kernels = [] for kernel in kernels: new_args = [ - lp.ArrayArg("occa_info", np.int32, shape=None) + lp.GlobalArg("occa_info", np.int32, shape=None) ] + kernel.args new_kernels.append(kernel.copy(args=new_args)) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 11f874e1b..3e675db75 100644 --- a/loopy/codegen/__init__.py +++ 
b/loopy/codegen/__init__.py @@ -32,6 +32,10 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION +from loopy.kernel.function_interface import CallableKernel +from cgen import Collection + + import logging logger = logging.getLogger(__name__) @@ -146,6 +150,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState(object): """ .. attribute:: kernel + .. attribute:: target .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -187,17 +192,21 @@ class CodeGenerationState(object): generated. .. attribute:: schedule_index_end + + .. attribute:: program_callables_info """ - def __init__(self, kernel, + def __init__(self, kernel, target, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + program_callables_info, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, schedule_index_end=None): self.kernel = kernel + self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -206,6 +215,7 @@ class CodeGenerationState(object): self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex + self.program_callables_info = program_callables_info self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -214,7 +224,7 @@ class CodeGenerationState(object): # {{{ copy helpers - def copy(self, kernel=None, implemented_data_info=None, + def copy(self, kernel=None, target=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), var_subst_map=None, vectorization_info=None, is_generating_device_code=None, @@ -224,6 +234,9 @@ class CodeGenerationState(object): if kernel is None: kernel = self.kernel + if target is None: + target = self.target + if implemented_data_info is None: implemented_data_info = self.implemented_data_info @@ -244,6 +257,7 @@ class CodeGenerationState(object): return CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( @@ -253,6 +267,7 @@ class CodeGenerationState(object): seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + program_callables_info=self.program_callables_info, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -374,19 +389,15 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel): +def generate_code_for_a_single_kernel(kernel, program_callables_info, target): """ :returns: a :class:`CodeGenerationResult` """ from loopy.kernel import KernelState - if kernel.state == KernelState.INITIAL: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, program_callables_info) if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a 
kernel that has not been " @@ -407,11 +418,8 @@ def generate_code_v2(kernel): # }}} - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) + pre_codegen_checks(kernel, program_callables_info) logger.info("%s: generate code: start" % kernel.name) @@ -469,10 +477,12 @@ def generate_code_v2(kernel): gen_program_name=( kernel.target.host_program_name_prefix + kernel.name - + kernel.target.host_program_name_suffix), - schedule_index_end=len(kernel.schedule)) + + target.host_program_name_suffix), + schedule_index_end=len(kernel.schedule), + program_callables_info=program_callables_info) from loopy.codegen.result import generate_host_or_device_program + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) @@ -502,7 +512,7 @@ def generate_code_v2(kernel): ) preamble_generators = (kernel.preamble_generators - + kernel.target.get_device_ast_builder().preamble_generators()) + + target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -524,6 +534,56 @@ def generate_code_v2(kernel): return codegen_result +def generate_code_v2(program): + from loopy.kernel import LoopKernel + from loopy.program import make_program_from_kernel + + if isinstance(program, LoopKernel): + program = make_program_from_kernel(program) + + from loopy.kernel import KernelState + if program.root_kernel.state == KernelState.INITIAL: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) + + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + + codegen_results = {} + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + codegen_results[func_id] = ( + generate_code_for_a_single_kernel(in_knl_callable.subkernel, + program.program_callables_info, program.target)) + + device_preambles = set() + for cgr in codegen_results.values(): + device_preambles.update(cgr.device_preambles) + + for in_knl_callable in program.program_callables_info.values(): + for preamble in in_knl_callable.generate_preambles(program.target): + device_preambles.update([preamble]) + + collective_device_program = codegen_results[program.name].device_programs[0] + for func_id, callee_cgr in codegen_results.items(): + if func_id != program.name: + assert len(callee_cgr.device_programs) == 1 + callee_prog_ast = callee_cgr.device_programs[0].ast + collective_device_program = collective_device_program.copy( + ast=Collection([callee_prog_ast, collective_device_program.ast])) + + device_preambles.update([('98_%s' % func_id, + str(callee_prog_ast.fdecl)), ]) + + collective_device_programs = [collective_device_program] + ( + codegen_results[program.name].device_programs[1:]) + + return codegen_results[program.name].copy( + device_programs=collective_device_programs, + device_preambles=device_preambles) + + def generate_code(kernel, device=None): if device is not None: from warnings import warn diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 45e2a18c4..90bdbda31 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -115,7 +115,8 @@ def generate_code_for_sched_index(codegen_state, sched_index): new_codegen_state, sched_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - 
get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + codegen_state.program_callables_info) return merge_codegen_results(codegen_state, [ codegen_result, diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index ebddf3153..39cf20c7d 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -248,7 +248,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block) + insn_ids_for_block, codegen_state.program_callables_info) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 4318ad71c..00f19d99a 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -133,7 +133,7 @@ class CodeGenerationResult(ImmutableRecord): preamble_codes = process_preambles( getattr(self, "host_preambles", []) + - getattr(self, "device_preambles", []) + list(getattr(self, "device_preambles", [])) ) return ( diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 5a747d070..ef07b7e27 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -27,7 +27,7 @@ THE SOFTWARE. from six.moves import range, zip -from loopy.diagnostic import StaticValueFindingError +from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl from islpy import dim_type diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6b0033808..d2723c57f 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -37,10 +37,6 @@ import re from pytools import UniqueNameGenerator, generate_unique_names -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError from loopy.tools import natsorted from loopy.diagnostic import StaticValueFindingError @@ -224,6 +220,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: target A subclass of :class:`loopy.TargetBase`. + + .. attribute:: is_called_from_host + + An instance of :class:`bool`. Will be set *False* for the kernel which + would be called from another top level kernels. Default value is + *True*. 
""" # {{{ constructor @@ -252,6 +254,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, + is_called_from_host=True, + overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): """ @@ -277,15 +281,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if substitutions is None: substitutions = {} if function_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] - if symbol_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] + function_manglers = [] if iname_slab_increments is None: iname_slab_increments = {} @@ -372,6 +368,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, + is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) @@ -380,7 +377,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling + # {{{ function mangling/scoping def mangle_function(self, identifier, arg_dtypes, ast_builder=None): if ast_builder is None: @@ -1039,21 +1036,25 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) - @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, + program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs - *global_size* and *local_size* are :class:`islpy.PwAff` objects. + *global_size* and *local_size* are instances of :class:`dict` with + mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. """ - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - ignore_auto=ignore_auto) + # {{{ collecting the callee kernels in insn_ids + + from loopy.kernel.tools import get_direct_callee_kernels + callee_kernels = get_direct_callee_kernels(self, + program_callables_info, insn_ids) + + # }}} all_inames_by_insns = set() for insn_id in insn_ids: @@ -1068,6 +1069,15 @@ class LoopKernel(ImmutableRecordWithoutPickling): global_sizes = {} local_sizes = {} + # updating the grid sizes from the callee_kernels. + for callee_kernel in callee_kernels: + gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts( + frozenset(insn.id for insn in callee_kernel.instructions), + program_callables_info, ignore_auto) + + global_sizes.update(gsize) + local_sizes.update(lsize) + from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) @@ -1108,6 +1118,31 @@ class LoopKernel(ImmutableRecordWithoutPickling): tgt_dict[tag.axis] = size + return global_sizes, local_sizes + + def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, + ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
+ """ + + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + program_callables_info=program_callables_info, + ignore_auto=ignore_auto) + + assert self.is_called_from_host, ("Callee kernels do not have sufficient " + "information to compute grid sizes.") + + global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( + insn_ids, program_callables_info, ignore_auto=ignore_auto) + def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1137,7 +1172,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, + program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. @@ -1148,7 +1184,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, ignore_auto) + insn_ids, program_callables_info, ignore_auto) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1156,7 +1192,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, ignore_auto=False): + def get_grid_size_upper_bounds(self, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1164,9 +1200,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), + program_callables_info, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + def get_grid_size_upper_bounds_as_exprs(self, program_callables_info, + ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. 
@@ -1175,6 +1213,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), + program_callables_info, ignore_auto=ignore_auto) # }}} @@ -1365,47 +1404,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ implementation arguments - - @property - @memoize_method - def impl_arg_to_arg(self): - from loopy.kernel.array import ArrayBase - - result = {} - - for arg in self.args: - if not isinstance(arg, ArrayBase): - result[arg.name] = arg - continue - - if arg.shape is None or arg.dim_tags is None: - result[arg.name] = arg - continue - - subscripts_and_names = arg.subscripts_and_names() - if subscripts_and_names is None: - result[arg.name] = arg - continue - - for index, sub_arg_name in subscripts_and_names: - result[sub_arg_name] = arg - - return result - - # }}} - # {{{ direct execution def __call__(self, *args, **kwargs): - key = self.target.get_kernel_executor_cache_key(*args, **kwargs) - try: - kex = self._kernel_executor_cache[key] - except KeyError: - kex = self.target.get_kernel_executor(self, *args, **kwargs) - self._kernel_executor_cache[key] = kex - - return kex(*args, **kwargs) + # FIXME: scream and then convert to a program + from loopy.program import make_program_from_kernel + program = make_program_from_kernel(self) + return program(*args, **kwargs) # }}} @@ -1489,6 +1494,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "silenced_warnings", "options", "state", + "is_called_from_host", "target", ) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index c42db3482..bac4afc85 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -24,16 +24,19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - import numpy as np from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids -from loopy.symbolic import IdentityMapper, WalkMapper +from loopy.symbolic import ( + IdentityMapper, WalkMapper, SubArrayRef) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace) +from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, + CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -504,9 +507,11 @@ def parse_insn(groups, insn_options): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) + elif isinstance(inner_lhs_i, SubArrayRef): + assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable or subscript" % (lhs_i,)) + "be variable, subscript or a SubArrayRef" % (lhs_i,)) new_lhs.append(lhs_i) @@ -1139,7 +1144,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() - from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace + from loopy.kernel.data import ValueArg, ArrayArg import loopy as lp if arg_name in self.all_params: @@ -1664,7 +1669,7 @@ def _is_wildcard(s): def _resolve_dependencies(what, knl, insn, deps): - from loopy import find_instructions + from loopy.transform.instruction import find_instructions_in_single_kernel from loopy.match import MatchExpressionBase new_deps = [] @@ -1673,7 +1678,7 @@ def _resolve_dependencies(what, knl, insn, deps): found_any = False if isinstance(dep, MatchExpressionBase): - for new_dep in find_instructions(knl, dep): + for new_dep in find_instructions_in_single_kernel(knl, dep): if new_dep.id != insn.id: new_deps.append(new_dep.id) found_any = True @@ -1954,6 +1959,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) fixed_parameters = kwargs.pop("fixed_parameters", {}) + make_program = kwargs.pop("make_program", True) if defines: from warnings import warn @@ -2165,15 +2171,24 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + from loopy.kernel.tools import infer_arg_is_output_only + knl = infer_arg_is_output_only(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) creation_plog.done() - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + if make_program: + from loopy.program import make_program_from_kernel + return make_program_from_kernel(knl) + else: + return knl + - return knl +def make_kernel_function(*args, **kwargs): + kwargs['make_program'] = False + return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 3e776bd06..9ba288961 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -337,6 +337,7 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) ImmutableRecord.__init__(self, **kwargs) @@ -362,7 +363,7 @@ class ArrayArg(ArrayBase, KernelArgument): def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise 
TypeError("'address_space' must be specified") - kwargs["is_output_only"] = kwargs.pop("is_output_only", False) + kwargs["is_output_only"] = kwargs.pop("is_output_only", None) super(ArrayArg, self).__init__(*args, **kwargs) @@ -402,6 +403,9 @@ class ConstantArg(ArrayBase, KernelArgument): min_target_axes = 0 max_target_axes = 1 + # Constant Arg cannot be an output + is_output_only = False + def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape, dtype, is_written) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index e9c7bde9f..0f548bba7 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -487,7 +487,7 @@ class InstructionBase(ImmutableRecord): def _get_assignee_var_name(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript + from loopy.symbolic import LinearSubscript, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -506,13 +506,20 @@ def _get_assignee_var_name(expr): assert isinstance(agg, Variable) return agg.name + + elif isinstance(expr, SubArrayRef): + agg = expr.subscript.aggregate + assert isinstance(agg, Variable) + + return agg.name + else: raise RuntimeError("invalid lvalue '%s'" % expr) def _get_assignee_subscript_deps(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, get_dependencies + from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef if isinstance(expr, Lookup): expr = expr.aggregate @@ -523,6 +530,8 @@ def _get_assignee_subscript_deps(expr): return get_dependencies(expr.index) elif isinstance(expr, LinearSubscript): return get_dependencies(expr.index) + elif isinstance(expr, SubArrayRef): + return get_dependencies(expr.get_begin_subscript().index) else: raise RuntimeError("invalid lvalue '%s'" % expr) @@ -942,12 +951,12 @@ class Assignment(MultiAssignmentBase): def assignee_subscript_deps(self): return (_get_assignee_subscript_deps(self.assignee),) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, *args, **kwargs): return self.copy( - assignee=f(self.assignee, *args), - expression=f(self.expression, *args), + assignee=f(self.assignee, *args, **kwargs), + expression=f(self.expression, *args, **kwargs), predicates=frozenset( - f(pred, *args) for pred in self.predicates)) + f(pred, *args, **kwargs) for pred in self.predicates)) # }}} @@ -1052,9 +1061,10 @@ class CallInstruction(MultiAssignmentBase): forced_iname_deps=forced_iname_deps, forced_iname_deps_is_final=forced_iname_deps_is_final) - from pymbolic.primitives import Call + from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)) and expression is not None: + if not isinstance(expression, (Call, CallWithKwargs, Reduction)) and ( + expression is not None): raise LoopyError("'expression' argument to CallInstruction " "must be a function call") @@ -1094,12 +1104,12 @@ class CallInstruction(MultiAssignmentBase): _get_assignee_subscript_deps(a) for a in self.assignees) - def with_transformed_expressions(self, f, *args): + def with_transformed_expressions(self, f, *args, **kwargs): return self.copy( - assignees=f(self.assignees, *args), - expression=f(self.expression, *args), + assignees=f(self.assignees, *args, **kwargs), + expression=f(self.expression, *args, **kwargs), predicates=frozenset( - 
f(pred, *args) for pred in self.predicates)) + f(pred, *args, **kwargs) for pred in self.predicates)) # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 95c3c336c..3c0c24434 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -36,6 +36,7 @@ from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg from loopy.tools import natsorted +from loopy.program import Program import logging logger = logging.getLogger(__name__) @@ -43,19 +44,25 @@ logger = logging.getLogger(__name__) # {{{ add and infer argument dtypes -def add_dtypes(knl, dtype_dict): +def add_dtypes(program, dtype_dict): """Specify remaining unspecified argument/temporary variable types. :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ - dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(knl, dtype_dict) + root_kernel = program.root_kernel + dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes( + root_kernel, dtype_dict) if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) + root_kernel - return knl.copy(args=new_args, temporary_variables=new_temp_vars) + root_kernel_with_added_dtypes = ( + root_kernel.copy(args=new_args, temporary_variables=new_temp_vars)) + + return program.with_root_kernel(root_kernel_with_added_dtypes) def _add_dtypes_overdetermined(knl, dtype_dict): @@ -107,7 +114,8 @@ def get_arguments_with_incomplete_dtype(knl): if arg.dtype is None] -def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False): +def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): + assert isinstance(prog, Program) processed_dtype_dict = {} for k, v in six.iteritems(dtype_dict): @@ -116,10 +124,10 @@ def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False): if subkey: processed_dtype_dict[subkey] = v - knl = add_dtypes(knl, processed_dtype_dict) + prog = add_dtypes(prog, processed_dtype_dict) from loopy.type_inference import infer_unknown_types - return infer_unknown_types(knl, expect_completion=expect_completion) + return infer_unknown_types(prog, expect_completion=expect_completion) def _add_and_infer_dtypes_overdetermined(knl, dtype_dict): @@ -747,7 +755,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, axis=0, local_size=None): +def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -761,7 +769,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - ignore_auto=True) + program_callables_info, ignore_auto=True) # {{{ axis assignment helper function @@ -789,6 +797,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): return assign_automatic_axes( kernel.copy(iname_to_tags=new_iname_to_tags), + program_callables_info, axis=recursion_axis) if axis is None: @@ -828,7 +837,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: - from loopy import split_iname, untag_inames + from loopy import untag_inames + from loopy.transform.iname import split_iname # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. 
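The loopy/kernel/tools.py hunks above switch add_dtypes and add_and_infer_dtypes over to operating on a Program and rebuilding its root kernel, so a typical call now goes through a program object end to end. The snippet below is only an illustrative usage sketch of that convention, not a test from this series; the kernel body and dtype are made up, and exact defaults may differ at this point in the branch.

    import numpy as np
    import loopy as lp

    # make_kernel now returns a Program (see the creation.py hunks above);
    # add_and_infer_dtypes asserts it is given one and rebuilds its root kernel.
    prog = lp.make_kernel(
            "{ [i]: 0 <= i < n }",
            "out[i] = 2 * a[i]")
    prog = lp.add_and_infer_dtypes(prog, {"a": np.float32})
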
@@ -839,6 +849,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), + program_callables_info=program_callables_info, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -860,7 +871,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): del new_iname_to_tags[iname] return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), - axis=recursion_axis, local_size=local_size) + program_callables_info, axis=recursion_axis, local_size=local_size) # }}} @@ -928,7 +939,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if axis >= len(local_size): return kernel else: - return assign_automatic_axes(kernel, axis=axis+1, + return assign_automatic_axes(kernel, + program_callables_info=program_callables_info, axis=axis+1, local_size=local_size) # }}} @@ -1866,6 +1878,7 @@ def infer_arg_is_output_only(kernel): """ from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] + for arg in kernel.args: if isinstance(arg, (ArrayArg, ImageArg, ValueArg)): if arg.is_output_only is not None: diff --git a/loopy/library/function.py b/loopy/library/function.py index 9d557ac9f..8338875d0 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -22,38 +22,48 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from loopy.kernel.function_interface import ScalarCallable -def default_function_mangler(kernel, name, arg_dtypes): - from loopy.library.reduction import reduction_function_mangler - manglers = [reduction_function_mangler, tuple_function_mangler] - for mangler in manglers: - result = mangler(kernel, name, arg_dtypes) - if result is not None: - return result +class MakeTupleCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] - return None + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple"), program_callables_info) + def with_descrs(self, arg_id_to_descr, program_callables_info): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), + (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) -def single_arg_function_mangler(kernel, name, arg_dtypes): - if len(arg_dtypes) == 1: - dtype, = arg_dtypes + return ( + self.copy(arg_id_to_descr=new_arg_id_to_descr), + program_callables_info) - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo(name, (dtype,), (dtype,)) - return None +class IndexOfCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + new_arg_id_to_dtype = dict((i, dtype) for i, dtype in + arg_id_to_dtype.items() if dtype is not None) + new_arg_id_to_dtype[-1] = kernel.index_dtype + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), + program_callables_info) -def tuple_function_mangler(kernel, name, arg_dtypes): - if name == "make_tuple": - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="loopy_make_tuple", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - return None +def loopy_specific_callable_scopers(target, identifier): + if identifier == 
"make_tuple": + return MakeTupleCallable(name="make_tuple") + + if identifier in ["indexof", "indexof_vec"]: + return IndexOfCallable(name=identifier) + + from loopy.library.reduction import reduction_scoper + return reduction_scoper(target, identifier) # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index b8633114d..59ca72df1 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -27,6 +27,7 @@ THE SOFTWARE. from pytools import ImmutableRecord from mako.template import Template +from loopy.kernel.function_interface import ScalarCallable import numpy as np @@ -163,60 +164,77 @@ double${ width } ${ name }_f64( # }}} -def random123_preamble_generator(preamble_info): - for f in preamble_info.seen_functions: - try: - rng_variant = FUNC_NAMES_TO_RNG[f.name] - except KeyError: - continue +class Random123Callable(ScalarCallable): + """ + Records information about for the random123 functions. + """ + + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return (self.copy(), + program_callables_info) + + name = self.name + target = kernel.target + + rng_variant = FUNC_NAMES_TO_RNG[name] + + from loopy.types import NumpyType + base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] + ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) + key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) + + fn = rng_variant.full_name + if name == fn: + new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return ( + self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=fn+"_gen"), + program_callables_info) + + elif name == fn + "_f32": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name), program_callables_info + + elif name == fn + "_f64": + new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), + rng_variant.width), + -2: ctr_dtype, 0: ctr_dtype, 1: + key_dtype} + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name), program_callables_info + + return (self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + def generate_preambles(self, target): + rng_variant = FUNC_NAMES_TO_RNG[self.name] from loopy.target.pyopencl import PyOpenCLTarget yield ("90-random123-"+rng_variant.full_name, PREAMBLE_TEMPLATE.render( is_pyopencl_target=isinstance( - preamble_info.kernel.target, + target, PyOpenCLTarget), rng_variant=rng_variant, )) + return -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - except KeyError: - return None - - from loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - 
target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None + +def random123_function_scoper(target, identifier): + if identifier in FUNC_NAMES_TO_RNG: + return Random123Callable(name=identifier) + + return None # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 8ed5cbe56..6ec8e4b21 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2012 Andreas Kloeckner" @@ -24,11 +24,14 @@ THE SOFTWARE. from pymbolic import var +from loopy.symbolic import ResolvedFunction +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.kernel import LoopKernel class ReductionOperation(object): @@ -81,6 +84,9 @@ class ReductionOperation(object): raise LoopyError("unable to parse reduction type: '%s'" % op_type) + def get_scalar_callables(self): + return frozenset() + class ScalarReductionOperation(ReductionOperation): def __init__(self, forced_result_type=None): @@ -180,7 +186,10 @@ class MaxReductionOperation(ScalarReductionOperation): return get_ge_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("max")(operand1, operand2) + return ResolvedFunction("max")(operand1, operand2) + + def get_scalar_callables(self): + return frozenset(["max"]) class MinReductionOperation(ScalarReductionOperation): @@ -188,7 +197,10 @@ class MinReductionOperation(ScalarReductionOperation): return get_le_neutral(dtype) def __call__(self, dtype, operand1, operand2): - return var("min")(operand1, operand2) + return ResolvedFunction("min")(operand1, operand2) + + def get_scalar_callables(self): + return frozenset(["min"]) # {{{ base class for symbolic reduction ops @@ -212,6 +224,11 @@ class ReductionOpFunction(FunctionIdentifier): return type(self)(reduction_op) + hash_fields = ( + "reduction_op",) + + update_persistent_hash = LoopKernel.update_persistent_hash + # }}} @@ -237,7 +254,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype): scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)) def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): @@ -254,7 +271,10 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return type(self) == type(other) def __call__(self, dtypes, operand1, operand2): - return SegmentedOp(self)(*(operand1 + operand2)) + return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2)) + + def get_scalar_callables(self): + return frozenset(["make_tuple", SegmentedOp(self)]) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -262,34 +282,24 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): which = "sum" op = "((%s) + (%s))" + hash_fields = ( + "which", + "op",) + + update_persistent_hash = 
LoopKernel.update_persistent_hash + class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): base_reduction_class = ProductReductionOperation op = "((%s) * (%s))" which = "product" + hash_fields = ( + "which", + "op", + "base_reduction_class",) -def get_segmented_function_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + update_persistent_hash = LoopKernel.update_persistent_hash # }}} @@ -313,7 +323,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, + return ResolvedFunction("make_tuple")(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)) def __str__(self): @@ -330,7 +340,10 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2)) + + def get_scalar_callables(self): + return frozenset([self.which, "make_tuple", ArgExtOp(self)]) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -338,43 +351,23 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) + + update_persistent_hash = LoopKernel.update_persistent_hash + class ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { - *index_out = index2; - return op2; - } - else - { - *index_out = index1; - return op1; - } - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=kernel.target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) + update_persistent_hash = LoopKernel.update_persistent_hash # }}} @@ -429,70 +422,93 @@ def parse_reduction_op(name): # }}} -def reduction_function_mangler(kernel, func_id, arg_dtypes): - if isinstance(func_id, ArgExtOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - 
target_name="%s_op" % op.prefix( - scalar_dtype, index_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, index_dtype), - arg_dtypes=( - scalar_dtype, - index_dtype, - scalar_dtype, - index_dtype), - ) - - elif isinstance(func_id, SegmentedOp): - from loopy.target.opencl import CTarget - if not isinstance(kernel.target, CTarget): - raise LoopyError("%s: only C-like targets supported for now" % func_id) - - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - segment_flag_dtype = arg_dtypes[1] - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="%s_op" % op.prefix( - scalar_dtype, segment_flag_dtype), - result_dtypes=op.result_dtypes( - kernel, scalar_dtype, segment_flag_dtype), - arg_dtypes=( - scalar_dtype, - segment_flag_dtype, - scalar_dtype, - segment_flag_dtype), - ) +# {{{ reduction specific callables + +class ReductionCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + scalar_dtype = arg_id_to_dtype[0] + index_dtype = arg_id_to_dtype[1] + result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype, + index_dtype) + new_arg_id_to_dtype = arg_id_to_dtype.copy() + new_arg_id_to_dtype[-1] = result_dtypes[0] + new_arg_id_to_dtype[-2] = result_dtypes[1] + name_in_target = self.name.reduction_op.prefix(scalar_dtype, + index_dtype) + "_op" + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target=name_in_target), program_callables_info + + def with_descr(self, arg_id_to_descr, program_callables_info): + from loopy.library.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = arg_id_to_descr.copy() + new_arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + program_callables_info) + + def generate_preambles(self, target): + if isinstance(self.name, ArgExtOp): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, index_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(index_t)s index1, + %(scalar_t)s op2, %(index_t)s index2, + %(index_t)s *index_out) + { + if (op2 %(comp)s op1) + { + *index_out = index2; + return op2; + } + else + { + *index_out = index1; + return op1; + } + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + elif isinstance(self.name, SegmentedOp): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + + yield (prefix, """ + inline %(scalar_t)s %(prefix)s_op( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? 
op2 : %(combined)s; + } + """ % dict( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return + + +def reduction_scoper(target, identifier): + if isinstance(identifier, (ArgExtOp, SegmentedOp)): + return ReductionCallable(name=identifier) return None - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +# }}} # vim: fdm=marker diff --git a/loopy/loop.py b/loopy/loop.py index 459246382..66d413987 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -25,6 +25,7 @@ THE SOFTWARE. import islpy as isl import six +from loopy.program import iterate_over_kernels_if_given_program def potential_loop_nest_map(kernel): @@ -55,6 +56,7 @@ def potential_loop_nest_map(kernel): return result +@iterate_over_kernels_if_given_program def fuse_loop_domains(kernel): from loopy.kernel.tools import is_domain_dependent_on_inames diff --git a/loopy/preprocess.py b/loopy/preprocess.py index fc950c78e..3657967a1 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -27,7 +27,6 @@ import six from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) - import islpy as isl from pytools.persistent_dict import WriteOncePersistentDict @@ -37,13 +36,19 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types +from loopy.symbolic import RuleAwareIdentityMapper +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + CallInstruction, _DataObliviousInstruction) +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) # {{{ prepare for caching +@iterate_over_kernels_if_given_program def prepare_for_caching(kernel): import loopy as lp new_args = [] @@ -885,9 +890,9 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, - automagic_scans_ok=False, force_scan=False, - force_outer_iname_for_scan=None): +def realize_reduction_for_single_kernel(kernel, program_callables_info, + insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, + force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. 
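Two adaptation patterns recur in this patch: decorating single-kernel transforms with iterate_over_kernels_if_given_program (as done for fuse_loop_domains and prepare_for_caching above), and wrapping them explicitly, as realize_reduction does further down by looping over the program's callables. The toy sketch below shows the common shape of that explicit wrapper using plain dicts; it is illustrative only and uses none of loopy's actual types.

    def map_callable_kernels(callables, transform):
        # Apply a single-kernel transform to every callable kernel in a
        # program's callables table; leave scalar callables untouched.
        new_callables = {}
        for name, clbl in callables.items():
            if clbl["kind"] == "kernel":
                clbl = dict(clbl, subkernel=transform(clbl["subkernel"]))
            new_callables[name] = clbl
        return new_callables

    print(map_callable_kernels(
        {"loopy_kernel": {"kind": "kernel", "subkernel": "out[i] = a[i]"},
         "sin": {"kind": "scalar"}},
        str.upper))
    # -> {'loopy_kernel': {'kind': 'kernel', 'subkernel': 'OUT[I] = A[I]'},
    #     'sin': {'kind': 'scalar'}}
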
@@ -1007,7 +1012,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, program_callables_info, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1125,7 +1130,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtypes, + def map_reduction_local(expr, rec, program_callables_info, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1365,7 +1370,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential scan - def map_scan_seq(expr, rec, nresults, arg_dtypes, + def map_scan_seq(expr, rec, program_callables_info, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1454,17 +1459,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ local-parallel scan - def map_scan_local(expr, rec, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, - sweep_min_value, scan_min_value, stride): + def map_scan_local(expr, rec, program_callables_info, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 if scan_size == 1: - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, program_callables_info, + nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1663,15 +1668,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ seq/par dispatch - def map_reduction(expr, rec, nresults=1): + def map_reduction(expr, rec, program_callables_info, nresults=1): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes = ( + arg_dtypes, reduction_dtypes, program_callables_info = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, unknown_types_ok)) + temp_kernel, expr, program_callables_info, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1780,15 +1785,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes, + expr, rec, program_callables_info, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, + expr, rec, program_callables_info, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, + scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) @@ -1807,12 +1814,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if n_sequential: assert n_local_par == 0 - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, program_callables_info, + nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + expr, rec, program_callables_info, nresults, arg_dtypes, + reduction_dtypes) # }}} @@ -1845,9 +1853,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Run reduction expansion. from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, nresults=nresults) + new_expressions = cb_mapper(insn.expression, + program_callables_info=program_callables_info, + nresults=nresults) else: - new_expressions = (cb_mapper(insn.expression),) + new_expressions = ( + cb_mapper(insn.expression, + program_callables_info=program_callables_info),) if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1935,6 +1947,31 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, return kernel + +def realize_reduction(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = realize_reduction_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} @@ -2108,17 +2145,159 @@ def check_atomic_loads(kernel): # }}} +# {{{ arg_descr_inference + +class ArgDescrInferenceMapper(RuleAwareIdentityMapper): + """ + Returns a set of instances of :class:`tuple` (expr, + in_kernel_callable). The mapped `in_kernel_callable` of the + :class:`InKernelCallable` are descriptor specialized for the given + arguments. + """ + + def __init__(self, rule_mapping_context, caller_kernel, + program_callables_info): + super(ArgDescrInferenceMapper, self).__init__( + rule_mapping_context) + self.caller_kernel = caller_kernel + self.program_callables_info = program_callables_info + + def map_call(self, expr, expn_state, **kwargs): + from pymbolic.primitives import Call, CallWithKwargs + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import ResolvedFunction, SubArrayRef + + if not isinstance(expr.function, ResolvedFunction): + # ignore if the call is not to a ResolvedFunction + return super(ArgDescrInferenceMapper, self).map_call(expr, expn_state) + + if isinstance(expr, Call): + kw_parameters = {} + else: + assert isinstance(expr, CallWithKwargs) + kw_parameters = expr.kw_parameters + + # descriptors for the args and kwargs of the Call + arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.caller_kernel)) + if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) + for i, par in tuple(enumerate(expr.parameters)) + + tuple(kw_parameters.items())) + + assignee_id_to_descr = {} + + if 'assignees' in kwargs: + # If supplied with assignees then this is a CallInstruction + assignees = kwargs['assignees'] + assert isinstance(assignees, tuple) + for i, par in enumerate(assignees): + if isinstance(par, SubArrayRef): + assignee_id_to_descr[-i-1] = ( + par.get_array_arg_descriptor(self.caller_kernel)) + else: + assignee_id_to_descr[-i-1] = ValueArgDescriptor() + + # gathering all the descriptors + combined_arg_id_to_descr = arg_id_to_descr.copy() + combined_arg_id_to_descr.update(assignee_id_to_descr) + + # specializing the function according to the parameter description + in_knl_callable = self.program_callables_info[expr.function.name] + new_in_knl_callable, self.program_callables_info = ( + in_knl_callable.with_descrs( + combined_arg_id_to_descr, self.program_callables_info)) + self.program_callables_info, new_func_id = ( + self.program_callables_info.with_callable( + expr.function.function, + new_in_knl_callable)) + + if isinstance(expr, Call): + return Call( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + else: + assert isinstance(expr, CallWithKwargs) + return CallWithKwargs( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(kw_parameters)) + ) + + map_call_with_kwargs = map_call + + def map_kernel(self, kernel): + + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important in + # determining the arg_id_to_descr + new_insns.append(insn.with_transformed_expressions( + self, kernel, insn, assignees=insn.assignees)) + elif isinstance(insn, MultiAssignmentBase): + 
new_insns.append(insn.with_transformed_expressions( + self, kernel, insn)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + return kernel.copy(instructions=new_insns) + + +def traverse_to_infer_arg_descr(kernel, program_callables_info): + """ + Returns a copy of *kernel* with the argument shapes and strides matching for + scoped functions in the *kernel*. Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. + """ + # FIXME: update this docs, once the design is finalized + + from loopy.symbolic import SubstitutionRuleMappingContext + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, + kernel, program_callables_info) + + descr_inferred_kernel = rule_mapping_context.finish_kernel( + arg_descr_inf_mapper.map_kernel(kernel)) + + return descr_inferred_kernel, arg_descr_inf_mapper.program_callables_info + + +def infer_arg_descr(program): + root_kernel_callable = program.program_callables_info[program.name] + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + root_kernel = program.root_kernel + + new_root_kernel, program_callables_info = traverse_to_infer_arg_descr( + root_kernel, program_callables_info) + new_root_kernel_callable = root_kernel_callable.copy( + subkernel=new_root_kernel) + program_callables_info, _ = program_callables_info.with_callable(program.name, + new_root_kernel_callable) + + program_callables_info = program_callables_info.with_exit_edit_callables_mode() + + return program.copy(program_callables_info=program_callables_info) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) -def preprocess_kernel(kernel, device=None): - if device is not None: - from warnings import warn - warn("passing 'device' to preprocess_kernel() is deprecated", - DeprecationWarning, stacklevel=2) - +def preprocess_single_kernel(kernel, program_callables_info, device=None): from loopy.kernel import KernelState if kernel.state >= KernelState.PREPROCESSED: return kernel @@ -2161,8 +2340,6 @@ def preprocess_kernel(kernel, device=None): # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. - kernel = infer_unknown_types(kernel, expect_completion=False) - check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) @@ -2177,8 +2354,8 @@ def preprocess_kernel(kernel, device=None): # - realize_reduction must happen after default dependencies are added # because it manipulates the depends_on field, which could prevent # defaults from being applied. - - kernel = realize_reduction(kernel, unknown_types_ok=False) + kernel = realize_reduction_for_single_kernel(kernel, + program_callables_info, unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators @@ -2222,4 +2399,81 @@ def preprocess_kernel(kernel, device=None): return kernel + +def preprocess_kernel(kernel, device=None): + # FIXME: error message? 
+ return preprocess_program(kernel, device) + + +def preprocess_program(program, device=None): + + if device is not None: + from warnings import warn + warn("passing 'device' to preprocess_kernel() is deprecated", + DeprecationWarning, stacklevel=2) + + program = infer_unknown_types(program, expect_completion=False) + + # {{{ preprocess the root kernel + + # Callable editing restrictions: + # + # - cannot edit program_callables_info in :meth:`preprocess_single_kernel` + # as we are iterating over it. + # + # Refer: https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = preprocess_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + device) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + program = program.copy(program_callables_info=new_program_callables_info) + + # }}} + + # infer arg descrs of the callables + program = infer_arg_descr(program) + + # {{{ hw axes inference + + # FIXME: think of wrapping this in a function? + + local_size, global_size = program.get_grid_size_upper_bounds() + + resolved_function_with_hw_axes_sizes_set = {} + + for func_id, in_knl_callable in ( + program.program_callables_info.items()): + if func_id == program.name: + resolved_function_with_hw_axes_sizes_set[func_id] = ( + in_knl_callable) + else: + resolved_function_with_hw_axes_sizes_set[func_id] = ( + in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + + new_program_callables_info = ( + program.program_callables_info.copy( + resolved_functions=resolved_function_with_hw_axes_sizes_set)) + + program = program.copy(program_callables_info=new_program_callables_info) + + # }}} + + return program + + # vim: foldmethod=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 652f8b893..201bcc256 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1832,7 +1832,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, debug_args={}): +def generate_loop_schedules(kernel, program_callables_info, debug_args={}): """ .. 
warning:: @@ -1845,18 +1845,19 @@ def generate_loop_schedules(kernel, debug_args={}): """ with MinRecursionLimitForScheduling(kernel): - for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args): + for sched in generate_loop_schedules_inner(kernel, + program_callables_info, debug_args=debug_args): yield sched -def generate_loop_schedules_inner(kernel, debug_args={}): +def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel) + pre_schedule_checks(kernel, program_callables_info) schedule_count = 0 @@ -1969,7 +1970,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, lsize = kernel.get_grid_size_upper_bounds() + gsize, lsize = ( + kernel.get_grid_size_upper_bounds(program_callables_info)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2026,7 +2028,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel): +def _get_one_scheduled_kernel_inner(kernel, program_callables_info): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2036,10 +2038,10 @@ def _get_one_scheduled_kernel_inner(kernel): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. - return next(iter(generate_loop_schedules(kernel))) + return next(iter(generate_loop_schedules(kernel, program_callables_info))) -def get_one_scheduled_kernel(kernel): +def get_one_scheduled_kernel(kernel, program_callables_info): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2057,7 +2059,8 @@ def get_one_scheduled_kernel(kernel): if not from_cache: with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): - result = _get_one_scheduled_kernel_inner(kernel) + result = _get_one_scheduled_kernel_inner(kernel, + program_callables_info) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index cee28b24f..08b7f89e9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -33,6 +33,7 @@ from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from pytools import Record +from loopy.kernel.function_interface import ScalarCallable, CallableKernel __doc__ = """ @@ -59,6 +60,14 @@ __doc__ = """ """ +# FIXME: this is broken for the callable kernel design. +# Qns: +# - The variable name, what if multiple kernels use the same name? +# - We should also add the cumulative effect on the arguments of callee kernels +# into the caller kernel. +# FIXME: add an error that there is only one callable kernel. disable for +# multiple callable kernels. 
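+#
+# A possible shape for that per-caller accumulation (an illustrative sketch
+# only, not part of this patch): counts gathered from a callee kernel would
+# be scaled by the number of times the callee is invoked and folded into the
+# caller's totals, e.g.
+#
+#     def combine_counts(caller_counts, callee_counts, num_calls):
+#         combined = dict(caller_counts)
+#         for key, count in callee_counts.items():
+#             combined[key] = combined.get(key, 0) + count*num_calls
+#         return combined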
+ # {{{ GuardedPwQPolynomial class GuardedPwQPolynomial(object): @@ -639,10 +648,11 @@ class MemAccess(Record): # {{{ counter base class CounterBase(CombineMapper): - def __init__(self, knl): + def __init__(self, knl, program_callables_info): self.knl = knl + self.program_callables_info = program_callables_info from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.type_inf = TypeInferenceMapper(knl, program_callables_info) def combine(self, values): return sum(values) @@ -697,10 +707,11 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl): + def __init__(self, knl, program_callables_info): self.knl = knl + self.program_callables_info = program_callables_info from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.type_inf = TypeInferenceMapper(knl, program_callables_info) def combine(self, values): return sum(values) @@ -712,9 +723,16 @@ class ExpressionOpCounter(CounterBase): map_variable = map_constant def map_call(self, expr): + from loopy.symbolic import ResolvedFunction + if isinstance(expr.function, ResolvedFunction): + function_identifier = self.program_callables_info[ + expr.function.name].name + else: + function_identifier = expr.function.name + return ToCountMap( {Op(dtype=self.type_inf(expr), - name='func:'+str(expr.function), + name='func:'+function_identifier, count_granularity=CountGranularity.WORKITEM): 1} ) + self.rec(expr.parameters) @@ -1090,6 +1108,16 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): + from loopy.program import Program + if isinstance(kernel, Program): + if len([in_knl_callable for in_knl_callable in + kernel.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only one CallableKernel.") + + kernel = kernel.root_kernel + try: if space is not None: set = set.align_params(space) @@ -1188,9 +1216,10 @@ def count(kernel, set, space=None): return add_assumptions_guard(kernel, count) -def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): +def get_unused_hw_axes_factor(knl, program_callables_info, insn, + disregard_local_axes, space=None): # FIXME: Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds() + gsize, lsize = knl.get_grid_size_upper_bounds(program_callables_info) g_used = set() l_used = set() @@ -1228,7 +1257,8 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None): return add_assumptions_guard(knl, result) -def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): +def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, + disregard_local_axes=False): insn_inames = knl.insn_inames(insn) @@ -1248,9 +1278,8 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) c = count(knl, domain, space=space) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes, - space=space) + unused_fac = get_unused_hw_axes_factor(knl, program_callables_info, + insn, disregard_local_axes=disregard_local_axes, space=space) return c * unused_fac else: return c @@ -1260,7 +1289,50 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False) # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False, + +def 
get_op_map_for_single_kernel(knl, program_callables_info, + numpy_types=True, count_redundant_work=False, + subgroup_size=None): + + if not knl.options.ignore_boostable_into: + raise LoopyError("Kernel '%s': Using operation counting requires the option " + "ignore_boostable_into to be set." % knl.name) + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + op_map = ToCountMap() + op_counter = ExpressionOpCounter(knl, + program_callables_info=program_callables_info) + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + ops = op_counter(insn.assignee) + op_counter(insn.expression) + op_map = op_map + ops*count_insn_runs( + knl, program_callables_info, insn, + count_redundant_work=count_redundant_work) + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + if numpy_types: + return ToCountMap( + init_dict=dict( + (Op( + dtype=op.dtype.numpy_dtype, + name=op.name, + count_granularity=op.count_granularity), + ct) + for op, ct in six.iteritems(op_map.count_map)), + val_type=op_map.val_type + ) + else: + return op_map + + +def get_op_map(program, numpy_types=True, count_redundant_work=False, subgroup_size=None): """Count the number of operations in a loopy kernel. @@ -1318,44 +1390,31 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, """ - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + from loopy.preprocess import preprocess_program, infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl) - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_map = op_map + ops*count_insn_runs( - knl, insn, - count_redundant_work=count_redundant_work) - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_op_map = get_op_map_for_single_kernel(knl, + program.program_callables_info, numpy_types, + count_redundant_work, subgroup_size) + + for i in range(num_times_called): + op_map += knl_op_map + elif isinstance(in_knl_callable, ScalarCallable): pass else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + raise NotImplementedError("Unknown callabke types %s." 
% ( + type(in_knl_callable).__name__)) - if numpy_types: - return ToCountMap( - init_dict=dict( - (Op( - dtype=op.dtype.numpy_dtype, - name=op.name, - count_granularity=op.count_granularity), - ct) - for op, ct in six.iteritems(op_map.count_map)), - val_type=op_map.val_type - ) - else: - return op_map + return op_map # }}} @@ -1376,93 +1435,9 @@ def _find_subgroup_size_for_knl(knl): # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, - subgroup_size=None): - """Count the number of memory accesses in a loopy kernel. - - :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be - counted. - - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other - specifics, a kernel may perform work redundantly. This :class:`bool` - flag indicates whether this work should be included in the count. - (Likely desirable for performance modeling, but undesirable for - code optimization.) - - :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or - *None* that specifies the sub-group size. An OpenCL sub-group is an - implementation-dependent grouping of work-items within a work-group, - analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when - counting a :class:`MemAccess` whose count_granularity specifies that it - should only be counted once per sub-group. If set to *None* an attempt - to find the sub-group size using the device will be made, if this fails - an error will be raised. If a :class:`str` ``'guess'`` is passed as - the subgroup_size, get_mem_access_map will attempt to find the - sub-group size using the device and, if unsuccessful, will make a wild - guess. - - :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** - :class:`islpy.PwQPolynomial` **}**. - - - The :class:`MemAccess` specifies the characteristics of the memory - access. - - - The :class:`islpy.PwQPolynomial` holds the number of memory accesses - with the characteristics specified in the key (in terms of the - :class:`loopy.LoopKernel` *inames*). 
- - Example usage:: - - # (first create loopy kernel and specify array data types) - - params = {'n': 512, 'm': 256, 'l': 128} - mem_map = get_mem_access_map(knl) - - f32_s1_g_ld_a = mem_map[MemAccess( - mtype='global', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='load', - variable='a', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - f32_s1_g_st_a = mem_map[MemAccess( - mtype='global', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='store', - variable='a', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - f32_s1_l_ld_x = mem_map[MemAccess( - mtype='local', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='load', - variable='x', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - f32_s1_l_st_x = mem_map[MemAccess( - mtype='local', - dtype=np.float32, - lid_strides={0: 1}, - gid_strides={0: 256}, - direction='store', - variable='x', - count_granularity=CountGranularity.WORKITEM) - ].eval_with_dict(params) - - # (now use these counts to, e.g., predict performance) - - """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types +def get_access_map_for_single_kernel(knl, program_callables_info, + numpy_types=True, count_redundant_work=False, subgroup_size=None): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " @@ -1518,11 +1493,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, program_callables_info, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, program_callables_info, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: @@ -1530,7 +1506,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(program_callables_info) workgroup_size = 1 if local_size: for size in local_size: @@ -1556,12 +1532,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, "not allowed. count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl) - access_counter_l = LocalMemAccessCounter(knl) + access_counter_g = GlobalMemAccessCounter(knl, program_callables_info) + access_counter_l = LocalMemAccessCounter(knl, program_callables_info) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1617,12 +1590,129 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, else: return access_map + +def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, + subgroup_size=None): + """Count the number of memory accesses in a loopy kernel. + + :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be + counted. 
+ + :arg numpy_types: A :class:`bool` specifying whether the types in the + returned mapping should be numpy types instead of + :class:`loopy.LoopyType`. + + :arg count_redundant_work: Based on usage of hardware axes or other + specifics, a kernel may perform work redundantly. This :class:`bool` + flag indicates whether this work should be included in the count. + (Likely desirable for performance modeling, but undesirable for + code optimization.) + + :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or + *None* that specifies the sub-group size. An OpenCL sub-group is an + implementation-dependent grouping of work-items within a work-group, + analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when + counting a :class:`MemAccess` whose count_granularity specifies that it + should only be counted once per sub-group. If set to *None* an attempt + to find the sub-group size using the device will be made, if this fails + an error will be raised. If a :class:`str` ``'guess'`` is passed as + the subgroup_size, get_mem_access_map will attempt to find the + sub-group size using the device and, if unsuccessful, will make a wild + guess. + + :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** + :class:`islpy.PwQPolynomial` **}**. + + - The :class:`MemAccess` specifies the characteristics of the memory + access. + + - The :class:`islpy.PwQPolynomial` holds the number of memory accesses + with the characteristics specified in the key (in terms of the + :class:`loopy.LoopKernel` *inames*). + + Example usage:: + + # (first create loopy kernel and specify array data types) + + params = {'n': 512, 'm': 256, 'l': 128} + mem_map = get_mem_access_map(knl) + + f32_s1_g_ld_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='load', + variable='a', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + f32_s1_g_st_a = mem_map[MemAccess( + mtype='global', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='store', + variable='a', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + f32_s1_l_ld_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='load', + variable='x', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + f32_s1_l_st_x = mem_map[MemAccess( + mtype='local', + dtype=np.float32, + lid_strides={0: 1}, + gid_strides={0: 256}, + direction='store', + variable='x', + count_granularity=CountGranularity.WORKITEM) + ].eval_with_dict(params) + + # (now use these counts to, e.g., predict performance) + + """ + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + access_map = ToCountMap() + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_access_map = get_access_map_for_single_kernel(knl, + program.program_callables_info, numpy_types, + count_redundant_work, subgroup_size) + + # FIXME: didn't see any easy way to multiply + for i in range(num_times_called): + access_map += knl_access_map + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types %s." 
% (
+                type(in_knl_callable).__name__))
+
+    return access_map
+
+
 # }}}


 # {{{ get_synchronization_map

-def get_synchronization_map(knl, subgroup_size=None):
+def get_synchronization_map_for_single_kernel(knl, program_callables_info,
+        subgroup_size=None):
     """Count the number of synchronization events each work-item encounters in
     a loopy kernel.

@@ -1664,13 +1754,10 @@ def get_synchronization_map(knl, subgroup_size=None):
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
                 "ignore_boostable_into to be set." % knl.name)

-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
     from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
             CallKernel, ReturnFromKernel, RunInstruction)
     from operator import mul
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(knl, program_callables_info)

     iname_list = []

     result = ToCountMap()
@@ -1713,12 +1800,42 @@ def get_synchronization_map(knl, subgroup_size=None):

     return result

+
+def get_synchronization_map(program, subgroup_size=None):
+
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+
+    program = infer_unknown_types(program, expect_completion=True)
+    program = preprocess_program(program)
+
+    sync_map = ToCountMap()
+
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            num_times_called = (
+                    program.program_callables_info.num_times_callables_called[
+                        func_id])
+            knl = in_knl_callable.subkernel
+            knl_sync_map = get_synchronization_map_for_single_kernel(knl,
+                    program.program_callables_info, subgroup_size)
+
+            # FIXME: didn't see any easy way to multiply
+            for i in range(num_times_called):
+                sync_map += knl_sync_map
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown callable types %s." % (
+                type(in_knl_callable).__name__))
+
+    return sync_map
+
 # }}}


 # {{{ gather_access_footprints

-def gather_access_footprints(kernel, ignore_uncountable=False):
+def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False):
     """Return a dictionary mapping ``(var_name, direction)`` to
     :class:`islpy.Set` instances capturing which indices of each array
     *var_name* are read/written (where *direction* is either ``read`` or
@@ -1729,13 +1846,6 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
         nonlinear indices)
     """
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    kernel = infer_unknown_types(kernel, expect_completion=True)
-
-    from loopy.kernel import KernelState
-    if kernel.state < KernelState.PREPROCESSED:
-        kernel = preprocess_kernel(kernel)
-
     write_footprints = []
     read_footprints = []

@@ -1758,6 +1868,46 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
             write_footprints.append(afg(insn.assignees))
             read_footprints.append(afg(insn.expression))

+    return write_footprints, read_footprints
+
+
+def gather_access_footprints(program, ignore_uncountable=False):
+    # FIXME: works only for one callable kernel for now.
+ if len([in_knl_callable for in_knl_callable in + program.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)]) != 1: + raise NotImplementedError("Currently only supported for program with " + "only one CallableKernel.") + + from loopy.preprocess import preprocess_program, infer_unknown_types + + program = infer_unknown_types(program, expect_completion=True) + program = preprocess_program(program) + + write_footprints = [] + read_footprints = [] + + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + num_times_called = ( + program.program_callables_info.num_times_callables_called[ + func_id]) + knl = in_knl_callable.subkernel + knl_write_footprints, knl_read_footprints = ( + gather_access_footprints_for_single_kernel(knl, + ignore_uncountable)) + + # FIXME: didn't see any easy way to multiply + for i in range(num_times_called): + write_footprints.extend(knl_write_footprints) + read_footprints.extend(knl_read_footprints) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callabke types %s." % ( + type(in_knl_callable).__name__)) + write_footprints = AccessFootprintGatherer.combine(write_footprints) read_footprints = AccessFootprintGatherer.combine(read_footprints) @@ -1772,7 +1922,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False): return result -def gather_access_footprint_bytes(kernel, ignore_uncountable=False): +def gather_access_footprint_bytes(program, ignore_uncountable=False): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.PwQPolynomial` instances capturing the number of bytes are read/written (where *direction* is either ``read`` or ``write`` on array @@ -1783,12 +1933,12 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): nonlinear indices) """ - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.preprocess import preprocess_program, infer_unknown_types + kernel = infer_unknown_types(program, expect_completion=True) from loopy.kernel import KernelState if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) + kernel = preprocess_program(program) result = {} fp = gather_access_footprints(kernel, diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8927cd6fb..7a268d06f 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -56,7 +56,7 @@ from pymbolic.mapper.constant_folder import \ ConstantFoldingMapper as ConstantFoldingMapperBase from pymbolic.parser import Parser as ParserBase - +from loopy.diagnostic import LoopyError from loopy.diagnostic import ExpressionToAffineConversionError import islpy as isl @@ -69,22 +69,23 @@ import numpy as np # {{{ mappers with support for loopy-specific primitives class IdentityMapperMixin(object): - def map_literal(self, expr, *args): + def map_literal(self, expr, *args, **kwargs): return expr - def map_array_literal(self, expr, *args): - return type(expr)(tuple(self.rec(ch, *args) for ch in expr.children)) + def map_array_literal(self, expr, *args, **kwargs): + return type(expr)(tuple(self.rec(ch, *args, **kwargs) for ch in + expr.children)) - def map_group_hw_index(self, expr, *args): + def map_group_hw_index(self, expr, *args, **kwargs): return expr - def map_local_hw_index(self, expr, *args): + def map_local_hw_index(self, expr, *args, **kwargs): return expr - def map_loopy_function_identifier(self, expr, 
*args): + def map_loopy_function_identifier(self, expr, *args, **kwargs): return expr - def map_reduction(self, expr, *args): + def map_reduction(self, expr, *args, **kwargs): mapped_inames = [self.rec(p.Variable(iname), *args) for iname in expr.inames] new_inames = [] @@ -98,15 +99,18 @@ class IdentityMapperMixin(object): return Reduction( expr.operation, tuple(new_inames), - self.rec(expr.expr, *args), + self.rec(expr.expr, *args, **kwargs), allow_simultaneous=expr.allow_simultaneous) - def map_tagged_variable(self, expr, *args): + def map_tagged_variable(self, expr, *args, **kwargs): # leaf, doesn't change return expr - def map_type_annotation(self, expr, *args): - return type(expr)(expr.type, self.rec(expr.child)) + def map_type_annotation(self, expr, *args, **kwargs): + return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs)) + + def map_resolved_function(self, expr, *args, **kwargs): + return ResolvedFunction(expr.function) map_type_cast = map_type_annotation @@ -165,9 +169,16 @@ class WalkMapper(WalkMapperBase): map_rule_argument = map_group_hw_index + def map_resolved_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_resolved_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): @@ -232,13 +243,16 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_resolved_function(self, expr, prec): + return "ResolvedFunction('%s')" % expr.name + class UnidirectionalUnifier(UnidirectionalUnifierBase): def map_reduction(self, expr, other, unis): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation) # noqa + or type(expr.function) != type(other.function) # noqa ): return [] @@ -289,6 +303,9 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) + def map_resolved_function(self, expr): + return self.rec(expr.function) + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): @@ -638,6 +655,51 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") + +class ResolvedFunction(p.Expression): + """ + A function invocation whose definition is known in a :mod:`loopy` kernel. + Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression + points to an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` through the + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer + :ref:`ref_scoped_function` for a slightly detailed explanation on scoped + functions. + + .. attribute:: function + + An instance of :class:`pymbolic.primitives.Variable`, + :class:`loopy.library.reduction.ArgExtOp` or + :class:`loopy.library.reduction.SegmentedOp`. 
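+
+    Example (an illustrative sketch; ``sin`` is just a stand-in identifier)::
+
+        # a plain string is promoted to a :class:`pymbolic.primitives.Variable`
+        f = ResolvedFunction("sin")
+        assert f.name == "sin"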
+ """ + init_arg_names = ("function", ) + + def __init__(self, function): + if isinstance(function, str): + function = p.Variable(function) + from loopy.library.reduction import ArgExtOp, SegmentedOp + assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp)) + self.function = function + + @property + def name(self): + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(self.function, p.Variable): + return self.function.name + elif isinstance(self.function, (ArgExtOp, SegmentedOp)): + return self.function + else: + raise LoopyError("Unexpected function type %s in ResolvedFunction." % + type(self.function)) + + def __getinitargs__(self): + return (self.function, ) + + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_resolved_function") + # }}} @@ -650,9 +712,12 @@ def get_dependencies(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, p.Variable): + elif isinstance(expr, ResolvedFunction): + return parse_tagged_name(expr.function) + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) @@ -850,12 +915,14 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state): + def map_variable(self, expr, expn_state, *args, **kwargs): name, tag = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - return IdentityMapper.map_variable(self, expr, expn_state) + return IdentityMapper.map_variable(self, expr, expn_state, *args, + **kwargs) else: - return self.map_substitution(name, tag, (), expn_state) + return self.map_substitution(name, tag, (), expn_state, *args, + **kwargs) def map_call(self, expr, expn_state): if not isinstance(expr.function, p.Variable): @@ -910,7 +977,7 @@ class RuleAwareIdentityMapper(IdentityMapper): else: return sym - def __call__(self, expr, kernel, insn): + def __call__(self, expr, kernel, insn, *args, **kwargs): from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -919,7 +986,7 @@ class RuleAwareIdentityMapper(IdentityMapper): kernel=kernel, instruction=insn, stack=(), - arg_context={})) + arg_context={}), *args, **kwargs) def map_instruction(self, kernel, insn): return insn diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index a81354e2f..e3b4853c3 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -80,7 +80,7 @@ class TargetBase(object): def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel): + def pre_codegen_check(self, kernel, program_callables_info): pass # }}} @@ -150,7 +150,12 @@ class ASTBuilderBase(object): # {{{ library - def function_manglers(self): + def function_scopers(self): + """ + Returns an instance of list of the functions of signature + ``(target, identifiers)`` returning either an instance of + :class:`InKernelCallable` if a match is found or *None*. + """ return [] def symbol_manglers(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 83efecf0e..1579bb313 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -27,7 +27,6 @@ THE SOFTWARE. 
import six import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -35,6 +34,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from pytools import memoize_method @@ -354,71 +354,116 @@ def c_symbol_mangler(kernel, name): # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. + """ - if name in ["abs", "min", "max"]: - name = "f" + name + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + name = self.name - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind == "f"): + if name in ["abs", "min", "max"]: + name = "f" + name - dtype = arg_dtypes[0].numpy_dtype + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: - if modify_name: - if dtype == np.float64: - pass # fabs - elif dtype == np.float32: - name = name + "f" # fabsf - elif dtype == np.float128: - name = name + "l" # fabsl - else: - raise LoopyTypeError("%s does not support type %s" % (name, dtype)) + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError("%s can take only one argument." 
% name) - return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) - # binary functions - if (name in ["fmax", "fmin"] - and len(arg_dtypes) == 2): + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyTypeError("%s does not support complex numbers") + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == 'c': + raise LoopyTypeError("%s does not support type %s" % (name, dtype)) - elif dtype.kind == "f": - if modify_name: + from loopy.target.opencl import OpenCLTarget + if not isinstance(caller_kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified if dtype == np.float64: - pass # fmin + pass # fabs elif dtype == np.float32: - name = name + "f" # fminf + name = name + "f" # fabsf elif dtype == np.float128: - name = name + "l" # fminl + name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" - % (name, dtype)) + raise LoopyTypeError("%s does not support type %s" % (name, + dtype)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + program_callables_info) + + # binary functions + if name in ["fmax", "fmin"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + elif dtype.kind == "f": + from loopy.target.opencl import OpenCLTarget + if not isinstance(caller_kernel.target, OpenCLTarget): + if dtype == np.float64: + pass # fmin + elif dtype == np.float32: + name = name + "f" # fminf + elif dtype == np.float128: + name = name + "l" # fminl + else: + raise LoopyTypeError("%s does not support type %s" + % (name, dtype)) + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + program_callables_info) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) +def scope_c_math_functions(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` if the function + represented by :arg:`identifier` is known in C, otherwise returns *None*. 
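+
+    Example (an illustrative sketch; ``target`` stands for any target
+    instance and is not inspected here)::
+
+        scope_c_math_functions(target, "sin")      # -> a CMathCallable for "sin"
+        scope_c_math_functions(target, "my_func")  # -> None, not a C math function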
+ """ + if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]: + return CMathCallable(name=identifier) return None # }}} @@ -427,12 +472,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CASTBuilder(ASTBuilderBase): # {{{ library - def function_manglers(self): - return ( - super(CASTBuilder, self).function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super(CASTBuilder, self).symbol_manglers() + [ @@ -445,6 +484,11 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) + def function_scopers(self): + return ( + super(CASTBuilder, self).function_scopers() + [ + scope_c_math_functions]) + # }}} # {{{ code generation @@ -846,82 +890,31 @@ class CASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None + ecm = codegen_state.expression_to_code_mapper + func_id = insn.expression.function.name + in_knl_callable = codegen_state.program_callables_info[func_id] - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. + if isinstance(in_knl_callable, ScalarCallable) and ( + in_knl_callable.name_in_target == 'loopy_make_tuple'): return self.emit_tuple_assignment(codegen_state, insn) - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes)) - - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) 
- var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) + in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast( - mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 6b80bae20..b3c304d58 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -157,7 +157,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -166,12 +166,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return None, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -373,7 +374,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, program, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). 
If the @@ -382,35 +383,35 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel) + super(CKernelExecutor, self).__init__(program) def get_invoker_uncached(self, kernel, codegen_result): generator = CExecutionWrapperGenerator() return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() host_code = codegen_result.host_code() all_code = '\n'.join([dev_code, '', host_code]) - if self.kernel.options.write_cl: + if self.program.root_kernel.options.write_cl: output = all_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(code=output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor @@ -419,14 +420,14 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_info, all_code, self.kernel.target, + codegen_result.implemented_data_info, all_code, self.program.target, self.compiler)) return _KernelInfo( - kernel=kernel, + program=program, c_kernels=c_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) # }}} @@ -443,7 +444,7 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + return program_info.invoker( + program_info.c_kernels, *args, **kwargs) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index dd2104d0c..65a8c2028 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -41,7 +41,7 @@ from pymbolic import var from loopy.expression import dtype_to_type_context from loopy.type_inference import TypeInferenceMapper -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType @@ -54,7 +54,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + self.codegen_state.program_callables_info) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -383,19 +384,19 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def 
map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function + from pymbolic.primitives import Subscript # {{{ implement indexof, indexof_vec - if identifier.name in ["indexof", "indexof_vec"]: + identifier_name = ( + self.codegen_state.program_callables_info[expr.function.name].name) + if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) + raise LoopyError("%s takes exactly one argument" % identifier_name) arg, = expr.parameters if not isinstance(arg, Subscript): raise LoopyError( - "argument to %s must be a subscript" % identifier.name) + "argument to %s must be a subscript" % identifier_name) ary = self.find_array(arg) @@ -407,11 +408,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.kernel.data import ImageArg if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) + raise LoopyError("%s does not support images" % identifier_name) - if identifier.name == "indexof": + if identifier_name == "indexof": return access_info.subscripts[0] - elif identifier.name == "indexof_vec": + elif identifier_name == "indexof_vec": from loopy.kernel.array import VectorArrayDimTag ivec = None for iaxis, dim_tag in enumerate(ary.dim_tags): @@ -430,56 +431,25 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. 
- processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + from loopy.kernel.function_interface import ManglerCallable + if isinstance(self.codegen_state.program_callables_info[expr.function.name], + ManglerCallable): + from loopy.codegen import SeenFunction + in_knl_callable = ( + self.codegen_state.program_callables_info[ + expr.function.name]) + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) + + return ( + self.codegen_state.program_callables_info[ + expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target)) # {{{ deal with complex-valued variables diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 673d3b284..89cbfd034 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -34,6 +34,7 @@ from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.kernel.data import AddressSpace from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -111,29 +112,82 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper -def cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") +class CudaCallable(ScalarCallable): - if dtype.kind == "f": - name = "f" + name + def cuda_with_types(self, arg_id_to_dtype, caller_kernel, + program_callables_info): - return dtype, name + name = self.name - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + if name == "dot": + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = arg_id_to_dtype[0] + scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] + return ( + self.copy(name_in_target=name, arg_id_to_dtype={-1: + NumpyType(scalar_dtype), + 0: dtype, 1: dtype}), + program_callables_info) + + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] + for id in arg_id_to_dtype: + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." 
% (name, + num_args)) + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + if dtype.kind == "c": + raise LoopyError("%s does not support complex numbers" + % name) + + updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, + num_args)) + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + program_callables_info) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + +def scope_cuda_functions(target, identifier): + if identifier in set(["dot"]) | set( + _CUDA_SPECIFIC_FUNCTIONS): + return CudaCallable(name=identifier) return None @@ -217,13 +271,12 @@ class CudaTarget(CTarget): # {{{ ast builder class CUDACASTBuilder(CASTBuilder): + # {{{ library - def function_manglers(self): - return ( - super(CUDACASTBuilder, self).function_manglers() + [ - cuda_function_mangler - ]) + def function_scopers(self): + return [scope_cuda_functions] + ( + super(CUDACASTBuilder, self).function_scopers()) # }}} @@ -249,7 +302,8 @@ class CUDACASTBuilder(CASTBuilder): _, local_grid_size = \ codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.program_callables_info) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3cdf20577..43963ddb2 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -61,12 +61,12 @@ class SeparateArrayPackingController(object): It also repacks outgoing arrays of this type back into an object array. 
""" - def __init__(self, kernel): + def __init__(self, program): # map from arg name self.packing_info = {} from loopy.kernel.array import ArrayBase - for arg in kernel.args: + for arg in program.args: if not isinstance(arg, ArrayBase): continue @@ -82,7 +82,8 @@ class SeparateArrayPackingController(object): name=arg.name, sep_shape=arg.sep_shape(), subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) + is_written=arg.name in + program.root_kernel.get_written_variables()) def unpack(self, kernel_kwargs): if not self.packing_info: @@ -143,7 +144,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from shapes def generate_integer_arg_finding_from_shapes( - self, gen, kernel, implemented_data_info): + self, gen, program, implemented_data_info): # a mapping from integer argument names to a list of tuples # (arg_name, expression), where expression is a # unary function of kernel.arg_dict[arg_name] @@ -168,7 +169,8 @@ class ExecutionWrapperGeneratorBase(object): if len(deps) == 1: integer_arg_var, = deps - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + if program.arg_dict[ + integer_arg_var.name].dtype.is_integral(): from pymbolic.algorithm import solve_affine_equations_for try: # friggin' overkill :) @@ -214,9 +216,9 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from offsets - def generate_integer_arg_finding_from_offsets(self, gen, kernel, + def generate_integer_arg_finding_from_offsets(self, gen, program, implemented_data_info): - options = kernel.options + options = program.root_kernel.options gen("# {{{ find integer arguments from offsets") gen("") @@ -239,7 +241,7 @@ class ExecutionWrapperGeneratorBase(object): else: gen("_lpy_offset = %s.offset" % impl_array_name) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" @@ -264,8 +266,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from strides def generate_integer_arg_finding_from_strides( - self, gen, kernel, implemented_data_info): - options = kernel.options + self, gen, program, implemented_data_info): + options = program.root_kernel.options gen("# {{{ find integer arguments from strides") gen("") @@ -284,7 +286,7 @@ class ExecutionWrapperGeneratorBase(object): "passed array\")" % (arg.name, impl_array_name)) - base_arg = kernel.impl_arg_to_arg[impl_array_name] + base_arg = program.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" @@ -307,8 +309,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ check that value args are present def generate_value_arg_check( - self, gen, kernel, implemented_data_info): - if kernel.options.skip_arg_checks: + self, gen, program, implemented_data_info): + if program.root_kernel.options.skip_arg_checks: return from loopy.kernel.data import ValueArg @@ -361,7 +363,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ arg setup def generate_arg_setup( - self, gen, kernel, implemented_data_info, options): + self, gen, program, implemented_data_info, options): import loopy as lp from loopy.kernel.data import KernelArgument @@ -384,8 +386,8 @@ class ExecutionWrapperGeneratorBase(object): expect_no_more_arguments = False for arg_idx, arg in enumerate(implemented_data_info): - is_written = arg.base_name in kernel.get_written_variables() - kernel_arg = 
kernel.impl_arg_to_arg.get(arg.name) + is_written = arg.base_name in program.root_kernel.get_written_variables() + program_arg = program.impl_arg_to_arg.get(arg.name) if not issubclass(arg.arg_class, KernelArgument): expect_no_more_arguments = True @@ -447,7 +449,7 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): self.handle_alloc( - gen, arg, kernel_arg, strify, options.skip_arg_checks) + gen, arg, program_arg, strify, options.skip_arg_checks) gen("_lpy_made_by_loopy = True") gen("") @@ -465,7 +467,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("if %s.dtype != %s:" % (arg.name, self.python_dtype_str( - kernel_arg.dtype.numpy_dtype))) + program_arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " "(got: %%s, expected: %s)\" %% %s.dtype)" @@ -493,10 +495,10 @@ class ExecutionWrapperGeneratorBase(object): "%% (%s.shape, %s))" % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - if kernel_arg.shape is None: + if program_arg.shape is None: pass - elif any(shape_axis is None for shape_axis in kernel_arg.shape): + elif any(shape_axis is None for shape_axis in program_arg.shape): gen("if len(%s.shape) != %s:" % (arg.name, len(arg.unvec_shape))) with Indentation(gen): @@ -519,8 +521,8 @@ class ExecutionWrapperGeneratorBase(object): # }}} - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize + if arg.unvec_strides and program_arg.dim_tags: + itemsize = program_arg.dtype.numpy_dtype.itemsize sym_strides = tuple( itemsize*s_i for s_i in arg.unvec_strides) @@ -558,7 +560,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("raise ValueError(\"Argument '%s' does not " "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_kernel()." + "default_offset=loopy.auto to make_program()." 
"\")" % arg.name) gen("") @@ -617,7 +619,7 @@ class ExecutionWrapperGeneratorBase(object): def generate_host_code(self, gen, codegen_result): raise NotImplementedError - def __call__(self, kernel, codegen_result): + def __call__(self, program, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -629,12 +631,12 @@ class ExecutionWrapperGeneratorBase(object): kernel """ - options = kernel.options + options = program.root_kernel.options implemented_data_info = codegen_result.implemented_data_info from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, + "invoke_%s_loopy_kernel" % program.name, self.system_args + [ "%s=None" % idi.name for idi in implemented_data_info @@ -651,21 +653,21 @@ class ExecutionWrapperGeneratorBase(object): self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_offsets( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_integer_arg_finding_from_strides( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) self.generate_value_arg_check( - gen, kernel, implemented_data_info) + gen, program, implemented_data_info) args = self.generate_arg_setup( - gen, kernel, implemented_data_info, options) + gen, program, implemented_data_info, options) self.generate_invocation(gen, codegen_result.host_program.name, args, - kernel, implemented_data_info) + program, implemented_data_info) - self.generate_output_handler(gen, options, kernel, implemented_data_info) + self.generate_output_handler(gen, options, program, implemented_data_info) if options.write_wrapper: output = gen.get() @@ -713,32 +715,32 @@ class KernelExecutorBase(object): .. 
automethod:: __call__ """ - def __init__(self, kernel): + def __init__(self, program): """ :arg kernel: a loopy.LoopKernel """ - self.kernel = kernel + self.program = program - self.packing_controller = SeparateArrayPackingController(kernel) + self.packing_controller = SeparateArrayPackingController(program) - self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) + self.output_names = tuple(arg.name for arg in self.program.args + if arg.is_output_only) self.has_runtime_typed_args = any( arg.dtype is None - for arg in kernel.args) + for arg in program.args) - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + def get_typed_and_scheduled_program_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes - kernel = self.kernel + program = self.program if arg_to_dtype_set: var_to_dtype = {} for var, dtype in arg_to_dtype_set: try: - dest_name = kernel.impl_arg_to_arg[var].name + dest_name = program.impl_arg_to_arg[var].name except KeyError: dest_name = var @@ -749,28 +751,30 @@ class KernelExecutorBase(object): "no known variable/argument with that name" % var) - kernel = add_dtypes(kernel, var_to_dtype) + program = add_dtypes(program, var_to_dtype) - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) - if kernel.schedule is None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) + if program.root_kernel.schedule is None: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + program = program.with_root_kernel( + get_one_scheduled_kernel(program.root_kernel, + program.program_callables_info)) - return kernel + return program - def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + def get_typed_and_scheduled_program(self, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching # prepare_for_caching() gets run by preprocess, but the kernel at this # stage is not guaranteed to be preprocessed. 
- cacheable_kernel = prepare_for_caching(self.kernel) - cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + cacheable_program = prepare_for_caching(self.program) + cache_key = (type(self).__name__, cacheable_program, arg_to_dtype_set) if CACHING_ENABLED: try: @@ -778,9 +782,9 @@ class KernelExecutorBase(object): except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + logger.debug("%s: typed-and-scheduled cache miss" % self.program.name) - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_scheduled_program_uncached(arg_to_dtype_set) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -791,7 +795,7 @@ class KernelExecutorBase(object): if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.kernel.impl_arg_to_arg + impl_arg_to_arg = self.program.impl_arg_to_arg arg_to_dtype = {} for arg_name, val in six.iteritems(kwargs): arg = impl_arg_to_arg.get(arg_name, None) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 0464270a3..539631833 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -172,8 +172,9 @@ class ISPCTarget(CTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel): - gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + def pre_codegen_check(self, kernel, program_callables_info): + gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( + program_callables_info) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 432c95ef3..44f782a72 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -31,11 +31,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler -from loopy.kernel.data import AddressSpace, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -166,59 +166,135 @@ VECTOR_LITERAL_FUNCS = dict( ) -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + :class:`loopy.target.c.CMathCallable`. + """ + + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + name = self.name + + if name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if dtype.kind in ['u', 'i', 'f']: + if dtype.kind == 'f': + name = 'f'+name + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + program_callables_info) + else: + # Unsupported type. 
+                raise LoopyError("%s function not supported for the types %s" %
+                        (name, dtype))
+
+        if name == "dot":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError("%s can take only 2 arguments." % name)
+
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        program_callables_info)
+
+            dtype = arg_id_to_dtype[0]
+            scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"]
+            return (
+                    self.copy(name_in_target=name, arg_id_to_dtype={-1:
+                        NumpyType(scalar_dtype), 0: dtype, 1: dtype}),
+                    program_callables_info)
+
+        if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
+            num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
+            for id in arg_id_to_dtype:
+                if not -1 <= id < num_args:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            num_args))
+
+            for i in range(num_args):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return (
+                            self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                            program_callables_info)
+
+            dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in
+                        arg_id_to_dtype.items() if id >= 0])
+
+            if dtype.kind == "c":
+                raise LoopyError("%s does not support complex numbers"
+                        % name)
+
+            updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1,
+                num_args))
+
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype=updated_arg_id_to_dtype),
+                    program_callables_info)
+
+        if name in VECTOR_LITERAL_FUNCS:
+            base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
+
+            for id in arg_id_to_dtype:
+                if not -1 <= id < count:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            count))
+
+            for i in range(count):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return (
+                            self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                            program_callables_info)
+
+            updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in
+                    range(count))
+            updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype(
+                    NumpyType(dtype), count)
+
+            return (
+                    self.copy(name_in_target="(%s%d) " % (base_tp_name, count),
+                        arg_id_to_dtype=updated_arg_id_to_dtype),
+                    program_callables_info)
+
+        # does not satisfy any of the conditions needed for specialization.
+        # hence just returning a copy of the callable.
+        return (
+                self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                program_callables_info)

-    # OpenCL has min(), max() for integer types
-    if name in ["max", "min"] and len(arg_dtypes) == 2:
-        dtype = np.find_common_type(
-                [], [dtype.numpy_dtype for dtype in arg_dtypes])
-
-        if dtype.kind == "i":
-            result_dtype = NumpyType(dtype)
-            return CallMangleInfo(
-                    target_name=name,
-                    result_dtypes=(result_dtype,),
-                    arg_dtypes=2*(result_dtype,))
-
-    if name == "dot":
-        scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(NumpyType(scalar_dtype),),
-                arg_dtypes=(arg_dtypes[0],)*2)
-
-    if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
-        num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
-        if len(arg_dtypes) != num_args:
-            raise LoopyError("%s takes %d arguments (%d received)"
-                    % (name, num_args, len(arg_dtypes)))
-
-        dtype = np.find_common_type(
-                [], [dtype.numpy_dtype for dtype in arg_dtypes])
-
-        if dtype.kind == "c":
-            raise LoopyError("%s does not support complex numbers"
-                    % name)
-
-        result_dtype = NumpyType(dtype)
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(result_dtype,),
-                arg_dtypes=(result_dtype,)*num_args)
-
-    if name in VECTOR_LITERAL_FUNCS:
-        base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
-
-        if count != len(arg_dtypes):
-            return None
-
-        return CallMangleInfo(
-                target_name="(%s%d) " % (base_tp_name, count),
-                result_dtypes=(kernel.target.vector_dtype(
-                    NumpyType(dtype), count),),
-                arg_dtypes=(NumpyType(dtype),)*count)
+
+def scope_opencl_functions(target, identifier):
+    """
+    Returns an instance of :class:`InKernelCallable` if the function defined by
+    *identifier* is known in OpenCL.
+    """
+    opencl_function_ids = set(["max", "min", "dot"]) | set(
+            _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS)
+
+    if identifier in opencl_function_ids:
+        return OpenCLCallable(name=identifier)

     return None

@@ -280,6 +356,7 @@ def opencl_preamble_generator(preamble_info):
     from loopy.tools import remove_common_indentation
     kernel = preamble_info.kernel
+
     yield ("00_declare_gid_lid",
            remove_common_indentation("""
                #define lid(N) ((%(idx_ctype)s) get_local_id(N))
@@ -365,13 +442,10 @@ class OpenCLTarget(CTarget):
 class OpenCLCASTBuilder(CASTBuilder):
     # {{{ library

-    def function_manglers(self):
+    def function_scopers(self):
         return (
-                [
-                    opencl_function_mangler,
-                    partial(c_math_mangler, modify_name=False)
-                ] +
-                super(OpenCLCASTBuilder, self).function_manglers())
+                [scope_opencl_functions] + super(
+                    OpenCLCASTBuilder, self).function_scopers())

     def symbol_manglers(self):
         return (
@@ -380,13 +454,10 @@ class OpenCLCASTBuilder(CASTBuilder):
             ])

     def preamble_generators(self):
-        from loopy.library.reduction import reduction_preamble_generator
         return (
                 super(OpenCLCASTBuilder, self).preamble_generators() + [
-                    opencl_preamble_generator,
-                    reduction_preamble_generator,
-                    ])
+                    opencl_preamble_generator])

     # }}}

@@ -399,6 +470,11 @@ class OpenCLCASTBuilder(CASTBuilder):
         from loopy.target.c import FunctionDeclarationWrapper
         assert isinstance(fdecl, FunctionDeclarationWrapper)

+        if not codegen_state.kernel.is_called_from_host:
+            # auxiliary kernels need not mention opencl specific qualifiers
+            # for a function's signature
+            return fdecl
+
         fdecl = fdecl.subdecl

         from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
@@ -407,7 +483,8 @@ class OpenCLCASTBuilder(CASTBuilder):
         from loopy.schedule import get_insn_ids_for_block_at
         _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.program_callables_info) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 73e8e0092..03ba26930 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -31,12 +31,12 @@ from six.moves import range import numpy as np -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -134,7 +134,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, device): +def check_sizes(kernel, program_callables_info, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -151,7 +151,8 @@ def check_sizes(kernel, device): if isinstance(arg, lp.ValueArg) and arg.approximately is not None: parameters[arg.name] = arg.approximately - glens, llens = kernel.get_grid_size_upper_bounds_as_exprs() + glens, llens = ( + kernel.get_grid_size_upper_bounds_as_exprs(program_callables_info)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -199,37 +200,89 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +# {{{ pyopencl function scopers - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" +class PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered by + :class:`loopy.target.opencl.OpenCLCallable` + """ + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + + name = self.name + + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." % name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = arg_id_to_dtype[0] + + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}), + program_callables_info) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj", "abs"]: + if dtype.is_complex(): + # function parameters are complex. 
+ if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target="%s_%s" % (tpname, name), + arg_id_to_dtype={0: dtype, -1: dtype}), + program_callables_info) else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) - - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name="%s_%s" % (tpname, name), - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) + # function calls for floating parameters. + numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ('u', 'i'): + dtype = dtype.copy(numpy_dtype=np.float32) + if name == 'abs': + name = 'fabs' + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}), + program_callables_info) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + +def pyopencl_function_scoper(target, identifier): + if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"]: + return PyOpenCLCallable(name=identifier) return None +# }}} + # {{{ preamble generator @@ -344,8 +397,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel): - check_sizes(kernel, self.device) + def pre_codegen_check(self, kernel, program_callables_info): + check_sizes(kernel, program_callables_info, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) @@ -739,19 +792,15 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler + def function_scopers(self): + from loopy.library.random123 import random123_function_scoper return ( - super(PyOpenCLCASTBuilder, self).function_manglers() + [ - pyopencl_function_mangler, - random123_function_mangler - ]) + [pyopencl_function_scoper, random123_function_scoper] + super( + PyOpenCLCASTBuilder, self).function_scopers()) def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super(PyOpenCLCASTBuilder, self).preamble_generators()) # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 27be61987..380ab1d9f 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -151,9 +151,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation - def generate_invocation(self, gen, kernel_name, args, - kernel, implemented_data_info): - if kernel.options.cl_exec_manage_array_events: + def generate_invocation(self, gen, program_name, args, + program, implemented_data_info): + if program.root_kernel.options.cl_exec_manage_array_events: gen(""" if wait_for is None: wait_for = [] @@ -169,20 +169,21 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("") - gen("_lpy_evt = {kernel_name}({args})" + gen("_lpy_evt = {program_name}({args})" .format( - kernel_name=kernel_name, + program_name=program_name, args=", 
".join( ["_lpy_cl_kernels", "queue"] + args + ["wait_for=wait_for"]))) - if kernel.options.cl_exec_manage_array_events: + if program.root_kernel.options.cl_exec_manage_array_events: gen("") from loopy.kernel.data import ArrayArg for arg in implemented_data_info: if (issubclass(arg.arg_class, ArrayArg) - and arg.base_name in kernel.get_written_variables()): + and arg.base_name in ( + program.root_kernel.get_written_variables())): gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) # }}} @@ -190,7 +191,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, kernel, implemented_data_info): + self, gen, options, program, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -207,7 +208,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if not issubclass(arg.arg_class, KernelArgument): continue - is_written = arg.base_name in kernel.get_written_variables() + is_written = arg.base_name in ( + program.root_kernel.get_written_variables()) if is_written: gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) @@ -218,12 +220,13 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + program.root_kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in program.root_kernel.get_written_variables()] if out_args: gen("return _lpy_evt, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -252,7 +255,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, program): """ :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels @@ -261,40 +264,40 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. 
""" - super(PyOpenCLKernelExecutor, self).__init__(kernel) + super(PyOpenCLKernelExecutor, self).__init__(program) self.context = context from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(kernel.target, PyOpenCLTarget): - self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0])) + if isinstance(program.target, PyOpenCLTarget): + self.program = program.copy(target=PyOpenCLTarget(context.devices[0])) def get_invoker_uncached(self, kernel, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() return generator(kernel, codegen_result) @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + program = self.get_typed_and_scheduled_program(arg_to_dtype_set) from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() - if self.kernel.options.write_cl: + if self.program.root_kernel.options.write_cl: output = dev_code - if self.kernel.options.highlight_cl: + if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program.root_kernel.options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program.root_kernel.options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") @@ -302,17 +305,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): cl_program = ( cl.Program(self.context, dev_code) - .build(options=kernel.options.cl_build_options)) + .build(options=program.root_kernel.options.cl_build_options)) cl_kernels = _Kernels() for dp in codegen_result.device_programs: setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) return _KernelInfo( - kernel=kernel, + program=program, cl_kernels=cl_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + invoker=self.get_invoker(program, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -347,10 +350,10 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(self.arg_to_dtype_set(kwargs)) - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, + return program_info.invoker( + program_info.cl_kernels, queue, allocator, wait_for, out_host, **kwargs) # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index ce04986d3..cd6e61167 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -44,7 +44,8 @@ class ExpressionToPythonMapper(StringifyMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeInferenceMapper(self.kernel, + self.codegen_state.program_callables_info) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -82,47 +83,37 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable 
from pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.codegen_state.program_callables_info[ + expr.function.name].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + from loopy.kernel.function_interface import ManglerCallable + in_knl_callable = self.codegen_state.program_callables_info[ + expr.function.name] + if isinstance(in_knl_callable, ManglerCallable): + from loopy.codegen import SeenFunction + mangle_result = in_knl_callable.mangle_result(self.kernel) + self.codegen_state.seen_functions.add( + SeenFunction(identifier_name, + mangle_result.target_name, + mangle_result.arg_dtypes)) str_parameters = None + number_of_assignees = len([key for key in + in_knl_callable.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes)) - - return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -189,11 +180,11 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_manglers(self): + def function_scopers(self): + from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_manglers() + [ - _numpy_single_arg_function_mangler, - ]) + super(PythonASTBuilderBase, self).function_scopers() + + [scope_c_math_functions]) def preamble_generators(self): return ( diff --git a/loopy/tools.py b/loopy/tools.py index 8c5d36390..b243a7949 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -73,7 +73,8 @@ class LoopyKeyBuilder(KeyBuilderBase): def update_for_dict(self, key_hash, key): # Order matters for the hash--insert in sorted order. - for dict_key in sorted(six.iterkeys(key)): + for dict_key in sorted(six.iterkeys(key), key=lambda obj: + type(obj).__name__ + str(obj)): self.rec(key_hash, (dict_key, key[dict_key])) update_for_defaultdict = update_for_dict diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index cfbbd56e9..38bb21850 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -26,6 +26,8 @@ THE SOFTWARE. 
from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel __doc__ = """ .. currentmodule:: loopy @@ -36,8 +38,10 @@ __doc__ = """ # {{{ add_barrier -def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, - tags=None, synchronization_kind="global", mem_kind=None): +@iterate_over_kernels_if_given_program +def add_barrier(knl, insn_before="", insn_after="", + id_based_on=None, tags=None, synchronization_kind="global", + mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel which has a barrier inserted into it. It takes input of 2 instructions and then adds a barrier in between those 2 instructions. The expressions can @@ -55,6 +59,8 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, for "global" bariers. If not supplied, defaults to :arg:`synchronization_kind` """ + assert isinstance(knl, LoopKernel) + if mem_kind is None: mem_kind = synchronization_kind @@ -76,7 +82,7 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None, mem_kind=mem_kind) new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add]) - new_knl = add_dependency(kernel=new_knl, + new_knl = add_dependency(new_knl, insn_match=insn_after, depends_on="id:"+id) diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index b7f47c38a..3df86e7ae 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -27,9 +27,13 @@ import six from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + # {{{ fold constants +@iterate_over_kernels_if_given_program def fold_constants(kernel): from loopy.symbolic import ConstantFoldingMapper cfm = ConstantFoldingMapper() @@ -53,7 +57,9 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented +@iterate_over_kernels_if_given_program def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): + assert isinstance(kernel, LoopKernel) # FIXME: Does not understand subst rules for now if kernel.substitutions: from loopy.transform.subst import expand_subst diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index f0b9814c4..970547003 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -29,6 +29,9 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program + + __doc__ = """ .. currentmodule:: loopy @@ -102,8 +105,9 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) -def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch", - sequential=False): +@iterate_over_kernels_if_given_program +def to_batched(knl, nbatches, batch_varying_args, + batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. .. 
note:: diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 801da4c13..57c4397f9 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -33,6 +33,9 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import ScalarCallable, CallableKernel from pymbolic import var @@ -130,10 +133,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array(kernel, var_name, buffer_inames, init_expression=None, - store_expression=None, within=None, default_tag="l.auto", - temporary_scope=None, temporary_is_local=None, - fetch_bounding_box=False): +def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, + buffer_inames, init_expression=None, store_expression=None, + within=None, default_tag="l.auto", temporary_scope=None, + temporary_is_local=None, fetch_bounding_box=False): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -169,6 +172,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, fetched. """ + assert isinstance(kernel, LoopKernel) + # {{{ unify temporary_scope / temporary_is_local from loopy.kernel.data import AddressSpace @@ -240,7 +245,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, from loopy.preprocess import prepare_for_caching key_kernel = prepare_for_caching(kernel) - cache_key = (key_kernel, var_name, tuple(buffer_inames), + cache_key = (key_kernel, var_name, + tuple(buffer_inames), PymbolicExpressionHashWrapper(init_expression), PymbolicExpressionHashWrapper(store_expression), within, default_tag, temporary_scope, fetch_bounding_box) @@ -528,7 +534,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, program_callables_info) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -537,4 +543,29 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, return kernel + +def buffer_array(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = buffer_array_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 5b1ee6cca..5f4f2f2a7 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -30,6 +30,9 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable # {{{ convenience: add_prefetch @@ -140,7 +143,8 @@ class _not_provided: # noqa: N801 pass -def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, +def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, + sweep_inames=[], dim_arg_names=None, # "None" is a valid value here, distinct from the default. default_tag=_not_provided, @@ -239,6 +243,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, This function internally uses :func:`extract_subst` and :func:`precompute`. """ + assert isinstance(kernel, LoopKernel) # {{{ fish indexing out of var_name and into footprint_subscripts @@ -328,9 +333,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, # precompute module, but precompute acutally uses that to adjust its # warning message. - from loopy.transform.precompute import precompute - new_kernel = precompute(kernel, subst_use, sweep_inames, - precompute_inames=dim_arg_names, + from loopy.transform.precompute import precompute_for_single_kernel + new_kernel = precompute_for_single_kernel(kernel, program_callables_info, + subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, temporary_name=temporary_name, @@ -363,6 +368,31 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, else: return new_kernel + +def add_prefetch(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = add_prefetch_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # }}} @@ -385,6 +415,7 @@ def change_arg_to_image(knl, name): # {{{ tag array axes +@iterate_over_kernels_if_given_program def tag_array_axes(knl, ary_names, dim_tags): """ .. 
versionchanged:: 2016.2
@@ -414,13 +445,15 @@ def tag_array_axes(knl, ary_names, dim_tags):
     return knl

-tag_data_axes = MovedFunctionDeprecationWrapper(tag_array_axes)
+tag_data_axes = (
+        MovedFunctionDeprecationWrapper(tag_array_axes))

 # }}}

 # {{{ set_array_axis_names

+@iterate_over_kernels_if_given_program
 def set_array_axis_names(kernel, ary_names, dim_names):
     """
     .. versionchanged:: 2016.2
@@ -445,13 +478,15 @@ def set_array_axis_names(kernel, ary_names, dim_names):
     return kernel

-set_array_dim_names = MovedFunctionDeprecationWrapper(set_array_axis_names)
+set_array_dim_names = (MovedFunctionDeprecationWrapper(
+    set_array_axis_names))

 # }}}

 # {{{ remove_unused_arguments

+@iterate_over_kernels_if_given_program
 def remove_unused_arguments(knl):
     new_args = []

@@ -493,6 +528,7 @@ def remove_unused_arguments(knl):

 # {{{ alias_temporaries

+@iterate_over_kernels_if_given_program
 def alias_temporaries(knl, names, base_name_prefix=None,
         synchronize_for_exclusive_use=True):
     """Sets all temporaries given by *names* to be backed by a single piece of
@@ -577,11 +613,14 @@ def alias_temporaries(knl, names, base_name_prefix=None,

 # {{{ set argument order

+@iterate_over_kernels_if_given_program
 def set_argument_order(kernel, arg_names):
     """
     :arg arg_names: A list (or comma-separated string) or argument names.
         All arguments must be in this list.
     """
+    #FIXME: @inducer -- should this only affect the root kernel, or should it
+    # take a within?

     if isinstance(arg_names, str):
         arg_names = arg_names.split(",")
@@ -610,6 +649,7 @@ def set_argument_order(kernel, arg_names):

 # {{{ rename argument

+@iterate_over_kernels_if_given_program
 def rename_argument(kernel, old_name, new_name, existing_ok=False):
     """
     .. versionadded:: 2016.2
@@ -655,6 +695,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False):

 # {{{ set temporary scope

+@iterate_over_kernels_if_given_program
 def set_temporary_scope(kernel, temp_var_names, scope):
     """
     :arg temp_var_names: a container with membership checking,
@@ -696,6 +737,7 @@ def set_temporary_scope(kernel, temp_var_names, scope):

 # {{{ reduction_arg_to_subst_rule

+@iterate_over_kernels_if_given_program
 def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None):
     if isinstance(inames, str):
         inames = [s.strip() for s in inames.split(",")]
diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py
index d4dcb3701..54d06605a 100644
--- a/loopy/transform/diff.py
+++ b/loopy/transform/diff.py
@@ -33,6 +33,7 @@ import loopy as lp
 from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext
 from loopy.isl_helpers import make_slab
 from loopy.diagnostic import LoopyError
+from loopy.kernel import LoopKernel

 # {{{ diff mapper

@@ -370,6 +371,8 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i",
         *diff_context.by_name*, or *None* if no dependency exists.
""" + assert isinstance(knl, LoopKernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=True) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 49e30a751..d43ce025b 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -31,6 +31,10 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from pymbolic import var +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from loopy.program import rename_resolved_functions_in_a_single_kernel + def _apply_renames_in_exprs(kernel, var_renames): from loopy.symbolic import ( @@ -287,7 +291,7 @@ def _fuse_two_kernels(knla, knlb): # }}} -def fuse_kernels(kernels, suffixes=None, data_flow=None): +def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): """Return a kernel that performs all the operations in all entries of *kernels*. @@ -331,6 +335,8 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + + assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) if data_flow is None: @@ -411,4 +417,52 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): return result + +def fuse_kernels(programs, suffixes=None, data_flow=None): + main_prog_callables_info = ( + programs[0].program_callables_info.with_edit_callables_mode()) + old_root_kernel_callable = ( + programs[0].program_callables_info[programs[0].name]) + kernels = [programs[0].root_kernel] + + # removing the callable collisions that maybe present + for prog in programs[1:]: + root_kernel = prog.root_kernel + renames_needed = {} + for old_func_id, in_knl_callable in prog.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + if in_knl_callable.name != prog.name: + raise LoopyError("fuse_kernels cannot fuse programs with " + "multiple callable kernels.") + continue + num_times_called = ( + prog.program_callables_info.num_times_callables_called[ + old_func_id]) + for i in range(num_times_called): + main_prog_callables_info, new_func_id = ( + main_prog_callables_info.with_callables(var(old_func_id), + in_knl_callable, True)) + + if old_func_id != new_func_id: + renames_needed[old_func_id] = new_func_id + + if renames_needed: + root_kernel = rename_resolved_functions_in_a_single_kernel( + root_kernel, renames_needed) + + kernels.append(root_kernel) + + new_root_kernel = fuse_loop_kernels(kernels, suffixes, data_flow) + new_root_kernel_callable = old_root_kernel_callable.copy( + subkernel=new_root_kernel.copy(name=programs[0].name)) + + main_prog_callables_info, _ = main_prog_callables_info.with_callable( + var(programs[0].name), new_root_kernel_callable) + + main_prog_callables_info = ( + main_prog_callables_info.with_exit_edit_callables_mode()) + + return programs[0].copy( + program_callables_info=main_prog_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 2b618a464..93f6c53e8 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -34,6 +34,10 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + __doc__ = """ .. 
currentmodule:: loopy @@ -93,6 +97,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) +@iterate_over_kernels_if_given_program def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. If the @@ -107,6 +112,8 @@ def prioritize_loops(kernel, loop_priority): :arg: an iterable of inames, or, for brevity, a comma-separated string of inames """ + + assert isinstance(kernel, LoopKernel) if isinstance(loop_priority, str): loop_priority = tuple(s.strip() for s in loop_priority.split(",") if s.strip()) @@ -299,13 +306,15 @@ def _split_iname_backend(kernel, split_iname, kernel = tag_inames(kernel, {outer_iname: existing_tag, inner_iname: existing_tag}) - return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag}) + return tag_inames(kernel, {outer_iname: outer_tag, + inner_iname: inner_tag}) # }}} # {{{ split iname +@iterate_over_kernels_if_given_program def split_iname(kernel, split_iname, inner_length, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -331,6 +340,8 @@ def split_iname(kernel, split_iname, inner_length, :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. """ + assert isinstance(kernel, LoopKernel) + def make_new_loop_index(inner, outer): return inner + outer*inner_length @@ -347,6 +358,7 @@ def split_iname(kernel, split_iname, inner_length, # {{{ chunk iname +@iterate_over_kernels_if_given_program def chunk_iname(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -481,6 +493,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): return super(_InameJoiner, self).map_reduction(expr, expn_state) +@iterate_over_kernels_if_given_program def join_inames(kernel, inames, new_iname=None, tag=None, within=None): """ :arg inames: fastest varying last @@ -625,7 +638,9 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): +@iterate_over_kernels_if_given_program +def tag_inames(kernel, iname_to_tag, force=False, + ignore_nonexistent=False): """Tag an iname :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. 
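# --- Illustrative usage sketch, not part of the patch: because split_iname,
# prioritize_loops, tag_inames, etc. are now decorated with
# iterate_over_kernels_if_given_program, user code can keep calling them on the
# top-level object, which is now a Program. The kernel below is made up for
# illustration.
import numpy as np
import loopy as lp

prog = lp.make_kernel("{[i]: 0<=i<64}", "out[i] = 2*a[i]")
prog = lp.add_dtypes(prog, {"a": np.float32})

prog = lp.split_iname(prog, "i", 16)
prog = lp.prioritize_loops(prog, "i_outer,i_inner")
# ---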
*new_tag* is given @@ -804,7 +819,9 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) -def duplicate_inames(knl, inames, within, new_inames=None, suffix=None, +@iterate_over_kernels_if_given_program +def duplicate_inames(knl, inames, within, new_inames=None, + suffix=None, tags={}): """ :arg within: a stack match as understood by @@ -966,7 +983,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(knl, use_boostable_into=False): +def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1032,7 +1049,7 @@ def get_iname_duplication_options(knl, use_boostable_into=False): # If we find a duplication option and to not use boostable_into # information, we restart this generator with use_boostable_into=True if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options(knl, True): + for option in get_iname_duplication_options_for_single_kernel(knl, True): yield option # Emit a warning that we needed boostable_into @@ -1060,18 +1077,42 @@ def get_iname_duplication_options(knl, use_boostable_into=False): yield iname, within -def has_schedulable_iname_nesting(knl): +def get_iname_duplication_options(program, use_boostable_into=False): + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + for option in get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into): + yield option + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of in kernel callable %s." + % (type(in_knl_callable))) + + return + + +def has_schedulable_iname_nesting_for_single_kernel(knl): """ :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. 
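# --- Illustrative usage sketch, not part of the patch: the usual "duplicate
# inames until the kernel becomes schedulable" loop, now driven by the
# program-level wrappers defined around this hunk. For this toy kernel the loop
# body never runs; the snippet is only meant to show the intended call pattern.
import loopy as lp

prog = lp.make_kernel("{[i, j]: 0<=i,j<4}", "out[i, j] = a[i, j]")

if not lp.has_schedulable_iname_nesting(prog):
    for iname, within in lp.get_iname_duplication_options(prog):
        prog = lp.duplicate_inames(prog, iname, within)
        if lp.has_schedulable_iname_nesting(prog):
            break
# ---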
""" - return not bool(next(get_iname_duplication_options(knl), False)) + return not bool(next(get_iname_duplication_options_for_single_kernel(knl), + False)) + + +def has_schedulable_iname_nesting(program): + return all(has_schedulable_iname_nesting_for_single_kernel( + in_knl_callable.subkernel) for in_knl_callable in + program.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel)) # }}} # {{{ rename_inames +@iterate_over_kernels_if_given_program def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by @@ -1278,6 +1319,7 @@ def _split_reduction(kernel, inames, direction, within=None): rsplit.map_kernel(kernel)) +@iterate_over_kernels_if_given_program def split_reduction_inward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1297,6 +1339,7 @@ def split_reduction_inward(kernel, inames, within=None): return _split_reduction(kernel, inames, "in", within) +@iterate_over_kernels_if_given_program def split_reduction_outward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1320,6 +1363,7 @@ def split_reduction_outward(kernel, inames, within=None): # {{{ affine map inames +@iterate_over_kernels_if_given_program def affine_map_inames(kernel, old_inames, new_inames, equations): """Return a new *kernel* where the affine transform specified by *equations* has been applied to the inames. @@ -1651,6 +1695,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expr, expn_state) +@iterate_over_kernels_if_given_program def make_reduction_inames_unique(kernel, inames=None, within=None): """ :arg inames: if not *None*, only apply to these inames @@ -1697,6 +1742,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn +@iterate_over_kernels_if_given_program def add_inames_to_insn(knl, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index e6ecb4093..93cf932b1 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -25,15 +25,35 @@ THE SOFTWARE. import six # noqa from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) +from loopy.program import Program, iterate_over_kernels_if_given_program # {{{ find_instructions -def find_instructions(kernel, insn_match): +def find_instructions_in_single_kernel(kernel, insn_match): + assert isinstance(kernel, LoopKernel) from loopy.match import parse_match match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] + +def find_instructions(program, insn_match): + assert isinstance(program, Program) + insns = [] + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + insns += (find_instructions_in_single_kernel( + in_knl_callable.subkernel, insn_match)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." % ( + type(in_knl_callable))) + + return insns + # }}} @@ -58,6 +78,7 @@ def map_instructions(kernel, insn_match, f): # {{{ set_instruction_priority +@iterate_over_kernels_if_given_program def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. 
@@ -75,6 +96,7 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency +@iterate_over_kernels_if_given_program def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. @@ -92,7 +114,8 @@ def add_dependency(kernel, insn_match, depends_on): added_deps = frozenset([depends_on]) else: added_deps = frozenset( - dep.id for dep in find_instructions(kernel, depends_on)) + dep.id for dep in find_instructions_in_single_kernel(kernel, + depends_on)) if not added_deps: raise LoopyError("no instructions found matching '%s' " @@ -209,6 +232,7 @@ def replace_instruction_ids(kernel, replacements): # {{{ tag_instructions +@iterate_over_kernels_if_given_program def tag_instructions(kernel, new_tag, within=None): from loopy.match import parse_match within = parse_match(within) @@ -228,6 +252,7 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync +@iterate_over_kernels_if_given_program def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. @@ -260,18 +285,21 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, This used to silently pass. This behavior can be restored using *empty_ok*. """ + assert isinstance(kernel, LoopKernel) if isinstance(source, str) and source in kernel.id_to_insn: sources = frozenset([source]) else: sources = frozenset( - source.id for source in find_instructions(kernel, source)) + source.id for source in find_instructions_in_single_kernel( + kernel, source)) if isinstance(sink, str) and sink in kernel.id_to_insn: sinks = frozenset([sink]) else: sinks = frozenset( - sink.id for sink in find_instructions(kernel, sink)) + sink.id for sink in find_instructions_in_single_kernel( + kernel, sink)) if not sources and not empty_ok: raise LoopyError("No match found for source specification '%s'." % source) @@ -324,6 +352,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, # {{{ uniquify_instruction_ids +@iterate_over_kernels_if_given_program def uniquify_instruction_ids(kernel): """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique strings. diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index d695e3595..3e5e4a43b 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -28,6 +28,9 @@ THE SOFTWARE. 
from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + class ArrayAxisSplitHelper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, arg_names, handler): @@ -44,7 +47,9 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): # {{{ split_array_dim (deprecated since June 2016) -def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, +@iterate_over_kernels_if_given_program +def split_array_dim(kernel, arrays_and_axes, count, + auto_split_inames=True, split_kwargs=None): """ :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating @@ -237,7 +242,7 @@ def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) if auto_split_inames: - from loopy import split_iname + from loopy.transform.iname import split_iname for iname, (outer_iname, inner_iname) in six.iteritems(split_vars): kernel = split_iname(kernel, iname, count, outer_iname=outer_iname, inner_iname=inner_iname, @@ -370,7 +375,9 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -def split_array_axis(kernel, array_names, axis_nr, count, order="C"): +@iterate_over_kernels_if_given_program +def split_array_axis(kernel, array_names, axis_nr, count, + order="C"): """ :arg array: a list of names of temporary variables or arguments. May also be a comma-separated string of these. @@ -387,6 +394,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): There was a more complicated, dumber function called :func:`split_array_dim` that had the role of this function in versions prior to 2016.2. """ + assert isinstance(kernel, LoopKernel) if isinstance(array_names, str): array_names = [i.strip() for i in array_names.split(",") if i.strip()] @@ -439,6 +447,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 # {{{ add_padding +@iterate_over_kernels_if_given_program def add_padding(kernel, variable, axis, align_bytes): arg_to_idx = dict((arg.name, i) for i, arg in enumerate(kernel.args)) arg_idx = arg_to_idx[variable] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index fc5dad91d..b7d017ec8 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -28,6 +28,9 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + __doc__ = """ .. currentmodule:: loopy @@ -40,6 +43,7 @@ __doc__ = """ # {{{ assume +@iterate_over_kernels_if_given_program def assume(kernel, assumptions): """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. `n mod 4 = 0`. @@ -134,6 +138,7 @@ def _fix_parameter(kernel, name, value): )) +@iterate_over_kernels_if_given_program def fix_parameters(kernel, **value_dict): """Fix the values of the arguments to specific constants. @@ -141,6 +146,7 @@ def fix_parameters(kernel, **value_dict): to be *value*. *name* may refer to :ref:`domain-parameters` or :ref:`arguments`. 
""" + assert isinstance(kernel, LoopKernel) for name, value in six.iteritems(value_dict): kernel = _fix_parameter(kernel, name, value) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 52d568975..66c7114ae 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -38,6 +38,9 @@ from pymbolic import var from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + class RuleAccessDescriptor(AccessDescriptor): __slots__ = ["args", "expansion_stack"] @@ -258,9 +261,9 @@ class _not_provided(object): # noqa: N801 pass -def precompute(kernel, subst_use, sweep_inames=[], within=None, - storage_axes=None, temporary_name=None, precompute_inames=None, - precompute_outer_inames=None, +def precompute_for_single_kernel(kernel, program_callables_info, subst_use, + sweep_inames=[], within=None, storage_axes=None, temporary_name=None, + precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, # "None" is a valid value here, distinct from the default. @@ -1037,15 +1040,40 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} - from loopy import tag_inames + from loopy.transform.iname import tag_inames kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, program_callables_info) return kernel + +def precompute(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = precompute_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + # vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index cca62bc52..4b957b033 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -64,7 +64,7 @@ class LivenessAnalysis(object): def __init__(self, kernel): self.kernel = kernel - self.schedule = self.kernel.schedule + self.schedule = kernel.schedule @memoize_method def get_successor_relation(self): @@ -235,8 +235,9 @@ class TemporarySaver(object): def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel): + def __init__(self, kernel, program_callables_info): self.kernel = kernel + self.program_callables_info = program_callables_info self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -439,7 +440,8 @@ class TemporarySaver(object): return (), () group_sizes, local_sizes = ( - self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, + self.program_callables_info)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. @@ -628,7 +630,7 @@ class TemporarySaver(object): kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel) + return assign_automatic_axes(kernel, self.program_callables_info) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -722,7 +724,7 @@ class TemporarySaver(object): # {{{ auto save and reload across kernel calls -def save_and_reload_temporaries(knl): +def save_and_reload_temporaries(program): """ Add instructions to save and reload temporary variables that are live across kernel calls. 
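# --- Illustrative usage sketch, not part of the patch:
# save_and_reload_temporaries now takes and returns a Program. ``prog`` stands
# for a hypothetical program whose root kernel has temporaries that are live
# across a global barrier; if its root kernel has not been scheduled yet, the
# transform preprocesses and schedules it internally (see the hunk above).
#
#     prog = lp.preprocess_program(prog)
#     prog = lp.save_and_reload_temporaries(prog)
#     # prog.root_kernel now contains explicit save/reload instructions for the
#     # temporaries that were live across kernel calls.
# ---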
@@ -745,8 +747,19 @@ def save_and_reload_temporaries(knl): :returns: The resulting kernel """ + + knl = program.root_kernel + + if not knl.schedule: + program = lp.preprocess_program(program) + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(program.root_kernel, + program.program_callables_info) + + assert knl.schedule is not None + liveness = LivenessAnalysis(knl) - saver = TemporarySaver(knl) + saver = TemporarySaver(knl, program.program_callables_info) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) @@ -784,7 +797,7 @@ def save_and_reload_temporaries(knl): .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) - return saver.finish() + return program.with_root_kernel(saver.finish()) # }}} diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index a681afe06..afe3fec59 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -33,6 +33,9 @@ from pymbolic.mapper.substitutor import make_subst_func from pytools import ImmutableRecord from pymbolic import var +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -44,6 +47,7 @@ class ExprDescriptor(ImmutableRecord): # {{{ extract_subst +@iterate_over_kernels_if_given_program def extract_subst(kernel, subst_name, template, parameters=()): """ :arg subst_name: The name of the substitution rule to be created. @@ -285,6 +289,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) +@iterate_over_kernels_if_given_program def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): """Extract an assignment (to a temporary variable or an argument) @@ -468,7 +473,9 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst +@iterate_over_kernels_if_given_program def expand_subst(kernel, within=None): + assert isinstance(kernel, LoopKernel) if not kernel.substitutions: return kernel @@ -501,8 +508,17 @@ def find_rules_matching(knl, pattern): return [r for r in knl.substitutions if pattern.match(r)] -def find_one_rule_matching(knl, pattern): - rules = find_rules_matching(knl, pattern) +def find_one_rule_matching(program, pattern): + rules = [] + for in_knl_callable in program.program_callables_info.values(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + rules.extend(find_rules_matching(knl, pattern)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable types %s." 
% ( + type(in_knl_callable).__name__)) if len(rules) > 1: raise ValueError("more than one substitution rule matched '%s'" diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 010a0658f..0e8fa3053 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -33,6 +33,11 @@ from loopy.types import NumpyType from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.kernel.instruction import _DataObliviousInstruction + +from loopy.program import ProgramCallablesInfo +from loopy.symbolic import SubArrayRef, LinearSubscript +from pymbolic.primitives import Variable, Subscript, Lookup import logging logger = logging.getLogger(__name__) @@ -44,10 +49,23 @@ def _debug(kernel, s, *args): logger.debug("%s: %s" % (kernel.name, logstr)) +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :param arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. + """ + return_arg_id_to_dtype = dict((id, dtype) for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, new_assignments=None): + def __init__(self, kernel, program_callables_info, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -56,10 +74,13 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel + assert isinstance(program_callables_info, ProgramCallablesInfo) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() + self.program_callables_info = program_callables_info + self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -92,13 +113,16 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). 
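# --- Illustrative worked example, not part of the patch, for
# get_return_types_as_tuple defined in the hunk above (and assumed to be in
# scope here): non-negative keys are positional arguments, negative keys are
# return values (-1 being the first one), and the return values are emitted in
# that order.
import numpy as np

arg_id_to_dtype = {
    0: np.dtype(np.float32),    # first argument
    1: np.dtype(np.int32),      # second argument
    -1: np.dtype(np.float64),   # first return value
    -2: np.dtype(np.int64),     # second return value
}
assert get_return_types_as_tuple(arg_id_to_dtype) == (
        np.dtype(np.float64), np.dtype(np.int64))
# ---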
- def copy(self): - return type(self)(self.kernel, self.new_assignments) + def copy(self, program_callables_info=None): + if program_callables_info is None: + program_callables_info = self.program_callables_info + return type(self)(self.kernel, program_callables_info, + self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, new_ass) + return type(self)(self.kernel, self.program_callables_info, new_ass) @staticmethod def combine(dtype_sets): @@ -250,15 +274,20 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable + + from pymbolic.primitives import Variable, CallWithKwargs, Call + from loopy.symbolic import ResolvedFunction + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} identifier = expr.function - if isinstance(identifier, Variable): + if isinstance(identifier, (Variable, ResolvedFunction)): identifier = identifier.name - if identifier in ["indexof", "indexof_vec"]: - return [self.kernel.index_dtype] - def none_if_empty(d): if d: d, = d @@ -266,25 +295,145 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) - if None in arg_dtypes: - return [] + arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) + + # specializing the known function wrt type + if isinstance(expr.function, ResolvedFunction): + in_knl_callable = self.program_callables_info[expr.function.name] + + # {{{ checking that there is no overwriting of types of in_knl_callable + + if in_knl_callable.arg_id_to_dtype is not None: + + # specializing an already specialized function. 
+ for id, dtype in arg_id_to_dtype.items(): + if id in in_knl_callable.arg_id_to_dtype and ( + in_knl_callable.arg_id_to_dtype[id] != + arg_id_to_dtype[id]): + + # {{{ ignoring the the cases when there is a discrepancy + # between np.uint and np.int + + import numpy as np + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint32) and ( + arg_id_to_dtype[id].dtype.type == np.int32): + continue + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint64) and ( + arg_id_to_dtype[id].dtype.type == + np.int64): + continue + + # }}} + + raise LoopyError("Overwriting a specialized function " + "is illegal--maybe start with new instance of " + "InKernelCallable?") + + # }}} + + in_knl_callable, self.program_callables_info = ( + in_knl_callable.with_types( + arg_id_to_dtype, self.kernel, + self.program_callables_info)) + + in_knl_callable = in_knl_callable.with_target(self.kernel.target) + + # storing the type specialized function so that it can be used for + # later use + self.program_callables_info, new_function_id = ( + self.program_callables_info.with_callable( + expr.function.function, + in_knl_callable)) + + if isinstance(expr, Call): + self.old_calls_to_new_calls[expr] = new_function_id + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls[expr] = new_function_id + + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if new_arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(new_arg_id_to_dtype)] + else: + return [new_arg_id_to_dtype[-1]] + + elif isinstance(expr.function, Variable): + # Since, the function is not "scoped", attempt to infer using + # kernel.function_manglers + + # {{{ trying to infer using function manglers + + arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in + expr.parameters) + + # finding the function_mangler which would be associated with the + # realized function. + + mangle_result = None + for function_mangler in self.kernel.function_manglers: + mangle_result = function_mangler(self.kernel, identifier, + arg_dtypes) + if mangle_result: + # found a match. + break - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] - else: if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct assignments") + from loopy.kernel.function_interface import (ManglerCallable, + ValueArgDescriptor) + + # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes + arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) + for i, dt in enumerate(mangle_result.arg_dtypes)) + arg_id_to_dtype.update(dict((-i-1, + dtype.with_target(self.kernel.target)) for i, dtype in enumerate( + mangle_result.result_dtypes))) + arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.arg_dtypes)) + res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in + enumerate(mangle_result.result_dtypes)) + arg_id_to_descr = dict(arg_descrs+res_descrs) + + # creating the ManglerCallable object corresponding to the + # function. 
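# --- Illustrative worked example, not part of the patch: how the mangler
# fallback above translates a mangle result into the callable interface. For a
# mangle result with arg_dtypes=(float32, float32) and result_dtypes=(float64,),
# positional arguments get the ids 0, 1, ... and return values get -1, -2, ...:
#
#     arg_id_to_dtype = {0: float32, 1: float32, -1: float64}
#     arg_id_to_descr = {0: ValueArgDescriptor(),
#                        1: ValueArgDescriptor(),
#                        -1: ValueArgDescriptor()}
#
# These two dicts, together with the mangler itself, are what the
# ManglerCallable constructed below is built from.
# ---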
+ in_knl_callable = ManglerCallable( + identifier, function_mangler, arg_id_to_dtype, + arg_id_to_descr, mangle_result.target_name) + self.program_callables_info, new_function_id = ( + self.program_callables_info.with_callable( + expr.function, in_knl_callable, True)) + + if isinstance(expr, Call): + self.old_calls_to_new_calls[expr] = new_function_id + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls = new_function_id + + # Returning the type. + if return_tuple: + if mangle_result is not None: + return [mangle_result.result_dtypes] + else: + if mangle_result is not None: + if len(mangle_result.result_dtypes) != 1 and not return_tuple: + raise LoopyError("functions with more or fewer than one " + "return value may only be used in direct " + "assignments") - return [mangle_result.result_dtypes[0]] + return [mangle_result.result_dtypes[0]] + # }}} - raise RuntimeError("unable to resolve " - "function '%s' with %d given arguments" - % (identifier, len(arg_dtypes))) + return [] + + map_call_with_kwargs = map_call def map_variable(self, expr): if expr.name in self.kernel.all_inames(): @@ -399,14 +548,20 @@ class TypeInferenceMapper(CombineMapper): return [expr.operation.result_dtypes(self.kernel, rec_result)[0] for rec_result in rec_results] + def map_sub_array_ref(self, expr): + return self.rec(expr.get_begin_subscript()) + + # }}} # {{{ infer single variable def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + if var_name in kernel.all_params(): - return [kernel.index_dtype], [] + return [kernel.index_dtype], [], {}, ( + type_inf_mapper.program_callables_info) from functools import partial debug = partial(_debug, kernel) @@ -451,11 +606,15 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types + return ( + None, type_inf_mapper.symbols_with_unknown_types, None, + type_inf_mapper.program_callables_info) result = type_inf_mapper.combine(dtype_sets) - return result, type_inf_mapper.symbols_with_unknown_types + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.old_calls_to_new_calls, + type_inf_mapper.program_callables_info) # }}} @@ -482,7 +641,8 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types(kernel, expect_completion=False): +def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, + expect_completion=False): """Infer types on temporaries and arguments.""" logger.debug("%s: infer types" % kernel.name) @@ -544,7 +704,8 @@ def infer_unknown_types(kernel, expect_completion=False): new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, item_lookup) + type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info, + item_lookup) from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) @@ -553,6 +714,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + old_calls_to_new_calls = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -576,9 +739,12 @@ def infer_unknown_types(kernel, expect_completion=False): debug("inferring type for %s %s", type(item).__name__, item.name) - result, symbols_with_unavailable_types = ( + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, program_callables_info) = ( _infer_var_type( kernel, item.name, 
type_inf_mapper, subst_expander)) + type_inf_mapper = type_inf_mapper.copy( + program_callables_info=program_callables_info) failed = not result if not failed: @@ -597,6 +763,7 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + old_calls_to_new_calls.update(new_old_calls_to_new_calls) else: debug(" failure") @@ -635,23 +802,141 @@ def infer_unknown_types(kernel, expect_completion=False): # }}} + # FIXME: copy the explanation from make_function_ready_for_codegen + # here. + + # {{{ check if insn missed during type inference + + def _instruction_missed_during_inference(insn): + for assignee in insn.assignees: + if isinstance(assignee, Lookup): + assignee = assignee.aggregate + + if isinstance(assignee, Variable): + if assignee.name in kernel.arg_dict: + if kernel.arg_dict[assignee.name].dtype is None: + return False + else: + assert assignee.name in kernel.temporary_variables + if kernel.temporary_variables[assignee.name].dtype is None: + return False + + elif isinstance(assignee, (Subscript, LinearSubscript)): + if assignee.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[assignee.aggregate.name].dtype is None: + return False + else: + assert assignee.aggregate.name in kernel.temporary_variables + if kernel.temporary_variables[ + assignee.aggregate.name].dtype is None: + return False + else: + assert isinstance(assignee, SubArrayRef) + if assignee.subscript.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[ + assignee.subscript.aggregate.name].dtype is None: + return False + else: + assert assignee.subscript.aggregate.name in ( + kernel.temporary_variables) + if kernel.temporary_variables[ + assignee.subscript.aggregate.name] is None: + return False + + return True + + # }}} + + for insn in kernel.instructions: + if isinstance(insn, lp.MultiAssignmentBase): + # just a dummy run over the expression, to pass over all the + # functions + # FIXME: need a check over here which checks the instruction for + # unseen cases + if _instruction_missed_during_inference(insn): + type_inf_mapper(insn.expression, return_tuple=isinstance(insn, + lp.CallInstruction), return_dtype_set=True) + elif isinstance(insn, (_DataObliviousInstruction, + lp.CInstruction)): + pass + else: + raise NotImplementedError("Unknown instructions type %s." % ( + type(insn).__name__)) + + program_callables_info = type_inf_mapper.program_callables_info + old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) + end_time = time.time() logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) - return unexpanded_kernel.copy( + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) + # this has to be subsitutition + from loopy.kernel.function_interface import ( + change_names_of_pymbolic_calls) + type_specialized_kernel = change_names_of_pymbolic_calls( + pre_type_specialized_knl, old_calls_to_new_calls) + + # the check is unnecessary as we would first get TypeInfereceFailure before + # encountering this. Move this at the start once ManglerCallable is + # deprecated. + if expect_completion: + # if completion is expected, then it is important that all the + # callables are scoped. 
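# --- Illustrative usage sketch, not part of the patch: the program-level
# infer_unknown_types defined further down in this file accepts a Program (and,
# for now, still accepts a plain LoopKernel, which it wraps via
# make_program_from_kernel). The kernel below is made up for illustration.
import numpy as np
import loopy as lp

prog = lp.make_kernel("{[i]: 0<=i<10}", "out[i] = 2*a[i]")
prog = lp.add_dtypes(prog, {"a": np.float32})

prog = lp.infer_unknown_types(prog)
# prog is again a Program; the dtype of "out" in prog.root_kernel has now been
# inferred (float32 for this example).
# ---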
+ from loopy.check import check_functions_are_scoped + check_functions_are_scoped(type_specialized_kernel) + + return type_specialized_kernel, program_callables_info + + +def infer_unknown_types(program, expect_completion=False): + """Infer types on temporaries and arguments.""" + from loopy.kernel import LoopKernel + if isinstance(program, LoopKernel): + # FIXME: deprecate warning needed here + from loopy.program import make_program_from_kernel + program = make_program_from_kernel(program) + + program_callables_info = program.program_callables_info + + type_uninferred_knl_callable = ( + program_callables_info[program.name]) + type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel + + program_callables_info = ( + program.program_callables_info.with_edit_callables_mode()) + root_kernel, program_callables_info = ( + infer_unknown_types_for_a_single_kernel( + type_uninferred_root_kernel, + program_callables_info, expect_completion)) + + type_inferred_knl_callable = type_uninferred_knl_callable.copy( + subkernel=root_kernel) + + program_callables_info, _ = ( + program_callables_info.with_callable( + program.name, + type_inferred_knl_callable)) + + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + + # FIXME: maybe put all of this in a function? + # need to infer functions that were left out during inference + return program.copy(program_callables_info=program_callables_info) + # }}} # {{{ reduction expression helper def infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel) + kernel, expr, program_callables_info, unknown_types_ok): + type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info) import loopy as lp if expr.is_tuple_typed: @@ -682,7 +967,8 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( if dt is not lp.auto else dt for dt in reduction_dtypes) - return tuple(arg_dtypes), reduction_dtypes + return tuple(arg_dtypes), reduction_dtypes, ( + type_inf_mapper.program_callables_info) # }}} diff --git a/test/test_apps.py b/test/test_apps.py index e7f4004fa..a9c3bf2a7 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -216,7 +216,8 @@ def test_rob_stroud_bernstein(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." ], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -224,13 +225,12 @@ def test_rob_stroud_bernstein(ctx_factory): knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, - ))) + )) + print(lp.generate_code_v2(knl)) def test_rob_stroud_bernstein_full(ctx_factory): @@ -296,7 +296,8 @@ def test_rob_stroud_bernstein_full(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." 
], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -310,14 +311,14 @@ def test_rob_stroud_bernstein_full(ctx_factory): from pickle import dumps, loads knl = loads(dumps(knl)) - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, tmp=np.float32, coeffs=np.float32, result=np.float32, )) - print(knl) + print(lp.generate_code_v2(knl)) def test_stencil(ctx_factory): @@ -660,7 +661,7 @@ def test_domain_tree_nesting(): lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) - parents_per_domain = knl.parents_per_domain() + parents_per_domain = knl.root_kernel.parents_per_domain() def depth(i): if parents_per_domain[i] is None: diff --git a/test/test_c_execution.py b/test/test_c_execution.py index c355893e4..7c7df2557 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -76,6 +76,7 @@ def test_c_target_strides(): # test with C-order knl = __get_kernel('C') + lp.generate_code_v2(knl) a_np = np.reshape(np.arange(16 * 16, dtype=np.float32), (16, -1), order='C') diff --git a/test/test_diff.py b/test/test_diff.py index b735ab17a..a7fd92987 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -55,7 +55,7 @@ def test_diff(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel( + knl = lp.make_kernel_function( """{ [i,j]: 0<=i,j a = 1/(1+sinh(x[i] + y[j])**2) @@ -66,6 +66,7 @@ def test_diff(ctx_factory): from loopy.transform.diff import diff_kernel dknl, diff_map = diff_kernel(knl, "z", "x") + dknl = lp.make_program_from_kernel(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") diff --git a/test/test_domain.py b/test/test_domain.py index ebfde8509..dd789d2cd 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -61,20 +61,15 @@ def test_assume(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i 10") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) - assert "if" not in compiled.get_code() + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code def test_divisibility_assumption(ctx_factory): @@ -90,16 +85,14 @@ def test_divisibility_assumption(ctx_factory): lp.GlobalArg("b", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and (exists zz: n = 16*zz)") + assumptions="n>=1 and (exists zz: n = 16*zz)", + target=lp.PyOpenCLTarget(ctx.devices[0])) ref_knl = knl knl = lp.split_iname(knl, "i", 16) - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "if" not in code + code = lp.generate_code_v2(knl).device_code() + assert "if" not in code lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 16**3}) @@ -118,16 +111,12 @@ def test_eq_constraint(ctx_factory): [ lp.GlobalArg("a", np.float32, shape=(1000,)), lp.GlobalArg("b", np.float32, shape=(1000,)) - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 16, outer_tag="g.0") knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0") - - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for knl in kernel_gen: - 
print(lp.generate_code(knl)) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds(ctx_factory): @@ -150,12 +139,10 @@ def test_dependent_loop_bounds(ctx_factory): lp.GlobalArg("a_sum", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_2(ctx_factory): @@ -179,14 +166,13 @@ def test_dependent_loop_bounds_2(ctx_factory): lp.GlobalArg("ax", dtype, shape=lp.auto), lp.ValueArg("n", np.int32), ], - assumptions="n>=1 and row_len>=1") + assumptions="n>=1 and row_len>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + + print(lp.generate_code_v2(knl).device_code()) def test_dependent_loop_bounds_3(ctx_factory): @@ -211,25 +197,21 @@ def test_dependent_loop_bounds_3(ctx_factory): lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto), lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) - assert knl.parents_per_domain()[1] == 0 + assert knl.root_kernel.parents_per_domain()[1] == 0 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) + list(lp.generate_code_v2(knl_bad)) def test_dependent_loop_bounds_4(): @@ -291,11 +273,10 @@ def test_independent_multi_domain(ctx_factory): inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] + assert knl.root_kernel.parents_per_domain() == 2*[None] n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) + evt, (a, b) = knl(queue, n=n, out_host=True) assert a.shape == (50,) assert b.shape == (50,) @@ -396,10 +377,11 @@ def test_triangle_domain(ctx_factory): knl = lp.make_kernel( "{[i,j]: 0<=i,j bb = a[i] - b[i] @@ -122,16 +122,15 @@ def test_type_inference_no_artificial_doubles(ctx_factory): lp.GlobalArg("c", np.float32, shape=("n",)), lp.ValueArg("n", np.int32), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - code = lp.generate_code(k) - assert "double" not in code + code = lp.generate_code_v2(prog).device_code() + assert "double" not in code def test_type_inference_with_type_dependencies(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: i=0}", """ <>a = 99 @@ -143,13 +142,17 @@ def test_type_inference_with_type_dependencies(): <>d = b + 2 + 1j """, "...") - knl = 
lp.infer_unknown_types(knl) + prog = lp.infer_unknown_types(prog) from loopy.types import to_loopy_type - assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32) - assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32) - assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128) + assert prog.root_kernel.temporary_variables["a"].dtype == to_loopy_type( + np.int32) + assert prog.root_kernel.temporary_variables["b"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["c"].dtype == to_loopy_type( + np.float32) + assert prog.root_kernel.temporary_variables["d"].dtype == to_loopy_type( + np.complex128) def test_sized_and_complex_literals(ctx_factory): @@ -183,16 +186,12 @@ def test_simple_side_effect(ctx_factory): """ a[i] = a[i] + 1 """, - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - print(gen_knl) - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_owed_barriers(ctx_factory): @@ -203,17 +202,14 @@ def test_owed_barriers(ctx_factory): [ " z[i] = a[i]" ], - [lp.GlobalArg("a", np.float32, shape=(100,))] + [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) def test_wg_too_small(ctx_factory): @@ -225,17 +221,14 @@ def test_wg_too_small(ctx_factory): " z[i] = a[i] {id=copy}" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - import pytest - for gen_knl in kernel_gen: - with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, gen_knl).get_code() + print(knl) + with pytest.raises(RuntimeError): + print(lp.generate_code_v2(knl)) def test_multi_cse(ctx_factory): @@ -247,17 +240,14 @@ def test_multi_cse(ctx_factory): " z[i] = a[i] + a[i]**2" ], [lp.GlobalArg("a", np.float32, shape=(100,))], + target=lp.PyOpenCLTarget(ctx.devices[0]), local_sizes={0: 16}) knl = lp.split_iname(knl, "i", 16, inner_tag="l.0") knl = lp.add_prefetch(knl, "a", []) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - kernel_gen = lp.generate_loop_schedules(knl) - - for gen_knl in kernel_gen: - compiled = lp.CompiledKernel(ctx, gen_knl) - print(compiled.get_code()) + print(knl) + print(lp.generate_code_v2(knl)) # {{{ code generator fuzzing @@ -414,17 +404,16 @@ def test_ilp_write_race_detection_global(ctx_factory): lp.GlobalArg("a", np.float32), lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(j="ilp")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with lp.CacheMode(False): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: - 
list(lp.generate_loop_schedules(knl)) + lp.generate_code_v2(knl) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) @@ -438,13 +427,13 @@ def test_ilp_write_race_avoidance_local(ctx_factory): [ "<> a[i] = 5+i+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16, 17) + assert knl.root_kernel.temporary_variables["a"].shape == (16, 17) def test_ilp_write_race_avoidance_private(ctx_factory): @@ -455,13 +444,13 @@ def test_ilp_write_race_avoidance_private(ctx_factory): [ "<> a = 5+j", ], - []) + [], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.tag_inames(knl, dict(j="ilp")) - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - for k in lp.generate_loop_schedules(knl): - assert k.temporary_variables["a"].shape == (16,) + knl = lp.preprocess_kernel(knl) + assert knl.root_kernel.temporary_variables["a"].shape == (16,) # }}} @@ -482,11 +471,12 @@ def test_write_parameter(ctx_factory): lp.GlobalArg("b", dtype, shape=()), lp.ValueArg("n", np.int32, approximately=1000), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) import pytest with pytest.raises(RuntimeError): - lp.CompiledKernel(ctx, knl).get_code() + lp.generate_code_v2(knl).device_code() # {{{ arg guessing @@ -507,10 +497,11 @@ def test_arg_shape_guessing(ctx_factory): lp.GlobalArg("c", shape=lp.auto), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing(ctx_factory): @@ -523,10 +514,11 @@ def test_arg_guessing(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_arg_guessing_with_reduction(ctx_factory): @@ -541,16 +533,16 @@ def test_arg_guessing_with_reduction(ctx_factory): b[i, j] = i*j c[i+j, j] = b[j,i] """, - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_unknown_arg_shape(ctx_factory): ctx = ctx_factory() from loopy.target.pyopencl import PyOpenCLTarget - from loopy.compiled import CompiledKernel bsize = [256, 0] knl = lp.make_kernel( @@ -566,11 +558,11 @@ def test_unknown_arg_shape(ctx_factory): """, seq_dependencies=True, name="uniform_l", - target=PyOpenCLTarget(), + target=PyOpenCLTarget(ctx.devices[0]), assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset()) # noqa + print(lp.generate_code_v2(knl).device_code()) # }}} @@ -587,10 +579,11 @@ def test_nonlinear_index(ctx_factory): lp.GlobalArg("a", shape="n"), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_offsets_and_slicing(ctx_factory): @@ -607,9 +600,7 @@ def test_offsets_and_slicing(ctx_factory): assumptions="n>=1 and 
m>=1", default_offset=lp.auto) - knl = lp.tag_data_axes(knl, "a,b", "stride:auto,stride:1") - - cknl = lp.CompiledKernel(ctx, knl) + knl = lp.tag_array_axes(knl, "a,b", "stride:auto,stride:1") a_full = cl.clrandom.rand(queue, (n, n), np.float64) a_full_h = a_full.get() @@ -624,8 +615,10 @@ def test_offsets_and_slicing(ctx_factory): b_full_h[b_sub] = 2*a_full_h[a_sub] - print(cknl.get_highlighted_code({"a": a.dtype})) - cknl(queue, a=a, b=b) + knl = lp.add_dtypes(knl, {"a": a.dtype}) + + print(lp.generate_code_v2(knl)) + knl(queue, a=a, b=b) import numpy.linalg as la assert la.norm(b_full.get() - b_full_h) < 1e-13 @@ -642,18 +635,16 @@ def test_vector_ilp_with_prefetch(ctx_factory): # argument guessing. lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..." - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - cknl = lp.CompiledKernel(ctx, knl) - cknl.kernel_info() - import re - code = cknl.get_code() + code = lp.generate_code_v2(knl).device_code() assert len(list(re.finditer("barrier", code))) == 1 @@ -674,18 +665,18 @@ def test_c_instruction(ctx_factory): lp.TemporaryVariable("x", np.float32), "...", ], - assumptions="n>=1") + assumptions="n>=1", target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_domain_insn_iname_finding(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel([ + prog = lp.make_kernel([ "{[isrc_box]: 0<=isrc_box src_ibox = source_boxes[i] @@ -728,8 +720,8 @@ def test_inames_deps_from_write_subscript(ctx_factory): None, shape=None), "..."]) - print(knl) - assert "i" in knl.insn_inames("myred") + print(prog) + assert "i" in prog.root_kernel.insn_inames("myred") def test_modulo_indexing(ctx_factory): @@ -743,14 +735,12 @@ def test_modulo_indexing(ctx_factory): [ lp.GlobalArg("a", None, shape="n"), "..." - ] + ], target=lp.PyOpenCLTarget(ctx.devices[0]) ) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - a=np.float32, - ))) + knl = lp.add_dtypes(knl, {"a": np.float32}) + print(lp.generate_code_v2(knl).device_code()) @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) @@ -770,7 +760,7 @@ def test_vector_types(ctx_factory, vec_len): ref_knl = knl - knl = lp.tag_data_axes(knl, "out", "c,vec") + knl = lp.tag_array_axes(knl, "out", "c,vec") knl = lp.tag_inames(knl, dict(j="unr")) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") @@ -898,11 +888,7 @@ def test_multiple_writes_to_local_temporary(): temp[i, 1] = 15 """) knl = lp.tag_inames(knl, dict(i="l.0")) - - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) + print(lp.generate_code_v2(knl).device_code()) def test_make_copy_kernel(ctx_factory): @@ -980,9 +966,7 @@ def test_variable_size_temporary(): # Make sure that code generation succeeds even if # there are variable-length arrays. 
- knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - lp.generate_code(k) + lp.generate_code_v2(knl).device_code() def test_indexof(ctx_factory): @@ -1014,7 +998,7 @@ def test_indexof_vec(ctx_factory): ''' out[i,j,k] = indexof_vec(out[i,j,k])''') knl = lp.tag_inames(knl, {"i": "vec"}) - knl = lp.tag_data_axes(knl, "out", "vec,c,c") + knl = lp.tag_array_axes(knl, "out", "vec,c,c") knl = lp.set_options(knl, write_cl=True) (evt, (out,)) = knl(queue) @@ -1156,7 +1140,7 @@ def test_within_inames_and_reduction(): within_inames=frozenset(), within_inames_is_final=True) - k = lp.make_kernel("{[i,j] : 0<=i,j {[j]: 0 <= j < jmax}"], """ @@ -2440,10 +2413,11 @@ def test_barrier_insertion_near_bottom_of_loop(): end """, seq_dependencies=True) - knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.set_temporary_scope(knl, "a", "local") - knl = lp.set_temporary_scope(knl, "b", "local") - knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + prog = lp.tag_inames(prog, dict(i="l.0")) + prog = lp.set_temporary_scope(prog, "a", "local") + prog = lp.set_temporary_scope(prog, "b", "local") + prog = lp.preprocess_kernel(prog) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) print(knl) @@ -2453,7 +2427,7 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # make simple barrier'd kernel - knl = lp.make_kernel('{[i]: 0 <= i < 10}', + prog = lp.make_kernel('{[i]: 0 <= i < 10}', """ for i a[i] = i {id=a} @@ -2468,15 +2442,17 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # split into kernel w/ vesize larger than iname domain vecsize = 16 - knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0') + prog = lp.split_iname(prog, 'i', vecsize, inner_tag='l.0') from testlib import GridOverride # artifically expand via overridden_get_grid_sizes_for_insn_ids + knl = prog.root_kernel knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) + prog = prog.with_root_kernel(knl) # make sure we can generate the code - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) def test_multi_argument_reduction_type_inference(): @@ -2485,7 +2461,7 @@ def test_multi_argument_reduction_type_inference(): from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=ja = 0 <>b_s0 = 0 """) - vng = knl.get_var_name_generator() + vng = prog.root_kernel.get_var_name_generator() assert vng("a_s0") != "a_s0" assert vng("b") != "b" @@ -2647,7 +2625,7 @@ def test_fixed_parameters(ctx_factory): def test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") - assert knl.all_params() == set(["n"]) + assert knl.root_kernel.all_params() == set(["n"]) def test_execution_backend_can_cache_dtypes(ctx_factory): @@ -2666,7 +2644,7 @@ def test_execution_backend_can_cache_dtypes(ctx_factory): def test_wildcard_dep_matching(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a = 0 {id=insn1} @@ -2679,11 +2657,15 @@ def test_wildcard_dep_matching(): all_insns = set("insn%d" % i for i in range(1, 6)) - assert knl.id_to_insn["insn1"].depends_on == set() - assert knl.id_to_insn["insn2"].depends_on == all_insns - set(["insn2"]) - assert knl.id_to_insn["insn3"].depends_on == all_insns - set(["insn3"]) - assert knl.id_to_insn["insn4"].depends_on == set(["insn1", "insn2"]) - assert knl.id_to_insn["insn5"].depends_on == all_insns - set(["insn1", "insn5"]) + assert 
prog.root_kernel.id_to_insn["insn1"].depends_on == set() + assert (prog.root_kernel.id_to_insn["insn2"].depends_on == all_insns - + set(["insn2"])) + assert (prog.root_kernel.id_to_insn["insn3"].depends_on == all_insns - + set(["insn3"])) + assert (prog.root_kernel.id_to_insn["insn4"].depends_on == set(["insn1", + "insn2"])) + assert (prog.root_kernel.id_to_insn["insn5"].depends_on == all_insns - + set(["insn1", "insn5"])) def test_preamble_with_separate_temporaries(ctx_factory): @@ -2777,7 +2759,7 @@ def test_relaxed_stride_checks(ctx_factory): def test_add_prefetch_works_in_lhs_index(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " "start<=ntmp[i] = 10 {id=insn1} <>tmp2[i] = 10 {id=insn2} @@ -492,28 +494,34 @@ def test_add_nosync(): tmp5[i] = 1 {id=insn6,conflicts=g1} """) - orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local") # No dependency present - don't add nosync - knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2", + prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) - assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + assert frozenset() == ( + prog.root_kernel.id_to_insn["insn2"].no_sync_with) # Dependency present - knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") - assert frozenset() == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Bidirectional - knl = lp.add_nosync( - orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) - assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync( + orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == ( + prog.root_kernel.id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog.root_kernel.id_to_insn["insn4"].no_sync_with) # Groups - knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") - assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == ( + prog.root_kernel.id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): @@ -522,12 +530,14 @@ def test_uniquify_instruction_ids(): i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) - knl = lp.make_kernel("{[i]: i = 1}", []).copy(instructions=[i1, i2, i3, i4]) + prog = lp.make_kernel("{[i]: i = 1}", []) + new_root_kernel = prog.root_kernel.copy(instructions=[i1, i2, i3, i4]) + prog = prog.with_root_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids - knl = uniquify_instruction_ids(knl) + prog = uniquify_instruction_ids(prog) - insn_ids = set(insn.id for insn in knl.instructions) + insn_ids = set(insn.id for insn in prog.root_kernel.instructions) assert len(insn_ids) == 4 assert all(isinstance(id, str) for id in insn_ids) diff 
--git a/test/testlib.py b/test/testlib.py index ad290ee7c..eebc792d0 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -8,8 +9,9 @@ class GridOverride(object): self.clean = clean self.vecsize = vecsize - def __call__(self, insn_ids, ignore_auto=True): - gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto) + def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, + program_callables_info, ignore_auto) return gsize, (self.vecsize,) # }}} @@ -132,4 +134,48 @@ class SeparateTemporariesPreambleTestPreambleGenerator( # }}} + +# {{{ test_register_function_lookup + +class Log2Callable(lp.ScalarCallable): + + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + program_callables_info) + + dtype = arg_id_to_dtype[0].numpy_dtype + + if dtype.kind in ('u', 'i'): + # ints and unsigned casted to float32 + dtype = np.float32 + + from loopy.target.opencl import OpenCLTarget + name_in_target = "log2" + if not isinstance(kernel.target, OpenCLTarget): + # for CUDA, C Targets the name must be modified + if dtype == np.float32: + name_in_target = "log2f" + elif dtype == np.float128: + name_in_target = "log2l" + + from loopy.types import NumpyType + return ( + self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + program_callables_info) + + +def register_log2_lookup(target, identifier): + if identifier == 'log2': + return Log2Callable(name='log2') + return None + +# }}} + # vim: foldmethod=marker -- GitLab From ee6214767d96b9b4a7d240c5ed8affed2137ec6e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 16:38:50 +0530 Subject: [PATCH 338/916] adding untracked files. --- doc/ref_call.rst | 191 +++++++ loopy/kernel/function_interface.py | 867 +++++++++++++++++++++++++++++ loopy/program.py | 684 +++++++++++++++++++++++ loopy/transform/callable.py | 707 +++++++++++++++++++++++ test/test_callables.py | 414 ++++++++++++++ 5 files changed, 2863 insertions(+) create mode 100644 doc/ref_call.rst create mode 100644 loopy/kernel/function_interface.py create mode 100644 loopy/program.py create mode 100644 loopy/transform/callable.py create mode 100644 test/test_callables.py diff --git a/doc/ref_call.rst b/doc/ref_call.rst new file mode 100644 index 000000000..4ff1ef2fc --- /dev/null +++ b/doc/ref_call.rst @@ -0,0 +1,191 @@ +Calling Loopy Kernels and External Functions +============================================ + +Goals of a function interface +----------------------------- + +- Must be able to have complete information of the function just through the + epxression node. +- Must adhere to :mod:`loopy` semantics of immutability. +- Must have a class instance linked with the expression node which would record + the properties of the function. +- Must indicate in the expression if the function is known to the kernel. 
(This
+  is intended to be done by making the function expression node an instance of
+  ``ResolvedFunction`` as soon as the function definition is resolved by the
+  kernel)
+- Function overloading is not encouraged in :mod:`loopy` as it gives rise to
+  ambiguity while debugging with the help of the kernel intermediate
+  representation, and hence if the expression nodes point to different function
+  instances they must differ in their representation. For example: ``float
+  sin(float)`` and ``double sin(double)`` should diverge by having different
+  identifiers as soon as the data type of the argument is inferred.
+- Must have an interface to register external functions.
+
+
+Scoped Function and resolving
+-----------------------------
+
+``ResolvedFunctions`` are pymbolic nodes within expressions in a ``Loo.py``
+kernel, whose name has been resolved by the kernel. The process of matching a
+function identifier with the function definition is called "resolving".
+
+A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it
+is "resolved" by one of the ``function_scopers`` registered with the kernel,
+in which case an entry is made in :attr:`LoopKernel.scoped_functions`.
+The following kinds of functions can be resolved:
+
+- Functions already registered by the target. Some examples include
+  ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C-targets).
+- Functions that are defined in ``Loo.py`` and are realized as a
+  different set of instructions during code generation. Some examples
+  include ``make_tuple``, ``ArgExtOp``, ``index_of``, ...
+- Functions registered as ``CallableKernels`` using
+  ``lp.register_callable_kernel(...)``.
+- Functions that have been provided through
+  ``lp.register_function_scoper(...)``.
+- Functions that the user has made known through
+  ``lp.register_function_mangler``. This is planned to be deprecated,
+  as its functionality is superseded by
+  ``lp.register_function_scoper(...)``.
+
+Expressions after a function is scoped
+--------------------------------------
+
+Consider the following expression:
+
+::
+
+    sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i])
+
+During the kernel creation phase, the kernel knows that ``sin`` is a
+function known to the target and hence it should be scoped. As expected,
+after ``make_kernel`` has been called, the above expression is converted
+to:
+
+::
+
+    ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    callable_knl_func(c[i])*mangler_call(d[i])
+
+This also makes an entry in the kernel's ``scoped_functions``
+dictionary:
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None)}
+
+Note that at this step functions are scoped only by their names, without
+any information about the argument types of the function.
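+
+As an illustrative sketch (the loop domain and argument names here are
+made up for the example), such an expression could arise from a kernel
+created as:
+
+::
+
+    knl = lp.make_kernel(
+        "{[i]: 0<=i<n}",
+        "out[i] = sin(a[i]) + unknown_func(b[i])"
+        " + callable_knl_func(c[i])*mangler_call(d[i])")
+
+at which point only ``sin`` is resolved, since it is the only identifier
+among these that the target already knows about.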
+
+Once the user calls the transformation
+``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``,
+the expression gets converted to:
+
+::
+
+    ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    ResolvedFunction('callable_knl_func')(c[i])*mangler_call(d[i])
+
+This also makes an entry in the ``scoped_functions`` dictionary:
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)}
+
+Now, if the user calls
+``register_function_mangler(knl, 'mangler_call')``, one might expect
+that the mangler call function should get scoped, but that does **not**
+happen, because the "old" ``function_manglers`` return a match only if
+all the parameters of the function match, viz. name, argument arity and
+argument types. Hence, the ``scoped_functions`` dictionary remains
+unchanged.
+
+``ResolvedFunctions`` and specializations
+------------------------------------------
+
+Consider the same ``ResolvedFunction('sin')`` as above. Although this
+function has been scoped, it does not yet know its types, i.e. it does
+not yet know whether, for a ``C-Target``, it should emit ``sin``,
+``sinf`` or ``sinl``. Hence, right now the function can be regarded as
+"type-generic": further down the pipeline it may take on any one of the
+above definitions. The functions go through a "specialization" process
+at various points in the pipeline, where the attributes of the
+callables are resolved.
+
+- During type inference, the functions go through type specialization,
+  wherein the ``arg_id_to_dtype`` of the functions is realized.
+- During descriptor inference, the functions go through a description
+  specialization where the ``arg_id_to_descr`` is populated. The
+  ``arg_id_to_descr`` contains important information regarding shape,
+  strides and scope of the arguments, which forms an important part of
+  ``CallableKernel``, as this information is needed to generate the
+  function signature and to adapt the data access pattern of the
+  variables in the callee kernel.
+- Whenever a ``ResolvedFunction`` goes through a specialization, this is
+  indicated by changing the name in the ``pymbolic`` node.
+
+If, during type inference, it is inferred that the type of ``a[i]`` is
+``np.float32``, the new ``pymbolic`` node would be:
+
+::
+
+    ResolvedFunction('sin_0')(a[i]) + ...
+
+This name change indicates that the node now points to a different
+``ScalarCallable`` in the dictionary, and hence a new entry is added to
+the ``scoped_functions`` dictionary:
+
+::
+
+    {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None),
+    'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0:np.float32,
+    -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')}
+
+Description Inference
+---------------------
+
+Although this step has no significance for a ``ScalarCallable``, it
+forms a very important part of ``CallableKernel``: it is here that the
+``dim_tags``, ``shape`` and ``address_space`` of the arguments of the
+callee kernel are altered.
+
+- The ``dim_tags`` attribute helps to ensure that the memory layout
+  between the caller and the callee kernel is coherent.
+- The ``address_space`` attribute ensures that, while writing the device + code we emit the appropriate scope qualifiers for the function + declaration arguments. +- The ``shape`` attribute helps in: + + - Storage allocation. + - Memory layout. + - Out of bounds accesses to be caught in ``Loo.py``. + +Hence, in the ``Loo.py`` pipeline, one might expect the following +developments of the ``sin`` pymbolic call expression node. + +:: + + sin -> (Kernel creation) -> ResolvedFunction(Variable('sin')) -> + (Type Inference) -> ResolvedFunction(Variable('sin_0')) -> + (Descriptor Inference) -> ResolvedFunction(Variable('sin_1')) + +Changes on the target side to accommodate the new function interface +-------------------------------------------------------------------- + +The earlier "function\_mangler" as a member method of the class +``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The +function scopers would return a list of functions with the signature +``(target, identifier)->lp.InKernelCallable``. + +An example: Calling BLAS +------------------------ + +.. literalinclude:: ../examples/python/external-call.py + diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 000000000..2ea260656 --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,867 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Andreas Klöckner, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import re +import six + +from six.moves import zip + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from loopy.symbolic import parse_tagged_name + +from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext, + RuleAwareIdentityMapper, SubstitutionRuleExpander) + +from loopy.kernel import LoopKernel + + +# {{{ argument descriptors + +class ValueArgDescriptor(ImmutableRecord): + hash_fields = () + + update_persistent_hash = LoopKernel.update_persistent_hash + + +class ArrayArgDescriptor(ImmutableRecord): + """ + Records information about an array argument to an in-kernel callable, to be + passed to and returned from + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used + for matching shape and scope of caller and callee kernels. + + ..attribute:: shape + + Shape of the array. + + .. attribute:: address_space + + An attribute of :class:`loopy.kernel.data.AddressSpace`. + + .. 
attribute:: dim_tags + + A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` + """ + + fields = set(['shape', 'address_space', 'dim_tags']) + + def __init__(self, shape, address_space, dim_tags): + + # {{{ sanity checks + + from loopy.kernel.array import FixedStrideArrayDimTag + + assert isinstance(shape, tuple) + assert isinstance(dim_tags, tuple) + + # FIXME at least vector dim tags should be supported + assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in + dim_tags) + + # }}} + + super(ArrayArgDescriptor, self).__init__( + shape=shape, + address_space=address_space, + dim_tags=dim_tags) + + hash_fields = ( + "shape", + "address_space", + "dim_tags") + + update_persistent_hash = LoopKernel.update_persistent_hash + +# }}} + + +# {{{ helper function for in-kernel callables + +def get_kw_pos_association(kernel): + """ + Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in + *kernel*. + """ + from loopy.kernel.tools import infer_arg_is_output_only + kernel = infer_arg_is_output_only(kernel) + kw_to_pos = {} + pos_to_kw = {} + + read_count = 0 + write_count = -1 + + for arg in kernel.args: + if not arg.is_output_only: + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + else: + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 + + return kw_to_pos, pos_to_kw + + +class GridOverrideForCalleeKernel(ImmutableRecord): + """ + Helper class to set the + :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the + callee kernels. Refer + :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, + :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. + + .. attribute:: local_size + + The local work group size that has to be set in the callee kernel. + + .. attribute:: global_size + + The global work group size that to be set in the callee kernel. + + .. note:: + + This class acts as a pseduo-callable and its significance lies in + solving picklability issues. + """ + fields = set(["local_size", "global_size"]) + + def __init__(self, local_size, global_size): + self.local_size = local_size + self.global_size = global_size + + def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + return self.local_size, self.global_size + +# }}} + + +# {{{ template class + +class InKernelCallable(ImmutableRecord): + """ + An abstract interface to define a callable encountered in a kernel. + + .. attribute:: name + + The name of the callable which can be encountered within a kernel. + + .. attribute:: arg_id_to_dtype + + A mapping which indicates the arguments types and result types it would + be handling. This would be set once the callable is type specialized. + + .. attribute:: arg_id_to_descr + + A mapping which gives indicates the argument shape and ``dim_tags`` it + would be responsible for generating code. These parameters would be set, + once it is shape and stride(``dim_tags``) specialized. + + .. note:: + + Negative "id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. + + .. automethod:: __init__ + .. automethod:: with_types + .. automethod:: with_descrs + .. automethod:: with_target + .. automethod:: with_hw_axes_sizes + .. automethod:: generate_preambles + .. automethod:: emit_call + .. automethod:: emit_call_insn + .. 
automethod:: is_ready_for_codegen + """ + + fields = set(["arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): + + super(InKernelCallable, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr) + + update_persistent_hash = LoopKernel.update_persistent_hash + + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + """ + :arg arg_id_to_type: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.types.LoopyType` instances. + Unspecified/unknown types are not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types, + and *arg_id_to_type* is a mapping of the same form as the + argument above, however it may have more information present. + Any argument information exists both by its positional and + its keyword identifier. + """ + # FIXME: In all these with_** functions add that also passes a + # program_callables_info + + raise NotImplementedError() + + def with_descrs(self, arg_id_to_descr, program_callables_info): + """ + :arg arg_id_to_descr: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.ArrayArgDescriptor` instances. + Unspecified/unknown types are not represented in *arg_id_to_descr*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. + + :returns: a copy of *self* which is a new instance of + :class:`InKernelCallable` specialized for the given types, and + *arg_id_to_descr* is a mapping of the same form as the argument above, + however it may have more information present. Any argument information + exists both by its positional and its keyword identifier. + """ + + raise NotImplementedError() + + def with_target(self, target): + """ + Returns a copy of *self* with all the ``dtypes`` in + ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. Refer + :meth:`loopy.types.LoopyType.with_target`. + + :arg target: An instance of :class:`loopy.target.TargetBase`. + """ + + if target is None: + raise LoopyError("target cannot be None for with_target") + + def with_target_if_not_None(dtype): + """ + Returns a copy of :arg:`dtype` associated with the target. If + ``dtype`` is *None* returns *None*. + """ + if dtype: + return dtype.with_target(target) + else: + return None + + new_arg_id_to_dtype = None + if self.arg_id_to_dtype is not None: + new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, + dtype in self.arg_id_to_dtype.items()) + + return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) + + def with_hw_axes_sizes(self, local_size, global_size): + """ + Returns a copy of *self* with modifications to comply with the grid + sizes ``(local_size, global_size)`` of the kernel in which it is + supposed to be called. + + :arg local_size: An instance of :class:`islpy.PwAff`. + :arg global_size: An instance of :class:`islpy.PwAff`. 
+ """ + raise NotImplementedError() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def generate_preambles(self, target): + """ Yields the target specific preamble. + """ + raise NotImplementedError() + + def emit_call(self, expression_to_code_mapper, expression, target): + + raise NotImplementedError() + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a tuple of ``(call, assignee_is_returned)`` which is the target + facing function call that would be seen in the generated code. ``call`` + is an instance of ``pymbolic.primitives.Call`` ``assignee_is_returned`` + is an instance of :class:`bool` to indicate if the assignee is returned + by value of C-type targets. + + *Example:* If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is + interpreted in the target as ``a = f(c, d, &b)``. If + ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted + in the target as the statement ``f(c, d, &a, &b)``. + """ + + raise NotImplementedError() + + def __hash__(self): + + return hash(tuple(self.fields)) + +# }}} + + +# {{{ scalar callable + +class ScalarCallable(InKernelCallable): + """ + An abstranct interface the to a scalar callable encountered in a kernel. + + .. note:: + + The :meth:`ScalarCallable.with_types` is intended to assist with type + specialization of the funciton and is expected to be supplemented in the + derived subclasses. + """ + + fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") + hash_fields = fields + + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super(ScalarCallable, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.name = name + self.name_in_target = name_in_target + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr, + self.name_in_target) + + def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + raise LoopyError("No type inference information present for " + "the function %s." 
% (self.name)) + + def with_descrs(self, arg_id_to_descr, program_callables_info): + + arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + program_callables_info) + + def with_hw_axes_sizes(self, global_size, local_size): + return self.copy() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + # {{{ code generation + + def emit_call(self, expression_to_code_mapper, expression, target): + + assert self.is_ready_for_codegen() + + # must have single assignee + assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 + arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in + range(len(self.arg_id_to_dtype)-1)) + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + expression.parameters) + + from loopy.expression import dtype_to_type_context + # processing the parameters with the required dtypes + processed_parameters = tuple( + expression_to_code_mapper.rec(par, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expression.parameters, par_dtypes, arg_dtypes)) + + from pymbolic import var + return var(self.name_in_target)(*processed_parameters) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + Returns a pymbolic call for C-based targets, when the instructions + involve multiple return values along with the required type casting. + The first assignee is returned, but the rest of them are appended to + the parameters and passed by reference. + + *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` + + :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`. + :arg target: An instance of :class:`loopy.target.TargetBase`. + :arg expression_to_code_mapper: An instance of :class:`IdentityMapper` + responsible for code mapping from :mod:`loopy` syntax to the + **target syntax**. + """ + + # Currently this is formulated such that the first argument is returned + # and rest all are passed by reference as arguments to the function. + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + assignees = insn.assignees[1:] + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + parameters) + arg_dtypes = tuple(self.arg_id_to_dtype[i] for i, _ in + enumerate(parameters)) + + assignee_dtypes = tuple(self.arg_id_to_dtype[-i-2] for i, _ in + enumerate(assignees)) + + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic import var + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr + for par, par_dtype, tgt_dtype in zip( + parameters, par_dtypes, arg_dtypes)] + + for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)): + if tgt_dtype != expression_to_code_mapper.infer_type(a): + raise LoopyError("Type Mismatch in function %s. Expected: %s" + "Got: %s" % (self.name, tgt_dtype, + expression_to_code_mapper.infer_type(a))) + c_parameters.append( + var("&")( + expression_to_code_mapper(a, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr)) + + # assignee is returned whenever the size of assignees is non zero. 
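+        # (the assignees beyond the first were appended to c_parameters in
+        # the loop above, wrapped in "&", so the callee writes to them by
+        # reference, as in the ``c, d = f(a, b)`` example in the docstring)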
+ assignee_is_returned = len(assignees) > 0 + + return var(self.name_in_target)(*c_parameters), assignee_is_returned + + def generate_preambles(self, target): + return + yield + + # }}} + +# }}} + + +# {{{ callable kernel + +class CallableKernel(InKernelCallable): + """ + Records informations about a callee kernel. Also provides interface through + member methods to make the callee kernel compatible to be called from a + caller kernel. The :meth:`loopy.register_callable_kernel` should be called + in order to initiate association between a function in caller kernel and + the callee kernel. + + :meth:`CallableKernel.with_types` should be called in order to match + the ``dtypes`` of the arguments that are shared between the caller and the + callee kernel. + + :meth:`CallableKernel.with_descrs` should be called in order to match + :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape`, + :attr:`ArrayArgDescriptor.address_space`` of the arguments shared between the + caller and the callee kernel. + + :meth:`CallableKernel.with_hw_axes` should be called to set the grid + sizes for the :attr:`subkernel` of the callable. + """ + + fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr"]) + init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") + hash_fields = fields + + def __init__(self, subkernel, arg_id_to_dtype=None, + arg_id_to_descr=None): + assert isinstance(subkernel, LoopKernel) + + super(CallableKernel, self).__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.subkernel = subkernel.copy( + args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) + if arg.dtype is not None else arg for arg in subkernel.args]) + + def __getinitargs__(self): + return (self.subkernel, self.arg_id_to_dtype, + self.arg_id_to_descr) + + @property + def name(self): + return self.subkernel.name + + def with_types(self, arg_id_to_dtype, caller_kernel, + program_callables_info): + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + new_args = [] + for arg in self.subkernel.args: + kw = arg.name + if kw in arg_id_to_dtype: + # id exists as kw + new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) + elif kw_to_pos[kw] in arg_id_to_dtype: + # id exists as positional argument + new_args.append(arg.copy( + dtype=arg_id_to_dtype[kw_to_pos[kw]])) + else: + new_args.append(arg) + + from loopy.type_inference import ( + infer_unknown_types_for_a_single_kernel) + pre_specialized_subkernel = self.subkernel.copy( + args=new_args) + + # infer the types of the written variables based on the knowledge + # of the types of the arguments supplied + specialized_kernel, program_callables_info = ( + infer_unknown_types_for_a_single_kernel( + pre_specialized_subkernel, + program_callables_info, + expect_completion=True)) + + new_arg_id_to_dtype = {} + for arg in specialized_kernel.args: + # associate the updated_arg_id_to_dtype with keyword as well as + # positional id. 
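+            # (callers may then look up a dtype either by the argument's
+            # keyword name or by its positional index obtained from
+            # get_kw_pos_association above)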
+ new_arg_id_to_dtype[arg.name] = arg.dtype + new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype + + # Return the kernel call with specialized subkernel and the corresponding + # new arg_id_to_dtype + return self.copy(subkernel=specialized_kernel, + arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info + + def with_descrs(self, arg_id_to_descr, program_callables_info): + + # tune the subkernel so that we have the matching shapes and + # dim_tags + + new_args = self.subkernel.args[:] + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + for arg_id, descr in arg_id_to_descr.items(): + if isinstance(arg_id, int): + arg_id = pos_to_kw[arg_id] + assert isinstance(arg_id, str) + + if isinstance(descr, ArrayArgDescriptor): + new_arg = self.subkernel.arg_dict[arg_id].copy( + shape=descr.shape, + dim_tags=descr.dim_tags, + address_space=descr.address_space) + # replacing the new arg with the arg of the same name + new_args = [new_arg if arg.name == arg_id else arg for arg in + new_args] + elif isinstance(descr, ValueArgDescriptor): + pass + else: + raise LoopyError("Descriptor must be either an instance of " + "ArrayArgDescriptor or ValueArgDescriptor -- got %s." % + type(descr)) + descriptor_specialized_knl = self.subkernel.copy(args=new_args) + from loopy.preprocess import traverse_to_infer_arg_descr + descriptor_specialized_knl, program_callables_info = ( + traverse_to_infer_arg_descr(descriptor_specialized_knl, + program_callables_info)) + + return ( + self.copy( + subkernel=descriptor_specialized_knl, + arg_id_to_descr=arg_id_to_descr), + program_callables_info) + + def with_packing_for_args(self): + from loopy.kernel.data import AddressSpace + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + arg_id_to_descr = {} + + for pos, kw in pos_to_kw.items(): + arg = self.subkernel.arg_dict[kw] + arg_id_to_descr[pos] = ArrayArgDescriptor( + shape=arg.shape, + dim_tags=arg.dim_tags, + address_space=AddressSpace.GLOBAL) + + return self.copy(subkernel=self.subkernel, + arg_id_to_descr=arg_id_to_descr) + + def with_hw_axes_sizes(self, gsize, lsize): + return self.copy( + subkernel=self.subkernel.copy( + overridden_get_grid_sizes_for_insn_ids=( + GridOverrideForCalleeKernel(lsize, gsize)))) + + def is_ready_for_codegen(self): + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def generate_preambles(self, target): + """ Yields the *target* specific preambles. + """ + # FIXME Check that this is correct. 
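+        # A bare "return" before "yield" makes this an empty generator: the
+        # method keeps generator semantics while yielding no preambles.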
+ + return + yield + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + + assert self.is_ready_for_codegen() + + from loopy.kernel.instruction import CallInstruction + from pymbolic.primitives import CallWithKwargs + + assert isinstance(insn, CallInstruction) + + parameters = insn.expression.parameters + kw_parameters = {} + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + + assignees = insn.assignees + + parameters = list(parameters) + par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + for i in range(len(parameters), len(parameters)+len(kw_parameters)): + parameters.append(kw_parameters[pos_to_kw[i]]) + par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) + + # insert the assigness at the required positions + assignee_write_count = -1 + for i, arg in enumerate(self.subkernel.args): + if arg.is_output_only: + assignee = assignees[-assignee_write_count-1] + parameters.insert(i, assignee) + par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) + assignee_write_count -= 1 + + # no type casting in array calls + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from loopy.symbolic import SubArrayRef + from pymbolic import var + + c_parameters = [ + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr if isinstance(par, SubArrayRef) else + expression_to_code_mapper(par, PREC_NONE, + dtype_to_type_context(target, par_dtype), + par_dtype).expr + for par, par_dtype in zip( + parameters, par_dtypes)] + + return var(self.subkernel.name)(*c_parameters), False + +# }}} + + +# {{{ mangler callable + +class ManglerCallable(ScalarCallable): + """ + A callable whose characateristic is defined by a function mangler. + + .. attribute:: function_mangler + + A function of signature ``(kernel, name , arg_dtypes)`` and returns an + instance of ``loopy.CallMangleInfo``. + """ + fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", + "arg_id_to_descr", "name_in_target") + hash_fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"]) + + def __init__(self, name, function_mangler, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + self.function_mangler = function_mangler + + super(ManglerCallable, self).__init__( + name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) + + def __getinitargs__(self): + return (self.name, self.function_mangler, self.arg_id_to_dtype, + self.arg_id_to_descr, self.name_in_target) + + def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + if self.arg_id_to_dtype is not None: + # specializing an already specialized function. + for arg_id, dtype in arg_id_to_dtype.items(): + # only checking for the ones which have been provided + # if does not match, returns an error. 
+ if self.arg_id_to_dtype[arg_id] != arg_id_to_dtype[arg_id]: + raise LoopyError("Overwriting a specialized" + " function is illegal--maybe start with new instance of" + " ManglerCallable?") + + sorted_keys = sorted(arg_id_to_dtype.keys()) + arg_dtypes = tuple(arg_id_to_dtype[key] for key in sorted_keys if + key >= 0) + + mangle_result = self.function_mangler(kernel, self.name, + arg_dtypes) + if mangle_result: + new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) + new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in + enumerate(mangle_result.result_dtypes))) + return ( + self.copy(name_in_target=mangle_result.target_name, + arg_id_to_dtype=new_arg_id_to_dtype), + program_callables_info) + else: + # The function mangler does not agree with the arg id to dtypes + # provided. Indicating that is illegal. + raise LoopyError("Function %s not coherent with the provided types." % ( + self.name, kernel.target)) + + def mangle_result(self, kernel): + """ + Returns an instance of :class:`loopy.kernel.data.CallMangleInfo` for + the given pair :attr:`function_mangler` and :attr:`arg_id_to_dtype`. + """ + sorted_keys = sorted(self.arg_id_to_dtype.keys()) + arg_dtypes = tuple(self.arg_id_to_dtype[key] for key in sorted_keys if + key >= 0) + + return self.function_mangler(kernel, self.name, arg_dtypes) + +# }}} + + +# {{{ new pymbolic calls to scoped functions + +def next_indexed_variable(function): + """ + Returns an instance of :class:`str` with the next indexed-name in the + sequence for the name of *function*. + + *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. + + :arg function: Either an instance of :class:`pymbolic.primitives.Variable` + or :class:`loopy.reduction.ArgExtOp` or + :class:`loopy.reduction.SegmentedOp`. + """ + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(function, (ArgExtOp, SegmentedOp)): + return function.copy() + func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") + + match = func_name.match(function.name) + + if match is None: + if function.name[-1] == '_': + return "{old_name}0".format(old_name=function.name) + else: + return "{old_name}_0".format(old_name=function.name) + + return "{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1) + + +class FunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``calls_to_new_functions`` + """ + + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super(FunctionNameChanger, self).__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.calls_to_new_names: + # FIXME: this is horribly wrong logic. 
+ # investigate how to make edits to a substitution rule + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expanded_expr.parameters)) + else: + return super(FunctionNameChanger, self).map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return super(FunctionNameChanger, self).map_call_with_kwargs( + expr, expn_state) + + +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + name_changer = FunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py new file mode 100644 index 000000000..096bd1eca --- /dev/null +++ b/loopy/program.py @@ -0,0 +1,684 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six +import re + +from pytools import ImmutableRecord, memoize_method +from pymbolic.primitives import Variable +from functools import wraps + +from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.diagnostic import LoopyError + +from loopy.kernel import LoopKernel + + +class ResolvedFunctionMarker(RuleAwareIdentityMapper): + """ + Mapper to convert the ``function`` attribute of a + :class:`pymbolic.primitives.Call` known in the kernel as instances of + :class:`loopy.symbolic.ResolvedFunction`. A function is known in the + *kernel*, :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` + returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable`. 
+ + **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + + log(z)``, then the mapper would return ``ResolvedFunction('sin')(x) + + unknown_function(y) + ResolvedFunction('log')(z)``. + + :arg rule_mapping_context: An instance of + :class:`loopy.symbolic.RuleMappingContext`. + :arg function_ids: A container with instances of :class:`str` indicating + the function identifiers to look for while scoping functions. + """ + def __init__(self, rule_mapping_context, kernel, program_callables_info, + function_id_to_in_knl_callable_mappers): + super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) + self.kernel = kernel + self.program_callables_info = program_callables_info + # FIXME: function_resolvesrs looks like a very bad name change it + self.function_id_to_in_knl_callable_mappers = ( + function_id_to_in_knl_callable_mappers) + + def find_in_knl_callable_from_identifier(self, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if the + :arg:`identifier` is known to any kernel function scoper, otherwise returns + *None*. + """ + # FIXME change docs + for func_id_to_in_knl_callable_mapper in ( + self.function_id_to_in_knl_callable_mappers): + # fixme: do we really need to given target for the function + in_knl_callable = func_id_to_in_knl_callable_mapper( + self.kernel.target, identifier) + if in_knl_callable is not None: + return in_knl_callable + + return None + + def map_call(self, expr, expn_state): + from pymbolic.primitives import Call, CallWithKwargs + from loopy.symbolic import parse_tagged_name + + name, tag = parse_tagged_name(expr.function) + if name not in self.rule_mapping_context.old_subst_rules: + new_call_with_kwargs = self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={}), expn_state) + return Call(new_call_with_kwargs.function, + new_call_with_kwargs.parameters) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if not isinstance(expr.function, ResolvedFunction): + + # search the kernel for the function. 
+ in_knl_callable = self.find_in_knl_callable_from_identifier( + expr.function.name) + + if in_knl_callable: + # associate the newly created ResolvedFunction with the + # resolved in-kernel callable + + self.program_callables_info, new_func_id = ( + self.program_callables_info.with_callable(expr.function, + in_knl_callable, True)) + return type(expr)( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + + # this is an unknown function as of yet, do not modify it + return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, + expn_state) + + def map_reduction(self, expr, expn_state): + for func_id in ( + expr.operation.get_scalar_callables()): + in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) + assert in_knl_callable is not None + self.program_callables_info, _ = ( + self.program_callables_info.with_callable(func_id, + in_knl_callable, True)) + return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) + + +def initialize_program_callables_info_from_kernel( + kernel, func_id_to_kernel_callable_mappers): + program_callables_info = ProgramCallablesInfo({}) + program_callables_info = program_callables_info.with_edit_callables_mode() + + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + resolved_function_marker = ResolvedFunctionMarker( + rule_mapping_context, kernel, program_callables_info, + func_id_to_kernel_callable_mappers) + + # scoping fucntions and collecting the scoped functions + kernel_with_functions_resolved = rule_mapping_context.finish_kernel( + resolved_function_marker.map_kernel(kernel)) + program_callables_info = resolved_function_marker.program_callables_info + + callable_kernel = CallableKernel(kernel_with_functions_resolved) + program_callables_info, _ = program_callables_info.with_callable( + Variable(kernel.name), callable_kernel, True) + program_callables_info = ( + program_callables_info.with_exit_edit_callables_mode()) + + return program_callables_info + + +# {{{ program definition + +class Program(ImmutableRecord): + def __init__(self, + name, + program_callables_info, + target, + func_id_to_in_knl_callable_mappers): + assert isinstance(program_callables_info, ProgramCallablesInfo) + + # FIXME: check if all sanity checks have been covered? + # FIXME: The comments over here may need some attention. 
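+        # the root kernel's own name must be present among the resolved
+        # callables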
+ assert name in program_callables_info + + super(Program, self).__init__( + name=name, + program_callables_info=program_callables_info, + target=target, + func_id_to_in_knl_callable_mappers=( + func_id_to_in_knl_callable_mappers)) + + self._program_executor_cache = {} + + hash_fields = ( + "name", + "program_callables_info", + "target",) + + update_persistent_hash = LoopKernel.update_persistent_hash + + def copy(self, **kwargs): + if 'target' in kwargs: + target = kwargs['target'] + new_self = super(Program, self).copy(**kwargs) + new_resolved_functions = {} + for func_id, in_knl_callable in ( + new_self.program_callables_info.items()): + if isinstance(in_knl_callable, CallableKernel): + subkernel = in_knl_callable.subkernel + new_resolved_functions[func_id] = in_knl_callable.copy( + subkernel=subkernel.copy(target=target)) + else: + new_resolved_functions[func_id] = in_knl_callable + + program_callables_info = new_self.program_callables_info.copy( + resolved_functions=new_resolved_functions) + + return super(Program, new_self).copy( + program_callables_info=program_callables_info) + else: + return super(Program, self).copy(**kwargs) + + def get_grid_size_upper_bounds(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. + """ + return self.root_kernel.get_grid_size_upper_bounds( + self.program_callables_info, + ignore_auto=ignore_auto) + + def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of *all* instructions in the kernel. + + *global_size* and *local_size* are :mod:`pymbolic` expressions + """ + return self.root_kernel.get_grid_size_upper_bounds_as_exprs( + self.program_callables_info, + ignore_auto=ignore_auto) + + # {{{ implementation arguments + + @property + @memoize_method + def impl_arg_to_arg(self): + from loopy.kernel.array import ArrayBase + + result = {} + + for arg in self.args: + if not isinstance(arg, ArrayBase): + result[arg.name] = arg + continue + + if arg.shape is None or arg.dim_tags is None: + result[arg.name] = arg + continue + + subscripts_and_names = arg.subscripts_and_names() + if subscripts_and_names is None: + result[arg.name] = arg + continue + + for index, sub_arg_name in subscripts_and_names: + result[sub_arg_name] = arg + + return result + + # }}} + + @property + def root_kernel(self): + return self.program_callables_info[self.name].subkernel + + @property + def arg_dict(self): + return self.root_kernel.arg_dict + + def with_root_kernel(self, root_kernel): + new_in_knl_callable = self.program_callables_info[ + self.name].copy(subkernel=root_kernel) + new_resolved_functions = ( + self.program_callables_info.resolved_functions.copy()) + new_resolved_functions[self.name] = new_in_knl_callable + + return self.copy( + program_callables_info=self.program_callables_info.copy( + resolved_functions=new_resolved_functions)) + + @property + def args(self): + return self.root_kernel.args[:] + + def __call__(self, *args, **kwargs): + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) + try: + pex = self._program_executor_cache[key] + except KeyError: + pex = self.target.get_kernel_executor(self, *args, **kwargs) + self._program_executor_cache[key] = pex + + return pex(*args, **kwargs) + + def __str__(self): + # FIXME: make this better + 
print(self.program_callables_info.num_times_callables_called) + return ( + (self.program_callables_info[ + self.name].subkernel).__str__() + + '\nResolved Functions: ' + + (self.program_callables_info.resolved_functions.keys()).__str__() + + '\n' + 75*'-' + '\n') + +# }}} + + +def next_indexed_function_identifier(function): + """ + Returns an instance of :class:`str` with the next indexed-name in the + sequence for the name of *function*. + + *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. + + :arg function: Either an instance of :class:`pymbolic.primitives.Variable` + or :class:`loopy.reduction.ArgExtOp` or + :class:`loopy.reduction.SegmentedOp`. + """ + from loopy.library.reduction import ArgExtOp, SegmentedOp + if isinstance(function, (ArgExtOp, SegmentedOp)): + return function.copy() + elif isinstance(function, str): + function = Variable(function) + + assert isinstance(function, Variable) + func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") + + match = func_name.match(function.name) + + if match is None: + if function.name[-1] == '_': + return "{old_name}0".format(old_name=function.name) + else: + return "{old_name}_0".format(old_name=function.name) + + return "{alpha}_{num}".format(alpha=match.group('alpha'), + num=int(match.group('num'))+1) + + +class ResolvedFunctionRenamer(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, renaming_dict): + super(ResolvedFunctionRenamer, self).__init__( + rule_mapping_context) + self.renaming_dict = renaming_dict + + def map_resolved_function(self, expr, expn_state): + if expr.name in self.renaming_dict: + return ResolvedFunction(self.renaming_dict[expr.name]) + else: + return super(ResolvedFunctionRenamer, self).map_resolved_function( + expr, expn_state) + + +def rename_resolved_functions_in_a_single_kernel(kernel, + renaming_dict): + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + resolved_function_renamer = ResolvedFunctionRenamer(rule_mapping_context, + renaming_dict) + return ( + rule_mapping_context.finish_kernel( + resolved_function_renamer.map_kernel(kernel))) + + +# {{{ program callables info + +class ProgramCallablesInfo(ImmutableRecord): + # FIXME: dont evalutate num_times_called, rahter compute it from the + # resolved_functions + # FIXME: make the edit callables thing a ContextManager. 
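+    # Typical usage, as in initialize_program_callables_info_from_kernel
+    # above: first with_edit_callables_mode(), then one or more
+    # with_callable(...) calls, then with_exit_edit_callables_mode().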
+ def __init__(self, resolved_functions, num_times_callables_called=None, + history=None, is_being_edited=False, + num_times_hit_during_editing={}, + renames_needed_after_editing={}): + + if num_times_callables_called is None: + num_times_callables_called = dict((func_id, 1) for func_id in + resolved_functions) + if history is None: + history = dict((func_id, set([func_id])) for func_id in + resolved_functions) + + super(ProgramCallablesInfo, self).__init__( + resolved_functions=resolved_functions, + num_times_callables_called=num_times_callables_called, + history=history, + is_being_edited=is_being_edited, + num_times_hit_during_editing=num_times_hit_during_editing, + renames_needed_after_editing=renames_needed_after_editing) + + hash_fields = ( + "resolved_functions", + "num_times_callables_called", + "is_being_edited", + "num_times_hit_during_editing", + "renames_needed_after_editing", + "history") + + update_persistent_hash = LoopKernel.update_persistent_hash + + def with_edit_callables_mode(self): + return self.copy(is_being_edited=True, + num_times_hit_during_editing=dict((func_id, 0) for func_id in + self.resolved_functions)) + + def with_callable(self, function, in_kernel_callable, + resolved_for_the_first_time=False): + """ + :arg function: An instance of :class:`pymbolic.primitives.Variable` or + :class:`loopy.library.reduction.ReductionOpFunction`. + + :arg in_kernel_callables: An instance of + :class:`loopy.InKernelCallable`. + + .. note:: + + Assumes that each callable is touched atmost once, the internal + working of this function fails if that is violated. + """ + # FIXME: add a note about using enter and exit. ~KK + # FIXME: think about a better idea of "with_added_callable" this would + # be more convenient for developer-faced usage. 
~KK + + if not self.is_being_edited: + if function.name in self.resolved_functions and ( + self.resolved_functions[function.name] == in_kernel_callable): + return self, function + else: + print('Old: ', self.resolved_functions[function.name]) + print('New: ', in_kernel_callable) + raise LoopyError("Use 'enter_edit_callables_mode' first.") + + from loopy.library.reduction import ArgExtOp, SegmentedOp + + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ArgExtOp, SegmentedOp)) + + # }}} + + renames_needed_after_editing = self.renames_needed_after_editing.copy() + num_times_hit_during_editing = self.num_times_hit_during_editing.copy() + num_times_callables_called = self.num_times_callables_called.copy() + history = self.history.copy() + + if not resolved_for_the_first_time: + if isinstance(function, (ArgExtOp, SegmentedOp)): + num_times_hit_during_editing[function] += 1 + else: + num_times_hit_during_editing[function.name] += 1 + + if isinstance(function, (ArgExtOp, SegmentedOp)): + unique_function_identifier = function.copy() + if not resolved_for_the_first_time: + num_times_callables_called[function] -= 1 + + num_times_callables_called[unique_function_identifier] = 1 + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + resolved_functions=updated_resolved_functions, + num_times_callables_called=num_times_callables_called, + num_times_hit_during_editing=( + num_times_hit_during_editing), + renames_needed_after_editing=( + renames_needed_after_editing)), + unique_function_identifier) + + if in_kernel_callable in self.resolved_functions.values(): + # the callable already exists, implies return the function + # identifier corresposing to that callable. + for func_id, in_knl_callable in self.resolved_functions.items(): + if in_knl_callable == in_kernel_callable: + num_times_callables_called[func_id] += 1 + if not resolved_for_the_first_time: + num_times_callables_called[function.name] -= 1 + if num_times_callables_called[function.name] == 0: + renames_needed_after_editing[func_id] = function.name + + history[func_id] = history[func_id] | set([function.name]) + return ( + self.copy( + history=history, + num_times_hit_during_editing=( + num_times_hit_during_editing), + num_times_callables_called=( + num_times_callables_called), + renames_needed_after_editing=( + renames_needed_after_editing)), + func_id) + else: + # FIXME: maybe deal with the history over here? + # FIXME: once the code logic is running beautify this part. 
+ # many "ifs" can be avoided + unique_function_identifier = function.name + if (resolved_for_the_first_time or + self.num_times_callables_called[function.name] > 1): + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + if not resolved_for_the_first_time: + num_times_callables_called[function.name] -= 1 + + num_times_callables_called[unique_function_identifier] = 1 + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + if not resolved_for_the_first_time: + history[unique_function_identifier] = ( + history[function.name] | set([unique_function_identifier])) + else: + history[unique_function_identifier] = set( + [unique_function_identifier]) + + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions, + num_times_callables_called=num_times_callables_called, + num_times_hit_during_editing=num_times_hit_during_editing, + renames_needed_after_editing=renames_needed_after_editing), + Variable(unique_function_identifier)) + + def with_exit_edit_callables_mode(self): + assert self.is_being_edited + + num_times_callables_called = {} + resolved_functions = {} + history = self.history.copy() + + for func_id, in_knl_callable in self.resolved_functions.items(): + if isinstance(in_knl_callable, CallableKernel): + old_subkernel = in_knl_callable.subkernel + new_subkernel = rename_resolved_functions_in_a_single_kernel( + old_subkernel, self.renames_needed_after_editing) + in_knl_callable = ( + in_knl_callable.copy(subkernel=new_subkernel)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% + type(in_knl_callable).__name__) + + if func_id in self.renames_needed_after_editing: + history.pop(func_id) + + new_func_id = self.renames_needed_after_editing[func_id] + resolved_functions[new_func_id] = ( + in_knl_callable) + num_times_callables_called[new_func_id] = ( + self.num_times_callables_called[func_id]) + + else: + resolved_functions[func_id] = in_knl_callable + num_times_callables_called[func_id] = ( + self.num_times_callables_called[func_id]) + + return self.copy( + is_being_edited=False, + resolved_functions=resolved_functions, + num_times_callables_called=num_times_callables_called, + num_times_hit_during_editing={}, + renames_needed_after_editing={}) + + def with_deleted_callable(self, func_id, instances=1): + num_times_callables_called = self.num_times_callables_called.copy() + history = self.history.copy() + resolved_functions = self.resolved_functions.copy() + + assert instances <= num_times_callables_called[func_id] + + num_times_callables_called[func_id] -= instances + + if num_times_callables_called[func_id] == 0: + num_times_callables_called.pop(func_id) + history.pop(func_id) + resolved_functions.pop(func_id) + + return self.copy( + resolved_functions=resolved_functions, + num_times_callables_called=num_times_callables_called, + history=history) + + def __getitem__(self, item): + return self.resolved_functions[item] + + def __contains__(self, item): + return item in self.resolved_functions + + def items(self): + return self.resolved_functions.items() + + def values(self): + return self.resolved_functions.values() + + +# }}} + + +def default_func_id_to_kernel_callable_mappers(target): + + from loopy.library.function import loopy_specific_callable_scopers + return ( + [loopy_specific_callable_scopers] + ( + target.get_device_ast_builder().function_scopers())) + + +def make_program_from_kernel(kernel): + + program_callables_info = initialize_program_callables_info_from_kernel(kernel, + default_func_id_to_kernel_callable_mappers(kernel.target)) + + program = Program( + name=kernel.name, + program_callables_info=program_callables_info, + func_id_to_in_knl_callable_mappers=( + default_func_id_to_kernel_callable_mappers(kernel.target)), + target=kernel.target) + + return program + + +def iterate_over_kernels_if_given_program(transform_for_single_kernel): + def _collective_transform(program_or_kernel, *args, **kwargs): + if isinstance(program_or_kernel, Program): + program = program_or_kernel + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = transform_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + else: + assert isinstance(program_or_kernel, LoopKernel) + kernel = program_or_kernel + return transform_for_single_kernel(kernel, *args, **kwargs) + + return wraps(transform_for_single_kernel)(_collective_transform) + + +# {{{ ingoring this for now + +# if False and isinstance(function, (ArgExtOp, SegmentedOp)): +# FIXME: ignoring this casse for now +# FIXME: If a kernel has two flavors of ArgExtOp then they are +# overwritten and hence not supported.(for now). +# updated_resolved_functions = self.scoped_functions.copy() +# updated_resolved_functions[function] = in_kernel_callable +# return self.copy(updated_resolved_functions), function.copy() + +# }}} + + +# vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py new file mode 100644 index 000000000..b5b80ad89 --- /dev/null +++ b/loopy/transform/callable.py @@ -0,0 +1,707 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import six + +import islpy as isl +from pymbolic.primitives import CallWithKwargs + +from loopy.kernel import LoopKernel +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, + CInstruction, _DataObliviousInstruction) +from loopy.symbolic import IdentityMapper, SubstitutionMapper +from loopy.isl_helpers import simplify_via_aff +from loopy.kernel.function_interface import (get_kw_pos_association, + change_names_of_pymbolic_calls, CallableKernel, ScalarCallable) +from loopy.program import Program, ResolvedFunctionMarker + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_function_id_to_in_knl_callable_mapper + +.. 
autofunction:: register_callable_kernel
+"""
+
+
+# {{{ register function lookup
+
+def resolved_callables_from_function_lookup(program,
+        func_id_to_kernel_callable_mapper):
+    program_callables_info = program.program_callables_info
+    program_callables_info = program_callables_info.with_edit_callables_mode()
+
+    callable_knls = dict(
+            (func_id, in_knl_callable) for func_id, in_knl_callable in
+            program_callables_info.items() if isinstance(in_knl_callable,
+                CallableKernel))
+    edited_callable_knls = {}
+
+    for func_id, in_knl_callable in callable_knls.items():
+        kernel = in_knl_callable.subkernel
+
+        from loopy.symbolic import SubstitutionRuleMappingContext
+        rule_mapping_context = SubstitutionRuleMappingContext(
+                kernel.substitutions, kernel.get_var_name_generator())
+
+        resolved_function_marker = ResolvedFunctionMarker(
+                rule_mapping_context, kernel, program_callables_info,
+                [func_id_to_kernel_callable_mapper])
+
+        # resolving the functions in the kernel and collecting the resolved
+        # functions
+        new_subkernel = rule_mapping_context.finish_kernel(
+                resolved_function_marker.map_kernel(kernel))
+        program_callables_info = resolved_function_marker.program_callables_info
+
+        edited_callable_knls[func_id] = in_knl_callable.copy(
+                subkernel=new_subkernel)
+
+    program_callables_info = (
+            program_callables_info.with_exit_edit_callables_mode())
+
+    new_resolved_functions = {}
+
+    for func_id, in_knl_callable in program_callables_info.items():
+        if func_id in edited_callable_knls:
+            new_resolved_functions[func_id] = edited_callable_knls[func_id]
+        else:
+            new_resolved_functions[func_id] = in_knl_callable
+
+    program_callables_info = program_callables_info.copy(
+            resolved_functions=new_resolved_functions)
+
+    return program.copy(program_callables_info=program_callables_info)
+
+
+def register_function_id_to_in_knl_callable_mapper(program,
+        func_id_to_in_knl_callable_mapper):
+    """
+    Returns a copy of *program* with *func_id_to_in_knl_callable_mapper*
+    registered as a function resolver.
+
+    :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target,
+        identifier)`` returning a
+        :class:`loopy.kernel.function_interface.InKernelCallable` or *None* if
+        the *function_identifier* is not known.
+    """
+
+    # add the function lookup to the set of function resolvers of the program
+    if func_id_to_in_knl_callable_mapper not in (
+            program.func_id_to_in_knl_callable_mappers):
+        from loopy.tools import unpickles_equally
+        if not unpickles_equally(func_id_to_in_knl_callable_mapper):
+            raise LoopyError("function '%s' does not "
+                    "compare equally after being unpickled "
+                    "and would disrupt loopy's caches"
+                    % func_id_to_in_knl_callable_mapper)
+        new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + (
+                [func_id_to_in_knl_callable_mapper])
+
+    program = resolved_callables_from_function_lookup(program,
+            func_id_to_in_knl_callable_mapper)
+
+    new_program = program.copy(
+            func_id_to_in_knl_callable_mappers=new_func_id_mappers)
+
+    return new_program
+
+# }}}
+
+
+# {{{ register_callable_kernel
+
+class _RegisterCalleeKernel(ImmutableRecord):
+    """
+    Helper class to make the function resolver used by
+    :func:`loopy.transform.register_callable_kernel` picklable, since Python
+    cannot pickle lexical closures.
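+
+    Calling an instance with ``(target, identifier)`` returns the stored
+    callable kernel when *identifier* matches the callee kernel's name, and
+    *None* otherwise.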
+    """
+    fields = set(['callable_kernel'])
+
+    def __init__(self, callable_kernel):
+        self.callable_kernel = callable_kernel
+
+    def __call__(self, target, identifier):
+        if identifier == self.callable_kernel.subkernel.name:
+            return self.callable_kernel
+        return None
+
+
+def register_callable_kernel(program, callee_kernel):
+    """Returns a copy of *program* in which calls to the name of
+    *callee_kernel* in an expression resolve to *callee_kernel*.
+
+    :arg program: An instance of :class:`loopy.program.Program`.
+    :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`.
+    """
+
+    # {{{ sanity checks
+
+    assert isinstance(program, Program)
+    assert isinstance(callee_kernel, LoopKernel)
+
+    # Check that the number of arguments with 'out' direction in the callee
+    # kernel matches the number of assignees at each call site, and likewise
+    # for the remaining parameters.
+    expected_num_assignees = len([arg for arg in callee_kernel.args if
+        arg.is_output_only])
+    expected_num_parameters = len(callee_kernel.args) - expected_num_assignees
+    for in_knl_callable in program.program_callables_info.values():
+        if isinstance(in_knl_callable, CallableKernel):
+            caller_kernel = in_knl_callable.subkernel
+            for insn in caller_kernel.instructions:
+                if isinstance(insn, CallInstruction) and (
+                        insn.expression.function.name == callee_kernel.name):
+                    if isinstance(insn.expression, CallWithKwargs):
+                        kw_parameters = insn.expression.kw_parameters
+                    else:
+                        kw_parameters = {}
+                    if len(insn.assignees) != expected_num_assignees:
+                        raise LoopyError("The number of arguments with 'out' "
+                                "direction in callee kernel %s and the number "
+                                "of assignees in instruction %s do not "
+                                "match." % (
+                                    callee_kernel.name, insn.id))
+                    if len(insn.expression.parameters+tuple(
+                            kw_parameters.values())) != expected_num_parameters:
+                        raise LoopyError("The number of expected arguments "
+                                "for the callee kernel %s and the number of "
+                                "parameters in instruction %s do not match."
+                                % (callee_kernel.name, insn.id))
+
+                elif isinstance(insn, (MultiAssignmentBase, CInstruction,
+                        _DataObliviousInstruction)):
+                    pass
+                else:
+                    raise NotImplementedError("unknown instruction %s" % type(insn))
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown callable type %s." %
+                    type(in_knl_callable).__name__)
+
+    # }}}
+
+    # Take the function resolvers from the program and resolve the functions
+    # in the callee kernel.
+    program_callables_info = (
+            program.program_callables_info.with_edit_callables_mode())
+
+    from loopy.symbolic import SubstitutionRuleMappingContext
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            callee_kernel.substitutions,
+            callee_kernel.get_var_name_generator())
+
+    resolved_function_marker = ResolvedFunctionMarker(
+            rule_mapping_context, callee_kernel, program_callables_info,
+            program.func_id_to_in_knl_callable_mappers)
+
+    callee_kernel = rule_mapping_context.finish_kernel(
+            resolved_function_marker.map_kernel(callee_kernel))
+    program_callables_info = resolved_function_marker.program_callables_info
+
+    program_callables_info = (
+            program_callables_info.with_exit_edit_callables_mode())
+    program = program.copy(program_callables_info=program_callables_info)
+
+    # Make the target of the callee kernel the same as the target of the
+    # caller program.
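+    # The callee is wrapped in a CallableKernel and registered through the
+    # picklable _RegisterCalleeKernel resolver defined above, so that calls
+    # to its name resolve to this callable.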
+ callable_kernel = CallableKernel(subkernel=callee_kernel.copy( + target=program.target, + is_called_from_host=False)) + + # FIXME disabling global barriers for callee kernel (for now) + from loopy import set_options + callee_kernel = set_options(callee_kernel, "disable_global_barriers") + + # FIXME: the number of callables is wrong. This is horrible please + # compensate. + + return register_function_id_to_in_knl_callable_mapper( + program, + _RegisterCalleeKernel(callable_kernel)) + +# }}} + + +# {{{ kernel inliner mapper + +class KernelInliner(SubstitutionMapper): + """Mapper to replace variables (indices, temporaries, arguments) in the + callee kernel with variables in the caller kernel. + + :arg caller: the caller kernel + :arg arg_map: dict of argument name to variables in caller + :arg arg_dict: dict of argument name to arguments in callee + """ + + def __init__(self, subst_func, caller, arg_map, arg_dict): + super(KernelInliner, self).__init__(subst_func) + self.caller = caller + self.arg_map = arg_map + self.arg_dict = arg_dict + + def map_subscript(self, expr): + if expr.aggregate.name in self.arg_map: + + aggregate = self.subst_func(expr.aggregate) + sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller + callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee + if aggregate.name in self.caller.arg_dict: + caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller + else: + caller_arg = self.caller.temporary_variables[aggregate.name] + + # Firstly, map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple) + + # Next, reshape to match dimension of outer arrays. + # We can have e.g. A[3, 2] from outside and B[6] from inside + from numbers import Integral + if not all(isinstance(d, Integral) for d in callee_arg.shape): + raise LoopyError( + "Argument: {0} in callee kernel: {1} does not have " + "constant shape.".format(callee_arg)) + + flatten_index = 0 + for i, idx in enumerate(sar.get_begin_subscript().index_tuple): + flatten_index += idx*caller_arg.dim_tags[i].stride + + flatten_index += sum( + idx * tag.stride + for idx, tag in zip(outer_indices, callee_arg.dim_tags)) + + from loopy.isl_helpers import simplify_via_aff + flatten_index = simplify_via_aff(flatten_index) + + new_indices = [] + for dim_tag in caller_arg.dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + return aggregate.index(tuple(new_indices)) + else: + return super(KernelInliner, self).map_subscript(expr) + +# }}} + + +# {{{ inlining of a single call instruction + +def _inline_call_instruction(caller_kernel, callee_knl, instruction): + """ + Returns a copy of *kernel* with the *instruction* in the *kernel* + replaced by inlining :attr:`subkernel` within it. 
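+
+    Roughly, the callee's inames and temporaries are duplicated into the
+    caller under a prefix derived from the callee's name, the callee's
+    arguments are substituted by the caller's assignees and call parameters
+    (SubArrayRefs), and the rewritten callee instructions are spliced in
+    between two NoOp instructions that carry *instruction*'s dependencies.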
+ """ + callee_label = callee_knl.name[:4] + "_" + + # {{{ duplicate and rename inames + + vng = caller_kernel.get_var_name_generator() + ing = caller_kernel.get_instruction_id_generator() + dim_type = isl.dim_type.set + + iname_map = {} + for iname in callee_knl.all_inames(): + iname_map[iname] = vng(callee_label+iname) + + new_domains = [] + new_iname_to_tags = caller_kernel.iname_to_tags.copy() + + # transferring iname tags info from the callee to the caller kernel + for domain in callee_knl.domains: + new_domain = domain.copy() + for i in range(new_domain.n_dim()): + iname = new_domain.get_dim_name(dim_type, i) + + if iname in callee_knl.iname_to_tags: + new_iname_to_tags[iname_map[iname]] = ( + callee_knl.iname_to_tags[iname]) + new_domain = new_domain.set_dim_name( + dim_type, i, iname_map[iname]) + new_domains.append(new_domain) + + kernel = caller_kernel.copy(domains=caller_kernel.domains + new_domains, + iname_to_tags=new_iname_to_tags) + + # }}} + + # {{{ rename temporaries + + temp_map = {} + new_temps = kernel.temporary_variables.copy() + for name, temp in six.iteritems(callee_knl.temporary_variables): + new_name = vng(callee_label+name) + temp_map[name] = new_name + new_temps[new_name] = temp.copy(name=new_name) + + kernel = kernel.copy(temporary_variables=new_temps) + + # }}} + + # {{{ match kernel arguments + + arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) + + assignees = instruction.assignees # writes + parameters = instruction.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + if isinstance(instruction.expression, CallWithKwargs): + from loopy.kernel.function_interface import get_kw_pos_association + + _, pos_to_kw = get_kw_pos_association(callee_knl) + kw_parameters = instruction.expression.kw_parameters + for i in range(len(parameters), len(parameters) + len(kw_parameters)): + parameters = parameters + (kw_parameters[pos_to_kw[i]],) + + assignee_pos = 0 + parameter_pos = 0 + for i, arg in enumerate(callee_knl.args): + if arg.is_output_only: + arg_map[arg.name] = assignees[assignee_pos] + assignee_pos += 1 + else: + arg_map[arg.name] = parameters[parameter_pos] + parameter_pos += 1 + + # }}} + + # {{{ rewrite instructions + + import pymbolic.primitives as p + from pymbolic.mapper.substitutor import make_subst_func + + var_map = dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(iname_map)) + var_map.update(dict((p.Variable(k), p.Variable(v)) + for k, v in six.iteritems(temp_map))) + var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) + for k, v in six.iteritems(arg_map))) + subst_mapper = KernelInliner( + make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) + + insn_id = {} + for insn in callee_knl.instructions: + insn_id[insn.id] = ing(callee_label+insn.id) + + # {{{ root and leave instructions in callee kernel + + dep_map = callee_knl.recursive_insn_dep_map() + # roots depend on nothing + heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + # leaves have nothing that depends on them + tails = set(dep_map.keys()) + for insn, deps in six.iteritems(dep_map): + tails = tails - deps + + # }}} + + # {{{ use NoOp to mark the start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=instruction.within_inames, + depends_on=instruction.depends_on + ) + noop_end = NoOpInstruction( + id=instruction.id, + 
within_inames=instruction.within_inames, + depends_on=frozenset(insn_id[insn] for insn in tails) + ) + # }}} + + inner_insns = [noop_start] + + for insn in callee_knl.instructions: + insn = insn.with_transformed_expressions(subst_mapper) + within_inames = frozenset(map(iname_map.get, insn.within_inames)) + within_inames = within_inames | instruction.within_inames + depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( + instruction.depends_on) + if insn.id in heads: + depends_on = depends_on | set([noop_start.id]) + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on + ) + inner_insns.append(insn) + + inner_insns.append(noop_end) + + new_insns = [] + for insn in kernel.instructions: + if insn == instruction: + new_insns.extend(inner_insns) + else: + new_insns.append(insn) + + kernel = kernel.copy(instructions=new_insns) + + # }}} + + return kernel + +# }}} + + +# {{{ inline callable kernel + +def _inline_single_callable_kernel(caller_kernel, function_name, + program_callables_info): + old_insns = caller_kernel.instructions + for insn in old_insns: + if isinstance(insn, CallInstruction): + # FIXME This seems to use identifiers across namespaces. Why not + # check whether the function is a scoped function first? ~AK + if insn.expression.function.name in program_callables_info: + history_of_identifier = program_callables_info.history[ + insn.expression.function.name] + + if function_name in history_of_identifier: + in_knl_callable = program_callables_info[ + insn.expression.function.name] + assert isinstance(in_knl_callable, CallableKernel) + caller_kernel = _inline_call_instruction( + caller_kernel, in_knl_callable.subkernel, insn) + program_callables_info = ( + program_callables_info.with_deleted_callable( + insn.expression.function.name, + program_callables_info.num_times_callables_called[ + caller_kernel.name])) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown instruction type %s" + % type(insn).__name__) + + return caller_kernel, program_callables_info + + +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. +def inline_callable_kernel(program, function_name): + """ + Returns a copy of *kernel* with the callable kernel addressed by + (scoped) name *function_name* inlined. 
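+
+    *Example (illustrative only; the program and callee names are
+    hypothetical):*
+
+    .. code-block:: python
+
+        prog = lp.register_callable_kernel(prog, callee_knl)
+        prog = lp.inline_callable_kernel(prog, "linear_combo")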
+ """ + from loopy.preprocess import infer_arg_descr + program = infer_arg_descr(program) + program_callables_info = program.program_callables_info + old_program_callables_info = program_callables_info.copy() + + edited_callable_kernels = {} + + for func_id, in_knl_callable in old_program_callables_info.items(): + if function_name not in old_program_callables_info.history[func_id] and ( + isinstance(in_knl_callable, CallableKernel)): + caller_kernel = in_knl_callable.subkernel + caller_kernel, program_callables_info = ( + _inline_single_callable_kernel(caller_kernel, + function_name, + program_callables_info)) + edited_callable_kernels[func_id] = in_knl_callable.copy( + subkernel=caller_kernel) + + new_resolved_functions = {} + for func_id, in_knl_callable in program_callables_info.items(): + if func_id in edited_callable_kernels: + new_resolved_functions[func_id] = edited_callable_kernels[func_id] + else: + new_resolved_functions[func_id] = in_knl_callable + + program_callables_info = program_callables_info.copy( + resolved_functions=new_resolved_functions) + + return program.copy(program_callables_info=program_callables_info) + +# }}} + + +# {{{ tools to match caller to callee args by (guessed) automatic reshaping + +# (This is undocumented and not recommended, but it is currently needed +# to support Firedrake.) + +class DimChanger(IdentityMapper): + """ + Mapper to change the dimensions of an argument. + + .. attribute:: callee_arg_dict + + A mapping from the argument name (:class:`str`) to instances of + :class:`loopy.kernel.array.ArrayBase`. + + .. attribute:: desried_shape + + A mapping from argument name (:class:`str`) to an instance of + :class:`tuple`. + """ + def __init__(self, callee_arg_dict, desired_shape): + self.callee_arg_dict = callee_arg_dict + self.desired_shape = desired_shape + + def map_subscript(self, expr): + callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags + flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in + zip(callee_arg_dim_tags, expr.index_tuple)) + new_indices = [] + + from operator import mul + from functools import reduce + stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) + + for length in self.desired_shape[expr.aggregate.name]: + stride /= length + ind = flattened_index // int(stride) + flattened_index -= (int(stride) * ind) + new_indices.append(simplify_via_aff(ind)) + + return expr.aggregate.index(tuple(new_indices)) + + +def _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, program_callables_info, callee_function_name): + """ + Returns a copy of *caller_knl* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *caller_knl* aligned with the argument + dimesnsions required by *caller_knl*. + """ + pymbolic_calls_to_new_callables = {} + for insn in caller_knl.instructions: + if not isinstance(insn, CallInstruction) or ( + insn.expression.function.name not in + program_callables_info): + # Call to a callable kernel can only occur through a + # CallInstruction. + continue + + in_knl_callable = program_callables_info[ + insn.expression.function.name] + + if in_knl_callable.subkernel.name != callee_function_name: + # Not the callable we're looking for. 
+ continue + + # getting the caller->callee arg association + + parameters = insn.expression.parameters[:] + kw_parameters = {} + if isinstance(insn.expression, CallWithKwargs): + kw_parameters = insn.expression.kw_parameters + + assignees = insn.assignees + + parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape + for par in parameters] + kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) + for i in range(len(parameters), len(parameters)+len(kw_parameters)): + parameter_shapes.append(kw_parameters[pos_to_kw[i]] + .get_array_arg_descriptor(caller_knl).shape) + + # inserting the assigness at the required positions. + assignee_write_count = -1 + for i, arg in enumerate(in_knl_callable.subkernel.args): + if arg.is_output_only: + assignee = assignees[-assignee_write_count-1] + parameter_shapes.insert(i, assignee + .get_array_arg_descriptor(caller_knl).shape) + assignee_write_count -= 1 + + callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in + in_knl_callable.subkernel.args], parameter_shapes)) + dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, + callee_arg_to_desired_dim_tag) + new_callee_insns = [] + for callee_insn in in_knl_callable.subkernel.instructions: + if isinstance(callee_insn, MultiAssignmentBase): + new_callee_insns.append(callee_insn.copy(expression=dim_changer( + callee_insn.expression), + assignee=dim_changer(callee_insn.assignee))) + elif isinstance(callee_insn, (CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknwon instruction %s." % + type(insn)) + + # subkernel with instructions adjusted according to the new dimensions. + new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) + + new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) + + pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable + + if not pymbolic_calls_to_new_callables: + # complain if no matching function found. + raise LoopyError("No CallableKernel with the name %s found in %s." % ( + callee_function_name, caller_knl.name)) + + return change_names_of_pymbolic_calls(caller_knl, + pymbolic_calls_to_new_callables) + + +def _match_caller_callee_argument_dimension_(program, *args, **kwargs): + assert isinstance(program, Program) + + new_resolved_functions = {} + for func_id, in_knl_callable in program.program_callables_info.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = ( + _match_caller_callee_argument_dimension_for_single_kernel( + in_knl_callable.subkernel, program.program_callables_info, + *args, **kwargs)) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." 
% ( + type(in_knl_callable).__name__)) + + new_resolved_functions[func_id] = in_knl_callable + + new_program_callables_info = program.program_callables_info.copy( + resolved_functions=new_resolved_functions) + return program.copy(program_callables_info=new_program_callables_info) + +# }}} + + +# vim: foldmethod=marker diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 000000000..f25bbbe6f --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,414 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom # noqa: F401 +import loopy as lp +import pytest +import sys + + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import register_log2_lookup + + x = np.random.rand(10) + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + prog = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + prog = lp.register_function_id_to_in_knl_callable_mapper(prog, + register_log2_lookup) + + evt, (out, ) = prog(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + grandchild_knl = lp.make_kernel_function( + "{[i, j]:0<= i, j< 16}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """, name='linear_combo1') + + child_knl = lp.make_kernel_function( + "{[i, j]:0<=i, j < 16}", + """ + [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) + """, name='linear_combo2') + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, child_knl) + knl = lp.register_callable_kernel( + 
knl, grandchild_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo2') + knl = lp.inline_callable_kernel(knl, 'linear_combo1') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_slices_with_negative_step(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + child_knl = lp.make_kernel_function( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + parent_knl = lp.make_kernel( + "{[i, k, m]: 0<=i, k, m<16}", + """ + z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + y[i, :, k, :, m]) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), + lp.GlobalArg( + name='z', + dtype=np.float64, + shape=(16, 16, 16, 16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, child_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_call_with_kwargs(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 2 + + a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_kernel_function( + "{[i, j]:0<=i, j < %d}" % n, + """ + h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] + <>f1[i, j] = 2*f[i, j] + p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] + """, + [ + lp.GlobalArg('f, e, h, g'), '...'], + name='linear_combo') + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, + """ + <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] + [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( + f=[j, l]: a[i, j, k, l, m], + g=[j, l]: d[i, j, k, l, m], + e=[j, l]: c[i, j, k, l, m]) + """) + + knl = lp.register_callable_kernel( + caller_knl, callee_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) + + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + h = out1.get() # h = 2c + 3a + 8b + p = out2.get() # p = 7c + 8a + 4b + h_exact = 3*a + 8*b + 2*c + p_exact = 8*a + 4*b + 7*c + + assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 + assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_hw_axes(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 2 ** 4 + + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_kernel_function( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name='linear_combo') + + callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """ + ) + caller_knl = 
lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") + + knl = lp.register_callable_kernel( + caller_knl, callee_knl) + + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x_dev, y=y_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( + 2*x_host+3*y_host) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_shape_translation_through_sub_array_ref(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) + + callee1 = lp.make_kernel_function( + "{[i]: 0<=i<6}", + """ + a[i] = 2*abs(b[i]) + """, name="callee_fn1") + + callee2 = lp.make_kernel_function( + "{[i, j]: 0<=i<3 and 0 <= j < 2}", + """ + a[i, j] = 3*b[i, j] + """, name="callee_fn2") + + callee3 = lp.make_kernel_function( + "{[i]: 0<=i<6}", + """ + a[i] = 5*b[i] + """, name="callee_fn3") + + knl = lp.make_kernel( + "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", + """ + [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) + [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) + [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) + """) + + knl = lp.register_callable_kernel(knl, callee1) + knl = lp.register_callable_kernel(knl, callee2) + knl = lp.register_callable_kernel(knl, callee3) + + if inline: + knl = lp.inline_callable_kernel(knl, 'callee_fn1') + knl = lp.inline_callable_kernel(knl, 'callee_fn2') + knl = lp.inline_callable_kernel(knl, 'callee_fn3') + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) + + y1 = out_dict['y1'].get() + y2 = out_dict['y2'].get() + y3 = out_dict['y3'].get() + + assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 + assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 + assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 + + +def test_multi_arg_array_call(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + import pymbolic.primitives as p + n = 10 + acc_i = p.Variable("acc_i") + i = p.Variable("i") + index = p.Variable("index") + a_i = p.Subscript(p.Variable("a"), p.Variable("i")) + argmin_kernel = lp.make_kernel_function( + "{[i]: 0 <= i < n}", + [ + lp.Assignment(id="init2", assignee=index, + expression=0), + lp.Assignment(id="init1", assignee=acc_i, + expression="214748367"), + lp.Assignment(id="insn", assignee=index, + expression=p.If(p.Expression.eq(acc_i, a_i), i, index), + depends_on="update"), + lp.Assignment(id="update", assignee=acc_i, + expression=p.Variable("min")(acc_i, a_i), + depends_on="init1,init2")], + name="custom_argmin") + + argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) + + knl = lp.make_kernel( + "{[i]:0<=i 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: foldmethod=marker -- GitLab From 28bb8efd90784545444c705c7820d26e4ef2a555 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 16:45:18 +0530 Subject: [PATCH 339/916] removing unused part of code. 
--- loopy/kernel/function_interface.py | 103 ----- loopy/transform/callable.py | 592 +---------------------------- test/test_callables.py | 345 ----------------- 3 files changed, 2 insertions(+), 1038 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 2ea260656..8b24da21d 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -524,109 +524,6 @@ class CallableKernel(InKernelCallable): def name(self): return self.subkernel.name - def with_types(self, arg_id_to_dtype, caller_kernel, - program_callables_info): - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - new_args = [] - for arg in self.subkernel.args: - kw = arg.name - if kw in arg_id_to_dtype: - # id exists as kw - new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) - elif kw_to_pos[kw] in arg_id_to_dtype: - # id exists as positional argument - new_args.append(arg.copy( - dtype=arg_id_to_dtype[kw_to_pos[kw]])) - else: - new_args.append(arg) - - from loopy.type_inference import ( - infer_unknown_types_for_a_single_kernel) - pre_specialized_subkernel = self.subkernel.copy( - args=new_args) - - # infer the types of the written variables based on the knowledge - # of the types of the arguments supplied - specialized_kernel, program_callables_info = ( - infer_unknown_types_for_a_single_kernel( - pre_specialized_subkernel, - program_callables_info, - expect_completion=True)) - - new_arg_id_to_dtype = {} - for arg in specialized_kernel.args: - # associate the updated_arg_id_to_dtype with keyword as well as - # positional id. - new_arg_id_to_dtype[arg.name] = arg.dtype - new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype - - # Return the kernel call with specialized subkernel and the corresponding - # new arg_id_to_dtype - return self.copy(subkernel=specialized_kernel, - arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info - - def with_descrs(self, arg_id_to_descr, program_callables_info): - - # tune the subkernel so that we have the matching shapes and - # dim_tags - - new_args = self.subkernel.args[:] - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - for arg_id, descr in arg_id_to_descr.items(): - if isinstance(arg_id, int): - arg_id = pos_to_kw[arg_id] - assert isinstance(arg_id, str) - - if isinstance(descr, ArrayArgDescriptor): - new_arg = self.subkernel.arg_dict[arg_id].copy( - shape=descr.shape, - dim_tags=descr.dim_tags, - address_space=descr.address_space) - # replacing the new arg with the arg of the same name - new_args = [new_arg if arg.name == arg_id else arg for arg in - new_args] - elif isinstance(descr, ValueArgDescriptor): - pass - else: - raise LoopyError("Descriptor must be either an instance of " - "ArrayArgDescriptor or ValueArgDescriptor -- got %s." 
% - type(descr)) - descriptor_specialized_knl = self.subkernel.copy(args=new_args) - from loopy.preprocess import traverse_to_infer_arg_descr - descriptor_specialized_knl, program_callables_info = ( - traverse_to_infer_arg_descr(descriptor_specialized_knl, - program_callables_info)) - - return ( - self.copy( - subkernel=descriptor_specialized_knl, - arg_id_to_descr=arg_id_to_descr), - program_callables_info) - - def with_packing_for_args(self): - from loopy.kernel.data import AddressSpace - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - - arg_id_to_descr = {} - - for pos, kw in pos_to_kw.items(): - arg = self.subkernel.arg_dict[kw] - arg_id_to_descr[pos] = ArrayArgDescriptor( - shape=arg.shape, - dim_tags=arg.dim_tags, - address_space=AddressSpace.GLOBAL) - - return self.copy(subkernel=self.subkernel, - arg_id_to_descr=arg_id_to_descr) - - def with_hw_axes_sizes(self, gsize, lsize): - return self.copy( - subkernel=self.subkernel.copy( - overridden_get_grid_sizes_for_insn_ids=( - GridOverrideForCalleeKernel(lsize, gsize)))) - def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index b5b80ad89..9d9935ab0 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -21,29 +21,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - -import six - -import islpy as isl -from pymbolic.primitives import CallWithKwargs - -from loopy.kernel import LoopKernel -from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, - CInstruction, _DataObliviousInstruction) -from loopy.symbolic import IdentityMapper, SubstitutionMapper -from loopy.isl_helpers import simplify_via_aff -from loopy.kernel.function_interface import (get_kw_pos_association, - change_names_of_pymbolic_calls, CallableKernel, ScalarCallable) -from loopy.program import Program, ResolvedFunctionMarker +from loopy.kernel.function_interface import CallableKernel +from loopy.program import ResolvedFunctionMarker __doc__ = """ .. currentmodule:: loopy .. autofunction:: register_function_id_to_in_knl_callable_mapper -.. autofunction:: register_callable_kernel """ @@ -130,578 +116,4 @@ def register_function_id_to_in_knl_callable_mapper(program, # }}} -# {{{ register_callable_kernel - -class _RegisterCalleeKernel(ImmutableRecord): - """ - Helper class to make the function scoper from - :func:`loopy.transform.register_callable_kernel` picklable. As python - cannot pickle lexical closures. - """ - fields = set(['callable_kernel']) - - def __init__(self, callable_kernel): - self.callable_kernel = callable_kernel - - def __call__(self, target, identifier): - if identifier == self.callable_kernel.subkernel.name: - return self.callable_kernel - return None - - -def register_callable_kernel(program, callee_kernel): - """Returns a copy of *caller_kernel*, which would resolve *function_name* in an - expression as a call to *callee_kernel*. - - :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - :arg function_name: An instance of :class:`str`. - :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. 
- """ - - # {{{ sanity checks - - assert isinstance(program, Program) - assert isinstance(callee_kernel, LoopKernel) - - # check to make sure that the variables with 'out' direction is equal to - # the number of assigness in the callee kernel intructions. - expected_num_assignees = len([arg for arg in callee_kernel.args if - arg.is_output_only]) - expected_num_parameters = len(callee_kernel.args) - expected_num_assignees - for in_knl_callable in program.program_callables_info.values(): - if isinstance(in_knl_callable, CallableKernel): - caller_kernel = in_knl_callable.subkernel - for insn in caller_kernel.instructions: - if isinstance(insn, CallInstruction) and ( - insn.expression.function.name == callee_kernel.name): - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - else: - kw_parameters = {} - if len(insn.assignees) != expected_num_assignees: - raise LoopyError("The number of arguments with 'out' " - "direction " "in callee kernel %s and the number " - "of assignees in " "instruction %s do not " - "match." % ( - callee_kernel.name, insn.id)) - if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of " - "parameters in instruction %s do not match." - % (callee_kernel.name, insn.id)) - - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("unknown instruction %s" % type(insn)) - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callable type %s." % - type(in_knl_callable).__name__) - - # }}} - - # take the function resolvers from the Program and resolve the functions in - # the callee kernel - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) - - from loopy.symbolic import SubstitutionRuleMappingContext - rule_mapping_context = SubstitutionRuleMappingContext( - callee_kernel.substitutions, - callee_kernel.get_var_name_generator()) - - resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, callee_kernel, program_callables_info, - program.func_id_to_in_knl_callable_mappers) - - callee_kernel = rule_mapping_context.finish_kernel( - resolved_function_marker.map_kernel(callee_kernel)) - program_callables_info = resolved_function_marker.program_callables_info - - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - program = program.copy(program_callables_info=program_callables_info) - - # making the target of the child kernel to be same as the target of parent - # kernel. - callable_kernel = CallableKernel(subkernel=callee_kernel.copy( - target=program.target, - is_called_from_host=False)) - - # FIXME disabling global barriers for callee kernel (for now) - from loopy import set_options - callee_kernel = set_options(callee_kernel, "disable_global_barriers") - - # FIXME: the number of callables is wrong. This is horrible please - # compensate. - - return register_function_id_to_in_knl_callable_mapper( - program, - _RegisterCalleeKernel(callable_kernel)) - -# }}} - - -# {{{ kernel inliner mapper - -class KernelInliner(SubstitutionMapper): - """Mapper to replace variables (indices, temporaries, arguments) in the - callee kernel with variables in the caller kernel. 
- - :arg caller: the caller kernel - :arg arg_map: dict of argument name to variables in caller - :arg arg_dict: dict of argument name to arguments in callee - """ - - def __init__(self, subst_func, caller, arg_map, arg_dict): - super(KernelInliner, self).__init__(subst_func) - self.caller = caller - self.arg_map = arg_map - self.arg_dict = arg_dict - - def map_subscript(self, expr): - if expr.aggregate.name in self.arg_map: - - aggregate = self.subst_func(expr.aggregate) - sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller - callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee - if aggregate.name in self.caller.arg_dict: - caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller - else: - caller_arg = self.caller.temporary_variables[aggregate.name] - - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in callee_arg.shape): - raise LoopyError( - "Argument: {0} in callee kernel: {1} does not have " - "constant shape.".format(callee_arg)) - - flatten_index = 0 - for i, idx in enumerate(sar.get_begin_subscript().index_tuple): - flatten_index += idx*caller_arg.dim_tags[i].stride - - flatten_index += sum( - idx * tag.stride - for idx, tag in zip(outer_indices, callee_arg.dim_tags)) - - from loopy.isl_helpers import simplify_via_aff - flatten_index = simplify_via_aff(flatten_index) - - new_indices = [] - for dim_tag in caller_arg.dim_tags: - ind = flatten_index // dim_tag.stride - flatten_index -= (dim_tag.stride * ind) - new_indices.append(ind) - - new_indices = tuple(simplify_via_aff(i) for i in new_indices) - - return aggregate.index(tuple(new_indices)) - else: - return super(KernelInliner, self).map_subscript(expr) - -# }}} - - -# {{{ inlining of a single call instruction - -def _inline_call_instruction(caller_kernel, callee_knl, instruction): - """ - Returns a copy of *kernel* with the *instruction* in the *kernel* - replaced by inlining :attr:`subkernel` within it. 
- """ - callee_label = callee_knl.name[:4] + "_" - - # {{{ duplicate and rename inames - - vng = caller_kernel.get_var_name_generator() - ing = caller_kernel.get_instruction_id_generator() - dim_type = isl.dim_type.set - - iname_map = {} - for iname in callee_knl.all_inames(): - iname_map[iname] = vng(callee_label+iname) - - new_domains = [] - new_iname_to_tags = caller_kernel.iname_to_tags.copy() - - # transferring iname tags info from the callee to the caller kernel - for domain in callee_knl.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) - - if iname in callee_knl.iname_to_tags: - new_iname_to_tags[iname_map[iname]] = ( - callee_knl.iname_to_tags[iname]) - new_domain = new_domain.set_dim_name( - dim_type, i, iname_map[iname]) - new_domains.append(new_domain) - - kernel = caller_kernel.copy(domains=caller_kernel.domains + new_domains, - iname_to_tags=new_iname_to_tags) - - # }}} - - # {{{ rename temporaries - - temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(callee_knl.temporary_variables): - new_name = vng(callee_label+name) - temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) - - kernel = kernel.copy(temporary_variables=new_temps) - - # }}} - - # {{{ match kernel arguments - - arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) - - assignees = instruction.assignees # writes - parameters = instruction.expression.parameters # reads - - # add keyword parameters - from pymbolic.primitives import CallWithKwargs - - if isinstance(instruction.expression, CallWithKwargs): - from loopy.kernel.function_interface import get_kw_pos_association - - _, pos_to_kw = get_kw_pos_association(callee_knl) - kw_parameters = instruction.expression.kw_parameters - for i in range(len(parameters), len(parameters) + len(kw_parameters)): - parameters = parameters + (kw_parameters[pos_to_kw[i]],) - - assignee_pos = 0 - parameter_pos = 0 - for i, arg in enumerate(callee_knl.args): - if arg.is_output_only: - arg_map[arg.name] = assignees[assignee_pos] - assignee_pos += 1 - else: - arg_map[arg.name] = parameters[parameter_pos] - parameter_pos += 1 - - # }}} - - # {{{ rewrite instructions - - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func - - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(arg_map))) - subst_mapper = KernelInliner( - make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) - - insn_id = {} - for insn in callee_knl.instructions: - insn_id[insn.id] = ing(callee_label+insn.id) - - # {{{ root and leave instructions in callee kernel - - dep_map = callee_knl.recursive_insn_dep_map() - # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): - tails = tails - deps - - # }}} - - # {{{ use NoOp to mark the start and end of callee kernel - - from loopy.kernel.instruction import NoOpInstruction - - noop_start = NoOpInstruction( - id=ing(callee_label+"_start"), - within_inames=instruction.within_inames, - depends_on=instruction.depends_on - ) - noop_end = NoOpInstruction( - id=instruction.id, - 
within_inames=instruction.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) - ) - # }}} - - inner_insns = [noop_start] - - for insn in callee_knl.instructions: - insn = insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(iname_map.get, insn.within_inames)) - within_inames = within_inames | instruction.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( - instruction.depends_on) - if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on - ) - inner_insns.append(insn) - - inner_insns.append(noop_end) - - new_insns = [] - for insn in kernel.instructions: - if insn == instruction: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) - - kernel = kernel.copy(instructions=new_insns) - - # }}} - - return kernel - -# }}} - - -# {{{ inline callable kernel - -def _inline_single_callable_kernel(caller_kernel, function_name, - program_callables_info): - old_insns = caller_kernel.instructions - for insn in old_insns: - if isinstance(insn, CallInstruction): - # FIXME This seems to use identifiers across namespaces. Why not - # check whether the function is a scoped function first? ~AK - if insn.expression.function.name in program_callables_info: - history_of_identifier = program_callables_info.history[ - insn.expression.function.name] - - if function_name in history_of_identifier: - in_knl_callable = program_callables_info[ - insn.expression.function.name] - assert isinstance(in_knl_callable, CallableKernel) - caller_kernel = _inline_call_instruction( - caller_kernel, in_knl_callable.subkernel, insn) - program_callables_info = ( - program_callables_info.with_deleted_callable( - insn.expression.function.name, - program_callables_info.num_times_callables_called[ - caller_kernel.name])) - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError( - "Unknown instruction type %s" - % type(insn).__name__) - - return caller_kernel, program_callables_info - - -# FIXME This should take a 'within' parameter to be able to only inline -# *some* calls to a kernel, but not others. -def inline_callable_kernel(program, function_name): - """ - Returns a copy of *kernel* with the callable kernel addressed by - (scoped) name *function_name* inlined. 
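    *Example* (an illustrative sketch only; ``"linear_combo"`` stands in for
    whatever callee-kernel name was registered via
    ``register_callable_kernel``)::

        program = lp.inline_callable_kernel(program, "linear_combo")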
- """ - from loopy.preprocess import infer_arg_descr - program = infer_arg_descr(program) - program_callables_info = program.program_callables_info - old_program_callables_info = program_callables_info.copy() - - edited_callable_kernels = {} - - for func_id, in_knl_callable in old_program_callables_info.items(): - if function_name not in old_program_callables_info.history[func_id] and ( - isinstance(in_knl_callable, CallableKernel)): - caller_kernel = in_knl_callable.subkernel - caller_kernel, program_callables_info = ( - _inline_single_callable_kernel(caller_kernel, - function_name, - program_callables_info)) - edited_callable_kernels[func_id] = in_knl_callable.copy( - subkernel=caller_kernel) - - new_resolved_functions = {} - for func_id, in_knl_callable in program_callables_info.items(): - if func_id in edited_callable_kernels: - new_resolved_functions[func_id] = edited_callable_kernels[func_id] - else: - new_resolved_functions[func_id] = in_knl_callable - - program_callables_info = program_callables_info.copy( - resolved_functions=new_resolved_functions) - - return program.copy(program_callables_info=program_callables_info) - -# }}} - - -# {{{ tools to match caller to callee args by (guessed) automatic reshaping - -# (This is undocumented and not recommended, but it is currently needed -# to support Firedrake.) - -class DimChanger(IdentityMapper): - """ - Mapper to change the dimensions of an argument. - - .. attribute:: callee_arg_dict - - A mapping from the argument name (:class:`str`) to instances of - :class:`loopy.kernel.array.ArrayBase`. - - .. attribute:: desried_shape - - A mapping from argument name (:class:`str`) to an instance of - :class:`tuple`. - """ - def __init__(self, callee_arg_dict, desired_shape): - self.callee_arg_dict = callee_arg_dict - self.desired_shape = desired_shape - - def map_subscript(self, expr): - callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags - flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in - zip(callee_arg_dim_tags, expr.index_tuple)) - new_indices = [] - - from operator import mul - from functools import reduce - stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) - - for length in self.desired_shape[expr.aggregate.name]: - stride /= length - ind = flattened_index // int(stride) - flattened_index -= (int(stride) * ind) - new_indices.append(simplify_via_aff(ind)) - - return expr.aggregate.index(tuple(new_indices)) - - -def _match_caller_callee_argument_dimension_for_single_kernel( - caller_knl, program_callables_info, callee_function_name): - """ - Returns a copy of *caller_knl* with the instance of - :class:`loopy.kernel.function_interface.CallableKernel` addressed by - *callee_function_name* in the *caller_knl* aligned with the argument - dimesnsions required by *caller_knl*. - """ - pymbolic_calls_to_new_callables = {} - for insn in caller_knl.instructions: - if not isinstance(insn, CallInstruction) or ( - insn.expression.function.name not in - program_callables_info): - # Call to a callable kernel can only occur through a - # CallInstruction. - continue - - in_knl_callable = program_callables_info[ - insn.expression.function.name] - - if in_knl_callable.subkernel.name != callee_function_name: - # Not the callable we're looking for. 
- continue - - # getting the caller->callee arg association - - parameters = insn.expression.parameters[:] - kw_parameters = {} - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - - assignees = insn.assignees - - parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape - for par in parameters] - kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) - for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameter_shapes.append(kw_parameters[pos_to_kw[i]] - .get_array_arg_descriptor(caller_knl).shape) - - # inserting the assigness at the required positions. - assignee_write_count = -1 - for i, arg in enumerate(in_knl_callable.subkernel.args): - if arg.is_output_only: - assignee = assignees[-assignee_write_count-1] - parameter_shapes.insert(i, assignee - .get_array_arg_descriptor(caller_knl).shape) - assignee_write_count -= 1 - - callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in - in_knl_callable.subkernel.args], parameter_shapes)) - dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, - callee_arg_to_desired_dim_tag) - new_callee_insns = [] - for callee_insn in in_knl_callable.subkernel.instructions: - if isinstance(callee_insn, MultiAssignmentBase): - new_callee_insns.append(callee_insn.copy(expression=dim_changer( - callee_insn.expression), - assignee=dim_changer(callee_insn.assignee))) - elif isinstance(callee_insn, (CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknwon instruction %s." % - type(insn)) - - # subkernel with instructions adjusted according to the new dimensions. - new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) - - new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) - - pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable - - if not pymbolic_calls_to_new_callables: - # complain if no matching function found. - raise LoopyError("No CallableKernel with the name %s found in %s." % ( - callee_function_name, caller_knl.name)) - - return change_names_of_pymbolic_calls(caller_knl, - pymbolic_calls_to_new_callables) - - -def _match_caller_callee_argument_dimension_(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = ( - _match_caller_callee_argument_dimension_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, - *args, **kwargs)) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." 
% ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) - -# }}} - - # vim: foldmethod=marker diff --git a/test/test_callables.py b/test/test_callables.py index f25bbbe6f..d2ca9b71c 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -26,7 +26,6 @@ import numpy as np import pyopencl as cl import pyopencl.clrandom # noqa: F401 import loopy as lp -import pytest import sys @@ -60,350 +59,6 @@ def test_register_function_lookup(ctx_factory): assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 2 ** 4 - - x = np.random.rand(n, n, n, n, n) - y = np.random.rand(n, n, n, n, n) - - grandchild_knl = lp.make_kernel_function( - "{[i, j]:0<= i, j< 16}", - """ - c[i, j] = 2*a[i, j] + 3*b[i, j] - """, name='linear_combo1') - - child_knl = lp.make_kernel_function( - "{[i, j]:0<=i, j < 16}", - """ - [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) - """, name='linear_combo2') - - parent_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", - """ - [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], - [j, l]: y[i, j, k, l, m]) - """, - kernel_data=[ - lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], - ) - - knl = lp.register_callable_kernel( - parent_knl, child_knl) - knl = lp.register_callable_kernel( - knl, grandchild_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo2') - knl = lp.inline_callable_kernel(knl, 'linear_combo1') - - evt, (out, ) = knl(queue, x=x, y=y) - - assert (np.linalg.norm(2*x+3*y-out)/( - np.linalg.norm(2*x+3*y))) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_slices_with_negative_step(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - n = 2 ** 4 - - x = np.random.rand(n, n, n, n, n) - y = np.random.rand(n, n, n, n, n) - - child_knl = lp.make_kernel_function( - "{[i, j]:0<=i, j < 16}", - """ - g[i, j] = 2*e[i, j] + 3*f[i, j] - """, name="linear_combo") - - parent_knl = lp.make_kernel( - "{[i, k, m]: 0<=i, k, m<16}", - """ - z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], - y[i, :, k, :, m]) - """, - kernel_data=[ - lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='z', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], - ) - - knl = lp.register_callable_kernel( - parent_knl, child_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out, ) = knl(queue, x=x, y=y) - - assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( - np.linalg.norm(2*x+3*y))) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl_with_call_with_kwargs(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 2 ** 2 - - a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - - callee_knl = lp.make_kernel_function( - "{[i, 
j]:0<=i, j < %d}" % n, - """ - h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] - <>f1[i, j] = 2*f[i, j] - p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] - """, - [ - lp.GlobalArg('f, e, h, g'), '...'], - name='linear_combo') - - caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, - """ - <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] - [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( - f=[j, l]: a[i, j, k, l, m], - g=[j, l]: d[i, j, k, l, m], - e=[j, l]: c[i, j, k, l, m]) - """) - - knl = lp.register_callable_kernel( - caller_knl, callee_knl) - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) - - a = a_dev.get() - b = b_dev.get() - c = c_dev.get() - - h = out1.get() # h = 2c + 3a + 8b - p = out2.get() # p = 7c + 8a + 4b - h_exact = 3*a + 8*b + 2*c - p_exact = 8*a + 4*b + 7*c - - assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 - assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl_with_hw_axes(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 2 ** 4 - - x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - - callee_knl = lp.make_kernel_function( - "{[i, j]:0<=i, j < 16}", - """ - g[i, j] = 2*e[i, j] + 3*f[i, j] - """, name='linear_combo') - - callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") - - caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", - """ - [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], - [j, l]: y[i, j, k, l, m]) - """ - ) - caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") - - knl = lp.register_callable_kernel( - caller_knl, callee_knl) - - if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') - - evt, (out, ) = knl(queue, x=x_dev, y=y_dev) - - x_host = x_dev.get() - y_host = y_dev.get() - - assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( - 2*x_host+3*y_host) < 1e-15 - - -@pytest.mark.parametrize("inline", [False, True]) -def test_shape_translation_through_sub_array_ref(ctx_factory, inline): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - - x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) - x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) - x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) - - callee1 = lp.make_kernel_function( - "{[i]: 0<=i<6}", - """ - a[i] = 2*abs(b[i]) - """, name="callee_fn1") - - callee2 = lp.make_kernel_function( - "{[i, j]: 0<=i<3 and 0 <= j < 2}", - """ - a[i, j] = 3*b[i, j] - """, name="callee_fn2") - - callee3 = lp.make_kernel_function( - "{[i]: 0<=i<6}", - """ - a[i] = 5*b[i] - """, name="callee_fn3") - - knl = lp.make_kernel( - "{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", - """ - [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) - [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) - [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) - """) - - knl = lp.register_callable_kernel(knl, callee1) - knl = lp.register_callable_kernel(knl, callee2) - knl = lp.register_callable_kernel(knl, callee3) - - if inline: - knl = lp.inline_callable_kernel(knl, 'callee_fn1') - knl = lp.inline_callable_kernel(knl, 'callee_fn2') - knl = lp.inline_callable_kernel(knl, 'callee_fn3') - - knl = lp.set_options(knl, "write_cl") - knl = lp.set_options(knl, "return_dict") - evt, 
out_dict = knl(queue, x1=x1, x2=x2, x3=x3) - - y1 = out_dict['y1'].get() - y2 = out_dict['y2'].get() - y3 = out_dict['y3'].get() - - assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 - assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 - assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 - - -def test_multi_arg_array_call(ctx_factory): - ctx = ctx_factory() - queue = cl.CommandQueue(ctx) - import pymbolic.primitives as p - n = 10 - acc_i = p.Variable("acc_i") - i = p.Variable("i") - index = p.Variable("index") - a_i = p.Subscript(p.Variable("a"), p.Variable("i")) - argmin_kernel = lp.make_kernel_function( - "{[i]: 0 <= i < n}", - [ - lp.Assignment(id="init2", assignee=index, - expression=0), - lp.Assignment(id="init1", assignee=acc_i, - expression="214748367"), - lp.Assignment(id="insn", assignee=index, - expression=p.If(p.Expression.eq(acc_i, a_i), i, index), - depends_on="update"), - lp.Assignment(id="update", assignee=acc_i, - expression=p.Variable("min")(acc_i, a_i), - depends_on="init1,init2")], - name="custom_argmin") - - argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) - - knl = lp.make_kernel( - "{[i]:0<=i 1: exec(sys.argv[1]) -- GitLab From 5ed57fe2f50af100a75c08ff1f876c938123d666 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 18:44:11 +0530 Subject: [PATCH 340/916] minor error handling. --- loopy/codegen/__init__.py | 18 ++++------ loopy/kernel/__init__.py | 56 +++++------------------------- loopy/kernel/creation.py | 9 ++--- loopy/kernel/function_interface.py | 4 --- loopy/kernel/instruction.py | 12 ++----- loopy/preprocess.py | 11 ++---- loopy/type_inference.py | 19 ++-------- 7 files changed, 25 insertions(+), 104 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 3e675db75..7a25b67ed 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -150,7 +150,6 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState(object): """ .. attribute:: kernel - .. attribute:: target .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -196,7 +195,7 @@ class CodeGenerationState(object): .. 
attribute:: program_callables_info """ - def __init__(self, kernel, target, + def __init__(self, kernel, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, @@ -206,7 +205,6 @@ class CodeGenerationState(object): gen_program_name=None, schedule_index_end=None): self.kernel = kernel - self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -224,7 +222,7 @@ class CodeGenerationState(object): # {{{ copy helpers - def copy(self, kernel=None, target=None, implemented_data_info=None, + def copy(self, kernel=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), var_subst_map=None, vectorization_info=None, is_generating_device_code=None, @@ -234,9 +232,6 @@ class CodeGenerationState(object): if kernel is None: kernel = self.kernel - if target is None: - target = self.target - if implemented_data_info is None: implemented_data_info = self.implemented_data_info @@ -257,7 +252,6 @@ class CodeGenerationState(object): return CodeGenerationState( kernel=kernel, - target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( @@ -389,7 +383,7 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_for_a_single_kernel(kernel, program_callables_info, target): +def generate_code_for_a_single_kernel(kernel, program_callables_info): """ :returns: a :class:`CodeGenerationResult` """ @@ -477,7 +471,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info, target): gen_program_name=( kernel.target.host_program_name_prefix + kernel.name - + target.host_program_name_suffix), + + kernel.target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), program_callables_info=program_callables_info) @@ -512,7 +506,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info, target): ) preamble_generators = (kernel.preamble_generators - + target.get_device_ast_builder().preamble_generators()) + + kernel.target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -555,7 +549,7 @@ def generate_code_v2(program): if isinstance(in_knl_callable, CallableKernel): codegen_results[func_id] = ( generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.program_callables_info, program.target)) + program.program_callables_info)) device_preambles = set() for cgr in codegen_results.values(): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index d2723c57f..f686e58f1 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1036,25 +1036,19 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) - def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, - program_callables_info, ignore_auto=False): + def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, + ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. 
- :arg insn_ids: a :class:`frozenset` of instruction IDs - - *global_size* and *local_size* are instances of :class:`dict` with - mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. + *global_size* and *local_size* are :class:`islpy.PwAff` objects. """ - # {{{ collecting the callee kernels in insn_ids - - from loopy.kernel.tools import get_direct_callee_kernels - callee_kernels = get_direct_callee_kernels(self, - program_callables_info, insn_ids) - - # }}} + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + ignore_auto=ignore_auto) all_inames_by_insns = set() for insn_id in insn_ids: @@ -1069,15 +1063,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): global_sizes = {} local_sizes = {} - # updating the grid sizes from the callee_kernels. - for callee_kernel in callee_kernels: - gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts( - frozenset(insn.id for insn in callee_kernel.instructions), - program_callables_info, ignore_auto) - - global_sizes.update(gsize) - local_sizes.update(lsize) - from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) @@ -1118,31 +1103,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): tgt_dict[tag.axis] = size - return global_sizes, local_sizes - - def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, - ignore_auto=False): - """Return a tuple (global_size, local_size) containing a grid that - could accommodate execution of all instructions whose IDs are given - in *insn_ids*. - - :arg insn_ids: a :class:`frozenset` of instruction IDs - - *global_size* and *local_size* are :class:`islpy.PwAff` objects. - """ - - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - program_callables_info=program_callables_info, - ignore_auto=ignore_auto) - - assert self.is_called_from_host, ("Callee kernels do not have sufficient " - "information to compute grid sizes.") - - global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( - insn_ids, program_callables_info, ignore_auto=ignore_auto) - def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1172,6 +1132,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index bac4afc85..bc996d9c7 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -27,16 +27,13 @@ THE SOFTWARE. 
import numpy as np from pymbolic.mapper import CSECachingMapperMixin -from pymbolic.primitives import Slice, Variable, Subscript from loopy.tools import intern_frozenset_of_ids from loopy.symbolic import ( - IdentityMapper, WalkMapper, SubArrayRef) + IdentityMapper, WalkMapper) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, SubstitutionRule, AddressSpace) -from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, - CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -507,11 +504,9 @@ def parse_insn(groups, insn_options): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) - elif isinstance(inner_lhs_i, SubArrayRef): - assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable, subscript or a SubArrayRef" % (lhs_i,)) + "be variable or subscript" % (lhs_i,)) new_lhs.append(lhs_i) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 8b24da21d..e0954fb73 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -571,13 +571,9 @@ class CallableKernel(InKernelCallable): # no type casting in array calls from loopy.expression import dtype_to_type_context from pymbolic.mapper.stringifier import PREC_NONE - from loopy.symbolic import SubArrayRef from pymbolic import var c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr if isinstance(par, SubArrayRef) else expression_to_code_mapper(par, PREC_NONE, dtype_to_type_context(target, par_dtype), par_dtype).expr diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 0f548bba7..2a03ad637 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -487,7 +487,7 @@ class InstructionBase(ImmutableRecord): def _get_assignee_var_name(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, SubArrayRef + from loopy.symbolic import LinearSubscript if isinstance(expr, Lookup): expr = expr.aggregate @@ -507,19 +507,13 @@ def _get_assignee_var_name(expr): return agg.name - elif isinstance(expr, SubArrayRef): - agg = expr.subscript.aggregate - assert isinstance(agg, Variable) - - return agg.name - else: raise RuntimeError("invalid lvalue '%s'" % expr) def _get_assignee_subscript_deps(expr): from pymbolic.primitives import Variable, Subscript, Lookup - from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef + from loopy.symbolic import LinearSubscript, get_dependencies if isinstance(expr, Lookup): expr = expr.aggregate @@ -530,8 +524,6 @@ def _get_assignee_subscript_deps(expr): return get_dependencies(expr.index) elif isinstance(expr, LinearSubscript): return get_dependencies(expr.index) - elif isinstance(expr, SubArrayRef): - return get_dependencies(expr.get_begin_subscript().index) else: raise RuntimeError("invalid lvalue '%s'" % expr) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 3657967a1..bf23c4a44 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2165,7 +2165,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): def map_call(self, expr, expn_state, **kwargs): from pymbolic.primitives import Call, CallWithKwargs from loopy.kernel.function_interface import ValueArgDescriptor - 
from loopy.symbolic import ResolvedFunction, SubArrayRef + from loopy.symbolic import ResolvedFunction if not isinstance(expr.function, ResolvedFunction): # ignore if the call is not to a ResolvedFunction @@ -2178,8 +2178,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): kw_parameters = expr.kw_parameters # descriptors for the args and kwargs of the Call - arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.caller_kernel)) - if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) + arg_id_to_descr = dict((i, ValueArgDescriptor()) for i, par in tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) @@ -2190,11 +2189,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): assignees = kwargs['assignees'] assert isinstance(assignees, tuple) for i, par in enumerate(assignees): - if isinstance(par, SubArrayRef): - assignee_id_to_descr[-i-1] = ( - par.get_array_arg_descriptor(self.caller_kernel)) - else: - assignee_id_to_descr[-i-1] = ValueArgDescriptor() + assignee_id_to_descr[-i-1] = ValueArgDescriptor() # gathering all the descriptors combined_arg_id_to_descr = arg_id_to_descr.copy() diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 0e8fa3053..3ae9a142e 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -36,7 +36,7 @@ from loopy.diagnostic import ( from loopy.kernel.instruction import _DataObliviousInstruction from loopy.program import ProgramCallablesInfo -from loopy.symbolic import SubArrayRef, LinearSubscript +from loopy.symbolic import LinearSubscript from pymbolic.primitives import Variable, Subscript, Lookup import logging @@ -548,10 +548,6 @@ class TypeInferenceMapper(CombineMapper): return [expr.operation.result_dtypes(self.kernel, rec_result)[0] for rec_result in rec_results] - def map_sub_array_ref(self, expr): - return self.rec(expr.get_begin_subscript()) - - # }}} @@ -831,17 +827,8 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, assignee.aggregate.name].dtype is None: return False else: - assert isinstance(assignee, SubArrayRef) - if assignee.subscript.aggregate.name in kernel.arg_dict: - if kernel.arg_dict[ - assignee.subscript.aggregate.name].dtype is None: - return False - else: - assert assignee.subscript.aggregate.name in ( - kernel.temporary_variables) - if kernel.temporary_variables[ - assignee.subscript.aggregate.name] is None: - return False + raise NotImplementedError("Unknown assignee type %s" % + type(assignee)) return True -- GitLab From 79fed9786ce5ae90c367ac6cbff1192678aa1014 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 18:55:30 +0530 Subject: [PATCH 341/916] Flake8 --- loopy/isl_helpers.py | 2 +- loopy/kernel/__init__.py | 11 ----------- loopy/target/opencl.py | 5 ----- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index ef07b7e27..5a747d070 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -27,7 +27,7 @@ THE SOFTWARE. from six.moves import range, zip -from loopy.diagnostic import StaticValueFindingError, LoopyError +from loopy.diagnostic import StaticValueFindingError import islpy as isl from islpy import dim_type diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index f686e58f1..f5e105c70 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -221,11 +221,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): A subclass of :class:`loopy.TargetBase`. - .. attribute:: is_called_from_host - - An instance of :class:`bool`. 
Will be set *False* for the kernel which - would be called from another top level kernels. Default value is - *True*. """ # {{{ constructor @@ -254,8 +249,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, - is_called_from_host=True, - overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): """ @@ -368,7 +361,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, - is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) @@ -1132,8 +1124,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that @@ -1456,7 +1446,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): "silenced_warnings", "options", "state", - "is_called_from_host", "target", ) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 44f782a72..44bf9c4c8 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -470,11 +470,6 @@ class OpenCLCASTBuilder(CASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) - if not codegen_state.kernel.is_called_from_host: - # auxiliary kernels need not mention opencl speicific qualifiers - # for a functions signature - return fdecl - fdecl = fdecl.subdecl from cgen.opencl import CLKernel, CLRequiredWorkGroupSize -- GitLab From ec84ad60427fa2ebf2accf03e4b9432bece54be6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 19:21:46 +0530 Subject: [PATCH 342/916] adds program_callables_info to grid_override... --- loopy/kernel/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index f5e105c70..be66cf851 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1040,6 +1040,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if self.overridden_get_grid_sizes_for_insn_ids: return self.overridden_get_grid_sizes_for_insn_ids( insn_ids, + program_callables_info, ignore_auto=ignore_auto) all_inames_by_insns = set() -- GitLab From dd995d883c7ea00950f7121533c86a0638cd2b10 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 12 Aug 2018 19:47:04 +0530 Subject: [PATCH 343/916] took the test to the earlier state. 
--- test/test_loopy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 02eeda132..43371c8a8 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -409,11 +409,14 @@ def test_ilp_write_race_detection_global(ctx_factory): knl = lp.tag_inames(knl, dict(j="ilp")) + knl = lp.preprocess_kernel(knl) + with lp.CacheMode(False): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: - lp.generate_code_v2(knl) + list(lp.generate_loop_schedules(knl.root_kernel, + knl.program_callables_info)) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) -- GitLab From 82a16b6cc6709b5a9f516ef5b1da376b92782b8d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 13 Aug 2018 11:27:00 +0530 Subject: [PATCH 344/916] fix the style of code to get started with changing ProgramCallablesInfo --- loopy/kernel/__init__.py | 3 +- loopy/kernel/function_interface.py | 4 +- loopy/library/reduction.py | 2 +- loopy/program.py | 70 +++++++----------------------- loopy/statistics.py | 6 +-- loopy/symbolic.py | 8 ++-- 6 files changed, 27 insertions(+), 66 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index be66cf851..3f637e53c 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1360,7 +1360,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ direct execution def __call__(self, *args, **kwargs): - # FIXME: scream and then convert to a program + raise LoopyError("Calling a LoopKernel is deprecated, call a Program " + "instead.") from loopy.program import make_program_from_kernel program = make_program_from_kernel(self) return program(*args, **kwargs) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e0954fb73..8c3a69111 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -676,8 +676,8 @@ def next_indexed_variable(function): or :class:`loopy.reduction.ArgExtOp` or :class:`loopy.reduction.SegmentedOp`. 
""" - from loopy.library.reduction import ArgExtOp, SegmentedOp - if isinstance(function, (ArgExtOp, SegmentedOp)): + from loopy.library.reduction import ReductionOpFunction + if isinstance(function, ReductionOpFunction): return function.copy() func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6ec8e4b21..b968192e6 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -504,7 +504,7 @@ class ReductionCallable(ScalarCallable): def reduction_scoper(target, identifier): - if isinstance(identifier, (ArgExtOp, SegmentedOp)): + if isinstance(identifier, ReductionOpFunction): return ReductionCallable(name=identifier) return None diff --git a/loopy/program.py b/loopy/program.py index 096bd1eca..279228afd 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -298,14 +298,7 @@ class Program(ImmutableRecord): return pex(*args, **kwargs) def __str__(self): - # FIXME: make this better - print(self.program_callables_info.num_times_callables_called) - return ( - (self.program_callables_info[ - self.name].subkernel).__str__() + - '\nResolved Functions: ' + - (self.program_callables_info.resolved_functions.keys()).__str__() + - '\n' + 75*'-' + '\n') + return self.root_kernel.__str__() # }}} @@ -315,14 +308,14 @@ def next_indexed_function_identifier(function): Returns an instance of :class:`str` with the next indexed-name in the sequence for the name of *function*. - *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. + *Example:* ``'sin_0'`` will return ``'sin_1'``. - :arg function: Either an instance of :class:`pymbolic.primitives.Variable` - or :class:`loopy.reduction.ArgExtOp` or - :class:`loopy.reduction.SegmentedOp`. + :arg function: Either an instance of :class:`str`, + :class:`pymbolic.primitives.Variable` , + :class:`loopy.reduction.ReductionOpFunction`. """ - from loopy.library.reduction import ArgExtOp, SegmentedOp - if isinstance(function, (ArgExtOp, SegmentedOp)): + from loopy.library.reduction import ReductionOpFunction + if isinstance(function, ReductionOpFunction): return function.copy() elif isinstance(function, str): function = Variable(function) @@ -371,12 +364,8 @@ def rename_resolved_functions_in_a_single_kernel(kernel, # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): - # FIXME: dont evalutate num_times_called, rahter compute it from the - # resolved_functions - # FIXME: make the edit callables thing a ContextManager. 
def __init__(self, resolved_functions, num_times_callables_called=None, history=None, is_being_edited=False, - num_times_hit_during_editing={}, renames_needed_after_editing={}): if num_times_callables_called is None: @@ -391,23 +380,19 @@ class ProgramCallablesInfo(ImmutableRecord): num_times_callables_called=num_times_callables_called, history=history, is_being_edited=is_being_edited, - num_times_hit_during_editing=num_times_hit_during_editing, renames_needed_after_editing=renames_needed_after_editing) hash_fields = ( "resolved_functions", "num_times_callables_called", "is_being_edited", - "num_times_hit_during_editing", "renames_needed_after_editing", "history") update_persistent_hash = LoopKernel.update_persistent_hash def with_edit_callables_mode(self): - return self.copy(is_being_edited=True, - num_times_hit_during_editing=dict((func_id, 0) for func_id in - self.resolved_functions)) + return self.copy(is_being_edited=True) def with_callable(self, function, in_kernel_callable, resolved_for_the_first_time=False): @@ -426,6 +411,10 @@ class ProgramCallablesInfo(ImmutableRecord): # FIXME: add a note about using enter and exit. ~KK # FIXME: think about a better idea of "with_added_callable" this would # be more convenient for developer-faced usage. ~KK + # FIXME: Is this is a bad code? Yes. + # Is there a better alternative to it. Definitely maybe. + # But I don't want to spend the next 182 years of my life optimizing + # some scheme, without even implmenting it to some problem! if not self.is_being_edited: if function.name in self.resolved_functions and ( @@ -436,29 +425,22 @@ class ProgramCallablesInfo(ImmutableRecord): print('New: ', in_kernel_callable) raise LoopyError("Use 'enter_edit_callables_mode' first.") - from loopy.library.reduction import ArgExtOp, SegmentedOp + from loopy.library.reduction import ReductionOpFunction # {{{ sanity checks if isinstance(function, str): function = Variable(function) - assert isinstance(function, (Variable, ArgExtOp, SegmentedOp)) + assert isinstance(function, (Variable, ReductionOpFunction)) # }}} renames_needed_after_editing = self.renames_needed_after_editing.copy() - num_times_hit_during_editing = self.num_times_hit_during_editing.copy() num_times_callables_called = self.num_times_callables_called.copy() history = self.history.copy() - if not resolved_for_the_first_time: - if isinstance(function, (ArgExtOp, SegmentedOp)): - num_times_hit_during_editing[function] += 1 - else: - num_times_hit_during_editing[function.name] += 1 - - if isinstance(function, (ArgExtOp, SegmentedOp)): + if isinstance(function, ReductionOpFunction): unique_function_identifier = function.copy() if not resolved_for_the_first_time: num_times_callables_called[function] -= 1 @@ -473,8 +455,6 @@ class ProgramCallablesInfo(ImmutableRecord): self.copy( resolved_functions=updated_resolved_functions, num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing=( - num_times_hit_during_editing), renames_needed_after_editing=( renames_needed_after_editing)), unique_function_identifier) @@ -494,17 +474,12 @@ class ProgramCallablesInfo(ImmutableRecord): return ( self.copy( history=history, - num_times_hit_during_editing=( - num_times_hit_during_editing), num_times_callables_called=( num_times_callables_called), renames_needed_after_editing=( renames_needed_after_editing)), func_id) else: - # FIXME: maybe deal with the history over here? - # FIXME: once the code logic is running beautify this part. 
- # many "ifs" can be avoided unique_function_identifier = function.name if (resolved_for_the_first_time or self.num_times_callables_called[function.name] > 1): @@ -534,7 +509,6 @@ class ProgramCallablesInfo(ImmutableRecord): history=history, resolved_functions=updated_resolved_functions, num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing=num_times_hit_during_editing, renames_needed_after_editing=renames_needed_after_editing), Variable(unique_function_identifier)) @@ -576,7 +550,6 @@ class ProgramCallablesInfo(ImmutableRecord): is_being_edited=False, resolved_functions=resolved_functions, num_times_callables_called=num_times_callables_called, - num_times_hit_during_editing={}, renames_needed_after_editing={}) def with_deleted_callable(self, func_id, instances=1): @@ -668,17 +641,4 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): return wraps(transform_for_single_kernel)(_collective_transform) -# {{{ ingoring this for now - -# if False and isinstance(function, (ArgExtOp, SegmentedOp)): -# FIXME: ignoring this casse for now -# FIXME: If a kernel has two flavors of ArgExtOp then they are -# overwritten and hence not supported.(for now). -# updated_resolved_functions = self.scoped_functions.copy() -# updated_resolved_functions[function] = in_kernel_callable -# return self.copy(updated_resolved_functions), function.copy() - -# }}} - - # vim: foldmethod=marker diff --git a/loopy/statistics.py b/loopy/statistics.py index 08b7f89e9..95e9f62a2 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -64,9 +64,9 @@ __doc__ = """ # Qns: # - The variable name, what if multiple kernels use the same name? # - We should also add the cumulative effect on the arguments of callee kernels -# into the caller kernel. -# FIXME: add an error that there is only one callable kernel. disable for -# multiple callable kernels. +# into the caller kernel +# - Make changes to MemAccessInfo to include the effect of several kernels. +# - Renovate `count`. # {{{ GuardedPwQPolynomial diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 7a268d06f..92b209ac9 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -677,16 +677,16 @@ class ResolvedFunction(p.Expression): def __init__(self, function): if isinstance(function, str): function = p.Variable(function) - from loopy.library.reduction import ArgExtOp, SegmentedOp - assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp)) + from loopy.library.reduction import ReductionOpFunction + assert isinstance(function, (p.Variable, ReductionOpFunction)) self.function = function @property def name(self): - from loopy.library.reduction import ArgExtOp, SegmentedOp + from loopy.library.reduction import ReductionOpFunction if isinstance(self.function, p.Variable): return self.function.name - elif isinstance(self.function, (ArgExtOp, SegmentedOp)): + elif isinstance(self.function, ReductionOpFunction): return self.function else: raise LoopyError("Unexpected function type %s in ResolvedFunction." % -- GitLab From 88d746d0d041435d33aebd2a301855647c054ebe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 13 Aug 2018 20:38:16 +0530 Subject: [PATCH 345/916] started with beautifying code. 
--- loopy/program.py | 108 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 102 insertions(+), 6 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 279228afd..1b9d03d4d 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -165,6 +165,35 @@ def initialize_program_callables_info_from_kernel( # {{{ program definition class Program(ImmutableRecord): + """ + Records the information about all the callables in a :mod:`loopy` program. + + .. attribute:: name + + An instance of :class:`str`, also the name of the top-most level + :class:`loopy.LoopKernel`. + + .. attribute:: program_callables_info + + An instance of :class:`loopy.program.ProgramCallablesInfo`. + + .. attribute:: target + + An instance of :class:`loopy.target.TargetBase`. + + .. attribute:: func_id_to_in_knl_callables_mappers + + A list of functions of the signature ``(target: TargetBase, + function_indentifier: str)`` that would return an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + + .. note:: + + - To create an instance of :class:`loopy.Program`, it is recommeneded to + go through :method:`loopy.make_kernel`. + - This data structure and its attributes should be considered + immutable, any modifications should be done through :method:`copy`. + """ def __init__(self, name, program_callables_info, @@ -172,8 +201,6 @@ class Program(ImmutableRecord): func_id_to_in_knl_callable_mappers): assert isinstance(program_callables_info, ProgramCallablesInfo) - # FIXME: check if all sanity checks have been covered? - # FIXME: The comments over here may need some attention. assert name in program_callables_info super(Program, self).__init__( @@ -194,6 +221,7 @@ class Program(ImmutableRecord): def copy(self, **kwargs): if 'target' in kwargs: + # target attribute of all the callable kernels should be updated. target = kwargs['target'] new_self = super(Program, self).copy(**kwargs) new_resolved_functions = {} @@ -266,13 +294,43 @@ class Program(ImmutableRecord): @property def root_kernel(self): + """ + Returns an instance of :class:`loopy.LoopKernel` denoting the topmost + level kernel in codegeneration. + + .. note:: + + Syntactic sugar. + """ return self.program_callables_info[self.name].subkernel @property def arg_dict(self): + """ + Returns ``arg_dict`` of the ``root_kernel``. + + .. note:: + + Syntactic sugar. + """ return self.root_kernel.arg_dict + @property + def args(self): + """ + Returns ``args`` of the ``root_kernel``. + + .. note:: + + Syntactic sugar. + """ + return self.root_kernel.args[:] + def with_root_kernel(self, root_kernel): + """ + Returns a copy of *self* with the topmost level kernel as + *root_kernel*. + """ new_in_knl_callable = self.program_callables_info[ self.name].copy(subkernel=root_kernel) new_resolved_functions = ( @@ -283,10 +341,6 @@ class Program(ImmutableRecord): program_callables_info=self.program_callables_info.copy( resolved_functions=new_resolved_functions)) - @property - def args(self): - return self.root_kernel.args[:] - def __call__(self, *args, **kwargs): key = self.target.get_kernel_executor_cache_key(*args, **kwargs) try: @@ -336,6 +390,10 @@ def next_indexed_function_identifier(function): class ResolvedFunctionRenamer(RuleAwareIdentityMapper): + """ + Mapper to rename the resolved functions in an expression according to + *renaming_dict*. 
+ """ def __init__(self, rule_mapping_context, renaming_dict): super(ResolvedFunctionRenamer, self).__init__( rule_mapping_context) @@ -351,6 +409,10 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): def rename_resolved_functions_in_a_single_kernel(kernel, renaming_dict): + """ + Returns a copy of *kernel* with the instances of :class:`ResolvedFunction` + renames according to *renaming_dict*. + """ from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) @@ -364,6 +426,40 @@ def rename_resolved_functions_in_a_single_kernel(kernel, # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): + """ + Records the information of all the callables called in a :class:`loopy.Program`. + + .. attribute:: resolved_functions + + An instance of :class:`dict` that contains a mapping from function + identifier to instances of + :class:`loopy.kernel.function_interface.InKernelCallable` + + .. attribute:: num_times_callables_called + + An instace of :class:`dict` that contains a mapping from function + identifier to :class:`int`, that denotes the number of times the + callable is being called in the entire :class:`loopy.Program`. + + .. attribute:: history + + An instance of :class:`dict` that contains a mapping from function + identifier to and instance of :class:`list`that would contain all the + names taken by a function before the current name.(For example: one + possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``) + + .. attribute:: is_being_edited + + An instance of :class:`bool` which is intended to aid the working of + :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and + :meth:`with_exit_edit_callables_mode`. + + .. attribute:: renames_needed_after_editing + + An instance of :class:`dict` which is intended to aid the working of + :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and + :meth:`with_exit_edit_callables_mode`. + """ def __init__(self, resolved_functions, num_times_callables_called=None, history=None, is_being_edited=False, renames_needed_after_editing={}): -- GitLab From e3277fa2d162f773072109a951f05e24816a88e0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 13 Aug 2018 21:00:10 +0530 Subject: [PATCH 346/916] changes in program_callables_info design. --- loopy/kernel/__init__.py | 7 +++++++ loopy/program.py | 42 ++++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 3f637e53c..3b189da59 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -221,6 +221,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): A subclass of :class:`loopy.TargetBase`. + .. attribute:: is_called_from_host + An instance of :class:`bool`. Will be set *False* for the kernel which + would be called from another top level kernels. Default value is + *True*. 
+ """ # {{{ constructor @@ -248,6 +253,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, + is_called_from_host=True, overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): @@ -361,6 +367,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, + is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) diff --git a/loopy/program.py b/loopy/program.py index 1b9d03d4d..0dc327aa2 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -460,9 +460,9 @@ class ProgramCallablesInfo(ImmutableRecord): :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and :meth:`with_exit_edit_callables_mode`. """ - def __init__(self, resolved_functions, num_times_callables_called=None, - history=None, is_being_edited=False, - renames_needed_after_editing={}): + def __init__(self, resolved_functions, + num_times_callables_called=None, history=None, + is_being_edited=False, renames_needed_after_editing={}): if num_times_callables_called is None: num_times_callables_called = dict((func_id, 1) for func_id in @@ -487,11 +487,22 @@ class ProgramCallablesInfo(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash + def add_callable(self, function, in_kernel_callable): + + history[unique_function_identifier] = set( + [unique_function_identifier]) + pass + + def with_updated_num_times_being_called(self): + root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable + in self.resolved_functions.values() if + isinstance(in_knl_callable, CallableKernel) and + in_knl_callable.is_called_from_host] + def with_edit_callables_mode(self): return self.copy(is_being_edited=True) - def with_callable(self, function, in_kernel_callable, - resolved_for_the_first_time=False): + def with_callable(self, function, in_kernel_callable): """ :arg function: An instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.library.reduction.ReductionOpFunction`. 
@@ -538,8 +549,7 @@ class ProgramCallablesInfo(ImmutableRecord): if isinstance(function, ReductionOpFunction): unique_function_identifier = function.copy() - if not resolved_for_the_first_time: - num_times_callables_called[function] -= 1 + num_times_callables_called[function] -= 1 num_times_callables_called[unique_function_identifier] = 1 @@ -561,12 +571,11 @@ class ProgramCallablesInfo(ImmutableRecord): for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: num_times_callables_called[func_id] += 1 - if not resolved_for_the_first_time: - num_times_callables_called[function.name] -= 1 - if num_times_callables_called[function.name] == 0: - renames_needed_after_editing[func_id] = function.name + num_times_callables_called[function.name] -= 1 + if num_times_callables_called[function.name] == 0: + renames_needed_after_editing[func_id] = function.name - history[func_id] = history[func_id] | set([function.name]) + history[func_id] = history[func_id] | set([function.name]) return ( self.copy( history=history, @@ -577,16 +586,13 @@ class ProgramCallablesInfo(ImmutableRecord): func_id) else: unique_function_identifier = function.name - if (resolved_for_the_first_time or - self.num_times_callables_called[function.name] > 1): + if self.num_times_callables_called[function.name] > 1: while unique_function_identifier in self.resolved_functions: unique_function_identifier = ( next_indexed_function_identifier( unique_function_identifier)) - if not resolved_for_the_first_time: - num_times_callables_called[function.name] -= 1 - + num_times_callables_called[function.name] -= 1 num_times_callables_called[unique_function_identifier] = 1 updated_resolved_functions = self.resolved_functions.copy() @@ -597,8 +603,6 @@ class ProgramCallablesInfo(ImmutableRecord): history[unique_function_identifier] = ( history[function.name] | set([unique_function_identifier])) else: - history[unique_function_identifier] = set( - [unique_function_identifier]) return ( self.copy( -- GitLab From a4ebe862bb8e434fc67d85c4b9201bad12577975 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 09:17:03 +0530 Subject: [PATCH 347/916] new design to interface with program callables info. 
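The new interface, roughly as used by ``infer_arg_descr`` in the diff below
(``count_callables_in_program_callables_info`` is provided by
``loopy.program``; the callable count taken before editing is handed back when
leaving the editing mode):

    old_callables_count = count_callables_in_program_callables_info(
            program.program_callables_info)
    callables_info = program.program_callables_info.with_edit_callables_mode()
    # ... update callables while traversing the kernels ...
    callables_info, _ = callables_info.with_callable(
            program.name, new_root_kernel_callable)
    callables_info = callables_info.with_exit_edit_callables_mode(
            old_callables_count)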
--- loopy/preprocess.py | 6 +- loopy/program.py | 448 ++++++++++++++++++++++++------------ loopy/transform/callable.py | 24 +- loopy/transform/fusion.py | 117 +++++----- loopy/type_inference.py | 10 +- 5 files changed, 384 insertions(+), 221 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index bf23c4a44..56db777b5 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2269,6 +2269,9 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): def infer_arg_descr(program): root_kernel_callable = program.program_callables_info[program.name] + from loopy.program import count_callables_in_program_callables_info + old_callables_count = count_callables_in_program_callables_info( + program.program_callables_info) program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel = program.root_kernel @@ -2280,7 +2283,8 @@ def infer_arg_descr(program): program_callables_info, _ = program_callables_info.with_callable(program.name, new_root_kernel_callable) - program_callables_info = program_callables_info.with_exit_edit_callables_mode() + program_callables_info = program_callables_info.with_exit_edit_callables_mode( + old_callables_count) return program.copy(program_callables_info=program_callables_info) diff --git a/loopy/program.py b/loopy/program.py index 0dc327aa2..32869d267 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -29,12 +29,20 @@ from pytools import ImmutableRecord, memoize_method from pymbolic.primitives import Variable from functools import wraps -from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction +from loopy.symbolic import ( + RuleAwareIdentityMapper, ResolvedFunction, CombineMapper) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) +from loopy.kernel.instruction import ( + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) from loopy.diagnostic import LoopyError +from loopy.library.reduction import ReductionOpFunction from loopy.kernel import LoopKernel +from collections import Counter +from pymbolic.primitives import Call, CallWithKwargs + +# FIXME: autofunction/autoclass?? ~KK class ResolvedFunctionMarker(RuleAwareIdentityMapper): @@ -60,7 +68,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) self.kernel = kernel self.program_callables_info = program_callables_info - # FIXME: function_resolvesrs looks like a very bad name change it self.function_id_to_in_knl_callable_mappers = ( function_id_to_in_knl_callable_mappers) @@ -71,7 +78,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): :arg:`identifier` is known to any kernel function scoper, otherwise returns *None*. 
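        Each entry of :attr:`function_id_to_in_knl_callable_mappers` is
        expected to be callable as ``(target, identifier)`` and to return an
        :class:`loopy.kernel.function_interface.InKernelCallable` or *None*; a
        hypothetical resolver could look like (``MyFuncCallable`` is a
        placeholder, not an existing class)::

            def my_resolver(target, identifier):
                if identifier == "my_func":
                    return MyFuncCallable(name=identifier)
                return None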
""" - # FIXME change docs for func_id_to_in_knl_callable_mapper in ( self.function_id_to_in_knl_callable_mappers): # fixme: do we really need to given target for the function @@ -83,7 +89,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): return None def map_call(self, expr, expn_state): - from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import parse_tagged_name name, tag = parse_tagged_name(expr.function) @@ -109,8 +114,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): # resolved in-kernel callable self.program_callables_info, new_func_id = ( - self.program_callables_info.with_callable(expr.function, - in_knl_callable, True)) + self.program_callables_info.with_add_callable(expr.function, + in_knl_callable)) return type(expr)( ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) @@ -135,10 +140,15 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) -def initialize_program_callables_info_from_kernel( - kernel, func_id_to_kernel_callable_mappers): +def initialize_program_callables_info_from_kernel(kernel): + """ + Returns an instance of :class:`loopy.ProgramCallablesInfo`, by resolving + the functions based on :mod:`loopy`'s default function resolvers. + """ + # collect the default function resolvers + func_id_to_kernel_callable_mappers = ( + default_func_id_to_kernel_callable_mappers(kernel.target)) program_callables_info = ProgramCallablesInfo({}) - program_callables_info = program_callables_info.with_edit_callables_mode() from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( @@ -148,16 +158,17 @@ def initialize_program_callables_info_from_kernel( rule_mapping_context, kernel, program_callables_info, func_id_to_kernel_callable_mappers) - # scoping fucntions and collecting the scoped functions + # mark the functions as "Resolved" in the expression nodes. kernel_with_functions_resolved = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) + # collect the update program_callables_info program_callables_info = resolved_function_marker.program_callables_info callable_kernel = CallableKernel(kernel_with_functions_resolved) - program_callables_info, _ = program_callables_info.with_callable( - Variable(kernel.name), callable_kernel, True) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) + + # add the callable kernel to the program_callables_info + program_callables_info, _ = program_callables_info.with_add_callable( + Variable(kernel.name), callable_kernel) return program_callables_info @@ -357,33 +368,31 @@ class Program(ImmutableRecord): # }}} -def next_indexed_function_identifier(function): +def next_indexed_function_identifier(function_id): """ Returns an instance of :class:`str` with the next indexed-name in the sequence for the name of *function*. *Example:* ``'sin_0'`` will return ``'sin_1'``. - :arg function: Either an instance of :class:`str`, - :class:`pymbolic.primitives.Variable` , - :class:`loopy.reduction.ReductionOpFunction`. + :arg function_id: Either an instance of :class:`str`. 
""" - from loopy.library.reduction import ReductionOpFunction - if isinstance(function, ReductionOpFunction): - return function.copy() - elif isinstance(function, str): - function = Variable(function) - assert isinstance(function, Variable) + # {{{ sanity checks + + assert isinstance(function_id, str) + + # }}} + func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") - match = func_name.match(function.name) + match = func_name.match(function_id) if match is None: - if function.name[-1] == '_': - return "{old_name}0".format(old_name=function.name) + if function_id[-1] == '_': + return "{old_name}0".format(old_name=function_id) else: - return "{old_name}_0".format(old_name=function.name) + return "{old_name}_0".format(old_name=function_id) return "{alpha}_{num}".format(alpha=match.group('alpha'), num=int(match.group('num'))+1) @@ -423,6 +432,115 @@ def rename_resolved_functions_in_a_single_kernel(kernel, resolved_function_renamer.map_kernel(kernel))) +# {{{ counting helpers + +class CallablesCountingMapper(CombineMapper): + """ + Returns an instance of :class:`collections.Counter` with the count of + callables registered in *program_callables_info*. + + .. attribute:: program_callables_info + + An instance of :class:`loopy.program.ProgramCallablesInfo`. + """ + def __init__(self, program_callables_info): + self.program_callables_info = program_callables_info + + def combine(self, values): + return sum(values, Counter()) + + def map_call(self, expr): + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} + + if isinstance(expr.function, (ResolvedFunction)): + in_knl_callable = self.program_callables_info[expr.function.name] + if isinstance(in_knl_callable, ScalarCallable): + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + elif isinstance(in_knl_callable, CallableKernel): + + # callable kernels have more callables in them. + callables_count_in_subkernel = ( + count_callables_in_kernel( + in_knl_callable.subkernel, + self.program_callables_info)) + + return (Counter([expr.function.name]) + + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + ( + callables_count_in_subkernel) + else: + raise NotImplementedError("Unknown callable type %s." % ( + type)) + else: + return ( + self.combine((self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values())))) + + map_call_with_kwargs = map_call + + def map_constant(self, expr): + return Counter() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +# FIXME: @memoize_method +def count_callables_in_kernel(kernel, program_callables_info): + """ + Returns an instance of :class:`collections.Counter` representing the number + of callables in the *kernel* that are registered in + *program_callables_info*. + """ + assert isinstance(kernel, LoopKernel) + callables_count = Counter() + callables_counting_mapper = CallablesCountingMapper( + program_callables_info) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_count += ( + callables_counting_mapper(insn.expression)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + pass + else: + raise NotImplementedError("Unknown instruction type %s." 
% ( + type(insn))) + + return callables_count + + +# FIXME: @memoize_method +def count_callables_in_program_callables_info(program_callables_info): + """ + Returns an instance of :class:`collection.Counter` representing the number + of times the callables is called in program_callables_info. + """ + root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable + in program_callables_info.values() if + isinstance(in_knl_callable, CallableKernel) and + in_knl_callable.subkernel.is_called_from_host] + + from collections import Counter + callables_count = Counter([root_kernel_name]) + callables_count += ( + count_callables_in_kernel(program_callables_info[ + root_kernel_name].subkernel, program_callables_info)) + return callables_count + +# }}} + + # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): @@ -435,12 +553,6 @@ class ProgramCallablesInfo(ImmutableRecord): identifier to instances of :class:`loopy.kernel.function_interface.InKernelCallable` - .. attribute:: num_times_callables_called - - An instace of :class:`dict` that contains a mapping from function - identifier to :class:`int`, that denotes the number of times the - callable is being called in the entire :class:`loopy.Program`. - .. attribute:: history An instance of :class:`dict` that contains a mapping from function @@ -453,54 +565,92 @@ class ProgramCallablesInfo(ImmutableRecord): An instance of :class:`bool` which is intended to aid the working of :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and :meth:`with_exit_edit_callables_mode`. - - .. attribute:: renames_needed_after_editing - - An instance of :class:`dict` which is intended to aid the working of - :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and - :meth:`with_exit_edit_callables_mode`. """ def __init__(self, resolved_functions, - num_times_callables_called=None, history=None, - is_being_edited=False, renames_needed_after_editing={}): + history=None, is_being_edited=False): - if num_times_callables_called is None: - num_times_callables_called = dict((func_id, 1) for func_id in - resolved_functions) if history is None: history = dict((func_id, set([func_id])) for func_id in resolved_functions) super(ProgramCallablesInfo, self).__init__( resolved_functions=resolved_functions, - num_times_callables_called=num_times_callables_called, history=history, - is_being_edited=is_being_edited, - renames_needed_after_editing=renames_needed_after_editing) + is_being_edited=is_being_edited) hash_fields = ( "resolved_functions", - "num_times_callables_called", "is_being_edited", - "renames_needed_after_editing", "history") update_persistent_hash = LoopKernel.update_persistent_hash - def add_callable(self, function, in_kernel_callable): + def with_add_callable(self, function, in_kernel_callable): + """ + Returns a copy of *self* with the *function* associated with the + *in_kernel_callable*. + """ + # note: this does not require the edit mode to be true. + # the reason for the edit mode is that we need to take care of the + # renaming that might be needed to be done + # PS: delete this note? + history = self.history.copy() + + if in_kernel_callable in self.resolved_functions.values(): + # the callable already exists, implies return the function + # identifier corresposing to that callable. 
+ for func_id, in_knl_callable in self.resolved_functions.items(): + if in_knl_callable == in_kernel_callable: + history[func_id] = history[func_id] | set([function.name]) + return ( + self.copy( + history=history), + func_id) + else: + + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} + + unique_function_identifier = function.name + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) history[unique_function_identifier] = set( [unique_function_identifier]) - pass - def with_updated_num_times_being_called(self): - root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable - in self.resolved_functions.values() if - isinstance(in_knl_callable, CallableKernel) and - in_knl_callable.is_called_from_host] + return ( + self.copy( + history=history, + resolved_functions=updated_resolved_functions), + Variable(unique_function_identifier)) def with_edit_callables_mode(self): - return self.copy(is_being_edited=True) + """ + Initiates *self* for a walk traversal through all the callables. + """ + # PS: I don't see a need for this method right now. + # This is just for validation purposes, maybe needs to disapper if you + # find a better solution? + return self.copy( + is_being_edited=True) def with_callable(self, function, in_kernel_callable): """ @@ -512,27 +662,24 @@ class ProgramCallablesInfo(ImmutableRecord): .. note:: - Assumes that each callable is touched atmost once, the internal - working of this function fails if that is violated. + - Use :meth:`with_add_callable` if a callable is being resolved for the + first time. """ - # FIXME: add a note about using enter and exit. ~KK - # FIXME: think about a better idea of "with_added_callable" this would - # be more convenient for developer-faced usage. ~KK - # FIXME: Is this is a bad code? Yes. - # Is there a better alternative to it. Definitely maybe. - # But I don't want to spend the next 182 years of my life optimizing - # some scheme, without even implmenting it to some problem! + + # {{{ non-edit mode if not self.is_being_edited: if function.name in self.resolved_functions and ( self.resolved_functions[function.name] == in_kernel_callable): + # if not being edited, check that the given function is + # equal to the the old version of the callable. 
return self, function else: print('Old: ', self.resolved_functions[function.name]) print('New: ', in_kernel_callable) - raise LoopyError("Use 'enter_edit_callables_mode' first.") + raise LoopyError("Use 'with_enter_edit_callables_mode' first.") - from loopy.library.reduction import ReductionOpFunction + # }}} # {{{ sanity checks @@ -543,87 +690,90 @@ class ProgramCallablesInfo(ImmutableRecord): # }}} - renames_needed_after_editing = self.renames_needed_after_editing.copy() - num_times_callables_called = self.num_times_callables_called.copy() history = self.history.copy() - if isinstance(function, ReductionOpFunction): - unique_function_identifier = function.copy() - num_times_callables_called[function] -= 1 - - num_times_callables_called[unique_function_identifier] = 1 - - updated_resolved_functions = self.resolved_functions.copy() - updated_resolved_functions[unique_function_identifier] = ( - in_kernel_callable) - - return ( - self.copy( - resolved_functions=updated_resolved_functions, - num_times_callables_called=num_times_callables_called, - renames_needed_after_editing=( - renames_needed_after_editing)), - unique_function_identifier) - if in_kernel_callable in self.resolved_functions.values(): - # the callable already exists, implies return the function - # identifier corresposing to that callable. + + # the callable already exists, hence return the function + # identifier corresponding to that callable. for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: - num_times_callables_called[func_id] += 1 - num_times_callables_called[function.name] -= 1 - if num_times_callables_called[function.name] == 0: - renames_needed_after_editing[func_id] = function.name - history[func_id] = history[func_id] | set([function.name]) return ( self.copy( - history=history, - num_times_callables_called=( - num_times_callables_called), - renames_needed_after_editing=( - renames_needed_after_editing)), + history=history), func_id) else: - unique_function_identifier = function.name - if self.num_times_callables_called[function.name] > 1: - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + unique_function_identifier = function.copy() + updated_resolved_functions = self.resolved_functions.copy() + updated_resolved_functions[unique_function_identifier] = ( + in_kernel_callable) - num_times_callables_called[function.name] -= 1 - num_times_callables_called[unique_function_identifier] = 1 + return ( + self.copy( + resolved_functions=updated_resolved_functions), + unique_function_identifier) + + # }}} + unique_function_identifier = function.name + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) - if not resolved_for_the_first_time: - history[unique_function_identifier] = ( - history[function.name] | set([unique_function_identifier])) - else: + history[unique_function_identifier] = ( + history[function.name] | set([unique_function_identifier])) return ( self.copy( history=history, - resolved_functions=updated_resolved_functions, - num_times_callables_called=num_times_callables_called, - 
renames_needed_after_editing=renames_needed_after_editing), + resolved_functions=updated_resolved_functions), Variable(unique_function_identifier)) - def with_exit_edit_callables_mode(self): + def with_exit_edit_callables_mode(self, old_callables_count): + """ + Returns a copy of *self* with renaming of the callables done whenver + possible. + + *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``, + then all the renaming is done such that one of flavors of the function + is renamed back to ``sin``. + """ + + new_callables_count = count_callables_in_program_callables_info( + self) + history = self.history.copy() + renames_needed = {} + assert self.is_being_edited - num_times_callables_called = {} + # NOTE:(to self by KK) + # all we need to do is change the name of the variables that were seen + # in old_callables_count but are no longer available. + # Using these 2 figure out the renames needed. + for old_func_id in old_callables_count-new_callables_count: + # this implies that all the function instances having the name + # "func_id" have been renamed to something else. + for new_func_id in ( + new_callables_count.keys()-renames_needed.keys()): + if old_func_id in history[new_func_id]: + renames_needed[new_func_id] = old_func_id + resolved_functions = {} - history = self.history.copy() for func_id, in_knl_callable in self.resolved_functions.items(): if isinstance(in_knl_callable, CallableKernel): + # If callable kernel, perform renames. old_subkernel = in_knl_callable.subkernel new_subkernel = rename_resolved_functions_in_a_single_kernel( - old_subkernel, self.renames_needed_after_editing) + old_subkernel, renames_needed) in_knl_callable = ( in_knl_callable.copy(subkernel=new_subkernel)) elif isinstance(in_knl_callable, ScalarCallable): @@ -632,44 +782,22 @@ class ProgramCallablesInfo(ImmutableRecord): raise NotImplementedError("Unknown callable type %s." % type(in_knl_callable).__name__) - if func_id in self.renames_needed_after_editing: + if func_id in renames_needed: + # If function name itself in renames change the key of the + # dict. 
history.pop(func_id) - new_func_id = self.renames_needed_after_editing[func_id] + new_func_id = renames_needed[func_id] resolved_functions[new_func_id] = ( in_knl_callable) - num_times_callables_called[new_func_id] = ( - self.num_times_callables_called[func_id]) - else: resolved_functions[func_id] = in_knl_callable - num_times_callables_called[func_id] = ( - self.num_times_callables_called[func_id]) return self.copy( is_being_edited=False, - resolved_functions=resolved_functions, - num_times_callables_called=num_times_callables_called, - renames_needed_after_editing={}) - - def with_deleted_callable(self, func_id, instances=1): - num_times_callables_called = self.num_times_callables_called.copy() - history = self.history.copy() - resolved_functions = self.resolved_functions.copy() - - assert instances <= num_times_callables_called[func_id] + resolved_functions=resolved_functions) - num_times_callables_called[func_id] -= instances - - if num_times_callables_called[func_id] == 0: - num_times_callables_called.pop(func_id) - history.pop(func_id) - resolved_functions.pop(func_id) - - return self.copy( - resolved_functions=resolved_functions, - num_times_callables_called=num_times_callables_called, - history=history) + # {{{ behave like a dict(syntactic sugar) def __getitem__(self, item): return self.resolved_functions[item] @@ -683,11 +811,16 @@ class ProgramCallablesInfo(ImmutableRecord): def values(self): return self.resolved_functions.values() + # }}} # }}} def default_func_id_to_kernel_callable_mappers(target): + """ + Returns a list of functions that are provided through *target* by deafault. + """ + # FIXME: name scopers is confusing!(change it to something else.) from loopy.library.function import loopy_specific_callable_scopers return ( @@ -695,11 +828,18 @@ def default_func_id_to_kernel_callable_mappers(target): target.get_device_ast_builder().function_scopers())) +# {{{ helper functions + def make_program_from_kernel(kernel): + """ + Returns an instance of :class:`loopy.Program` with the *kernel* as the root + kernel. + """ - program_callables_info = initialize_program_callables_info_from_kernel(kernel, - default_func_id_to_kernel_callable_mappers(kernel.target)) + # get the program callables info + program_callables_info = initialize_program_callables_info_from_kernel(kernel) + # get the program from program callables info program = Program( name=kernel.name, program_callables_info=program_callables_info, @@ -711,6 +851,12 @@ def make_program_from_kernel(kernel): def iterate_over_kernels_if_given_program(transform_for_single_kernel): + """ + Function wrapper for transformations of the type ``transform(kernel: + LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the + ``transform`` being implemented on all of the callable kernels in a + :class:`loopy.Program`. 
+ """ def _collective_transform(program_or_kernel, *args, **kwargs): if isinstance(program_or_kernel, Program): program = program_or_kernel @@ -740,5 +886,7 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): return wraps(transform_for_single_kernel)(_collective_transform) +# }}} + # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 9d9935ab0..90f530953 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -35,10 +35,18 @@ __doc__ = """ # {{{ register function lookup -def resolved_callables_from_function_lookup(program, - func_id_to_kernel_callable_mapper): +def _resolved_callables_from_function_lookup(program, + func_id_to_in_kernel_callable_mapper): + """ + Returns a copy of *program* with the expression nodes marked "Resolved" + if any match is found through the given + *func_id_to_in_kernel_callable_mapper*. + + :arg func_id_to_in_kernel_callable_mapper: A function with signature + ``(target, identifier)`` that returns either an instance of + :class:`loopy.InKernelCallable` or *None*. + """ program_callables_info = program.program_callables_info - program_callables_info = program_callables_info.with_edit_callables_mode() callable_knls = dict( (func_id, in_knl_callable) for func_id, in_knl_callable in @@ -55,9 +63,8 @@ def resolved_callables_from_function_lookup(program, resolved_function_marker = ResolvedFunctionMarker( rule_mapping_context, kernel, program_callables_info, - [func_id_to_kernel_callable_mapper]) + [func_id_to_in_kernel_callable_mapper]) - # scoping fucntions and collecting the scoped functions new_subkernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) program_callables_info = resolved_function_marker.program_callables_info @@ -65,9 +72,6 @@ def resolved_callables_from_function_lookup(program, edited_callable_knls[func_id] = in_knl_callable.copy( subkernel=new_subkernel) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - new_resolved_functions = {} for func_id, in_knl_callable in program_callables_info.items(): @@ -85,7 +89,7 @@ def resolved_callables_from_function_lookup(program, def register_function_id_to_in_knl_callable_mapper(program, func_id_to_in_knl_callable_mapper): """ - Returns a copy of *kernel* with the *function_lookup* registered. + Returns a copy of *program* with the *function_lookup* registered. :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target, identifier)`` returning a @@ -105,7 +109,7 @@ def register_function_id_to_in_knl_callable_mapper(program, new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( [func_id_to_in_knl_callable_mapper]) - program = resolved_callables_from_function_lookup(program, + program = _resolved_callables_from_function_lookup(program, func_id_to_in_knl_callable_mapper) new_program = program.copy( diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index d43ce025b..f2e62368e 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -292,50 +292,6 @@ def _fuse_two_kernels(knla, knlb): def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): - """Return a kernel that performs all the operations in all entries - of *kernels*. - - :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. - :arg suffixes: If given, must be a list of strings of a length matching - that of *kernels*. This will be used to disambiguate the names - of temporaries, as described below. 
- :arg data_flow: A list of data dependencies - ``[(var_name, from_kernel, to_kernel), ...]``. - Based on this, the fuser will create dependencies between all - writers of *var_name* in ``kernels[from_kernel]`` to - readers of *var_name* in ``kernels[to_kernel]``. - *from_kernel* and *to_kernel* are indices into *kernels*. - - The components of the kernels are fused as follows: - - * The resulting kernel will have a domain involving all the inames - and parameters occurring across *kernels*. - Inames with matching names across *kernels* are fused in such a way - that they remain a single iname in the fused kernel. - Use :func:`loopy.rename_iname` if this is not desired. - - * The projection of the domains of each pair of kernels onto their - common subset of inames must match in order for fusion to - succeed. - - * Assumptions are fused by taking their conjunction. - - * If kernel arguments with matching names are encountered across - *kernels*, their declarations must match in order for fusion to - succeed. - - * Temporaries are automatically renamed to remain uniquely associated - with each instruction stream. - - * The resulting kernel will contain all instructions from each entry - of *kernels*. Clashing instruction IDs will be renamed to ensure - uniqueness. - - .. versionchanged:: 2016.2 - - *data_flow* was added in version 2016.2 - """ - assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) @@ -419,8 +375,54 @@ def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): def fuse_kernels(programs, suffixes=None, data_flow=None): + """Return a kernel that performs all the operations in all entries + of *kernels*. + + :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. + :arg suffixes: If given, must be a list of strings of a length matching + that of *kernels*. This will be used to disambiguate the names + of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. + *from_kernel* and *to_kernel* are indices into *kernels*. + + The components of the kernels are fused as follows: + + * The resulting kernel will have a domain involving all the inames + and parameters occurring across *kernels*. + Inames with matching names across *kernels* are fused in such a way + that they remain a single iname in the fused kernel. + Use :func:`loopy.rename_iname` if this is not desired. + + * The projection of the domains of each pair of kernels onto their + common subset of inames must match in order for fusion to + succeed. + + * Assumptions are fused by taking their conjunction. + + * If kernel arguments with matching names are encountered across + *kernels*, their declarations must match in order for fusion to + succeed. + + * Temporaries are automatically renamed to remain uniquely associated + with each instruction stream. + + * The resulting kernel will contain all instructions from each entry + of *kernels*. Clashing instruction IDs will be renamed to ensure + uniqueness. + + .. 
versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 + """ + + # all the resolved functions in programs must be registered in + # main_program_callables_info main_prog_callables_info = ( - programs[0].program_callables_info.with_edit_callables_mode()) + programs[0].program_callables_info) old_root_kernel_callable = ( programs[0].program_callables_info[programs[0].name]) kernels = [programs[0].root_kernel] @@ -431,17 +433,22 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): renames_needed = {} for old_func_id, in_knl_callable in prog.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): + # Fusing programs with multiple callable kernels is tough. + # Reason: Need to first figure out the order in which the + # callable kernels must be resolved into + # main_program_callables_info, because of renaming is + # needed to be done in the callable kernels before registering. + # Hence disabling it until required. if in_knl_callable.name != prog.name: raise LoopyError("fuse_kernels cannot fuse programs with " "multiple callable kernels.") + + # root kernel are dealt at the end after performing all the + # renaming. continue - num_times_called = ( - prog.program_callables_info.num_times_callables_called[ - old_func_id]) - for i in range(num_times_called): - main_prog_callables_info, new_func_id = ( - main_prog_callables_info.with_callables(var(old_func_id), - in_knl_callable, True)) + main_prog_callables_info, new_func_id = ( + main_prog_callables_info.with_add_callable(var(old_func_id), + in_knl_callable)) if old_func_id != new_func_id: renames_needed[old_func_id] = new_func_id @@ -456,12 +463,10 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): new_root_kernel_callable = old_root_kernel_callable.copy( subkernel=new_root_kernel.copy(name=programs[0].name)) - main_prog_callables_info, _ = main_prog_callables_info.with_callable( + # TODO: change the name of the final root kernel. + main_prog_callables_info, _ = main_prog_callables_info.with_add_callable( var(programs[0].name), new_root_kernel_callable) - main_prog_callables_info = ( - main_prog_callables_info.with_exit_edit_callables_mode()) - return programs[0].copy( program_callables_info=main_prog_callables_info) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 3ae9a142e..ab37519ef 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -52,7 +52,7 @@ def _debug(kernel, s, *args): def get_return_types_as_tuple(arg_id_to_dtype): """Returns the types of arguments in a tuple format. - :param arg_id_to_dtype: An instance of :class:`dict` which denotes a + :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a mapping from the arguments to their inferred types. 
""" return_arg_id_to_dtype = dict((id, dtype) for id, dtype in @@ -894,6 +894,9 @@ def infer_unknown_types(program, expect_completion=False): program_callables_info[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel + from loopy.program import count_callables_in_program_callables_info + old_callables_count = count_callables_in_program_callables_info( + program_callables_info) program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel, program_callables_info = ( @@ -910,10 +913,9 @@ def infer_unknown_types(program, expect_completion=False): type_inferred_knl_callable)) program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) + program_callables_info.with_exit_edit_callables_mode( + old_callables_count)) - # FIXME: maybe put all of this in a function? - # need to infer functions that were left out during inference return program.copy(program_callables_info=program_callables_info) # }}} -- GitLab From 42229e028ba32c132fde98deee8edec002354131 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 11:23:35 +0530 Subject: [PATCH 348/916] much better design for program callables info. --- loopy/program.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 32869d267..e3a527ee6 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -526,6 +526,8 @@ def count_callables_in_program_callables_info(program_callables_info): Returns an instance of :class:`collection.Counter` representing the number of times the callables is called in program_callables_info. """ + # should raise an error if there are more than one root kernels(which is + # illegal) root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable in program_callables_info.values() if isinstance(in_knl_callable, CallableKernel) and @@ -636,6 +638,9 @@ class ProgramCallablesInfo(ImmutableRecord): history[unique_function_identifier] = set( [unique_function_identifier]) + if unique_function_identifier == 'loopy_kernel_0': + 1/0 + return ( self.copy( history=history, @@ -719,10 +724,16 @@ class ProgramCallablesInfo(ImmutableRecord): # }}} unique_function_identifier = function.name - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # special treatment if the callable is the root kernel + pass + else: + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( -- GitLab From fa0fb70b114f3727a3683488e2cc55c900081873 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 12:22:50 +0530 Subject: [PATCH 349/916] deal with reduction callables. 
--- loopy/program.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index e3a527ee6..7010e1108 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -135,8 +135,9 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) assert in_knl_callable is not None self.program_callables_info, _ = ( - self.program_callables_info.with_callable(func_id, - in_knl_callable, True)) + self.program_callables_info.with_add_callable(func_id, + in_knl_callable)) + # FIXME: where do you deal with the parameters? ~KK return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) @@ -486,6 +487,10 @@ class CallablesCountingMapper(CombineMapper): map_call_with_kwargs = map_call + def map_reduction(self, expr): + return Counter(expr.operation.get_scalar_callables()) + ( + super(CallablesCountingMapper, self).map_reduction(expr)) + def map_constant(self, expr): return Counter() @@ -592,10 +597,21 @@ class ProgramCallablesInfo(ImmutableRecord): Returns a copy of *self* with the *function* associated with the *in_kernel_callable*. """ + # FIXME: pleasse better docs.. ~KK # note: this does not require the edit mode to be true. # the reason for the edit mode is that we need to take care of the # renaming that might be needed to be done # PS: delete this note? + + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ReductionOpFunction)) + + # }}} + history = self.history.copy() if in_kernel_callable in self.resolved_functions.values(): @@ -617,9 +633,12 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) + history[unique_function_identifier] = set( + [unique_function_identifier]) return ( self.copy( + history=history, resolved_functions=updated_resolved_functions), unique_function_identifier) @@ -638,9 +657,6 @@ class ProgramCallablesInfo(ImmutableRecord): history[unique_function_identifier] = set( [unique_function_identifier]) - if unique_function_identifier == 'loopy_kernel_0': - 1/0 - return ( self.copy( history=history, @@ -779,7 +795,8 @@ class ProgramCallablesInfo(ImmutableRecord): resolved_functions = {} - for func_id, in_knl_callable in self.resolved_functions.items(): + for func_id in new_callables_count: + in_knl_callable = self.resolved_functions[func_id] if isinstance(in_knl_callable, CallableKernel): # If callable kernel, perform renames. old_subkernel = in_knl_callable.subkernel -- GitLab From a161a4854c2b800884fc12269062f60cafe8b95e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 12:26:34 +0530 Subject: [PATCH 350/916] removes wrong invocation of with_callable for ManglerCallable. 
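with_callable is meant for swapping out a callable that is already resolved and
being edited; a ManglerCallable produced during type inference is being seen for
the first time, so it belongs in with_add_callable instead.

A toy illustration of that add-versus-update distinction, using a plain dict rather
than the real ProgramCallablesInfo (all names here are hypothetical):

    registry = {}

    def add_callable(name, obj):
        # first-time registration of a freshly resolved callable
        assert name not in registry
        registry[name] = obj

    def update_callable(name, obj):
        # edit-mode update: only valid for names that are already resolved
        assert name in registry
        registry[name] = obj

    add_callable("mangled_func", "scalar-callable")    # what type inference wants
    update_callable("mangled_func", "specialized-scalar-callable")
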
--- loopy/type_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index ab37519ef..8b5a656ca 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -408,8 +408,8 @@ class TypeInferenceMapper(CombineMapper): identifier, function_mangler, arg_id_to_dtype, arg_id_to_descr, mangle_result.target_name) self.program_callables_info, new_function_id = ( - self.program_callables_info.with_callable( - expr.function, in_knl_callable, True)) + self.program_callables_info.with_add_callable( + expr.function, in_knl_callable)) if isinstance(expr, Call): self.old_calls_to_new_calls[expr] = new_function_id -- GitLab From 76336791d7b6cb6919ec97b02a32f4e74740c7db Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 12:50:27 +0530 Subject: [PATCH 351/916] count callables in expression after expanding for substitutitons. --- loopy/kernel/__init__.py | 4 ++-- loopy/program.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 3b189da59..89aef6602 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1367,8 +1367,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ direct execution def __call__(self, *args, **kwargs): - raise LoopyError("Calling a LoopKernel is deprecated, call a Program " - "instead.") + warn("Calling a LoopKernel is deprecated, call a Program " + "instead.", DeprecationWarning, stacklevel=2) from loopy.program import make_program_from_kernel program = make_program_from_kernel(self) return program(*args, **kwargs) diff --git a/loopy/program.py b/loopy/program.py index 7010e1108..12fe756d3 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -29,8 +29,8 @@ from pytools import ImmutableRecord, memoize_method from pymbolic.primitives import Variable from functools import wraps -from loopy.symbolic import ( - RuleAwareIdentityMapper, ResolvedFunction, CombineMapper) +from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, + CombineMapper, SubstitutionRuleExpander) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.kernel.instruction import ( @@ -511,11 +511,13 @@ def count_callables_in_kernel(kernel, program_callables_info): callables_count = Counter() callables_counting_mapper = CallablesCountingMapper( program_callables_info) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): callables_count += ( - callables_counting_mapper(insn.expression)) + callables_counting_mapper(subst_expander( + insn.expression))) elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): pass else: -- GitLab From ab8bebf0a06bc3661396d0b49176ae47c7ee40f1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 13:16:30 +0530 Subject: [PATCH 352/916] pass statistics --- loopy/preprocess.py | 4 +--- loopy/program.py | 49 ++++++++++++++++++++++------------------- loopy/statistics.py | 28 ++++++++++------------- loopy/type_inference.py | 4 +--- 4 files changed, 40 insertions(+), 45 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 56db777b5..472c74db1 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2269,9 +2269,7 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): def infer_arg_descr(program): root_kernel_callable = program.program_callables_info[program.name] - from loopy.program import 
count_callables_in_program_callables_info - old_callables_count = count_callables_in_program_callables_info( - program.program_callables_info) + old_callables_count = program.program_callables_info.callables_count() program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel = program.root_kernel diff --git a/loopy/program.py b/loopy/program.py index 12fe756d3..a0477bdf5 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -526,27 +526,6 @@ def count_callables_in_kernel(kernel, program_callables_info): return callables_count - -# FIXME: @memoize_method -def count_callables_in_program_callables_info(program_callables_info): - """ - Returns an instance of :class:`collection.Counter` representing the number - of times the callables is called in program_callables_info. - """ - # should raise an error if there are more than one root kernels(which is - # illegal) - root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable - in program_callables_info.values() if - isinstance(in_knl_callable, CallableKernel) and - in_knl_callable.subkernel.is_called_from_host] - - from collections import Counter - callables_count = Counter([root_kernel_name]) - callables_count += ( - count_callables_in_kernel(program_callables_info[ - root_kernel_name].subkernel, program_callables_info)) - return callables_count - # }}} @@ -594,6 +573,29 @@ class ProgramCallablesInfo(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash + # FIXME: @memoize_method + def callables_count(self): + """ + Returns an instance of :class:`collection.Counter` representing the number + of times the callables is called in program_callables_info. + """ + # should raise an error if there are more than one root kernels(which is + # illegal) + root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable + in self.values() if + isinstance(in_knl_callable, CallableKernel) and + in_knl_callable.subkernel.is_called_from_host] + + from collections import Counter + callables_count = Counter([root_kernel_name]) + callables_count += ( + count_callables_in_kernel(self[ + root_kernel_name].subkernel, self)) + + return callables_count + + # {{{ interface to perfrom edits on callables + def with_add_callable(self, function, in_kernel_callable): """ Returns a copy of *self* with the *function* associated with the @@ -776,8 +778,7 @@ class ProgramCallablesInfo(ImmutableRecord): is renamed back to ``sin``. 
""" - new_callables_count = count_callables_in_program_callables_info( - self) + new_callables_count = self.callables_count() history = self.history.copy() renames_needed = {} @@ -827,6 +828,8 @@ class ProgramCallablesInfo(ImmutableRecord): is_being_edited=False, resolved_functions=resolved_functions) + # }}} + # {{{ behave like a dict(syntactic sugar) def __getitem__(self, item): diff --git a/loopy/statistics.py b/loopy/statistics.py index 95e9f62a2..3799967b4 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1396,17 +1396,17 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, op_map = ToCountMap() + callables_count = ( + program.program_callables_info.callables_count()) + for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_op_map = get_op_map_for_single_kernel(knl, program.program_callables_info, numpy_types, count_redundant_work, subgroup_size) - for i in range(num_times_called): + for i in range(callables_count[func_id]): op_map += knl_op_map elif isinstance(in_knl_callable, ScalarCallable): pass @@ -1684,18 +1684,17 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, access_map = ToCountMap() + callables_count = program.program_callables_info.callables_count() + for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_access_map = get_access_map_for_single_kernel(knl, program.program_callables_info, numpy_types, count_redundant_work, subgroup_size) # FIXME: didn't see any easy way to multiply - for i in range(num_times_called): + for i in range(callables_count[func_id]): access_map += knl_access_map elif isinstance(in_knl_callable, ScalarCallable): pass @@ -1809,18 +1808,16 @@ def get_synchronization_map(program, subgroup_size=None): program = preprocess_program(program) sync_map = ToCountMap() + callables_count = program.program_callables_info.callables_count() for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_sync_map = get_synchronization_map_for_single_kernel(knl, program.program_callables_info, subgroup_size) # FIXME: didn't see any easy way to multiply - for i in range(num_times_called): + for i in range(callables_count[func_id]): sync_map += knl_sync_map elif isinstance(in_knl_callable, ScalarCallable): pass @@ -1887,18 +1884,17 @@ def gather_access_footprints(program, ignore_uncountable=False): write_footprints = [] read_footprints = [] + callables_count = program.program_callables_info.callables_count() + for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): - num_times_called = ( - program.program_callables_info.num_times_callables_called[ - func_id]) knl = in_knl_callable.subkernel knl_write_footprints, knl_read_footprints = ( gather_access_footprints_for_single_kernel(knl, ignore_uncountable)) # FIXME: didn't see any easy way to multiply - for i in range(num_times_called): + for i in range(callables_count[func_id]): write_footprints.extend(knl_write_footprints) 
read_footprints.extend(knl_read_footprints) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 8b5a656ca..76d4a579d 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -894,9 +894,7 @@ def infer_unknown_types(program, expect_completion=False): program_callables_info[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - from loopy.program import count_callables_in_program_callables_info - old_callables_count = count_callables_in_program_callables_info( - program_callables_info) + old_callables_count = program_callables_info.callables_count() program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel, program_callables_info = ( -- GitLab From 44b247dc760d6f2eeb9e06b0cf375ce24262b68b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 14:28:48 +0530 Subject: [PATCH 353/916] dont rename if given a root kernel. --- loopy/program.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index a0477bdf5..efc66b5a5 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -649,15 +649,25 @@ class ProgramCallablesInfo(ImmutableRecord): # }}} unique_function_identifier = function.name - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + + if isinstance(in_kernel_callable, CallableKernel) and ( + in_kernel_callable.subkernel.is_called_from_host): + # special treatment if the callable is the root kernel + pass + else: + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) + if 'strongVolumeKernelR_0' in updated_resolved_functions: + import pudb + pudb.set_trace() + history[unique_function_identifier] = set( [unique_function_identifier]) @@ -759,6 +769,10 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) + if 'strongVolumeKernelR_0' in updated_resolved_functions: + import pudb + pudb.set_trace() + history[unique_function_identifier] = ( history[function.name] | set([unique_function_identifier])) -- GitLab From 01e42c10b6e3b362d2dc325c7e1d177e0b7377a0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 20:31:08 +0530 Subject: [PATCH 354/916] perform only one rename! 
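Without the break, a name that disappeared from the new counts could be handed out
as the rename target for several of its diverged flavors, and the resulting
resolved_functions dict would collapse distinct callables onto one key. Stopping
after the first match keeps the rename map one-to-one.

A minimal sketch of the failure mode with plain dicts (illustrative data, not the
real classes):

    history = {"sin_0": {"sin", "sin_0"}, "sin_1": {"sin", "sin_1"}}
    dropped = ["sin"]              # present before the edit, gone afterwards
    surviving = ["sin_0", "sin_1"]

    renames = {}
    for old in dropped:
        for new in surviving:
            if new not in renames and old in history[new]:
                renames[new] = old
                break              # without this, sin_1 would also become 'sin'

    assert renames == {"sin_0": "sin"}
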
--- loopy/program.py | 1 + loopy/type_inference.py | 5 ----- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index efc66b5a5..911667dfa 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -809,6 +809,7 @@ class ProgramCallablesInfo(ImmutableRecord): new_callables_count.keys()-renames_needed.keys()): if old_func_id in history[new_func_id]: renames_needed[new_func_id] = old_func_id + break resolved_functions = {} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 76d4a579d..52150dcd8 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -882,11 +882,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" - from loopy.kernel import LoopKernel - if isinstance(program, LoopKernel): - # FIXME: deprecate warning needed here - from loopy.program import make_program_from_kernel - program = make_program_from_kernel(program) program_callables_info = program.program_callables_info -- GitLab From 50dc2fe4b266a968360fb03749705478372342d6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 20:38:25 +0530 Subject: [PATCH 355/916] replace keys() by six.viewkeys() for py2.7. --- loopy/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 911667dfa..3872a83e4 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -806,7 +806,7 @@ class ProgramCallablesInfo(ImmutableRecord): # this implies that all the function instances having the name # "func_id" have been renamed to something else. for new_func_id in ( - new_callables_count.keys()-renames_needed.keys()): + six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): if old_func_id in history[new_func_id]: renames_needed[new_func_id] = old_func_id break -- GitLab From 7ab71c675f472e2daa94f02a53c9fa61e8b5e2ff Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 21:34:23 +0530 Subject: [PATCH 356/916] make ProgramCallablesInfo hashable. 
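memoize_method caches on its arguments, so everything passed to the newly memoized
grid-size queries (including program_callables_info) has to be hashable. Plain
dicts and sets are not, which is why __hash__ is built from frozen views of the
record's dicts and the history entries switch from set to frozenset.

A small sketch of the pattern with an illustrative record class (not the real one):

    class FrozenRecord(object):
        def __init__(self, resolved, history):
            self.resolved = resolved    # dict: name -> callable id (hashable)
            self.history = history      # dict: name -> frozenset of old names

        def __hash__(self):
            # freeze the item views; works because the values are hashable
            return hash((frozenset(self.resolved.items()),
                         frozenset(self.history.items())))

    r = FrozenRecord({"sin_0": "scalar:sin"},
                     {"sin_0": frozenset(["sin", "sin_0"])})
    assert isinstance(hash(r), int)
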
--- loopy/kernel/__init__.py | 2 ++ loopy/program.py | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 89aef6602..8b2cf3dd2 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1035,6 +1035,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) + @memoize_method def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that @@ -1132,6 +1133,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + @memoize_method def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, program_callables_info, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that diff --git a/loopy/program.py b/loopy/program.py index 3872a83e4..d19cd4e88 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -500,7 +500,7 @@ class CallablesCountingMapper(CombineMapper): map_type_cast = map_constant -# FIXME: @memoize_method +@memoize_method def count_callables_in_kernel(kernel, program_callables_info): """ Returns an instance of :class:`collections.Counter` representing the number @@ -558,7 +558,7 @@ class ProgramCallablesInfo(ImmutableRecord): history=None, is_being_edited=False): if history is None: - history = dict((func_id, set([func_id])) for func_id in + history = dict((func_id, frozenset([func_id])) for func_id in resolved_functions) super(ProgramCallablesInfo, self).__init__( @@ -571,9 +571,16 @@ class ProgramCallablesInfo(ImmutableRecord): "is_being_edited", "history") + def __hash__(self): + return hash(( + frozenset(six.iteritems(self.resolved_functions)), + frozenset(six.iteritems(self.history)), + self.is_being_edited + )) + update_persistent_hash = LoopKernel.update_persistent_hash - # FIXME: @memoize_method + @memoize_method def callables_count(self): """ Returns an instance of :class:`collection.Counter` representing the number @@ -623,7 +630,7 @@ class ProgramCallablesInfo(ImmutableRecord): # identifier corresposing to that callable. for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: - history[func_id] = history[func_id] | set([function.name]) + history[func_id] = history[func_id] | frozenset([function.name]) return ( self.copy( history=history), @@ -637,7 +644,7 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) - history[unique_function_identifier] = set( + history[unique_function_identifier] = frozenset( [unique_function_identifier]) return ( @@ -668,7 +675,7 @@ class ProgramCallablesInfo(ImmutableRecord): import pudb pudb.set_trace() - history[unique_function_identifier] = set( + history[unique_function_identifier] = frozenset( [unique_function_identifier]) return ( @@ -733,7 +740,7 @@ class ProgramCallablesInfo(ImmutableRecord): # identifier corresponding to that callable. 
for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: - history[func_id] = history[func_id] | set([function.name]) + history[func_id] = history[func_id] | frozenset([function.name]) return ( self.copy( history=history), @@ -774,7 +781,7 @@ class ProgramCallablesInfo(ImmutableRecord): pudb.set_trace() history[unique_function_identifier] = ( - history[function.name] | set([unique_function_identifier])) + history[function.name] | frozenset([unique_function_identifier])) return ( self.copy( -- GitLab From 8d4af7a2a89e7cff3db9c2a351733abfeb0161ef Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 14 Aug 2018 22:24:31 +0530 Subject: [PATCH 357/916] update persistent dict changed for frozenset. --- loopy/library/reduction.py | 1 - loopy/tools.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index b968192e6..b3deba65e 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -229,7 +229,6 @@ class ReductionOpFunction(FunctionIdentifier): update_persistent_hash = LoopKernel.update_persistent_hash - # }}} diff --git a/loopy/tools.py b/loopy/tools.py index b243a7949..5eabe6c3c 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -79,6 +79,11 @@ class LoopyKeyBuilder(KeyBuilderBase): update_for_defaultdict = update_for_dict + def update_for_frozenset(self, key_hash, key): + for set_key in sorted(key, + key=lambda obj: type(obj).__name__ + str(obj)): + self.rec(key_hash, set_key) + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) -- GitLab From f8307a0ed463312a6eb162f7b8ab054babad97f3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 08:32:16 +0530 Subject: [PATCH 358/916] minor cleanup/comments. --- loopy/preprocess.py | 91 +++++++++++++++++++++++++++------------------ loopy/program.py | 7 +++- 2 files changed, 59 insertions(+), 39 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 472c74db1..e9e55cc46 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2149,10 +2149,7 @@ def check_atomic_loads(kernel): class ArgDescrInferenceMapper(RuleAwareIdentityMapper): """ - Returns a set of instances of :class:`tuple` (expr, - in_kernel_callable). The mapped `in_kernel_callable` of the - :class:`InKernelCallable` are descriptor specialized for the given - arguments. + Infers the :attr:`loopy` """ def __init__(self, rule_mapping_context, caller_kernel, @@ -2250,9 +2247,11 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): Returns a copy of *kernel* with the argument shapes and strides matching for scoped functions in the *kernel*. Refer :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. - """ - # FIXME: update this docs, once the design is finalized + .. note:: + + Initiates a walk starting from *kernel* to all its callee kernels. + """ from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( @@ -2268,6 +2267,11 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): def infer_arg_descr(program): + """ + Returns a copy of *program* with the + :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the + callables. 
+ """ root_kernel_callable = program.program_callables_info[program.name] old_callables_count = program.program_callables_info.callables_count() program_callables_info = ( @@ -2397,28 +2401,60 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): return kernel -def preprocess_kernel(kernel, device=None): - # FIXME: error message? - return preprocess_program(kernel, device) +# {{{ hw axes inference + +def infer_hw_axes_sizes(program): + """ + Returns copy of *program* with the hardware axes sizes inferred. + + .. note:: + + - Firstly, computes the collective hardware axes sizes from all the + callable kernels. + - Then, overrides the grid sizes of all the callable kernels to the + collective value. + """ + + local_size, global_size = program.get_grid_size_upper_bounds() + + resolved_function_with_hw_axes_sizes_inferred = {} + + for func_id, in_knl_callable in ( + program.program_callables_info.items()): + if func_id == program.name: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable) + else: + resolved_function_with_hw_axes_sizes_inferred[func_id] = ( + in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + + new_program_callables_info = ( + program.program_callables_info.copy( + resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) + + program = program.copy(program_callables_info=new_program_callables_info) + +# }}} def preprocess_program(program, device=None): if device is not None: + # FIXME: Time to remove this? (Git blame shows 5 years ago) from warnings import warn warn("passing 'device' to preprocess_kernel() is deprecated", DeprecationWarning, stacklevel=2) program = infer_unknown_types(program, expect_completion=False) - # {{{ preprocess the root kernel + # {{{ preprocess callable kernels # Callable editing restrictions: # - # - cannot edit program_callables_info in :meth:`preprocess_single_kernel` - # as we are iterating over it. + # - should not edit program_callables_info in :meth:`preprocess_single_kernel` + # as we are iterating over it.[1] # - # Refer: https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects new_resolved_functions = {} for func_id, in_knl_callable in program.program_callables_info.items(): @@ -2431,7 +2467,7 @@ def preprocess_program(program, device=None): elif isinstance(in_knl_callable, ScalarCallable): pass else: - raise NotImplementedError("Unknown type of callable %s." % ( + raise NotImplementedError("Unknown callable type %s." % ( type(in_knl_callable).__name__)) new_resolved_functions[func_id] = in_knl_callable @@ -2445,32 +2481,13 @@ def preprocess_program(program, device=None): # infer arg descrs of the callables program = infer_arg_descr(program) - # {{{ hw axes inference - - # FIXME: think of wrapping this in a function? 
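# [Editorial sketch -- not part of the patch] Hypothetical usage of the
# ``infer_hw_axes_sizes`` pass defined above, on this branch (where
# ``lp.make_kernel`` returns a ``Program``): after ``preprocess_program``,
# all callee callables carry the collective grid sizes that the pass reads
# off via ``get_grid_size_upper_bounds()``.
import loopy as lp

prog = lp.make_kernel("{[i]: 0<=i<16}", "out[i] = 2*a[i]")
prog = lp.split_iname(prog, "i", 4, outer_tag="g.0", inner_tag="l.0")
prog = lp.preprocess_program(prog)
local_size, global_size = prog.get_grid_size_upper_bounds()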
+ program = infer_hw_axes_sizes(program) - local_size, global_size = program.get_grid_size_upper_bounds() - - resolved_function_with_hw_axes_sizes_set = {} - - for func_id, in_knl_callable in ( - program.program_callables_info.items()): - if func_id == program.name: - resolved_function_with_hw_axes_sizes_set[func_id] = ( - in_knl_callable) - else: - resolved_function_with_hw_axes_sizes_set[func_id] = ( - in_knl_callable.with_hw_axes_sizes(local_size, global_size)) - - new_program_callables_info = ( - program.program_callables_info.copy( - resolved_functions=resolved_function_with_hw_axes_sizes_set)) + return program - program = program.copy(program_callables_info=new_program_callables_info) - # }}} - - return program +# FIXME: Do we add a deprecation warning? +preprocess_kernel = preprocess_program # vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py index d19cd4e88..eec8157c1 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -861,10 +861,13 @@ class ProgramCallablesInfo(ImmutableRecord): return item in self.resolved_functions def items(self): - return self.resolved_functions.items() + return six.iteritems(self.resolved_functions) def values(self): - return self.resolved_functions.values() + return six.itervalues(self.resolved_functions) + + def keys(self): + return six.iterkeys(self.resolved_functions) # }}} -- GitLab From caec9506a1b42bddb2ce57e009c207aaad4d7dc9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 08:46:50 +0530 Subject: [PATCH 359/916] with_add_callable -> with_added_callable --- loopy/program.py | 10 +++++----- loopy/transform/fusion.py | 4 ++-- loopy/type_inference.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index eec8157c1..90eb64e98 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -114,7 +114,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): # resolved in-kernel callable self.program_callables_info, new_func_id = ( - self.program_callables_info.with_add_callable(expr.function, + self.program_callables_info.with_added_callable(expr.function, in_knl_callable)) return type(expr)( ResolvedFunction(new_func_id), @@ -135,7 +135,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) assert in_knl_callable is not None self.program_callables_info, _ = ( - self.program_callables_info.with_add_callable(func_id, + self.program_callables_info.with_added_callable(func_id, in_knl_callable)) # FIXME: where do you deal with the parameters? ~KK return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) @@ -168,7 +168,7 @@ def initialize_program_callables_info_from_kernel(kernel): callable_kernel = CallableKernel(kernel_with_functions_resolved) # add the callable kernel to the program_callables_info - program_callables_info, _ = program_callables_info.with_add_callable( + program_callables_info, _ = program_callables_info.with_added_callable( Variable(kernel.name), callable_kernel) return program_callables_info @@ -603,7 +603,7 @@ class ProgramCallablesInfo(ImmutableRecord): # {{{ interface to perfrom edits on callables - def with_add_callable(self, function, in_kernel_callable): + def with_added_callable(self, function, in_kernel_callable): """ Returns a copy of *self* with the *function* associated with the *in_kernel_callable*. @@ -704,7 +704,7 @@ class ProgramCallablesInfo(ImmutableRecord): .. 
note:: - - Use :meth:`with_add_callable` if a callable is being resolved for the + - Use :meth:`with_added_callable` if a callable is being resolved for the first time. """ diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index f2e62368e..b0d677649 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -447,7 +447,7 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): # renaming. continue main_prog_callables_info, new_func_id = ( - main_prog_callables_info.with_add_callable(var(old_func_id), + main_prog_callables_info.with_added_callable(var(old_func_id), in_knl_callable)) if old_func_id != new_func_id: @@ -464,7 +464,7 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): subkernel=new_root_kernel.copy(name=programs[0].name)) # TODO: change the name of the final root kernel. - main_prog_callables_info, _ = main_prog_callables_info.with_add_callable( + main_prog_callables_info, _ = main_prog_callables_info.with_added_callable( var(programs[0].name), new_root_kernel_callable) return programs[0].copy( diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 52150dcd8..04392d8d0 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -408,7 +408,7 @@ class TypeInferenceMapper(CombineMapper): identifier, function_mangler, arg_id_to_dtype, arg_id_to_descr, mangle_result.target_name) self.program_callables_info, new_function_id = ( - self.program_callables_info.with_add_callable( + self.program_callables_info.with_added_callable( expr.function, in_knl_callable)) if isinstance(expr, Call): -- GitLab From f041d166645c5d7f72413f45200b475a4b2bc150 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 09:47:06 +0530 Subject: [PATCH 360/916] Minimalized CallableKernel for MR271 --- loopy/kernel/function_interface.py | 169 +---------------------------- loopy/preprocess.py | 2 +- loopy/type_inference.py | 138 ++++++++++++++++++++++- 3 files changed, 138 insertions(+), 171 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 8c3a69111..5efc44ad2 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -23,19 +23,11 @@ THE SOFTWARE. """ -import re -import six - from six.moves import zip from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.symbolic import parse_tagged_name - -from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext, - RuleAwareIdentityMapper, SubstitutionRuleExpander) - from loopy.kernel import LoopKernel @@ -145,7 +137,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): .. note:: - This class acts as a pseduo-callable and its significance lies in + This class acts as a pseudo-callable and its significance lies in solving picklability issues. """ fields = set(["local_size", "global_size"]) @@ -228,8 +220,6 @@ class InKernelCallable(ImmutableRecord): Any argument information exists both by its positional and its keyword identifier. """ - # FIXME: In all these with_** functions add that also passes a - # program_callables_info raise NotImplementedError() @@ -333,12 +323,12 @@ class InKernelCallable(ImmutableRecord): class ScalarCallable(InKernelCallable): """ - An abstranct interface the to a scalar callable encountered in a kernel. + An abstract interface the to a scalar callable encountered in a kernel. .. 
note:: The :meth:`ScalarCallable.with_types` is intended to assist with type - specialization of the funciton and is expected to be supplemented in the + specialization of the function and is expected to be supplemented in the derived subclasses. """ @@ -520,68 +510,12 @@ class CallableKernel(InKernelCallable): return (self.subkernel, self.arg_id_to_dtype, self.arg_id_to_descr) - @property - def name(self): - return self.subkernel.name - - def is_ready_for_codegen(self): - return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None) - def generate_preambles(self, target): """ Yields the *target* specific preambles. """ - # FIXME Check that this is correct. - return yield - def emit_call_insn(self, insn, target, expression_to_code_mapper): - - assert self.is_ready_for_codegen() - - from loopy.kernel.instruction import CallInstruction - from pymbolic.primitives import CallWithKwargs - - assert isinstance(insn, CallInstruction) - - parameters = insn.expression.parameters - kw_parameters = {} - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - - assignees = insn.assignees - - parameters = list(parameters) - par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameters.append(kw_parameters[pos_to_kw[i]]) - par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) - - # insert the assigness at the required positions - assignee_write_count = -1 - for i, arg in enumerate(self.subkernel.args): - if arg.is_output_only: - assignee = assignees[-assignee_write_count-1] - parameters.insert(i, assignee) - par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) - assignee_write_count -= 1 - - # no type casting in array calls - from loopy.expression import dtype_to_type_context - from pymbolic.mapper.stringifier import PREC_NONE - from pymbolic import var - - c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr - for par, par_dtype in zip( - parameters, par_dtypes)] - - return var(self.subkernel.name)(*c_parameters), False - # }}} @@ -589,7 +523,7 @@ class CallableKernel(InKernelCallable): class ManglerCallable(ScalarCallable): """ - A callable whose characateristic is defined by a function mangler. + A callable whose characteristic is defined by a function mangler. .. attribute:: function_mangler @@ -662,99 +596,4 @@ class ManglerCallable(ScalarCallable): # }}} - -# {{{ new pymbolic calls to scoped functions - -def next_indexed_variable(function): - """ - Returns an instance of :class:`str` with the next indexed-name in the - sequence for the name of *function*. - - *Example:* ``Variable('sin_0')`` will return ``'sin_1'``. - - :arg function: Either an instance of :class:`pymbolic.primitives.Variable` - or :class:`loopy.reduction.ArgExtOp` or - :class:`loopy.reduction.SegmentedOp`. 
- """ - from loopy.library.reduction import ReductionOpFunction - if isinstance(function, ReductionOpFunction): - return function.copy() - func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") - - match = func_name.match(function.name) - - if match is None: - if function.name[-1] == '_': - return "{old_name}0".format(old_name=function.name) - else: - return "{old_name}_0".format(old_name=function.name) - - return "{alpha}_{num}".format(alpha=match.group('alpha'), - num=int(match.group('num'))+1) - - -class FunctionNameChanger(RuleAwareIdentityMapper): - """ - Changes the names of scoped functions in calls of expressions according to - the mapping ``calls_to_new_functions`` - """ - - def __init__(self, rule_mapping_context, calls_to_new_names, - subst_expander): - super(FunctionNameChanger, self).__init__(rule_mapping_context) - self.calls_to_new_names = calls_to_new_names - self.subst_expander = subst_expander - - def map_call(self, expr, expn_state): - name, tag = parse_tagged_name(expr.function) - - if name not in self.rule_mapping_context.old_subst_rules: - expanded_expr = self.subst_expander(expr) - if expr in self.calls_to_new_names: - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters)) - elif expanded_expr in self.calls_to_new_names: - # FIXME: this is horribly wrong logic. - # investigate how to make edits to a substitution rule - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expanded_expr]), - tuple(self.rec(child, expn_state) - for child in expanded_expr.parameters)) - else: - return super(FunctionNameChanger, self).map_call( - expr, expn_state) - else: - return self.map_substitution(name, tag, expr.parameters, expn_state) - - def map_call_with_kwargs(self, expr, expn_state): - - if expr in self.calls_to_new_names: - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) - ) - else: - return super(FunctionNameChanger, self).map_call_with_kwargs( - expr, expn_state) - - -def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, kernel.get_var_name_generator()) - subst_expander = SubstitutionRuleExpander(kernel.substitutions) - name_changer = FunctionNameChanger(rule_mapping_context, - pymbolic_calls_to_new_names, subst_expander) - - return rule_mapping_context.finish_kernel( - name_changer.map_kernel(kernel)) - -# }}} - - # vim: foldmethod=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e9e55cc46..41674ed92 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2432,7 +2432,7 @@ def infer_hw_axes_sizes(program): program.program_callables_info.copy( resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) - program = program.copy(program_callables_info=new_program_callables_info) + return program.copy(program_callables_info=new_program_callables_info) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 04392d8d0..e5c17886d 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -36,7 +36,10 @@ from loopy.diagnostic import ( from loopy.kernel.instruction import _DataObliviousInstruction from loopy.program import ProgramCallablesInfo -from loopy.symbolic import LinearSubscript +from loopy.symbolic import ( + LinearSubscript, parse_tagged_name, 
RuleAwareIdentityMapper, + SubstitutionRuleExpander, ResolvedFunction, + SubstitutionRuleMappingContext) from pymbolic.primitives import Variable, Subscript, Lookup import logging @@ -62,6 +65,135 @@ def get_return_types_as_tuple(arg_id_to_dtype): return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) +# {{{ renaming helpers + +class FunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``calls_to_new_functions`` + """ + + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super(FunctionNameChanger, self).__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.calls_to_new_names: + # FIXME: This is killing the substitution. + # Maybe using a RuleAwareIdentityMapper for TypeInferenceMapper + # would help. + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expanded_expr.parameters)) + else: + return super(FunctionNameChanger, self).map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + dict( + (key, self.rec(val, expn_state)) + for key, val in six.iteritems(expr.kw_parameters)) + ) + else: + return super(FunctionNameChanger, self).map_call_with_kwargs( + expr, expn_state) + + +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): + """ + Returns a copy of *kernel* with the names of pymbolic calls changed + according to the mapping given by *pymbolic_calls_new_names*. + + :arg pymbolic_calls_to_new_names: A mapping from instances of + :class:`pymbolic.primitives.Call` to :class:`str`. + + **Example: ** + + - Given a *kernel* -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin')(x[i]) + end i + ------------------------------------------------------------- + + - And given a *pymbolic_calls_to_new_names* -- + + .. code:: + + {Call(ResolvedFunction(Variable('sin')), (Subscript(Variable('x'), + Variable('i')),))": 'sin_1'} + + - The following *kernel* is returned -- + + .. 
code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin_1')(x[i]) + end i + ------------------------------------------------------------- + """ + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + name_changer = FunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) + +# }}} + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): @@ -276,7 +408,6 @@ class TypeInferenceMapper(CombineMapper): def map_call(self, expr, return_tuple=False): from pymbolic.primitives import Variable, CallWithKwargs, Call - from loopy.symbolic import ResolvedFunction if isinstance(expr, CallWithKwargs): kw_parameters = expr.kw_parameters @@ -862,9 +993,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, args=[new_arg_dict[arg.name] for arg in kernel.args], ) - # this has to be subsitutition - from loopy.kernel.function_interface import ( - change_names_of_pymbolic_calls) type_specialized_kernel = change_names_of_pymbolic_calls( pre_type_specialized_knl, old_calls_to_new_calls) -- GitLab From 4f8ec6989ef1e515fa956214702f7ef11b300305 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 10:42:01 +0530 Subject: [PATCH 361/916] added autofunction/class/methods --- loopy/kernel/function_interface.py | 13 +++ loopy/program.py | 143 +++++++++++++++++------------ 2 files changed, 96 insertions(+), 60 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 5efc44ad2..e4e8c1d59 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -30,6 +30,19 @@ from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: ValueArgDescriptor +.. autoclass:: ArrayArgDescriptor +.. autoclass:: InKernelCallable +.. autoclass:: CallableKernel +.. autoclass:: ScalarCallable +.. autoclass:: ManglerCallable + +""" + # {{{ argument descriptors diff --git a/loopy/program.py b/loopy/program.py index 90eb64e98..e5d033e0f 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -42,7 +42,17 @@ from loopy.kernel import LoopKernel from collections import Counter from pymbolic.primitives import Call, CallWithKwargs -# FIXME: autofunction/autoclass?? ~KK +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: Program +.. autoclass:: ProgramCallablesInfo + +.. autofunction:: make_program_from_kernel +.. 
autofunction:: iterate_over_kernels_if_given_program + +""" class ResolvedFunctionMarker(RuleAwareIdentityMapper): @@ -114,8 +124,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): # resolved in-kernel callable self.program_callables_info, new_func_id = ( - self.program_callables_info.with_added_callable(expr.function, - in_knl_callable)) + self.program_callables_info.with_added_callable( + expr.function, in_knl_callable)) return type(expr)( ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) @@ -137,10 +147,21 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): self.program_callables_info, _ = ( self.program_callables_info.with_added_callable(func_id, in_knl_callable)) - # FIXME: where do you deal with the parameters? ~KK return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) +def _default_func_id_to_kernel_callable_mappers(target): + """ + Returns a list of functions that are provided through *target* by deafault. + """ + # FIXME: the name -- scopers is no longer used!(change it) ~KK + + from loopy.library.function import loopy_specific_callable_scopers + return ( + [loopy_specific_callable_scopers] + ( + target.get_device_ast_builder().function_scopers())) + + def initialize_program_callables_info_from_kernel(kernel): """ Returns an instance of :class:`loopy.ProgramCallablesInfo`, by resolving @@ -148,7 +169,7 @@ def initialize_program_callables_info_from_kernel(kernel): """ # collect the default function resolvers func_id_to_kernel_callable_mappers = ( - default_func_id_to_kernel_callable_mappers(kernel.target)) + _default_func_id_to_kernel_callable_mappers(kernel.target)) program_callables_info = ProgramCallablesInfo({}) from loopy.symbolic import SubstitutionRuleMappingContext @@ -553,6 +574,9 @@ class ProgramCallablesInfo(ImmutableRecord): An instance of :class:`bool` which is intended to aid the working of :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and :meth:`with_exit_edit_callables_mode`. + + .. automethod:: __init__ + .. automethod:: callables_count """ def __init__(self, resolved_functions, history=None, is_being_edited=False): @@ -580,6 +604,7 @@ class ProgramCallablesInfo(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash + @property @memoize_method def callables_count(self): """ @@ -601,18 +626,36 @@ class ProgramCallablesInfo(ImmutableRecord): return callables_count - # {{{ interface to perfrom edits on callables + # {{{ interface to perform edits on callables def with_added_callable(self, function, in_kernel_callable): """ Returns a copy of *self* with the *function* associated with the *in_kernel_callable*. + + .. note:: + + - Always checks whether the + :attr:``loopy.ProgramCallablesInfo.resolved_functions` has + *in_kernel_callable*, does not introduce copies. + + - The difference between + :meth:`loopy.ProgramCallablesInfo.with_added_callable` + and :meth:`ProgramCallablesInfo.with_callable` being that + the former has no support for renaming the callable back i.e. + ``with_callable`` supports renaming from ``sin_0`` to ``sin``, + if possible, through the member method + ``loopy.ProgramCallablesInfo.with_exit_edit_callables_mode`` + + This subtle difference makes -- + + - :meth:`loopy.ProgramCallablesInfo.with_added_callable` suitable + for usage while resolving the functions first time, where no + renaming is needed. + + - :meth:`loopy.ProgramCallablesInfo.with_callable` suitable for + implementing edits in callables during inference-walks. """ - # FIXME: pleasse better docs.. 
~KK - # note: this does not require the edit mode to be true. - # the reason for the edit mode is that we need to take care of the - # renaming that might be needed to be done - # PS: delete this note? # {{{ sanity checks @@ -627,7 +670,7 @@ class ProgramCallablesInfo(ImmutableRecord): if in_kernel_callable in self.resolved_functions.values(): # the callable already exists, implies return the function - # identifier corresposing to that callable. + # identifier corresponding to that callable. for func_id, in_knl_callable in self.resolved_functions.items(): if in_knl_callable == in_kernel_callable: history[func_id] = history[func_id] | frozenset([function.name]) @@ -659,7 +702,7 @@ class ProgramCallablesInfo(ImmutableRecord): if isinstance(in_kernel_callable, CallableKernel) and ( in_kernel_callable.subkernel.is_called_from_host): - # special treatment if the callable is the root kernel + # do not rename root kernel pass else: while unique_function_identifier in self.resolved_functions: @@ -671,10 +714,6 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) - if 'strongVolumeKernelR_0' in updated_resolved_functions: - import pudb - pudb.set_trace() - history[unique_function_identifier] = frozenset( [unique_function_identifier]) @@ -688,24 +727,26 @@ class ProgramCallablesInfo(ImmutableRecord): """ Initiates *self* for a walk traversal through all the callables. """ - # PS: I don't see a need for this method right now. - # This is just for validation purposes, maybe needs to disapper if you - # find a better solution? return self.copy( is_being_edited=True) def with_callable(self, function, in_kernel_callable): """ + Returns a copy of *self* with the *function* associated with the + *in_kernel_callable*. Also refer -- + :meth:`loopy.ProgramCallablesInfo.with_added_callable` + + :arg function: An instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.library.reduction.ReductionOpFunction`. - :arg in_kernel_callables: An instance of + :arg in_kernel_callable: An instance of :class:`loopy.InKernelCallable`. .. note:: - Use :meth:`with_added_callable` if a callable is being resolved for the - first time. + first time. """ # {{{ non-edit mode @@ -714,7 +755,7 @@ class ProgramCallablesInfo(ImmutableRecord): if function.name in self.resolved_functions and ( self.resolved_functions[function.name] == in_kernel_callable): # if not being edited, check that the given function is - # equal to the the old version of the callable. + # equal to the old version of the callable. 
return self, function else: print('Old: ', self.resolved_functions[function.name]) @@ -764,7 +805,7 @@ class ProgramCallablesInfo(ImmutableRecord): if isinstance(in_kernel_callable, CallableKernel) and ( in_kernel_callable.subkernel.is_called_from_host): - # special treatment if the callable is the root kernel + # do not rename root kernel pass else: while unique_function_identifier in self.resolved_functions: @@ -776,10 +817,6 @@ class ProgramCallablesInfo(ImmutableRecord): updated_resolved_functions[unique_function_identifier] = ( in_kernel_callable) - if 'strongVolumeKernelR_0' in updated_resolved_functions: - import pudb - pudb.set_trace() - history[unique_function_identifier] = ( history[function.name] | frozenset([unique_function_identifier])) @@ -791,39 +828,38 @@ class ProgramCallablesInfo(ImmutableRecord): def with_exit_edit_callables_mode(self, old_callables_count): """ - Returns a copy of *self* with renaming of the callables done whenver + Returns a copy of *self* with renaming of the callables done whenever possible. *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``, - then all the renaming is done such that one of flavors of the function + then all the renaming is done such that one of flavors of the callable is renamed back to ``sin``. """ + assert self.is_being_edited + new_callables_count = self.callables_count() - history = self.history.copy() - renames_needed = {} - assert self.is_being_edited + # {{{ calculate the renames needed - # NOTE:(to self by KK) - # all we need to do is change the name of the variables that were seen - # in old_callables_count but are no longer available. - # Using these 2 figure out the renames needed. + renames_needed = {} for old_func_id in old_callables_count-new_callables_count: # this implies that all the function instances having the name # "func_id" have been renamed to something else. for new_func_id in ( six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): - if old_func_id in history[new_func_id]: + if old_func_id in self.history[new_func_id]: renames_needed[new_func_id] = old_func_id break + # }}} - resolved_functions = {} + new_resolved_functions = {} + new_history = {} for func_id in new_callables_count: in_knl_callable = self.resolved_functions[func_id] if isinstance(in_knl_callable, CallableKernel): - # If callable kernel, perform renames. + # if callable kernel, perform renames inside its expressions. old_subkernel = in_knl_callable.subkernel new_subkernel = rename_resolved_functions_in_a_single_kernel( old_subkernel, renames_needed) @@ -836,19 +872,18 @@ class ProgramCallablesInfo(ImmutableRecord): type(in_knl_callable).__name__) if func_id in renames_needed: - # If function name itself in renames change the key of the - # dict. - history.pop(func_id) - new_func_id = renames_needed[func_id] - resolved_functions[new_func_id] = ( + new_resolved_functions[new_func_id] = ( in_knl_callable) + new_history[new_func_id] = self.history[func_id] else: - resolved_functions[func_id] = in_knl_callable + new_resolved_functions[func_id] = in_knl_callable + new_history[func_id] = self.history[func_id] return self.copy( is_being_edited=False, - resolved_functions=resolved_functions) + resolved_functions=new_resolved_functions, + history=new_history) # }}} @@ -874,18 +909,6 @@ class ProgramCallablesInfo(ImmutableRecord): # }}} -def default_func_id_to_kernel_callable_mappers(target): - """ - Returns a list of functions that are provided through *target* by deafault. 
- """ - # FIXME: name scopers is confusing!(change it to something else.) - - from loopy.library.function import loopy_specific_callable_scopers - return ( - [loopy_specific_callable_scopers] + ( - target.get_device_ast_builder().function_scopers())) - - # {{{ helper functions def make_program_from_kernel(kernel): @@ -902,7 +925,7 @@ def make_program_from_kernel(kernel): name=kernel.name, program_callables_info=program_callables_info, func_id_to_in_knl_callable_mappers=( - default_func_id_to_kernel_callable_mappers(kernel.target)), + _default_func_id_to_kernel_callable_mappers(kernel.target)), target=kernel.target) return program -- GitLab From a28164f965eedd1611752e9d7540d108c2ae8d76 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 10:43:14 +0530 Subject: [PATCH 362/916] made callables count a property. --- loopy/preprocess.py | 2 +- loopy/program.py | 2 +- loopy/statistics.py | 8 ++++---- loopy/type_inference.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 41674ed92..446533166 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2273,7 +2273,7 @@ def infer_arg_descr(program): callables. """ root_kernel_callable = program.program_callables_info[program.name] - old_callables_count = program.program_callables_info.callables_count() + old_callables_count = program.program_callables_info.callables_count program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel = program.root_kernel diff --git a/loopy/program.py b/loopy/program.py index e5d033e0f..bdf40a1b0 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -838,7 +838,7 @@ class ProgramCallablesInfo(ImmutableRecord): assert self.is_being_edited - new_callables_count = self.callables_count() + new_callables_count = self.callables_count # {{{ calculate the renames needed diff --git a/loopy/statistics.py b/loopy/statistics.py index 3799967b4..71a629867 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1397,7 +1397,7 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, op_map = ToCountMap() callables_count = ( - program.program_callables_info.callables_count()) + program.program_callables_info.callables_count) for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): @@ -1684,7 +1684,7 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, access_map = ToCountMap() - callables_count = program.program_callables_info.callables_count() + callables_count = program.program_callables_info.callables_count for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): @@ -1808,7 +1808,7 @@ def get_synchronization_map(program, subgroup_size=None): program = preprocess_program(program) sync_map = ToCountMap() - callables_count = program.program_callables_info.callables_count() + callables_count = program.program_callables_info.callables_count for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, CallableKernel): @@ -1884,7 +1884,7 @@ def gather_access_footprints(program, ignore_uncountable=False): write_footprints = [] read_footprints = [] - callables_count = program.program_callables_info.callables_count() + callables_count = program.program_callables_info.callables_count for func_id, in_knl_callable in program.program_callables_info.items(): if isinstance(in_knl_callable, 
CallableKernel): diff --git a/loopy/type_inference.py b/loopy/type_inference.py index e5c17886d..d5df36bf7 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1017,7 +1017,7 @@ def infer_unknown_types(program, expect_completion=False): program_callables_info[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - old_callables_count = program_callables_info.callables_count() + old_callables_count = program_callables_info.callables_count program_callables_info = ( program.program_callables_info.with_edit_callables_mode()) root_kernel, program_callables_info = ( -- GitLab From 621ef9f8c05abe5f9ba64adc2ecbeae9cdd92e58 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 10:56:22 +0530 Subject: [PATCH 363/916] docs cleanup for Program --- loopy/program.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index bdf40a1b0..236bbc44a 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -222,10 +222,13 @@ class Program(ImmutableRecord): .. note:: - - To create an instance of :class:`loopy.Program`, it is recommeneded to + - To create an instance of :class:`loopy.Program`, it is recommended to go through :method:`loopy.make_kernel`. - This data structure and its attributes should be considered immutable, any modifications should be done through :method:`copy`. + + .. automethod:: __init__ + .. automethod:: with_root_kernel """ def __init__(self, name, @@ -329,7 +332,7 @@ class Program(ImmutableRecord): def root_kernel(self): """ Returns an instance of :class:`loopy.LoopKernel` denoting the topmost - level kernel in codegeneration. + level kernel. .. note:: @@ -577,6 +580,10 @@ class ProgramCallablesInfo(ImmutableRecord): .. automethod:: __init__ .. automethod:: callables_count + .. automethod:: with_added_callable + .. automethod:: with_edit_callables_mode + .. automethod:: with_callable + .. automethod:: with_exit_edit_callables_mode """ def __init__(self, resolved_functions, history=None, is_being_edited=False): -- GitLab From 8e64c24f8d0669faaca742138a1982cda56c52cf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 12:07:20 +0530 Subject: [PATCH 364/916] small error in docs. --- doc/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 71b8f4389..4c67e3d3d 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -334,7 +334,7 @@ that these dependencies show up there, too: .. doctest:: - >>> print(knl.stringify(with_dependencies=True)) + >>> print(knl.root_kernel.stringify(with_dependencies=True)) --------------------------------------------------------------------------- KERNEL: loopy_kernel --------------------------------------------------------------------------- -- GitLab From 3293f6ae0b24ce1206487835ac52aeb37a06a174 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 12:16:30 +0530 Subject: [PATCH 365/916] callable kernel no longer has a name. --- loopy/transform/fusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index b0d677649..44e69ecfb 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -439,7 +439,7 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): # main_program_callables_info, because of renaming is # needed to be done in the callable kernels before registering. # Hence disabling it until required. 
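# [Editorial note -- not part of the patch] The one-line change just below is
# a consequence of the earlier "Minimalized CallableKernel" commit: the
# wrapper class no longer exposes a ``name`` attribute, so the kernel's name
# is reached through the wrapped ``LoopKernel``. Illustrative pattern only
# (``prog`` is assumed to be a ``loopy.Program``):
from loopy.kernel.function_interface import CallableKernel

for func_id, clbl in prog.program_callables_info.items():
    if isinstance(clbl, CallableKernel):
        callee_name = clbl.subkernel.name  # formerly: clbl.name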
- if in_knl_callable.name != prog.name: + if in_knl_callable.subkernel.name != prog.name: raise LoopyError("fuse_kernels cannot fuse programs with " "multiple callable kernels.") -- GitLab From 70ada3da326053a6023fa050008284aec9d277eb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 12:32:00 +0530 Subject: [PATCH 366/916] minor changes in docs --- doc/tutorial.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 4c67e3d3d..8e20dbc28 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1207,7 +1207,8 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + >>> knl = lp.preprocess_kernel(knl) + >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) >>> print(knl) --------------------------------------------------------------------------- KERNEL: rotate_v2 @@ -1237,9 +1238,8 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. - >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions + >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) # Schedule added instructions >>> print(knl) --------------------------------------------------------------------------- KERNEL: rotate_v2 -- GitLab From 66b9f4275979426e6e6c9ced76f51c4fc84ebc3a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 12:49:01 +0530 Subject: [PATCH 367/916] Pass docs. --- doc/tutorial.rst | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 8e20dbc28..597240cc7 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1179,7 +1179,7 @@ Let us start with an example. Consider the kernel from above with a .. doctest:: - >>> knl = lp.make_kernel( + >>> prog = lp.make_kernel( ... "[n] -> {[i] : 0<=i>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") Here is what happens when we try to generate code for the kernel: - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) Traceback (most recent call last): ... loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) @@ -1207,9 +1207,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. 
To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: - >>> knl = lp.preprocess_kernel(knl) - >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) - >>> print(knl) + >>> prog = lp.preprocess_kernel(prog) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1238,9 +1239,10 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. - >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) # Schedule added instructions - >>> print(knl) + >>> prog = lp.save_and_reload_temporaries(prog) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) # Schedule added instructions + >>> prog = prog.with_root_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1279,7 +1281,7 @@ does in more detail: The kernel translates into two OpenCL kernels. - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) >>> print(cgr.device_code()) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) -- GitLab From fba32ca309e7ac03bd521816a08dc98d9695c1df Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 Aug 2018 21:11:09 +0530 Subject: [PATCH 368/916] change credits of program.py --- loopy/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 236bbc44a..54d13343e 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -1,6 +1,6 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy -- GitLab From 1bc7cf4a91fdf118eb062af827f80d94a94c8ada Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 17 Aug 2018 17:29:39 +0100 Subject: [PATCH 369/916] compare opaque types --- loopy/types.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/loopy/types.py b/loopy/types.py index 0a08b8a81..4e77317c1 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -202,6 +202,17 @@ class OpaqueType(LoopyType): def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, self.name) + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return ( + type(self) == type(other) + and self.name == other.name) + + def __ne__(self, other): + return not self.__eq__(other) + # }}} -- GitLab From 58ed15782da92bd25474721b07be6c460ccd8fdf Mon Sep 17 00:00:00 2001 From: tj-sun Date: Mon, 20 Aug 2018 19:53:06 +0100 Subject: [PATCH 370/916] need to look into comparisions for scoped function --- loopy/type_inference.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index c05cdb2c1..9254ecbb5 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -467,11 +467,15 @@ class 
TypeInferenceMapper(CombineMapper): def map_comparison(self, expr): # "bool" is unusable because OpenCL's bool has indeterminate memory # format. + self(expr.left, return_tuple=False, return_dtype_set=False) + self(expr.right, return_tuple=False, return_dtype_set=False) return [NumpyType(np.dtype(np.int32))] - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison + def map_logical_not(self, expr): + return [NumpyType(np.dtype(np.int32))] + + map_logical_and = map_logical_not + map_logical_or = map_logical_not def map_group_hw_index(self, expr, *args): return [self.kernel.index_dtype] -- GitLab From 2636fe29c3e574ff14fb1f66764c5f6b34cc54cd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 21:30:11 -0500 Subject: [PATCH 371/916] better function naming, no more usage of "scoped" terminology. --- doc/ref_call.rst | 2 +- loopy/library/function.py | 16 +++++++++++++--- loopy/library/reduction.py | 2 +- loopy/program.py | 6 +++--- loopy/target/__init__.py | 2 +- loopy/target/c/__init__.py | 4 ++-- loopy/target/cuda.py | 4 ++-- loopy/target/opencl.py | 4 ++-- loopy/target/pyopencl.py | 4 ++-- loopy/target/python.py | 4 ++-- 10 files changed, 29 insertions(+), 19 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 4ff1ef2fc..147363a16 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -180,7 +180,7 @@ Changes on the target side to accommodate the new function interface -------------------------------------------------------------------- The earlier "function\_mangler" as a member method of the class -``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The +``lp.ASTBuilderBase`` will be replaced by ``function_id_in_knl_callable_mapper``. The function scopers would return a list of functions with the signature ``(target, identifier)->lp.InKernelCallable``. diff --git a/loopy/library/function.py b/loopy/library/function.py index 8338875d0..f3fb5f8cd 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -55,15 +55,25 @@ class IndexOfCallable(ScalarCallable): program_callables_info) -def loopy_specific_callable_scopers(target, identifier): +def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier): + """ + Returns an instance of :class:`InKernelCallable` for the *idenitifer* + which is not present in *target*, but whose interface is given by + :mod:`loo.py`. Callables that fall in this category are -- + + - reductions leading to function calls like ``argmin``, ``argmax``. + - callables that have a predefined meaning in :mod:`loo.py` like + ``make_tuple``, ``index_of``, ``indexof_vec``. 
+ """ if identifier == "make_tuple": return MakeTupleCallable(name="make_tuple") if identifier in ["indexof", "indexof_vec"]: return IndexOfCallable(name=identifier) - from loopy.library.reduction import reduction_scoper - return reduction_scoper(target, identifier) + from loopy.library.reduction import ( + reduction_func_id_to_in_knl_callable_mapper) + return reduction_func_id_to_in_knl_callable_mapper(target, identifier) # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index b3deba65e..70df864d4 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -502,7 +502,7 @@ class ReductionCallable(ScalarCallable): return -def reduction_scoper(target, identifier): +def reduction_func_id_to_in_knl_callable_mapper(target, identifier): if isinstance(identifier, ReductionOpFunction): return ReductionCallable(name=identifier) diff --git a/loopy/program.py b/loopy/program.py index 54d13343e..fd4ae63f7 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -156,10 +156,10 @@ def _default_func_id_to_kernel_callable_mappers(target): """ # FIXME: the name -- scopers is no longer used!(change it) ~KK - from loopy.library.function import loopy_specific_callable_scopers + from loopy.library.function import loopy_specific_callable_func_id_to_knl_callable_mappers return ( - [loopy_specific_callable_scopers] + ( - target.get_device_ast_builder().function_scopers())) + [loopy_specific_callable_func_id_to_knl_callable_mappers] + ( + target.get_device_ast_builder().function_id_in_knl_callable_mapper())) def initialize_program_callables_info_from_kernel(kernel): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index e3b4853c3..92ee2dc51 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -150,7 +150,7 @@ class ASTBuilderBase(object): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): """ Returns an instance of list of the functions of signature ``(target, identifiers)`` returning either an instance of diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 1579bb313..418ce0256 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -484,9 +484,9 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): return ( - super(CASTBuilder, self).function_scopers() + [ + super(CASTBuilder, self).function_id_in_knl_callable_mapper() + [ scope_c_math_functions]) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 89cbfd034..e6abf73fd 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -274,9 +274,9 @@ class CUDACASTBuilder(CASTBuilder): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): return [scope_cuda_functions] + ( - super(CUDACASTBuilder, self).function_scopers()) + super(CUDACASTBuilder, self).function_id_in_knl_callable_mapper()) # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 44bf9c4c8..d8c195de2 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -442,10 +442,10 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): return ( [scope_opencl_functions] + super( - OpenCLCASTBuilder, self).function_scopers()) + OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def symbol_manglers(self): return ( diff --git a/loopy/target/pyopencl.py 
b/loopy/target/pyopencl.py index 03ba26930..0e9556482 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -792,11 +792,11 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): from loopy.library.random123 import random123_function_scoper return ( [pyopencl_function_scoper, random123_function_scoper] + super( - PyOpenCLCASTBuilder, self).function_scopers()) + PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def preamble_generators(self): return ([ diff --git a/loopy/target/python.py b/loopy/target/python.py index cd6e61167..0dbecce27 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -180,10 +180,10 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_scopers(self): + def function_id_in_knl_callable_mapper(self): from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_scopers() + + super(PythonASTBuilderBase, self).function_id_in_knl_callable_mapper() + [scope_c_math_functions]) def preamble_generators(self): -- GitLab From d923227ed2d2557e0b3dcdc505546ada4069a142 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 21:34:07 -0500 Subject: [PATCH 372/916] flake8 fixes after `sed` --- loopy/program.py | 6 ++++-- loopy/target/python.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index fd4ae63f7..a18d90764 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -156,10 +156,12 @@ def _default_func_id_to_kernel_callable_mappers(target): """ # FIXME: the name -- scopers is no longer used!(change it) ~KK - from loopy.library.function import loopy_specific_callable_func_id_to_knl_callable_mappers + from loopy.library.function import ( + loopy_specific_callable_func_id_to_knl_callable_mappers) return ( [loopy_specific_callable_func_id_to_knl_callable_mappers] + ( - target.get_device_ast_builder().function_id_in_knl_callable_mapper())) + target.get_device_ast_builder().function_id_in_knl_callable_mapper( + ))) def initialize_program_callables_info_from_kernel(kernel): diff --git a/loopy/target/python.py b/loopy/target/python.py index 0dbecce27..2e6712ec1 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -183,7 +183,8 @@ class PythonASTBuilderBase(ASTBuilderBase): def function_id_in_knl_callable_mapper(self): from loopy.target.c import scope_c_math_functions return ( - super(PythonASTBuilderBase, self).function_id_in_knl_callable_mapper() + + super(PythonASTBuilderBase, + self).function_id_in_knl_callable_mapper() + [scope_c_math_functions]) def preamble_generators(self): -- GitLab From 906e1e2eb9a2ee0e850d28f57cccdb5e904ffd57 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 21:35:03 -0500 Subject: [PATCH 373/916] replaces unnecessary old logic in unscoped_call_collector. --- loopy/check.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index ae5599bc4..7033b62df 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -68,10 +68,6 @@ class UnscopedCallCollector(CombineMapper): :returns: An :class:`frozenset` of function names that are not scoped in the kernel. - - .. note:: - :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are - never scoped in the pipeline. 
""" def combine(self, values): @@ -85,8 +81,7 @@ class UnscopedCallCollector(CombineMapper): kw_parameters={})) def map_call_with_kwargs(self, expr): - from loopy.library.reduction import ArgExtOp - if not isinstance(expr.function, (ResolvedFunction, ArgExtOp)): + if not isinstance(expr.function, ResolvedFunction): return (frozenset([expr.function.name]) | self.combine((self.rec(child) for child in expr.parameters + tuple(expr.kw_parameters.values())))) -- GitLab From eeae2d861228796110337b8b5ccacddf84b53543 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 22:00:36 -0500 Subject: [PATCH 374/916] Comment rewording, scoper-> function_id_to_in_knl_callable_mapper --- doc/ref_call.rst | 6 +++--- loopy/check.py | 4 ++-- loopy/kernel/__init__.py | 2 +- loopy/kernel/function_interface.py | 2 +- loopy/library/random123.py | 2 +- loopy/target/pyopencl.py | 8 +++++--- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 147363a16..ab8101372 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -30,7 +30,7 @@ kernel, whose name has been resolved by the kernel. The process of matching a function idenitifier with the function definition is called "resolving". A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it -is "resolved" by one of the ``function_scoper`` in a +is "resolved" by one of the ``function_id_to_in_knl_callable_mapper`` in a :attr:`LoopKernel.scoped_functions` - Functions already registered by the target. Some examples include -- @@ -41,11 +41,11 @@ is "resolved" by one of the ``function_scoper`` in a - Functions registered as ``CallableKernels`` using ``lp.register_callable_kernel(...)``. - Functions that have been provided through - ``lp.register_function_scoper(...)`` + ``lp.register_function_id_to_in_knl_callable_mapper(...)`` - Functions that can be made known from the user through ``lp.register_function_mangler``. This is planned to be deprecated, as its functionality is superseded by - ``lp.register_function_scoper(...)``. + ``lp.register_function_id_to_in_knl_callable_mapper(...)``. Expressions after a function is scoped -------------------------------------- diff --git a/loopy/check.py b/loopy/check.py index 7033b62df..76a56c085 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -181,8 +181,8 @@ def check_loop_priority_inames_known(kernel): def _get_all_unique_iname_tags(kernel): - """Returns a set of all the iname tags used in *kernel* that - inherit from :class:`loopy.kernel.data.UniqueTag`. + """Returns an instance of :class:`set` of all the iname tags used in + *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`. """ from loopy.kernel.data import UniqueTag iname_tags = [kernel.iname_to_tag.get(iname) for iname in diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 8b2cf3dd2..410f13322 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -223,7 +223,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: is_called_from_host An instance of :class:`bool`. Will be set *False* for the kernel which - would be called from another top level kernels. Default value is + would be called from other top level kernels. Default value is *True*. 
""" diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e4e8c1d59..c8b5a9537 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -287,7 +287,7 @@ class InKernelCallable(ImmutableRecord): def with_hw_axes_sizes(self, local_size, global_size): """ Returns a copy of *self* with modifications to comply with the grid - sizes ``(local_size, global_size)`` of the kernel in which it is + sizes ``(local_size, global_size)`` of the program in which it is supposed to be called. :arg local_size: An instance of :class:`islpy.PwAff`. diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 59ca72df1..397e985b4 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -231,7 +231,7 @@ class Random123Callable(ScalarCallable): return -def random123_function_scoper(target, identifier): +def random123_function_id_to_in_knl_callable_mapper(target, identifier): if identifier in FUNC_NAMES_TO_RNG: return Random123Callable(name=identifier) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 0e9556482..435a5e791 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -274,7 +274,7 @@ class PyOpenCLCallable(ScalarCallable): program_callables_info) -def pyopencl_function_scoper(target, identifier): +def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier): if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", "tanh", "conj", "real", "imag", "abs"]: return PyOpenCLCallable(name=identifier) @@ -793,9 +793,11 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library def function_id_in_knl_callable_mapper(self): - from loopy.library.random123 import random123_function_scoper + from loopy.library.random123 import ( + random123_function_id_to_in_knl_callable_mapper) return ( - [pyopencl_function_scoper, random123_function_scoper] + super( + [pyopencl_function_id_to_in_knl_callable_mapper, + random123_function_id_to_in_knl_callable_mapper] + super( PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) def preamble_generators(self): -- GitLab From 481573be0b9ebca023ce2994ed866c66cb85d6e3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 22:02:41 -0500 Subject: [PATCH 375/916] removes FIXME. --- loopy/program.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index a18d90764..161249e01 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -154,8 +154,6 @@ def _default_func_id_to_kernel_callable_mappers(target): """ Returns a list of functions that are provided through *target* by deafault. """ - # FIXME: the name -- scopers is no longer used!(change it) ~KK - from loopy.library.function import ( loopy_specific_callable_func_id_to_knl_callable_mappers) return ( -- GitLab From 46d1502bf2372803eaaa0483a07190d4cfef60cd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 28 Aug 2018 22:34:27 -0500 Subject: [PATCH 376/916] adds a comment that the ref_call needs one more revamping, removed unnecessary fixme in type_inference, some other minor comment rewording. 
--- doc/ref_call.rst | 2 ++ loopy/program.py | 14 +++++++++----- loopy/statistics.py | 4 ++-- loopy/type_inference.py | 2 -- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index ab8101372..5a59e8428 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -4,6 +4,8 @@ Calling Loopy Kernels and External Functions Goals of a function interface ----------------------------- +- *FIXME: * Needs to change after the new design of program. + - Must be able to have complete information of the function just through the epxression node. - Must adhere to :mod:`loopy` semantics of immutability. diff --git a/loopy/program.py b/loopy/program.py index 161249e01..7479ee043 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -556,6 +556,8 @@ def count_callables_in_kernel(kernel, program_callables_info): # {{{ program callables info class ProgramCallablesInfo(ImmutableRecord): + # FIXME: is CallablesTable a better name?(similar to symbol table in + # compilers.) """ Records the information of all the callables called in a :class:`loopy.Program`. @@ -637,8 +639,11 @@ class ProgramCallablesInfo(ImmutableRecord): def with_added_callable(self, function, in_kernel_callable): """ - Returns a copy of *self* with the *function* associated with the - *in_kernel_callable*. + Returns an instance of :class:`tuple` of ``(new_self, new_function)``. + ``new_self`` is a copy of *self* with the *function* associated with the + *in_kernel_callable*. ``new_function`` is the function identifier that + should be noted in the expression node so that it could be associated + with an instance of :class:`InKernelCallable`. .. note:: @@ -739,9 +744,8 @@ class ProgramCallablesInfo(ImmutableRecord): def with_callable(self, function, in_kernel_callable): """ - Returns a copy of *self* with the *function* associated with the - *in_kernel_callable*. Also refer -- - :meth:`loopy.ProgramCallablesInfo.with_added_callable` + Returns an instance of :class:`tuple` ``(new_self, new_function)``. + Also refer -- :meth:`loopy.ProgramCallablesInfo.with_added_callable` :arg function: An instance of :class:`pymbolic.primitives.Variable` or diff --git a/loopy/statistics.py b/loopy/statistics.py index 71a629867..000f651aa 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -61,8 +61,8 @@ __doc__ = """ # FIXME: this is broken for the callable kernel design. -# Qns: -# - The variable name, what if multiple kernels use the same name? +# - The variable name, what if multiple kernels use the same name?(needs a +# different MemAccessInfo) # - We should also add the cumulative effect on the arguments of callee kernels # into the caller kernel # - Make changes to MemAccessInfo to include the effect of several kernels. 
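The ``(new_self, new_function)`` convention documented above for with_added_callable is used roughly as in the sketch below; func_id and in_knl_callable are placeholders for whatever is being registered, not names introduced by this patch.

    # Sketch of the calling convention only (names follow the
    # ProgramCallablesInfo spelling in use at this point in the series).
    program_callables_info, new_func_id = (
            program_callables_info.with_added_callable(
                func_id, in_knl_callable))
    # new_func_id is the identifier to record in the expression node
    # (e.g. as ResolvedFunction(new_func_id)) so that the call site can
    # later be associated with its InKernelCallable.
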
diff --git a/loopy/type_inference.py b/loopy/type_inference.py index d5df36bf7..a2174181e 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -969,8 +969,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, if isinstance(insn, lp.MultiAssignmentBase): # just a dummy run over the expression, to pass over all the # functions - # FIXME: need a check over here which checks the instruction for - # unseen cases if _instruction_missed_during_inference(insn): type_inf_mapper(insn.expression, return_tuple=isinstance(insn, lp.CallInstruction), return_dtype_set=True) -- GitLab From f6205800371ab2580c2dfde2be31e164c53fbaeb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 29 Aug 2018 06:48:28 -0500 Subject: [PATCH 377/916] do not allow to set lang_version for kernel functions. --- loopy/kernel/creation.py | 92 +++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 62c268e62..227ea0a32 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2155,55 +2155,56 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - from loopy.version import LANGUAGE_VERSION_SYMBOLS + if make_program: + from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = dict( + (getattr(loopy.version, lvs), lvs) + for lvs in LANGUAGE_VERSION_SYMBOLS) - lang_version = kwargs.pop("lang_version", None) - if lang_version is None: - # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION + lang_version = kwargs.pop("lang_version", None) + if lang_version is None: + # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION - # This *is* gross. But it seems like the right thing interface-wise. - import inspect - caller_globals = inspect.currentframe().f_back.f_globals + # This *is* gross. But it seems like the right thing interface-wise. + import inspect + caller_globals = inspect.currentframe().f_back.f_globals - for ver_sym in LANGUAGE_VERSION_SYMBOLS: - try: - lang_version = caller_globals[ver_sym] - break - except KeyError: - pass + for ver_sym in LANGUAGE_VERSION_SYMBOLS: + try: + lang_version = caller_globals[ver_sym] + break + except KeyError: + pass - # }}} + # }}} - if lang_version is None: - from warnings import warn - from loopy.diagnostic import LoopyWarning - from loopy.version import ( - MOST_RECENT_LANGUAGE_VERSION, - FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " - "To avoid this warning, pass " - "lang_version={ver} in this invocation. " - "(Or say 'from loopy.version import " - "{sym_ver}' in " - "the global scope of the calling frame.)" - .format( - ver=MOST_RECENT_LANGUAGE_VERSION, - sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] - ), - LoopyWarning, stacklevel=2) - - lang_version = FALLBACK_LANGUAGE_VERSION - - if lang_version not in version_to_symbol: - raise LoopyError("Language version '%s' is not known." 
% (lang_version,)) - if lang_version >= (2018, 1): - options = options.copy(enforce_variable_access_ordered=True) - if lang_version >= (2018, 2): - options = options.copy(ignore_boostable_into=True) + if lang_version is None: + from warnings import warn + from loopy.diagnostic import LoopyWarning + from loopy.version import ( + MOST_RECENT_LANGUAGE_VERSION, + FALLBACK_LANGUAGE_VERSION) + warn("'lang_version' was not passed to make_kernel(). " + "To avoid this warning, pass " + "lang_version={ver} in this invocation. " + "(Or say 'from loopy.version import " + "{sym_ver}' in " + "the global scope of the calling frame.)" + .format( + ver=MOST_RECENT_LANGUAGE_VERSION, + sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] + ), + LoopyWarning, stacklevel=2) + + lang_version = FALLBACK_LANGUAGE_VERSION + + if lang_version not in version_to_symbol: + raise LoopyError("Language version '%s' is not known." % (lang_version,)) + if lang_version >= (2018, 1): + options = options.copy(enforce_variable_access_ordered=True) + if lang_version >= (2018, 2): + options = options.copy(ignore_boostable_into=True) # }}} @@ -2361,6 +2362,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_kernel_function(*args, **kwargs): + lang_version = kwargs.pop('lang_version', None) + if lang_version: + raise LoopyError("lang_version should be set for program, not " + "functions.") + kwargs['make_program'] = False return make_kernel(*args, **kwargs) -- GitLab From 1ac9c4b0a7828c7846edcc1e528984c4bf1c0a1e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 29 Aug 2018 11:25:04 -0500 Subject: [PATCH 378/916] adds the in_kernel matching option. --- loopy/check.py | 6 ++++-- loopy/match.py | 14 +++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index f50ee5cfa..60a97ed87 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -249,9 +249,11 @@ def check_for_inactive_iname_access(kernel): if not expression_inames <= kernel.insn_inames(insn): raise LoopyError( "instruction '%s' references " - "inames '%s' that the instruction does not depend on" + "inames '%s' that the instruction does not depend on in " + "the kernel '%s'" % (insn.id, - ", ".join(expression_inames - kernel.insn_inames(insn)))) + ", ".join(expression_inames - + kernel.insn_inames(insn)), kernel.name)) def _is_racing_iname_tag(tv, tag): diff --git a/loopy/match.py b/loopy/match.py index 3c047e463..9766fac2b 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -49,6 +49,7 @@ Match expressions .. autoclass:: Tagged .. autoclass:: Writes .. autoclass:: Reads +.. autoclass:: InKernel .. 
autoclass:: Iname """ @@ -73,6 +74,7 @@ _id = intern("_id") _tag = intern("_tag") _writes = intern("_writes") _reads = intern("_reads") +_in_kernel = intern("_in_kernel") _iname = intern("_iname") _whitespace = intern("_whitespace") @@ -92,13 +94,14 @@ _LEX_TABLE = [ (_tag, RE(r"tag:([\w?*]+)")), (_writes, RE(r"writes:([\w?*]+)")), (_reads, RE(r"reads:([\w?*]+)")), + (_in_kernel, RE(r"in_kernel:([\w?*]+)")), (_iname, RE(r"iname:([\w?*]+)")), (_whitespace, RE("[ \t]+")), ] -_TERMINALS = ([_id, _tag, _writes, _reads, _iname]) +_TERMINALS = ([_id, _tag, _writes, _reads, _in_kernel, _iname]) # {{{ operator precedence @@ -262,6 +265,11 @@ class Reads(GlobMatchExpressionBase): for name in matchable.read_dependency_names()) +class InKernel(GlobMatchExpressionBase): + def __call__(self, kernel, matchable): + return self.re.match(kernel.name) + + class Iname(GlobMatchExpressionBase): def __call__(self, kernel, matchable): return any(self.re.match(name) @@ -299,6 +307,10 @@ def parse_match(expr): result = Reads(pstate.next_match_obj().group(1)) pstate.advance() return result + elif next_tag is _in_kernel: + result = InKernel(pstate.next_match_obj().group(1)) + pstate.advance() + return result elif next_tag is _iname: result = Iname(pstate.next_match_obj().group(1)) pstate.advance() -- GitLab From 6d9050b702d42f9166de96bb4f13c12ea9ea3d59 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 31 Aug 2018 16:53:58 -0500 Subject: [PATCH 379/916] inlined instruction tags should contain tags from both -- caller and callee. --- loopy/transform/callable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index b5b80ad89..5002e396b 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -455,7 +455,8 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): within_inames=within_inames, # TODO: probaby need to keep priority in callee kernel priority=instruction.priority, - depends_on=depends_on + depends_on=depends_on, + tags=insn.tags | instruction.tags ) inner_insns.append(insn) -- GitLab From 58c788d426cd8c67497ec32c55943672b672a6f5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 3 Sep 2018 16:59:05 -0500 Subject: [PATCH 380/916] passes the atomicity info from callee to caller --- loopy/transform/callable.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 5002e396b..3f8fbb580 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -450,13 +450,19 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): instruction.depends_on) if insn.id in heads: depends_on = depends_on | set([noop_start.id]) + + new_atomicity = tuple( + type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) + for atomicity in insn.atomicity) + insn = insn.copy( id=insn_id[insn.id], within_inames=within_inames, # TODO: probaby need to keep priority in callee kernel priority=instruction.priority, depends_on=depends_on, - tags=insn.tags | instruction.tags + tags=insn.tags | instruction.tags, + atomicity=new_atomicity ) inner_insns.append(insn) -- GitLab From eb42917a6d5b7a923384ae91902cb7cc89dc63ba Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 17 Sep 2018 11:50:31 -0500 Subject: [PATCH 381/916] fixes the statistics tests --- loopy/statistics.py | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/loopy/statistics.py 
b/loopy/statistics.py index 9894656b9..5dddd49e0 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1286,8 +1286,8 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, @memoize_method -def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, - count_granularity=CountGranularity.WORKITEM): +def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, + count_redundant_work, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] if count_granularity is None: @@ -1299,11 +1299,12 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, program_callables_info, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, program_callables_info, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: @@ -1311,7 +1312,7 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(program_callables_info) workgroup_size = 1 if local_size: for size in local_size: @@ -1353,12 +1354,8 @@ def get_op_map_for_single_kernel(knl, program_callables_info, subgroup_size = _process_subgroup_size(knl, subgroup_size) - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl) + op_counter = ExpressionOpCounter(knl, program_callables_info) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1371,9 +1368,9 @@ def get_op_map_for_single_kernel(knl, program_callables_info, op_map = ( op_map + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + * _get_insn_count(knl, program_callables_info, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity)) elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass @@ -1547,10 +1544,6 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, subgroup_size = _process_subgroup_size(knl, subgroup_size) - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - access_map = ToCountMap() access_counter_g = GlobalMemAccessCounter(knl, program_callables_info) access_counter_l = LocalMemAccessCounter(knl, program_callables_info) @@ -1576,18 +1569,18 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, access_map = ( access_map + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + * _get_insn_count(knl, program_callables_info, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity)) for key, val in six.iteritems(access_assignee.count_map): access_map = ( access_map + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) + * _get_insn_count(knl, program_callables_info, insn.id, + 
subgroup_size, count_redundant_work, + key.count_granularity)) elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass -- GitLab From 7389731759bb8b5d8978a7368a2236e7a9554631 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 17 Sep 2018 12:57:09 -0500 Subject: [PATCH 382/916] make the test adapt to the progam model --- test/test_target.py | 2 -- test/test_transform.py | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/test/test_target.py b/test/test_target.py index 0eee835c9..a5186c71c 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -347,8 +347,6 @@ def test_ispc_streaming_stores(): knl = lp.set_argument_order(knl, vars + ["n"]) - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) lp.generate_code_v2(knl).all_code() diff --git a/test/test_transform.py b/test/test_transform.py index f67cb927e..04162331d 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -544,16 +544,16 @@ def test_uniquify_instruction_ids(): def test_split_iname_only_if_in_within(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i<10}", """ c[i] = 3*d[i] {id=to_split} a[i] = 2*b[i] {id=not_to_split} """) - knl = lp.split_iname(knl, "i", 4, within='id:to_split') + prog = lp.split_iname(prog, "i", 4, within='id:to_split') - for insn in knl.instructions: + for insn in prog.root_kernel.instructions: if insn.id == 'to_split': assert insn.within_inames == frozenset({'i_outer', 'i_inner'}) if insn.id == 'not_to_split': -- GitLab From ba27e5defa26d171e5039de2fa877fc1e1b144d0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 14 Oct 2018 20:17:13 -0500 Subject: [PATCH 383/916] minor changes after the review --- examples/python/hello-loopy.py | 3 +-- loopy/auto_test.py | 2 +- loopy/check.py | 4 ++-- loopy/codegen/__init__.py | 11 +++++++++++ loopy/type_inference.py | 4 ++-- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 764cea0e6..9098c5444 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -16,8 +16,7 @@ a = cl.array.arange(queue, n, dtype=np.float32) # ------ knl = lp.make_kernel( "{ [i]: 0<=i Date: Sun, 14 Oct 2018 20:19:03 -0500 Subject: [PATCH 384/916] arg_is_output_only -> args_are_output_only --- loopy/kernel/creation.py | 4 ++-- loopy/kernel/function_interface.py | 4 ++-- loopy/kernel/tools.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index bc996d9c7..685232c61 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2166,8 +2166,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + from loopy.kernel.tools import infer_args_are_output_only + knl = infer_args_are_output_only(knl) from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index c8b5a9537..323690af7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -111,8 +111,8 @@ def get_kw_pos_association(kernel): Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in *kernel*. 
""" - from loopy.kernel.tools import infer_arg_is_output_only - kernel = infer_arg_is_output_only(kernel) + from loopy.kernel.tools import infer_args_are_output_only + kernel = infer_args_are_output_only(kernel) kw_to_pos = {} pos_to_kw = {} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 3c0c24434..3f4defc56 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1866,7 +1866,7 @@ def find_aliasing_equivalence_classes(kernel): # {{{ direction helper tools -def infer_arg_is_output_only(kernel): +def infer_args_are_output_only(kernel): """ Returns a copy of *kernel* with the attribute ``is_output_only`` set. -- GitLab From 111a5eb42b33b3d080027175533a06f57d32283a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 14 Oct 2018 20:28:15 -0500 Subject: [PATCH 385/916] minor changes after review --- loopy/kernel/function_interface.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 323690af7..268bdaa1c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -111,8 +111,6 @@ def get_kw_pos_association(kernel): Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in *kernel*. """ - from loopy.kernel.tools import infer_args_are_output_only - kernel = infer_args_are_output_only(kernel) kw_to_pos = {} pos_to_kw = {} @@ -136,7 +134,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): """ Helper class to set the :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the - callee kernels. Refer + callee kernels. Refer to :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. @@ -301,7 +299,8 @@ class InKernelCallable(ImmutableRecord): self.arg_id_to_descr is not None) def generate_preambles(self, target): - """ Yields the target specific preamble. + """ + Yields the target specific preamble. 
""" raise NotImplementedError() -- GitLab From c194c74e22513140f9e0afd92a428c42ba3fcfb6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 14 Oct 2018 20:30:27 -0500 Subject: [PATCH 386/916] program_callables_info, ProgramCallablesInfo -> callables_table, CallablesTable --- doc/tutorial.rst | 4 +- examples/python/global_barrier_removal.py | 2 +- loopy/check.py | 24 ++--- loopy/codegen/__init__.py | 28 +++--- loopy/codegen/control.py | 2 +- loopy/codegen/loop.py | 2 +- loopy/kernel/__init__.py | 16 +-- loopy/kernel/function_interface.py | 16 +-- loopy/kernel/tools.py | 12 +-- loopy/library/function.py | 12 +-- loopy/library/random123.py | 12 +-- loopy/library/reduction.py | 8 +- loopy/preprocess.py | 98 +++++++++---------- loopy/program.py | 114 +++++++++++----------- loopy/schedule/__init__.py | 18 ++-- loopy/statistics.py | 76 +++++++-------- loopy/target/__init__.py | 2 +- loopy/target/c/__init__.py | 14 +-- loopy/target/c/codegen/expression.py | 10 +- loopy/target/cuda.py | 14 +-- loopy/target/execution.py | 2 +- loopy/target/ispc.py | 4 +- loopy/target/opencl.py | 22 ++--- loopy/target/pyopencl.py | 20 ++-- loopy/target/python.py | 6 +- loopy/transform/buffer.py | 12 +-- loopy/transform/callable.py | 14 +-- loopy/transform/data.py | 12 +-- loopy/transform/fusion.py | 12 +-- loopy/transform/iname.py | 4 +- loopy/transform/instruction.py | 2 +- loopy/transform/precompute.py | 12 +-- loopy/transform/save.py | 12 +-- loopy/transform/subst.py | 2 +- loopy/type_inference.py | 80 +++++++-------- test/test_loopy.py | 14 +-- test/testlib.py | 10 +- 37 files changed, 362 insertions(+), 362 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 6a7a977a1..25082f88a 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1208,7 +1208,7 @@ happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: >>> prog = lp.preprocess_kernel(prog) - >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) >>> prog = prog.with_root_kernel(knl) >>> print(prog) --------------------------------------------------------------------------- @@ -1240,7 +1240,7 @@ that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. 
>>> prog = lp.save_and_reload_temporaries(prog) - >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) # Schedule added instructions + >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) # Schedule added instructions >>> prog = prog.with_root_kernel(knl) >>> print(prog) --------------------------------------------------------------------------- diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index cc4926fee..884fb0bd1 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -24,7 +24,7 @@ from loopy.preprocess import preprocess_kernel knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel -knl = get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info) +knl = get_one_scheduled_kernel(knl.root_kernel, knl.callables_table) # map schedule onto host or device print(knl) diff --git a/loopy/check.py b/loopy/check.py index bfcd7aa26..64cf80a4e 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -206,7 +206,7 @@ def check_multiple_tags_allowed(kernel): "tags: {1}".format(iname, tags)) -def check_for_double_use_of_hw_axes(kernel, program_callables_info): +def check_for_double_use_of_hw_axes(kernel, callables_table): from loopy.kernel.data import UniqueTag from loopy.kernel.instruction import CallInstruction from loopy.kernel.function_interface import CallableKernel @@ -224,7 +224,7 @@ def check_for_double_use_of_hw_axes(kernel, program_callables_info): # check usage of iname tags in the callee kernel if isinstance(insn, CallInstruction): - in_knl_callable = program_callables_info[ + in_knl_callable = callables_table[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): # check for collision in iname_tag keys in the instruction @@ -712,13 +712,13 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel, program_callables_info): +def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel, program_callables_info) + check_for_double_use_of_hw_axes(kernel, callables_table) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) @@ -746,7 +746,7 @@ def pre_schedule_checks(kernel, program_callables_info): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, @@ -763,7 +763,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index), - program_callables_info) + callables_table) group_axes = set(ax for ax, length in enumerate(group_size)) local_axes = set(ax for ax, length in enumerate(local_size)) @@ -781,7 +781,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, sched_item = kernel.schedule[i] if isinstance(sched_item, CallKernel): i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, - program_callables_info, i) + callables_table, i) elif 
isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -832,10 +832,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info, return past_end_i -def check_for_unused_hw_axes_in_insns(kernel, program_callables_info): +def check_for_unused_hw_axes_in_insns(kernel, callables_table): if kernel.schedule: _check_for_unused_hw_axes_in_kernel_chunk(kernel, - program_callables_info) + callables_table) # }}} @@ -989,15 +989,15 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel, program_callables_info): +def pre_codegen_checks(kernel, callables_table): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel, program_callables_info) + check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel, program_callables_info) + kernel.target.pre_codegen_check(kernel, callables_table) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d0b19a1eb..250e7215a 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -192,16 +192,16 @@ class CodeGenerationState(object): .. attribute:: schedule_index_end - .. attribute:: program_callables_info + .. attribute:: callables_table - An instance of :class:`loopy.ProgramCallablesInfo`. + An instance of :class:`loopy.CallablesTable`. """ def __init__(self, kernel, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, - program_callables_info, + callables_table, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, @@ -215,7 +215,7 @@ class CodeGenerationState(object): self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex - self.program_callables_info = program_callables_info + self.callables_table = callables_table self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -263,7 +263,7 @@ class CodeGenerationState(object): seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, - program_callables_info=self.program_callables_info, + callables_table=self.callables_table, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -385,19 +385,19 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_for_a_single_kernel(kernel, program_callables_info): +def generate_code_for_a_single_kernel(kernel, callables_table): """ :returns: a :class:`CodeGenerationResult` :param kernel: An instance of :class:`loopy.LoopKernel`. - :param program_callables_info: An instance of - :class:`loopy.ProgramCallablesInfo`. + :param callables_table: An instance of + :class:`loopy.CallablesTable`. 
""" from loopy.kernel import KernelState if kernel.schedule is None: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel, program_callables_info) + kernel = get_one_scheduled_kernel(kernel, callables_table) if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " @@ -419,7 +419,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): # }}} from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel, program_callables_info) + pre_codegen_checks(kernel, callables_table) logger.info("%s: generate code: start" % kernel.name) @@ -479,7 +479,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info): + kernel.name + kernel.target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), - program_callables_info=program_callables_info) + callables_table=callables_table) from loopy.codegen.result import generate_host_or_device_program @@ -556,17 +556,17 @@ def generate_code_v2(program): codegen_results = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): codegen_results[func_id] = ( generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.program_callables_info)) + program.callables_table)) device_preambles = set() for cgr in codegen_results.values(): device_preambles.update(cgr.device_preambles) - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): for preamble in in_knl_callable.generate_preambles(program.target): device_preambles.update([preamble]) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 90bdbda31..81a672a14 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -116,7 +116,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index), - codegen_state.program_callables_info) + codegen_state.callables_table) return merge_codegen_results(codegen_state, [ codegen_result, diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 39cf20c7d..c282de79b 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -248,7 +248,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block, codegen_state.program_callables_info) + insn_ids_for_block, codegen_state.callables_table) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 410f13322..70079d318 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1036,7 +1036,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info, + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given @@ -1048,7 +1048,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if self.overridden_get_grid_sizes_for_insn_ids: return self.overridden_get_grid_sizes_for_insn_ids( insn_ids, - program_callables_info, + 
callables_table, ignore_auto=ignore_auto) all_inames_by_insns = set() @@ -1135,7 +1135,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, - program_callables_info, ignore_auto=False): + callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. @@ -1146,7 +1146,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, program_callables_info, ignore_auto) + insn_ids, callables_table, ignore_auto) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1154,7 +1154,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, program_callables_info, ignore_auto=False): + def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1162,10 +1162,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), - program_callables_info, + callables_table, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, program_callables_info, + def get_grid_size_upper_bounds_as_exprs(self, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1175,7 +1175,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), - program_callables_info, + callables_table, ignore_auto=ignore_auto) # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 268bdaa1c..362fbcefc 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -157,7 +157,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): self.local_size = local_size self.global_size = global_size - def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + def __call__(self, insn_ids, callables_table, ignore_auto=True): return self.local_size, self.global_size # }}} @@ -214,7 +214,7 @@ class InKernelCallable(ImmutableRecord): update_persistent_hash = LoopKernel.update_persistent_hash - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): """ :arg arg_id_to_type: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -234,7 +234,7 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -363,16 +363,16 @@ class ScalarCallable(InKernelCallable): return (self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): raise LoopyError("No type inference information present for " "the function %s." 
% (self.name)) - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): arg_id_to_descr[-1] = ValueArgDescriptor() return ( self.copy(arg_id_to_descr=arg_id_to_descr), - program_callables_info) + callables_table) def with_hw_axes_sizes(self, global_size, local_size): return self.copy() @@ -564,7 +564,7 @@ class ManglerCallable(ScalarCallable): return (self.name, self.function_mangler, self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): if self.arg_id_to_dtype is not None: # specializing an already specialized function. for arg_id, dtype in arg_id_to_dtype.items(): @@ -588,7 +588,7 @@ class ManglerCallable(ScalarCallable): return ( self.copy(name_in_target=mangle_result.target_name, arg_id_to_dtype=new_arg_id_to_dtype), - program_callables_info) + callables_table) else: # The function mangler does not agree with the arg id to dtypes # provided. Indicating that is illegal. diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 3f4defc56..006ac6ba3 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -755,7 +755,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=None): +def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -769,7 +769,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - program_callables_info, ignore_auto=True) + callables_table, ignore_auto=True) # {{{ axis assignment helper function @@ -797,7 +797,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non return assign_automatic_axes( kernel.copy(iname_to_tags=new_iname_to_tags), - program_callables_info, + callables_table, axis=recursion_axis) if axis is None: @@ -849,7 +849,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), - program_callables_info=program_callables_info, + callables_table=callables_table, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -871,7 +871,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non del new_iname_to_tags[iname] return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags), - program_callables_info, axis=recursion_axis, local_size=local_size) + callables_table, axis=recursion_axis, local_size=local_size) # }}} @@ -940,7 +940,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non return kernel else: return assign_automatic_axes(kernel, - program_callables_info=program_callables_info, axis=axis+1, + callables_table=callables_table, axis=axis+1, local_size=local_size) # }}} diff --git a/loopy/library/function.py b/loopy/library/function.py index f3fb5f8cd..f225b62f9 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -26,33 +26,33 @@ from loopy.kernel.function_interface import ScalarCallable class MakeTupleCallable(ScalarCallable): - def with_types(self, 
arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): new_arg_id_to_dtype = arg_id_to_dtype.copy() for i in range(len(arg_id_to_dtype)): if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target="loopy_make_tuple"), program_callables_info) + name_in_target="loopy_make_tuple"), callables_table) - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) return ( self.copy(arg_id_to_descr=new_arg_id_to_descr), - program_callables_info) + callables_table) class IndexOfCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): new_arg_id_to_dtype = dict((i, dtype) for i, dtype in arg_id_to_dtype.items() if dtype is not None) new_arg_id_to_dtype[-1] = kernel.index_dtype return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), - program_callables_info) + callables_table) def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier): diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 397e985b4..e59a892bb 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -169,14 +169,14 @@ class Random123Callable(ScalarCallable): Records information about for the random123 functions. """ - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): # the types provided aren't mature enough to specialize the # callable return (self.copy(), - program_callables_info) + callables_table) name = self.name target = kernel.target @@ -195,7 +195,7 @@ class Random123Callable(ScalarCallable): return ( self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=fn+"_gen"), - program_callables_info) + callables_table) elif name == fn + "_f32": new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32), @@ -203,7 +203,7 @@ class Random123Callable(ScalarCallable): -2: ctr_dtype, 0: ctr_dtype, 1: key_dtype} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name), program_callables_info + name_in_target=name), callables_table elif name == fn + "_f64": new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64), @@ -211,10 +211,10 @@ class Random123Callable(ScalarCallable): -2: ctr_dtype, 0: ctr_dtype, 1: key_dtype} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name), program_callables_info + name_in_target=name), callables_table return (self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def generate_preambles(self, target): rng_variant = FUNC_NAMES_TO_RNG[self.name] diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 70df864d4..7c32d0bed 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -424,7 +424,7 @@ def parse_reduction_op(name): # {{{ reduction specific callables class ReductionCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def 
with_types(self, arg_id_to_dtype, kernel, callables_table): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype, @@ -436,15 +436,15 @@ class ReductionCallable(ScalarCallable): index_dtype) + "_op" return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, - name_in_target=name_in_target), program_callables_info + name_in_target=name_in_target), callables_table - def with_descr(self, arg_id_to_descr, program_callables_info): + def with_descr(self, arg_id_to_descr, callables_table): from loopy.library.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() return ( self.copy(arg_id_to_descr=arg_id_to_descr), - program_callables_info) + callables_table) def generate_preambles(self, target): if isinstance(self.name, ArgExtOp): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1042c857d..85b0c6d48 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -890,7 +890,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -def realize_reduction_for_single_kernel(kernel, program_callables_info, +def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* @@ -1012,7 +1012,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ sequential - def map_reduction_seq(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1130,7 +1130,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1370,7 +1370,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ sequential scan - def map_scan_seq(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1459,7 +1459,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ local-parallel scan - def map_scan_local(expr, rec, program_callables_info, nresults, arg_dtypes, + def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): @@ -1468,7 +1468,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, assert scan_size > 0 if scan_size == 1: - return map_reduction_seq(expr, rec, program_callables_info, + return map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = temp_kernel.insn_inames(insn) @@ -1668,15 +1668,15 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, # {{{ seq/par dispatch - def map_reduction(expr, rec, program_callables_info, nresults=1): + def map_reduction(expr, rec, callables_table, nresults=1): # Only expand one level of reduction at a time, 
going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes, program_callables_info = ( + arg_dtypes, reduction_dtypes, callables_table = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, program_callables_info, unknown_types_ok)) + temp_kernel, expr, callables_table, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1785,7 +1785,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, program_callables_info, nresults, + expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, @@ -1793,7 +1793,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, program_callables_info, nresults, + expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, @@ -1814,12 +1814,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, if n_sequential: assert n_local_par == 0 - return map_reduction_seq(expr, rec, program_callables_info, + return map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, program_callables_info, nresults, arg_dtypes, + expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes) # }}} @@ -1854,12 +1854,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info, from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: new_expressions = cb_mapper(insn.expression, - program_callables_info=program_callables_info, + callables_table=callables_table, nresults=nresults) else: new_expressions = ( cb_mapper(insn.expression, - program_callables_info=program_callables_info),) + callables_table=callables_table),) if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1952,10 +1952,10 @@ def realize_reduction(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = realize_reduction_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -1968,9 +1968,9 @@ def realize_reduction(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} @@ -2153,11 +2153,11 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): """ def __init__(self, rule_mapping_context, caller_kernel, - program_callables_info): + callables_table): super(ArgDescrInferenceMapper, 
self).__init__( rule_mapping_context) self.caller_kernel = caller_kernel - self.program_callables_info = program_callables_info + self.callables_table = callables_table def map_call(self, expr, expn_state, **kwargs): from pymbolic.primitives import Call, CallWithKwargs @@ -2193,12 +2193,12 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): combined_arg_id_to_descr.update(assignee_id_to_descr) # specializing the function according to the parameter description - in_knl_callable = self.program_callables_info[expr.function.name] - new_in_knl_callable, self.program_callables_info = ( + in_knl_callable = self.callables_table[expr.function.name] + new_in_knl_callable, self.callables_table = ( in_knl_callable.with_descrs( - combined_arg_id_to_descr, self.program_callables_info)) - self.program_callables_info, new_func_id = ( - self.program_callables_info.with_callable( + combined_arg_id_to_descr, self.callables_table)) + self.callables_table, new_func_id = ( + self.callables_table.with_callable( expr.function.function, new_in_knl_callable)) @@ -2242,7 +2242,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): return kernel.copy(instructions=new_insns) -def traverse_to_infer_arg_descr(kernel, program_callables_info): +def traverse_to_infer_arg_descr(kernel, callables_table): """ Returns a copy of *kernel* with the argument shapes and strides matching for scoped functions in the *kernel*. Refer @@ -2258,12 +2258,12 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info): kernel.substitutions, kernel.get_var_name_generator()) arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, - kernel, program_callables_info) + kernel, callables_table) descr_inferred_kernel = rule_mapping_context.finish_kernel( arg_descr_inf_mapper.map_kernel(kernel)) - return descr_inferred_kernel, arg_descr_inf_mapper.program_callables_info + return descr_inferred_kernel, arg_descr_inf_mapper.callables_table def infer_arg_descr(program): @@ -2272,23 +2272,23 @@ def infer_arg_descr(program): :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the callables. 
""" - root_kernel_callable = program.program_callables_info[program.name] - old_callables_count = program.program_callables_info.callables_count - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) + root_kernel_callable = program.callables_table[program.name] + old_callables_count = program.callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) root_kernel = program.root_kernel - new_root_kernel, program_callables_info = traverse_to_infer_arg_descr( - root_kernel, program_callables_info) + new_root_kernel, callables_table = traverse_to_infer_arg_descr( + root_kernel, callables_table) new_root_kernel_callable = root_kernel_callable.copy( subkernel=new_root_kernel) - program_callables_info, _ = program_callables_info.with_callable(program.name, + callables_table, _ = callables_table.with_callable(program.name, new_root_kernel_callable) - program_callables_info = program_callables_info.with_exit_edit_callables_mode( + callables_table = callables_table.with_exit_edit_callables_mode( old_callables_count) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) # }}} @@ -2298,7 +2298,7 @@ preprocess_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def preprocess_single_kernel(kernel, program_callables_info, device=None): +def preprocess_single_kernel(kernel, callables_table, device=None): from loopy.kernel import KernelState if kernel.state >= KernelState.PREPROCESSED: return kernel @@ -2356,7 +2356,7 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None): # because it manipulates the depends_on field, which could prevent # defaults from being applied. kernel = realize_reduction_for_single_kernel(kernel, - program_callables_info, unknown_types_ok=False) + callables_table, unknown_types_ok=False) # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators @@ -2420,7 +2420,7 @@ def infer_hw_axes_sizes(program): resolved_function_with_hw_axes_sizes_inferred = {} for func_id, in_knl_callable in ( - program.program_callables_info.items()): + program.callables_table.items()): if func_id == program.name: resolved_function_with_hw_axes_sizes_inferred[func_id] = ( in_knl_callable) @@ -2428,11 +2428,11 @@ def infer_hw_axes_sizes(program): resolved_function_with_hw_axes_sizes_inferred[func_id] = ( in_knl_callable.with_hw_axes_sizes(local_size, global_size)) - new_program_callables_info = ( - program.program_callables_info.copy( + new_callables_table = ( + program.callables_table.copy( resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} @@ -2451,16 +2451,16 @@ def preprocess_program(program, device=None): # Callable editing restrictions: # - # - should not edit program_callables_info in :meth:`preprocess_single_kernel` + # - should not edit callables_table in :meth:`preprocess_single_kernel` # as we are iterating over it.[1] # # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = preprocess_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + 
in_knl_callable.subkernel, program.callables_table, device) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -2472,9 +2472,9 @@ def preprocess_program(program, device=None): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - program = program.copy(program_callables_info=new_program_callables_info) + program = program.copy(callables_table=new_callables_table) # }}} diff --git a/loopy/program.py b/loopy/program.py index 7479ee043..f7c399c1e 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -47,7 +47,7 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: Program -.. autoclass:: ProgramCallablesInfo +.. autoclass:: CallablesTable .. autofunction:: make_program_from_kernel .. autofunction:: iterate_over_kernels_if_given_program @@ -73,11 +73,11 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): :arg function_ids: A container with instances of :class:`str` indicating the function identifiers to look for while scoping functions. """ - def __init__(self, rule_mapping_context, kernel, program_callables_info, + def __init__(self, rule_mapping_context, kernel, callables_table, function_id_to_in_knl_callable_mappers): super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) self.kernel = kernel - self.program_callables_info = program_callables_info + self.callables_table = callables_table self.function_id_to_in_knl_callable_mappers = ( function_id_to_in_knl_callable_mappers) @@ -123,8 +123,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): # associate the newly created ResolvedFunction with the # resolved in-kernel callable - self.program_callables_info, new_func_id = ( - self.program_callables_info.with_added_callable( + self.callables_table, new_func_id = ( + self.callables_table.with_added_callable( expr.function, in_knl_callable)) return type(expr)( ResolvedFunction(new_func_id), @@ -144,8 +144,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): expr.operation.get_scalar_callables()): in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) assert in_knl_callable is not None - self.program_callables_info, _ = ( - self.program_callables_info.with_added_callable(func_id, + self.callables_table, _ = ( + self.callables_table.with_added_callable(func_id, in_knl_callable)) return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) @@ -162,37 +162,37 @@ def _default_func_id_to_kernel_callable_mappers(target): ))) -def initialize_program_callables_info_from_kernel(kernel): +def initialize_callables_table_from_kernel(kernel): """ - Returns an instance of :class:`loopy.ProgramCallablesInfo`, by resolving + Returns an instance of :class:`loopy.CallablesTable`, by resolving the functions based on :mod:`loopy`'s default function resolvers. 
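A stand-alone sketch of the construction order documented above: resolve the functions a kernel calls, collect them into a table, add the root kernel itself as an entry under its own name, and wrap the result in a program-like record. FakeProgram and make_fake_program are hypothetical stand-ins rather than the real classes.

from collections import namedtuple

FakeProgram = namedtuple("FakeProgram", "name callables_table")


def make_fake_program(root_kernel_name, resolved_scalar_callables):
    # the scalar callables found while resolving the kernel's expressions ...
    callables_table = dict(resolved_scalar_callables)
    # ... plus the root kernel itself, stored under its own name
    callables_table[root_kernel_name] = "callable kernel: " + root_kernel_name
    return FakeProgram(name=root_kernel_name, callables_table=callables_table)


prog = make_fake_program("loopy_kernel", {"sin": "scalar callable: sin"})
assert prog.name in prog.callables_table  # mirrors the assert in Program.__init__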
""" # collect the default function resolvers func_id_to_kernel_callable_mappers = ( _default_func_id_to_kernel_callable_mappers(kernel.target)) - program_callables_info = ProgramCallablesInfo({}) + callables_table = CallablesTable({}) from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, kernel, program_callables_info, + rule_mapping_context, kernel, callables_table, func_id_to_kernel_callable_mappers) # mark the functions as "Resolved" in the expression nodes. kernel_with_functions_resolved = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) - # collect the update program_callables_info - program_callables_info = resolved_function_marker.program_callables_info + # collect the update callables_table + callables_table = resolved_function_marker.callables_table callable_kernel = CallableKernel(kernel_with_functions_resolved) - # add the callable kernel to the program_callables_info - program_callables_info, _ = program_callables_info.with_added_callable( + # add the callable kernel to the callables_table + callables_table, _ = callables_table.with_added_callable( Variable(kernel.name), callable_kernel) - return program_callables_info + return callables_table # {{{ program definition @@ -206,9 +206,9 @@ class Program(ImmutableRecord): An instance of :class:`str`, also the name of the top-most level :class:`loopy.LoopKernel`. - .. attribute:: program_callables_info + .. attribute:: callables_table - An instance of :class:`loopy.program.ProgramCallablesInfo`. + An instance of :class:`loopy.program.CallablesTable`. .. attribute:: target @@ -232,16 +232,16 @@ class Program(ImmutableRecord): """ def __init__(self, name, - program_callables_info, + callables_table, target, func_id_to_in_knl_callable_mappers): - assert isinstance(program_callables_info, ProgramCallablesInfo) + assert isinstance(callables_table, CallablesTable) - assert name in program_callables_info + assert name in callables_table super(Program, self).__init__( name=name, - program_callables_info=program_callables_info, + callables_table=callables_table, target=target, func_id_to_in_knl_callable_mappers=( func_id_to_in_knl_callable_mappers)) @@ -250,7 +250,7 @@ class Program(ImmutableRecord): hash_fields = ( "name", - "program_callables_info", + "callables_table", "target",) update_persistent_hash = LoopKernel.update_persistent_hash @@ -262,7 +262,7 @@ class Program(ImmutableRecord): new_self = super(Program, self).copy(**kwargs) new_resolved_functions = {} for func_id, in_knl_callable in ( - new_self.program_callables_info.items()): + new_self.callables_table.items()): if isinstance(in_knl_callable, CallableKernel): subkernel = in_knl_callable.subkernel new_resolved_functions[func_id] = in_knl_callable.copy( @@ -270,11 +270,11 @@ class Program(ImmutableRecord): else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = new_self.program_callables_info.copy( + callables_table = new_self.callables_table.copy( resolved_functions=new_resolved_functions) return super(Program, new_self).copy( - program_callables_info=program_callables_info) + callables_table=callables_table) else: return super(Program, self).copy(**kwargs) @@ -285,7 +285,7 @@ class Program(ImmutableRecord): *global_size* and *local_size* are :class:`islpy.PwAff` objects. 
""" return self.root_kernel.get_grid_size_upper_bounds( - self.program_callables_info, + self.callables_table, ignore_auto=ignore_auto) def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): @@ -295,7 +295,7 @@ class Program(ImmutableRecord): *global_size* and *local_size* are :mod:`pymbolic` expressions """ return self.root_kernel.get_grid_size_upper_bounds_as_exprs( - self.program_callables_info, + self.callables_table, ignore_auto=ignore_auto) # {{{ implementation arguments @@ -338,7 +338,7 @@ class Program(ImmutableRecord): Syntactic sugar. """ - return self.program_callables_info[self.name].subkernel + return self.callables_table[self.name].subkernel @property def arg_dict(self): @@ -367,14 +367,14 @@ class Program(ImmutableRecord): Returns a copy of *self* with the topmost level kernel as *root_kernel*. """ - new_in_knl_callable = self.program_callables_info[ + new_in_knl_callable = self.callables_table[ self.name].copy(subkernel=root_kernel) new_resolved_functions = ( - self.program_callables_info.resolved_functions.copy()) + self.callables_table.resolved_functions.copy()) new_resolved_functions[self.name] = new_in_knl_callable return self.copy( - program_callables_info=self.program_callables_info.copy( + callables_table=self.callables_table.copy( resolved_functions=new_resolved_functions)) def __call__(self, *args, **kwargs): @@ -462,14 +462,14 @@ def rename_resolved_functions_in_a_single_kernel(kernel, class CallablesCountingMapper(CombineMapper): """ Returns an instance of :class:`collections.Counter` with the count of - callables registered in *program_callables_info*. + callables registered in *callables_table*. - .. attribute:: program_callables_info + .. attribute:: callables_table - An instance of :class:`loopy.program.ProgramCallablesInfo`. + An instance of :class:`loopy.program.CallablesTable`. """ - def __init__(self, program_callables_info): - self.program_callables_info = program_callables_info + def __init__(self, callables_table): + self.callables_table = callables_table def combine(self, values): return sum(values, Counter()) @@ -483,7 +483,7 @@ class CallablesCountingMapper(CombineMapper): kw_parameters = {} if isinstance(expr.function, (ResolvedFunction)): - in_knl_callable = self.program_callables_info[expr.function.name] + in_knl_callable = self.callables_table[expr.function.name] if isinstance(in_knl_callable, ScalarCallable): return (Counter([expr.function.name]) + self.combine((self.rec(child) for child in expr.parameters @@ -495,7 +495,7 @@ class CallablesCountingMapper(CombineMapper): callables_count_in_subkernel = ( count_callables_in_kernel( in_knl_callable.subkernel, - self.program_callables_info)) + self.callables_table)) return (Counter([expr.function.name]) + self.combine((self.rec(child) for child in expr.parameters @@ -525,16 +525,16 @@ class CallablesCountingMapper(CombineMapper): @memoize_method -def count_callables_in_kernel(kernel, program_callables_info): +def count_callables_in_kernel(kernel, callables_table): """ Returns an instance of :class:`collections.Counter` representing the number of callables in the *kernel* that are registered in - *program_callables_info*. + *callables_table*. 
""" assert isinstance(kernel, LoopKernel) callables_count = Counter() callables_counting_mapper = CallablesCountingMapper( - program_callables_info) + callables_table) subst_expander = SubstitutionRuleExpander(kernel.substitutions) for insn in kernel.instructions: @@ -555,7 +555,7 @@ def count_callables_in_kernel(kernel, program_callables_info): # {{{ program callables info -class ProgramCallablesInfo(ImmutableRecord): +class CallablesTable(ImmutableRecord): # FIXME: is CallablesTable a better name?(similar to symbol table in # compilers.) """ @@ -594,7 +594,7 @@ class ProgramCallablesInfo(ImmutableRecord): history = dict((func_id, frozenset([func_id])) for func_id in resolved_functions) - super(ProgramCallablesInfo, self).__init__( + super(CallablesTable, self).__init__( resolved_functions=resolved_functions, history=history, is_being_edited=is_being_edited) @@ -618,7 +618,7 @@ class ProgramCallablesInfo(ImmutableRecord): def callables_count(self): """ Returns an instance of :class:`collection.Counter` representing the number - of times the callables is called in program_callables_info. + of times the callables is called in callables_table. """ # should raise an error if there are more than one root kernels(which is # illegal) @@ -648,24 +648,24 @@ class ProgramCallablesInfo(ImmutableRecord): .. note:: - Always checks whether the - :attr:``loopy.ProgramCallablesInfo.resolved_functions` has + :attr:``loopy.CallablesTable.resolved_functions` has *in_kernel_callable*, does not introduce copies. - The difference between - :meth:`loopy.ProgramCallablesInfo.with_added_callable` - and :meth:`ProgramCallablesInfo.with_callable` being that + :meth:`loopy.CallablesTable.with_added_callable` + and :meth:`CallablesTable.with_callable` being that the former has no support for renaming the callable back i.e. ``with_callable`` supports renaming from ``sin_0`` to ``sin``, if possible, through the member method - ``loopy.ProgramCallablesInfo.with_exit_edit_callables_mode`` + ``loopy.CallablesTable.with_exit_edit_callables_mode`` This subtle difference makes -- - - :meth:`loopy.ProgramCallablesInfo.with_added_callable` suitable + - :meth:`loopy.CallablesTable.with_added_callable` suitable for usage while resolving the functions first time, where no renaming is needed. - - :meth:`loopy.ProgramCallablesInfo.with_callable` suitable for + - :meth:`loopy.CallablesTable.with_callable` suitable for implementing edits in callables during inference-walks. """ @@ -745,7 +745,7 @@ class ProgramCallablesInfo(ImmutableRecord): def with_callable(self, function, in_kernel_callable): """ Returns an instance of :class:`tuple` ``(new_self, new_function)``. 
- Also refer -- :meth:`loopy.ProgramCallablesInfo.with_added_callable` + Also refer -- :meth:`loopy.CallablesTable.with_added_callable` :arg function: An instance of :class:`pymbolic.primitives.Variable` or @@ -929,12 +929,12 @@ def make_program_from_kernel(kernel): """ # get the program callables info - program_callables_info = initialize_program_callables_info_from_kernel(kernel) + callables_table = initialize_callables_table_from_kernel(kernel) # get the program from program callables info program = Program( name=kernel.name, - program_callables_info=program_callables_info, + callables_table=callables_table, func_id_to_in_knl_callable_mappers=( _default_func_id_to_kernel_callable_mappers(kernel.target)), target=kernel.target) @@ -953,7 +953,7 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): if isinstance(program_or_kernel, Program): program = program_or_kernel new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = transform_for_single_kernel( in_knl_callable.subkernel, *args, **kwargs) @@ -968,9 +968,9 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) else: assert isinstance(program_or_kernel, LoopKernel) kernel = program_or_kernel diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 201bcc256..2b3f7a3b9 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1832,7 +1832,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, program_callables_info, debug_args={}): +def generate_loop_schedules(kernel, callables_table, debug_args={}): """ .. 
warning:: @@ -1846,18 +1846,18 @@ def generate_loop_schedules(kernel, program_callables_info, debug_args={}): with MinRecursionLimitForScheduling(kernel): for sched in generate_loop_schedules_inner(kernel, - program_callables_info, debug_args=debug_args): + callables_table, debug_args=debug_args): yield sched -def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}): +def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel, program_callables_info) + pre_schedule_checks(kernel, callables_table) schedule_count = 0 @@ -1971,7 +1971,7 @@ def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}) kernel, gen_sched) gsize, lsize = ( - kernel.get_grid_size_upper_bounds(program_callables_info)) + kernel.get_grid_size_upper_bounds(callables_table)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2028,7 +2028,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel, program_callables_info): +def _get_one_scheduled_kernel_inner(kernel, callables_table): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2038,10 +2038,10 @@ def _get_one_scheduled_kernel_inner(kernel, program_callables_info): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. - return next(iter(generate_loop_schedules(kernel, program_callables_info))) + return next(iter(generate_loop_schedules(kernel, callables_table))) -def get_one_scheduled_kernel(kernel, program_callables_info): +def get_one_scheduled_kernel(kernel, callables_table): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2060,7 +2060,7 @@ def get_one_scheduled_kernel(kernel, program_callables_info): with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): result = _get_one_scheduled_kernel_inner(kernel, - program_callables_info) + callables_table) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index 5dddd49e0..d65387d16 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -648,11 +648,11 @@ class MemAccess(Record): # {{{ counter base class CounterBase(CombineMapper): - def __init__(self, knl, program_callables_info): + def __init__(self, knl, callables_table): self.knl = knl - self.program_callables_info = program_callables_info + self.callables_table = callables_table from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl, program_callables_info) + self.type_inf = TypeInferenceMapper(knl, callables_table) def combine(self, values): return sum(values) @@ -707,11 +707,11 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, program_callables_info): + def __init__(self, knl, callables_table): self.knl = knl - self.program_callables_info = program_callables_info + self.callables_table = callables_table from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl, program_callables_info) + 
self.type_inf = TypeInferenceMapper(knl, callables_table) def combine(self, values): return sum(values) @@ -725,7 +725,7 @@ class ExpressionOpCounter(CounterBase): def map_call(self, expr): from loopy.symbolic import ResolvedFunction if isinstance(expr.function, ResolvedFunction): - function_identifier = self.program_callables_info[ + function_identifier = self.callables_table[ expr.function.name].name else: function_identifier = expr.function.name @@ -1111,7 +1111,7 @@ def count(kernel, set, space=None): from loopy.program import Program if isinstance(kernel, Program): if len([in_knl_callable for in_knl_callable in - kernel.program_callables_info.values() if isinstance(in_knl_callable, + kernel.callables_table.values() if isinstance(in_knl_callable, CallableKernel)]) != 1: raise NotImplementedError("Currently only supported for program with " "only one CallableKernel.") @@ -1216,10 +1216,10 @@ def count(kernel, set, space=None): return add_assumptions_guard(kernel, count) -def get_unused_hw_axes_factor(knl, program_callables_info, insn, +def get_unused_hw_axes_factor(knl, callables_table, insn, disregard_local_axes, space=None): # FIXME: Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds(program_callables_info) + gsize, lsize = knl.get_grid_size_upper_bounds(callables_table) g_used = set() l_used = set() @@ -1257,7 +1257,7 @@ def get_unused_hw_axes_factor(knl, program_callables_info, insn, return add_assumptions_guard(knl, result) -def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, +def count_insn_runs(knl, callables_table, insn, count_redundant_work, disregard_local_axes=False): insn_inames = knl.insn_inames(insn) @@ -1278,7 +1278,7 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, c = count(knl, domain, space=space) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, program_callables_info, + unused_fac = get_unused_hw_axes_factor(knl, callables_table, insn, disregard_local_axes=disregard_local_axes, space=space) return c * unused_fac else: @@ -1286,7 +1286,7 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work, @memoize_method -def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, +def _get_insn_count(knl, callables_table, insn_id, subgroup_size, count_redundant_work, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] @@ -1299,12 +1299,12 @@ def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, program_callables_info, insn, + knl, callables_table, insn, count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, program_callables_info, insn, disregard_local_axes=True, + knl, callables_table, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: @@ -1312,7 +1312,7 @@ def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, elif count_granularity == CountGranularity.SUBGROUP: # get the group size from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds(program_callables_info) + _, local_size = knl.get_grid_size_upper_bounds(callables_table) workgroup_size = 1 if local_size: for size in local_size: @@ -1344,7 +1344,7 @@ def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size, # {{{ get_op_map -def get_op_map_for_single_kernel(knl, 
program_callables_info, +def get_op_map_for_single_kernel(knl, callables_table, numpy_types=True, count_redundant_work=False, subgroup_size=None): @@ -1355,7 +1355,7 @@ def get_op_map_for_single_kernel(knl, program_callables_info, subgroup_size = _process_subgroup_size(knl, subgroup_size) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl, program_callables_info) + op_counter = ExpressionOpCounter(knl, callables_table) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1368,7 +1368,7 @@ def get_op_map_for_single_kernel(knl, program_callables_info, op_map = ( op_map + ToCountMap({key: val}) - * _get_insn_count(knl, program_callables_info, insn.id, + * _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, key.count_granularity)) @@ -1458,13 +1458,13 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, op_map = ToCountMap() callables_count = ( - program.program_callables_info.callables_count) + program.callables_table.callables_count) - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_op_map = get_op_map_for_single_kernel(knl, - program.program_callables_info, numpy_types, + program.callables_table, numpy_types, count_redundant_work, subgroup_size) for i in range(callables_count[func_id]): @@ -1535,7 +1535,7 @@ def _process_subgroup_size(knl, subgroup_size_requested): # {{{ get_mem_access_map -def get_mem_access_map_for_single_kernel(knl, program_callables_info, +def get_mem_access_map_for_single_kernel(knl, callables_table, numpy_types=True, count_redundant_work=False, subgroup_size=None): if not knl.options.ignore_boostable_into: @@ -1545,8 +1545,8 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, subgroup_size = _process_subgroup_size(knl, subgroup_size) access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl, program_callables_info) - access_counter_l = LocalMemAccessCounter(knl, program_callables_info) + access_counter_g = GlobalMemAccessCounter(knl, callables_table) + access_counter_l = LocalMemAccessCounter(knl, callables_table) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1569,7 +1569,7 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, access_map = ( access_map + ToCountMap({key: val}) - * _get_insn_count(knl, program_callables_info, insn.id, + * _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, key.count_granularity)) @@ -1578,7 +1578,7 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info, access_map = ( access_map + ToCountMap({key: val}) - * _get_insn_count(knl, program_callables_info, insn.id, + * _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, key.count_granularity)) @@ -1700,13 +1700,13 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, access_map = ToCountMap() - callables_count = program.program_callables_info.callables_count + callables_count = program.callables_table.callables_count - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_access_map = get_mem_access_map_for_single_kernel(knl, - program.program_callables_info, 
numpy_types, + program.callables_table, numpy_types, count_redundant_work, subgroup_size) # FIXME: didn't see any easy way to multiply @@ -1726,7 +1726,7 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, # {{{ get_synchronization_map -def get_synchronization_map_for_single_kernel(knl, program_callables_info, +def get_synchronization_map_for_single_kernel(knl, callables_table, subgroup_size=None): """Count the number of synchronization events each work-item encounters in @@ -1772,7 +1772,7 @@ def get_synchronization_map_for_single_kernel(knl, program_callables_info, from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) from operator import mul - knl = lp.get_one_scheduled_kernel(knl, program_callables_info) + knl = lp.get_one_scheduled_kernel(knl, callables_table) iname_list = [] result = ToCountMap() @@ -1824,13 +1824,13 @@ def get_synchronization_map(program, subgroup_size=None): program = preprocess_program(program) sync_map = ToCountMap() - callables_count = program.program_callables_info.callables_count + callables_count = program.callables_table.callables_count - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_sync_map = get_synchronization_map_for_single_kernel(knl, - program.program_callables_info, subgroup_size) + program.callables_table, subgroup_size) # FIXME: didn't see any easy way to multiply for i in range(callables_count[func_id]): @@ -1887,7 +1887,7 @@ def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False) def gather_access_footprints(program, ignore_uncountable=False): # FIMXE: works only for one callable kernel till now. if len([in_knl_callable for in_knl_callable in - program.program_callables_info.values() if isinstance(in_knl_callable, + program.callables_table.values() if isinstance(in_knl_callable, CallableKernel)]) != 1: raise NotImplementedError("Currently only supported for program with " "only one CallableKernel.") @@ -1900,9 +1900,9 @@ def gather_access_footprints(program, ignore_uncountable=False): write_footprints = [] read_footprints = [] - callables_count = program.program_callables_info.callables_count + callables_count = program.callables_table.callables_count - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_write_footprints, knl_read_footprints = ( diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 92ee2dc51..f27ee4e96 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -80,7 +80,7 @@ class TargetBase(object): def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel, program_callables_info): + def pre_codegen_check(self, kernel, callables_table): pass # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 418ce0256..9b5aaf8e9 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -362,7 +362,7 @@ class CMathCallable(ScalarCallable): C-Target. 
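A simplified, stand-alone sketch of the with_types convention visible throughout these target hunks: the callable returns a specialized copy of itself together with the callables table (passed through unchanged here); if the argument types are not yet known, it returns an unspecialized copy so that inference can retry later. FakeScalarCallable is a stand-in, not the real ScalarCallable.

class FakeScalarCallable:
    def __init__(self, name, arg_id_to_dtype=None, name_in_target=None):
        self.name = name
        self.arg_id_to_dtype = arg_id_to_dtype
        self.name_in_target = name_in_target

    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
        if arg_id_to_dtype.get(0) is None:
            # argument type not yet known: defer specialization
            return (FakeScalarCallable(self.name, dict(arg_id_to_dtype)),
                    callables_table)
        dtype = arg_id_to_dtype[0]
        # record the return type (key -1) and the name to emit in target code
        return (FakeScalarCallable(self.name, {-1: dtype, 0: dtype},
                                   name_in_target=self.name),
                callables_table)


clbl, table = FakeScalarCallable("sqrt").with_types({0: "float64"}, None, {})
print(clbl.name_in_target, clbl.arg_id_to_dtype)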
""" - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): name = self.name if name in ["abs", "min", "max"]: @@ -381,7 +381,7 @@ class CMathCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] dtype = dtype.numpy_dtype @@ -409,7 +409,7 @@ class CMathCallable(ScalarCallable): self.copy(name_in_target=name, arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}), - program_callables_info) + callables_table) # binary functions if name in ["fmax", "fmin"]: @@ -424,7 +424,7 @@ class CMathCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() @@ -449,11 +449,11 @@ class CMathCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), - program_callables_info) + callables_table) return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def scope_c_math_functions(target, identifier): @@ -893,7 +893,7 @@ class CASTBuilder(ASTBuilderBase): ecm = codegen_state.expression_to_code_mapper func_id = insn.expression.function.name - in_knl_callable = codegen_state.program_callables_info[func_id] + in_knl_callable = codegen_state.callables_table[func_id] if isinstance(in_knl_callable, ScalarCallable) and ( in_knl_callable.name_in_target == 'loopy_make_tuple'): diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 65a8c2028..289877d9a 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -55,7 +55,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): if type_inf_mapper is None: type_inf_mapper = TypeInferenceMapper(self.kernel, - self.codegen_state.program_callables_info) + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -389,7 +389,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): # {{{ implement indexof, indexof_vec identifier_name = ( - self.codegen_state.program_callables_info[expr.function.name].name) + self.codegen_state.callables_table[expr.function.name].name) if identifier_name in ["indexof", "indexof_vec"]: if len(expr.parameters) != 1: raise LoopyError("%s takes exactly one argument" % identifier_name) @@ -432,11 +432,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): # }}} from loopy.kernel.function_interface import ManglerCallable - if isinstance(self.codegen_state.program_callables_info[expr.function.name], + if isinstance(self.codegen_state.callables_table[expr.function.name], ManglerCallable): from loopy.codegen import SeenFunction in_knl_callable = ( - self.codegen_state.program_callables_info[ + self.codegen_state.callables_table[ expr.function.name]) mangle_result = in_knl_callable.mangle_result(self.kernel) self.codegen_state.seen_functions.add( @@ -445,7 +445,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): mangle_result.arg_dtypes)) return ( - self.codegen_state.program_callables_info[ + self.codegen_state.callables_table[ expr.function.name].emit_call( expression_to_code_mapper=self, expression=expr, diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index e6abf73fd..32b810eb3 100644 --- a/loopy/target/cuda.py +++ 
b/loopy/target/cuda.py @@ -123,7 +123,7 @@ _CUDA_SPECIFIC_FUNCTIONS = { class CudaCallable(ScalarCallable): def cuda_with_types(self, arg_id_to_dtype, caller_kernel, - program_callables_info): + callables_table): name = self.name @@ -138,7 +138,7 @@ class CudaCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"] @@ -146,7 +146,7 @@ class CudaCallable(ScalarCallable): self.copy(name_in_target=name, arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}), - program_callables_info) + callables_table) if name in _CUDA_SPECIFIC_FUNCTIONS: num_args = _CUDA_SPECIFIC_FUNCTIONS[name] @@ -161,7 +161,7 @@ class CudaCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in @@ -177,11 +177,11 @@ class CudaCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype=updated_arg_id_to_dtype), - program_callables_info) + callables_table) return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def scope_cuda_functions(target, identifier): @@ -303,7 +303,7 @@ class CUDACASTBuilder(CASTBuilder): codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( codegen_state.kernel.schedule, schedule_index), - codegen_state.program_callables_info) + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 43963ddb2..c067bc4b9 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -763,7 +763,7 @@ class KernelExecutorBase(object): from loopy.schedule import get_one_scheduled_kernel program = program.with_root_kernel( get_one_scheduled_kernel(program.root_kernel, - program.program_callables_info)) + program.callables_table)) return program diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index f8c42ad69..94a81a65a 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -172,9 +172,9 @@ class ISPCTarget(CTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel, program_callables_info): + def pre_codegen_check(self, kernel, callables_table): gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( - program_callables_info) + callables_table) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index d8c195de2..ea29665ac 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -172,7 +172,7 @@ class OpenCLCallable(ScalarCallable): :class:`loopy.target.c.CMathCallable`. 
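A mock sketch of the pattern in the execution hunk above: the root kernel is pulled out of the table, scheduled, and swapped back in via with_root_kernel, which rebuilds the table with the new entry. FakeProgram and fake_schedule below are hypothetical stand-ins.

class FakeProgram:
    def __init__(self, name, callables_table):
        self.name = name
        self.callables_table = callables_table

    @property
    def root_kernel(self):
        return self.callables_table[self.name]

    def with_root_kernel(self, new_root_kernel):
        new_table = dict(self.callables_table)
        new_table[self.name] = new_root_kernel
        return FakeProgram(self.name, new_table)


def fake_schedule(kernel, callables_table):
    # stand-in for get_one_scheduled_kernel(kernel, callables_table)
    return kernel + " (scheduled)"


prog = FakeProgram("loopy_kernel", {"loopy_kernel": "root kernel"})
prog = prog.with_root_kernel(
        fake_schedule(prog.root_kernel, prog.callables_table))
print(prog.root_kernel)  # root kernel (scheduled)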
""" - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): name = self.name if name in ["max", "min"]: @@ -182,7 +182,7 @@ class OpenCLCallable(ScalarCallable): if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() @@ -195,7 +195,7 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), - program_callables_info) + callables_table) else: # Unsupported type. raise LoopyError("%s function not supported for the types %s" % @@ -212,14 +212,14 @@ class OpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"] return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: NumpyType(scalar_dtype), 0: dtype, 1: dtype}), - program_callables_info) + callables_table) if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] @@ -234,7 +234,7 @@ class OpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in @@ -250,7 +250,7 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype=updated_arg_id_to_dtype), - program_callables_info) + callables_table) if name in VECTOR_LITERAL_FUNCS: base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] @@ -266,7 +266,7 @@ class OpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(count)) @@ -276,13 +276,13 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(name_in_target="(%s%d) " % (base_tp_name, count), arg_id_to_dtype=updated_arg_id_to_dtype), - program_callables_info) + callables_table) # does not satisfy any of the conditions needed for specialization. # hence just returning a copy of the callable. 
return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def scope_opencl_functions(target, identifier): @@ -479,7 +479,7 @@ class OpenCLCASTBuilder(CASTBuilder): _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( codegen_state.kernel.schedule, schedule_index), - codegen_state.program_callables_info) + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 435a5e791..d98b6cdd6 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -134,7 +134,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, program_callables_info, device): +def check_sizes(kernel, callables_table, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -152,7 +152,7 @@ def check_sizes(kernel, program_callables_info, device): parameters[arg.name] = arg.approximately glens, llens = ( - kernel.get_grid_size_upper_bounds_as_exprs(program_callables_info)) + kernel.get_grid_size_upper_bounds_as_exprs(callables_table)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -207,7 +207,7 @@ class PyOpenCLCallable(ScalarCallable): Records information about the callables which are not covered by :class:`loopy.target.opencl.OpenCLCallable` """ - def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): name = self.name @@ -221,7 +221,7 @@ class PyOpenCLCallable(ScalarCallable): # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0] @@ -238,7 +238,7 @@ class PyOpenCLCallable(ScalarCallable): self.copy(name_in_target="%s_%s" % (tpname, name), arg_id_to_dtype={0: dtype, -1: NumpyType( np.dtype(dtype.numpy_dtype.type(0).real))}), - program_callables_info) + callables_table) if name in ["sqrt", "exp", "log", "sin", "cos", "tan", @@ -256,7 +256,7 @@ class PyOpenCLCallable(ScalarCallable): return ( self.copy(name_in_target="%s_%s" % (tpname, name), arg_id_to_dtype={0: dtype, -1: dtype}), - program_callables_info) + callables_table) else: # function calls for floating parameters. 
numpy_dtype = dtype.numpy_dtype @@ -267,11 +267,11 @@ class PyOpenCLCallable(ScalarCallable): return ( self.copy(name_in_target=name, arg_id_to_dtype={0: dtype, -1: dtype}), - program_callables_info) + callables_table) return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier): @@ -397,8 +397,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel, program_callables_info): - check_sizes(kernel, program_callables_info, self.device) + def pre_codegen_check(self, kernel, callables_table): + check_sizes(kernel, callables_table, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) diff --git a/loopy/target/python.py b/loopy/target/python.py index 2e6712ec1..1f83112ff 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -45,7 +45,7 @@ class ExpressionToPythonMapper(StringifyMapper): if type_inf_mapper is None: type_inf_mapper = TypeInferenceMapper(self.kernel, - self.codegen_state.program_callables_info) + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -85,7 +85,7 @@ class ExpressionToPythonMapper(StringifyMapper): def map_call(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_NONE - identifier_name = self.codegen_state.program_callables_info[ + identifier_name = self.codegen_state.callables_table[ expr.function.name].name if identifier_name in ["indexof", "indexof_vec"]: @@ -93,7 +93,7 @@ class ExpressionToPythonMapper(StringifyMapper): "indexof, indexof_vec not yet supported in Python") from loopy.kernel.function_interface import ManglerCallable - in_knl_callable = self.codegen_state.program_callables_info[ + in_knl_callable = self.codegen_state.callables_table[ expr.function.name] if isinstance(in_knl_callable, ManglerCallable): from loopy.codegen import SeenFunction diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 57c4397f9..2519b6a14 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -133,7 +133,7 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. 
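The expression-mapper hunks above (target/python.py and target/c/codegen/expression.py) look a resolved function's id up in the table to find the name to emit in generated code. A small stand-alone sketch of that lookup, with mock data and hypothetical names:

callables_table = {
    "sqrt_0": {"name": "sqrt", "name_in_target": "sqrtf"},
}


def emit_call(func_id, args):
    # the id stored at the call site indexes into the table; the table entry
    # carries the target-language spelling of the function
    entry = callables_table[func_id]
    return "%s(%s)" % (entry["name_in_target"], ", ".join(args))


print(emit_call("sqrt_0", ["x[i]"]))  # sqrtf(x[i])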
-def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, +def buffer_array_for_single_kernel(kernel, callables_table, var_name, buffer_inames, init_expression=None, store_expression=None, within=None, default_tag="l.auto", temporary_scope=None, temporary_is_local=None, fetch_bounding_box=False): @@ -534,7 +534,7 @@ def buffer_array_for_single_kernel(kernel, program_callables_info, var_name, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel, program_callables_info) + kernel = assign_automatic_axes(kernel, callables_table) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -548,10 +548,10 @@ def buffer_array(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = buffer_array_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -564,8 +564,8 @@ def buffer_array(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 90f530953..0013de1d5 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -46,11 +46,11 @@ def _resolved_callables_from_function_lookup(program, ``(target, identifier)`` that returns either an instance of :class:`loopy.InKernelCallable` or *None*. 
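Several transforms in the surrounding hunks (buffer_array, add_prefetch, precompute) share the same wrapper shape: the per-kernel implementation is applied to every callable kernel in the table and the table is then rebuilt from the results. A minimal stand-alone sketch of that shape, using mock data and hypothetical names:

def transform_subkernel(subkernel):
    # stand-in for e.g. buffer_array_for_single_kernel
    return subkernel + " (transformed)"


def transform_program(callables_table):
    new_table = {}
    for func_id, clbl in callables_table.items():
        if isinstance(clbl, str) and clbl.startswith("kernel:"):
            # callable kernels get their subkernel rewritten
            new_table[func_id] = "kernel:" + transform_subkernel(
                    clbl[len("kernel:"):])
        else:
            # scalar callables pass through unchanged
            new_table[func_id] = clbl
    return new_table


table = {"loopy_kernel": "kernel: main body", "sin": "scalar"}
print(transform_program(table))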
""" - program_callables_info = program.program_callables_info + callables_table = program.callables_table callable_knls = dict( (func_id, in_knl_callable) for func_id, in_knl_callable in - program_callables_info.items() if isinstance(in_knl_callable, + callables_table.items() if isinstance(in_knl_callable, CallableKernel)) edited_callable_knls = {} @@ -62,28 +62,28 @@ def _resolved_callables_from_function_lookup(program, kernel.substitutions, kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, kernel, program_callables_info, + rule_mapping_context, kernel, callables_table, [func_id_to_in_kernel_callable_mapper]) new_subkernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(kernel)) - program_callables_info = resolved_function_marker.program_callables_info + callables_table = resolved_function_marker.callables_table edited_callable_knls[func_id] = in_knl_callable.copy( subkernel=new_subkernel) new_resolved_functions = {} - for func_id, in_knl_callable in program_callables_info.items(): + for func_id, in_knl_callable in callables_table.items(): if func_id in edited_callable_knls: new_resolved_functions[func_id] = edited_callable_knls[func_id] else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = program_callables_info.copy( + callables_table = callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) def register_function_id_to_in_knl_callable_mapper(program, diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 5f4f2f2a7..888bedc1d 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -143,7 +143,7 @@ class _not_provided: # noqa: N801 pass -def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, +def add_prefetch_for_single_kernel(kernel, callables_table, var_name, sweep_inames=[], dim_arg_names=None, # "None" is a valid value here, distinct from the default. @@ -334,7 +334,7 @@ def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name, # warning message. 
from loopy.transform.precompute import precompute_for_single_kernel - new_kernel = precompute_for_single_kernel(kernel, program_callables_info, + new_kernel = precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames, precompute_inames=dim_arg_names, default_tag=default_tag, dtype=arg.dtype, fetch_bounding_box=fetch_bounding_box, @@ -373,10 +373,10 @@ def add_prefetch(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = add_prefetch_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -389,9 +389,9 @@ def add_prefetch(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 44e69ecfb..9b83f242b 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -420,23 +420,23 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): """ # all the resolved functions in programs must be registered in - # main_program_callables_info + # main_callables_table main_prog_callables_info = ( - programs[0].program_callables_info) + programs[0].callables_table) old_root_kernel_callable = ( - programs[0].program_callables_info[programs[0].name]) + programs[0].callables_table[programs[0].name]) kernels = [programs[0].root_kernel] # removing the callable collisions that maybe present for prog in programs[1:]: root_kernel = prog.root_kernel renames_needed = {} - for old_func_id, in_knl_callable in prog.program_callables_info.items(): + for old_func_id, in_knl_callable in prog.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): # Fusing programs with multiple callable kernels is tough. # Reason: Need to first figure out the order in which the # callable kernels must be resolved into - # main_program_callables_info, because of renaming is + # main_callables_table, because of renaming is # needed to be done in the callable kernels before registering. # Hence disabling it until required. 
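The comments above note that callables from the programs being fused can collide in the merged table. A stand-alone sketch of the suffix-renaming idea (merge_with_renames is a hypothetical helper, not the real fuse_kernels code):

def merge_with_renames(main_table, incoming):
    merged = dict(main_table)
    renames = {}
    for func_id, clbl in incoming.items():
        new_id, n = func_id, 0
        while new_id in merged:
            new_id, n = "%s_%d" % (func_id, n), n + 1
        if new_id != func_id:
            renames[func_id] = new_id  # call sites in `incoming` must be rewritten
        merged[new_id] = clbl
    return merged, renames


merged, renames = merge_with_renames({"sin": "A"}, {"sin": "B", "cos": "C"})
print(renames)         # {'sin': 'sin_0'}
print(sorted(merged))  # ['cos', 'sin', 'sin_0']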
if in_knl_callable.subkernel.name != prog.name: @@ -468,6 +468,6 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): var(programs[0].name), new_root_kernel_callable) return programs[0].copy( - program_callables_info=main_prog_callables_info) + callables_table=main_prog_callables_info) # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index b6a0454ee..fb6682f48 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1095,7 +1095,7 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals def get_iname_duplication_options(program, use_boostable_into=False): - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): for option in get_iname_duplication_options_for_single_kernel( in_knl_callable.subkernel, use_boostable_into): @@ -1121,7 +1121,7 @@ def has_schedulable_iname_nesting_for_single_kernel(knl): def has_schedulable_iname_nesting(program): return all(has_schedulable_iname_nesting_for_single_kernel( in_knl_callable.subkernel) for in_knl_callable in - program.program_callables_info.values() if isinstance(in_knl_callable, + program.callables_table.values() if isinstance(in_knl_callable, CallableKernel)) # }}} diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 93cf932b1..f73110ecd 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -42,7 +42,7 @@ def find_instructions_in_single_kernel(kernel, insn_match): def find_instructions(program, insn_match): assert isinstance(program, Program) insns = [] - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): insns += (find_instructions_in_single_kernel( in_knl_callable.subkernel, insn_match)) diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 66c7114ae..71b11fa24 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -261,7 +261,7 @@ class _not_provided(object): # noqa: N801 pass -def precompute_for_single_kernel(kernel, program_callables_info, subst_use, +def precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, @@ -1047,7 +1047,7 @@ def precompute_for_single_kernel(kernel, program_callables_info, subst_use, if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel, program_callables_info) + kernel = assign_automatic_axes(kernel, callables_table) return kernel @@ -1056,10 +1056,10 @@ def precompute(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = precompute_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -1072,8 +1072,8 @@ def precompute(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - 
new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 4b957b033..e463353ef 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -235,9 +235,9 @@ class TemporarySaver(object): def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel, program_callables_info): + def __init__(self, kernel, callables_table): self.kernel = kernel - self.program_callables_info = program_callables_info + self.callables_table = callables_table self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -441,7 +441,7 @@ class TemporarySaver(object): group_sizes, local_sizes = ( self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, - self.program_callables_info)) + self.callables_table)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. @@ -630,7 +630,7 @@ class TemporarySaver(object): kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel, self.program_callables_info) + return assign_automatic_axes(kernel, self.callables_table) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -754,12 +754,12 @@ def save_and_reload_temporaries(program): program = lp.preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel knl = get_one_scheduled_kernel(program.root_kernel, - program.program_callables_info) + program.callables_table) assert knl.schedule is not None liveness = LivenessAnalysis(knl) - saver = TemporarySaver(knl, program.program_callables_info) + saver = TemporarySaver(knl, program.callables_table) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index afe3fec59..acdf5b2a1 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -510,7 +510,7 @@ def find_rules_matching(knl, pattern): def find_one_rule_matching(program, pattern): rules = [] - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel rules.extend(find_rules_matching(knl, pattern)) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 439866405..029381d8d 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -35,7 +35,7 @@ from loopy.diagnostic import ( TypeInferenceFailure, DependencyTypeInferenceFailure) from loopy.kernel.instruction import _DataObliviousInstruction -from loopy.program import ProgramCallablesInfo +from loopy.program import CallablesTable from loopy.symbolic import ( LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper, SubstitutionRuleExpander, ResolvedFunction, @@ -197,7 +197,7 @@ def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, program_callables_info, new_assignments=None): + def __init__(self, kernel, callables_table, new_assignments=None): """ 
:arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -206,12 +206,12 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel - assert isinstance(program_callables_info, ProgramCallablesInfo) + assert isinstance(callables_table, CallablesTable) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() - self.program_callables_info = program_callables_info + self.callables_table = callables_table self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): @@ -245,16 +245,16 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). - def copy(self, program_callables_info=None): - if program_callables_info is None: - program_callables_info = self.program_callables_info - return type(self)(self.kernel, program_callables_info, + def copy(self, callables_table=None): + if callables_table is None: + callables_table = self.callables_table + return type(self)(self.kernel, callables_table, self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, self.program_callables_info, new_ass) + return type(self)(self.kernel, self.callables_table, new_ass) @staticmethod def combine(dtype_sets): @@ -431,7 +431,7 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type if isinstance(expr.function, ResolvedFunction): - in_knl_callable = self.program_callables_info[expr.function.name] + in_knl_callable = self.callables_table[expr.function.name] # {{{ checking that there is no overwriting of types of in_knl_callable @@ -465,17 +465,17 @@ class TypeInferenceMapper(CombineMapper): # }}} - in_knl_callable, self.program_callables_info = ( + in_knl_callable, self.callables_table = ( in_knl_callable.with_types( arg_id_to_dtype, self.kernel, - self.program_callables_info)) + self.callables_table)) in_knl_callable = in_knl_callable.with_target(self.kernel.target) # storing the type specialized function so that it can be used for # later use - self.program_callables_info, new_function_id = ( - self.program_callables_info.with_callable( + self.callables_table, new_function_id = ( + self.callables_table.with_callable( expr.function.function, in_knl_callable)) @@ -538,8 +538,8 @@ class TypeInferenceMapper(CombineMapper): in_knl_callable = ManglerCallable( identifier, function_mangler, arg_id_to_dtype, arg_id_to_descr, mangle_result.target_name) - self.program_callables_info, new_function_id = ( - self.program_callables_info.with_added_callable( + self.callables_table, new_function_id = ( + self.callables_table.with_added_callable( expr.function, in_knl_callable)) if isinstance(expr, Call): @@ -688,7 +688,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): return [kernel.index_dtype], [], {}, ( - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) from functools import partial debug = partial(_debug, kernel) @@ -735,13 +735,13 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if not dtype_sets: return ( None, type_inf_mapper.symbols_with_unknown_types, None, - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) result = type_inf_mapper.combine(dtype_sets) 
return (result, type_inf_mapper.symbols_with_unknown_types, type_inf_mapper.old_calls_to_new_calls, - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) # }}} @@ -768,7 +768,7 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, +def infer_unknown_types_for_a_single_kernel(kernel, callables_table, expect_completion=False): """Infer types on temporaries and arguments.""" @@ -831,7 +831,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info, + type_inf_mapper = TypeInferenceMapper(kernel, callables_table, item_lookup) from loopy.symbolic import SubstitutionRuleExpander @@ -867,11 +867,11 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, debug("inferring type for %s %s", type(item).__name__, item.name) (result, symbols_with_unavailable_types, - new_old_calls_to_new_calls, program_callables_info) = ( + new_old_calls_to_new_calls, callables_table) = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) type_inf_mapper = type_inf_mapper.copy( - program_callables_info=program_callables_info) + callables_table=callables_table) failed = not result if not failed: @@ -979,7 +979,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, raise NotImplementedError("Unknown instructions type %s." % ( type(insn).__name__)) - program_callables_info = type_inf_mapper.program_callables_info + callables_table = type_inf_mapper.callables_table old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) end_time = time.time() @@ -1003,39 +1003,39 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info, from loopy.check import check_functions_are_resolved check_functions_are_resolved(type_specialized_kernel) - return type_specialized_kernel, program_callables_info + return type_specialized_kernel, callables_table def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" - program_callables_info = program.program_callables_info + callables_table = program.callables_table type_uninferred_knl_callable = ( - program_callables_info[program.name]) + callables_table[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - old_callables_count = program_callables_info.callables_count - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) - root_kernel, program_callables_info = ( + old_callables_count = callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) + root_kernel, callables_table = ( infer_unknown_types_for_a_single_kernel( type_uninferred_root_kernel, - program_callables_info, expect_completion)) + callables_table, expect_completion)) type_inferred_knl_callable = type_uninferred_knl_callable.copy( subkernel=root_kernel) - program_callables_info, _ = ( - program_callables_info.with_callable( + callables_table, _ = ( + callables_table.with_callable( program.name, type_inferred_knl_callable)) - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode( + callables_table = ( + callables_table.with_exit_edit_callables_mode( old_callables_count)) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) # }}} @@ -1043,8 +1043,8 @@ def 
infer_unknown_types(program, expect_completion=False): # {{{ reduction expression helper def infer_arg_and_reduction_dtypes_for_reduction_expression( - kernel, expr, program_callables_info, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info) + kernel, expr, callables_table, unknown_types_ok): + type_inf_mapper = TypeInferenceMapper(kernel, callables_table) import loopy as lp if expr.is_tuple_typed: @@ -1076,7 +1076,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( for dt in reduction_dtypes) return tuple(arg_dtypes), reduction_dtypes, ( - type_inf_mapper.program_callables_info) + type_inf_mapper.callables_table) # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 43371c8a8..fa32ca04c 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -416,7 +416,7 @@ def test_ilp_write_race_detection_global(ctx_factory): from warnings import catch_warnings with catch_warnings(record=True) as warn_list: list(lp.generate_loop_schedules(knl.root_kernel, - knl.program_callables_info)) + knl.callables_table)) assert any(isinstance(w.message, WriteRaceConditionWarning) for w in warn_list) @@ -1271,7 +1271,7 @@ def save_and_reload_temporaries_test(queue, prog, out_expect, debug=False): from loopy.transform.save import save_and_reload_temporaries prog = save_and_reload_temporaries(prog) prog = prog.with_root_kernel(lp.get_one_scheduled_kernel(prog.root_kernel, - prog.program_callables_info)) + prog.callables_table)) if debug: print(prog) @@ -2222,7 +2222,7 @@ def test_unscheduled_insn_detection(): "...") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) prog = prog.with_root_kernel(knl) insn1, = lp.find_instructions(prog, "id:insn1") insns = prog.root_kernel.instructions[:] @@ -2392,7 +2392,7 @@ def test_barrier_insertion_near_top_of_loop(): prog = lp.set_temporary_scope(prog, "a", "local") prog = lp.set_temporary_scope(prog, "b", "local") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) print(knl) @@ -2420,7 +2420,7 @@ def test_barrier_insertion_near_bottom_of_loop(): prog = lp.set_temporary_scope(prog, "a", "local") prog = lp.set_temporary_scope(prog, "b", "local") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info) + knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) print(knl) @@ -2479,7 +2479,7 @@ def test_multi_argument_reduction_type_inference(): allow_simultaneous=True) t_inf_mapper = TypeInferenceMapper(prog.root_kernel, - prog.program_callables_info) + prog.callables_table) assert ( t_inf_mapper(expr, return_tuple=True, return_dtype_set=True) @@ -2836,7 +2836,7 @@ def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier): prog = lp.preprocess_kernel(prog) knl = lp.get_one_scheduled_kernel(prog.root_kernel, - prog.program_callables_info) + prog.callables_table) assert barrier_between(knl, "first", "second") == expect_barrier diff --git a/test/testlib.py b/test/testlib.py index eebc792d0..853e2584a 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -9,9 +9,9 @@ class GridOverride(object): self.clean = clean self.vecsize = vecsize - def __call__(self, insn_ids, program_callables_info, ignore_auto=True): + def __call__(self, insn_ids, 
callables_table, ignore_auto=True): gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, - program_callables_info, ignore_auto) + callables_table, ignore_auto) return gsize, (self.vecsize,) # }}} @@ -139,14 +139,14 @@ class SeparateTemporariesPreambleTestPreambleGenerator( class Log2Callable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) dtype = arg_id_to_dtype[0].numpy_dtype @@ -168,7 +168,7 @@ class Log2Callable(lp.ScalarCallable): self.copy(name_in_target=name_in_target, arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}), - program_callables_info) + callables_table) def register_log2_lookup(target, identifier): -- GitLab From 17bba4838c931a59b539a4bcb5cd9fa09925cad7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 15 Oct 2018 14:59:36 -0500 Subject: [PATCH 387/916] minor changes after review --- loopy/kernel/__init__.py | 11 ++--------- loopy/kernel/function_interface.py | 11 ++++++----- loopy/library/reduction.py | 12 ++++++------ loopy/program.py | 9 ++++----- loopy/tools.py | 11 +++++++++++ 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 70079d318..9f14dafce 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -38,7 +38,7 @@ import re from pytools import UniqueNameGenerator, generate_unique_names from loopy.diagnostic import CannotBranchDomainTree, LoopyError -from loopy.tools import natsorted +from loopy.tools import natsorted, update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type from warnings import warn @@ -1476,14 +1476,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): "symbol_manglers", ) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - for field_name in self.hash_fields: - key_builder.rec(key_hash, getattr(self, field_name)) + update_persistent_hash = update_persistent_hash def __hash__(self): from loopy.tools import LoopyKeyBuilder diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 362fbcefc..636d152d6 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -28,7 +28,7 @@ from six.moves import zip from pytools import ImmutableRecord from loopy.diagnostic import LoopyError -from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash __doc__ = """ @@ -49,7 +49,7 @@ __doc__ = """ class ValueArgDescriptor(ImmutableRecord): hash_fields = () - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash class ArrayArgDescriptor(ImmutableRecord): @@ -99,7 +99,7 @@ class ArrayArgDescriptor(ImmutableRecord): "address_space", "dim_tags") - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash =update_persistent_hash # }}} @@ -171,7 +171,8 @@ class InKernelCallable(ImmutableRecord): .. attribute:: name - The name of the callable which can be encountered within a kernel. 
+ The name of the callable which can be encountered within expressions in + a kernel. .. attribute:: arg_id_to_dtype @@ -212,7 +213,7 @@ class InKernelCallable(ImmutableRecord): def __getinitargs__(self): return (self.arg_id_to_dtype, self.arg_id_to_descr) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): """ diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 7c32d0bed..dd0e1e3e9 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -31,7 +31,7 @@ import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType -from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash class ReductionOperation(object): @@ -227,7 +227,7 @@ class ReductionOpFunction(FunctionIdentifier): hash_fields = ( "reduction_op",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} @@ -285,7 +285,7 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): "which", "op",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): @@ -298,7 +298,7 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): "op", "base_reduction_class",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} @@ -354,7 +354,7 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): "update_comparison", "neutral_sign",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash class ArgMinReductionOperation(_ArgExtremumReductionOperation): @@ -366,7 +366,7 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): "update_comparison", "neutral_sign",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} diff --git a/loopy/program.py b/loopy/program.py index f7c399c1e..aee2378ff 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -39,6 +39,7 @@ from loopy.diagnostic import LoopyError from loopy.library.reduction import ReductionOpFunction from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash from collections import Counter from pymbolic.primitives import Call, CallWithKwargs @@ -253,7 +254,7 @@ class Program(ImmutableRecord): "callables_table", "target",) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash def copy(self, **kwargs): if 'target' in kwargs: @@ -611,7 +612,7 @@ class CallablesTable(ImmutableRecord): self.is_being_edited )) - update_persistent_hash = LoopKernel.update_persistent_hash + update_persistent_hash = update_persistent_hash @property @memoize_method @@ -620,8 +621,6 @@ class CallablesTable(ImmutableRecord): Returns an instance of :class:`collection.Counter` representing the number of times the callables is called in callables_table. 
""" - # should raise an error if there are more than one root kernels(which is - # illegal) root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable in self.values() if isinstance(in_knl_callable, CallableKernel) and @@ -737,7 +736,7 @@ class CallablesTable(ImmutableRecord): def with_edit_callables_mode(self): """ - Initiates *self* for a walk traversal through all the callables. + Returns a copy of *self* for a walk traversal through all the callables. """ return self.copy( is_being_edited=True) diff --git a/loopy/tools.py b/loopy/tools.py index 5eabe6c3c..52fc7d3ce 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -43,6 +43,17 @@ else: return isinstance(obj, (int, np.integer)) +def update_persistent_hash(obj, key_hash, key_builder): + """ + Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + + Only works in conjunction with :class:`loopy.tools.KeyBuilder`. + """ + for field_name in obj.hash_fields: + key_builder.rec(key_hash, getattr(obj, field_name)) + + # {{{ custom KeyBuilder subclass class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): -- GitLab From dc458ada6a51a10c6283f1b90087fd722f13d00f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Nov 2018 17:41:51 -0600 Subject: [PATCH 388/916] renaming: make_program_from_kernel -> make_program --- loopy/__init__.py | 4 ++-- loopy/codegen/__init__.py | 4 ++-- loopy/kernel/__init__.py | 4 ++-- loopy/kernel/creation.py | 12 ++++++------ loopy/program.py | 4 ++-- test/test_diff.py | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 8ebd4d0e6..9faa28bcd 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -51,7 +51,7 @@ from loopy.kernel.data import ( from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.program import ( - Program, make_program_from_kernel) + Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -175,7 +175,7 @@ __all__ = [ "ScalarCallable", "CallableKernel", - "Program", "make_program_from_kernel", + "Program", "make_program", "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 250e7215a..55161ebba 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -541,10 +541,10 @@ def generate_code_v2(program): :param program: An instance of :class:`loopy.Program`. 
""" from loopy.kernel import LoopKernel - from loopy.program import make_program_from_kernel + from loopy.program import make_program if isinstance(program, LoopKernel): - program = make_program_from_kernel(program) + program = make_program(program) from loopy.kernel import KernelState if program.root_kernel.state == KernelState.INITIAL: diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 9f14dafce..dd7acf25b 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1371,8 +1371,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): def __call__(self, *args, **kwargs): warn("Calling a LoopKernel is deprecated, call a Program " "instead.", DeprecationWarning, stacklevel=2) - from loopy.program import make_program_from_kernel - program = make_program_from_kernel(self) + from loopy.program import make_program + program = make_program(self) return program(*args, **kwargs) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 685232c61..b794cfb8e 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1954,7 +1954,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) fixed_parameters = kwargs.pop("fixed_parameters", {}) - make_program = kwargs.pop("make_program", True) + is_callee_kernel = kwargs.pop("is_callee_kernel", False) if defines: from warnings import warn @@ -2174,15 +2174,15 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): creation_plog.done() - if make_program: - from loopy.program import make_program_from_kernel - return make_program_from_kernel(knl) - else: + if is_callee_kernel: return knl + else: + from loopy.program import make_program + return make_program(knl) def make_kernel_function(*args, **kwargs): - kwargs['make_program'] = False + kwargs['is_callee_kernel'] = False return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/program.py b/loopy/program.py index aee2378ff..c8534f051 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -50,7 +50,7 @@ __doc__ = """ .. autoclass:: Program .. autoclass:: CallablesTable -.. autofunction:: make_program_from_kernel +.. autofunction:: make_program .. autofunction:: iterate_over_kernels_if_given_program """ @@ -921,7 +921,7 @@ class CallablesTable(ImmutableRecord): # {{{ helper functions -def make_program_from_kernel(kernel): +def make_program(kernel): """ Returns an instance of :class:`loopy.Program` with the *kernel* as the root kernel. 
diff --git a/test/test_diff.py b/test/test_diff.py index a7fd92987..49efc2612 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -66,7 +66,7 @@ def test_diff(ctx_factory): from loopy.transform.diff import diff_kernel dknl, diff_map = diff_kernel(knl, "z", "x") - dknl = lp.make_program_from_kernel(dknl) + dknl = lp.make_program(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") -- GitLab From eca2a3ed2dc9bcae43362dcbf7cf1f1ea3419a1f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 14 Nov 2018 21:47:43 -0600 Subject: [PATCH 389/916] some changes after review --- loopy/__init__.py | 4 ++-- loopy/kernel/creation.py | 2 +- loopy/kernel/function_interface.py | 16 ++++++++++------ test/test_diff.py | 2 +- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 9faa28bcd..c2ffe5bf9 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -64,7 +64,7 @@ from loopy.kernel.tools import ( get_subkernels, get_subkernel_to_insn_id_map) from loopy.types import to_loopy_type -from loopy.kernel.creation import make_kernel, UniqueName, make_kernel_function +from loopy.kernel.creation import make_kernel, UniqueName, make_function from loopy.library.reduction import register_reduction_parser # {{{ import transforms @@ -184,7 +184,7 @@ __all__ = [ "SubstitutionRule", "CallMangleInfo", - "make_kernel", "UniqueName", "make_kernel_function", + "make_kernel", "UniqueName", "make_function", "register_reduction_parser", diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index b794cfb8e..823fb1b3f 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2181,7 +2181,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): return make_program(knl) -def make_kernel_function(*args, **kwargs): +def make_function(*args, **kwargs): kwargs['is_callee_kernel'] = False return make_kernel(*args, **kwargs) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 636d152d6..17057691c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -29,6 +29,7 @@ from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.tools import update_persistent_hash +from loopy.kernel import LoopKernel __doc__ = """ @@ -99,7 +100,7 @@ class ArrayArgDescriptor(ImmutableRecord): "address_space", "dim_tags") - update_persistent_hash =update_persistent_hash + update_persistent_hash = update_persistent_hash # }}} @@ -176,18 +177,21 @@ class InKernelCallable(ImmutableRecord): .. attribute:: arg_id_to_dtype - A mapping which indicates the arguments types and result types it would - be handling. This would be set once the callable is type specialized. + A mapping which indicates the arguments types and result types of the + callable. .. attribute:: arg_id_to_descr A mapping which gives indicates the argument shape and ``dim_tags`` it - would be responsible for generating code. These parameters would be set, - once it is shape and stride(``dim_tags``) specialized. + would be responsible for generating code. .. note:: + - "``arg_id`` can either be an instance of :class:`int` integer + corresponding to the position of the argument or an instance of + :class:`str` corresponding to the name of keyword argument accepted + by the function. 
- Negative "id" values ``-i`` in the mapping attributes indicate + - Negative "arg_id" values ``-i`` in the mapping attributes indicate return value with (0-based) index *i*. .. automethod:: __init__ diff --git a/test/test_diff.py b/test/test_diff.py index 49efc2612..d001233c0 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -55,7 +55,7 @@ def test_diff(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel_function( + knl = lp.make_function( """{ [i,j]: 0<=i,j a = 1/(1+sinh(x[i] + y[j])**2) -- GitLab From 8b04d088d54806652d3ffaf19364cac1e4aaba2c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 00:22:11 -0600 Subject: [PATCH 390/916] small fix to make the tests runnable again --- loopy/auto_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index bee1b72f1..7e23ef06f 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -118,7 +118,7 @@ def make_ref_args(program, impl_arg_info, queue, parameters): shape = evaluate_shape(arg.unvec_shape, parameters) dtype = kernel_arg.dtype - is_output = arg.base_name in kernel_arg.is_output_only + is_output = kernel_arg.is_output_only if arg.arg_class is ImageArg: storage_array = ary = cl_array.empty( -- GitLab From 930f8907c193c0c4154b79ef59ebbde0fc43980c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 09:15:43 -0600 Subject: [PATCH 391/916] asserts that callees do not generate host program --- loopy/codegen/__init__.py | 2 ++ loopy/codegen/control.py | 23 ++++++++++--------- loopy/codegen/result.py | 47 ++++++++++++++++++++++----------------- 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e9e7c9a44..730d33112 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -587,6 +587,8 @@ def generate_code_v2(program): codegen_results[func_id] = ( generate_code_for_a_single_kernel(in_knl_callable.subkernel, program.program_callables_info, program.target)) + if not in_knl_callable.subkernel.is_called_from_host: + assert codegen_results[func_id].host_program is None device_preambles = set() for cgr in codegen_results.values(): diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 90bdbda31..bb62961c5 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -117,16 +117,19 @@ def generate_code_for_sched_index(codegen_state, sched_index): glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index), codegen_state.program_callables_info) - - return merge_codegen_results(codegen_state, [ - codegen_result, - - codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), - ]) + if kernel.is_called_from_host: + return merge_codegen_results(codegen_state, [ + codegen_result, + + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid, + extra_args), + ]) + else: + # do not generate host code for callee kernels + return codegen_result elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 00f19d99a..7950c56b3 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -292,27 +292,32 @@ def generate_host_or_device_program(codegen_state, schedule_index): else: codegen_result = build_loop_nest(codegen_state, 
schedule_index) - codegen_result = merge_codegen_results( - codegen_state, - ast_builder.generate_top_of_body(codegen_state) - + temp_decls - + [codegen_result], - collapse=False) - - cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast - fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) - - fdef_ast = ast_builder.get_function_definition( - codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) - - codegen_result = codegen_result.with_new_program( - codegen_state, - cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - body_ast=ast_builder.process_ast(body_ast))) + if (codegen_state.is_generating_device_code) or ( + codegen_state.kernel.is_called_from_host): + codegen_result = merge_codegen_results( + codegen_state, + ast_builder.generate_top_of_body(codegen_state) + + temp_decls + + [codegen_result], + collapse=False) + + cur_prog = codegen_result.current_program(codegen_state) + body_ast = cur_prog.ast + fdecl_ast = ast_builder.get_function_declaration( + codegen_state, codegen_result, schedule_index) + + fdef_ast = ast_builder.get_function_definition( + codegen_state, codegen_result, + schedule_index, fdecl_ast, body_ast) + + codegen_result = codegen_result.with_new_program( + codegen_state, + cur_prog.copy( + ast=ast_builder.process_ast(fdef_ast), + body_ast=ast_builder.process_ast(body_ast))) + else: + codegen_result = codegen_result.copy( + host_program=None) return codegen_result -- GitLab From 408bb384ec47af2cd464e303458f9017fdf40494 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 09:21:32 -0600 Subject: [PATCH 392/916] asserts that callees do not generate host program --- loopy/codegen/__init__.py | 2 ++ loopy/codegen/control.py | 23 ++++++++++--------- loopy/codegen/result.py | 47 ++++++++++++++++++++++----------------- 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 55161ebba..3fd94aa2a 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -561,6 +561,8 @@ def generate_code_v2(program): codegen_results[func_id] = ( generate_code_for_a_single_kernel(in_knl_callable.subkernel, program.callables_table)) + if not in_knl_callable.subkernel.is_called_from_host: + assert codegen_results[func_id].host_program is None device_preambles = set() for cgr in codegen_results.values(): diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 81a672a14..5dfd9cb43 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -117,16 +117,19 @@ def generate_code_for_sched_index(codegen_state, sched_index): glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index), codegen_state.callables_table) - - return merge_codegen_results(codegen_state, [ - codegen_result, - - codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), - ]) + if kernel.is_called_from_host: + return merge_codegen_results(codegen_state, [ + codegen_result, + + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid, + extra_args), + ]) + else: + # do not generate host code for callee kernels + return codegen_result elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 00f19d99a..7950c56b3 100644 --- 
a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -292,27 +292,32 @@ def generate_host_or_device_program(codegen_state, schedule_index): else: codegen_result = build_loop_nest(codegen_state, schedule_index) - codegen_result = merge_codegen_results( - codegen_state, - ast_builder.generate_top_of_body(codegen_state) - + temp_decls - + [codegen_result], - collapse=False) - - cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast - fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) - - fdef_ast = ast_builder.get_function_definition( - codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) - - codegen_result = codegen_result.with_new_program( - codegen_state, - cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - body_ast=ast_builder.process_ast(body_ast))) + if (codegen_state.is_generating_device_code) or ( + codegen_state.kernel.is_called_from_host): + codegen_result = merge_codegen_results( + codegen_state, + ast_builder.generate_top_of_body(codegen_state) + + temp_decls + + [codegen_result], + collapse=False) + + cur_prog = codegen_result.current_program(codegen_state) + body_ast = cur_prog.ast + fdecl_ast = ast_builder.get_function_declaration( + codegen_state, codegen_result, schedule_index) + + fdef_ast = ast_builder.get_function_definition( + codegen_state, codegen_result, + schedule_index, fdecl_ast, body_ast) + + codegen_result = codegen_result.with_new_program( + codegen_state, + cur_prog.copy( + ast=ast_builder.process_ast(fdef_ast), + body_ast=ast_builder.process_ast(body_ast))) + else: + codegen_result = codegen_result.copy( + host_program=None) return codegen_result -- GitLab From bdf843d472ab199c5a1315f31c09f4c5762f8c60 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 09:48:46 -0600 Subject: [PATCH 393/916] store the fdecls in AST format --- loopy/codegen/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 730d33112..e2adbaf00 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -599,15 +599,19 @@ def generate_code_v2(program): device_preambles.update([preamble]) collective_device_program = codegen_results[program.name].device_programs[0] + callee_fdecls = [] for func_id, callee_cgr in codegen_results.items(): if func_id != program.name: assert len(callee_cgr.device_programs) == 1 callee_prog_ast = callee_cgr.device_programs[0].ast collective_device_program = collective_device_program.copy( ast=Collection([callee_prog_ast, collective_device_program.ast])) + callee_fdecls.append(callee_prog_ast.fdecl) - device_preambles.update([('98_%s' % func_id, - str(callee_prog_ast.fdecl)), ]) + # collecting the function declarations of callee kernels + for callee_fdecl in callee_fdecls: + collective_device_program = collective_device_program.copy( + ast=Collection([callee_fdecl, collective_device_program.ast])) collective_device_programs = [collective_device_program] + ( codegen_results[program.name].device_programs[1:]) -- GitLab From 3f0d8b5461723c4b365a8ecc03784f8dcaf7c223 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 09:52:28 -0600 Subject: [PATCH 394/916] store the fdecls in AST format --- loopy/codegen/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 3fd94aa2a..00397906e 100644 --- a/loopy/codegen/__init__.py +++ 
b/loopy/codegen/__init__.py @@ -568,20 +568,25 @@ def generate_code_v2(program): for cgr in codegen_results.values(): device_preambles.update(cgr.device_preambles) + # collecting the function declarations of callee kernels for in_knl_callable in program.callables_table.values(): for preamble in in_knl_callable.generate_preambles(program.target): device_preambles.update([preamble]) collective_device_program = codegen_results[program.name].device_programs[0] + callee_fdecls = [] + for func_id, callee_cgr in codegen_results.items(): if func_id != program.name: assert len(callee_cgr.device_programs) == 1 callee_prog_ast = callee_cgr.device_programs[0].ast collective_device_program = collective_device_program.copy( ast=Collection([callee_prog_ast, collective_device_program.ast])) + callee_fdecls.append(callee_prog_ast.fdecl) - device_preambles.update([('98_%s' % func_id, - str(callee_prog_ast.fdecl)), ]) + for callee_fdecl in callee_fdecls: + collective_device_program = collective_device_program.copy( + ast=Collection([callee_fdecl, collective_device_program.ast])) collective_device_programs = [collective_device_program] + ( codegen_results[program.name].device_programs[1:]) -- GitLab From d191d34ff87d44e7ad72f8f3b2f2324a28a399fe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 15 Nov 2018 09:53:52 -0600 Subject: [PATCH 395/916] removes assymetry between host and device preambles --- loopy/codegen/result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 7950c56b3..268a70b23 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -133,7 +133,7 @@ class CodeGenerationResult(ImmutableRecord): preamble_codes = process_preambles( getattr(self, "host_preambles", []) + - list(getattr(self, "device_preambles", [])) + getattr(self, "device_preambles", []) ) return ( -- GitLab From eaa91d33f3f2bad49982f23eebf217e1991a810d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Nov 2018 08:12:37 -0600 Subject: [PATCH 396/916] make_kernel_function->make_function --- loopy/__init__.py | 4 ++-- loopy/kernel/creation.py | 2 +- test/test_callables.py | 22 +++++++++++----------- test/test_diff.py | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a62d30497..6ed215000 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -64,7 +64,7 @@ from loopy.kernel.tools import ( get_subkernels, get_subkernel_to_insn_id_map) from loopy.types import to_loopy_type -from loopy.kernel.creation import make_kernel, UniqueName, make_kernel_function +from loopy.kernel.creation import make_kernel, UniqueName, make_function from loopy.library.reduction import register_reduction_parser # {{{ import transforms @@ -185,7 +185,7 @@ __all__ = [ "SubstitutionRule", "CallMangleInfo", - "make_kernel", "UniqueName", "make_kernel_function", + "make_kernel", "UniqueName", "make_function", "register_reduction_parser", diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 64c61ae59..674eaca3f 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2352,7 +2352,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): return knl -def make_kernel_function(*args, **kwargs): +def make_function(*args, **kwargs): lang_version = kwargs.pop('lang_version', None) if lang_version: raise LoopyError("lang_version should be set for program, not " diff --git a/test/test_callables.py b/test/test_callables.py index f25bbbe6f..cdba3f5b5 
100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -69,13 +69,13 @@ def test_register_knl(ctx_factory, inline): x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) - grandchild_knl = lp.make_kernel_function( + grandchild_knl = lp.make_function( "{[i, j]:0<= i, j< 16}", """ c[i, j] = 2*a[i, j] + 3*b[i, j] """, name='linear_combo1') - child_knl = lp.make_kernel_function( + child_knl = lp.make_function( "{[i, j]:0<=i, j < 16}", """ [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) @@ -121,7 +121,7 @@ def test_slices_with_negative_step(ctx_factory, inline): x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) - child_knl = lp.make_kernel_function( + child_knl = lp.make_function( "{[i, j]:0<=i, j < 16}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] @@ -170,7 +170,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - callee_knl = lp.make_kernel_function( + callee_knl = lp.make_function( "{[i, j]:0<=i, j < %d}" % n, """ h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] @@ -221,7 +221,7 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - callee_knl = lp.make_kernel_function( + callee_knl = lp.make_function( "{[i, j]:0<=i, j < 16}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] @@ -262,19 +262,19 @@ def test_shape_translation_through_sub_array_ref(ctx_factory, inline): x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) - callee1 = lp.make_kernel_function( + callee1 = lp.make_function( "{[i]: 0<=i<6}", """ a[i] = 2*abs(b[i]) """, name="callee_fn1") - callee2 = lp.make_kernel_function( + callee2 = lp.make_function( "{[i, j]: 0<=i<3 and 0 <= j < 2}", """ a[i, j] = 3*b[i, j] """, name="callee_fn2") - callee3 = lp.make_kernel_function( + callee3 = lp.make_function( "{[i]: 0<=i<6}", """ a[i] = 5*b[i] @@ -319,7 +319,7 @@ def test_multi_arg_array_call(ctx_factory): i = p.Variable("i") index = p.Variable("index") a_i = p.Subscript(p.Variable("a"), p.Variable("i")) - argmin_kernel = lp.make_kernel_function( + argmin_kernel = lp.make_function( "{[i]: 0 <= i < n}", [ lp.Assignment(id="init2", assignee=index, @@ -362,13 +362,13 @@ def test_packing_unpacking(ctx_factory, inline): x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) - callee1 = lp.make_kernel_function( + callee1 = lp.make_function( "{[i]: 0<=i<6}", """ a[i] = 2*b[i] """, name="callee_fn1") - callee2 = lp.make_kernel_function( + callee2 = lp.make_function( "{[i, j]: 0<=i<2 and 0 <= j < 3}", """ a[i, j] = 3*b[i, j] diff --git a/test/test_diff.py b/test/test_diff.py index a7fd92987..7e14a7ab5 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -55,7 +55,7 @@ def test_diff(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_kernel_function( + knl = lp.make_function( """{ [i,j]: 0<=i,j a = 1/(1+sinh(x[i] + y[j])**2) -- GitLab From 07719d4042f8345ab5562d85526204f1b8d10cde Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Nov 2018 10:31:13 -0600 Subject: [PATCH 397/916] reverts changes in symbolic.py --- loopy/symbolic.py | 116 +++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 
a65bd0942..6024d334d 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -57,7 +57,6 @@ from pymbolic.mapper.constant_folder import \ from pymbolic.parser import Parser as ParserBase from loopy.diagnostic import LoopyError -from loopy.diagnostic import ExpressionToAffineConversionError import islpy as isl from islpy import dim_type @@ -69,23 +68,22 @@ import numpy as np # {{{ mappers with support for loopy-specific primitives class IdentityMapperMixin(object): - def map_literal(self, expr, *args, **kwargs): + def map_literal(self, expr, *args): return expr - def map_array_literal(self, expr, *args, **kwargs): - return type(expr)(tuple(self.rec(ch, *args, **kwargs) for ch in - expr.children)) + def map_array_literal(self, expr, *args): + return type(expr)(tuple(self.rec(ch, *args) for ch in expr.children)) - def map_group_hw_index(self, expr, *args, **kwargs): + def map_group_hw_index(self, expr, *args): return expr - def map_local_hw_index(self, expr, *args, **kwargs): + def map_local_hw_index(self, expr, *args): return expr - def map_loopy_function_identifier(self, expr, *args, **kwargs): + def map_loopy_function_identifier(self, expr, *args): return expr - def map_reduction(self, expr, *args, **kwargs): + def map_reduction(self, expr, *args): mapped_inames = [self.rec(p.Variable(iname), *args) for iname in expr.inames] new_inames = [] @@ -99,22 +97,22 @@ class IdentityMapperMixin(object): return Reduction( expr.operation, tuple(new_inames), - self.rec(expr.expr, *args, **kwargs), + self.rec(expr.expr, *args), allow_simultaneous=expr.allow_simultaneous) - def map_tagged_variable(self, expr, *args, **kwargs): + def map_tagged_variable(self, expr, *args): # leaf, doesn't change return expr - def map_type_annotation(self, expr, *args, **kwargs): - return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs)) + def map_type_annotation(self, expr, *args): + return type(expr)(expr.type, self.rec(expr.child, *args)) - def map_sub_array_ref(self, expr, *args, **kwargs): - return SubArrayRef(self.rec(expr.swept_inames, *args, **kwargs), - self.rec(expr.subscript, *args, **kwargs)) + def map_sub_array_ref(self, expr, *args): + return SubArrayRef(self.rec(expr.swept_inames, *args), + self.rec(expr.subscript, *args)) - def map_resolved_function(self, expr, *args, **kwargs): - return ResolvedFunction(expr.function) + def map_scoped_function(self, expr, *args): + return ScopedFunction(self.rec(expr.function, *args)) map_type_cast = map_type_annotation @@ -180,7 +178,7 @@ class WalkMapper(WalkMapperBase): self.rec(expr.swept_inames, *args) self.rec(expr.subscript, *args) - def map_resolved_function(self, expr, *args): + def map_scoped_function(self, expr, *args): if not self.visit(expr): return @@ -189,7 +187,7 @@ class WalkMapper(WalkMapperBase): class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant - map_resolved_function = CallbackMapperBase.map_constant + map_scoped_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): @@ -257,8 +255,8 @@ class StringifyMapper(StringifyMapperBase): from pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) - def map_resolved_function(self, expr, prec): - return "ResolvedFunction('%s')" % expr.name + def map_scoped_function(self, expr, prec): + return "ScopedFunction('%s')" % expr.name def map_sub_array_ref(self, expr, prec): return "SubArrayRef({inames}, ({subscr}))".format( @@ -333,7 +331,7 @@ class 
DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) - def map_resolved_function(self, expr): + def map_scoped_function(self, expr): return self.rec(expr.function) @@ -685,10 +683,10 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") -class ResolvedFunction(p.Expression): +class ScopedFunction(p.Expression): """ A function invocation whose definition is known in a :mod:`loopy` kernel. - Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression + Each instance of :class:`loopy.symbolic.ScopedFunction` in an expression points to an instance of :class:`loopy.kernel.function_interface.InKernelCallable` through the mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer @@ -718,7 +716,7 @@ class ResolvedFunction(p.Expression): elif isinstance(self.function, (ArgExtOp, SegmentedOp)): return self.function else: - raise LoopyError("Unexpected function type %s in ResolvedFunction." % + raise LoopyError("Unexpected function type %s in ScopedFunction." % type(self.function)) def __getinitargs__(self): @@ -727,7 +725,7 @@ class ResolvedFunction(p.Expression): def stringifier(self): return StringifyMapper - mapper_method = intern("map_resolved_function") + mapper_method = intern("map_scoped_function") class EvaluatorWithDeficientContext(PartialEvaluationMapper): @@ -838,13 +836,13 @@ class SubArrayRef(p.Expression): name = self.subscript.aggregate.name if name in kernel.temporary_variables: - assert name not in kernel.arg_dict arg = kernel.temporary_variables[name] + mem_scope = arg.scope + assert name not in kernel.arg_dict else: assert name in kernel.arg_dict arg = kernel.arg_dict[name] - - aspace = arg.address_space + mem_scope = arg.memory_address_space from loopy.kernel.array import FixedStrideArrayDimTag as DimTag from loopy.isl_helpers import simplify_via_aff @@ -853,9 +851,10 @@ class SubArrayRef(p.Expression): linearized_index = sum(dim_tag.stride*iname for dim_tag, iname in zip(arg.dim_tags, self.subscript.index_tuple)) - # look which error we are getting and guard it - - linearized_index = simplify_via_aff(linearized_index) + try: + linearized_index = simplify_via_aff(linearized_index) + except: + pass strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) @@ -866,8 +865,7 @@ class SubArrayRef(p.Expression): kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 for iname in self.swept_inames) - return ArrayArgDescriptor( - address_space=aspace, + return ArrayArgDescriptor(mem_scope=mem_scope, dim_tags=sub_dim_tags, shape=sub_shape) @@ -902,7 +900,7 @@ def parse_tagged_name(expr): from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, ResolvedFunction): + elif isinstance(expr, ScopedFunction): return parse_tagged_name(expr.function) elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None @@ -1102,14 +1100,12 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state, *args, **kwargs): + def map_variable(self, expr, expn_state): name, tag = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - return IdentityMapper.map_variable(self, expr, expn_state, *args, - **kwargs) + return IdentityMapper.map_variable(self, expr, expn_state) else: - return 
self.map_substitution(name, tag, (), expn_state, *args, - **kwargs) + return self.map_substitution(name, tag, (), expn_state) def map_call(self, expr, expn_state): if not isinstance(expr.function, p.Variable): @@ -1164,7 +1160,7 @@ class RuleAwareIdentityMapper(IdentityMapper): else: return sym - def __call__(self, expr, kernel, insn, *args, **kwargs): + def __call__(self, expr, kernel, insn): from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -1173,7 +1169,7 @@ class RuleAwareIdentityMapper(IdentityMapper): kernel=kernel, instruction=insn, stack=(), - arg_context={}), *args, **kwargs) + arg_context={})) def map_instruction(self, kernel, insn): return insn @@ -1647,19 +1643,7 @@ def with_aff_conversion_guard(f, space, expr, *args): except isl.Error as e: err = e except UnknownVariableError as e: - integer_vars = deps & set(t for t, v in - kernel.temporary_variables.items() if - np.issubdtype(v.dtype, np.integer)) - - # need to sort for deterministic code generation - names = sorted(list(integer_vars)) - nd = domain.dim(isl.dim_type.set) - domain = domain.add_dims(isl.dim_type.set, len(names)) - for i, name in enumerate(names): - domain = domain.set_dim_name(isl.dim_type.set, nd + i, name) - # TODO: Understand what errors can we land in here and then guard - # them. - return aff_from_expr(domain.space, expr) + err = e assert err is not None from loopy.diagnostic import ExpressionToAffineConversionError @@ -1692,10 +1676,26 @@ def simplify_using_aff(kernel, expr): domain = kernel.get_inames_domain(inames) + from pymbolic.mapper.evaluator import UnknownVariableError + try: - aff = guarded_aff_from_expr(domain.space, expr) - except ExpressionToAffineConversionError: + with isl.SuppressedWarnings(kernel.isl_context): + aff = aff_from_expr(domain.space, expr) + except isl.Error: + return expr + except TypeError: return expr + except UnknownVariableError: + integer_vars = deps & set(t for t, v in kernel.temporary_variables.items() if np.issubdtype(v.dtype, np.integer)) + names = sorted(list(integer_vars)) # need to sort for deterministic code generation + nd = domain.dim(isl.dim_type.set) + domain = domain.add_dims(isl.dim_type.set, len(names)) + for i, name in enumerate(names): + domain = domain.set_dim_name(isl.dim_type.set, nd + i, name) + try: + aff = aff_from_expr(domain.space, expr) + except: + return expr # FIXME: Deal with assumptions, too. 
aff = aff.gist(domain) -- GitLab From 0616f7b5e06c1bfb00ccd09e6d2977a2186cd47e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Nov 2018 10:35:19 -0600 Subject: [PATCH 398/916] added the intended symbolic class --- loopy/symbolic.py | 108 ++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 62 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 6024d334d..54dd61966 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -57,6 +57,7 @@ from pymbolic.mapper.constant_folder import \ from pymbolic.parser import Parser as ParserBase from loopy.diagnostic import LoopyError +from loopy.diagnostic import ExpressionToAffineConversionError import islpy as isl from islpy import dim_type @@ -68,22 +69,23 @@ import numpy as np # {{{ mappers with support for loopy-specific primitives class IdentityMapperMixin(object): - def map_literal(self, expr, *args): + def map_literal(self, expr, *args, **kwargs): return expr - def map_array_literal(self, expr, *args): - return type(expr)(tuple(self.rec(ch, *args) for ch in expr.children)) + def map_array_literal(self, expr, *args, **kwargs): + return type(expr)(tuple(self.rec(ch, *args, **kwargs) for ch in + expr.children)) - def map_group_hw_index(self, expr, *args): + def map_group_hw_index(self, expr, *args, **kwargs): return expr - def map_local_hw_index(self, expr, *args): + def map_local_hw_index(self, expr, *args, **kwargs): return expr - def map_loopy_function_identifier(self, expr, *args): + def map_loopy_function_identifier(self, expr, *args, **kwargs): return expr - def map_reduction(self, expr, *args): + def map_reduction(self, expr, *args, **kwargs): mapped_inames = [self.rec(p.Variable(iname), *args) for iname in expr.inames] new_inames = [] @@ -97,22 +99,22 @@ class IdentityMapperMixin(object): return Reduction( expr.operation, tuple(new_inames), - self.rec(expr.expr, *args), + self.rec(expr.expr, *args, **kwargs), allow_simultaneous=expr.allow_simultaneous) - def map_tagged_variable(self, expr, *args): + def map_tagged_variable(self, expr, *args, **kwargs): # leaf, doesn't change return expr - def map_type_annotation(self, expr, *args): - return type(expr)(expr.type, self.rec(expr.child, *args)) + def map_type_annotation(self, expr, *args, **kwargs): + return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs)) - def map_sub_array_ref(self, expr, *args): - return SubArrayRef(self.rec(expr.swept_inames, *args), - self.rec(expr.subscript, *args)) + def map_sub_array_ref(self, expr, *args, **kwargs): + return SubArrayRef(self.rec(expr.swept_inames, *args, **kwargs), + self.rec(expr.subscript, *args, **kwargs)) - def map_scoped_function(self, expr, *args): - return ScopedFunction(self.rec(expr.function, *args)) + def map_resolved_function(self, expr, *args, **kwargs): + return ResolvedFunction(expr.function) map_type_cast = map_type_annotation @@ -178,7 +180,7 @@ class WalkMapper(WalkMapperBase): self.rec(expr.swept_inames, *args) self.rec(expr.subscript, *args) - def map_scoped_function(self, expr, *args): + def map_resolved_function(self, expr, *args): if not self.visit(expr): return @@ -187,7 +189,7 @@ class WalkMapper(WalkMapperBase): class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant - map_scoped_function = CallbackMapperBase.map_constant + map_resolved_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): @@ -255,8 +257,8 @@ class StringifyMapper(StringifyMapperBase): from 
pymbolic.mapper.stringifier import PREC_NONE return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) - def map_scoped_function(self, expr, prec): - return "ScopedFunction('%s')" % expr.name + def map_resolved_function(self, expr, prec): + return "ResolvedFunction('%s')" % expr.name def map_sub_array_ref(self, expr, prec): return "SubArrayRef({inames}, ({subscr}))".format( @@ -331,7 +333,7 @@ class DependencyMapper(DependencyMapperBase): def map_type_cast(self, expr): return self.rec(expr.child) - def map_scoped_function(self, expr): + def map_resolved_function(self, expr): return self.rec(expr.function) @@ -683,10 +685,10 @@ class RuleArgument(p.Expression): mapper_method = intern("map_rule_argument") -class ScopedFunction(p.Expression): +class ResolvedFunction(p.Expression): """ A function invocation whose definition is known in a :mod:`loopy` kernel. - Each instance of :class:`loopy.symbolic.ScopedFunction` in an expression + Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression points to an instance of :class:`loopy.kernel.function_interface.InKernelCallable` through the mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer @@ -716,7 +718,7 @@ class ScopedFunction(p.Expression): elif isinstance(self.function, (ArgExtOp, SegmentedOp)): return self.function else: - raise LoopyError("Unexpected function type %s in ScopedFunction." % + raise LoopyError("Unexpected function type %s in ResolvedFunction." % type(self.function)) def __getinitargs__(self): @@ -725,7 +727,7 @@ class ScopedFunction(p.Expression): def stringifier(self): return StringifyMapper - mapper_method = intern("map_scoped_function") + mapper_method = intern("map_resolved_function") class EvaluatorWithDeficientContext(PartialEvaluationMapper): @@ -836,25 +838,21 @@ class SubArrayRef(p.Expression): name = self.subscript.aggregate.name if name in kernel.temporary_variables: - arg = kernel.temporary_variables[name] - mem_scope = arg.scope assert name not in kernel.arg_dict + arg = kernel.temporary_variables[name] else: assert name in kernel.arg_dict arg = kernel.arg_dict[name] - mem_scope = arg.memory_address_space + + aspace = arg.address_space from loopy.kernel.array import FixedStrideArrayDimTag as DimTag from loopy.isl_helpers import simplify_via_aff sub_dim_tags = [] sub_shape = [] - linearized_index = sum(dim_tag.stride*iname - for dim_tag, iname - in zip(arg.dim_tags, self.subscript.index_tuple)) - try: - linearized_index = simplify_via_aff(linearized_index) - except: - pass + linearized_index = simplify_via_aff( + sum(dim_tag.stride*iname for dim_tag, iname in + zip(arg.dim_tags, self.subscript.index_tuple))) strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) @@ -865,7 +863,8 @@ class SubArrayRef(p.Expression): kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 for iname in self.swept_inames) - return ArrayArgDescriptor(mem_scope=mem_scope, + return ArrayArgDescriptor( + address_space=aspace, dim_tags=sub_dim_tags, shape=sub_shape) @@ -900,7 +899,7 @@ def parse_tagged_name(expr): from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tag - elif isinstance(expr, ScopedFunction): + elif isinstance(expr, ResolvedFunction): return parse_tagged_name(expr.function) elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None @@ -1100,12 +1099,14 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, 
rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state): + def map_variable(self, expr, expn_state, *args, **kwargs): name, tag = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - return IdentityMapper.map_variable(self, expr, expn_state) + return IdentityMapper.map_variable(self, expr, expn_state, *args, + **kwargs) else: - return self.map_substitution(name, tag, (), expn_state) + return self.map_substitution(name, tag, (), expn_state, *args, + **kwargs) def map_call(self, expr, expn_state): if not isinstance(expr.function, p.Variable): @@ -1160,7 +1161,7 @@ class RuleAwareIdentityMapper(IdentityMapper): else: return sym - def __call__(self, expr, kernel, insn): + def __call__(self, expr, kernel, insn, *args, **kwargs): from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -1169,7 +1170,7 @@ class RuleAwareIdentityMapper(IdentityMapper): kernel=kernel, instruction=insn, stack=(), - arg_context={})) + arg_context={}), *args, **kwargs) def map_instruction(self, kernel, insn): return insn @@ -1671,31 +1672,14 @@ def guarded_pwaff_from_expr(space, expr, vars_to_zero=None): # {{{ simplify using aff def simplify_using_aff(kernel, expr): - deps = get_dependencies(expr) - inames = deps & kernel.all_inames() + inames = get_dependencies(expr) & kernel.all_inames() domain = kernel.get_inames_domain(inames) - from pymbolic.mapper.evaluator import UnknownVariableError - try: - with isl.SuppressedWarnings(kernel.isl_context): - aff = aff_from_expr(domain.space, expr) - except isl.Error: - return expr - except TypeError: + aff = guarded_aff_from_expr(domain.space, expr) + except ExpressionToAffineConversionError: return expr - except UnknownVariableError: - integer_vars = deps & set(t for t, v in kernel.temporary_variables.items() if np.issubdtype(v.dtype, np.integer)) - names = sorted(list(integer_vars)) # need to sort for deterministic code generation - nd = domain.dim(isl.dim_type.set) - domain = domain.add_dims(isl.dim_type.set, len(names)) - for i, name in enumerate(names): - domain = domain.set_dim_name(isl.dim_type.set, nd + i, name) - try: - aff = aff_from_expr(domain.space, expr) - except: - return expr # FIXME: Deal with assumptions, too. 
aff = aff.gist(domain) -- GitLab From eac68bbcb3dd047a8c4869d7332ad5c8f8f321e8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Nov 2018 17:36:26 -0600 Subject: [PATCH 399/916] rehandles match caller callee arg dims --- loopy/transform/callable.py | 121 ++++++----- loopy/transform/register_callable.py | 312 --------------------------- 2 files changed, 71 insertions(+), 362 deletions(-) delete mode 100644 loopy/transform/register_callable.py diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 3f8fbb580..9a03147dd 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -32,10 +32,10 @@ from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) -from loopy.symbolic import IdentityMapper, SubstitutionMapper +from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import (get_kw_pos_association, - change_names_of_pymbolic_calls, CallableKernel, ScalarCallable) + CallableKernel, ScalarCallable) from loopy.program import Program, ResolvedFunctionMarker __doc__ = """ @@ -43,7 +43,7 @@ __doc__ = """ .. autofunction:: register_function_id_to_in_knl_callable_mapper -.. autofunction:: register_callable_kernel +.. autofunction:: eegister_callable_kernel """ @@ -161,7 +161,8 @@ def register_callable_kernel(program, callee_kernel): # {{{ sanity checks assert isinstance(program, Program) - assert isinstance(callee_kernel, LoopKernel) + assert isinstance(callee_kernel, LoopKernel), ('{0} !=' + '{1}'.format(type(callee_kernel), LoopKernel)) # check to make sure that the variables with 'out' direction is equal to # the number of assigness in the callee kernel intructions. @@ -602,29 +603,20 @@ class DimChanger(IdentityMapper): def _match_caller_callee_argument_dimension_for_single_kernel( - caller_knl, program_callables_info, callee_function_name): + caller_knl, callee_knl): """ Returns a copy of *caller_knl* with the instance of :class:`loopy.kernel.function_interface.CallableKernel` addressed by *callee_function_name* in the *caller_knl* aligned with the argument dimesnsions required by *caller_knl*. """ - pymbolic_calls_to_new_callables = {} for insn in caller_knl.instructions: if not isinstance(insn, CallInstruction) or ( - insn.expression.function.name not in - program_callables_info): + insn.expression.function.name != + callee_knl.name): # Call to a callable kernel can only occur through a # CallInstruction. continue - - in_knl_callable = program_callables_info[ - insn.expression.function.name] - - if in_knl_callable.subkernel.name != callee_function_name: - # Not the callable we're looking for. - continue - # getting the caller->callee arg association parameters = insn.expression.parameters[:] @@ -636,14 +628,14 @@ def _match_caller_callee_argument_dimension_for_single_kernel( parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape for par in parameters] - kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) + kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) for i in range(len(parameters), len(parameters)+len(kw_parameters)): parameter_shapes.append(kw_parameters[pos_to_kw[i]] .get_array_arg_descriptor(caller_knl).shape) - # inserting the assigness at the required positions. + # inserting the assignees at the required positions. 
assignee_write_count = -1 - for i, arg in enumerate(in_knl_callable.subkernel.args): + for i, arg in enumerate(callee_knl.args): if arg.is_output_only: assignee = assignees[-assignee_write_count-1] parameter_shapes.insert(i, assignee @@ -651,11 +643,13 @@ def _match_caller_callee_argument_dimension_for_single_kernel( assignee_write_count -= 1 callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in - in_knl_callable.subkernel.args], parameter_shapes)) - dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, + callee_knl.args], parameter_shapes)) + dim_changer = DimChanger( + dict(callee_knl.arg_dict, **( + callee_knl.temporary_variables)), callee_arg_to_desired_dim_tag) new_callee_insns = [] - for callee_insn in in_knl_callable.subkernel.instructions: + for callee_insn in callee_knl.instructions: if isinstance(callee_insn, MultiAssignmentBase): new_callee_insns.append(callee_insn.copy(expression=dim_changer( callee_insn.expression), @@ -664,48 +658,75 @@ def _match_caller_callee_argument_dimension_for_single_kernel( _DataObliviousInstruction)): pass else: - raise NotImplementedError("Unknwon instruction %s." % + raise NotImplementedError("Unknown instruction %s." % type(insn)) # subkernel with instructions adjusted according to the new dimensions. - new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) + new_callee_knl = callee_knl.copy(instructions=new_callee_insns) + + return new_callee_knl + + +class _FunctionCalledChecker(CombineMapper): + def __init__(self, func_name): + self.func_name = func_name + + def combine(self, values): + return any(values) + + def map_call(self, expr): + if expr.function.name == self.func_name: + return True + return self.combine( + tuple( + self.rec(child) for child in expr.parameters) + ) + + map_call_with_kwargs = map_call - new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) + def map_constant(self, expr): + return False - pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable + def map_algebraic_leaf(self, expr): + return False - if not pymbolic_calls_to_new_callables: - # complain if no matching function found. - raise LoopyError("No CallableKernel with the name %s found in %s." % ( - callee_function_name, caller_knl.name)) + def map_kernel(self, kernel): + return any(self.rec(insn.expression) for insn in kernel.instructions if + isinstance(insn, MultiAssignmentBase)) - return change_names_of_pymbolic_calls(caller_knl, - pymbolic_calls_to_new_callables) +def _match_caller_callee_argument_dimension_(program, callee_function_name): + """ + Returns a copy of *program* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *program* aligned with the argument + dimensions required by *caller_knl*. + + .. note:: -def _match_caller_callee_argument_dimension_(program, *args, **kwargs): + The callee kernel addressed by *callee_funciton_name*, should be + called only once. 
+ """ assert isinstance(program, Program) + assert isinstance(callee_function_name, str) - new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = ( - _match_caller_callee_argument_dimension_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, - *args, **kwargs)) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) + is_invoking_callee = _FunctionCalledChecker( + callee_function_name).map_kernel - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) + caller_knl, = [in_knl_callable.subkernel for in_knl_callable in + program.program_callables_info.values() if isinstance(in_knl_callable, + CallableKernel) and + is_invoking_callee(in_knl_callable.subkernel)] - new_resolved_functions[func_id] = in_knl_callable + old_callee_knl = program.program_callables_info[ + callee_function_name].subkernel + new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, old_callee_knl) - new_program_callables_info = program.program_callables_info.copy( - resolved_functions=new_resolved_functions) + new_program_callables_info = program.program_callables_info.copy() + new_program_callables_info.resolved_functions[callee_function_name] = ( + new_program_callables_info[callee_function_name].copy( + subkernel=new_callee_kernel)) return program.copy(program_callables_info=new_program_callables_info) # }}} diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py deleted file mode 100644 index 449a53f92..000000000 --- a/loopy/transform/register_callable.py +++ /dev/null @@ -1,312 +0,0 @@ -from __future__ import division, absolute_import - -__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-""" - -from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel -from pytools import ImmutableRecord -from loopy.diagnostic import LoopyError -from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, - CInstruction, _DataObliviousInstruction) -from loopy.symbolic import IdentityMapper -from loopy.isl_helpers import simplify_via_aff -from pymbolic.primitives import CallWithKwargs -from loopy.kernel.function_interface import (get_kw_pos_association, - register_pymbolic_calls_to_knl_callables) - - -__doc__ = """ -.. currentmodule:: loopy - -.. autofunction:: register_function_lookup - -.. autofunction:: register_callable_kernel -""" - - -# {{{ register function lookup - -def register_function_lookup(kernel, function_lookup): - """ - Returns a copy of *kernel* with the *function_lookup* registered. - - :arg function_lookup: A function of signature ``(target, identifier)`` - returning a :class:`loopy.kernel.function_interface.InKernelCallable`. - """ - - # adding the function lookup to the set of function lookers in the kernel. - if function_lookup not in kernel.function_scopers: - from loopy.tools import unpickles_equally - if not unpickles_equally(function_lookup): - raise LoopyError("function '%s' does not " - "compare equally after being upickled " - "and would disrupt loopy's caches" - % function_lookup) - new_function_scopers = kernel.function_scopers + [function_lookup] - registered_kernel = kernel.copy(function_scopers=new_function_scopers) - from loopy.kernel.creation import scope_functions - - # returning the scoped_version of the kernel, as new functions maybe - # resolved. - return scope_functions(registered_kernel) - -# }}} - - -# {{{ register_callable_kernel - -class _RegisterCalleeKernel(ImmutableRecord): - """ - Helper class to make the function scoper from - :func:`loopy.transform.register_callable_kernel` picklable. As python - cannot pickle lexical closures. - """ - fields = set(['function_name', 'callable_kernel']) - - def __init__(self, function_name, callable_kernel): - self.function_name = function_name - self.callable_kernel = callable_kernel - - def __call__(self, target, identifier): - if identifier == self.function_name: - return self.callable_kernel - return None - - -def register_callable_kernel(caller_kernel, function_name, callee_kernel): - """Returns a copy of *caller_kernel*, which would resolve *function_name* in an - expression as a call to *callee_kernel*. - - :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - :arg function_name: An instance of :class:`str`. - :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - """ - - # {{{ sanity checks - - assert isinstance(caller_kernel, LoopKernel) - assert isinstance(callee_kernel, LoopKernel) - assert isinstance(function_name, str) - - # check to make sure that the variables with 'out' direction is equal to - # the number of assigness in the callee kernel intructions. 
- from loopy.kernel.tools import infer_arg_is_output_only - callee_kernel = infer_arg_is_output_only(callee_kernel) - expected_num_assignees = len([arg for arg in callee_kernel.args if - arg.is_output_only]) - expected_num_parameters = len(callee_kernel.args) - expected_num_assignees - for insn in caller_kernel.instructions: - if isinstance(insn, CallInstruction) and ( - insn.expression.function.name == 'function_name'): - if insn.assignees != expected_num_assignees: - raise LoopyError("The number of arguments with 'out' direction " - "in callee kernel %s and the number of assignees in " - "instruction %s do not match." % ( - callee_kernel.name, insn.id)) - if insn.expression.prameters != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of parameters in " - "instruction %s do not match." % ( - callee_kernel.name, insn.id)) - - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("unknown instruction %s" % type(insn)) - - # }}} - - # making the target of the child kernel to be same as the target of parent - # kernel. - callable_kernel = CallableKernel(subkernel=callee_kernel.copy( - target=caller_kernel.target, - name=function_name, - is_called_from_host=False)) - - # disabling global barriers for callee kernel - from loopy import set_options - callee_kernel = set_options(callee_kernel, "disable_global_barriers") - - return register_function_lookup(caller_kernel, - _RegisterCalleeKernel(function_name, callable_kernel)) - -# }}} - - -# {{{ inline callable kernel - -def inline_callable_kernel(kernel, function_name): - """ - Returns a copy of *kernel* with the callable kernel addresed by - *function_name* inlined. - """ - from loopy.preprocess import infer_arg_descr - kernel = infer_arg_descr(kernel) - - old_insns = kernel.instructions - for insn in old_insns: - if isinstance(insn, CallInstruction): - if insn.expression.function.name in kernel.scoped_functions: - in_knl_callable = kernel.scoped_functions[ - insn.expression.function.name] - from loopy.kernel.function_interface import CallableKernel - if isinstance(in_knl_callable, CallableKernel) and ( - in_knl_callable.subkernel.name == function_name): - kernel = in_knl_callable.inline_within_kernel(kernel, insn) - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknown instruction %s." % type(insn)) - - return kernel - -# }}} - - -# {{{ matching caller to callee args if dimenstions dont match - -class DimChanger(IdentityMapper): - """ - Mapper to change the dimensions of an argument. - - .. attribute:: callee_arg_dict - - A mapping from the argument name (:class:`str`) to instances of - :class:`loopy.kernel.array.ArrayBase`. - - .. attribute:: desried_shape - - A mapping from argument name (:class:`str`) to an instance of - :class:`tuple`. 
- """ - def __init__(self, callee_arg_dict, desired_shape): - self.callee_arg_dict = callee_arg_dict - self.desired_shape = desired_shape - - def map_subscript(self, expr): - if expr.aggregate.name not in self.callee_arg_dict: - return super(DimChanger, self).map_subscript(expr) - callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags - flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in - zip(callee_arg_dim_tags, expr.index_tuple)) - new_indices = [] - - from operator import mul - from functools import reduce - stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) - - for length in self.desired_shape[expr.aggregate.name]: - stride /= length - ind = flattened_index // int(stride) - flattened_index -= (int(stride) * ind) - new_indices.append(simplify_via_aff(ind)) - - return expr.aggregate.index(tuple(new_indices)) - - -def _match_caller_callee_argument_dimension(caller_knl, callee_function_name): - """ - Returns a copy of *caller_knl* with the instance of - :class:`loopy.kernel.function_interface.CallableKernel` addressed by - *callee_function_name* in the *caller_knl* aligned with the argument - dimesnsions required by *caller_knl*. - """ - pymbolic_calls_to_new_callables = {} - for insn in caller_knl.instructions: - if not isinstance(insn, CallInstruction) or ( - insn.expression.function.name not in - caller_knl.scoped_functions): - # Call to a callable kernel can only occur through a - # CallInstruction. - continue - - in_knl_callable = caller_knl.scoped_functions[ - insn.expression.function.name] - - if in_knl_callable.subkernel.name != callee_function_name: - # Not the callable we're looking for. - continue - - # getting the caller->callee arg association - - parameters = insn.expression.parameters[:] - kw_parameters = {} - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - - assignees = insn.assignees - - parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape - for par in parameters] - kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel) - for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameter_shapes.append(kw_parameters[pos_to_kw[i]] - .get_array_arg_descriptor(caller_knl).shape) - - # inserting the assigness at the required positions. - assignee_write_count = -1 - for i, arg in enumerate(in_knl_callable.subkernel.args): - if arg.is_output_only: - assignee = assignees[-assignee_write_count-1] - parameter_shapes.insert(i, assignee - .get_array_arg_descriptor(caller_knl).shape) - assignee_write_count -= 1 - - callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in - in_knl_callable.subkernel.args], parameter_shapes)) - dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict, - callee_arg_to_desired_dim_tag) - new_callee_insns = [] - for callee_insn in in_knl_callable.subkernel.instructions: - if isinstance(callee_insn, MultiAssignmentBase): - new_callee_insns.append(callee_insn.copy(expression=dim_changer( - callee_insn.expression), - assignee=dim_changer(callee_insn.assignee))) - elif isinstance(callee_insn, (CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknwon instruction %s." % - type(insn)) - - # subkernel with instructions adjusted according to the new dimensions. 
- new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns) - - new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel) - - pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable - - if not pymbolic_calls_to_new_callables: - # complain if no matching function found. - raise LoopyError("No CallableKernel with the name %s found in %s." % ( - callee_function_name, caller_knl.name)) - - return register_pymbolic_calls_to_knl_callables(caller_knl, - pymbolic_calls_to_new_callables) - -# }}} - - -# vim: foldmethod=marker -- GitLab From 98688c76082c4c05a753946bbd5e8505194916f4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Nov 2018 07:46:42 -0600 Subject: [PATCH 400/916] should only change shapes for arguments --- loopy/transform/callable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 9a03147dd..433181385 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -584,6 +584,8 @@ class DimChanger(IdentityMapper): self.desired_shape = desired_shape def map_subscript(self, expr): + if expr.aggregate.name not in self.callee_arg_dict: + return super(DimChanger, self).map_subscript(expr) callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in zip(callee_arg_dim_tags, expr.index_tuple)) @@ -645,8 +647,7 @@ def _match_caller_callee_argument_dimension_for_single_kernel( callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in callee_knl.args], parameter_shapes)) dim_changer = DimChanger( - dict(callee_knl.arg_dict, **( - callee_knl.temporary_variables)), + callee_knl.arg_dict, callee_arg_to_desired_dim_tag) new_callee_insns = [] for callee_insn in callee_knl.instructions: -- GitLab From b2903df6c6227960e720ea35cff174df877d4dd7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Nov 2018 11:46:56 -0600 Subject: [PATCH 401/916] small typo, to re-enable making callee kernels --- loopy/kernel/creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 823fb1b3f..c79918736 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2182,7 +2182,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): - kwargs['is_callee_kernel'] = False + kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) # }}} -- GitLab From 95ee6fed7549c36dd421b8eb9fcd768d53a139a5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Nov 2018 12:19:34 -0600 Subject: [PATCH 402/916] made device preambles list back again --- loopy/codegen/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 00397906e..d8a7effcc 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -564,14 +564,14 @@ def generate_code_v2(program): if not in_knl_callable.subkernel.is_called_from_host: assert codegen_results[func_id].host_program is None - device_preambles = set() + device_preambles = [] for cgr in codegen_results.values(): - device_preambles.update(cgr.device_preambles) + device_preambles.extend(cgr.device_preambles) # collecting the function declarations of callee kernels for in_knl_callable in program.callables_table.values(): for preamble in in_knl_callable.generate_preambles(program.target): - 
device_preambles.update([preamble]) + device_preambles.append(preamble) collective_device_program = codegen_results[program.name].device_programs[0] callee_fdecls = [] -- GitLab From c12c610978b2b1ecab1a6b619f64315b241bfa0e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Nov 2018 12:45:04 -0600 Subject: [PATCH 403/916] Merge 'master' into 'new_function_interface' --- .gitlab-ci.yml | 19 ++++++++++- LICENSE | 21 ++++++++++++ .../make-linux-build-docker-inner-part-2.sh | 4 +++ loopy/frontend/fortran/tree.py | 2 +- loopy/kernel/tools.py | 4 +-- loopy/schedule/__init__.py | 10 ++++-- loopy/statistics.py | 20 ++++++++---- loopy/symbolic.py | 2 +- loopy/target/cuda.py | 2 +- loopy/target/pyopencl.py | 3 +- requirements.txt | 5 +-- setup.cfg | 2 +- test/test_loopy.py | 19 +++++++++++ test/test_numa_diff.py | 2 +- test/test_reduction.py | 32 +++++++++++-------- test/test_statistics.py | 14 +++++--- test/test_target.py | 17 ++++++++++ 17 files changed, 137 insertions(+), 41 deletions(-) create mode 100644 LICENSE diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1caef802b..ea69114d6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,6 +12,10 @@ Python 2.7 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 2.7 with legacy PyOpenCL: script: @@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL: except: - tags retry: 2 + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL: script: @@ -43,6 +51,10 @@ Python 3.6 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL Twice With Cache: script: @@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + # PyPy POCL: # script: @@ -77,7 +93,7 @@ Python 3.6 POCL Examples: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert" + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - ". ./build-py-project-and-run-examples.sh" tags: - python3.6 @@ -87,6 +103,7 @@ Python 3.6 POCL Examples: except: - tags + CentOS binary: script: - (cd build-helpers; ./make-linux-build-docker.sh --nodate) diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..601df74bd --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Andreas Klöckner and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
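
Aside on patch 402 above ("made device preambles list back again"): that change swaps a set for a list so that preambles stay in the order they are gathered from the caller kernel and its callee callables. The short, self-contained Python sketch below illustrates that order-preserving, de-duplicating collection pattern; the names (collect_preambles, the key/text pairs) are invented for the illustration, it is not loopy API and not part of any patch in this series, and the actual motivation in loopy may go beyond ordering alone.

    # Illustrative sketch only: gather (key, text) preamble pairs from several
    # sources, drop exact duplicates, but keep first-seen order so that a
    # stable sort on the key does not scramble caller/callee declarations.
    def collect_preambles(per_kernel_preambles):
        seen = set()
        ordered = []
        for preambles in per_kernel_preambles:
            for key, text in preambles:
                if (key, text) not in seen:
                    seen.add((key, text))
                    ordered.append((key, text))
        # stable sort by key; entries with equal keys keep gathering order
        return "\n".join(text for _, text in sorted(ordered, key=lambda p: p[0]))

    if __name__ == "__main__":
        caller_preambles = [("05_math", "#include <math.h>")]
        callee_preambles = [("00_decl", "void callee(double *x);"),
                            ("05_math", "#include <math.h>")]
        print(collect_preambles([caller_preambles, callee_preambles]))
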
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh index 1e35a1e1b..035634b16 100755 --- a/build-helpers/make-linux-build-docker-inner-part-2.sh +++ b/build-helpers/make-linux-build-docker-inner-part-2.sh @@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy cd loopy grep -v pyopencl requirements.txt > myreq.txt + +# needed for pyinstaller package to be usable +echo packaging >> myreq.txt + pip install -r myreq.txt python setup.py install diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index b1df6e3d0..6939bb6ad 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -53,7 +53,7 @@ class FTreeWalkerBase(object): ENTITY_RE = re.compile( r"^(?P[_0-9a-zA-Z]+)" - "(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") + r"(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 006ac6ba3..3aaa8d56a 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1253,7 +1253,7 @@ def draw_dependencies_as_unicode_arrows( for dep in insn.depends_on: reverse_deps.setdefault(dep, set()).add(insn.id) - # mapping of (from_id, to_id) tuples to column_index + # mapping of to_id tuples to column_index dep_to_column = {} # {{{ find column assignments @@ -1330,7 +1330,7 @@ def draw_dependencies_as_unicode_arrows( elif insn.id in starts: starts.remove(insn.id) - if starts: + if starts or pointed_at_insn_id not in processed_ids: # will continue downward row[col] = do_flag_downward(u"├", pointed_at_insn_id) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 2b3f7a3b9..3dc1c0bbe 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -794,9 +794,13 @@ def generate_loop_schedules_internal( if not is_ready: if debug_mode: - print("instruction '%s' is missing insn depedencies '%s'" % ( - format_insn(kernel, insn.id), ",".join( - insn.depends_on - sched_state.scheduled_insn_ids))) + # These are not that interesting when understanding scheduler + # failures. 
+ + # print("instruction '%s' is missing insn depedencies '%s'" % ( + # format_insn(kernel, insn.id), ",".join( + # insn.depends_on - sched_state.scheduled_insn_ids))) + pass continue want = kernel.insn_inames(insn) - sched_state.parallel_inames diff --git a/loopy/statistics.py b/loopy/statistics.py index d65387d16..454cca18e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -707,9 +707,10 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, callables_table): + def __init__(self, knl, callables_table, count_within_subscripts=True): self.knl = knl self.callables_table = callables_table + self.count_within_subscripts = count_within_subscripts from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl, callables_table) @@ -737,7 +738,10 @@ class ExpressionOpCounter(CounterBase): ) + self.rec(expr.parameters) def map_subscript(self, expr): - return self.rec(expr.index) + if self.count_within_subscripts: + return self.rec(expr.index) + else: + return ToCountMap() def map_sum(self, expr): assert expr.children @@ -1343,10 +1347,9 @@ def _get_insn_count(knl, callables_table, insn_id, subgroup_size, # {{{ get_op_map - def get_op_map_for_single_kernel(knl, callables_table, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + count_within_subscripts=True, subgroup_size=None): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " @@ -1394,7 +1397,7 @@ def get_op_map_for_single_kernel(knl, callables_table, def get_op_map(program, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + count_within_subscripts=True, subgroup_size=None): """Count the number of operations in a loopy kernel. @@ -1410,6 +1413,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) + :arg count_within_subscripts: A :class:`bool` specifying whether to + count operations inside array indices. + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. 
An OpenCL sub-group is an implementation-dependent grouping of work-items within @@ -1464,8 +1470,8 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, if isinstance(in_knl_callable, CallableKernel): knl = in_knl_callable.subkernel knl_op_map = get_op_map_for_single_kernel(knl, - program.callables_table, numpy_types, - count_redundant_work, subgroup_size) + program.callables_table, numpy_types, count_redundant_work, + count_within_subscripts, subgroup_size) for i in range(callables_count[func_id]): op_map += knl_op_map diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 92b209ac9..04cf2d02b 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1696,7 +1696,7 @@ def get_access_range(domain, subscript, assumptions, shape=None, if shape is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) - except ExpressionToAffineConversionError as sub_err: + except ExpressionToAffineConversionError: pass if shape_aff is None: diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 32b810eb3..6b4385bff 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -344,7 +344,7 @@ class CUDACASTBuilder(CASTBuilder): _VEC_AXES = "xyzw" def add_vector_access(self, access_expr, index): - return access_expr.a(self._VEC_AXES[index]) + return access_expr.attr(self._VEC_AXES[index]) def emit_barrier(self, synchronization_kind, mem_kind, comment): """ diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index d98b6cdd6..5ef564572 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -125,7 +125,8 @@ def adjust_local_temp_var_storage(kernel, device): new_storage_shape = storage_shape - new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape) + new_temp_vars[temp_var.name] = temp_var.copy( + storage_shape=tuple(new_storage_shape)) return kernel.copy(temporary_variables=new_temp_vars) diff --git a/requirements.txt b/requirements.txt index a3e88cfea..97c202476 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git git+https://github.com/inducer/f2py # Optional, needed for using the C preprocessor on Fortran -ply>=3.6 - -# This is needed for the pyinstaller executable to be usable. 
-packaging +ply>=3.6 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index b939ce0cf..eec3dfd1f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504 max-line-length=85 exclude= loopy/target/c/compyte/ndarray, diff --git a/test/test_loopy.py b/test/test_loopy.py index fa32ca04c..b770497f1 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2890,6 +2890,25 @@ def test_dep_cycle_printing_and_error(): print(lp.generate_code(knl).device_code()) +def test_backwards_dep_printing_and_error(): + knl = lp.make_kernel( + "{[i]: 0<=i 1: exec(sys.argv[1]) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 62f490cee..1ba44e77e 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -47,8 +47,8 @@ __all__ = [ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa -@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("ilp_multiple", [1, 2]) +@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("opt_level", [11]) def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa ctx = ctx_factory() diff --git a/test/test_reduction.py b/test/test_reduction.py index 96dab405a..aaf11ee29 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -219,32 +219,38 @@ def test_local_parallel_reduction(ctx_factory, size): def test_global_parallel_reduction(ctx_factory, size): ctx = ctx_factory() - prog = lp.make_kernel( + knl = lp.make_kernel( "{[i]: 0 <= i < n }", """ # Using z[0] instead of z works around a bug in ancient PyOpenCL. - z[0] = sum(i, i/13) + z[0] = sum(i, a[i]) """) - ref_prog = prog + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + ref_knl = knl gsize = 128 - prog = lp.split_iname(prog, "i", gsize * 20) - prog = lp.split_iname(prog, "i_inner", gsize, outer_tag="l.0") - prog = lp.split_reduction_inward(prog, "i_inner_inner") - prog = lp.split_reduction_inward(prog, "i_inner_outer") + knl = lp.split_iname(knl, "i", gsize * 20) + knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0") + knl = lp.split_reduction_outward(knl, "i_outer") + knl = lp.split_reduction_inward(knl, "i_inner_outer") from loopy.transform.data import reduction_arg_to_subst_rule - prog = reduction_arg_to_subst_rule(prog, "i_outer") - prog = lp.precompute(prog, "red_i_outer_arg", "i_outer", + knl = reduction_arg_to_subst_rule(knl, "i_outer") + + knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", temporary_scope=lp.temp_var_scope.GLOBAL, default_tag="l.auto") - prog = lp.realize_reduction(prog) - prog = lp.add_dependency( - prog, "writes:acc_i_outer", + knl = lp.realize_reduction(knl) + knl = lp.tag_inames(knl, "i_outer_0:g.0") + + # Keep the i_outer accumulator on the correct (lower) side of the barrier, + # otherwise there will be useless save/reload code generated. 
+ knl = lp.add_dependency( + knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") lp.auto_test_vs_ref( - ref_prog, ctx, prog, parameters={"n": size}, + ref_knl, ctx, knl, parameters={"n": size}, print_ref_code=True) diff --git a/test/test_statistics.py b/test/test_statistics.py index 3f2366521..41b44b5a7 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -57,7 +57,8 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -161,7 +162,8 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -206,7 +208,8 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=False) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -226,7 +229,7 @@ def test_op_counter_bitwise(): i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups - assert i32add == n*m+n*m*ell*n_subgroups + assert i32add == n*m*ell*n_subgroups assert i32bw == 2*n*m*ell*n_subgroups assert i64bw == 2*n*m*n_subgroups assert i64add == i64mul == n*m*n_subgroups @@ -1153,7 +1156,8 @@ def test_summations_and_filters(): assert f32lall == (3*n*m*ell)*n_subgroups assert f64lall == (2*n*m)*n_subgroups - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) diff --git a/test/test_target.py b/test/test_target.py index a5186c71c..095bf0939 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -350,6 +350,23 @@ def test_ispc_streaming_stores(): lp.generate_code_v2(knl).all_code() +def test_cuda_short_vector(): + knl = lp.make_kernel( + "{ [i]: 0<=i 1: exec(sys.argv[1]) -- GitLab From cb151a4bdae8a1a9643ce6a6c93da80e5b5e56de Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Nov 2018 13:23:59 -0600 Subject: [PATCH 404/916] another one of ArrayBase erros --- loopy/kernel/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 6bf733a84..0ed1f9401 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -834,6 +834,7 @@ class ArrayBase(ImmutableRecord): order=order, alignment=alignment, for_atomic=for_atomic, + target=target, **kwargs) def __eq__(self, other): -- GitLab From 46e9d2ea885a817ba619b5da4dce64d8ef6b156c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Nov 2018 05:04:20 -0600 Subject: [PATCH 405/916] Handle scalar shapes correctly. 
--- loopy/transform/callable.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 433181385..dbda5d74f 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -628,11 +628,20 @@ def _match_caller_callee_argument_dimension_for_single_kernel( assignees = insn.assignees - parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape + def _shape_1_if_empty(shape): + assert isinstance(shape, tuple) + if shape == (): + return (1, ) + else: + return shape + + parameter_shapes = [ + _shape_1_if_empty( + par.get_array_arg_descriptor(caller_knl).shape) for par in parameters] kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameter_shapes.append(kw_parameters[pos_to_kw[i]] + parameter_shapes.append(_shape_1_if_empty(kw_parameters[pos_to_kw[i]]) .get_array_arg_descriptor(caller_knl).shape) # inserting the assignees at the required positions. @@ -640,8 +649,8 @@ def _match_caller_callee_argument_dimension_for_single_kernel( for i, arg in enumerate(callee_knl.args): if arg.is_output_only: assignee = assignees[-assignee_write_count-1] - parameter_shapes.insert(i, assignee - .get_array_arg_descriptor(caller_knl).shape) + parameter_shapes.insert(i, _shape_1_if_empty(assignee + .get_array_arg_descriptor(caller_knl).shape)) assignee_write_count -= 1 callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in @@ -655,6 +664,7 @@ def _match_caller_callee_argument_dimension_for_single_kernel( new_callee_insns.append(callee_insn.copy(expression=dim_changer( callee_insn.expression), assignee=dim_changer(callee_insn.assignee))) + elif isinstance(callee_insn, (CInstruction, _DataObliviousInstruction)): pass -- GitLab From a385bd0632e26896a55978e4064a145fbf24a93b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Nov 2018 05:27:09 -0600 Subject: [PATCH 406/916] import changes from statistics to count within subscripts --- loopy/statistics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 454cca18e..88aa49bb0 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1358,7 +1358,8 @@ def get_op_map_for_single_kernel(knl, callables_table, subgroup_size = _process_subgroup_size(knl, subgroup_size) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl, callables_table) + op_counter = ExpressionOpCounter(knl, callables_table, + count_within_subscripts) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, -- GitLab From dc0f57d8bb1fee4ed9fd4a7f6ccb39dc9a81d502 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Nov 2018 09:06:27 -0600 Subject: [PATCH 407/916] Some more merge leftovers from new_function_interface --- loopy/kernel/__init__.py | 67 ++++++++++++++++++++++++++++++++----- loopy/kernel/creation.py | 7 +++- loopy/transform/callable.py | 64 ++++++++++++++++++----------------- 3 files changed, 97 insertions(+), 41 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 928eed265..26db6ec4e 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1036,20 +1036,17 @@ class LoopKernel(ImmutableRecordWithoutPickling): constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, - ignore_auto=False): + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, + callables_table, ignore_auto=False): 
"""Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. + :arg insn_ids: a :class:`frozenset` of instruction IDs - *global_size* and *local_size* are :class:`islpy.PwAff` objects. - """ - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - callables_table, - ignore_auto=ignore_auto) + *global_size* and *local_size* are instances of :class:`dict` with + mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. + """ # {{{ collecting the callee kernels in insn_ids @@ -1124,6 +1121,58 @@ class LoopKernel(ImmutableRecordWithoutPickling): return global_sizes, local_sizes @memoize_method + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, + ignore_auto=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. + """ + + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + callables_table=callables_table, + ignore_auto=ignore_auto) + + assert self.is_called_from_host, ("Callee kernels do not have sufficient " + "information to compute grid sizes.") + + global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( + insn_ids, callables_table, ignore_auto=ignore_auto) + + def to_dim_tuple(size_dict, which, forced_sizes={}): + forced_sizes = forced_sizes.copy() + + size_list = [] + sorted_axes = sorted(six.iterkeys(size_dict)) + + while sorted_axes or forced_sizes: + if sorted_axes: + cur_axis = sorted_axes.pop(0) + else: + cur_axis = None + + if len(size_list) in forced_sizes: + size_list.append(forced_sizes.pop(len(size_list))) + continue + + assert cur_axis is not None + + if cur_axis > len(size_list): + raise LoopyError("%s axis %d unused for %s" % ( + which, len(size_list), self.name)) + + size_list.append(size_dict[cur_axis]) + + return tuple(size_list) + + return (to_dim_tuple(global_sizes, "global"), + to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 060b5d766..52e299b61 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2146,7 +2146,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - if is_callee_kernel: + if not is_callee_kernel: from loopy.version import LANGUAGE_VERSION_SYMBOLS version_to_symbol = dict( @@ -2353,6 +2353,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): + lang_version = kwargs.pop('lang_version', None) + if lang_version: + raise LoopyError("lang_version should be set for program, not " + "functions.") + kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 532f60212..e293543f1 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -173,7 +173,7 @@ def register_callable_kernel(program, callee_kernel): expected_num_assignees = len([arg for arg in callee_kernel.args if arg.is_output_only]) 
expected_num_parameters = len(callee_kernel.args) - expected_num_assignees - for in_knl_callable in program.program_callables_info.values(): + for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel for insn in caller_kernel.instructions: @@ -211,8 +211,9 @@ def register_callable_kernel(program, callee_kernel): # take the function resolvers from the Program and resolve the functions in # the callee kernel - program_callables_info = ( - program.program_callables_info.with_edit_callables_mode()) + old_callables_count = program.callables_table.callables_count + callables_table = ( + program.callables_table.with_edit_callables_mode()) from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( @@ -220,16 +221,17 @@ def register_callable_kernel(program, callee_kernel): callee_kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, callee_kernel, program_callables_info, + rule_mapping_context, callee_kernel, callables_table, program.func_id_to_in_knl_callable_mappers) callee_kernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(callee_kernel)) - program_callables_info = resolved_function_marker.program_callables_info + callables_table = resolved_function_marker.callables_table - program_callables_info = ( - program_callables_info.with_exit_edit_callables_mode()) - program = program.copy(program_callables_info=program_callables_info) + callables_table = ( + callables_table.with_exit_edit_callables_mode( + old_callables_count)) + program = program.copy(callables_table=callables_table) # making the target of the child kernel to be same as the target of parent # kernel. @@ -492,26 +494,26 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): # {{{ inline callable kernel def _inline_single_callable_kernel(caller_kernel, function_name, - program_callables_info): + callables_table): old_insns = caller_kernel.instructions for insn in old_insns: if isinstance(insn, CallInstruction): # FIXME This seems to use identifiers across namespaces. Why not # check whether the function is a scoped function first? 
~AK - if insn.expression.function.name in program_callables_info: - history_of_identifier = program_callables_info.history[ + if insn.expression.function.name in callables_table: + history_of_identifier = callables_table.history[ insn.expression.function.name] if function_name in history_of_identifier: - in_knl_callable = program_callables_info[ + in_knl_callable = callables_table[ insn.expression.function.name] assert isinstance(in_knl_callable, CallableKernel) caller_kernel = _inline_call_instruction( caller_kernel, in_knl_callable.subkernel, insn) - program_callables_info = ( - program_callables_info.with_deleted_callable( + callables_table = ( + callables_table.with_deleted_callable( insn.expression.function.name, - program_callables_info.num_times_callables_called[ + callables_table.num_times_callables_called[ caller_kernel.name])) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): @@ -521,7 +523,7 @@ def _inline_single_callable_kernel(caller_kernel, function_name, "Unknown instruction type %s" % type(insn).__name__) - return caller_kernel, program_callables_info + return caller_kernel, callables_table # FIXME This should take a 'within' parameter to be able to only inline @@ -533,33 +535,33 @@ def inline_callable_kernel(program, function_name): """ from loopy.preprocess import infer_arg_descr program = infer_arg_descr(program) - program_callables_info = program.program_callables_info - old_program_callables_info = program_callables_info.copy() + callables_table = program.callables_table + old_callables_table = callables_table.copy() edited_callable_kernels = {} - for func_id, in_knl_callable in old_program_callables_info.items(): - if function_name not in old_program_callables_info.history[func_id] and ( + for func_id, in_knl_callable in old_callables_table.items(): + if function_name not in old_callables_table.history[func_id] and ( isinstance(in_knl_callable, CallableKernel)): caller_kernel = in_knl_callable.subkernel - caller_kernel, program_callables_info = ( + caller_kernel, callables_table = ( _inline_single_callable_kernel(caller_kernel, function_name, - program_callables_info)) + callables_table)) edited_callable_kernels[func_id] = in_knl_callable.copy( subkernel=caller_kernel) new_resolved_functions = {} - for func_id, in_knl_callable in program_callables_info.items(): + for func_id, in_knl_callable in callables_table.items(): if func_id in edited_callable_kernels: new_resolved_functions[func_id] = edited_callable_kernels[func_id] else: new_resolved_functions[func_id] = in_knl_callable - program_callables_info = program_callables_info.copy( + callables_table = callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=program_callables_info) + return program.copy(callables_table=callables_table) # }}} @@ -719,20 +721,20 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name): callee_function_name).map_kernel caller_knl, = [in_knl_callable.subkernel for in_knl_callable in - program.program_callables_info.values() if isinstance(in_knl_callable, + program.callables_table.values() if isinstance(in_knl_callable, CallableKernel) and is_invoking_callee(in_knl_callable.subkernel)] - old_callee_knl = program.program_callables_info[ + old_callee_knl = program.callables_table[ callee_function_name].subkernel new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel( caller_knl, old_callee_knl) - new_program_callables_info = 
program.program_callables_info.copy() - new_program_callables_info.resolved_functions[callee_function_name] = ( - new_program_callables_info[callee_function_name].copy( + new_callables_table = program.callables_table.copy() + new_callables_table.resolved_functions[callee_function_name] = ( + new_callables_table[callee_function_name].copy( subkernel=new_callee_kernel)) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # }}} -- GitLab From 20371326ee0fad5ad62217231bb35e7aa65fe11b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Nov 2018 10:03:36 -0600 Subject: [PATCH 408/916] some more program_callables_info -> callables_table --- loopy/transform/callable.py | 46 ++++++++++++------------- loopy/transform/pack_and_unpack_args.py | 14 ++++---- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index e293543f1..f812b8ea2 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -31,7 +31,7 @@ from loopy.kernel import LoopKernel from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, - CInstruction, _DataObliviousInstruction) + Assignment, CInstruction, _DataObliviousInstruction) from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import (get_kw_pos_association, @@ -211,26 +211,19 @@ def register_callable_kernel(program, callee_kernel): # take the function resolvers from the Program and resolve the functions in # the callee kernel - old_callables_count = program.callables_table.callables_count - callables_table = ( - program.callables_table.with_edit_callables_mode()) - from loopy.symbolic import SubstitutionRuleMappingContext rule_mapping_context = SubstitutionRuleMappingContext( callee_kernel.substitutions, callee_kernel.get_var_name_generator()) resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, callee_kernel, callables_table, + rule_mapping_context, callee_kernel, program.callables_table, program.func_id_to_in_knl_callable_mappers) callee_kernel = rule_mapping_context.finish_kernel( resolved_function_marker.map_kernel(callee_kernel)) - callables_table = resolved_function_marker.callables_table + callables_table = resolved_function_marker.callables_table.copy() - callables_table = ( - callables_table.with_exit_edit_callables_mode( - old_callables_count)) program = program.copy(callables_table=callables_table) # making the target of the child kernel to be same as the target of parent @@ -462,15 +455,25 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) for atomicity in insn.atomicity) - insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on, - tags=insn.tags | instruction.tags, - atomicity=new_atomicity - ) + if isinstance(insn, Assignment): + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + # TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on, + tags=insn.tags | instruction.tags, + atomicity=new_atomicity + ) + else: + insn = insn.copy( + id=insn_id[insn.id], + within_inames=within_inames, + 
# TODO: probaby need to keep priority in callee kernel + priority=instruction.priority, + depends_on=depends_on, + tags=insn.tags | instruction.tags, + ) inner_insns.append(insn) inner_insns.append(noop_end) @@ -510,11 +513,6 @@ def _inline_single_callable_kernel(caller_kernel, function_name, assert isinstance(in_knl_callable, CallableKernel) caller_kernel = _inline_call_instruction( caller_kernel, in_knl_callable.subkernel, insn) - callables_table = ( - callables_table.with_deleted_callable( - insn.expression.function.name, - callables_table.num_times_callables_called[ - caller_kernel.name])) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 734072574..e5ed850c6 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -37,7 +37,7 @@ __doc__ = """ def pack_and_unpack_args_for_call_for_single_kernel(kernel, - program_callables_info, call_name, args_to_pack=None, + callables_table, call_name, args_to_pack=None, args_to_unpack=None): """ Returns a a copy of *kernel* with instructions appended to copy the @@ -63,10 +63,10 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, if not isinstance(insn, CallInstruction): # pack and unpack call only be done for CallInstructions. continue - if insn.expression.function.name not in program_callables_info: + if insn.expression.function.name not in callables_table: continue - in_knl_callable = program_callables_info[ + in_knl_callable = callables_table[ insn.expression.function.name] if in_knl_callable.name != call_name: @@ -324,10 +324,10 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs): assert isinstance(program, Program) new_resolved_functions = {} - for func_id, in_knl_callable in program.program_callables_info.items(): + for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = pack_and_unpack_args_for_call_for_single_kernel( - in_knl_callable.subkernel, program.program_callables_info, + in_knl_callable.subkernel, program.callables_table, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) @@ -340,8 +340,8 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs): new_resolved_functions[func_id] = in_knl_callable - new_program_callables_info = program.program_callables_info.copy( + new_callables_table = program.callables_table.copy( resolved_functions=new_resolved_functions) - return program.copy(program_callables_info=new_program_callables_info) + return program.copy(callables_table=new_callables_table) # vim: foldmethod=marker -- GitLab From 600f9d1bdcf3f9f46fb7a56cd9c5fc00ce84a555 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Nov 2018 10:42:01 -0600 Subject: [PATCH 409/916] re-adds some missing checks --- loopy/check.py | 4 ++-- loopy/target/c/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 82b99a439..659e210fc 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -729,8 +729,8 @@ def pre_schedule_checks(kernel, callables_table): check_for_data_dependent_parallel_bounds(kernel) check_bounds(kernel) check_write_destinations(kernel) - # check_has_schedulable_iname_nesting(kernel) - # check_variable_access_ordered(kernel) + check_has_schedulable_iname_nesting(kernel) + check_variable_access_ordered(kernel) logger.debug("%s: pre-schedule check: done" % 
kernel.name) except KeyboardInterrupt: diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index ca4d6b00d..ac3dec32e 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -435,7 +435,7 @@ class CMathCallable(ScalarCallable): if dtype.kind == "c": raise LoopyTypeError("%s does not support complex numbers") - elif dtype.kind == "f" and name in ["fmax", "fmin"]: + elif dtype.kind == "f" or name in ["fmax", "fmin"]: from loopy.target.opencl import OpenCLTarget if not isinstance(caller_kernel.target, OpenCLTarget): if dtype == np.float64: -- GitLab From 1d48377532bc8092bbc613fa09a63f166047ef10 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Nov 2018 04:17:28 -0600 Subject: [PATCH 410/916] reverted the changes in type inference --- loopy/target/c/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index ac3dec32e..58051e42f 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -435,7 +435,7 @@ class CMathCallable(ScalarCallable): if dtype.kind == "c": raise LoopyTypeError("%s does not support complex numbers") - elif dtype.kind == "f" or name in ["fmax", "fmin"]: + elif dtype.kind == "f": from loopy.target.opencl import OpenCLTarget if not isinstance(caller_kernel.target, OpenCLTarget): if dtype == np.float64: -- GitLab From a840eed1fed2dd3f0ba636f7f2cd9ae446d55531 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Nov 2018 05:55:49 -0600 Subject: [PATCH 411/916] minor changes to relax type inference --- loopy/statistics.py | 5 +++++ loopy/type_inference.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 965c164e5..c621ea727 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -34,6 +34,8 @@ from loopy.kernel.data import ( from loopy.diagnostic import warn_with_kernel, LoopyError from pytools import Record, memoize_method from loopy.kernel.function_interface import ScalarCallable, CallableKernel +from loopy.kernel import LoopKernel +from loopy.program import make_program __doc__ = """ @@ -1458,6 +1460,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, """ + if isinstance(program, LoopKernel): + program = make_program(program) + from loopy.preprocess import preprocess_program, infer_unknown_types program = infer_unknown_types(program, expect_completion=True) program = preprocess_program(program) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 4137709e2..5047dcc27 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -457,6 +457,10 @@ class TypeInferenceMapper(CombineMapper): np.int64): continue + if np.can_cast(arg_id_to_dtype[id].dtype.type, + in_knl_callable.arg_id_to_dtype[id].dtype.type): + continue + # }}} raise LoopyError("Overwriting a specialized function " -- GitLab From 237b7ef44125410dd3d7a23f75fa3a838331e560 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Nov 2018 06:04:25 -0600 Subject: [PATCH 412/916] some more leftover program_callables_info -> callables_table --- examples/python/call-external.py | 6 +++--- loopy/kernel/function_interface.py | 16 ++++++++-------- loopy/kernel/tools.py | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/python/call-external.py b/examples/python/call-external.py index 68618a7ec..c13d99bd0 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -7,14 +7,14 @@ from loopy.target.c import 
CTarget # {{{ blas callable class BLASCallable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, program_callables_info): + def with_types(self, arg_id_to_dtype, kernel, callables_table): for i in range(0, 2): if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: # the types provided aren't mature enough to specialize the # callable return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), - program_callables_info) + callables_table) mat_dtype = arg_id_to_dtype[0].numpy_dtype vec_dtype = arg_id_to_dtype[1].numpy_dtype @@ -34,7 +34,7 @@ class BLASCallable(lp.ScalarCallable): from loopy.types import NumpyType return self.copy(name_in_target=name_in_target, arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), - -1: NumpyType(vec_dtype)}), program_callables_info + -1: NumpyType(vec_dtype)}), callables_table def emit_call_insn(self, insn, target, expression_to_code_mapper): assert self.is_ready_for_codegen() diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index fa7a87fec..3e628f5c9 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -532,7 +532,7 @@ class CallableKernel(InKernelCallable): return self.subkernel.name def with_types(self, arg_id_to_dtype, caller_kernel, - program_callables_info): + callables_table): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) new_args = [] @@ -555,10 +555,10 @@ class CallableKernel(InKernelCallable): # infer the types of the written variables based on the knowledge # of the types of the arguments supplied - specialized_kernel, program_callables_info = ( + specialized_kernel, callables_table = ( infer_unknown_types_for_a_single_kernel( pre_specialized_subkernel, - program_callables_info, + callables_table, expect_completion=True)) new_arg_id_to_dtype = {} @@ -571,9 +571,9 @@ class CallableKernel(InKernelCallable): # Return the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype return self.copy(subkernel=specialized_kernel, - arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info + arg_id_to_dtype=new_arg_id_to_dtype), callables_table - def with_descrs(self, arg_id_to_descr, program_callables_info): + def with_descrs(self, arg_id_to_descr, callables_table): # tune the subkernel so that we have the matching shapes and # dim_tags @@ -602,15 +602,15 @@ class CallableKernel(InKernelCallable): type(descr)) descriptor_specialized_knl = self.subkernel.copy(args=new_args) from loopy.preprocess import traverse_to_infer_arg_descr - descriptor_specialized_knl, program_callables_info = ( + descriptor_specialized_knl, callables_table = ( traverse_to_infer_arg_descr(descriptor_specialized_knl, - program_callables_info)) + callables_table)) return ( self.copy( subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr), - program_callables_info) + callables_table) def with_packing_for_args(self): from loopy.kernel.data import AddressSpace diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 125577c9a..26856d64f 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1866,7 +1866,7 @@ def find_aliasing_equivalence_classes(kernel): # {{{ callee kernel tools -def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): +def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,): """ Returns an instance of :class:`frozenset` of all the callee kernels called in instructions in the *kernel* whose IDs are given in *insn_ids*. 
@@ -1892,8 +1892,8 @@ def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,): from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, CInstruction, _DataObliviousInstruction) if isinstance(insn, CallInstruction): - if insn.expression.function.name in program_callables_info: - in_knl_callable = program_callables_info[ + if insn.expression.function.name in callables_table: + in_knl_callable = callables_table[ insn.expression.function.name] if isinstance(in_knl_callable, CallableKernel): return in_knl_callable.subkernel -- GitLab From 608ac4016fdba92e87a7df384560dac9d2979eb4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Nov 2018 06:29:06 -0600 Subject: [PATCH 413/916] ArrayArg->GlobalArg --- doc/tutorial.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index c134e4fb7..25082f88a 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1145,7 +1145,7 @@ the right by 1 in parallel: ... end ... """, ... [ - ... lp.ArrayArg("arr", shape=("n",), dtype=np.int32), + ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), ... "...", ... ], ... name="rotate_v1", @@ -1189,7 +1189,7 @@ Let us start with an example. Consider the kernel from above with a ... end ... """, ... [ - ... lp.ArrayArg("arr", shape=("n",), dtype=np.int32), + ... lp.GlobalArg("arr", shape=("n",), dtype=np.int32), ... "...", ... ], ... name="rotate_v2", @@ -1323,8 +1323,8 @@ tagged, as in the following example:: "{ [i]: 0<=i Date: Thu, 22 Nov 2018 18:00:34 +0000 Subject: [PATCH 414/916] increase recursion limit for checking variable ordered access --- loopy/check.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/loopy/check.py b/loopy/check.py index 659e210fc..bbf314626 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -696,6 +696,13 @@ def check_variable_access_ordered(kernel): "'enforce_variable_access_ordered': %s" % kernel.options.enforce_variable_access_ordered) + import sys + + if len(kernel.instructions) > 200: + pre_recursion_limit = sys.getrecursionlimit() + if pre_recursion_limit < 2000: + sys.setrecursionlimit(2000) + if kernel.options.enforce_variable_access_ordered == "no_check": return @@ -709,6 +716,9 @@ def check_variable_access_ordered(kernel): from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "variable_access_ordered", str(e)) + if len(kernel.instructions) > 200: + sys.setrecursionlimit(pre_recursion_limit) + # }}} # }}} -- GitLab From 5acbf7d503cd0b8883e6b48796d3da501568de99 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Nov 2018 12:26:21 -0600 Subject: [PATCH 415/916] add a temporary soln for recursion error --- loopy/check.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index bbf314626..8f6219827 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -696,13 +696,6 @@ def check_variable_access_ordered(kernel): "'enforce_variable_access_ordered': %s" % kernel.options.enforce_variable_access_ordered) - import sys - - if len(kernel.instructions) > 200: - pre_recursion_limit = sys.getrecursionlimit() - if pre_recursion_limit < 2000: - sys.setrecursionlimit(2000) - if kernel.options.enforce_variable_access_ordered == "no_check": return @@ -715,9 +708,9 @@ def check_variable_access_ordered(kernel): except VariableAccessNotOrdered as e: from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "variable_access_ordered", str(e)) - - if len(kernel.instructions) > 200: - 
sys.setrecursionlimit(pre_recursion_limit) + except RecursionError as e: + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) # }}} -- GitLab From bfa74bda00834e409e633e18d1649349da3c4994 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Thu, 22 Nov 2018 18:41:35 +0000 Subject: [PATCH 416/916] catch recursion limit error --- loopy/check.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index 8f6219827..fcdfd793b 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -700,7 +700,11 @@ def check_variable_access_ordered(kernel): return if kernel.options.enforce_variable_access_ordered: - _check_variable_access_ordered_inner(kernel) + try: + _check_variable_access_ordered_inner(kernel) + except RecursionError as e: + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) else: from loopy.diagnostic import VariableAccessNotOrdered try: -- GitLab From bc0721089bf3b8dfeae0455069d02d8a987ace1d Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 23 Nov 2018 14:06:20 +0000 Subject: [PATCH 417/916] return a frozenset for insn_inames --- loopy/statistics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index c621ea727..ab792012d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1270,9 +1270,9 @@ def count_insn_runs(knl, callables_table, insn, count_redundant_work, if disregard_local_axes: from loopy.kernel.data import LocalIndexTag - insn_inames = [iname - for iname in insn_inames - if not knl.iname_tags_of_type(iname, LocalIndexTag)] + insn_inames = frozenset([iname + for iname in insn_inames + if not knl.iname_tags_of_type(iname, LocalIndexTag)]) inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except( -- GitLab From 987c10904485b048b76cf50dedbebe23c874aef6 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Fri, 23 Nov 2018 14:54:32 +0000 Subject: [PATCH 418/916] implement recursion error exception to satisfy python2 --- loopy/check.py | 14 ++++++++------ loopy/statistics.py | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index fcdfd793b..4e84d7e23 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -702,9 +702,10 @@ def check_variable_access_ordered(kernel): if kernel.options.enforce_variable_access_ordered: try: _check_variable_access_ordered_inner(kernel) - except RecursionError as e: - from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) + except RuntimeError as e: + if e.args[0] != 'maximum recursion depth exceeded': + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) else: from loopy.diagnostic import VariableAccessNotOrdered try: @@ -712,9 +713,10 @@ def check_variable_access_ordered(kernel): except VariableAccessNotOrdered as e: from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "variable_access_ordered", str(e)) - except RecursionError as e: - from loopy.diagnostic import warn_with_kernel - warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) + except RuntimeError as e: + if e.args[0] != 'maximum recursion depth exceeded': + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) # }}} diff --git a/loopy/statistics.py 
b/loopy/statistics.py index ab792012d..6e152a44b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1270,9 +1270,9 @@ def count_insn_runs(knl, callables_table, insn, count_redundant_work, if disregard_local_axes: from loopy.kernel.data import LocalIndexTag - insn_inames = frozenset([iname - for iname in insn_inames - if not knl.iname_tags_of_type(iname, LocalIndexTag)]) + insn_inames = frozenset( + [iname for iname in insn_inames + if not knl.iname_tags_of_type(iname, LocalIndexTag)]) inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except( -- GitLab From 4d596836d12e383740a8824c5df99302e0d4283f Mon Sep 17 00:00:00 2001 From: tj-sun Date: Mon, 3 Dec 2018 12:18:30 +0000 Subject: [PATCH 419/916] handles runtime error correctly --- loopy/check.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 4e84d7e23..884eb5ddd 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -703,9 +703,11 @@ def check_variable_access_ordered(kernel): try: _check_variable_access_ordered_inner(kernel) except RuntimeError as e: - if e.args[0] != 'maximum recursion depth exceeded': + if isinstance(e.args[0], str) and e.args[0].startswith('maximum recursion depth exceeded'): from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) + else: + raise e else: from loopy.diagnostic import VariableAccessNotOrdered try: @@ -714,9 +716,11 @@ def check_variable_access_ordered(kernel): from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "variable_access_ordered", str(e)) except RuntimeError as e: - if e.args[0] != 'maximum recursion depth exceeded': + if isinstance(e.args[0], str) and e.args[0].startswith('maximum recursion depth exceeded'): from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) + else: + raise e # }}} -- GitLab From 632b56956211e12ea6c27f2b146788c001c2afa9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 6 Dec 2018 18:25:30 -0600 Subject: [PATCH 420/916] fixes small wrinkle in type inference --- loopy/type_inference.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 5047dcc27..c305e483e 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -468,7 +468,6 @@ class TypeInferenceMapper(CombineMapper): "InKernelCallable?") # }}} - in_knl_callable, self.callables_table = ( in_knl_callable.with_types( arg_id_to_dtype, self.kernel, @@ -877,11 +876,13 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, item = item_lookup[name] debug("inferring type for %s %s", type(item).__name__, item.name) - - (result, symbols_with_unavailable_types, - new_old_calls_to_new_calls, callables_table) = ( - _infer_var_type( - kernel, item.name, type_inf_mapper, subst_expander)) + try: + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, callables_table) = ( + _infer_var_type( + kernel, item.name, type_inf_mapper, subst_expander)) + except DependencyTypeInferenceFailure: + result = tuple() type_inf_mapper = type_inf_mapper.copy( callables_table=callables_table) -- GitLab From 8424bfe7b9c4cb55d660d83adf85a65f8ae50a63 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 6 Dec 2018 18:29:09 -0600 Subject: [PATCH 421/916] fixes flake8 --- loopy/check.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/loopy/check.py 
b/loopy/check.py index 884eb5ddd..977571fcf 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -703,7 +703,8 @@ def check_variable_access_ordered(kernel): try: _check_variable_access_ordered_inner(kernel) except RuntimeError as e: - if isinstance(e.args[0], str) and e.args[0].startswith('maximum recursion depth exceeded'): + if isinstance(e.args[0], str) and ( + e.args[0].startswith('maximum recursion depth exceeded')): from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) else: @@ -716,7 +717,8 @@ def check_variable_access_ordered(kernel): from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "variable_access_ordered", str(e)) except RuntimeError as e: - if isinstance(e.args[0], str) and e.args[0].startswith('maximum recursion depth exceeded'): + if isinstance(e.args[0], str) and ( + e.args[0].startswith('maximum recursion depth exceeded')): from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "recursion_error_reached_in_check", str(e)) else: -- GitLab From 63b09a9f9e7f80a3a0b67bf3c2990aab072d2079 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 22 Jan 2019 03:43:41 -0600 Subject: [PATCH 422/916] preparing transformation implementations for tt algo --- loopy/transform/batch.py | 99 ++++++++++++++++++++++++++++++++++++---- loopy/transform/iname.py | 20 +++++++- 2 files changed, 107 insertions(+), 12 deletions(-) diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 970547003..bf576ece2 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -25,7 +25,8 @@ THE SOFTWARE. import six -from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) +from loopy.symbolic import (RuleAwareIdentityMapper, + SubstitutionRuleMappingContext, pw_aff_to_expr) from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl @@ -57,13 +58,15 @@ def temp_needs_batching_if_not_sequential(tv, batch_varying_args): class _BatchVariableChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel, batch_varying_args, - batch_iname_expr, sequential): + batch_iname_expr, sequential, batch_varying_temps=None, within=None): super(_BatchVariableChanger, self).__init__(rule_mapping_context) self.kernel = kernel self.batch_varying_args = batch_varying_args self.batch_iname_expr = batch_iname_expr self.sequential = sequential + self.batch_varying_temps = batch_varying_temps + self.within = within def needs_batch_subscript(self, name): tv = self.kernel.temporary_variables.get(name) @@ -73,14 +76,18 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): if not self.sequential: if tv is None: return False - if not temp_needs_batching_if_not_sequential(tv, - self.batch_varying_args): - return False + if self.batch_varying_temps: + return tv.name in self.batch_varying_temps + else: + if not temp_needs_batching_if_not_sequential(tv, + self.batch_varying_args): + return False return True def map_subscript(self, expr, expn_state): - if not self.needs_batch_subscript(expr.aggregate.name): + if not self.needs_batch_subscript(expr.aggregate.name) or not ( + self.within(expn_state.kernel, expn_state.instruction)): return super(_BatchVariableChanger, self).map_subscript(expr, expn_state) idx = self.rec(expr.index, expn_state) @@ -90,7 +97,8 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): return type(expr)(expr.aggregate, (self.batch_iname_expr,) + idx) def map_variable(self, expr, expn_state): - if not 
self.needs_batch_subscript(expr.name): + if not self.needs_batch_subscript(expr.name) or not ( + self.within(expn_state.kernel, expn_state.instruction)): return super(_BatchVariableChanger, self).map_variable(expr, expn_state) return expr[self.batch_iname_expr] @@ -107,7 +115,7 @@ def _add_unique_dim_name(name, dim_names): @iterate_over_kernels_if_given_program def to_batched(knl, nbatches, batch_varying_args, - batch_iname_prefix="ibatch", sequential=False): + batch_iname_prefix="ibatch", sequential=False, within=None): """Takes in a kernel that carries out an operation and returns a kernel that carries out a batch of these operations. .. note:: @@ -183,11 +191,13 @@ def to_batched(knl, nbatches, batch_varying_args, from loopy.kernel.data import ForceSequentialTag knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())]) + from loopy.match import parse_stack_match + rule_mapping_context = SubstitutionRuleMappingContext( knl.substitutions, vng) bvc = _BatchVariableChanger(rule_mapping_context, knl, batch_varying_args, batch_iname_expr, - sequential=sequential) + sequential=sequential, within=parse_stack_match(within)) kernel = rule_mapping_context.finish_kernel( bvc.map_kernel(knl)) @@ -195,10 +205,79 @@ def to_batched(knl, nbatches, batch_varying_args, kernel = kernel.copy( instructions=[ insn.copy(within_inames=insn.within_inames | batch_iname_set) - for insn in kernel.instructions]) + if within(kernel, insn) else insn for insn in kernel.instructions]) return kernel # }}} + +def _merged_batch(knl, iname_to_merge, batch_varying_args, batch_varying_temps, + sequential=False, within=None): + """ + TODO: Not entirely sure whether this has to exist i.e. can this be + expressed as some other transformation. + """ + from loopy.match import parse_match + from pymbolic import var + from loopy.isl_helpers import static_max_of_pw_aff + + within = parse_match(within) + batch_iname_expr = var(iname_to_merge) + + new_args = [] + + bounds = knl.get_iname_bounds(iname_to_merge, constants_only=True) + nbatches_expr = pw_aff_to_expr(static_max_of_pw_aff(bounds.size, + constants_only=True)) + + for arg in knl.args: + if arg.name in batch_varying_args: + if isinstance(arg, ValueArg): + arg = ArrayArg(arg.name, arg.dtype, shape=None, + dim_tags="c") + else: + arg = arg.copy( + shape=None, + dim_tags=None, + dim_names=_add_unique_dim_name("ibatch", arg.dim_names)) + + new_args.append(arg) + + knl = knl.copy( + args=new_args) + + if not sequential: + new_temps = {} + + for temp in six.itervalues(knl.temporary_variables): + if (batch_varying_temps and temp.name in batch_varying_temps) or (not + batch_varying_temps and temp_needs_batching_if_not_sequential( + temp, batch_varying_args)): + new_temps[temp.name] = temp.copy( + shape=(nbatches_expr,) + temp.shape, + dim_tags=("c",) * (len(temp.shape) + 1), + dim_names=_add_unique_dim_name("ibatch", temp.dim_names)) + else: + new_temps[temp.name] = temp + + knl = knl.copy(temporary_variables=new_temps) + + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator) + bvc = _BatchVariableChanger(rule_mapping_context, + knl, batch_varying_args, batch_iname_expr, + sequential=sequential, batch_varying_temps=batch_varying_temps, + within=within) + kernel = rule_mapping_context.finish_kernel( + bvc.map_kernel(knl)) + + batch_iname_set = frozenset([iname_to_merge]) + kernel = kernel.copy( + instructions=[ + insn.copy(within_inames=insn.within_inames | batch_iname_set) + if within(kernel, insn) else insn for insn in 
kernel.instructions]) + + return kernel + # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index fb6682f48..138cded8c 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -518,6 +518,22 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): :func:`loopy.match.parse_stack_match`. """ + from loopy.match import parse_match + within = parse_match(within) + + # {{{ return the same kernel if no kernel matches + + def _do_not_transform_if_no_within_matches(): + for insn in kernel.instructions: + if within(kernel, insn): + return + + return kernel + + _do_not_transform_if_no_within_matches() + + # }}} + # now fastest varying first inames = inames[::-1] @@ -596,8 +612,8 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): new_insns = [ insn.copy( - within_inames=subst_within_inames(insn.within_inames)) - for insn in kernel.instructions] + within_inames=subst_within_inames(insn.within_inames)) if + within(kernel, insn) else insn for insn in kernel.instructions] kernel = (kernel .copy( -- GitLab From 5d69e4e4d30b44a7c2f0678f912f5cd9db85f31f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 23 Jan 2019 18:48:48 -0600 Subject: [PATCH 423/916] some more minor changes for the tt algorithm --- loopy/symbolic.py | 2 +- loopy/transform/batch.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 5721c58ef..46435e667 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1899,7 +1899,7 @@ def get_access_range(domain, subscript, assumptions, shape=None, except ExpressionToAffineConversionError as err: shape_aff = None - if shape is not None: + if shape is not None and shape[idim] is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) except ExpressionToAffineConversionError: diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index bf576ece2..9720d549e 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -234,12 +234,12 @@ def _merged_batch(knl, iname_to_merge, batch_varying_args, batch_varying_temps, for arg in knl.args: if arg.name in batch_varying_args: if isinstance(arg, ValueArg): - arg = ArrayArg(arg.name, arg.dtype, shape=None, + arg = ArrayArg(arg.name, arg.dtype, shape=(nbatches_expr,), dim_tags="c") else: arg = arg.copy( - shape=None, - dim_tags=None, + shape=(nbatches_expr,) + arg.shape, + dim_tags=("c",) * (len(arg.shape) + 1), dim_names=_add_unique_dim_name("ibatch", arg.dim_names)) new_args.append(arg) -- GitLab From 96857d32fd5aaf4e6e2bebcb719a26bc287dca0d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 23 Jan 2019 23:00:09 -0600 Subject: [PATCH 424/916] project out the unused inames --- loopy/transform/iname.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 138cded8c..db3f4ac26 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -638,7 +638,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): if tag is not None: kernel = tag_inames(kernel, {new_iname: tag}) - return kernel + return remove_unused_inames(kernel, inames) # }}} -- GitLab From b42358ec368b9a279d840bd9bd9573f698304991 Mon Sep 17 00:00:00 2001 From: tj-sun Date: Sun, 27 Jan 2019 20:44:22 +0000 Subject: [PATCH 425/916] atomic addition for cuda --- loopy/target/cuda.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/loopy/target/cuda.py 
b/loopy/target/cuda.py index 6b4385bff..201a30b8f 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -411,6 +411,35 @@ class CUDACASTBuilder(CASTBuilder): return CudaConstant(arg_decl) + # {{{ code generation for atomic update + + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + + from pymbolic.primitives import Sum, Subscript + from cgen import Statement + + if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [ + np.int32, np.int64, np.float32, np.float64]: + # Special case for atomicAdd + # FIXME: add similar code for atomicSub etc + if (isinstance(rhs_expr, Sum) and isinstance(lhs_expr, Subscript) + and lhs_expr in rhs_expr.children): + + ecm = self.get_expression_to_code_mapper(codegen_state) + + new_rhs_expr = Sum(tuple(c for c in rhs_expr.children + if c != lhs_expr)) + lhs_expr_code = ecm(lhs_expr) + rhs_expr_code = ecm(new_rhs_expr) + + return Statement("atomicAdd(&{0}, {1})".format( + lhs_expr_code, rhs_expr_code)) + + raise NotImplementedError("atomic update for '%s'" % lhs_dtype) + + # }}} + # }}} # }}} -- GitLab From e23eec7c4e995e6c45d3ab64a8cfacc98dade2a2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Jan 2019 08:26:02 -0600 Subject: [PATCH 426/916] adds test and cleans to_batched for unification --- loopy/__init__.py | 4 +- loopy/target/cuda.py | 119 ++++++++++++++++++++++++++++++++++++--- loopy/transform/batch.py | 83 +++++++++++++-------------- test/test_transform.py | 12 ++++ 4 files changed, 163 insertions(+), 55 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 51d01b78e..deeddc2c5 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -116,7 +116,7 @@ from loopy.transform.padding import ( add_padding) from loopy.transform.privatize import privatize_temporaries_with_inames -from loopy.transform.batch import to_batched +from loopy.transform.batch import to_batched, save_temporaries_in_loop from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier @@ -230,7 +230,7 @@ __all__ = [ "privatize_temporaries_with_inames", - "to_batched", + "to_batched", "save_temporaries_in_loop", "assume", "fix_parameters", diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 201a30b8f..cc13a8032 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -268,6 +268,41 @@ class CudaTarget(CTarget): # }}} +# {{{ preamable generator + +def cuda_preamble_generator(preamble_info): + from loopy.types import AtomicNumpyType + seen_64_bit_atomics = any( + isinstance(dtype, AtomicNumpyType) and dtype.numpy_dtype.itemsize == 8 + for dtype in preamble_info.seen_atomic_dtypes) + + if seen_64_bit_atomics: + # Source: + # docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions + yield ("00_enable_64bit_atomics", """ + #if __CUDA_ARCH__ < 600 + __device__ double atomicAdd(double* address, double val) + { + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + } while (assumed != old); + + return __longlong_as_double(old); + } + #endif + """) + +# }}} + + # {{{ ast builder class CUDACASTBuilder(CASTBuilder): @@ -334,6 +369,12 @@ class CUDACASTBuilder(CASTBuilder): return body, implemented_domains + def 
preamble_generators(self): + + return ( + super(CUDACASTBuilder, self).preamble_generators() + [ + cuda_preamble_generator]) + # }}} # {{{ code generation guts @@ -416,16 +457,14 @@ class CUDACASTBuilder(CASTBuilder): def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): - from pymbolic.primitives import Sum, Subscript + from pymbolic.primitives import Sum from cgen import Statement + from pymbolic.mapper.stringifier import PREC_NONE if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [ np.int32, np.int64, np.float32, np.float64]: - # Special case for atomicAdd - # FIXME: add similar code for atomicSub etc - if (isinstance(rhs_expr, Sum) and isinstance(lhs_expr, Subscript) - and lhs_expr in rhs_expr.children): - + # atomicAdd + if isinstance(rhs_expr, Sum): ecm = self.get_expression_to_code_mapper(codegen_state) new_rhs_expr = Sum(tuple(c for c in rhs_expr.children @@ -435,8 +474,72 @@ class CUDACASTBuilder(CASTBuilder): return Statement("atomicAdd(&{0}, {1})".format( lhs_expr_code, rhs_expr_code)) - - raise NotImplementedError("atomic update for '%s'" % lhs_dtype) + else: + from cgen import Block, DoWhile, Assign + from loopy.target.c import POD + old_val_var = codegen_state.var_name_generator("loopy_old_val") + new_val_var = codegen_state.var_name_generator("loopy_new_val") + + from loopy.kernel.data import TemporaryVariable + ecm = codegen_state.expression_to_code_mapper.with_assignments( + { + old_val_var: TemporaryVariable(old_val_var, lhs_dtype), + new_val_var: TemporaryVariable(new_val_var, lhs_dtype), + }) + + lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None) + + from pymbolic.mapper.substitutor import make_subst_func + from pymbolic import var + from loopy.symbolic import SubstitutionMapper + + subst = SubstitutionMapper( + make_subst_func({lhs_expr: var(old_val_var)})) + rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE, + type_context=rhs_type_context, + needed_dtype=lhs_dtype) + + cast_str = "" + old_val = old_val_var + new_val = new_val_var + + if lhs_dtype.numpy_dtype.kind == "f": + if lhs_dtype.numpy_dtype == np.float32: + ctype = "int" + elif lhs_dtype.numpy_dtype == np.float64: + ctype = "long" + else: + assert False + + old_val = "*(%s *) &" % ctype + old_val + new_val = "*(%s *) &" % ctype + new_val + cast_str = "(%s *) " % (ctype) + + return Block([ + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + old_val_var), + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + new_val_var), + DoWhile( + "atomicCAS(" + "%(cast_str)s&(%(lhs_expr)s), " + "%(old_val)s, " + "%(new_val)s" + ") != %(old_val)s" + % { + "cast_str": cast_str, + "lhs_expr": lhs_expr_code, + "old_val": old_val, + "new_val": new_val, + }, + Block([ + Assign(old_val_var, lhs_expr_code), + Assign(new_val_var, rhs_expr_code), + ]) + ) + ]) + else: + raise NotImplementedError("atomic update for '%s'" % lhs_dtype) # }}} diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 9720d549e..522f3e3f4 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -37,6 +37,7 @@ __doc__ = """ .. currentmodule:: loopy .. autofunction:: to_batched +.. 
autofunction:: save_temporaries_in_loop """ @@ -87,7 +88,8 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): def map_subscript(self, expr, expn_state): if not self.needs_batch_subscript(expr.aggregate.name) or not ( - self.within(expn_state.kernel, expn_state.instruction)): + self.within(expn_state.kernel, expn_state.instruction, + expn_state.stack)): return super(_BatchVariableChanger, self).map_subscript(expr, expn_state) idx = self.rec(expr.index, expn_state) @@ -191,7 +193,7 @@ def to_batched(knl, nbatches, batch_varying_args, from loopy.kernel.data import ForceSequentialTag knl = lp.tag_inames(knl, [(batch_iname, ForceSequentialTag())]) - from loopy.match import parse_stack_match + from loopy.match import parse_stack_match, parse_match rule_mapping_context = SubstitutionRuleMappingContext( knl.substitutions, vng) @@ -202,6 +204,7 @@ def to_batched(knl, nbatches, batch_varying_args, bvc.map_kernel(knl)) batch_iname_set = frozenset([batch_iname]) + within = parse_match(within) kernel = kernel.copy( instructions=[ insn.copy(within_inames=insn.within_inames | batch_iname_set) @@ -212,67 +215,57 @@ def to_batched(knl, nbatches, batch_varying_args, # }}} -def _merged_batch(knl, iname_to_merge, batch_varying_args, batch_varying_temps, - sequential=False, within=None): +@iterate_over_kernels_if_given_program +def save_temporaries_in_loop(knl, iname, temps_to_save, within=None): """ - TODO: Not entirely sure whether this has to exist i.e. can this be - expressed as some other transformation. + Returns a kernel with the temporary variables in *temps_to_save* batched + within the iname *iname*. + + :arg iname: An instance of :class:`str1 for the loop across which the + values of the temporaries are to be saved. + + :arg temps_to_save: An iterable containing the temporaries that are to be + saved for each loop iteration defined by *iname*. + + :arg within: If not None, limit the action of the transformation to + matching contexts. See :func:`loopy.match.parse_stack_match` + for syntax. 
""" - from loopy.match import parse_match + from loopy.match import parse_match, parse_stack_match from pymbolic import var from loopy.isl_helpers import static_max_of_pw_aff - within = parse_match(within) - batch_iname_expr = var(iname_to_merge) - - new_args = [] + batch_iname_expr = var(iname) - bounds = knl.get_iname_bounds(iname_to_merge, constants_only=True) + bounds = knl.get_iname_bounds(iname, constants_only=True) nbatches_expr = pw_aff_to_expr(static_max_of_pw_aff(bounds.size, constants_only=True)) - for arg in knl.args: - if arg.name in batch_varying_args: - if isinstance(arg, ValueArg): - arg = ArrayArg(arg.name, arg.dtype, shape=(nbatches_expr,), - dim_tags="c") - else: - arg = arg.copy( - shape=(nbatches_expr,) + arg.shape, - dim_tags=("c",) * (len(arg.shape) + 1), - dim_names=_add_unique_dim_name("ibatch", arg.dim_names)) + new_temps = {} - new_args.append(arg) + for temp in six.itervalues(knl.temporary_variables): + if temp.name in temps_to_save: + new_temps[temp.name] = temp.copy( + shape=(nbatches_expr,) + temp.shape, + dim_tags=("c",) * (len(temp.shape) + 1), + dim_names=_add_unique_dim_name("itemp_save", temp.dim_names)) + else: + new_temps[temp.name] = temp - knl = knl.copy( - args=new_args) - - if not sequential: - new_temps = {} - - for temp in six.itervalues(knl.temporary_variables): - if (batch_varying_temps and temp.name in batch_varying_temps) or (not - batch_varying_temps and temp_needs_batching_if_not_sequential( - temp, batch_varying_args)): - new_temps[temp.name] = temp.copy( - shape=(nbatches_expr,) + temp.shape, - dim_tags=("c",) * (len(temp.shape) + 1), - dim_names=_add_unique_dim_name("ibatch", temp.dim_names)) - else: - new_temps[temp.name] = temp - - knl = knl.copy(temporary_variables=new_temps) + knl = knl.copy(temporary_variables=new_temps) rule_mapping_context = SubstitutionRuleMappingContext( knl.substitutions, knl.get_var_name_generator) bvc = _BatchVariableChanger(rule_mapping_context, - knl, batch_varying_args, batch_iname_expr, - sequential=sequential, batch_varying_temps=batch_varying_temps, - within=within) + knl, [], batch_iname_expr, + sequential=False, batch_varying_temps=temps_to_save, + within=parse_stack_match(within)) kernel = rule_mapping_context.finish_kernel( bvc.map_kernel(knl)) - batch_iname_set = frozenset([iname_to_merge]) + within = parse_match(within) + + batch_iname_set = frozenset([iname]) kernel = kernel.copy( instructions=[ insn.copy(within_inames=insn.within_inames | batch_iname_set) diff --git a/test/test_transform.py b/test/test_transform.py index 04162331d..6952d4b78 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -161,6 +161,18 @@ def test_to_batched_temp(ctx_factory): parameters=dict(a=a, x=x, n=5, nbatches=7)) +def test_save_temporaries_in_loop(ctx_factory): + + prog = lp.make_kernel( + "{[i, j]: 0 <= i, j < 4}", + """ + <> a[j] = j {inames=i:j} + """) + + prog = lp.save_temporaries_in_loop(prog, 'i', ['a']) + assert prog.root_kernel.temporary_variables['a'].shape == (4, 4) + + def test_add_barrier(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 4c36d227ff505ed259f967051e8f3e25c1e48ea5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Jan 2019 09:58:55 -0600 Subject: [PATCH 427/916] corrects the match invocation --- loopy/transform/batch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 522f3e3f4..1eaebdd0d 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ 
-100,7 +100,8 @@ class _BatchVariableChanger(RuleAwareIdentityMapper): def map_variable(self, expr, expn_state): if not self.needs_batch_subscript(expr.name) or not ( - self.within(expn_state.kernel, expn_state.instruction)): + self.within(expn_state.kernel, expn_state.instruction, + expn_state.stack)): return super(_BatchVariableChanger, self).map_variable(expr, expn_state) return expr[self.batch_iname_expr] -- GitLab From 82168eb234ae343a727a10aba4389f8ef61d213e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Jan 2019 19:34:17 -0600 Subject: [PATCH 428/916] makes it easier to share loopy kernels --- loopy/__init__.py | 3 + loopy/symbolic.py | 2 +- loopy/transform/write_to_python.py | 104 +++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 loopy/transform/write_to_python.py diff --git a/loopy/__init__.py b/loopy/__init__.py index deeddc2c5..d41902f43 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -120,6 +120,7 @@ from loopy.transform.batch import to_batched, save_temporaries_in_loop from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier +from loopy.transform.write_to_python import write_to_python from loopy.transform.callable import (register_callable_kernel, register_function_id_to_in_knl_callable_mapper, inline_callable_kernel) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call @@ -238,6 +239,8 @@ __all__ = [ "add_barrier", + "write_to_python", + "register_callable_kernel", "register_function_id_to_in_knl_callable_mapper", "inline_callable_kernel", diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 46435e667..f67d38a9a 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -258,7 +258,7 @@ class StringifyMapper(StringifyMapperBase): return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) def map_resolved_function(self, expr, prec): - return "ResolvedFunction('%s')" % expr.name + return expr.name def map_sub_array_ref(self, expr, prec): return "SubArrayRef({inames}, ({subscr}))".format( diff --git a/loopy/transform/write_to_python.py b/loopy/transform/write_to_python.py new file mode 100644 index 000000000..9a863bcd7 --- /dev/null +++ b/loopy/transform/write_to_python.py @@ -0,0 +1,104 @@ +import re +from mako.template import Template +import loopy as lp +from loopy.tools import natsorted + + +def write_to_python(kernel, filename=None): + """ + Generates a python code for generating *kernel* for sharing kernels. + + :arg kernel: An instance of :class:`loopy.LoopKernel` + :arg filename: An instance of :class:`str`. If *None*, then prints the + python file to *stdout*. 
+ """ + + options = [] + + printed_insn_ids = set() + printed_insn_order = [] + + def insert_insn_into_order(insn): + if insn.id in printed_insn_ids: + return + printed_insn_ids.add(insn.id) + + for dep_id in natsorted(insn.depends_on): + insert_insn_into_order(kernel.id_to_insn[dep_id]) + + printed_insn_order.append(insn) + + for insn in kernel.instructions: + insert_insn_into_order(insn) + + for insn in printed_insn_order: + option = 'id=%s, ' % insn.id + if insn.depends_on: + option += ("dep="+":".join(insn.depends_on)+", ") + if insn.tags: + option += ("tags="+":".join(insn.tags)+", ") + if insn.within_inames: + option += ("inames="+":".join(insn.within_inames)+", ") + if isinstance(insn, lp.MultiAssignmentBase): + if insn.atomicity: + option += "atomic, " + elif isinstance(insn, lp.BarrierInstruction): + option += ("mem_kind=%s, " % insn.mem_kind) + options.append(option[:-2]) + + insn_x_options = zip(printed_insn_order, options) + + python_code = r'''<%! import loopy as lp %>import loopy as lp + import numpy as np + <%! tv_scope = {0: 'lp.AddressSpace.PRIVATE', 1: 'lp.AddressSpace.LOCAL', + 2: 'lp.AddressSpace.GLOBAL', lp.auto: 'lp.auto' } %> + knl = lp.make_kernel( + [ + % for dom in domains: + "${str(dom)}", + % endfor + ], + """ + % for insn, opts in insn_x_opts: + % if isinstance(insn, lp.Assignment): + ${insn.assignee} = ${insn.expression} {${opts}} + % elif isinstance(insn, lp.BarrierInstruction): + ... ${insn.synchronization_kind[0]}barrier{${opts}} + % else: + **Not implemented for ${type(insn)}** + % endif + %endfor + """, [ + % for arg in args: + % if isinstance(arg, lp.ValueArg): + lp.ValueArg( + name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}), + % else: + lp.GlobalArg( + name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}, + shape=${arg.shape}, for_atomic=${arg.for_atomic}), + % endif + % endfor + % for tv in temp_vars: + lp.TemporaryVariable( + name='${tv.name}', dtype=np.${tv.dtype.numpy_dtype.name}, + shape=${tv.shape}, for_atomic=${tv.for_atomic}, + address_space=${tv_scope[tv.address_space]}, + read_only=${tv.read_only}, + % if tv.initializer is not None: + initializer=${"np."+str((tv.initializer).__repr__())}, + % endif + ), + % endfor + ], lang_version=${lp.VERSION})''' + + python_code = Template(python_code).render(insn_x_opts=insn_x_options, + domains=kernel.domains, args=kernel.args, + temp_vars=[k for k in kernel.temporary_variables.values()]) + + python_code = re.sub("\\n ", "\n", python_code) + if filename: + with open(filename, 'w') as f: + f.write(python_code) + else: + print(python_code) -- GitLab From 9cca8d521e40fde09f75a8903570c639a4833f5a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 30 Jan 2019 22:58:44 -0600 Subject: [PATCH 429/916] makes the pyopencl emit atomic addition --- loopy/target/pyopencl.py | 64 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 5ef564572..e43e7bc6e 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -811,4 +811,68 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # }}} +class NvidiaPyOpenCLTarget(PyOpenCLTarget): + def __init__(self, device, pyopencl_module_name="_lpy_cl", + atomics_flavor=None): + import pyopencl as cl + assert isinstance(device, cl.Device) + assert device.vendor == 'NVIDIA Corporation' + + if not device.compute_capability_major_nv >= 6: + raise LoopyError("Nvidia o") + super(NvidiaPyOpenCLTarget, self).__init__(device, + pyopencl_module_name, atomics_flavor) + + 
def preprocess(self, kernel): + from loopy import set_options + build_options = ['-cl-nv-arch', 'sm_60'] + kernel.options.cl_build_options + kernel = set_options(kernel, cl_build_options=build_options) + return super(NvidiaPyOpenCLTarget, self).preprocess(kernel) + + def get_device_ast_builder(self): + # here we should have an if else condition + if self.device.compute_capability_major_nv >= 6: + return NvidiaPyOpenCLCASTBuilder(self) + else: + return super(NvidiaPyOpenCLTarget, self).get_device_ast_builder() + + +class NvidiaPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + + from pymbolic.primitives import Sum, Variable, Subscript + from cgen import Statement, Block, Assign + from loopy.target.c import POD + + if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype == np.float64: + # atomicAdd + if isinstance(rhs_expr, Sum): + + old_val_var = codegen_state.var_name_generator("loopy_old_val") + + from loopy.kernel.data import TemporaryVariable + ecm = codegen_state.expression_to_code_mapper.with_assignments( + { + old_val_var: TemporaryVariable(old_val_var, lhs_dtype), + }) + + new_rhs_expr = Sum(tuple(c for c in rhs_expr.children + if c != lhs_expr)) + lhs_expr_code = ecm(lhs_expr) + rhs_expr_code = ecm(new_rhs_expr) + + return Block([ + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + old_val_var), + Assign(old_val_var, lhs_expr_code), + Statement('asm volatile("atom.global.add.f64 %0, [%1], %2;" :' + '"=d"({0}) : "l"(&{1}) , "d"({2}))'.format( + old_val_var, lhs_expr_code, rhs_expr_code))]) + + return super(NvidiaPyOpenCLCASTBuilder, + self).emit_atomic_update(codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) + + # vim: foldmethod=marker -- GitLab From 65ae8117ac2e01ffa5e8fe37b5b5297f372fc5aa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 30 Jan 2019 23:10:16 -0600 Subject: [PATCH 430/916] tests the nvidia pyopencl target --- loopy/__init__.py | 4 ++-- loopy/target/pyopencl.py | 2 +- test/test_target.py | 26 ++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index d41902f43..ab7fce9ec 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -152,7 +152,7 @@ from loopy.target import TargetBase, ASTBuilderBase from loopy.target.c import CTarget, ExecutableCTarget, generate_header from loopy.target.cuda import CudaTarget from loopy.target.opencl import OpenCLTarget -from loopy.target.pyopencl import PyOpenCLTarget +from loopy.target.pyopencl import PyOpenCLTarget, NvidiaPyOpenCLTarget from loopy.target.ispc import ISPCTarget from loopy.target.numba import NumbaTarget, NumbaCudaTarget @@ -288,7 +288,7 @@ __all__ = [ "TargetBase", "CTarget", "ExecutableCTarget", "generate_header", "CudaTarget", "OpenCLTarget", - "PyOpenCLTarget", "ISPCTarget", + "PyOpenCLTarget", "NvidiaPyOpenCLTarget", "ISPCTarget", "NumbaTarget", "NumbaCudaTarget", "ASTBuilderBase", diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index e43e7bc6e..5263a1006 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -841,7 +841,7 @@ class NvidiaPyOpenCLCASTBuilder(PyOpenCLCASTBuilder): def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): - from pymbolic.primitives import Sum, Variable, Subscript + from pymbolic.primitives import Sum from cgen import Statement, Block, 
Assign from loopy.target.c import POD diff --git a/test/test_target.py b/test/test_target.py index 095bf0939..0d3431066 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -367,6 +367,32 @@ def test_cuda_short_vector(): print(lp.generate_code_v2(knl).device_code()) +def test_nvidia_pyopencl_target(ctx_factory): + ctx = ctx_factory() + if ctx.devices[0].vendor != 'NVIDIA Corporation': + # do not test for non-Nvidia devices + return + + queue = cl.CommandQueue(ctx) + a = np.random.randn(16) + + knl = lp.make_kernel( + "{[i]: 0<=i<16}", + """ + res[0] = res[0] + a[i] {id=update, atomic} + """, + [ + lp.GlobalArg('res', for_atomic=True), + lp.GlobalArg('a', for_atomic=True, dtype=np.float64), + '...']) + + knl = lp.split_iname(knl, "i", 4, inner_tag="l.0", outer_tag="g.0") + knl = knl.copy(target=lp.NvidiaPyOpenCLTarget(ctx.devices[0])) + + evt, (out, ) = knl(queue, a=a) + assert np.isclose(out, a.sum()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 65cac30576973233a3465f8c70907d05fcbb98b2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 31 Jan 2019 00:43:36 -0600 Subject: [PATCH 431/916] improves the fallback mechanism --- loopy/target/pyopencl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 5263a1006..bba4b5f15 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -818,15 +818,15 @@ class NvidiaPyOpenCLTarget(PyOpenCLTarget): assert isinstance(device, cl.Device) assert device.vendor == 'NVIDIA Corporation' - if not device.compute_capability_major_nv >= 6: - raise LoopyError("Nvidia o") super(NvidiaPyOpenCLTarget, self).__init__(device, pyopencl_module_name, atomics_flavor) def preprocess(self, kernel): from loopy import set_options - build_options = ['-cl-nv-arch', 'sm_60'] + kernel.options.cl_build_options - kernel = set_options(kernel, cl_build_options=build_options) + if self.device.compute_capability_major_nv >= 6: + build_options = ['-cl-nv-arch', 'sm_60'] + ( + kernel.options.cl_build_options) + kernel = set_options(kernel, cl_build_options=build_options) return super(NvidiaPyOpenCLTarget, self).preprocess(kernel) def get_device_ast_builder(self): -- GitLab From 267fe47fe886123bedf2d82ddbd232a2cd4259c0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 7 Feb 2019 17:42:10 -0600 Subject: [PATCH 432/916] corrects the requirement for save temporaries in loop transform --- loopy/transform/batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 1eaebdd0d..0b7dd743b 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -238,9 +238,9 @@ def save_temporaries_in_loop(knl, iname, temps_to_save, within=None): batch_iname_expr = var(iname) - bounds = knl.get_iname_bounds(iname, constants_only=True) + bounds = knl.get_iname_bounds(iname, constants_only=False) nbatches_expr = pw_aff_to_expr(static_max_of_pw_aff(bounds.size, - constants_only=True)) + constants_only=False)) new_temps = {} -- GitLab From 9f8bd465031c661ccdff162191306cf37d187027 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 7 Feb 2019 21:32:07 -0600 Subject: [PATCH 433/916] changes to take in gcd-tt --- loopy/target/cuda.py | 3 ++ loopy/transform/make_scalar.py | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 loopy/transform/make_scalar.py diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 
cc13a8032..bfbe9ca69 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -235,6 +235,9 @@ class CudaTarget(CTarget): super(CudaTarget, self).__init__() + def split_kernel_at_global_barriers(self): + return True + def get_device_ast_builder(self): return CUDACASTBuilder(self) diff --git a/loopy/transform/make_scalar.py b/loopy/transform/make_scalar.py new file mode 100644 index 000000000..ab91fdf78 --- /dev/null +++ b/loopy/transform/make_scalar.py @@ -0,0 +1,51 @@ +from pymbolic.primitives import Variable +from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) +from loopy.kernel.data import ValueArg +from loopy.transform.iname import remove_unused_inames + + +class ScalarChanger(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, var_name): + self.var_name = var_name + super(ScalarChanger, self).__init__(rule_mapping_context) + + def map_subscript(self, expr, expn_state): + if expr.aggregate.name == self.var_name: + return Variable(self.var_name) + + return super(ScalarChanger, self).map_subscript(expr, expn_state) + + +def make_scalar(kernel, var_name): + rule_mapping_context = SubstitutionRuleMappingContext(kernel.substitutions, + kernel.get_var_name_generator()) + + kernel = ScalarChanger(rule_mapping_context, var_name).map_kernel(kernel) + + new_args = [ValueArg(arg.name, arg.dtype, target=arg.target, + is_output_only=arg.is_output_only) if arg.name == var_name else arg for + arg in kernel.args] + new_temps = dict((tv.name, tv.copy(shape=(), dim_tags=None)) + if tv.name == var_name else (tv.name, tv) for tv in + kernel.temporary_variables.values()) + + return kernel.copy(args=new_args, temporary_variables=new_temps) + + +def remove_invariant_inames(kernel): + inames_used = set() + untagged_inames = ( + kernel.all_inames() - frozenset(kernel.iname_to_tags.keys())) + for insn in kernel.instructions: + for iname in ((insn.read_dependency_names() + | insn.write_dependency_names()) + & untagged_inames): + inames_used.add(iname) + + removable_inames = untagged_inames - inames_used + + new_insns = [insn.copy(within_inames=insn.within_inames-removable_inames) + for insn in kernel.instructions] + + return remove_unused_inames(kernel.copy(instructions=new_insns), + removable_inames) -- GitLab From 4e7d32b9ecb4b75656aa427010dcfff836301fa6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 29 Mar 2019 10:07:43 -0500 Subject: [PATCH 434/916] fixes the ValueArg input to inlining --- loopy/transform/callable.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 749817bad..23dc87bef 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -37,6 +37,8 @@ from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import (get_kw_pos_association, CallableKernel, ScalarCallable) from loopy.program import Program, ResolvedFunctionMarker +from loopy.symbolic import SubArrayRef +from pymbolic.primitives import Subscript __doc__ = """ .. 
currentmodule:: loopy @@ -403,8 +405,14 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): for k, v in six.iteritems(iname_map)) var_map.update(dict((p.Variable(k), p.Variable(v)) for k, v in six.iteritems(temp_map))) - var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name)) - for k, v in six.iteritems(arg_map))) + for k, v in six.iteritems(arg_map): + if isinstance(v, SubArrayRef): + var_map[p.Variable(k)] = v.subscript.aggregate + elif isinstance(v, Subscript): + var_map[p.Variable(k)] = v.subscript.aggregate + else: + var_map[p.Variable(k)] = v + subst_mapper = KernelInliner( make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) @@ -639,10 +647,13 @@ def _match_caller_callee_argument_dimension_for_single_kernel( else: return shape - parameter_shapes = [ - _shape_1_if_empty( - par.get_array_arg_descriptor(caller_knl).shape) - for par in parameters] + parameter_shapes = [] + for par in parameters: + if isinstance(par, SubArrayRef): + parameter_shapes.append(_shape_1_if_empty(par.get_array_arg_descriptor(caller_knl).shape)) + else: + parameter_shapes.append((1, )) + kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) for i in range(len(parameters), len(parameters)+len(kw_parameters)): parameter_shapes.append(_shape_1_if_empty(kw_parameters[pos_to_kw[i]]) -- GitLab From 77095945953c33a926d90ce6de64fa9a0090d799 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 29 Mar 2019 10:11:26 -0500 Subject: [PATCH 435/916] minor typo --- loopy/transform/callable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 23dc87bef..1fb8c7d65 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -409,7 +409,7 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): if isinstance(v, SubArrayRef): var_map[p.Variable(k)] = v.subscript.aggregate elif isinstance(v, Subscript): - var_map[p.Variable(k)] = v.subscript.aggregate + var_map[p.Variable(k)] = v.aggregate else: var_map[p.Variable(k)] = v -- GitLab From 4a3c80e4ea38ce4a2da4ec6f3a237bd8f335bbd4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 29 Mar 2019 10:41:43 -0500 Subject: [PATCH 436/916] adds test for #162 --- loopy/transform/callable.py | 2 -- test/test_callables.py | 41 +++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 1fb8c7d65..0df0829ad 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -408,8 +408,6 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): for k, v in six.iteritems(arg_map): if isinstance(v, SubArrayRef): var_map[p.Variable(k)] = v.subscript.aggregate - elif isinstance(v, Subscript): - var_map[p.Variable(k)] = v.aggregate else: var_map[p.Variable(k)] = v diff --git a/test/test_callables.py b/test/test_callables.py index cdba3f5b5..de1984ccd 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -404,6 +404,47 @@ def test_packing_unpacking(ctx_factory, inline): 3*x2.get()) < 1e-15 +def test_non_sub_array_refs_arguments(ctc_factory): + import loopy as lp + from loopy.transform.callable import _match_caller_callee_argument_dimension_ + + callee = lp.make_function("{[i] : 0 <= i < 6}", "a[i] = a[i] + j", + [lp.GlobalArg("a", dtype="double", shape=(6,), is_output_only=False), + lp.ValueArg("j", dtype="int")], name="callee") + caller1 = lp.make_kernel("{[j] : 0 <= j < 2}", 
"callee(a[:], b[0])", + [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output_only=False), + lp.GlobalArg("b", dtype="double", shape=(1, ), is_output_only=False)], + name="caller", target=lp.CTarget()) + + caller2 = lp.make_kernel("{[j] : 0 <= j < 2}", "callee(a[:], 3.1415926)", + [lp.GlobalArg("a", dtype="double", shape=(6, ), + is_output_only=False)], + name="caller", target=lp.CTarget()) + + caller3 = lp.make_kernel("{[j] : 0 <= j < 2}", "callee(a[:], kappa)", + [lp.GlobalArg("a", dtype="double", shape=(6, ), + is_output_only=False)], + name="caller", target=lp.CTarget()) + + registered = lp.register_callable_kernel(caller1, callee) + inlined = _match_caller_callee_argument_dimension_(registered, callee.name) + inlined = lp.inline_callable_kernel(inlined, callee.name) + + print(inlined) + + registered = lp.register_callable_kernel(caller2, callee) + inlined = _match_caller_callee_argument_dimension_(registered, callee.name) + inlined = lp.inline_callable_kernel(inlined, callee.name) + + print(inlined) + + registered = lp.register_callable_kernel(caller3, callee) + inlined = _match_caller_callee_argument_dimension_(registered, callee.name) + inlined = lp.inline_callable_kernel(inlined, callee.name) + + print(inlined) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 70a0d839c8a458d405869de7f954561e75d19944 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 29 Mar 2019 10:53:03 -0500 Subject: [PATCH 437/916] minor typo --- test/test_callables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_callables.py b/test/test_callables.py index de1984ccd..717299092 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -404,7 +404,7 @@ def test_packing_unpacking(ctx_factory, inline): 3*x2.get()) < 1e-15 -def test_non_sub_array_refs_arguments(ctc_factory): +def test_non_sub_array_refs_arguments(ctx_factory): import loopy as lp from loopy.transform.callable import _match_caller_callee_argument_dimension_ -- GitLab From aa364dd7b741b5b3641c817e856ee9147c65fb70 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 4 Apr 2019 17:56:28 -0500 Subject: [PATCH 438/916] checks the validity of valuearg <-> array arg while passing to callee kernels --- loopy/kernel/function_interface.py | 12 +++++++++++- test/test_callables.py | 4 ++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 3e628f5c9..0115d3b2b 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -30,6 +30,7 @@ from loopy.diagnostic import LoopyError from loopy.tools import update_persistent_hash from loopy.kernel import LoopKernel +from loopy.kernel.data import ValueArg, ArrayArg __doc__ = """ @@ -587,6 +588,11 @@ class CallableKernel(InKernelCallable): assert isinstance(arg_id, str) if isinstance(descr, ArrayArgDescriptor): + if not isinstance(self.subkernel.arg_dict[arg_id], ArrayArg): + raise LoopyError("Array passed to a scalar type argument " + " '%s' in the function '%s'." 
% ( + arg_id, self.subkernel.name)) + new_arg = self.subkernel.arg_dict[arg_id].copy( shape=descr.shape, dim_tags=descr.dim_tags, @@ -595,11 +601,15 @@ class CallableKernel(InKernelCallable): new_args = [new_arg if arg.name == arg_id else arg for arg in new_args] elif isinstance(descr, ValueArgDescriptor): - pass + if not isinstance(self.subkernel.arg_dict[arg_id], ValueArg): + raise LoopyError("Scalar passed to an array type argument " + " '%s' in the function '%s'." % ( + arg_id, self.subkernel.name)) else: raise LoopyError("Descriptor must be either an instance of " "ArrayArgDescriptor or ValueArgDescriptor -- got %s." % type(descr)) + descriptor_specialized_knl = self.subkernel.copy(args=new_args) from loopy.preprocess import traverse_to_infer_arg_descr descriptor_specialized_knl, callables_table = ( diff --git a/test/test_callables.py b/test/test_callables.py index 717299092..f8e8cede6 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -315,9 +315,9 @@ def test_multi_arg_array_call(ctx_factory): queue = cl.CommandQueue(ctx) import pymbolic.primitives as p n = 10 - acc_i = p.Variable("acc_i") + acc_i = p.Variable("acc_i")[0] i = p.Variable("i") - index = p.Variable("index") + index = p.Variable("index")[0] a_i = p.Subscript(p.Variable("a"), p.Variable("i")) argmin_kernel = lp.make_function( "{[i]: 0 <= i < n}", -- GitLab From 51d08283abd139206f53c37565c8f4bc233f804d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 4 Apr 2019 17:56:56 -0500 Subject: [PATCH 439/916] adds support for empty sub-array refs(related to #162) --- loopy/symbolic.py | 9 ++++++++- test/test_callables.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index f67d38a9a..0eaad8a34 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -862,6 +862,9 @@ class SubArrayRef(p.Expression): pw_aff_to_expr( kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 for iname in self.swept_inames) + if self.swept_inames == (): + sub_shape = (1, ) + sub_dim_tags = (DimTag(1),) return ArrayArgDescriptor( address_space=aspace, @@ -1411,7 +1414,11 @@ class LoopyParser(ParserBase): elif pstate.is_next(_openbracket): pstate.advance() pstate.expect_not_end() - swept_inames = self.parse_expression(pstate) + if pstate.is_next(_closebracket): + swept_inames = () + else: + swept_inames = self.parse_expression(pstate) + pstate.expect(_closebracket) pstate.advance() pstate.expect(_colon) diff --git a/test/test_callables.py b/test/test_callables.py index f8e8cede6..a8a80a7bb 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -445,6 +445,37 @@ def test_non_sub_array_refs_arguments(ctx_factory): print(inlined) +@pytest.mark.parametrize("inline", [False, True]) +def test_empty_sub_array_refs(ctx_factory, inline): + # See: https://github.com/OP2/PyOP2/pull/559#discussion_r272208618 + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x = np.random.randn(10) + y = np.random.randn(10) + + callee = lp.make_function( + "{[d]:0<=d<1}", + """ + a[d] = b[d] - c[d] + + """, name='wence_function') + + caller = lp.make_kernel("{[i]: 0<=i<10}", + """ + []:z[i] = wence_function([]:x[i], []:y[i]) + """, + [lp.GlobalArg('x, y', dtype=np.float64, shape=(10, )), ...]) + + caller = lp.register_callable_kernel(caller, callee) + + if inline: + caller = lp.inline_callable_kernel(caller, callee.name) + + evt, (out, ) = caller(queue, x=x, y=y) + assert np.allclose(out, x-y) + + if __name__ == "__main__": if len(sys.argv) > 1: 
exec(sys.argv[1]) -- GitLab From 6ba6f58094b4d7f6bce90dd96ceee4ab8c4f35c9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 4 Apr 2019 18:34:27 -0500 Subject: [PATCH 440/916] flake8 fixes --- loopy/transform/callable.py | 5 +++-- test/test_callables.py | 2 +- test/test_loopy.py | 19 ------------------- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 0df0829ad..2fb0b1f53 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -38,7 +38,6 @@ from loopy.kernel.function_interface import (get_kw_pos_association, CallableKernel, ScalarCallable) from loopy.program import Program, ResolvedFunctionMarker from loopy.symbolic import SubArrayRef -from pymbolic.primitives import Subscript __doc__ = """ .. currentmodule:: loopy @@ -648,7 +647,9 @@ def _match_caller_callee_argument_dimension_for_single_kernel( parameter_shapes = [] for par in parameters: if isinstance(par, SubArrayRef): - parameter_shapes.append(_shape_1_if_empty(par.get_array_arg_descriptor(caller_knl).shape)) + parameter_shapes.append( + _shape_1_if_empty( + par.get_array_arg_descriptor(caller_knl).shape)) else: parameter_shapes.append((1, )) diff --git a/test/test_callables.py b/test/test_callables.py index a8a80a7bb..5d8785db0 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -465,7 +465,7 @@ def test_empty_sub_array_refs(ctx_factory, inline): """ []:z[i] = wence_function([]:x[i], []:y[i]) """, - [lp.GlobalArg('x, y', dtype=np.float64, shape=(10, )), ...]) + [lp.GlobalArg('x, y', dtype=np.float64, shape=(10, )), '...']) caller = lp.register_callable_kernel(caller, callee) diff --git a/test/test_loopy.py b/test/test_loopy.py index 95d9df4cd..383aa5938 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2910,25 +2910,6 @@ def test_backwards_dep_printing_and_error(): print(knl) -def test_backwards_dep_printing_and_error(): - knl = lp.make_kernel( - "{[i]: 0<=i Date: Thu, 4 Apr 2019 19:58:27 -0500 Subject: [PATCH 441/916] stores insn id as key --- loopy/transform/pack_and_unpack_args.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index e5ed850c6..67ea48326 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -287,29 +287,26 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, new_ilp_inames), expression=new_call_insn.expression.function(*new_params), assignees=new_assignees) - old_insn_to_new_insns[insn] = (packing_insns + [new_call_insn] + + old_insn_to_new_insns[insn.id] = (packing_insns + [new_call_insn] + unpacking_insns) if old_insn_to_new_insns: new_instructions = [] for insn in kernel.instructions: - if insn in old_insn_to_new_insns: + if insn.id in old_insn_to_new_insns: # Replacing the current instruction with the group of # instructions including the packing and unpacking instructions - new_instructions.extend(old_insn_to_new_insns[insn]) + new_instructions.extend(old_insn_to_new_insns[insn.id]) else: # for the instructions that depend on the call instruction that # are to be packed and unpacked, we need to add the complete # instruction block as a dependency for them. new_depends_on = insn.depends_on - if insn.depends_on & set( - old_insn.id for old_insn in old_insn_to_new_insns): + if insn.depends_on & set(old_insn_to_new_insns): # need to add the unpack instructions on dependencies. 
- for old_insn_id in insn.depends_on & set( - old_insn.id for old_insn in old_insn_to_new_insns): - old_insn = kernel.id_to_insn[old_insn_id] + for old_insn_id in insn.depends_on & set(old_insn_to_new_insns): new_depends_on |= frozenset(i.id for i - in old_insn_to_new_insns[old_insn]) + in old_insn_to_new_insns[old_insn_id]) new_instructions.append(insn.copy(depends_on=new_depends_on)) kernel = kernel.copy( domains=kernel.domains + new_domains, -- GitLab From ff9169c002056afdd783a02a83f76922dbed35e8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 4 Apr 2019 20:02:12 -0500 Subject: [PATCH 442/916] skips test depend on old unsupported code --- test/test_loopy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_loopy.py b/test/test_loopy.py index 383aa5938..503f50a2a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2911,6 +2911,8 @@ def test_backwards_dep_printing_and_error(): def test_dump_binary(ctx_factory): + pytest.skip("Test depends on feature which was deprecated in 2016") + ctx = ctx_factory() knl = lp.make_kernel( -- GitLab From 92d64b882b77d203e8d88a2c325fee44665f66ea Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 4 Apr 2019 22:37:00 -0500 Subject: [PATCH 443/916] pylint fixes --- loopy/kernel/function_interface.py | 2 +- loopy/kernel/tools.py | 8 +++++--- loopy/library/reduction.py | 4 ++-- loopy/target/c/__init__.py | 2 +- loopy/target/cuda.py | 18 ------------------ loopy/target/execution.py | 4 ++-- loopy/transform/callable.py | 2 +- test/test_loopy.py | 2 +- 8 files changed, 13 insertions(+), 29 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 0115d3b2b..7b1f4c357 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -771,7 +771,7 @@ class ManglerCallable(ScalarCallable): # The function mangler does not agree with the arg id to dtypes # provided. Indicating that is illegal. raise LoopyError("Function %s not coherent with the provided types." % ( - self.name, kernel.target)) + self.name)) def mangle_result(self, kernel): """ diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 90263b6e1..6d4c34ecb 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -36,7 +36,7 @@ from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg from loopy.tools import natsorted -from loopy.program import Program +from loopy.program import Program, iterate_over_kernels_if_given_program import logging logger = logging.getLogger(__name__) @@ -463,7 +463,9 @@ class DomainChanger: # {{{ graphviz / dot export -def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): +@iterate_over_kernels_if_given_program +def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, + use_insn_id=False): """Return a string in the `dot `_ language depicting dependencies among kernel instructions. 
""" @@ -475,7 +477,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): if iname_cluster and not kernel.schedule: try: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, callables_table) except RuntimeError as e: iname_cluster = False from warnings import warn diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index ab40681d0..357c03feb 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -455,8 +455,8 @@ class ReductionCallable(ScalarCallable): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=name_in_target), callables_table - def with_descr(self, arg_id_to_descr, callables_table): - from loopy.library.kernel.function_interface import ValueArgDescriptor + def with_descrs(self, arg_id_to_descr, callables_table): + from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() return ( diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index f9ab9bcaa..6682b6ec3 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -442,7 +442,7 @@ class CMathCallable(ScalarCallable): pass # fmin elif dtype == np.float32: name = name + "f" # fminf - elif dtype == np.float128: + elif dtype == np.float128: # pylint:disable=no-member name = name + "l" # fminl else: raise LoopyTypeError("%s does not support type %s" diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index bfbe9ca69..dfa94f71b 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -354,24 +354,6 @@ class CUDACASTBuilder(CASTBuilder): return FunctionDeclarationWrapper(fdecl) - def generate_code(self, kernel, codegen_state, impl_arg_info): - code, implemented_domains = ( - super(CudaTarget, self).generate_code( - kernel, codegen_state, impl_arg_info)) - - return code, implemented_domains - - def generate_body(self, kernel, codegen_state): - body, implemented_domains = ( - super(CudaTarget, self).generate_body(kernel, codegen_state)) - - from loopy.kernel.data import ImageArg - - if any(isinstance(arg, ImageArg) for arg in kernel.args): - raise NotImplementedError("not yet: texture arguments in CUDA") - - return body, implemented_domains - def preamble_generators(self): return ( diff --git a/loopy/target/execution.py b/loopy/target/execution.py index c067bc4b9..f6a1d9ad0 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -827,7 +827,7 @@ class KernelExecutorBase(object): dtype = np.dtype(dtype) if isinstance(dtype, np.dtype): from loopy.types import NumpyType - dtype = NumpyType(dtype, self.kernel.target) + dtype = NumpyType(dtype, self.program.target) return dtype @@ -835,7 +835,7 @@ class KernelExecutorBase(object): arg_to_dtype = frozenset( (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype)) - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + kernel = self.get_typed_and_scheduled_program(arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 2fb0b1f53..953ad5613 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -283,7 +283,7 @@ class KernelInliner(SubstitutionMapper): from numbers import Integral if not all(isinstance(d, Integral) for d in callee_arg.shape): raise LoopyError( - "Argument: {0} in callee kernel: {1} does not have " + "Argument: 
{0} in callee kernel does not have " "constant shape.".format(callee_arg)) flatten_index = 0 diff --git a/test/test_loopy.py b/test/test_loopy.py index 503f50a2a..16ec6c1d3 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2888,7 +2888,7 @@ def test_dep_cycle_printing_and_error(): from loopy.diagnostic import DependencyCycleFound with pytest.raises(DependencyCycleFound): - print(lp.generate_code(knl).device_code()) + print(lp.generate_code_v2(knl).device_code()) def test_backwards_dep_printing_and_error(): -- GitLab From b9ae9410120b7f15ac57e6afec700a2cc71e50b8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Apr 2019 14:49:30 +0100 Subject: [PATCH 444/916] Squash deprecation warnings iname_to_tag -> iname_to_tags --- loopy/check.py | 5 +++-- loopy/transform/pack_and_unpack_args.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 977571fcf..796c5b4bd 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -185,8 +185,9 @@ def _get_all_unique_iname_tags(kernel): *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`. """ from loopy.kernel.data import UniqueTag - iname_tags = [kernel.iname_to_tag.get(iname) for iname in - kernel.all_inames()] + from itertools import chain + iname_tags = list(chain(*(kernel.iname_to_tags.get(iname, []) for iname in + kernel.all_inames()))) return set( tag for tag in iname_tags if isinstance(tag, UniqueTag)) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index 67ea48326..a18326187 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -121,8 +121,9 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, from pymbolic import var dim_type = isl.dim_type.set - ilp_inames = set(iname for iname in insn.within_inames if isinstance( - kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag))) + ilp_inames = set(iname for iname in insn.within_inames + if all(isinstance(tag, (IlpBaseTag, VectorizeTag)) + for tag in kernel.iname_to_tags.get(iname, []))) new_ilp_inames = set() ilp_inames_map = {} for iname in ilp_inames: -- GitLab From 1e5bebd3e2e5c0df2060181fa41ec332e68ea574 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Apr 2019 14:49:57 +0100 Subject: [PATCH 445/916] codegen: Handle multiple entries when collecting forward declarations If the codegen has produced a Collection with (say) some static arrays, we can't assume that the callee program ast has an fdecl property. So if it's a collection, spin over the contents. 
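For illustration (this paragraph and the sketch below are editorial, not part
of the original patch): the handling this commit is after amounts to the
following self-contained helper. The name collect_callee_fdecls is made up
here; the actual change does the same thing inline in generate_code_v2, using
cgen's Collection (whose members live in .contents) and the .fdecl attribute
of a FunctionBody.

    from cgen import Collection

    def collect_callee_fdecls(callee_prog_ast):
        # The callee AST is usually a single FunctionBody, but codegen may
        # wrap it in a Collection together with, e.g., static array
        # definitions; only entries that carry an fdecl contribute a
        # forward declaration.
        if isinstance(callee_prog_ast, Collection):
            fdecls = []
            for entry in callee_prog_ast.contents:
                try:
                    fdecls.append(entry.fdecl)
                except AttributeError:
                    pass
            return fdecls
        else:
            return [callee_prog_ast.fdecl]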
--- loopy/codegen/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 8f3e15f28..e7a6f0d3e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -620,7 +620,14 @@ def generate_code_v2(program): callee_prog_ast = callee_cgr.device_programs[0].ast collective_device_program = collective_device_program.copy( ast=Collection([callee_prog_ast, collective_device_program.ast])) - callee_fdecls.append(callee_prog_ast.fdecl) + if isinstance(callee_prog_ast, Collection): + for entry in callee_prog_ast.contents: + try: + callee_fdecls.append(entry.fdecl) + except AttributeError: + pass + else: + callee_fdecls.append(callee_prog_ast.fdecl) # collecting the function declarations of callee kernels for callee_fdecl in callee_fdecls: -- GitLab From 495513f20258bc6f3d328a6284d7c81fa4ba2ad0 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Apr 2019 14:51:18 +0100 Subject: [PATCH 446/916] codegen: mark callee kernels as static They don't need to be visible outside of the single compilation unit, which will help the C compiler a bit. --- loopy/target/c/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 6682b6ec3..4644935e0 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -579,9 +579,13 @@ class CASTBuilder(ASTBuilderBase): if self.target.fortran_abi: name += "_" + if codegen_state.kernel.is_called_from_host: + name = Value("void", name) + else: + name = Value("static void", name) return FunctionDeclarationWrapper( FunctionDeclaration( - Value("void", name), + name, [self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in codegen_state.implemented_data_info])) -- GitLab From 453d6bdbcba60270014ab6d37a8f92a3e8fde01e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 5 Apr 2019 09:34:30 -0500 Subject: [PATCH 447/916] reframes the conditional to check FunctionBody type --- loopy/codegen/__init__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e7a6f0d3e..f7f0c2902 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -580,6 +580,7 @@ def generate_code_v2(program): """ from loopy.kernel import LoopKernel from loopy.program import make_program + from cgen import FunctionBody if isinstance(program, LoopKernel): program = make_program(program) @@ -621,13 +622,14 @@ def generate_code_v2(program): collective_device_program = collective_device_program.copy( ast=Collection([callee_prog_ast, collective_device_program.ast])) if isinstance(callee_prog_ast, Collection): + # if there is a read only constant in the kernel for entry in callee_prog_ast.contents: - try: + if isinstance(entry, FunctionBody): callee_fdecls.append(entry.fdecl) - except AttributeError: - pass - else: + elif isinstance(callee_prog_ast, FunctionBody): callee_fdecls.append(callee_prog_ast.fdecl) + else: + raise NotImplementedError() # collecting the function declarations of callee kernels for callee_fdecl in callee_fdecls: -- GitLab From bdfaa03e1c3eb9737c2178a87bf0a15e79e8bb71 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 5 Apr 2019 10:32:39 -0500 Subject: [PATCH 448/916] improves the not implemented error message --- loopy/codegen/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 
f7f0c2902..d12d36486 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -629,7 +629,8 @@ def generate_code_v2(program): elif isinstance(callee_prog_ast, FunctionBody): callee_fdecls.append(callee_prog_ast.fdecl) else: - raise NotImplementedError() + raise NotImplementedError("Do not know how to add forward" + " declarations for %r." % type(callee_prog_ast)) # collecting the function declarations of callee kernels for callee_fdecl in callee_fdecls: -- GitLab From bc1fc6b170845023425f9f3e05581974df29981d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Apr 2019 13:54:30 +0100 Subject: [PATCH 449/916] Add erf and erfc --- loopy/target/c/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 4644935e0..9cf9e7e94 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -372,7 +372,8 @@ class CMathCallable(ScalarCallable): # unary functions if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc"]: for id in arg_id_to_dtype: if not -1 <= id <= 0: @@ -466,7 +467,7 @@ def scope_c_math_functions(target, identifier): if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", - "fabs", "tan"]: + "fabs", "tan", "erf", "erfc"]: return CMathCallable(name=identifier) return None -- GitLab From b122a35b51272bb05bd484be80e1d1ac0d50f2a1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 21 Apr 2019 11:00:28 -0500 Subject: [PATCH 450/916] handling small git merge failure --- test/test_loopy.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index d7b85260b..ffa84289b 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2909,25 +2909,6 @@ def test_backwards_dep_printing_and_error(): print(knl) -def test_backwards_dep_printing_and_error(): - knl = lp.make_kernel( - "{[i]: 0<=i Date: Sun, 21 Apr 2019 11:06:03 -0500 Subject: [PATCH 451/916] skips test --- test/test_loopy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_loopy.py b/test/test_loopy.py index ffa84289b..1be369c39 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2910,6 +2910,8 @@ def test_backwards_dep_printing_and_error(): def test_dump_binary(ctx_factory): + pytest.skip("Not investing time in passing test depends on feature which was " + "deprecated in 2016") ctx = ctx_factory() knl = lp.make_kernel( -- GitLab From 7781085c493a25df85de0b02affda1baa7d5c49f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 21 Apr 2019 11:34:04 -0500 Subject: [PATCH 452/916] pylint fixes --- loopy/kernel/function_interface.py | 2 +- loopy/kernel/tools.py | 8 +++++--- loopy/library/reduction.py | 2 +- loopy/target/c/__init__.py | 2 +- loopy/target/execution.py | 4 ++-- test/test_loopy.py | 2 +- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 17057691c..1803efdb2 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -598,7 +598,7 @@ class ManglerCallable(ScalarCallable): # The function mangler does not agree with the arg id to dtypes # provided. Indicating that is illegal. 
raise LoopyError("Function %s not coherent with the provided types." % ( - self.name, kernel.target)) + self.name)) def mangle_result(self, kernel): """ diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index ad1153023..c9dae7c1a 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -36,7 +36,7 @@ from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg from loopy.tools import natsorted -from loopy.program import Program +from loopy.program import Program, iterate_over_kernels_if_given_program import logging logger = logging.getLogger(__name__) @@ -463,7 +463,9 @@ class DomainChanger: # {{{ graphviz / dot export -def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): +@iterate_over_kernels_if_given_program +def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, + use_insn_id=False): """Return a string in the `dot `_ language depicting dependencies among kernel instructions. """ @@ -475,7 +477,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): if iname_cluster and not kernel.schedule: try: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, callables_table) except RuntimeError as e: iname_cluster = False from warnings import warn diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index ab40681d0..3a569af8b 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -455,7 +455,7 @@ class ReductionCallable(ScalarCallable): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=name_in_target), callables_table - def with_descr(self, arg_id_to_descr, callables_table): + def with_descrs(self, arg_id_to_descr, callables_table): from loopy.library.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index d1f9957b2..48ba036e0 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -399,7 +399,7 @@ class CMathCallable(ScalarCallable): pass # fabs elif dtype == np.float32: name = name + "f" # fabsf - elif dtype == np.float128: + elif dtype == np.float128: # pylint:disable=no-member name = name + "l" # fabsl else: raise LoopyTypeError("%s does not support type %s" % (name, diff --git a/loopy/target/execution.py b/loopy/target/execution.py index c067bc4b9..f6a1d9ad0 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -827,7 +827,7 @@ class KernelExecutorBase(object): dtype = np.dtype(dtype) if isinstance(dtype, np.dtype): from loopy.types import NumpyType - dtype = NumpyType(dtype, self.kernel.target) + dtype = NumpyType(dtype, self.program.target) return dtype @@ -835,7 +835,7 @@ class KernelExecutorBase(object): arg_to_dtype = frozenset( (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype)) - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + kernel = self.get_typed_and_scheduled_program(arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1be369c39..1c2a0566e 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2887,7 +2887,7 @@ def test_dep_cycle_printing_and_error(): from loopy.diagnostic import DependencyCycleFound with pytest.raises(DependencyCycleFound): - 
print(lp.generate_code(knl).device_code()) + print(lp.generate_code_v2(knl).device_code()) def test_backwards_dep_printing_and_error(): -- GitLab From 6c1cdae06c5a3854390913e5d9d02780d34ac4e5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 21 Apr 2019 12:20:19 -0500 Subject: [PATCH 453/916] handles minor import error --- loopy/library/reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 3a569af8b..357c03feb 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -456,7 +456,7 @@ class ReductionCallable(ScalarCallable): name_in_target=name_in_target), callables_table def with_descrs(self, arg_id_to_descr, callables_table): - from loopy.library.kernel.function_interface import ValueArgDescriptor + from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() return ( -- GitLab From 2c80a3c005a62745f93edc0652b5c70595aeacbf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 21 Apr 2019 12:42:15 -0500 Subject: [PATCH 454/916] adds the variable tag --- loopy/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 161e06b39..73fcd75bb 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1295,9 +1295,9 @@ def count_insn_runs(knl, callables_table, insn, count_redundant_work, if disregard_local_axes: from loopy.kernel.data import LocalIndexTag - insn_inames = [iname - for iname in insn_inames - if not knl.iname_tags_of_type(iname, LocalIndexTag)] + insn_inames = frozenset( + [iname for iname in insn_inames + if not knl.iname_tags_of_type(iname, LocalIndexTag)]) inames_domain = knl.get_inames_domain(insn_inames) domain = (inames_domain.project_out_except( @@ -1568,7 +1568,6 @@ def _process_subgroup_size(knl, subgroup_size_requested): # {{{ get_mem_access_map - def get_mem_access_map_for_single_kernel(knl, callables_table, numpy_types=True, count_redundant_work=False, subgroup_size=None): @@ -1632,6 +1631,7 @@ def get_mem_access_map_for_single_kernel(knl, callables_table, gid_strides=mem_access.gid_strides, direction=mem_access.direction, variable=mem_access.variable, + variable_tag=mem_access.variable_tag, count_granularity=mem_access.count_granularity), ct) for mem_access, ct in six.iteritems(access_map.count_map)), -- GitLab From cd7f75c47a4a955d82f94a584fb158e2ac1030f6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 12 Apr 2019 13:54:30 +0100 Subject: [PATCH 455/916] Add erf and erfc --- loopy/target/c/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 4644935e0..9cf9e7e94 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -372,7 +372,8 @@ class CMathCallable(ScalarCallable): # unary functions if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]: + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc"]: for id in arg_id_to_dtype: if not -1 <= id <= 0: @@ -466,7 +467,7 @@ def scope_c_math_functions(target, identifier): if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", - "fabs", "tan"]: + "fabs", "tan", "erf", "erfc"]: return 
CMathCallable(name=identifier) return None -- GitLab From 53165a5bf6a36cabf990d45951c36dcaef317803 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 14 May 2019 19:11:14 -0500 Subject: [PATCH 456/916] Pass filename to Fortran parser for nicer diagnostics --- loopy/frontend/fortran/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 05b0a9205..0434f4e90 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -263,7 +263,7 @@ def parse_fortran(source, filename="", free_form=True, strict=True, from fparser import api tree = api.parse(source, isfree=free_form, isstrict=strict, - analyze=False, ignore_comments=False) + analyze=False, ignore_comments=False, filename=filename) if tree is None: raise LoopyError("Fortran parser was unhappy with source code " -- GitLab From ae978d1cf05687d092b49593e664bae9402b8f24 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 14 May 2019 19:11:38 -0500 Subject: [PATCH 457/916] Flake8: remove extraneous import --- loopy/transform/subst.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 733137efb..7363cdc3c 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -34,7 +34,6 @@ from pytools import ImmutableRecord from pymbolic import var from loopy.program import iterate_over_kernels_if_given_program -from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging -- GitLab From c403fb4f00029d571fabcbea5893071e115cfe8b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 14 May 2019 19:28:01 -0500 Subject: [PATCH 458/916] Fix test_nested_substs_in_insns --- test/test_transform.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_transform.py b/test/test_transform.py index 453f3b14a..59f68e598 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -23,6 +23,7 @@ THE SOFTWARE. 
""" import sys +import six import numpy as np import loopy as lp import pyopencl as cl @@ -564,7 +565,7 @@ def test_nested_substs_in_insns(ctx_factory): ctx = ctx_factory() import loopy as lp - ref_knl = lp.make_kernel( + ref_prg = lp.make_kernel( "{[i]: 0<=i<10}", """ a(x) := 2 * x @@ -574,10 +575,12 @@ def test_nested_substs_in_insns(ctx_factory): """ ) - knl = lp.expand_subst(ref_knl) - assert not knl.substitutions + prg = lp.expand_subst(ref_prg) + assert not any( + cknl.subkernel.substitutions + for cknl in six.itervalues(prg.callables_table.resolved_functions)) - lp.auto_test_vs_ref(ref_knl, ctx, knl) + lp.auto_test_vs_ref(ref_prg, ctx, prg) if __name__ == "__main__": -- GitLab From 9a1c3c343952cfe467d679fbfd7f3a05dfdf7a05 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 14 May 2019 20:15:20 -0500 Subject: [PATCH 459/916] Export CallablesTable as a global symbol --- loopy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index e4fa2c16e..9c4201662 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -51,7 +51,7 @@ from loopy.kernel.data import ( from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.program import ( - Program, make_program) + CallablesTable, Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -177,7 +177,7 @@ __all__ = [ "ScalarCallable", "CallableKernel", - "Program", "make_program", + "CallablesTable", "Program", "make_program", "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", -- GitLab From dd2d74b1003dfd1cac1c434aa166ed75e9b134ee Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 14 May 2019 20:17:17 -0500 Subject: [PATCH 460/916] Assumptions processing: Deal with case of no loop domains --- loopy/kernel/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 32f1f77ee..679944acb 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -297,7 +297,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): # {{{ process assumptions - if assumptions is None: + if assumptions is None and domains: dom0_space = domains[0].get_space() assumptions_space = isl.Space.params_alloc( dom0_space.get_ctx(), dom0_space.dim(dim_type.param)) @@ -307,6 +307,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): dom0_space.get_dim_name(dim_type.param, i)) assumptions = isl.BasicSet.universe(assumptions_space) + elif assumptions is None and not domains: + assumptions = isl.BasicSet.read_from_str( + isl.DEFAULT_CONTEXT, "[] -> { : 1 = 1}") + elif isinstance(assumptions, str): assumptions_set_str = "[%s] -> { : %s}" \ % (",".join(s for s in self.outer_params(domains)), -- GitLab From 8704ac90ede2dc48366d1e2ecca48dd8bf0bf5b3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 14 May 2019 20:17:35 -0500 Subject: [PATCH 461/916] CLI: Deal with more Fortran file extensions --- loopy/cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/loopy/cli.py b/loopy/cli.py index 060340d59..ed50cec1f 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -108,9 +108,11 @@ def main(): ".loopy": "loopy", ".floopy": "fortran", ".f90": "fortran", + ".F90": "fortran", ".fpp": "fortran", ".f": "fortran", ".f77": "fortran", + ".F77": "fortran", }.get(ext) with open(args.infile, "r") as infile_fd: infile_content = infile_fd.read() -- GitLab From 
30efebf794080e2008f54baf20bad82c1ecbeca5 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 14 May 2019 20:17:56 -0500 Subject: [PATCH 462/916] Fortran: towards processing Call nodes --- loopy/frontend/fortran/translator.py | 42 ++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index d7a1b2498..30d97bd53 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -218,11 +218,16 @@ class F2LoopyTranslator(FTreeWalkerBase): self.block_nest = [] + def add_instruction(self, insn): + scope = self.scope_stack[-1] + + scope.previous_instruction_id = insn.id + scope.instructions.append(insn) + def add_expression_instruction(self, lhs, rhs): scope = self.scope_stack[-1] - new_id = intern("insn%d" % self.insn_id_counter) - self.insn_id_counter += 1 + new_id = self.get_insn_id() from loopy.kernel.data import Assignment insn = Assignment( @@ -233,8 +238,13 @@ class F2LoopyTranslator(FTreeWalkerBase): predicates=frozenset(self.conditions), tags=tuple(self.instruction_tags)) - scope.previous_instruction_id = new_id - scope.instructions.append(insn) + self.add_instruction(insn) + + def get_insn_id(self): + new_id = intern("insn%d" % self.insn_id_counter) + self.insn_id_counter += 1 + + return new_id # {{{ map_XXX functions @@ -437,7 +447,23 @@ class F2LoopyTranslator(FTreeWalkerBase): raise NotImplementedError("goto") def map_Call(self, node): - raise NotImplementedError("call") + scope = self.scope_stack[-1] + + new_id = self.get_insn_id() + + from pymbolic import var + + # FIXME: Actually process arguments + from loopy.kernel.data import CallInstruction + insn = CallInstruction( + (), var(node.designator)(), + within_inames=frozenset( + scope.active_loopy_inames), + id=new_id, + predicates=frozenset(self.conditions), + tags=tuple(self.instruction_tags)) + + self.add_instruction(insn) def map_Return(self, node): raise NotImplementedError("return") @@ -725,7 +751,11 @@ class F2LoopyTranslator(FTreeWalkerBase): result.append(knl) - return result + ctable = lp.CallablesTable({knl.name: lp.CallableKernel(result)}) + + return lp.Program( + result[0].name, + ctable) # }}} -- GitLab From cbb9942cf0d6d556c896ea5dc9f8d3c55589df56 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 14 May 2019 20:19:09 -0500 Subject: [PATCH 463/916] Add xfail'd Fortran subroutine test --- test/test_fortran.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/test_fortran.py b/test/test_fortran.py index 5d5f7f0b1..77321e8fa 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -498,6 +498,33 @@ def test_precompute_some_exist(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) +def test_fortran_subroutines(ctx_factory): + fortran_src = """ + subroutine twice(n, a) + implicit none + real*8 a(n) + integer i,n + + do i=1,n + a(i) = a(i) * 2 + end do + end subroutine + + subroutine twice_cross(n, a, i) + implicit none + integer i, n + real*8 a(n,n) + + call twice(1:n, i) + call twice(i, 1:n) + + + end subroutine + """ + knl, = lp.parse_fortran(fortran_src) + pytest.xfail("not yet fully implemented") + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 0583b65ebedd31cd352753dfccdb0f0267d6479d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 May 2019 08:31:31 -0500 Subject: [PATCH 464/916] WIP: need to fix the arguments registered in the 
call --- loopy/frontend/fortran/__init__.py | 2 +- loopy/frontend/fortran/translator.py | 26 ++++++++---- loopy/kernel/tools.py | 59 +++++++++++++++++++++++++++- 3 files changed, 77 insertions(+), 10 deletions(-) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 0434f4e90..05b0a9205 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -263,7 +263,7 @@ def parse_fortran(source, filename="", free_form=True, strict=True, from fparser import api tree = api.parse(source, isfree=free_form, isstrict=strict, - analyze=False, ignore_comments=False, filename=filename) + analyze=False, ignore_comments=False) if tree is None: raise LoopyError("Fortran parser was unhappy with source code " diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 30d97bd53..45b7185f4 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -732,8 +732,7 @@ class F2LoopyTranslator(FTreeWalkerBase): # }}} - from loopy.version import MOST_RECENT_LANGUAGE_VERSION - knl = lp.make_kernel( + knl = lp.make_function( sub.index_sets, sub.instructions, kernel_data, @@ -742,7 +741,6 @@ class F2LoopyTranslator(FTreeWalkerBase): index_dtype=self.index_dtype, target=self.target, seq_dependencies=seq_dependencies, - lang_version=MOST_RECENT_LANGUAGE_VERSION ) from loopy.loop import fuse_loop_domains @@ -751,11 +749,23 @@ class F2LoopyTranslator(FTreeWalkerBase): result.append(knl) - ctable = lp.CallablesTable({knl.name: lp.CallableKernel(result)}) - - return lp.Program( - result[0].name, - ctable) + from loopy.kernel.tools import identify_root_kernel + from loopy.program import make_program + from loopy.transform.callable import register_callable_kernel + + root_knl_name = identify_root_kernel(result) + root_knl = [knl for knl in result if knl.name == + root_knl_name][0].copy(is_called_from_host=True) + print(root_knl) + callee_kernels = [knl for knl in result if knl.name != root_knl_name] + print(callee_kernels[0]) + prog = make_program(root_knl) + for callee_knl in callee_kernels: + #FIXME: This would need some sort of traversal to be valid + # for all cases + prog = register_callable_kernel(prog, callee_knl) + + return prog # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 6d4c34ecb..7c0f3c095 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -36,8 +36,12 @@ from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg from loopy.tools import natsorted +from loopy.symbolic import CombineMapper +from loopy.kernel import LoopKernel from loopy.program import Program, iterate_over_kernels_if_given_program - +from loopy.kernel.instruction import (MultiAssignmentBase, + _DataObliviousInstruction) +from functools import reduce import logging logger = logging.getLogger(__name__) @@ -1949,4 +1953,57 @@ def infer_args_are_output_only(kernel): # }}} + +class CallCollector(CombineMapper): + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from pymbolic.primitives import CallWithKwargs + return self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={})) + + def map_call_with_kwargs(self, expr): + return (frozenset([expr.function.name]) | + self.combine((self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values())))) + + def map_constant(self, expr): + 
return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def identify_root_kernel(kernels): + assert isinstance(kernels, list) + assert all(isinstance(knl, LoopKernel) for knl in kernels) + call_collector = CallCollector() + + def _calls_in_a_kernel(knl): + calls = set() + for insn in knl.instructions: + if isinstance(insn, MultiAssignmentBase): + calls = calls | call_collector(insn.expression) + elif isinstance(insn, _DataObliviousInstruction): + pass + else: + raise NotImplementedError() + + return calls + + all_calls = frozenset().union(*[_calls_in_a_kernel(knl) for knl in + kernels]) + + kernel_names = frozenset([knl.name for knl in kernels]) + + assert len(kernel_names - all_calls) == 1 + + root_knl_name, = (kernel_names - all_calls) + return root_knl_name + # vim: foldmethod=marker -- GitLab From 240e06bb0e302f5e4d047d96dcae5126123952db Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 15 May 2019 10:42:34 -0500 Subject: [PATCH 465/916] Minor fixes to test_fortran_subroutines --- test/test_fortran.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index 77321e8fa..6946f1181 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -515,8 +515,8 @@ def test_fortran_subroutines(ctx_factory): integer i, n real*8 a(n,n) - call twice(1:n, i) - call twice(i, 1:n) + call twice(n, a(1:n, i)) + call twice(n, a(i, 1:n)) end subroutine -- GitLab From 18c42eb3ef7bb4f307ccf86da60bc460412dd012 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 16 May 2019 15:01:36 -0500 Subject: [PATCH 466/916] one variant of the slice notation works --- loopy/frontend/fortran/translator.py | 24 +++++++++++++++++++++--- loopy/kernel/creation.py | 11 +++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 45b7185f4..3f5d89d62 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -37,7 +37,7 @@ import islpy as isl from islpy import dim_type from loopy.symbolic import IdentityMapper from loopy.diagnostic import LoopyError -from pymbolic.primitives import Wildcard +from pymbolic.primitives import (Wildcard, Slice) # {{{ subscript base shifter @@ -72,10 +72,20 @@ class SubscriptIndexBaseShifter(IdentityMapper): subscript[i] -= dims[i][0] elif len(dims[i]) == 1: # base index is 1 implicitly - subscript[i] -= 1 + if not isinstance(subscript[i], Slice): + subscript[i] -= 1 return expr.aggregate[self.rec(tuple(subscript))] + def map_slice(self, expr): + start = expr.start-1 + stop = expr.stop + if expr.step: + step = expr.step + else: + step = 1 + return Slice((start, stop, step)) + # }}} @@ -456,7 +466,8 @@ class F2LoopyTranslator(FTreeWalkerBase): # FIXME: Actually process arguments from loopy.kernel.data import CallInstruction insn = CallInstruction( - (), var(node.designator)(), + (), var(node.designator)(*(scope.process_expression_for_loopy( + self.parse_expr(node, item)) for item in node.items)), within_inames=frozenset( scope.active_loopy_inames), id=new_id, @@ -707,6 +718,7 @@ class F2LoopyTranslator(FTreeWalkerBase): arg_name, dtype=sub.get_type(arg_name), shape=sub.get_loopy_shape(arg_name), + is_output_only=False, )) else: kernel_data.append( @@ -732,6 +744,9 @@ class F2LoopyTranslator(FTreeWalkerBase): # }}} + if sub.index_sets == []: + sub.index_sets = [isl.BasicSet('{:}')] + knl 
= lp.make_function( sub.index_sets, sub.instructions, @@ -763,8 +778,11 @@ class F2LoopyTranslator(FTreeWalkerBase): for callee_knl in callee_kernels: #FIXME: This would need some sort of traversal to be valid # for all cases + # THIS IS A VERY IMPORTANT FIXME!! prog = register_callable_kernel(prog, callee_knl) + print(prog) + return prog # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index a11291419..59a4f7896 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1939,6 +1939,17 @@ class SliceToInameReplacer(IdentityMapper): ctx = self.knl.isl_context space = isl.Space.create_from_names(ctx, set=list(self.iname_domains.keys())) + from loopy.symbolic import DependencyMapper + args_as_params_for_domains = set() + for _, (start, stop, step) in self.iname_domains.items(): + args_as_params_for_domains |= DependencyMapper()(start) + args_as_params_for_domains |= DependencyMapper()(stop) + args_as_params_for_domains |= DependencyMapper()(step) + + space = space.add_dims(1, len(args_as_params_for_domains)) + for i, arg in enumerate(args_as_params_for_domains): + space = space.set_dim_id(1, i, isl.Id(arg.name)) + iname_set = isl.BasicSet.universe(space) from loopy.isl_helpers import make_slab -- GitLab From 7d51d1503005dbaacb6e20d8d79931c8391ab4a5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 16 May 2019 16:06:25 -0500 Subject: [PATCH 467/916] Guard simplify_via_aff for non-affine exprs --- loopy/symbolic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 898c3efe8..9a64fe4ac 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -850,9 +850,13 @@ class SubArrayRef(p.Expression): from loopy.isl_helpers import simplify_via_aff sub_dim_tags = [] sub_shape = [] - linearized_index = simplify_via_aff( - sum(dim_tag.stride*iname for dim_tag, iname in - zip(arg.dim_tags, self.subscript.index_tuple))) + try: + linearized_index = simplify_via_aff( + sum(dim_tag.stride*iname for dim_tag, iname in + zip(arg.dim_tags, self.subscript.index_tuple))) + except isl.Error: + linearized_index = sum(dim_tag.stride*iname for dim_tag, iname in + zip(arg.dim_tags, self.subscript.index_tuple)) strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) -- GitLab From 663d80936751a1a520b28a882c57f028a6b3858f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 16 May 2019 16:49:53 -0500 Subject: [PATCH 468/916] removes debug statememnt --- loopy/frontend/fortran/translator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 3f5d89d62..e1b729af8 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -781,8 +781,6 @@ class F2LoopyTranslator(FTreeWalkerBase): # THIS IS A VERY IMPORTANT FIXME!! 
prog = register_callable_kernel(prog, callee_knl) - print(prog) - return prog # }}} -- GitLab From e952887fd8594d43874e3cb56c10336e06da70bb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 16 May 2019 16:50:26 -0500 Subject: [PATCH 469/916] asserts that dict keys are the same as the callee kernel names --- loopy/program.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/loopy/program.py b/loopy/program.py index c8534f051..bd674caea 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -595,6 +595,9 @@ class CallablesTable(ImmutableRecord): history = dict((func_id, frozenset([func_id])) for func_id in resolved_functions) + assert all(call.subkernel.name == name for name, call in + resolved_functions.items() if isinstance(call, CallableKernel)) + super(CallablesTable, self).__init__( resolved_functions=resolved_functions, history=history, @@ -822,6 +825,10 @@ class CallablesTable(ImmutableRecord): unique_function_identifier = ( next_indexed_function_identifier( unique_function_identifier)) + if isinstance(in_kernel_callable, CallableKernel): + in_kernel_callable = (in_kernel_callable.copy( + subkernel=in_kernel_callable.subkernel.copy( + name=unique_function_identifier))) updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( @@ -883,6 +890,10 @@ class CallablesTable(ImmutableRecord): if func_id in renames_needed: new_func_id = renames_needed[func_id] + if isinstance(in_knl_callable, CallableKernel): + in_knl_callable = (in_knl_callable.copy( + subkernel=in_knl_callable.subkernel.copy( + name=new_func_id))) new_resolved_functions[new_func_id] = ( in_knl_callable) new_history[new_func_id] = self.history[func_id] -- GitLab From 72856574c38129271e018bd08210d9f290cc987e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 16 May 2019 16:50:45 -0500 Subject: [PATCH 470/916] adds test for testing --- test/test_fortran.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index 6946f1181..c038aa9fa 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -521,8 +521,8 @@ def test_fortran_subroutines(ctx_factory): end subroutine """ - knl, = lp.parse_fortran(fortran_src) - pytest.xfail("not yet fully implemented") + knl = lp.parse_fortran(fortran_src) + print(lp.generate_code_v2(knl).device_code()) if __name__ == "__main__": -- GitLab From 246fac923fb8013601ee0cc072b5ff6ae2d10d08 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 17 May 2019 06:56:12 -0500 Subject: [PATCH 471/916] removes debug statements --- loopy/frontend/fortran/translator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index e1b729af8..2af9ac3da 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -771,9 +771,7 @@ class F2LoopyTranslator(FTreeWalkerBase): root_knl_name = identify_root_kernel(result) root_knl = [knl for knl in result if knl.name == root_knl_name][0].copy(is_called_from_host=True) - print(root_knl) callee_kernels = [knl for knl in result if knl.name != root_knl_name] - print(callee_kernels[0]) prog = make_program(root_knl) for callee_knl in callee_kernels: #FIXME: This would need some sort of traversal to be valid -- GitLab From 7f04f3927f1f0899ea597a9f9164bc7634f8c22a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 19 May 2019 16:12:07 -0500 Subject: [PATCH 472/916] Fix Fortran slice handling --- 
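The index arithmetic behind this change, reduced to a self-contained sketch (plain Python, unit stride only, which is the case handled in this patch): Fortran slice bounds are inclusive and counted from the array's declared base index (1 by default), whereas loopy wants 0-based, half-open bounds.

def fortran_slice_to_loopy(start, stop, base_index=1):
    # inclusive, base_index-based bounds -> 0-based, half-open bounds
    return (start - base_index, stop - base_index + 1)

# a(1:n) on an array declared a(n) sweeps loopy indices 0, ..., n-1
assert fortran_slice_to_loopy(1, 10) == (0, 10)
# a(2:5) on an array declared a(0:9) sweeps loopy indices 2, ..., 5
assert fortran_slice_to_loopy(2, 5, base_index=0) == (2, 6)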
loopy/frontend/fortran/translator.py | 60 +++++++++++++++++++--------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 2af9ac3da..aef4ea8f1 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -42,7 +42,9 @@ from pymbolic.primitives import (Wildcard, Slice) # {{{ subscript base shifter -class SubscriptIndexBaseShifter(IdentityMapper): +class SubscriptIndexAdjuster(IdentityMapper): + """Adjust base indices of subscripts and lengths of slices.""" + def __init__(self, scope): self.scope = scope @@ -60,31 +62,53 @@ class SubscriptIndexBaseShifter(IdentityMapper): if not isinstance(subscript, tuple): subscript = (subscript,) - subscript = list(subscript) - if len(dims) != len(subscript): raise TranslationError("inconsistent number of indices " "to '%s'" % name) + new_subscript = [] for i in range(len(dims)): if len(dims[i]) == 2: - # has a base index - subscript[i] -= dims[i][0] + # has an explicit base index + base_index, end_index = dims[i] elif len(dims[i]) == 1: - # base index is 1 implicitly - if not isinstance(subscript[i], Slice): - subscript[i] -= 1 + base_index = 1 + end_index, = dims[i] - return expr.aggregate[self.rec(tuple(subscript))] + sub_i = subscript[i] + if isinstance(sub_i, Slice): + start = sub_i.start + if start is None: + start = base_index - def map_slice(self, expr): - start = expr.start-1 - stop = expr.stop - if expr.step: - step = expr.step - else: - step = 1 - return Slice((start, stop, step)) + step = sub_i.step + if step is None: + step = 1 + + stop = sub_i.stop + if stop is None: + stop = end_index + + if step != 1: + # FIXME + raise NotImplementedError("Fortran slice processing for " + "non-unit strides") + + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - base_index + 1, + + step + )) + + else: + sub_i = sub_i - base_index + + new_subscript.append(sub_i) + + return expr.aggregate[self.rec(tuple(new_subscript))] # }}} @@ -197,7 +221,7 @@ class Scope(object): expr = submap(expr) - subshift = SubscriptIndexBaseShifter(self) + subshift = SubscriptIndexAdjuster(self) expr = subshift(expr) return expr -- GitLab From 1b0c5f4a0906af92b2b6f5bdf9e5fa5f6c7cae6e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 19 May 2019 16:13:07 -0500 Subject: [PATCH 473/916] Clarify, use that LoopKenrel.domains may be empty --- loopy/frontend/fortran/translator.py | 3 --- loopy/kernel/__init__.py | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index aef4ea8f1..a507c2e67 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -768,9 +768,6 @@ class F2LoopyTranslator(FTreeWalkerBase): # }}} - if sub.index_sets == []: - sub.index_sets = [isl.BasicSet('{:}')] - knl = lp.make_function( sub.index_sets, sub.instructions, diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6872712bd..e5e6a61ec 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -143,8 +143,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: domains - a list of :class:`islpy.BasicSet` instances - representing the :ref:`domain-tree`. + a list of :class:`islpy.BasicSet` instances representing the + :ref:`domain-tree`. May be empty. .. 
attribute:: instructions @@ -611,7 +611,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): for dom in self.domains: return dom.get_ctx() - assert False + return isl.DEFAULT_CONTEXT @memoize_method def combine_domains(self, domains): -- GitLab From f255bbfccfebb8c9abdc95f03806e9785956a644 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 19 May 2019 16:13:33 -0500 Subject: [PATCH 474/916] Comment/doc cleanups --- loopy/frontend/fortran/translator.py | 1 - loopy/program.py | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index a507c2e67..26dbb4bfa 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -487,7 +487,6 @@ class F2LoopyTranslator(FTreeWalkerBase): from pymbolic import var - # FIXME: Actually process arguments from loopy.kernel.data import CallInstruction insn = CallInstruction( (), var(node.designator)(*(scope.process_expression_for_loopy( diff --git a/loopy/program.py b/loopy/program.py index bd674caea..1f7898254 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -334,10 +334,6 @@ class Program(ImmutableRecord): """ Returns an instance of :class:`loopy.LoopKernel` denoting the topmost level kernel. - - .. note:: - - Syntactic sugar. """ return self.callables_table[self.name].subkernel @@ -345,27 +341,16 @@ class Program(ImmutableRecord): def arg_dict(self): """ Returns ``arg_dict`` of the ``root_kernel``. - - .. note:: - - Syntactic sugar. """ return self.root_kernel.arg_dict @property def args(self): - """ - Returns ``args`` of the ``root_kernel``. - - .. note:: - - Syntactic sugar. - """ + """Returns ``args`` of the ``root_kernel``.""" return self.root_kernel.args[:] def with_root_kernel(self, root_kernel): - """ - Returns a copy of *self* with the topmost level kernel as + """:returns: a copy of *self* with the topmost level kernel as *root_kernel*. 
""" new_in_knl_callable = self.callables_table[ -- GitLab From df5eb3ce066dd55c74a68b7c99e5e778346a05cd Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 19 May 2019 16:13:57 -0500 Subject: [PATCH 475/916] Program.__str__: Make sure all callables are printed --- loopy/program.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 1f7898254..99b0fe2b0 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -374,7 +374,17 @@ class Program(ImmutableRecord): return pex(*args, **kwargs) def __str__(self): - return self.root_kernel.__str__() + # FIXME: do a topological sort by the call graph + + def strify_callable(clbl): + if isinstance(clbl, CallableKernel): + return str(clbl.subkernel) + else: + return str(clbl) + + return "\n".join( + strify_callable(clbl) + for name, clbl in six.iteritems(self.callables_table)) # }}} -- GitLab From 9007a7cf0879c41e70b9122bbe9ac7ba3ddf0f76 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 May 2019 18:10:58 -0500 Subject: [PATCH 476/916] InKernelCallable.with_descrs: Pass caller kernel for better diagnostics --- loopy/kernel/function_interface.py | 22 ++++++++++++---------- loopy/preprocess.py | 3 ++- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 7b1f4c357..536fc9735 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -240,7 +240,7 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - def with_descrs(self, arg_id_to_descr, callables_table): + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -373,7 +373,7 @@ class ScalarCallable(InKernelCallable): raise LoopyError("No type inference information present for " "the function %s." % (self.name)) - def with_descrs(self, arg_id_to_descr, callables_table): + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table): arg_id_to_descr[-1] = ValueArgDescriptor() return ( @@ -574,7 +574,7 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=specialized_kernel, arg_id_to_dtype=new_arg_id_to_dtype), callables_table - def with_descrs(self, arg_id_to_descr, callables_table): + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table): # tune the subkernel so that we have the matching shapes and # dim_tags @@ -589,9 +589,10 @@ class CallableKernel(InKernelCallable): if isinstance(descr, ArrayArgDescriptor): if not isinstance(self.subkernel.arg_dict[arg_id], ArrayArg): - raise LoopyError("Array passed to a scalar type argument " - " '%s' in the function '%s'." % ( - arg_id, self.subkernel.name)) + raise LoopyError("Array passed to a scalar argument " + " '%s' of the function '%s' (in '%s')" % ( + arg_id, self.subkernel.name, + caller_kernel.name)) new_arg = self.subkernel.arg_dict[arg_id].copy( shape=descr.shape, @@ -602,12 +603,13 @@ class CallableKernel(InKernelCallable): new_args] elif isinstance(descr, ValueArgDescriptor): if not isinstance(self.subkernel.arg_dict[arg_id], ValueArg): - raise LoopyError("Scalar passed to an array type argument " - " '%s' in the function '%s'." 
% ( - arg_id, self.subkernel.name)) + raise LoopyError("Scalar passed to an array argument " + " '%s' of the callable '%s' (in '%s')" % ( + arg_id, self.subkernel.name, + caller_kernel.name)) else: raise LoopyError("Descriptor must be either an instance of " - "ArrayArgDescriptor or ValueArgDescriptor -- got %s." % + "ArrayArgDescriptor or ValueArgDescriptor -- got %s" % type(descr)) descriptor_specialized_knl = self.subkernel.copy(args=new_args) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index aa536d7ae..a8dde5792 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2209,7 +2209,8 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): in_knl_callable = self.callables_table[expr.function.name] new_in_knl_callable, self.callables_table = ( in_knl_callable.with_descrs( - combined_arg_id_to_descr, self.callables_table)) + combined_arg_id_to_descr, self.caller_kernel, + self.callables_table)) self.callables_table, new_func_id = ( self.callables_table.with_callable( expr.function.function, -- GitLab From f3b25aaf0bd96c808f745c48b86ac8d1bc5faebf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 May 2019 18:12:02 -0500 Subject: [PATCH 477/916] Adjust loopy cli for multi-kernel module parsing --- loopy/cli.py | 67 ++++++++++------------------------------------------ 1 file changed, 12 insertions(+), 55 deletions(-) diff --git a/loopy/cli.py b/loopy/cli.py index ed50cec1f..3dbdeb41e 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -65,11 +65,9 @@ def main(): parser.add_argument("--target", choices=( "opencl", "ispc", "ispc-occa", "c", "c-fortran", "cuda"), default="opencl") - parser.add_argument("--name") parser.add_argument("--transform") parser.add_argument("--edit-code", action="store_true") parser.add_argument("--occa-defines") - parser.add_argument("--occa-add-dummy-arg", action="store_true") parser.add_argument("--print-ir", action="store_true") args = parser.parse_args() @@ -163,10 +161,7 @@ def main(): raise RuntimeError("loopy-lang requires 'lp_knl' " "to be defined on exit") - if args.name is not None: - kernel = kernel.copy(name=args.name) - - kernels = [kernel] + prg = [kernel] elif lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None @@ -183,69 +178,31 @@ def main(): defines_to_python_code(defines_fd.read()) + pre_transform_code) - kernels = lp.parse_transformed_fortran( + prg = lp.parse_transformed_fortran( infile_content, pre_transform_code=pre_transform_code, filename=args.infile) - if args.name is not None: - kernels = [kernel for kernel in kernels - if kernel.name == args.name] - - if not kernels: - raise RuntimeError("no kernels found (name specified: %s)" - % args.name) - else: raise RuntimeError("unknown language: '%s'" % args.lang) + if not isinstance(prg, lp.Program): + # FIXME + assert isinstance(prg, list) # of kernels + raise NotImplementedError("convert list of kernels to Program") + if args.print_ir: - for kernel in kernels: - print(kernel, file=sys.stderr) - - if args.occa_add_dummy_arg: - new_kernels = [] - for kernel in kernels: - new_args = [ - lp.GlobalArg("occa_info", np.int32, shape=None) - ] + kernel.args - new_kernels.append(kernel.copy(args=new_args)) - - kernels = new_kernels - del new_kernels - - codes = [] - from loopy.codegen import generate_code - for kernel in kernels: - kernel = lp.preprocess_kernel(kernel) - code, impl_arg_info = generate_code(kernel) - codes.append(code) + print(prg, file=sys.stderr) + + prg = lp.preprocess_kernel(prg) + cgr = lp.generate_code_v2(prg) if args.outfile is not None: outfile 
= args.outfile else: outfile = "-" - code = "\n\n".join(codes) - - # {{{ edit code if requested - - import os - edit_kernel_env = os.environ.get("LOOPY_EDIT_KERNEL") - need_edit = args.edit_code - if not need_edit and edit_kernel_env is not None: - # Do not replace with "any()"--Py2.6/2.7 bug doesn't like - # comprehensions in functions with exec(). - - for k in kernels: - if edit_kernel_env.lower() in k.name.lower(): - need_edit = True - - if need_edit: - from pytools import invoke_editor - code = invoke_editor(code, filename="edit.cl") - - # }}} + code = cgr.device_code() if outfile == "-": sys.stdout.write(code) -- GitLab From ef4e71836271fbf3539dffdb361918b0262a909d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 May 2019 18:14:43 -0500 Subject: [PATCH 478/916] Fortran parser: Add handling for negative-stride slices --- loopy/frontend/fortran/translator.py | 30 ++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 26dbb4bfa..6fec4672b 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -89,19 +89,29 @@ class SubscriptIndexAdjuster(IdentityMapper): if stop is None: stop = end_index - if step != 1: - # FIXME - raise NotImplementedError("Fortran slice processing for " - "non-unit strides") + if step == 1: + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - base_index + 1, + + step + )) + elif step == -1: + sub_i = Slice(( + start - base_index, - sub_i = Slice(( - start - base_index, + # FIXME This is only correct for unit strides + stop - base_index - 1, - # FIXME This is only correct for unit strides - stop - base_index + 1, + step + )) - step - )) + else: + # FIXME + raise NotImplementedError("Fortran slice processing for " + "non-unit strides") else: sub_i = sub_i - base_index -- GitLab From 3613c3cd9e2322f59c264b3496ae95fd2caa94e9 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 May 2019 18:16:08 -0500 Subject: [PATCH 479/916] Fortran parsing: deal with variabl initializers --- loopy/frontend/fortran/translator.py | 30 +++++++++++++++++++++------- loopy/frontend/fortran/tree.py | 30 ++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 6fec4672b..680e8177b 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -129,9 +129,6 @@ class Scope(object): def __init__(self, subprogram_name, arg_names=set()): self.subprogram_name = subprogram_name - # map name to data - self.data_statements = {} - # map first letter to type self.implicit_types = {} @@ -142,7 +139,7 @@ class Scope(object): self.type_map = {} # map name to data - self.data = {} + self.data_map = {} self.arg_names = arg_names @@ -382,7 +379,8 @@ class F2LoopyTranslator(FTreeWalkerBase): tp = self.dtype_from_stmt(node) - for name, shape in self.parse_dimension_specs(node, node.entity_decls): + for name, shape, initializer in self.parse_dimension_specs( + node, node.entity_decls): if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -391,6 +389,9 @@ class F2LoopyTranslator(FTreeWalkerBase): assert name not in scope.type_map scope.type_map[name] = tp + assert name not in scope.data_map + scope.data_map[name] = initializer + return [] map_Logical = map_type_decl @@ -402,7 +403,10 @@ class 
F2LoopyTranslator(FTreeWalkerBase): def map_Dimension(self, node): scope = self.scope_stack[-1] - for name, shape in self.parse_dimension_specs(node, node.items): + for name, shape, initializer in self.parse_dimension_specs(node, node.items): + if initializer is not None: + raise LoopyError("initializer in dimension statement") + if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -744,6 +748,10 @@ class F2LoopyTranslator(FTreeWalkerBase): for arg_name in sub.arg_names: dims = sub.dim_map.get(arg_name) + if sub.data_map.get(arg_name) is not None: + raise NotImplementedError( + "initializer for argument %s" % arg_name) + if dims is not None: # default order is set to "F" in kernel creation below kernel_data.append( @@ -770,10 +778,18 @@ class F2LoopyTranslator(FTreeWalkerBase): if sub.implicit_types is None and dtype is None: continue + kwargs = {} + if sub.data_map.get(var_name) is not None: + kwargs["read_only"] = True + kwargs["address_space"] = lp.AddressSpace.PRIVATE + kwargs["initializer"] = np.array( + sub.data_map[var_name], dtype=dtype) + kernel_data.append( lp.TemporaryVariable( var_name, dtype=dtype, - shape=sub.get_loopy_shape(var_name))) + shape=sub.get_loopy_shape(var_name), + **kwargs)) # }}} diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index c73896774..a124757f4 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -54,7 +54,9 @@ class FTreeWalkerBase(object): ENTITY_RE = re.compile( r"^(?P[_0-9a-zA-Z]+)" - r"(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") + r"(\((?P[-+*0-9:a-zA-Z, \t]+)\))?" + r"(\s*=\s*(?P.+))?" + "$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): @@ -77,7 +79,31 @@ class FTreeWalkerBase(object): else: shape = None - yield name, shape + init_str = groups["initializer"] + if init_str: + init_str = init_str.replace("(/", "[") + init_str = init_str.replace("/)", "]") + init_expr = self.parse_expr(node, init_str) + + from numbers import Number + if isinstance(init_expr, Number): + initializer = init_expr + elif isinstance(init_expr, list): + for i, item in enumerate(init_expr): + if not isinstance(item, Number): + raise LoopyError("unexpected type of " + "item %d in initializer: %s" + % (i+1, type(init_expr).__name__)) + initializer = init_expr + + else: + raise LoopyError("unexpected type of initializer: %s" + % type(init_expr).__name__) + + else: + initializer = None + + yield name, shape, initializer def __call__(self, expr, *args, **kwargs): return self.rec(expr, *args, **kwargs) -- GitLab From a615d4688de883748a8ae9b9970c5d0426bbf6f7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 May 2019 18:16:55 -0500 Subject: [PATCH 480/916] Fix complex literal handling after Fortran array initializer support added --- loopy/frontend/fortran/expression.py | 52 +++++++++++++++++++++------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/loopy/frontend/fortran/expression.py b/loopy/frontend/fortran/expression.py index ea724278f..1400fb3b7 100644 --- a/loopy/frontend/fortran/expression.py +++ b/loopy/frontend/fortran/expression.py @@ -44,6 +44,25 @@ _and = intern("and") _or = intern("or") +def tuple_to_complex_literal(expr): + if len(expr) != 2: + raise TranslationError("complex literals must have " + "two entries") + + r, i = expr + + r = np.array(r)[()] + i = np.array(i)[()] + + dtype = (r.dtype.type(0) + i.dtype.type(0)) + if dtype == np.float32: + dtype = np.complex64 + else: + dtype = np.complex128 + + return 
dtype(float(r) + float(i)*1j) + + # {{{ expression parser class FortranExpressionParser(ExpressionParserBase): @@ -178,24 +197,31 @@ class FortranExpressionParser(ExpressionParserBase): left_exp, did_something = ExpressionParserBase.parse_postfix( self, pstate, min_precedence, left_exp) - if isinstance(left_exp, tuple) and min_precedence < self._PREC_FUNC_ARGS: - # this must be a complex literal - if len(left_exp) != 2: - raise TranslationError("complex literals must have " - "two entries") + return left_exp, did_something - r, i = left_exp + def parse_expression(self, pstate, min_precedence=0): + left_exp = self.parse_prefix(pstate) - dtype = (r.dtype.type(0) + i.dtype.type(0)) - if dtype == np.float32: - dtype = np.complex64 - else: - dtype = np.complex128 + did_something = True + while did_something: + did_something = False + if pstate.is_at_end(): + return left_exp - left_exp = dtype(float(r) + float(i)*1j) + result = self.parse_postfix( + pstate, min_precedence, left_exp) + left_exp, did_something = result - return left_exp, did_something + from pymbolic.parser import FinalizedTuple + if isinstance(left_exp, FinalizedTuple): + # View all tuples that survive parsing as complex literals + # "FinalizedTuple" indicates that this tuple was enclosed + # in parens. + return tuple_to_complex_literal(left_exp) + + return left_exp # }}} + # vim: foldmethod=marker -- GitLab From 7f860cef5d153de796830264d26daaa42081ba90 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 May 2019 18:17:20 -0500 Subject: [PATCH 481/916] Adjust var terminology in multi-kernel Fortran test --- test/test_fortran.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index c038aa9fa..496b470de 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -521,8 +521,8 @@ def test_fortran_subroutines(ctx_factory): end subroutine """ - knl = lp.parse_fortran(fortran_src) - print(lp.generate_code_v2(knl).device_code()) + prg = lp.parse_fortran(fortran_src) + print(lp.generate_code_v2(prg).device_code()) if __name__ == "__main__": -- GitLab From 9c5e491602600f9c93c94d5724cc787810b79752 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 May 2019 18:18:06 -0500 Subject: [PATCH 482/916] Fortran parsing interface changes --- loopy/frontend/fortran/__init__.py | 32 +++++++++++++++++++++++----- loopy/frontend/fortran/translator.py | 17 +-------------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 05b0a9205..df3cff996 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -154,8 +154,9 @@ def parse_transformed_fortran(source, free_form=True, strict=True, :func:`parse_fortran`. * ``FILENAME``: the file name of the code being processed - The transform code must define ``RESULT``, conventionally a list of - kernels, which is returned from this function unmodified. + The transform code must define ``RESULT``, conventionally a list of kernels + or a :class:`loopy.Program`, which is returned from this function + unmodified. 
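Picking up the expression-parser change a few hunks above: any parenthesized tuple that survives parsing is now read as a Fortran complex literal. A rough stand-alone sketch of the dtype choice (plain numpy, not the loopy code itself): the result is single precision only if both parts are.

import numpy as np

def fortran_complex_literal(r, i):
    r = np.array(r)[()]
    i = np.array(i)[()]
    # complex64 only if both the real and imaginary parts are float32
    ctype = np.complex64 if (r + i).dtype == np.float32 else np.complex128
    return ctype(float(r) + float(i)*1j)

assert fortran_complex_literal(1.0, 2.0) == 1 + 2j
assert fortran_complex_literal(np.float32(1), np.float32(2)).dtype == np.complex64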
An example of *source* may look as follows:: @@ -236,10 +237,10 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] -def parse_fortran(source, filename="", free_form=True, strict=True, +def parse_fortran(source, filename="", free_form=None, strict=None, seq_dependencies=None, auto_dependencies=None, target=None): """ - :returns: a list of :class:`loopy.LoopKernel` objects + :returns: a :class:`loopy.Program` """ if seq_dependencies is not None and auto_dependencies is not None: @@ -253,6 +254,10 @@ def parse_fortran(source, filename="", free_form=True, strict=True, if seq_dependencies is None: seq_dependencies = True + if free_form is None: + free_form = True + if strict is None: + strict = True import logging console = logging.StreamHandler() @@ -273,7 +278,24 @@ def parse_fortran(source, filename="", free_form=True, strict=True, f2loopy = F2LoopyTranslator(filename, target=target) f2loopy(tree) - return f2loopy.make_kernels(seq_dependencies=seq_dependencies) + kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + + from loopy.kernel.tools import identify_root_kernel + from loopy.program import make_program + from loopy.transform.callable import register_callable_kernel + + root_knl_name = identify_root_kernel(kernels) + root_knl = [knl for knl in kernels if knl.name == + root_knl_name][0].copy(is_called_from_host=True) + callee_kernels = [knl for knl in kernels if knl.name != root_knl_name] + prog = make_program(root_knl) + for callee_knl in callee_kernels: + #FIXME: This would need some sort of traversal to be valid + # for all cases + # THIS IS A VERY IMPORTANT FIXME!! + prog = register_callable_kernel(prog, callee_knl) + + return prog # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 680e8177b..7f263e297 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -810,22 +810,7 @@ class F2LoopyTranslator(FTreeWalkerBase): result.append(knl) - from loopy.kernel.tools import identify_root_kernel - from loopy.program import make_program - from loopy.transform.callable import register_callable_kernel - - root_knl_name = identify_root_kernel(result) - root_knl = [knl for knl in result if knl.name == - root_knl_name][0].copy(is_called_from_host=True) - callee_kernels = [knl for knl in result if knl.name != root_knl_name] - prog = make_program(root_knl) - for callee_knl in callee_kernels: - #FIXME: This would need some sort of traversal to be valid - # for all cases - # THIS IS A VERY IMPORTANT FIXME!! 
- prog = register_callable_kernel(prog, callee_knl) - - return prog + return result # }}} -- GitLab From ad02966a95686bd2c291cf92ce72a0a01e31c9b3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 May 2019 20:01:09 -0500 Subject: [PATCH 483/916] Begin refactoring ArgDescrInferenceMapper --- loopy/kernel/function_interface.py | 72 ++++++++++++++++++++++++++++++ loopy/preprocess.py | 41 ++++++----------- loopy/symbolic.py | 49 -------------------- loopy/transform/callable.py | 6 +-- 4 files changed, 88 insertions(+), 80 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 536fc9735..3bd544917 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -103,6 +103,78 @@ class ArrayArgDescriptor(ImmutableRecord): update_persistent_hash = update_persistent_hash + +def get_arg_descriptor_for_expression(kernel, expr): + """ + :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor` + describing the argument expression *expr* in *kernel*. + """ + from loopy.symbolic import (SubArrayRef, pw_aff_to_expr, Variable, + SweptInameStrideCollector) + from loopy.kernel.data import TemporaryVariable, ArrayArg + + if isinstance(expr, SubArrayRef): + name = expr.subscript.aggregate.name + arg = kernel.get_arg_descriptor(name) + + if not isinstance(arg, (TemporaryVariable, ArrayArg)): + raise LoopyError("unsupported argument type " + "'%s' of '%s' in call statement" + % (type(arg).__name__, expr.name)) + + aspace = arg.address_space + + from loopy.kernel.array import FixedStrideArrayDimTag as DimTag + from loopy.isl_helpers import simplify_via_aff + sub_dim_tags = [] + sub_shape = [] + + # FIXME This blindly assumes that dim_tag has a stride and + # will not work for non-stride dim tags (e.g. vec or sep). + + # FIXME: This will almost always be nonlinear--when does this + # actually help? 
Maybe the + linearized_index = simplify_via_aff( + sum( + dim_tag.stride*iname for dim_tag, iname in + zip(arg.dim_tags, expr.subscript.index_tuple))) + + strides_as_dict = SweptInameStrideCollector( + tuple(iname.name for iname in expr.swept_inames) + )(linearized_index) + sub_dim_tags = tuple( + DimTag(strides_as_dict[iname]) for iname in expr.swept_inames) + sub_shape = tuple( + pw_aff_to_expr( + kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 + for iname in expr.swept_inames) + if expr.swept_inames == (): + sub_shape = (1, ) + sub_dim_tags = (DimTag(1),) + + return ArrayArgDescriptor( + address_space=aspace, + dim_tags=sub_dim_tags, + shape=sub_shape) + + elif isinstance(expr, Variable): + arg = kernel.get_arg_descriptor(expr.name) + + if isinstance(arg, (TemporaryVariable, ArrayArg)): + return ArrayArgDescriptor( + address_space=arg.aspace, + dim_tags=arg.dim_tags, + shape=arg.shape) + elif isinstance(arg, ValueArg): + return ValueArgDescriptor() + else: + raise LoopyError("unsupported argument type " + "'%s' of '%s' in call statement" + % (type(arg).__name__, expr.name)) + + else: + return ValueArgDescriptor() + # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index a8dde5792..d03296435 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2169,47 +2169,32 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): def map_call(self, expr, expn_state, **kwargs): from pymbolic.primitives import Call, CallWithKwargs - from loopy.kernel.function_interface import ValueArgDescriptor - from loopy.symbolic import ResolvedFunction, SubArrayRef + from loopy.symbolic import ResolvedFunction if not isinstance(expr.function, ResolvedFunction): # ignore if the call is not to a ResolvedFunction return super(ArgDescrInferenceMapper, self).map_call(expr, expn_state) - if isinstance(expr, Call): - kw_parameters = {} - else: - assert isinstance(expr, CallWithKwargs) - kw_parameters = expr.kw_parameters - - # descriptors for the args and kwargs of the Call - arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.caller_kernel)) - if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor()) - for i, par in tuple(enumerate(expr.parameters)) + - tuple(kw_parameters.items())) - - assignee_id_to_descr = {} + arg_id_to_val = dict(enumerate(expr.parameters)) + if isinstance(expr, CallWithKwargs): + arg_id_to_val.update(expr.kw_parameters) if 'assignees' in kwargs: # If supplied with assignees then this is a CallInstruction assignees = kwargs['assignees'] - assert isinstance(assignees, tuple) - for i, par in enumerate(assignees): - if isinstance(par, SubArrayRef): - assignee_id_to_descr[-i-1] = ( - par.get_array_arg_descriptor(self.caller_kernel)) - else: - assignee_id_to_descr[-i-1] = ValueArgDescriptor() - - # gathering all the descriptors - combined_arg_id_to_descr = arg_id_to_descr.copy() - combined_arg_id_to_descr.update(assignee_id_to_descr) + for i, arg in enumerate(assignees): + arg_id_to_val[-i-1] = arg + + from loopy.kernel.function_interface import get_arg_descriptor_for_expression + arg_id_to_descr = dict( + (arg_id, get_arg_descriptor_for_expression(arg)) + for arg_id, arg in six.iteritems(arg_id_to_val)) # specializing the function according to the parameter description in_knl_callable = self.callables_table[expr.function.name] new_in_knl_callable, self.callables_table = ( in_knl_callable.with_descrs( - combined_arg_id_to_descr, self.caller_kernel, + arg_id_to_descr, self.caller_kernel, self.callables_table)) self.callables_table, new_func_id = ( 
self.callables_table.with_callable( @@ -2229,7 +2214,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): for child in expr.parameters), dict( (key, self.rec(val, expn_state)) - for key, val in six.iteritems(kw_parameters)) + for key, val in six.iteritems(expr.kw_parameters)) ) map_call_with_kwargs = map_call diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 9a64fe4ac..a76f37654 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -826,55 +826,6 @@ class SubArrayRef(p.Expression): return EvaluatorWithDeficientContext(swept_inames_to_zeros)( self.subscript) - def get_array_arg_descriptor(self, kernel): - """ - Returns the dim_tags, memory scope, shape informations of a - :class:`SubArrayRef` argument in the caller kernel packed into - :class:`ArrayArgDescriptor` for the instance of :class:`SubArrayRef` in - the given *kernel*. - """ - from loopy.kernel.function_interface import ArrayArgDescriptor - - name = self.subscript.aggregate.name - - if name in kernel.temporary_variables: - assert name not in kernel.arg_dict - arg = kernel.temporary_variables[name] - else: - assert name in kernel.arg_dict - arg = kernel.arg_dict[name] - - aspace = arg.address_space - - from loopy.kernel.array import FixedStrideArrayDimTag as DimTag - from loopy.isl_helpers import simplify_via_aff - sub_dim_tags = [] - sub_shape = [] - try: - linearized_index = simplify_via_aff( - sum(dim_tag.stride*iname for dim_tag, iname in - zip(arg.dim_tags, self.subscript.index_tuple))) - except isl.Error: - linearized_index = sum(dim_tag.stride*iname for dim_tag, iname in - zip(arg.dim_tags, self.subscript.index_tuple)) - - strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in - self.swept_inames))(linearized_index) - sub_dim_tags = tuple( - DimTag(strides_as_dict[iname]) for iname in self.swept_inames) - sub_shape = tuple( - pw_aff_to_expr( - kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 - for iname in self.swept_inames) - if self.swept_inames == (): - sub_shape = (1, ) - sub_dim_tags = (DimTag(1),) - - return ArrayArgDescriptor( - address_space=aspace, - dim_tags=sub_dim_tags, - shape=sub_shape) - def __getinitargs__(self): return (self.swept_inames, self.subscript) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 953ad5613..135987e06 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -628,8 +628,8 @@ def _match_caller_callee_argument_dimension_for_single_kernel( # Call to a callable kernel can only occur through a # CallInstruction. continue - # getting the caller->callee arg association + # get the caller->callee arg association parameters = insn.expression.parameters[:] kw_parameters = {} if isinstance(insn.expression, CallWithKwargs): @@ -658,7 +658,7 @@ def _match_caller_callee_argument_dimension_for_single_kernel( parameter_shapes.append(_shape_1_if_empty(kw_parameters[pos_to_kw[i]]) .get_array_arg_descriptor(caller_knl).shape) - # inserting the assignees at the required positions. + # insert the assignees at the required positions assignee_write_count = -1 for i, arg in enumerate(callee_knl.args): if arg.is_output_only: @@ -686,7 +686,7 @@ def _match_caller_callee_argument_dimension_for_single_kernel( raise NotImplementedError("Unknown instruction %s." % type(insn)) - # subkernel with instructions adjusted according to the new dimensions. 
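For a concrete feel of what ``get_arg_descriptor_for_expression`` computes (a hand-worked sketch in plain Python, no loopy imports, numbers made up): for a call argument ``[i2, i3]: x[2*i2, i3]`` with ``x`` stored row-major with shape ``(n, n)``, the linearized index is ``n*(2*i2) + i3``, so the swept sub-array is described with strides ``(2*n, 1)`` and shape ``(n, n)``.

def stride_of(index_fn, iname, point):
    # coefficient of *iname* in an affine index expression, found by finite
    # differencing -- a stand-in for what SweptInameStrideCollector extracts
    bumped = dict(point, **{iname: point[iname] + 1})
    return index_fn(**bumped) - index_fn(**point)

n = 16

def index(i2, i3):
    # linearized index of x[2*i2, i3] for row-major x of shape (n, n)
    return n*(2*i2) + i3

assert stride_of(index, "i2", {"i2": 0, "i3": 0}) == 2*n
assert stride_of(index, "i3", {"i2": 0, "i3": 0}) == 1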
+ # subkernel with instructions adjusted according to the new dimensions new_callee_knl = callee_knl.copy(instructions=new_callee_insns) return new_callee_knl -- GitLab From 02badd5f410dfd228be0b4b39667061ecba4af1e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 20 May 2019 21:02:16 -0500 Subject: [PATCH 484/916] adds support for array inputs to callables --- loopy/kernel/creation.py | 24 ++++++++++++++++++--- test/test_callables.py | 46 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 59a4f7896..25594cbb5 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -27,7 +27,7 @@ THE SOFTWARE. import numpy as np from pymbolic.mapper import CSECachingMapperMixin -from pymbolic.primitives import Slice, Variable, Subscript +from pymbolic.primitives import Slice, Variable, Subscript, Call from loopy.tools import intern_frozenset_of_ids, Optional from loopy.symbolic import ( IdentityMapper, WalkMapper, SubArrayRef) @@ -1928,6 +1928,24 @@ class SliceToInameReplacer(IdentityMapper): else: return IdentityMapper.map_subscript(self, expr) + def map_call(self, expr): + def _convert_array_to_slices(arg): + if isinstance(arg, Variable): + if (arg.name in self.knl.temporary_variables): + array_arg = self.knl.temporary_variables[arg.name] + else: + assert arg.name in self.knl.arg_dict + array_arg = self.knl.arg_dict[arg.name] + + if array_arg.shape != (): + return Subscript(arg, tuple(Slice(()) for _ in + array_arg.shape)) + return arg + + return Call(expr.function, + tuple(self.rec(_convert_array_to_slices(par)) for par in + expr.parameters)) + def get_iname_domain_as_isl_set(self): """ Returns the extra domain constraints imposed by the slice inames, @@ -1959,7 +1977,7 @@ class SliceToInameReplacer(IdentityMapper): return iname_set -def realize_slices_as_sub_array_refs(kernel): +def realize_slices_array_inputs_as_sub_array_refs(kernel): """ Returns a kernel with the instances of :class:`pymbolic.primitives.Slice` encountered in expressions replaced as `loopy.symbolic.SubArrayRef`. 
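Taken together with the ``make_kernel`` hook changed below, the effect of this patch can be sketched as the following usage (assuming this branch's ``make_function``/``register_callable_kernel`` interface, as exercised by the new ``test_array_inputs_to_callee_kernels`` test): an array passed to a callee by name now behaves as if a full slice per axis had been written.

import numpy as np
import loopy as lp

# callee: a 16x16 elementwise linear combination
linear_combo = lp.make_function(
        "{[i, j]: 0<=i, j<16}",
        """
        g[i, j] = 2*e[i, j] + 3*f[i, j]
        """, name="linear_combo")

# caller: passing x and y by name is treated like passing x[:, :] and y[:, :]
caller = lp.make_kernel(
        "{:}",
        """
        z[:, :] = linear_combo(x, y)
        """,
        [lp.GlobalArg("x", np.float64, shape=(16, 16)),
         lp.GlobalArg("y", np.float64, shape=(16, 16)),
         lp.GlobalArg("z", np.float64, shape=(16, 16)),
         "..."])

prog = lp.register_callable_kernel(caller, linear_combo)
print(lp.generate_code_v2(prog).device_code())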
@@ -2301,7 +2319,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): knl = create_temporaries(knl, default_order) # convert slices to iname domains - knl = realize_slices_as_sub_array_refs(knl) + knl = realize_slices_array_inputs_as_sub_array_refs(knl) # ------------------------------------------------------------------------- # Ordering dependency: diff --git a/test/test_callables.py b/test/test_callables.py index 5d8785db0..23d54098a 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -476,6 +476,52 @@ def test_empty_sub_array_refs(ctx_factory, inline): assert np.allclose(out, x-y) +@pytest.mark.parametrize("inline", [False, True]) +def test_array_inputs_to_callee_kernels(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 2 ** 4 + + x = np.random.rand(n, n) + y = np.random.rand(n, n) + + child_knl = lp.make_function( + "{[i, j]:0<=i, j < 16}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + parent_knl = lp.make_kernel( + "{:}", + """ + z[:, :] = linear_combo(x, y) + """, + kernel_data=[ + lp.GlobalArg( + name='x', + dtype=np.float64, + shape=(16, 16)), + lp.GlobalArg( + name='y', + dtype=np.float64, + shape=(16, 16)), + lp.GlobalArg( + name='z', + dtype=np.float64, + shape=(16, 16)), '...'], + ) + + knl = lp.register_callable_kernel( + parent_knl, child_knl) + if inline: + knl = lp.inline_callable_kernel(knl, 'linear_combo') + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From d1245efa9a82ce53ac7bb6282cfaf74290da691f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 20 May 2019 22:05:55 -0500 Subject: [PATCH 485/916] account for ValueArg does not have shape --- loopy/kernel/creation.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 25594cbb5..a7205dbbe 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -34,7 +34,7 @@ from loopy.symbolic import ( from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule, AddressSpace) + SubstitutionRule, AddressSpace, ValueArg) from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, CallInstruction) from loopy.diagnostic import LoopyError, warn_with_kernel @@ -1932,14 +1932,18 @@ class SliceToInameReplacer(IdentityMapper): def _convert_array_to_slices(arg): if isinstance(arg, Variable): if (arg.name in self.knl.temporary_variables): - array_arg = self.knl.temporary_variables[arg.name] + array_arg_shape = ( + self.knl.temporary_variables[arg.name].shape) else: assert arg.name in self.knl.arg_dict - array_arg = self.knl.arg_dict[arg.name] + if isinstance(self.knl.arg_dict[arg.name], ValueArg): + array_arg_shape = () + else: + array_arg_shape = self.knl.arg_dict[arg.name].shape - if array_arg.shape != (): + if array_arg_shape != (): return Subscript(arg, tuple(Slice(()) for _ in - array_arg.shape)) + array_arg_shape)) return arg return Call(expr.function, -- GitLab From bccfa62ed71180e7a461acdf75b72af9ba1e6129 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 21 May 2019 00:20:17 -0500 Subject: [PATCH 486/916] temporary fix for array arg parameters that are written --- loopy/kernel/instruction.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/instruction.py 
b/loopy/kernel/instruction.py index 0a2079ba5..540c77b12 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1108,7 +1108,14 @@ class CallInstruction(MultiAssignmentBase): @memoize_method def assignee_var_names(self): - return tuple(_get_assignee_var_name(a) for a in self.assignees) + #FIXME: This needs to be smarter, instead of just making all + # as written + from loopy.symbolic import SubArrayRef + return ( + tuple(_get_assignee_var_name(a) for a in self.assignees) + + tuple(par.subscript.aggregate.name for par in + self.expression.parameters if isinstance(par, + SubArrayRef))) def assignee_subscript_deps(self): return tuple( -- GitLab From e51a8af5d91609c7355327ff8c67aa665dd8458e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 21 May 2019 01:24:22 -0500 Subject: [PATCH 487/916] Fixes for get_arg_descriptor_for_expression --- loopy/kernel/function_interface.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 3bd544917..26f90cd46 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -109,13 +109,14 @@ def get_arg_descriptor_for_expression(kernel, expr): :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor` describing the argument expression *expr* in *kernel*. """ - from loopy.symbolic import (SubArrayRef, pw_aff_to_expr, Variable, + from pymbolic.primitives import Variable + from loopy.symbolic import (SubArrayRef, pw_aff_to_expr, SweptInameStrideCollector) from loopy.kernel.data import TemporaryVariable, ArrayArg if isinstance(expr, SubArrayRef): name = expr.subscript.aggregate.name - arg = kernel.get_arg_descriptor(name) + arg = kernel.get_var_descriptor(name) if not isinstance(arg, (TemporaryVariable, ArrayArg)): raise LoopyError("unsupported argument type " @@ -125,7 +126,7 @@ def get_arg_descriptor_for_expression(kernel, expr): aspace = arg.address_space from loopy.kernel.array import FixedStrideArrayDimTag as DimTag - from loopy.isl_helpers import simplify_via_aff + from loopy.symbolic import simplify_using_aff sub_dim_tags = [] sub_shape = [] @@ -134,7 +135,8 @@ def get_arg_descriptor_for_expression(kernel, expr): # FIXME: This will almost always be nonlinear--when does this # actually help? Maybe the - linearized_index = simplify_via_aff( + linearized_index = simplify_using_aff( + kernel, sum( dim_tag.stride*iname for dim_tag, iname in zip(arg.dim_tags, expr.subscript.index_tuple))) @@ -158,7 +160,7 @@ def get_arg_descriptor_for_expression(kernel, expr): shape=sub_shape) elif isinstance(expr, Variable): - arg = kernel.get_arg_descriptor(expr.name) + arg = kernel.get_var_descriptor(expr.name) if isinstance(arg, (TemporaryVariable, ArrayArg)): return ArrayArgDescriptor( -- GitLab From fe208a40aef35e797d77d98497a355c045f53872 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 21 May 2019 01:27:11 -0500 Subject: [PATCH 488/916] Add CallInstruction.arg_id_to_val --- loopy/kernel/instruction.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 0a2079ba5..1a56e8582 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -22,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import six from six.moves import intern from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import LoopyError @@ -1137,6 +1138,22 @@ class CallInstruction(MultiAssignmentBase): result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) return result + def arg_id_to_val(self): + """:returns: a :class:`dict` mapping argument identifiers (non-negative numbers + for positional arguments, strings for keyword args, and negative numbers + for assignees) to their respective values + """ + + from pymbolic.primitives import CallWithKwargs + arg_id_to_val = dict(enumerate(self.expression.parameters)) + if isinstance(self.expression, CallWithKwargs): + for kw, val in six.iteritems(self.expression.kw_parameters): + arg_id_to_val[kw] = val + for i, arg in enumerate(self.assignees): + arg_id_to_val[-i-1] = arg + + return arg_id_to_val + @property def atomicity(self): # Function calls can impossibly be atomic, and even the result assignment -- GitLab From 1795061095519ab225385152bf241c3b37a1741d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 21 May 2019 01:27:40 -0500 Subject: [PATCH 489/916] Fix call site of get_arg_descriptor_for_expression --- loopy/preprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d03296435..54a9204dc 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2187,7 +2187,8 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): from loopy.kernel.function_interface import get_arg_descriptor_for_expression arg_id_to_descr = dict( - (arg_id, get_arg_descriptor_for_expression(arg)) + (arg_id, get_arg_descriptor_for_expression( + self.caller_kernel, arg)) for arg_id, arg in six.iteritems(arg_id_to_val)) # specializing the function according to the parameter description -- GitLab From a5b691ff1e107a04fd7271fad47cc1ec0f2d2da8 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 21 May 2019 01:28:24 -0500 Subject: [PATCH 490/916] Add FIXME regarding simplify_{via,using}_aff --- loopy/symbolic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index a76f37654..d214b5e4f 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1635,6 +1635,7 @@ def guarded_pwaff_from_expr(space, expr, vars_to_zero=None): # {{{ simplify using aff +# FIXME: redundant with simplify_via_aff def simplify_using_aff(kernel, expr): inames = get_dependencies(expr) & kernel.all_inames() -- GitLab From 6560e593523eb6b18a835c6f7839ccc820b0ca7b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 21 May 2019 01:29:11 -0500 Subject: [PATCH 491/916] Refactor/simplify _match_caller_callee_argument_dimension_for_single_kernel --- loopy/transform/callable.py | 54 +++++++++++-------------------------- 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 135987e06..042990c77 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -34,7 +34,7 @@ from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, Assignment, CInstruction, _DataObliviousInstruction) from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff -from loopy.kernel.function_interface import (get_kw_pos_association, +from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.program import Program, ResolvedFunctionMarker from loopy.symbolic import SubArrayRef @@ -616,10 
+616,10 @@ class DimChanger(IdentityMapper): def _match_caller_callee_argument_dimension_for_single_kernel( caller_knl, callee_knl): """ - Returns a copy of *caller_knl* with the instance of - :class:`loopy.kernel.function_interface.CallableKernel` addressed by - *callee_function_name* in the *caller_knl* aligned with the argument - dimesnsions required by *caller_knl*. + :returns: a copy of *caller_knl* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *caller_knl* aligned with the argument + dimensions required by *caller_knl*. """ for insn in caller_knl.instructions: if not isinstance(insn, CallInstruction) or ( @@ -629,14 +629,6 @@ def _match_caller_callee_argument_dimension_for_single_kernel( # CallInstruction. continue - # get the caller->callee arg association - parameters = insn.expression.parameters[:] - kw_parameters = {} - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - - assignees = insn.assignees - def _shape_1_if_empty(shape): assert isinstance(shape, tuple) if shape == (): @@ -644,34 +636,18 @@ def _match_caller_callee_argument_dimension_for_single_kernel( else: return shape - parameter_shapes = [] - for par in parameters: - if isinstance(par, SubArrayRef): - parameter_shapes.append( - _shape_1_if_empty( - par.get_array_arg_descriptor(caller_knl).shape)) - else: - parameter_shapes.append((1, )) - - kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) - for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameter_shapes.append(_shape_1_if_empty(kw_parameters[pos_to_kw[i]]) - .get_array_arg_descriptor(caller_knl).shape) - - # insert the assignees at the required positions - assignee_write_count = -1 - for i, arg in enumerate(callee_knl.args): - if arg.is_output_only: - assignee = assignees[-assignee_write_count-1] - parameter_shapes.insert(i, _shape_1_if_empty(assignee - .get_array_arg_descriptor(caller_knl).shape)) - assignee_write_count -= 1 - - callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in - callee_knl.args], parameter_shapes)) + from loopy.kernel.function_interface import ( + ArrayArgDescriptor, get_arg_descriptor_for_expression) + arg_id_to_shape = {} + for arg_id, arg in six.iteritems(insn.arg_id_to_val()): + arg_descr = get_arg_descriptor_for_expression(caller_knl, arg) + if isinstance(arg_descr, ArrayArgDescriptor): + arg_id_to_shape[arg_id] = _shape_1_if_empty(arg_descr) + dim_changer = DimChanger( callee_knl.arg_dict, - callee_arg_to_desired_dim_tag) + arg_id_to_shape) + new_callee_insns = [] for callee_insn in callee_knl.instructions: if isinstance(callee_insn, MultiAssignmentBase): -- GitLab From a9b7a374159b306be0ef43ba47e5023fb3cbc62b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 22 May 2019 07:38:56 -0500 Subject: [PATCH 492/916] better diagnostics for with_descrs, better printing of subarrayrefs --- loopy/kernel/function_interface.py | 8 +++++++- loopy/symbolic.py | 5 +++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 536fc9735..e1c29bb5a 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -590,7 +590,13 @@ class CallableKernel(InKernelCallable): if isinstance(descr, ArrayArgDescriptor): if not isinstance(self.subkernel.arg_dict[arg_id], ArrayArg): raise LoopyError("Array passed to a scalar argument " - " '%s' of the function '%s' (in '%s')" % ( + " 
'%s' of the function '%s' (in '%s')." % ( + arg_id, self.subkernel.name, + caller_kernel.name)) + if (len(self.subkernel.arg_dict[arg_id].shape) != + len(descr.shape)): + raise LoopyError("Dimension mismatch for argument " + " '%s' of the function '%s' (in '%s')." % ( arg_id, self.subkernel.name, caller_kernel.name)) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 9a64fe4ac..f717a0772 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -261,8 +261,9 @@ class StringifyMapper(StringifyMapperBase): return expr.name def map_sub_array_ref(self, expr, prec): - return "SubArrayRef({inames}, ({subscr}))".format( - inames=self.rec(expr.swept_inames, prec), + return "[{inames}]: {subscr}".format( + inames=','.join(self.rec(iname, prec) for iname in + expr.swept_inames), subscr=self.rec(expr.subscript, prec)) -- GitLab From e8fbbd1fa6bd95027c9c7907eeccce2f761b94c2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 22 May 2019 07:42:35 -0500 Subject: [PATCH 493/916] with_descrs: substitute the value args in the callee from the call --- loopy/kernel/function_interface.py | 54 +++++++++++++++++++++++++++--- loopy/library/function.py | 2 +- loopy/library/reduction.py | 2 +- loopy/preprocess.py | 2 +- 4 files changed, 53 insertions(+), 7 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e1c29bb5a..0156cae0f 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -31,6 +31,8 @@ from loopy.diagnostic import LoopyError from loopy.tools import update_persistent_hash from loopy.kernel import LoopKernel from loopy.kernel.data import ValueArg, ArrayArg +from loopy.symbolic import (SubstitutionMapper, DependencyMapper) +from pymbolic.primitives import Variable __doc__ = """ @@ -51,6 +53,12 @@ __doc__ = """ class ValueArgDescriptor(ImmutableRecord): hash_fields = () + def map_expr(self, subst_mapper): + return self.copy() + + def depends_on(self): + return frozenset() + update_persistent_hash = update_persistent_hash @@ -101,6 +109,18 @@ class ArrayArgDescriptor(ImmutableRecord): "address_space", "dim_tags") + def map_expr(self, subst_mapper): + new_shape = tuple(subst_mapper(axis_len) for axis_len in self.shape) + new_dim_tags = tuple(dim_tag.map_expr(subst_mapper) for dim_tag in + self.dim_tags) + return self.copy(shape=new_shape, dim_tags=new_dim_tags) + + def depends_on(self): + result = DependencyMapper(composite_leaves=False)(self.shape) | ( + DependencyMapper(composite_leaves=False)(tuple(dim_tag.stride for + dim_tag in self.dim_tags))) + return frozenset(var.name for var in result) + update_persistent_hash = update_persistent_hash # }}} @@ -240,7 +260,7 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table): + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -373,7 +393,7 @@ class ScalarCallable(InKernelCallable): raise LoopyError("No type inference information present for " "the function %s." 
% (self.name)) - def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table): + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): arg_id_to_descr[-1] = ValueArgDescriptor() return ( @@ -574,11 +594,37 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=specialized_kernel, arg_id_to_dtype=new_arg_id_to_dtype), callables_table - def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table): - + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): # tune the subkernel so that we have the matching shapes and # dim_tags + # {{{ map the arg_descrs so that all the variables are from the callees + # perspective + + substs = {} + for arg, par in zip(self.subkernel.args, expr.parameters): + if isinstance(arg, ValueArg): + substs[par] = Variable(arg.name) + + def subst_func(expr): + if expr in substs: + return substs[expr] + else: + return expr + + subst_mapper = SubstitutionMapper(subst_func) + + arg_id_to_descr = dict((arg_id, descr.map_expr(subst_mapper)) for + arg_id, descr in arg_id_to_descr.items()) + + # }}} + + dependents = frozenset().union(*(descr.depends_on() for descr in + arg_id_to_descr.values())) + # the strides should be dependent only on variables known to the callee + assert dependents <= (frozenset(self.subkernel.arg_dict.keys()) | + frozenset(self.subkernel.temporary_variables.keys())) + new_args = self.subkernel.args[:] kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) diff --git a/loopy/library/function.py b/loopy/library/function.py index f225b62f9..404005230 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -35,7 +35,7 @@ class MakeTupleCallable(ScalarCallable): return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target="loopy_make_tuple"), callables_table) - def with_descrs(self, arg_id_to_descr, callables_table): + def with_descrs(self, arg_id_to_descr, callables_table, expr): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 357c03feb..04615137b 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -455,7 +455,7 @@ class ReductionCallable(ScalarCallable): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=name_in_target), callables_table - def with_descrs(self, arg_id_to_descr, callables_table): + def with_descrs(self, arg_id_to_descr, callables_table, expr): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() diff --git a/loopy/preprocess.py b/loopy/preprocess.py index a8dde5792..e70e6b6fe 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2210,7 +2210,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): new_in_knl_callable, self.callables_table = ( in_knl_callable.with_descrs( combined_arg_id_to_descr, self.caller_kernel, - self.callables_table)) + self.callables_table, expr)) self.callables_table, new_func_id = ( self.callables_table.with_callable( expr.function.function, -- GitLab From 1ad37cefb4d572438dc3848a781287dd4bcc289b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 22 May 2019 08:07:29 -0500 Subject: [PATCH 494/916] adds a test to check strides depending on callee args --- loopy/kernel/function_interface.py | 3 ++- test/test_callables.py | 32 
++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 0156cae0f..0d15b9b4e 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -639,7 +639,8 @@ class CallableKernel(InKernelCallable): " '%s' of the function '%s' (in '%s')." % ( arg_id, self.subkernel.name, caller_kernel.name)) - if (len(self.subkernel.arg_dict[arg_id].shape) != + if self.subkernel.arg_dict[arg_id].shape and ( + len(self.subkernel.arg_dict[arg_id].shape) != len(descr.shape)): raise LoopyError("Dimension mismatch for argument " " '%s' of the function '%s' (in '%s')." % ( diff --git a/test/test_callables.py b/test/test_callables.py index 23d54098a..d881656ab 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -522,6 +522,38 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline): np.linalg.norm(2*x+3*y))) < 1e-15 +def test_stride_depending_on_args(): + twice = lp.make_function( + "{[i, j]: 0<=i, j < n}", + """ + b[i, j] = 2*a[i, j] + """, [lp.ValueArg('n'), lp.GlobalArg('a'), lp.GlobalArg('b')], + name='twice') + + thrice = lp.make_function( + "{[i, j]: 0<=i, j < n}", + """ + b[i, j] = 3*a[i, j] + """, [lp.ValueArg('n'), lp.GlobalArg('a', shape=lp.auto), + lp.GlobalArg('b', shape=lp.auto)], + name='thrice') + + prog = lp.make_kernel( + "{[i0,i1,i2,i3,i4,i5,i6,i7]: 0<=i0, i1, i2, i3, i4, i5, i6, i7< N}", + """ + [i0, i1]: y[i0, i1] = twice(N, [i2, i3]: x[2*i2, i3]) + [i4, i5]: z[i4, i5] = thrice(N, [i6, i7]: x[2*i6+1, i7]) + """, [ + lp.ValueArg('N', dtype=np.int32), lp.GlobalArg('x', + shape=lp.auto, dtype=np.float64), ...]) + + prog = lp.register_callable_kernel(prog, twice) + prog = lp.register_callable_kernel(prog, thrice) + + # FIXME: actually test something + print(lp.generate_code_v2(prog).device_code()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 3cf7abe0019d70995185e93daf5081a7c900bf35 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 14:23:00 -0500 Subject: [PATCH 495/916] Add parameter matching FIXME --- loopy/kernel/function_interface.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 34d360512..ba01c9011 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -675,6 +675,11 @@ class CallableKernel(InKernelCallable): # {{{ map the arg_descrs so that all the variables are from the callees # perspective + # FIXME: This is ill-formed, because par can be an expression, e.g. + # 2*i+2 or 2*(i+1). A key feature of expression is that structural + # equality and semantic equality are not the same, so even if the + # SubstitutionMapper allowed non-variables, it would have to solve the + # (considerable) problem of expression equivalence. 
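# An illustrative aside on the FIXME above (a hypothetical snippet, not part
# of the diff): pymbolic expressions compare structurally, so a substitution
# dict keyed on call parameters misses semantically equal but structurally
# different expressions.
#
#     from pymbolic import parse
#     parse("2*i+2") == parse("2*i+2")      # True
#     parse("2*i+2") == parse("2*(i+1)")    # False, though the values agree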
substs = {} for arg, par in zip(self.subkernel.args, expr.parameters): if isinstance(arg, ValueArg): -- GitLab From 7361be5ab66ed86bb859e3c5ae5484e41031354a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 14:33:40 -0500 Subject: [PATCH 496/916] Do not allow passing entire array by name without using SubArrayRef --- loopy/kernel/function_interface.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index ba01c9011..cf6e92771 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -127,7 +127,8 @@ class ArrayArgDescriptor(ImmutableRecord): def get_arg_descriptor_for_expression(kernel, expr): """ :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor` - describing the argument expression *expr* in *kernel*. + describing the argument expression *expr* which occurs + in a call in the code of *kernel*. """ from pymbolic.primitives import Variable from loopy.symbolic import (SubArrayRef, pw_aff_to_expr, @@ -183,10 +184,10 @@ def get_arg_descriptor_for_expression(kernel, expr): arg = kernel.get_var_descriptor(expr.name) if isinstance(arg, (TemporaryVariable, ArrayArg)): - return ArrayArgDescriptor( - address_space=arg.aspace, - dim_tags=arg.dim_tags, - shape=arg.shape) + raise LoopyError("may not pass entire array " + "'%s' in call statement in kernel '%s'" + % (expr.name, kernel.name)) + elif isinstance(arg, ValueArg): return ValueArgDescriptor() else: -- GitLab From 15b5d39d4de6a121f9c660d1efcf19af58bf8189 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 16:08:29 -0500 Subject: [PATCH 497/916] Add support for single-line Fortran if --- loopy/frontend/fortran/translator.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 7f263e297..817a448f3 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -519,11 +519,6 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_ArithmeticIf(self, node): raise NotImplementedError("arithmetic-if") - def map_If(self, node): - raise NotImplementedError("if") - # node.expr - # node.content[0] - def realize_conditional(self, node, context_cond=None): scope = self.scope_stack[-1] @@ -550,6 +545,15 @@ class F2LoopyTranslator(FTreeWalkerBase): self.conditions.append(cond_expr) + def map_If(self, node): + self.realize_conditional(node, None) + + for c in node.content: + self.rec(c) + + self.conditions_data.pop() + self.conditions.pop() + def map_IfThen(self, node): self.block_nest.append("if") self.realize_conditional(node, None) -- GitLab From 1e78e5a9ff87eb03ac884a750fd1a0a8c5d1dd55 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 23 May 2019 16:39:03 -0500 Subject: [PATCH 498/916] arg_descrs now emits what variables to be added to the call node --- loopy/kernel/function_interface.py | 36 +++++++++++++++++++++--------- loopy/library/function.py | 2 +- loopy/library/reduction.py | 2 +- loopy/preprocess.py | 5 +++-- test/test_callables.py | 25 ++++++++++++++++++++- 5 files changed, 55 insertions(+), 15 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 0d15b9b4e..8dd62aae4 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -398,7 +398,7 @@ class ScalarCallable(InKernelCallable): arg_id_to_descr[-1] = ValueArgDescriptor() 
return ( self.copy(arg_id_to_descr=arg_id_to_descr), - callables_table) + callables_table, ()) def with_hw_axes_sizes(self, global_size, local_size): return self.copy() @@ -602,9 +602,15 @@ class CallableKernel(InKernelCallable): # perspective substs = {} + assumptions = {} for arg, par in zip(self.subkernel.args, expr.parameters): - if isinstance(arg, ValueArg): - substs[par] = Variable(arg.name) + if isinstance(arg, ValueArg) and isinstance(par, Variable): + # FIXME: This would not deal with other expression, instead + # do a linear solve like the host <-> kernel interface + if par in substs: + assumptions[arg.name] = substs[par].name + else: + substs[par] = Variable(arg.name) def subst_func(expr): if expr in substs: @@ -621,9 +627,9 @@ class CallableKernel(InKernelCallable): dependents = frozenset().union(*(descr.depends_on() for descr in arg_id_to_descr.values())) - # the strides should be dependent only on variables known to the callee - assert dependents <= (frozenset(self.subkernel.arg_dict.keys()) | - frozenset(self.subkernel.temporary_variables.keys())) + unknown_deps = dependents - self.subkernel.all_variable_names() + # FIXME: Need to make sure that we make the name of the variables + # unique, and then run a subst_mapper new_args = self.subkernel.args[:] kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) @@ -666,16 +672,26 @@ class CallableKernel(InKernelCallable): type(descr)) descriptor_specialized_knl = self.subkernel.copy(args=new_args) + # add the variables on which the strides/shapes depend but not provided + # as arguments + args_added_knl = descriptor_specialized_knl.copy( + args=descriptor_specialized_knl.args + + [ValueArg(dep) for dep in unknown_deps]) from loopy.preprocess import traverse_to_infer_arg_descr - descriptor_specialized_knl, callables_table = ( - traverse_to_infer_arg_descr(descriptor_specialized_knl, + from loopy.transform.parameter import assume + args_added_knl, callables_table = ( + traverse_to_infer_arg_descr(args_added_knl, callables_table)) + if assumptions: + args_added_knl = assume(args_added_knl, 'and '.join([ + '{0} = {1}'.format(key, val) for key, val in assumptions.items()])) + return ( self.copy( - subkernel=descriptor_specialized_knl, + subkernel=args_added_knl, arg_id_to_descr=arg_id_to_descr), - callables_table) + callables_table, tuple(Variable(dep) for dep in unknown_deps)) def with_packing_for_args(self): from loopy.kernel.data import AddressSpace diff --git a/loopy/library/function.py b/loopy/library/function.py index 404005230..5e7dfbaf6 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -42,7 +42,7 @@ class MakeTupleCallable(ScalarCallable): return ( self.copy(arg_id_to_descr=new_arg_id_to_descr), - callables_table) + callables_table, ()) class IndexOfCallable(ScalarCallable): diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 04615137b..213836840 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -461,7 +461,7 @@ class ReductionCallable(ScalarCallable): new_arg_id_to_descr[-1] = ValueArgDescriptor() return ( self.copy(arg_id_to_descr=arg_id_to_descr), - callables_table) + callables_table, ()) def generate_preambles(self, target): if isinstance(self.name, ArgExtOp): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e70e6b6fe..0ee130858 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2207,7 +2207,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): # specializing the function according to the parameter description 
in_knl_callable = self.callables_table[expr.function.name] - new_in_knl_callable, self.callables_table = ( + new_in_knl_callable, self.callables_table, new_vars = ( in_knl_callable.with_descrs( combined_arg_id_to_descr, self.caller_kernel, self.callables_table, expr)) @@ -2220,8 +2220,9 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): return Call( ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) - for child in expr.parameters)) + for child in expr.parameters)+new_vars) else: + # FIXME: Order for vars when kwards are present? assert isinstance(expr, CallWithKwargs) return CallWithKwargs( ResolvedFunction(new_func_id), diff --git a/test/test_callables.py b/test/test_callables.py index d881656ab..af7e12180 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -545,7 +545,7 @@ def test_stride_depending_on_args(): [i4, i5]: z[i4, i5] = thrice(N, [i6, i7]: x[2*i6+1, i7]) """, [ lp.ValueArg('N', dtype=np.int32), lp.GlobalArg('x', - shape=lp.auto, dtype=np.float64), ...]) + shape=lp.auto, dtype=np.float64), '...']) prog = lp.register_callable_kernel(prog, twice) prog = lp.register_callable_kernel(prog, thrice) @@ -554,6 +554,29 @@ def test_stride_depending_on_args(): print(lp.generate_code_v2(prog).device_code()) +def test_unknown_stride_to_callee(): + twice = lp.make_function( + "{[i, j]: 0<=i, j < n}", + """ + b[i, j] = 2*a[i, j] + """, [lp.ValueArg('n'), lp.GlobalArg('a'), lp.GlobalArg('b')], + name='twice') + + prog = lp.make_kernel( + "{[i,i0,i1,i2,i3]: 0<=i0, i1, i2, i3< N and 0<=i 1: exec(sys.argv[1]) -- GitLab From 9fc3a83113f0ab38f536292b22c9b4289dc8de39 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 23 May 2019 18:21:44 -0500 Subject: [PATCH 499/916] Minor changes to adding assumptions; passes WENO.F90 --- loopy/kernel/function_interface.py | 42 ++++++++++++++++-------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index bcc17211b..6f8ff3ff7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -147,7 +147,6 @@ def get_arg_descriptor_for_expression(kernel, expr): aspace = arg.address_space from loopy.kernel.array import FixedStrideArrayDimTag as DimTag - from loopy.symbolic import simplify_using_aff sub_dim_tags = [] sub_shape = [] @@ -156,11 +155,8 @@ def get_arg_descriptor_for_expression(kernel, expr): # FIXME: This will almost always be nonlinear--when does this # actually help? 
Maybe the - linearized_index = simplify_using_aff( - kernel, - sum( - dim_tag.stride*iname for dim_tag, iname in - zip(arg.dim_tags, expr.subscript.index_tuple))) + linearized_index = sum(dim_tag.stride*iname for dim_tag, iname in + zip(arg.dim_tags, expr.subscript.index_tuple)) strides_as_dict = SweptInameStrideCollector( tuple(iname.name for iname in expr.swept_inames) @@ -183,13 +179,13 @@ def get_arg_descriptor_for_expression(kernel, expr): elif isinstance(expr, Variable): arg = kernel.get_var_descriptor(expr.name) - if isinstance(arg, (TemporaryVariable, ArrayArg)): + if isinstance(arg, ValueArg) or (isinstance(arg, TemporaryVariable) + and arg.shape == ()): + return ValueArgDescriptor() + elif isinstance(arg, (ArrayArg, TemporaryVariable)): raise LoopyError("may not pass entire array " "'%s' in call statement in kernel '%s'" % (expr.name, kernel.name)) - - elif isinstance(arg, ValueArg): - return ValueArgDescriptor() else: raise LoopyError("unsupported argument type " "'%s' of '%s' in call statement" @@ -672,25 +668,33 @@ class CallableKernel(InKernelCallable): def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): # tune the subkernel so that we have the matching shapes and # dim_tags + print('Started arg_descr_inferring for {0}'.format(self.subkernel.name)) # {{{ map the arg_descrs so that all the variables are from the callees # perspective + domain_dependent_vars = frozenset().union( + *(frozenset(dom.get_var_names(1)) for dom in + self.subkernel.domains)) + # FIXME: This is ill-formed, because par can be an expression, e.g. # 2*i+2 or 2*(i+1). A key feature of expression is that structural # equality and semantic equality are not the same, so even if the # SubstitutionMapper allowed non-variables, it would have to solve the # (considerable) problem of expression equivalence. + + import numbers substs = {} assumptions = {} for arg, par in zip(self.subkernel.args, expr.parameters): - if isinstance(arg, ValueArg) and isinstance(par, Variable): - # FIXME: This would not deal with other expression, instead - # do a linear solve like the host <-> kernel interface - if par in substs: - assumptions[arg.name] = substs[par].name - else: - substs[par] = Variable(arg.name) + if isinstance(arg, ValueArg) and arg.name in domain_dependent_vars: + if isinstance(par, Variable): + if par in substs: + assumptions[arg.name] = substs[par].name + else: + substs[par] = Variable(arg.name) + elif isinstance(par, numbers.Number): + assumptions[arg.name] = par def subst_func(expr): if expr in substs: @@ -764,8 +768,8 @@ class CallableKernel(InKernelCallable): callables_table)) if assumptions: - args_added_knl = assume(args_added_knl, 'and '.join([ - '{0} = {1}'.format(key, val) for key, val in assumptions.items()])) + args_added_knl = assume(args_added_knl, ' and '.join([ + '{0}={1}'.format(key, val) for key, val in assumptions.items()])) return ( self.copy( -- GitLab From 655fe562da5b11dad4970c155c0016ede5238bf3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 18:36:02 -0500 Subject: [PATCH 500/916] Add Program.__getitem__ --- loopy/program.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 99b0fe2b0..b44ea8504 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -196,7 +196,7 @@ def initialize_callables_table_from_kernel(kernel): return callables_table -# {{{ program definition +# {{{ program class Program(ImmutableRecord): """ @@ -230,6 +230,9 @@ class Program(ImmutableRecord): .. 
automethod:: __init__ .. automethod:: with_root_kernel + .. method:: __getitem__(name) + + Look up the resolved callable with identifier *name*. """ def __init__(self, name, @@ -363,6 +366,9 @@ class Program(ImmutableRecord): callables_table=self.callables_table.copy( resolved_functions=new_resolved_functions)) + def __getitem__(self, name): + return self.callables_table[name] + def __call__(self, *args, **kwargs): key = self.target.get_kernel_executor_cache_key(*args, **kwargs) try: -- GitLab From d6cd3d777b9e35f10ed964c48e5e547e874ad3a4 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 18:37:33 -0500 Subject: [PATCH 501/916] Fix fuse_loop_domains to not fuse imperfectly nested loops, add relevant test --- loopy/loop.py | 11 ++++++++++- test/test_fortran.py | 22 +++++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/loopy/loop.py b/loopy/loop.py index 66d413987..a2793c196 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -32,7 +32,8 @@ def potential_loop_nest_map(kernel): """Returns a dictionary mapping inames to other inames that *could* be nested around them. - :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.find_loop_nest_around_map` """ result = {} @@ -65,6 +66,8 @@ def fuse_loop_domains(kernel): parents_per_domain = kernel.parents_per_domain() all_parents_per_domain = kernel.all_parents_per_domain() + iname_to_insns = kernel.iname_to_insns() + new_domains = None for inner_iname, outer_inames in six.iteritems(lnm): @@ -77,6 +80,12 @@ def fuse_loop_domains(kernel): if inner_domain_idx == outer_domain_idx: break + if iname_to_insns[inner_iname] != iname_to_insns[outer_iname]: + # The two inames are imperfectly nested. Domain fusion + # might be invalid when the inner loop is empty, leading to + # the outer loop also being empty. + continue + if ( outer_domain_idx in all_parents_per_domain[inner_domain_idx] and not diff --git a/test/test_fortran.py b/test/test_fortran.py index 496b470de..902c2d1b7 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -517,12 +517,32 @@ def test_fortran_subroutines(ctx_factory): call twice(n, a(1:n, i)) call twice(n, a(i, 1:n)) + end subroutine + """ + prg = lp.parse_fortran(fortran_src) + print(lp.generate_code_v2(prg).device_code()) +def test_domain_fusion_imperfectly_nested(): + fortran_src = """ + subroutine imperfect(n, m, a, b) + implicit none + integer i, j, n, m + real a(n), b(n,n) + + do i=1, n + a(i) = i + do j=1, m + b(i,j) = i*j + end do + end do end subroutine """ + prg = lp.parse_fortran(fortran_src) - print(lp.generate_code_v2(prg).device_code()) + # If n > 0 and m == 0, a single domain would be empty, + # leading (incorrectly) to no assignments to 'a'. 
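# Spelled out (an illustrative sketch, using the bounds of the Fortran loops
# above): merging the two domains would give roughly
#     { [i, j] : 1 <= i <= n and 1 <= j <= m }
# which is empty when m = 0 and would silently drop a(i) = i, whereas keeping
#     { [i] : 1 <= i <= n }  separate from  { [i, j] : 1 <= i <= n and 1 <= j <= m }
# preserves that assignment for every i.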
+ assert len(prg["imperfect"].subkernel.domains) > 1 if __name__ == "__main__": -- GitLab From 9a1cfd57597e208342da3c81c975287f72179ab9 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 18:39:57 -0500 Subject: [PATCH 502/916] Add fixme regarding killing loopy.loop --- loopy/loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/loop.py b/loopy/loop.py index a2793c196..26eee3848 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -59,6 +59,7 @@ def potential_loop_nest_map(kernel): @iterate_over_kernels_if_given_program def fuse_loop_domains(kernel): + # FIXME: This should be moved to loopy.transforms.iname from loopy.kernel.tools import is_domain_dependent_on_inames while True: -- GitLab From 67384ca8dd5070710b673b934037353a8315b612 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 19:01:37 -0500 Subject: [PATCH 503/916] Add FIXME regarding fuse_loop_domains correctness --- loopy/loop.py | 3 +++ test/test_fortran.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/loop.py b/loopy/loop.py index 26eee3848..f7794c29f 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -85,6 +85,9 @@ def fuse_loop_domains(kernel): # The two inames are imperfectly nested. Domain fusion # might be invalid when the inner loop is empty, leading to # the outer loop also being empty. + + # FIXME: Not fully correct, does not consider reductions + # https://gitlab.tiker.net/inducer/loopy/issues/172 continue if ( diff --git a/test/test_fortran.py b/test/test_fortran.py index 902c2d1b7..e0aa22f5f 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -498,7 +498,7 @@ def test_precompute_some_exist(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) -def test_fortran_subroutines(ctx_factory): +def test_fortran_subroutines(): fortran_src = """ subroutine twice(n, a) implicit none -- GitLab From ede8215ee8e01e4fcfc439f97d5c5125abc6526c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 19:19:59 -0500 Subject: [PATCH 504/916] Rename fuse_loop_domains->merge_loop_domains --- loopy/frontend/fortran/translator.py | 4 ++-- loopy/loop.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 817a448f3..66961ce70 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -808,8 +808,8 @@ class F2LoopyTranslator(FTreeWalkerBase): seq_dependencies=seq_dependencies, ) - from loopy.loop import fuse_loop_domains - knl = fuse_loop_domains(knl) + from loopy.loop import merge_loop_domains + knl = merge_loop_domains(knl) knl = lp.fold_constants(knl) result.append(knl) diff --git a/loopy/loop.py b/loopy/loop.py index f7794c29f..3155adfbc 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -58,7 +58,7 @@ def potential_loop_nest_map(kernel): @iterate_over_kernels_if_given_program -def fuse_loop_domains(kernel): +def merge_loop_domains(kernel): # FIXME: This should be moved to loopy.transforms.iname from loopy.kernel.tools import is_domain_dependent_on_inames @@ -73,7 +73,7 @@ def fuse_loop_domains(kernel): for inner_iname, outer_inames in six.iteritems(lnm): for outer_iname in outer_inames: - # {{{ check if it's safe to fuse + # {{{ check if it's safe to merge inner_domain_idx = kernel.get_home_domain_index(inner_iname) outer_domain_idx = kernel.get_home_domain_index(outer_iname) @@ -95,7 +95,7 @@ def fuse_loop_domains(kernel): and not outer_domain_idx == 
parents_per_domain[inner_domain_idx]): # Outer domain is not a direct parent of the inner - # domain. Unable to fuse. + # domain. Unable to merge. continue outer_dom = kernel.domains[outer_domain_idx] @@ -105,7 +105,7 @@ def fuse_loop_domains(kernel): if is_domain_dependent_on_inames(kernel, inner_domain_idx, outer_inames): # Bounds of inner domain depend on outer domain. - # Unable to fuse. + # Unable to merge. continue # }}} -- GitLab From 46a822c3b84aa56d39b21d47ac42cbcb85c82a7b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 19:46:16 -0500 Subject: [PATCH 505/916] merge_loop_domains: do not merge domains from SubArrayRefs --- loopy/loop.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/loopy/loop.py b/loopy/loop.py index 3155adfbc..24cbe730f 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -81,6 +81,13 @@ def merge_loop_domains(kernel): if inner_domain_idx == outer_domain_idx: break + if (not iname_to_insns[inner_iname] + or not iname_to_insns[outer_iname]): + # Inames without instructions occur when used in + # a SubArrayRef. We don't want monster SubArrayRef domains, + # so refuse to merge those. + continue + if iname_to_insns[inner_iname] != iname_to_insns[outer_iname]: # The two inames are imperfectly nested. Domain fusion # might be invalid when the inner loop is empty, leading to -- GitLab From aa7213aead0d042b07f640069767e7142ee6a6db Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 May 2019 19:47:15 -0500 Subject: [PATCH 506/916] SliceToInameReplacer: Create one domain per SubArrayRef, not one moster domain --- loopy/kernel/creation.py | 79 ++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index a7205dbbe..ba58af63d 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1875,25 +1875,25 @@ class SliceToInameReplacer(IdentityMapper): An instance of :class:`loopy.LoopKernel` - .. attribute:: iname_domains + .. attribute:: subarray_ref_bounds - An instance of :class:`dict` to store the slices enountered in the + A :class:`list` (one entry for each :class:`SubArrayRef` to be created) + of :class:`dict` instances to store the slices enountered in the expressions as a mapping from ``iname`` to a tuple of ``(start, stop, - step)``, which describes the affine constraint imposed on the ``iname`` - by the corresponding slice notation its intended to replace. - - :Example: - - ``x[:, i, :, j]`` would be mapped to ``[islice_0, islice_1]: - x[islice_0, i, islice_1, j]`` - + step)``, which describes the boxy (i.e. affine) constraints imposed on + the ``iname`` by the corresponding slice notation its intended to + replace. 
""" def __init__(self, knl, var_name_gen): self.var_name_gen = var_name_gen self.knl = knl - self.iname_domains = {} + + self.subarray_ref_bounds = [] def map_subscript(self, expr): + subscript_iname_bounds = {} + self.subarray_ref_bounds.append(subscript_iname_bounds) + updated_index = [] swept_inames = [] for i, index in enumerate(expr.index_tuple): @@ -1910,7 +1910,7 @@ class SliceToInameReplacer(IdentityMapper): "-- maybe add the shape for the sliced argument.") start, stop, step = get_slice_params( index, domain_length) - self.iname_domains[unique_var_name] = (start, stop, step) + subscript_iname_bounds[unique_var_name] = (start, stop, step) if step > 0: updated_index.append(step*Variable(unique_var_name)) @@ -1950,35 +1950,38 @@ class SliceToInameReplacer(IdentityMapper): tuple(self.rec(_convert_array_to_slices(par)) for par in expr.parameters)) + # FIXME: Missing map_call_with_kwargs + def get_iname_domain_as_isl_set(self): """ Returns the extra domain constraints imposed by the slice inames, recorded in :attr:`iname_domains`. """ - if not self.iname_domains: - return None + subarray_ref_domains = [] + for sar_bounds in self.subarray_ref_bounds: + ctx = self.knl.isl_context + space = isl.Space.create_from_names(ctx, + set=list(sar_bounds.keys())) + from loopy.symbolic import DependencyMapper + args_as_params_for_domains = set() + for _, (start, stop, step) in sar_bounds.items(): + args_as_params_for_domains |= DependencyMapper()(start) + args_as_params_for_domains |= DependencyMapper()(stop) + args_as_params_for_domains |= DependencyMapper()(step) - ctx = self.knl.isl_context - space = isl.Space.create_from_names(ctx, - set=list(self.iname_domains.keys())) - from loopy.symbolic import DependencyMapper - args_as_params_for_domains = set() - for _, (start, stop, step) in self.iname_domains.items(): - args_as_params_for_domains |= DependencyMapper()(start) - args_as_params_for_domains |= DependencyMapper()(stop) - args_as_params_for_domains |= DependencyMapper()(step) + space = space.add_dims(dim_type.param, len(args_as_params_for_domains)) + for i, arg in enumerate(args_as_params_for_domains): + space = space.set_dim_id(dim_type.param, i, isl.Id(arg.name)) - space = space.add_dims(1, len(args_as_params_for_domains)) - for i, arg in enumerate(args_as_params_for_domains): - space = space.set_dim_id(1, i, isl.Id(arg.name)) + iname_set = isl.BasicSet.universe(space) - iname_set = isl.BasicSet.universe(space) + from loopy.isl_helpers import make_slab + for iname, (start, stop, step) in sar_bounds.items(): + iname_set = iname_set & make_slab(space, iname, start, stop, step) - from loopy.isl_helpers import make_slab - for iname, (start, stop, step) in self.iname_domains.items(): - iname_set = iname_set & make_slab(space, iname, start, stop, step) + subarray_ref_domains.append(iname_set) - return iname_set + return subarray_ref_domains def realize_slices_array_inputs_as_sub_array_refs(kernel): @@ -2004,15 +2007,11 @@ def realize_slices_array_inputs_as_sub_array_refs(kernel): raise NotImplementedError("Unknown type of instruction -- %s" % type(insn)) - slice_iname_domains = slice_replacer.get_iname_domain_as_isl_set() - - if slice_iname_domains: - from loopy.kernel.tools import DomainChanger - domch = DomainChanger(kernel.copy(instructions=new_insns), frozenset()) - return kernel.copy(domains=domch.get_domains_with(slice_iname_domains), - instructions=new_insns) - else: - return kernel.copy(instructions=new_insns) + return kernel.copy( + domains=( + kernel.domains + + 
slice_replacer.get_iname_domain_as_isl_set()), + instructions=new_insns) # }}} -- GitLab From 8ca632eeb2ee0981fd8cf800185a541683662e98 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 24 May 2019 00:07:31 -0500 Subject: [PATCH 507/916] includes lower bound while noting the shape --- loopy/kernel/function_interface.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 6f8ff3ff7..8ece3acdd 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -165,7 +165,8 @@ def get_arg_descriptor_for_expression(kernel, expr): DimTag(strides_as_dict[iname]) for iname in expr.swept_inames) sub_shape = tuple( pw_aff_to_expr( - kernel.get_iname_bounds(iname.name).upper_bound_pw_aff)+1 + kernel.get_iname_bounds(iname.name).upper_bound_pw_aff + - kernel.get_iname_bounds(iname.name).lower_bound_pw_aff)+1 for iname in expr.swept_inames) if expr.swept_inames == (): sub_shape = (1, ) -- GitLab From 35196f30b0116cae453bc402c76aea350d69744a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 24 May 2019 00:50:08 -0500 Subject: [PATCH 508/916] Add _remove kwarg to fix_parameters to allow avoiding removal of the parameters --- loopy/transform/parameter.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index b7d017ec8..5c5e94028 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -71,7 +71,7 @@ def assume(kernel, assumptions): # {{{ fix_parameter -def _fix_parameter(kernel, name, value): +def _fix_parameter(kernel, name, value, remove_argument): def process_set(s): var_dict = s.get_var_dict() @@ -107,7 +107,7 @@ def _fix_parameter(kernel, name, value): from loopy.kernel.array import ArrayBase new_args = [] for arg in kernel.args: - if arg.name == name: + if arg.name == name and remove_argument: # remove from argument list continue @@ -148,8 +148,15 @@ def fix_parameters(kernel, **value_dict): """ assert isinstance(kernel, LoopKernel) + # FIXME: Parameter / argument terminology? + + # FIXME: Is _remove the right approach? (I'm not sure it is.) Because of + # the potential namespace conflict. If yes, document. If no, fix. 
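# Intended usage of the new keyword (a sketch inferred from this patch only;
# the flag is popped out of **value_dict just below, given an existing kernel knl):
#
#     knl = lp.fix_parameters(knl, n=128)                  # fixes n and drops the argument
#     knl = lp.fix_parameters(knl, n=128, _remove=False)   # fixes n but keeps it in knl.args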
+ + remove_arg = value_dict.pop("_remove", True) + for name, value in six.iteritems(value_dict): - kernel = _fix_parameter(kernel, name, value) + kernel = _fix_parameter(kernel, name, value, remove_arg) return kernel -- GitLab From afc94955ac5d17389e76edcdcf5962a2049309bf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 24 May 2019 00:50:37 -0500 Subject: [PATCH 509/916] Remove arg_descr_inferring debug print --- loopy/kernel/function_interface.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 8ece3acdd..2724b1541 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -669,7 +669,6 @@ class CallableKernel(InKernelCallable): def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): # tune the subkernel so that we have the matching shapes and # dim_tags - print('Started arg_descr_inferring for {0}'.format(self.subkernel.name)) # {{{ map the arg_descrs so that all the variables are from the callees # perspective -- GitLab From 3a6d562e70e053334a9d08f6bf6b867c8d00fe65 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 24 May 2019 00:51:44 -0500 Subject: [PATCH 510/916] Add Program.with_kernel, tweak Program.__getitem__ to return LoopKernel --- loopy/program.py | 17 ++++++++++------- test/test_fortran.py | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index b44ea8504..9840eb9d9 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -356,18 +356,21 @@ class Program(ImmutableRecord): """:returns: a copy of *self* with the topmost level kernel as *root_kernel*. """ - new_in_knl_callable = self.callables_table[ - self.name].copy(subkernel=root_kernel) - new_resolved_functions = ( - self.callables_table.resolved_functions.copy()) - new_resolved_functions[self.name] = new_in_knl_callable - + assert self.name == root_kernel.name + return self.with_kernel(root_kernel) + + def with_kernel(self, kernel): + # FIXME: Currently only replaces kernel. Should also work for adding. + # FIXME: Document + new_in_knl_callable = self.callables_table[kernel.name].copy(subkernel=kernel) + new_resolved_functions = self.callables_table.resolved_functions.copy() + new_resolved_functions[kernel.name] = new_in_knl_callable return self.copy( callables_table=self.callables_table.copy( resolved_functions=new_resolved_functions)) def __getitem__(self, name): - return self.callables_table[name] + return self.callables_table[name].subkernel def __call__(self, *args, **kwargs): key = self.target.get_kernel_executor_cache_key(*args, **kwargs) diff --git a/test/test_fortran.py b/test/test_fortran.py index e0aa22f5f..2b62148a9 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -542,7 +542,7 @@ def test_domain_fusion_imperfectly_nested(): prg = lp.parse_fortran(fortran_src) # If n > 0 and m == 0, a single domain would be empty, # leading (incorrectly) to no assignments to 'a'. 
- assert len(prg["imperfect"].subkernel.domains) > 1 + assert len(prg["imperfect"].domains) > 1 if __name__ == "__main__": -- GitLab From c52cb154db0125f24c0ef3479a1512f79e0e38c0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 24 May 2019 00:52:09 -0500 Subject: [PATCH 511/916] Fix grammar in array/scalar passing error messages --- loopy/kernel/function_interface.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 2724b1541..187f0ae24 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -725,8 +725,8 @@ class CallableKernel(InKernelCallable): if isinstance(descr, ArrayArgDescriptor): if not isinstance(self.subkernel.arg_dict[arg_id], ArrayArg): - raise LoopyError("Array passed to a scalar argument " - " '%s' of the function '%s' (in '%s')." % ( + raise LoopyError("Array passed to scalar argument " + "'%s' of the function '%s' (in '%s')." % ( arg_id, self.subkernel.name, caller_kernel.name)) if self.subkernel.arg_dict[arg_id].shape and ( @@ -746,8 +746,8 @@ class CallableKernel(InKernelCallable): new_args] elif isinstance(descr, ValueArgDescriptor): if not isinstance(self.subkernel.arg_dict[arg_id], ValueArg): - raise LoopyError("Scalar passed to an array argument " - " '%s' of the callable '%s' (in '%s')" % ( + raise LoopyError("Scalar passed to array argument " + "'%s' of the callable '%s' (in '%s')" % ( arg_id, self.subkernel.name, caller_kernel.name)) else: -- GitLab From 16a5b46e8fafa65e8a0cd8443b41cdbd81545ed5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 24 May 2019 10:37:31 -0500 Subject: [PATCH 512/916] rename subkernels only while exiting --- loopy/program.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 9840eb9d9..0e914c8bc 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -362,7 +362,8 @@ class Program(ImmutableRecord): def with_kernel(self, kernel): # FIXME: Currently only replaces kernel. Should also work for adding. 
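# For orientation, a hypothetical round trip using the accessors added in the
# patches above ("mycallee" is a made-up callee name, prg an existing Program):
#
#     knl = prg["mycallee"]              # __getitem__ hands back the LoopKernel
#     knl = lp.split_iname(knl, "i", 4)
#     prg = prg.with_kernel(knl)         # put the transformed kernel back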
# FIXME: Document - new_in_knl_callable = self.callables_table[kernel.name].copy(subkernel=kernel) + new_in_knl_callable = self.callables_table[kernel.name].copy( + subkernel=kernel) new_resolved_functions = self.callables_table.resolved_functions.copy() new_resolved_functions[kernel.name] = new_in_knl_callable return self.copy( @@ -599,9 +600,6 @@ class CallablesTable(ImmutableRecord): history = dict((func_id, frozenset([func_id])) for func_id in resolved_functions) - assert all(call.subkernel.name == name for name, call in - resolved_functions.items() if isinstance(call, CallableKernel)) - super(CallablesTable, self).__init__( resolved_functions=resolved_functions, history=history, @@ -829,10 +827,6 @@ class CallablesTable(ImmutableRecord): unique_function_identifier = ( next_indexed_function_identifier( unique_function_identifier)) - if isinstance(in_kernel_callable, CallableKernel): - in_kernel_callable = (in_kernel_callable.copy( - subkernel=in_kernel_callable.subkernel.copy( - name=unique_function_identifier))) updated_resolved_functions = self.resolved_functions.copy() updated_resolved_functions[unique_function_identifier] = ( @@ -902,6 +896,10 @@ class CallablesTable(ImmutableRecord): in_knl_callable) new_history[new_func_id] = self.history[func_id] else: + if isinstance(in_knl_callable, CallableKernel): + in_knl_callable = in_knl_callable.copy( + subkernel=in_knl_callable.subkernel.copy( + name=func_id)) new_resolved_functions[func_id] = in_knl_callable new_history[func_id] = self.history[func_id] -- GitLab From 0e10220ae2a47d9d000501c68619bd2943b4b39c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 24 May 2019 15:05:07 -0500 Subject: [PATCH 513/916] Programmability tweaks for lp.Program --- loopy/program.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 0e914c8bc..1bbd2fe04 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -370,8 +370,15 @@ class Program(ImmutableRecord): callables_table=self.callables_table.copy( resolved_functions=new_resolved_functions)) + def __iter__(self): + return six.iterkeys(self.callables_table.resolved_functions) + def __getitem__(self, name): - return self.callables_table[name].subkernel + result = self.callables_table[name] + if isinstance(result, CallableKernel): + return result.subkernel + else: + return result def __call__(self, *args, **kwargs): key = self.target.get_kernel_executor_cache_key(*args, **kwargs) -- GitLab From f8051fcf6dff9531d45827c87754f280d5d0ea87 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 24 May 2019 17:56:03 -0500 Subject: [PATCH 514/916] Fix, test stride mismatch check --- loopy/target/execution.py | 2 +- test/test_loopy.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index f6a1d9ad0..9d1d14376 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -533,7 +533,7 @@ class ExecutionWrapperGeneratorBase(object): gen("(%s,) = %s.shape" % (", ".join(shape), arg.name)) gen("(%s,) = %s.strides" % (", ".join(strides), arg.name)) - gen("if not %s:" + gen("if not (%s):" % self.get_strides_check_expr( shape, strides, (strify(s) for s in sym_strides))) diff --git a/test/test_loopy.py b/test/test_loopy.py index 0b5462cc2..20052d196 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2955,6 +2955,22 @@ def test_temp_var_type_deprecated_usage(): temp_var_types=(np.dtype(np.int32),)) +def 
test_shape_mismatch_check(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + prg = lp.make_kernel( + "{[i,j]: 0 <= i < n and 0 <= j < m}", + "c[i] = sum(j, a[i,j]*b[j])", + default_order="F") + + a = np.random.rand(10, 10).astype(np.float32) + b = np.random.rand(10).astype(np.float32) + + with pytest.raises(TypeError, match="strides mismatch"): + prg(queue, a=a, b=b) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From c74315280738f7b13ecb516305cda5712f152855 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 25 May 2019 14:00:43 -0500 Subject: [PATCH 515/916] Fortran parse, preprocess, codegen: use ProcessLogger --- loopy/codegen/__init__.py | 11 ++++++----- loopy/frontend/fortran/__init__.py | 8 ++++++++ loopy/preprocess.py | 12 +++++++----- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d12d36486..70cd7cc95 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -22,6 +22,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import logging +logger = logging.getLogger(__name__) + import six from loopy.diagnostic import LoopyError, warn @@ -39,9 +42,7 @@ from functools import reduce from loopy.kernel.function_interface import CallableKernel from cgen import Collection - -import logging -logger = logging.getLogger(__name__) +from pytools import ProcessLogger # {{{ implemented data info @@ -457,7 +458,7 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): from loopy.check import pre_codegen_checks pre_codegen_checks(kernel, callables_table) - logger.info("%s: generate code: start" % kernel.name) + codegen_plog = ProcessLogger(logger, "%s: generate code" % kernel.name) # {{{ examine arg list @@ -564,7 +565,7 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): implemented_domains=LazilyUnpicklingDict( codegen_result.implemented_domains)) - logger.info("%s: generate code: done" % kernel.name) + codegen_plog.done() if CACHING_ENABLED: code_gen_cache.store_if_not_present(input_kernel, codegen_result) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index df3cff996..3516ca29a 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -22,7 +22,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import logging +logger = logging.getLogger(__name__) + from loopy.diagnostic import LoopyError +from pytools import ProcessLogger def c_preprocess(source, defines=None, filename=None, include_paths=None): @@ -243,6 +247,8 @@ def parse_fortran(source, filename="", free_form=None, strict=None, :returns: a :class:`loopy.Program` """ + parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) + if seq_dependencies is not None and auto_dependencies is not None: raise TypeError( "may not specify both seq_dependencies and auto_dependencies") @@ -295,6 +301,8 @@ def parse_fortran(source, filename="", free_form=None, strict=None, # THIS IS A VERY IMPORTANT FIXME!! prog = register_callable_kernel(prog, callee_knl) + parse_plog.done() + return prog diff --git a/loopy/preprocess.py b/loopy/preprocess.py index bbadb99ef..61f130a6b 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -22,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import logging +logger = logging.getLogger(__name__) import six from loopy.diagnostic import ( @@ -42,8 +44,8 @@ from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) from loopy.program import Program, iterate_over_kernels_if_given_program from loopy.kernel.function_interface import CallableKernel, ScalarCallable -import logging -logger = logging.getLogger(__name__) + +from pytools import ProcessLogger # {{{ prepare for caching @@ -2320,7 +2322,7 @@ def preprocess_single_kernel(kernel, callables_table, device=None): # }}} - logger.info("%s: preprocess start" % kernel.name) + prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name) from loopy.check import check_identifiers_in_subst_rules check_identifiers_in_subst_rules(kernel) @@ -2378,11 +2380,11 @@ def preprocess_single_kernel(kernel, callables_table, device=None): kernel = kernel.target.preprocess(kernel) - logger.info("%s: preprocess done" % kernel.name) - kernel = kernel.copy( state=KernelState.PREPROCESSED) + prepro_logger.done() + # {{{ prepare for caching # PicklableDtype instances for example need to know the target they're working -- GitLab From 139a3a54a5940a49f73cf1bf972e00527562f67d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 25 May 2019 23:52:03 -0500 Subject: [PATCH 516/916] Doc typo fix --- loopy/transform/callable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 042990c77..6c43dd508 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -44,7 +44,7 @@ __doc__ = """ .. autofunction:: register_function_id_to_in_knl_callable_mapper -.. autofunction:: eegister_callable_kernel +.. autofunction:: register_callable_kernel """ -- GitLab From 496d8dd70b2ea65cf9daffc95638b5b68f27ba77 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 25 May 2019 23:52:41 -0500 Subject: [PATCH 517/916] set_temporary_scope: set address_space, not scope --- loopy/transform/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index f3bce038e..2c9499d9d 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -737,7 +737,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): except KeyError: raise LoopyError("temporary '%s' not found" % tv_name) - new_temp_vars[tv_name] = tv.copy(scope=scope) + new_temp_vars[tv_name] = tv.copy(address_space=scope) return kernel.copy(temporary_variables=new_temp_vars) -- GitLab From c27cf9faab28157e7b03adf9ca1d1cba2a9ec8e3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 25 May 2019 23:55:50 -0500 Subject: [PATCH 518/916] Barrier insertion: include kernel name in diagnostic --- loopy/schedule/__init__.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 5b97f1e10..b37f87ec4 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1658,16 +1658,17 @@ def _insn_ids_reaching_end(schedule, kind, reverse): return insn_ids_alive_at_scope[-1] -def append_barrier_or_raise_error(schedule, dep, verify_only): +def append_barrier_or_raise_error(kernel_name, schedule, dep, verify_only): if verify_only: from loopy.diagnostic import MissingBarrierError raise MissingBarrierError( - "Dependency '%s' (for variable '%s') " + "%s: Dependency '%s' (for variable '%s') " "requires synchronization " "by a %s barrier (add a 
'no_sync_with' " "instruction option to state that no " "synchronization is needed)" % ( + kernel_name, dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id), dep.variable, @@ -1738,7 +1739,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 for dep in chain.from_iterable( dep_tracker.gen_dependencies_with_target_at(insn) for insn in loop_head): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) # This barrier gets inserted outside the loop, hence it is # executed unconditionally and so kills all sources before # the loop. @@ -1770,7 +1772,7 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 elif isinstance(sched_item, RunInstruction): for dep in dep_tracker.gen_dependencies_with_target_at( sched_item.insn_id): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error(kernel.name, result, dep, verify_only) dep_tracker.discard_all_sources() break result.append(sched_item) -- GitLab From 6b517edd82e86c8a808a97ddd97a013b984ab3c5 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 26 May 2019 00:01:43 -0500 Subject: [PATCH 519/916] Fix ArrayArgDescriptor.update_persistent_hash: shape may be a pymbolic expression --- loopy/kernel/function_interface.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 187f0ae24..aa7457879 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -121,7 +121,12 @@ class ArrayArgDescriptor(ImmutableRecord): dim_tag in self.dim_tags))) return frozenset(var.name for var in result) - update_persistent_hash = update_persistent_hash + # FIXME ArrayArgDescriptor should never need to be persisted, remove + # this method when that is so. 
+ def update_persistent_hash(self, key_hash, key_builder): + key_builder.update_for_pymbolic_expression(key_hash, self.shape) + key_builder.rec(key_hash, self.address_space) + key_builder.rec(key_hash, self.dim_tags) def get_arg_descriptor_for_expression(kernel, expr): -- GitLab From 9f764e8d6276011a9b1f829c317dbbb152350722 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 26 May 2019 00:15:26 -0500 Subject: [PATCH 520/916] LoopKernel.global_var_names: only consider ArrayArgs with GLOBAL address space --- loopy/kernel/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index e5e6a61ec..77313f7fd 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -983,7 +983,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return ( set( arg.name for arg in self.args - if isinstance(arg, ArrayArg)) + if isinstance(arg, ArrayArg) + and arg.address_space == AddressSpace.GLOBAL) | set( tv.name for tv in six.itervalues(self.temporary_variables) -- GitLab From 6ac7bcbdada76c93eac84e2c1c3cc93df515a734 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 26 May 2019 01:39:05 -0500 Subject: [PATCH 521/916] Add missing folds around identify_root_kernel --- loopy/kernel/tools.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 7c0f3c095..397514b32 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1954,6 +1954,8 @@ def infer_args_are_output_only(kernel): # }}} +# {{{ identify_root_kernel + class CallCollector(CombineMapper): def combine(self, values): import operator @@ -2006,4 +2008,6 @@ def identify_root_kernel(kernels): root_knl_name, = (kernel_names - all_calls) return root_knl_name +# }}} + # vim: foldmethod=marker -- GitLab From 827348c08e1e896c5313454ee31cde804459dda6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 26 May 2019 01:40:52 -0500 Subject: [PATCH 522/916] Disable, add FIXME for check_for_unused_hw_axes --- loopy/check.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index 796c5b4bd..1b99e9c04 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1011,7 +1011,11 @@ def pre_codegen_checks(kernel, callables_table): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel, callables_table) + # FIXME `check_for_unused_hw_axes_in_insns` currently flags a problem + # in the callee if a caller kernel, at a call site, uses hardware axes + # (say `g.0` and `g.1`). It does not seem that that knowledge is + # propagated to the callee. 
+ # check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) -- GitLab From 737c7a8eb7df3aacfa26fd656deb909d0325bdab Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 26 May 2019 18:03:07 -0500 Subject: [PATCH 523/916] Fix order flip in GridOverrideForCalleeKernel --- loopy/kernel/function_interface.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index aa7457879..89db0edc7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -237,14 +237,14 @@ class GridOverrideForCalleeKernel(ImmutableRecord): :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. - .. attribute:: local_size - - The local work group size that has to be set in the callee kernel. - .. attribute:: global_size The global work group size that to be set in the callee kernel. + .. attribute:: local_size + + The local work group size that has to be set in the callee kernel. + .. note:: This class acts as a pseudo-callable and its significance lies in @@ -252,12 +252,12 @@ class GridOverrideForCalleeKernel(ImmutableRecord): """ fields = set(["local_size", "global_size"]) - def __init__(self, local_size, global_size): - self.local_size = local_size + def __init__(self, global_size, local_size): self.global_size = global_size + self.local_size = local_size def __call__(self, insn_ids, callables_table, ignore_auto=True): - return self.local_size, self.global_size + return self.global_size, self.local_size # }}} @@ -802,7 +802,7 @@ class CallableKernel(InKernelCallable): return self.copy( subkernel=self.subkernel.copy( overridden_get_grid_sizes_for_insn_ids=( - GridOverrideForCalleeKernel(lsize, gsize)))) + GridOverrideForCalleeKernel(gsize, lsize)))) def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and -- GitLab From cda9c7ebbd1465d0a2c864861cd488d1241819d3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 26 May 2019 18:51:49 -0500 Subject: [PATCH 524/916] modifies the test to not pass when glens = llens --- test/test_callables.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/test/test_callables.py b/test/test_callables.py index af7e12180..9739ca496 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -216,40 +216,46 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 2 ** 5 - x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + x_dev = cl.random.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.random.rand(queue, (n, n, n, n, n), np.float64) callee_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 32}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name='linear_combo') - callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0") + callee_knl = lp.split_iname(callee_knl, "i", 2, inner_tag="l.0", outer_tag="g.0") caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<32}", """ [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """ ) 
- caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") + caller_knl = lp.split_iname(caller_knl, "i", 8, inner_tag="l.1", outer_tag="g.1") knl = lp.register_callable_kernel( caller_knl, callee_knl) + knl = lp.set_options(knl, 'return_dict') + + gsize, lsize = knl.get_grid_size_upper_bounds_as_exprs() + if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo') - evt, (out, ) = knl(queue, x=x_dev, y=y_dev) + evt, out = knl(queue, x=x_dev, y=y_dev) x_host = x_dev.get() y_host = y_dev.get() - assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm( + assert gsize == (16, 4) + assert lsize == (2, 8) + assert np.linalg.norm(2*x_host+3*y_host-out['z'].get())/np.linalg.norm( 2*x_host+3*y_host) < 1e-15 -- GitLab From d1683e0c0dbde7e463cb249f27811b241cec8805 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 26 May 2019 18:52:26 -0500 Subject: [PATCH 525/916] reorders gsize, lsize in infer_hw_axes --- loopy/kernel/function_interface.py | 2 +- loopy/preprocess.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 89db0edc7..1195fc995 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -385,7 +385,7 @@ class InKernelCallable(ImmutableRecord): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) - def with_hw_axes_sizes(self, local_size, global_size): + def with_hw_axes_sizes(self, global_size, local_size): """ Returns a copy of *self* with modifications to comply with the grid sizes ``(local_size, global_size)`` of the program in which it is diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 61f130a6b..de620ef9a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2419,7 +2419,7 @@ def infer_hw_axes_sizes(program): collective value. 
""" - local_size, global_size = program.get_grid_size_upper_bounds() + global_size, local_size = program.get_grid_size_upper_bounds() resolved_function_with_hw_axes_sizes_inferred = {} @@ -2430,7 +2430,7 @@ def infer_hw_axes_sizes(program): in_knl_callable) else: resolved_function_with_hw_axes_sizes_inferred[func_id] = ( - in_knl_callable.with_hw_axes_sizes(local_size, global_size)) + in_knl_callable.with_hw_axes_sizes(global_size, local_size)) new_callables_table = ( program.callables_table.copy( -- GitLab From 9a03edf2a55bfec6489fcb79ce54c0c3b9b5bd0a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 27 May 2019 00:07:01 -0500 Subject: [PATCH 526/916] Add qpolynomial_to_expr --- loopy/symbolic.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index d98c3fdea..e2f9b0b3a 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1655,15 +1655,34 @@ def simplify_using_aff(kernel, expr): # }}} -# {{{ expression/set <-> constraint conversion +# {{{ qpolynomial_to_expr + +def _term_to_expr(space, term): + from pymbolic.primitives import Variable + + result = term.get_coefficient_val().to_python() + for dt in isl._CHECK_DIM_TYPES: + for i in range(term.dim(dt)): + exp = term.get_exp(dt, i) + if exp: + result = result*Variable(space.get_dim_name(dt, i))**exp + + for i in range(term.dim(dim_type.div)): + raise NotImplementedError("divs in terms") + # FIXME print the qpoly, match the semantics + result += aff_to_expr(term.get_div(i)) -def eq_constraint_from_expr(space, expr): - return isl.Constraint.equality_from_aff(aff_from_expr(space, expr)) + return result -def ineq_constraint_from_expr(space, expr): - return isl.Constraint.inequality_from_aff(aff_from_expr(space, expr)) +def qpolynomial_to_expr(qpoly): + space = qpoly.space + return sum(_term_to_expr(space, t) for t in qpoly.get_terms()) +# }}} + + +# {{{ expression/set <-> constraint conversion def constraint_to_cond_expr(cns): # Looks like this is ok after all--get_aff() performs some magic. -- GitLab From 71671d5dcdbf268dbdcd67e7d770aa89480203bf Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 27 May 2019 00:10:11 -0500 Subject: [PATCH 527/916] Add subst_into_pwqpolynomial --- loopy/isl_helpers.py | 91 ++++++++++++++++++++++++++++++++++++++++++++ test/test_isl.py | 17 +++++++++ 2 files changed, 108 insertions(+) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 7acbf62f5..25e5de124 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -25,8 +25,13 @@ THE SOFTWARE. 
""" +import six +import numpy as np from six.moves import range, zip +from pymbolic.mapper.evaluator import \ + EvaluationMapper as EvaluationMapperBase + from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl @@ -734,4 +739,90 @@ def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): # }}} + +# {{{ subst_into_pwqpolynomial + +class QPolynomialEvaluationMapper(EvaluationMapperBase): + def __init__(self, space): + self.zero = isl.QPolynomial.zero_on_domain(space) + + context = {} + for name, (dt, pos) in six.iteritems(space.get_var_dict()): + if dt == dim_type.set: + dt = dim_type.in_ + + context[name] = isl.QPolynomial.var_on_domain(space, dt, pos) + + super(QPolynomialEvaluationMapper, self).__init__(context) + + def map_constant(self, expr): + if isinstance(expr, np.integer): + expr = int(expr) + + return self.zero + expr + + def map_quotient(self, expr): + raise TypeError("true division in '%s' not supported " + "for as-pwaff evaluation" % expr) + + +def subst_into_pwqpolynomial(space, poly, var_dict): + if not poly.get_pieces(): + return isl.PwQPolynomial.zero(space) + + i_begin_subst_space = poly.dim(dim_type.param) + + new_var_dict = {} + for i in range(i_begin_subst_space): + old_name = poly.space.get_dim_name(dim_type.param, i) + new_name = old_name + "'" + new_var_dict[new_name] = var_dict[old_name] + poly = poly.set_dim_name(dim_type.param, i, new_name) + + var_dict = new_var_dict + del new_var_dict + + poly = poly.add_dims(dim_type.param, space.dim(dim_type.param)) + for i in range(space.dim(dim_type.param)): + poly = poly.set_dim_name(dim_type.param, i+i_begin_subst_space, + space.get_dim_name(dim_type.param, i)) + + par_domain = isl.BasicSet.universe(poly.space).params() + par_space = par_domain.space + + from loopy.symbolic import guarded_aff_from_expr, qpolynomial_to_expr + for i in range(i_begin_subst_space): + name = poly.space.get_dim_name(dim_type.param, i) + aff = guarded_aff_from_expr(par_space, var_dict[name]) + aff = aff.set_coefficient_val(dim_type.param, i, -1) + par_domain = par_domain.add_constraint( + isl.Constraint.equality_from_aff(aff)) + + new_pieces = [] + for valid_set, qpoly in poly.get_pieces(): + valid_set = valid_set & par_domain + if valid_set.plain_is_empty(): + continue + + valid_set = valid_set.project_out(dim_type.param, 0, i_begin_subst_space) + from pymbolic.mapper.substitutor import ( + SubstitutionMapper, make_subst_func) + sub_mapper = SubstitutionMapper(make_subst_func(var_dict)) + expr = sub_mapper(qpolynomial_to_expr(qpoly)) + qpoly = QPolynomialEvaluationMapper(valid_set.space)(expr) + + new_pieces.append((valid_set, qpoly)) + + if not new_pieces: + raise ValueError("no pieces of PwQPolynomial survived the substitution") + valid_set, qpoly = new_pieces[0] + result = isl.PwQPolynomial.alloc(valid_set, qpoly) + for valid_set, qpoly in new_pieces[1:]: + result = result.add_disjoint( + isl.PwQPolynomial.alloc(valid_set, qpoly)) + + return result + +# }}} + # vim: foldmethod=marker diff --git a/test/test_isl.py b/test/test_isl.py index bbd4a813e..90c98839d 100644 --- a/test/test_isl.py +++ b/test/test_isl.py @@ -51,6 +51,23 @@ def test_pw_aff_to_conditional_expr(): assert str(expr) == "If(i == 0, 0, -1 + i)" +def test_subst_into_pwqpolynomial(): + from pymbolic.primitives import Variable + arg_dict = { + 'm': 3*Variable("nx"), + 'n': 3*Variable("ny"), + 'nx': Variable('nx'), + 'ny': Variable('ny'), + 'nz': Variable('nz')} + space = isl.Set("[nx, ny, nz] -> { []: }").space + poly = isl.PwQPolynomial("[m, n] 
-> { (256 * m + 256 * m * n) : " + "m > 0 and n > 0; 256 * m : m > 0 and n <= 0 }") + + from loopy.isl_helpers import subst_into_pwqpolynomial + result = subst_into_pwqpolynomial(space, poly, arg_dict) + assert "(768 * nx + 2304 * nx * ny)" in str(result) + + if __name__ == "__main__": import sys if len(sys.argv) > 1: -- GitLab From e9de534c5c96daab4f701c57ee3088985e39a9ad Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 30 May 2019 16:33:33 -0500 Subject: [PATCH 528/916] Make sure subst_into_pwqpolynomial produces PwQPolynomials that have an output dimension in their space --- loopy/isl_helpers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 25e5de124..7d0e754be 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -768,7 +768,9 @@ class QPolynomialEvaluationMapper(EvaluationMapperBase): def subst_into_pwqpolynomial(space, poly, var_dict): if not poly.get_pieces(): - return isl.PwQPolynomial.zero(space) + result = isl.PwQPolynomial.zero(space.insert_dims(dim_type.out, 0, 1)) + assert result.dim(dim_type.out) == 1 + return result i_begin_subst_space = poly.dim(dim_type.param) @@ -821,6 +823,7 @@ def subst_into_pwqpolynomial(space, poly, var_dict): result = result.add_disjoint( isl.PwQPolynomial.alloc(valid_set, qpoly)) + assert result.dim(dim_type.out) return result # }}} -- GitLab From e2ae75f3d6250ab26a23ad3c12925839abfa46ea Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 01:52:28 -0500 Subject: [PATCH 529/916] Refactor subst_into_pwqpolynomial to bring out get_param_subst_domain --- loopy/isl_helpers.py | 87 ++++++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 7d0e754be..0eaba8322 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -766,50 +766,88 @@ class QPolynomialEvaluationMapper(EvaluationMapperBase): "for as-pwaff evaluation" % expr) -def subst_into_pwqpolynomial(space, poly, var_dict): - if not poly.get_pieces(): - result = isl.PwQPolynomial.zero(space.insert_dims(dim_type.out, 0, 1)) - assert result.dim(dim_type.out) == 1 - return result +def get_param_subst_domain(new_space, base_obj, subst_dict): + """Modify the :mod:`islpy` object *base_obj* to incorporate parameters for + the keys of *subst_dict*, and rename existing parameters to include a + trailing prime. + + :arg new_space: A :class:`islpy.Space` for that contains the keys of + *subst_dict* + :arg subst_dict: A dictionary mapping parameters occurring in *base_obj* + to their values in terms of variables in *new_space* + :returns: a tuple ``(base_obj, subst_domain, subst_dict)``, where + *base_obj* is the passed *base_obj* with the space extended to cover + the new parameters in *new_space*, *subst_domain* is an + :class:`islpy.BasicSet` incorporating the constraints from *subst_dict* + and existing in the same space as *base_obj*, and *subst_dict* + is a copy of the passed *subst_dict* modified to incorporate primed + variable names in the keys. 
+ """ - i_begin_subst_space = poly.dim(dim_type.param) + # {{{ rename subst_dict keys and base_obj parameters to include trailing prime + + i_begin_subst_space = base_obj.dim(dim_type.param) - new_var_dict = {} + new_subst_dict = {} for i in range(i_begin_subst_space): - old_name = poly.space.get_dim_name(dim_type.param, i) + old_name = base_obj.space.get_dim_name(dim_type.param, i) new_name = old_name + "'" - new_var_dict[new_name] = var_dict[old_name] - poly = poly.set_dim_name(dim_type.param, i, new_name) + new_subst_dict[new_name] = subst_dict[old_name] + base_obj = base_obj.set_dim_name(dim_type.param, i, new_name) - var_dict = new_var_dict - del new_var_dict + subst_dict = new_subst_dict + del new_subst_dict + + # }}} + + # {{{ add dimensions to base_obj + + base_obj = base_obj.add_dims(dim_type.param, new_space.dim(dim_type.param)) + for i in range(new_space.dim(dim_type.param)): + base_obj = base_obj.set_dim_name(dim_type.param, i+i_begin_subst_space, + new_space.get_dim_name(dim_type.param, i)) + + # }}} - poly = poly.add_dims(dim_type.param, space.dim(dim_type.param)) - for i in range(space.dim(dim_type.param)): - poly = poly.set_dim_name(dim_type.param, i+i_begin_subst_space, - space.get_dim_name(dim_type.param, i)) + # {{{ build subst_domain - par_domain = isl.BasicSet.universe(poly.space).params() - par_space = par_domain.space + subst_domain = isl.BasicSet.universe(base_obj.space).params() - from loopy.symbolic import guarded_aff_from_expr, qpolynomial_to_expr + from loopy.symbolic import guarded_aff_from_expr for i in range(i_begin_subst_space): - name = poly.space.get_dim_name(dim_type.param, i) - aff = guarded_aff_from_expr(par_space, var_dict[name]) + name = base_obj.space.get_dim_name(dim_type.param, i) + aff = guarded_aff_from_expr(subst_domain.space, subst_dict[name]) aff = aff.set_coefficient_val(dim_type.param, i, -1) - par_domain = par_domain.add_constraint( + subst_domain = subst_domain.add_constraint( isl.Constraint.equality_from_aff(aff)) + # }}} + + return base_obj, subst_domain, subst_dict + + +def subst_into_pwqpolynomial(new_space, poly, subst_dict): + if not poly.get_pieces(): + result = isl.PwQPolynomial.zero(new_space.insert_dims(dim_type.out, 0, 1)) + assert result.dim(dim_type.out) == 1 + return result + + i_begin_subst_space = poly.dim(dim_type.param) + + poly, subst_domain, subst_dict = get_param_subst_domain( + new_space, poly, subst_dict) + + from loopy.symbolic import qpolynomial_to_expr new_pieces = [] for valid_set, qpoly in poly.get_pieces(): - valid_set = valid_set & par_domain + valid_set = valid_set & subst_domain if valid_set.plain_is_empty(): continue valid_set = valid_set.project_out(dim_type.param, 0, i_begin_subst_space) from pymbolic.mapper.substitutor import ( SubstitutionMapper, make_subst_func) - sub_mapper = SubstitutionMapper(make_subst_func(var_dict)) + sub_mapper = SubstitutionMapper(make_subst_func(subst_dict)) expr = sub_mapper(qpolynomial_to_expr(qpoly)) qpoly = QPolynomialEvaluationMapper(valid_set.space)(expr) @@ -817,6 +855,7 @@ def subst_into_pwqpolynomial(space, poly, var_dict): if not new_pieces: raise ValueError("no pieces of PwQPolynomial survived the substitution") + valid_set, qpoly = new_pieces[0] result = isl.PwQPolynomial.alloc(valid_set, qpoly) for valid_set, qpoly in new_pieces[1:]: -- GitLab From dd5e9601950c26040a115d1383217afe6f27195a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 01:53:58 -0500 Subject: [PATCH 530/916] Document callables_table arg to grid size finding functions --- 
loopy/kernel/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 77313f7fd..5836b20cb 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1048,6 +1048,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs + :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` *global_size* and *local_size* are instances of :class:`dict` with mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. @@ -1080,6 +1081,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): frozenset(insn.id for insn in callee_kernel.instructions), callables_table, ignore_auto) + # FIXME: Should assert that nothing is being overwritten global_sizes.update(gsize) local_sizes.update(lsize) @@ -1133,6 +1135,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs + :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` *global_size* and *local_size* are :class:`islpy.PwAff` objects. """ @@ -1185,6 +1188,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs + :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` *global_size* and *local_size* are :mod:`pymbolic` expressions """ @@ -1214,6 +1218,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. + :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` + *global_size* and *local_size* are :mod:`pymbolic` expressions """ -- GitLab From cfe6768515f12120045ba7394893562117bfe54b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 01:56:36 -0500 Subject: [PATCH 531/916] Add isl-space sanity checks to GuardedPwQPolynomial (stats) --- loopy/statistics.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 1808af420..58fd2822d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -73,11 +73,20 @@ __doc__ = """ # {{{ GuardedPwQPolynomial +def _get_param_tuple(obj): + return tuple( + obj.get_dim_name(dim_type.param, i) + for i in range(obj.dim(dim_type.param))) + + class GuardedPwQPolynomial(object): def __init__(self, pwqpolynomial, valid_domain): self.pwqpolynomial = pwqpolynomial self.valid_domain = valid_domain + assert (_get_param_tuple(pwqpolynomial.space) + == _get_param_tuple(valid_domain.space)) + def __add__(self, other): if isinstance(other, GuardedPwQPolynomial): return GuardedPwQPolynomial( -- GitLab From 7a1db93799592bc650f3775152b47e7707b8d4db Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 01:57:22 -0500 Subject: [PATCH 532/916] Add a sanity check to ToCountMap (stats) --- loopy/statistics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 58fd2822d..cd3cd3298 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -157,6 +157,12 @@ class ToCountMap(object): def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial): if init_dict is None: init_dict = {} + + for val in init_dict.values(): + if isinstance(val, isl.PwQPolynomial): + assert val.dim(dim_type.out) + elif isinstance(val, GuardedPwQPolynomial): + assert val.pwqpolynomial.dim(dim_type.out) self.count_map = init_dict 
self.val_type = val_type -- GitLab From a7a1bcb030be44b3e7b1338f8825655d9ced9003 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 01:58:34 -0500 Subject: [PATCH 533/916] Eliminate redundant key lookup in ToCountMap.__mul__ (stats) --- loopy/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index cd3cd3298..693badda1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -183,8 +183,8 @@ class ToCountMap(object): def __mul__(self, other): if isinstance(other, GuardedPwQPolynomial): return ToCountMap(dict( - (index, self.count_map[index]*other) - for index in self.keys())) + (index, value*other) + for index, value in six.iteritems(self.count_map))) else: raise ValueError("ToCountMap: Attempted to multiply " "ToCountMap by {0} {1}." -- GitLab From 1d86c380bb462d8a405e02aa7ebfdbb8d24bfbbe Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 01:59:14 -0500 Subject: [PATCH 534/916] ToCountMap: improve printing (stats) --- loopy/statistics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 693badda1..403590b2c 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -208,6 +208,11 @@ class ToCountMap(object): def __repr__(self): return repr(self.count_map) + def __str__(self): + return "\n".join( + "%s: %s" % (k, v) + for k, v in six.iteritems(self.count_map)) + def __len__(self): return len(self.count_map) -- GitLab From 43aec22986c6ce113c7da69f8e52d790fa33800b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 02:00:00 -0500 Subject: [PATCH 535/916] stats: Implement subst_into_to_count_map --- loopy/statistics.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/loopy/statistics.py b/loopy/statistics.py index 403590b2c..721a4d8a9 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -483,6 +483,48 @@ class ToCountMap(object): # }}} +# {{{ subst_into_to_count_map + +def subst_into_guarded_pwqpolynomial(new_space, guarded_poly, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial, get_param_subst_domain + + poly = subst_into_pwqpolynomial( + new_space, guarded_poly.pwqpolynomial, subst_dict) + + valid_domain = guarded_poly.valid_domain + i_begin_subst_space = valid_domain.dim(dim_type.param) + + valid_domain, subst_domain, _ = get_param_subst_domain( + new_space, guarded_poly.valid_domain, subst_dict) + + valid_domain = valid_domain & subst_domain + valid_domain = valid_domain.project_out(dim_type.param, 0, i_begin_subst_space) + return GuardedPwQPolynomial(poly, valid_domain) + + +def subst_into_to_count_map(space, tcm, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial + result = {} + for key, value in six.iteritems(tcm.count_map): + # FIXME: This strips away the guards. 
Rather than being stripped, + # they should also have the substitution applied + if isinstance(value, GuardedPwQPolynomial): + result[key] = subst_into_guarded_pwqpolynomial(space, value, subst_dict) + + elif isinstance(value, isl.PwQPolynomial): + result[key] = subst_into_pwqpolynomial(space, value, subst_dict) + + elif isinstance(value, int): + result[key] = value + + else: + raise ValueError("unexpected value type") + + return ToCountMap(result, val_type=isl.PwQPolynomial) + +# }}} + + def stringify_stats_mapping(m): result = "" for key in sorted(m.keys(), key=lambda k: str(k)): -- GitLab From f296a71a526f3a5e94d28f5909ea53033ff24d45 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 02:07:30 -0500 Subject: [PATCH 536/916] Add kernel_name to Op and MemAccess (stats) --- loopy/statistics.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 721a4d8a9..8eaee802d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -587,27 +587,38 @@ class Op(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. + """ - def __init__(self, dtype=None, name=None, count_granularity=None): + def __init__(self, dtype=None, name=None, count_granularity=None, + kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) if dtype is None: Record.__init__(self, dtype=dtype, name=name, - count_granularity=count_granularity) + count_granularity=count_granularity, + kernel_name=kernel_name) else: from loopy.types import to_loopy_type Record.__init__(self, dtype=to_loopy_type(dtype), name=name, - count_granularity=count_granularity) + count_granularity=count_granularity, + kernel_name=kernel_name) def __hash__(self): return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) + if self.kernel_name is not None: + return "Op(%s, %s, %s, %s)" % ( + self.dtype, self.name, self.count_granularity, self.kernel_name) + else: + return "Op(%s, %s, %s)" % (self.dtype, self.name, self.count_granularity) # }}} @@ -673,11 +684,14 @@ class MemAccess(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. 
""" def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None, direction=None, variable=None, variable_tag=None, - count_granularity=None): + count_granularity=None, kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " @@ -688,14 +702,16 @@ class MemAccess(Record): Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, variable_tag=variable_tag, - count_granularity=count_granularity) + count_granularity=count_granularity, + kernel_name=kernel_name) else: from loopy.types import to_loopy_type Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, variable_tag=variable_tag, - count_granularity=count_granularity) + count_granularity=count_granularity, + kernel_name=kernel_name) def __hash__(self): # Note that this means lid_strides and gid_strides must be sorted @@ -704,7 +720,7 @@ class MemAccess(Record): def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s)" % ( + return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, None if self.lid_strides is None else dict( @@ -714,7 +730,8 @@ class MemAccess(Record): self.direction, self.variable, self.variable_tag, - self.count_granularity) + self.count_granularity, + self.kernel_name) # }}} -- GitLab From cdecc45bf2ebeda2723a7a8845e85341f658cf24 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 02:12:15 -0500 Subject: [PATCH 537/916] Remove out-of-place validity check (stats) --- loopy/statistics.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 8eaee802d..3b5a81e27 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1220,16 +1220,6 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): - from loopy.program import Program - if isinstance(kernel, Program): - if len([in_knl_callable for in_knl_callable in - kernel.callables_table.values() if isinstance(in_knl_callable, - CallableKernel)]) != 1: - raise NotImplementedError("Currently only supported for program with " - "only one CallableKernel.") - - kernel = kernel.root_kernel - try: if space is not None: set = set.align_params(space) -- GitLab From 1504c7eba05b493c3383122ae9f77ef62fc4bf61 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 02:13:44 -0500 Subject: [PATCH 538/916] Move out-of-place docstring for get_synchronization_map --- loopy/statistics.py | 71 ++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 3b5a81e27..ecad59027 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1836,42 +1836,6 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, def get_synchronization_map_for_single_kernel(knl, callables_table, subgroup_size=None): - """Count the number of synchronization events each work-item encounters in - a loopy kernel. - - :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. - - :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` - ``'guess'``, or *None* that specifies the sub-group size. 
An OpenCL - sub-group is an implementation-dependent grouping of work-items within - a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, - e.g., when counting a :class:`MemAccess` whose count_granularity - specifies that it should only be counted once per sub-group. If set to - *None* an attempt to find the sub-group size using the device will be - made, if this fails an error will be raised. If a :class:`str` - ``'guess'`` is passed as the subgroup_size, get_mem_access_map will - attempt to find the sub-group size using the device and, if - unsuccessful, will make a wild guess. - - :return: A dictionary mapping each type of synchronization event to an - :class:`islpy.PwQPolynomial` holding the number of events per - work-item. - - Possible keys include ``barrier_local``, ``barrier_global`` - (if supported by the target) and ``kernel_launch``. - - Example usage:: - - # (first create loopy kernel and specify array data types) - - sync_map = get_synchronization_map(knl) - params = {'n': 512, 'm': 256, 'l': 128} - barrier_ct = sync_map['barrier_local'].eval_with_dict(params) - - # (now use this count to, e.g., predict performance) - - """ - if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." % knl.name) @@ -1924,6 +1888,41 @@ def get_synchronization_map_for_single_kernel(knl, callables_table, def get_synchronization_map(program, subgroup_size=None): + """Count the number of synchronization events each work-item encounters in + a loopy kernel. + + :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. + + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` + ``'guess'``, or *None* that specifies the sub-group size. An OpenCL + sub-group is an implementation-dependent grouping of work-items within + a work-group, analagous to an NVIDIA CUDA warp. subgroup_size is used, + e.g., when counting a :class:`MemAccess` whose count_granularity + specifies that it should only be counted once per sub-group. If set to + *None* an attempt to find the sub-group size using the device will be + made, if this fails an error will be raised. If a :class:`str` + ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + attempt to find the sub-group size using the device and, if + unsuccessful, will make a wild guess. + + :return: A dictionary mapping each type of synchronization event to an + :class:`islpy.PwQPolynomial` holding the number of events per + work-item. + + Possible keys include ``barrier_local``, ``barrier_global`` + (if supported by the target) and ``kernel_launch``. 
+ + Example usage:: + + # (first create loopy kernel and specify array data types) + + sync_map = get_synchronization_map(knl) + params = {'n': 512, 'm': 256, 'l': 128} + barrier_ct = sync_map['barrier_local'].eval_with_dict(params) + + # (now use this count to, e.g., predict performance) + + """ from loopy.preprocess import preprocess_program, infer_unknown_types -- GitLab From 64f7c58df8cf3bb80667eaae3b95d840b9065ec9 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 02:15:42 -0500 Subject: [PATCH 539/916] Op/MemAccess: Use .copy() rather than explicit constructor to copy, avoids losing attributes (stats) --- loopy/statistics.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index ecad59027..a70c3cb57 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1483,11 +1483,7 @@ def get_op_map_for_single_kernel(knl, callables_table, if numpy_types: return ToCountMap( init_dict=dict( - (Op( - dtype=op.dtype.numpy_dtype, - name=op.name, - count_granularity=op.count_granularity), - ct) + (op.copy(dtype=op.dtype.numpy_dtype), ct) for op, ct in six.iteritems(op_map.count_map)), val_type=op_map.val_type ) @@ -1698,16 +1694,7 @@ def get_mem_access_map_for_single_kernel(knl, callables_table, if numpy_types: return ToCountMap( init_dict=dict( - (MemAccess( - mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - lid_strides=mem_access.lid_strides, - gid_strides=mem_access.gid_strides, - direction=mem_access.direction, - variable=mem_access.variable, - variable_tag=mem_access.variable_tag, - count_granularity=mem_access.count_granularity), - ct) + (mem_access.copy(dtype=mem_access.dtype.numpy_dtype), ct) for mem_access, ct in six.iteritems(access_map.count_map)), val_type=access_map.val_type ) -- GitLab From 3fbeb2b8f37587a49096e229db9ac10645e4d2bb Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 02:16:51 -0500 Subject: [PATCH 540/916] Stats: comment tweaks --- loopy/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index a70c3cb57..89dabe041 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -787,8 +787,8 @@ class CounterBase(CombineMapper): map_derivative = map_common_subexpression map_slice = map_common_subexpression - # preprocessing should have removed these def map_reduction(self, expr): + # preprocessing should have removed these raise RuntimeError("%s encountered %s--not supposed to happen" % (type(self).__name__, type(expr).__name__)) @@ -1838,7 +1838,7 @@ def get_synchronization_map_for_single_kernel(knl, callables_table, one = isl.PwQPolynomial('{ 1 }') def get_count_poly(iname_list): - if iname_list: # (if iname_list is not empty) + if iname_list: ct = (count(knl, ( knl.get_inames_domain(iname_list). 
project_out_except(iname_list, [dim_type.set]) -- GitLab From 9c5283c7eb5ddd5fdf728a204a4ea0d8e55e138f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 18:46:26 -0500 Subject: [PATCH 541/916] loopy.schedule Flake8 fix --- loopy/schedule/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index b37f87ec4..f96dac181 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1772,7 +1772,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 elif isinstance(sched_item, RunInstruction): for dep in dep_tracker.gen_dependencies_with_target_at( sched_item.insn_id): - append_barrier_or_raise_error(kernel.name, result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) dep_tracker.discard_all_sources() break result.append(sched_item) -- GitLab From a5257096bf782975d63f1f24016e04e8634a3708 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 18:50:30 -0500 Subject: [PATCH 542/916] loopy.statistics: Get rid of *_poly compat goop --- loopy/__init__.py | 9 ++---- loopy/statistics.py | 70 --------------------------------------------- 2 files changed, 3 insertions(+), 76 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index fe45308db..a70adf398 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -132,9 +132,8 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction, preprocess_program) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping, - Op, MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, - get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map, - get_synchronization_poly, get_synchronization_map, + Op, MemAccess, get_op_map, get_mem_access_map, + get_synchronization_map, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, @@ -271,9 +270,7 @@ __all__ = [ "generate_code", "generate_code_v2", "generate_body", "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op", - "MemAccess", "get_op_poly", "get_op_map", "get_lmem_access_poly", - "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map", - "get_synchronization_poly", "get_synchronization_map", + "MemAccess", "get_op_map", "get_mem_access_map", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/statistics.py b/loopy/statistics.py index 89dabe041..5e4b1ecf1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -2066,74 +2066,4 @@ def gather_access_footprint_bytes(program, ignore_uncountable=False): # }}} -# {{{ compat goop - -def get_lmem_access_poly(knl): - """Count the number of local memory accesses in a loopy kernel. - - get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['local'] option. - - """ - warn_with_kernel(knl, "deprecated_get_lmem_access_poly", - "get_lmem_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['local'] option.") - return get_mem_access_map(knl).filter_by(mtype=['local']) - - -def get_DRAM_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. 
- - """ - warn_with_kernel(knl, "deprecated_get_DRAM_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - - -def get_gmem_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "deprecated_get_gmem_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - - -def get_synchronization_poly(knl): - """Count the number of synchronization events each work-item encounters in - a loopy kernel. - - get_synchronization_poly is deprecated. Use get_synchronization_map - instead. - - """ - warn_with_kernel(knl, "deprecated_get_synchronization_poly", - "get_synchronization_poly is deprecated. Use " - "get_synchronization_map instead.") - return get_synchronization_map(knl) - - -def get_op_poly(knl, numpy_types=True): - """Count the number of operations in a loopy kernel. - - get_op_poly is deprecated. Use get_op_map instead. - - """ - warn_with_kernel(knl, "deprecated_get_op_poly", - "get_op_poly is deprecated. Use get_op_map instead.") - return get_op_map(knl, numpy_types) - -# }}} - # vim: foldmethod=marker -- GitLab From 118cb24becb9429ecd8d352465673ac1a0eeeeb7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 31 May 2019 19:01:58 -0500 Subject: [PATCH 543/916] Fix loopy.statistics for kernel callables This is a large refactoring, with many pieces: - Counts from subkernels are incorporated using subst_into_{pwqpolynomial,guarded_pwqpolynomial,to_count_map}. This replaces a prior, broken scheme that existed on the kernel callables branch. - Separate ToCountMap and ToCountPolynomialMap, i.e. separate to-count map types by their value type. The latter type now knows (and checks) its isl space. - The numpy_types argument is now deprecated and ignored, it did not seem to do anything previously. - Introduce Sync() count key for synchronization counting. - Code/robustness cleanups in the ToCountMap* types. - All op descriptors now carry a kernel_name. There are still a few FIMXEs, mainly the SUBGROUP granularity and the footprint gatherer. 
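
As a rough sketch of how the reworked interface is exercised after this
change (the kernel, the subgroup size and the parameter value are
illustrative; only entry points that already exist in loopy's statistics
interface are used)::

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
        "{ [i]: 0 <= i < n }",
        "out[i] = 2*a[i] + 3*b[i]")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float64, b=np.float64))

    op_map = lp.get_op_map(knl, subgroup_size=32)

    # keys are Op records, which now also record the kernel_name in which
    # the operation was counted
    f64_ops = op_map.filter_by(dtype=[np.float64]).eval_and_sum({"n": 1024})

Counts gathered inside a callee kernel are rewritten into the caller's
parameter space via subst_into_to_count_map, so sums like the one above stay
well-defined when kernels call kernels.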
--- loopy/__init__.py | 4 +- loopy/isl_helpers.py | 1 + loopy/statistics.py | 945 ++++++++++++++++++++++------------------ test/test_statistics.py | 68 ++- 4 files changed, 571 insertions(+), 447 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a70adf398..fd6c8770c 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -131,7 +131,7 @@ from loopy.type_inference import infer_unknown_types from loopy.preprocess import (preprocess_kernel, realize_reduction, preprocess_program) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping, +from loopy.statistics import (ToCountMap, CountGranularity, Op, MemAccess, get_op_map, get_mem_access_map, get_synchronization_map, gather_access_footprints, gather_access_footprint_bytes) @@ -269,7 +269,7 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op", + "ToCountMap", "CountGranularity", "Op", "MemAccess", "get_op_map", "get_mem_access_map", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 0eaba8322..0cbd18599 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -828,6 +828,7 @@ def get_param_subst_domain(new_space, base_obj, subst_dict): def subst_into_pwqpolynomial(new_space, poly, subst_dict): if not poly.get_pieces(): + assert new_space.is_params() result = isl.PwQPolynomial.zero(new_space.insert_dims(dim_type.out, 0, 1)) assert result.dim(dim_type.out) == 1 return result diff --git a/loopy/statistics.py b/loopy/statistics.py index 5e4b1ecf1..2c3d4f36f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1,6 +1,10 @@ from __future__ import division, absolute_import, print_function -__copyright__ = "Copyright (C) 2015 James Stevens" +__copyright__ = """ +Copyright (C) 2015 James Stevens +Copyright (C) 2018 Kaushik Kulkarni +Copyright (C) 2019 Andreas Kloeckner +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -22,19 +26,19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from functools import partial import six import loopy as lp from islpy import dim_type import islpy as isl from pymbolic.mapper import CombineMapper -from functools import reduce from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from loopy.symbolic import CoefficientCollector -from pytools import Record, memoize_method -from loopy.kernel.function_interface import ScalarCallable, CallableKernel +from pytools import ImmutableRecord, memoize_method +from loopy.kernel.function_interface import CallableKernel from loopy.kernel import LoopKernel from loopy.program import make_program @@ -44,6 +48,7 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: ToCountMap +.. autoclass:: ToCountPolynomialMap .. autoclass:: CountGranularity .. autoclass:: Op .. autoclass:: MemAccess @@ -63,13 +68,29 @@ __doc__ = """ """ -# FIXME: this is broken for the callable kernel design. -# - The variable name, what if multiple kernels use the same name?(needs a -# different MemAccessInfo) -# - We should also add the cumulative effect on the arguments of callee kernels -# into the caller kernel -# - Make changes to MemAccessInfo to include the effect of several kernels. 
-# - Renovate `count`. +# FIXME: +# - The SUBGROUP granularity is completely broken if the root kernel +# contains the grid and the operations get counted in the callee. +# To test, most of those are set to WORKITEM instead below (marked +# with FIXMEs). This leads to value mismatches and key errors in +# the tests. +# - Currently, nothing prevents summation across different +# granularities, which is guaranteed to yield bogus results. +# - AccessFootprintGatherer needs to be redone to match get_op_map and +# get_mem_access_map style +# - Test for the subkernel functionality need to be written + + +def get_kernel_parameter_space(kernel): + return isl.Space.create_from_names(kernel.isl_context, + set=[], params=kernel.outer_params()).params() + + +def get_kernel_zero_pwqpolynomial(kernel): + space = get_kernel_parameter_space(kernel) + space = space.insert_dims(dim_type.out, 0, 1) + return isl.PwQPolynomial.zero(space) + # {{{ GuardedPwQPolynomial @@ -87,6 +108,10 @@ class GuardedPwQPolynomial(object): assert (_get_param_tuple(pwqpolynomial.space) == _get_param_tuple(valid_domain.space)) + @property + def space(self): + return self.valid_domain.space + def __add__(self, other): if isinstance(other, GuardedPwQPolynomial): return GuardedPwQPolynomial( @@ -143,7 +168,20 @@ class GuardedPwQPolynomial(object): # {{{ ToCountMap class ToCountMap(object): - """Maps any type of key to an arithmetic type. + """A map from work descriptors like :class:`Op` and :class:`MemAccess` + to any arithmetic type. + + .. automethod:: __getitem__ + .. automethod:: __str__ + .. automethod:: __repr__ + .. automethod:: __len__ + .. automethod:: get + .. automethod:: items + .. automethod:: keys + .. automethod:: values + + .. automethod:: copy + .. automethod:: with_set_attributes .. automethod:: filter_by .. automethod:: filter_by_func @@ -154,23 +192,20 @@ class ToCountMap(object): """ - def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial): - if init_dict is None: - init_dict = {} + def __init__(self, count_map=None): + if count_map is None: + count_map = {} - for val in init_dict.values(): - if isinstance(val, isl.PwQPolynomial): - assert val.dim(dim_type.out) - elif isinstance(val, GuardedPwQPolynomial): - assert val.pwqpolynomial.dim(dim_type.out) - self.count_map = init_dict - self.val_type = val_type + self.count_map = count_map + + def _zero(self): + return 0 def __add__(self, other): result = self.count_map.copy() for k, v in six.iteritems(other.count_map): result[k] = self.count_map.get(k, 0) + v - return ToCountMap(result, self.val_type) + return self.copy(count_map=result) def __radd__(self, other): if other != 0: @@ -178,32 +213,18 @@ class ToCountMap(object): "to {0} {1}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) + return self def __mul__(self, other): - if isinstance(other, GuardedPwQPolynomial): - return ToCountMap(dict( - (index, value*other) - for index, value in six.iteritems(self.count_map))) - else: - raise ValueError("ToCountMap: Attempted to multiply " - "ToCountMap by {0} {1}." - .format(type(other), other)) + return self.copy(dict( + (index, value*other) + for index, value in six.iteritems(self.count_map))) __rmul__ = __mul__ def __getitem__(self, index): - try: - return self.count_map[index] - except KeyError: - #TODO what is the best way to handle this? 
- if self.val_type is GuardedPwQPolynomial: - return GuardedPwQPolynomial.zero() - else: - return 0 - - def __setitem__(self, index, value): - self.count_map[index] = value + return self.count_map[index] def __repr__(self): return repr(self.count_map) @@ -225,17 +246,19 @@ class ToCountMap(object): def keys(self): return self.count_map.keys() - def pop(self, item): - return self.count_map.pop(item) + def values(self): + return self.count_map.values() + + def copy(self, count_map=None): + if count_map is None: + count_map = self.count_map - def copy(self): - return ToCountMap(dict(self.count_map), self.val_type) + return type(self)(count_map=count_map) def with_set_attributes(self, **kwargs): - return ToCountMap(dict( + return self.copy(count_map=dict( (key.copy(**kwargs), val) - for key, val in six.iteritems(self.count_map)), - self.val_type) + for key, val in six.iteritems(self.count_map))) def filter_by(self, **kwargs): """Remove items without specified key fields. @@ -262,28 +285,25 @@ class ToCountMap(object): """ - result_map = ToCountMap(val_type=self.val_type) - - from loopy.types import to_loopy_type - if 'dtype' in kwargs.keys(): - kwargs['dtype'] = [to_loopy_type(d) for d in kwargs['dtype']] - - # for each item in self.count_map - for self_key, self_val in self.items(): - try: - # check to see if key attribute values match all filters - for arg_field, allowable_vals in kwargs.items(): - attr_val = getattr(self_key, arg_field) - # see if the value is in the filter list - if attr_val not in allowable_vals: - break - else: # loop terminated without break or error - result_map[self_key] = self_val - except(AttributeError): - # the field passed is not a field of this key - continue - - return result_map + new_count_map = {} + + class _Sentinel: + pass + + new_kwargs = {} + for arg_field, allowable_vals in six.iteritems(kwargs): + if arg_field == "dtype": + from loopy.types import to_loopy_type + allowable_vals = [to_loopy_type(dtype) for dtype in allowable_vals] + + new_kwargs[arg_field] = allowable_vals + + for key, val in six.iteritems(self.count_map): + if all(getattr(key, arg_field, _Sentinel) in allowable_vals + for arg_field, allowable_vals in six.iteritems(new_kwargs)): + new_count_map[key] = val + + return self.copy(count_map=new_count_map) def filter_by_func(self, func): """Keep items that pass a test. 
@@ -310,14 +330,13 @@ class ToCountMap(object): """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} - # for each item in self.count_map, call func on the key - for self_key, self_val in self.items(): + for self_key, self_val in six.iteritems(self.count_map): if func(self_key): - result_map[self_key] = self_val + new_count_map[self_key] = self_val - return result_map + return self.copy(count_map=new_count_map) def group_by(self, *args): """Group map items together, distinguishing by only the key fields @@ -365,7 +384,7 @@ class ToCountMap(object): """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} # make sure all item keys have same type if self.count_map: @@ -374,22 +393,17 @@ class ToCountMap(object): raise ValueError("ToCountMap: group_by() function may only " "be used on ToCountMaps with uniform keys") else: - return result_map - - # for each item in self.count_map - for self_key, self_val in self.items(): - new_key = key_type() + return self - # set all specified fields - for field in args: - setattr(new_key, field, getattr(self_key, field)) + for self_key, self_val in six.iteritems(self.count_map): + new_key = key_type( + **dict( + (field, getattr(self_key, field)) + for field in args)) - if new_key in result_map.keys(): - result_map[new_key] += self_val - else: - result_map[new_key] = self_val + new_count_map[new_key] = new_count_map.get(new_key, 0) + self_val - return result_map + return self.copy(count_map=new_count_map) def to_bytes(self): """Convert counts to bytes using data type in map key. @@ -422,34 +436,69 @@ class ToCountMap(object): """ - result = self.copy() + new_count_map = {} - for key, val in self.items(): - bytes_processed = int(key.dtype.itemsize) * val - result[key] = bytes_processed + for key, val in six.iteritems(self.count_map): + new_count_map[key] = int(key.dtype.itemsize) * val - #TODO again, is this okay? - result.val_type = int - - return result + return self.copy(new_count_map) def sum(self): - """Add all counts in ToCountMap. - - :return: An :class:`islpy.PwQPolynomial` or :class:`int` containing the - sum of counts. + """:return: A sum of the values of the dictionary.""" - """ - - if self.val_type is GuardedPwQPolynomial: - total = GuardedPwQPolynomial.zero() - else: - total = 0 + total = self._zero() - for k, v in self.items(): + for k, v in six.iteritems(self.count_map): total += v + return total +# }}} + + +# {{{ ToCountPolynomialMap + +class ToCountPolynomialMap(ToCountMap): + """Maps any type of key to a :class:`islpy.PwQPolynomial` or a + :class:`GuardedPwQPolynomial`. 
+ """ + + def __init__(self, space, count_map=None): + if not isinstance(space, isl.Space): + raise TypeError( + "first argument to ToCountPolynomialMap must be " + "of type islpy.Space") + + assert space.is_params() + self.space = space + + space_param_tuple = _get_param_tuple(space) + + for key, val in six.iteritems(count_map): + if isinstance(val, isl.PwQPolynomial): + assert val.dim(dim_type.out) == 1 + elif isinstance(val, GuardedPwQPolynomial): + assert val.pwqpolynomial.dim(dim_type.out) == 1 + else: + raise TypeError("unexpected value type") + + assert _get_param_tuple(val.space) == space_param_tuple + + super(ToCountPolynomialMap, self).__init__(count_map) + + def _zero(self): + space = self.space.insert_dims(dim_type.out, 0, 1) + return isl.PwQPolynomial.zero(space) + + def copy(self, count_map=None, space=None): + if count_map is None: + count_map = self.count_map + + if space is None: + space = self.space + + return type(self)(space, count_map) + #TODO test and document def eval(self, params): result = self.copy() @@ -458,12 +507,11 @@ class ToCountMap(object): result.val_type = int return result - def eval_and_sum(self, params): - """Add all counts in :class:`ToCountMap` and evaluate with provided - parameter dict. + def eval_and_sum(self, params=None): + """Add all counts and evaluate with provided parameter dict *params* - :return: An :class:`int` containing the sum of all counts in the - :class:`ToCountMap` evaluated with the parameters provided. + :return: An :class:`int` containing the sum of all counts + evaluated with the parameters provided. Example usage:: @@ -478,6 +526,9 @@ class ToCountMap(object): # (now use these counts to, e.g., predict performance) """ + if params is None: + params = {} + return self.sum().eval_with_dict(params) # }}} @@ -504,35 +555,29 @@ def subst_into_guarded_pwqpolynomial(new_space, guarded_poly, subst_dict): def subst_into_to_count_map(space, tcm, subst_dict): from loopy.isl_helpers import subst_into_pwqpolynomial - result = {} + new_count_map = {} for key, value in six.iteritems(tcm.count_map): - # FIXME: This strips away the guards. Rather than being stripped, - # they should also have the substitution applied if isinstance(value, GuardedPwQPolynomial): - result[key] = subst_into_guarded_pwqpolynomial(space, value, subst_dict) + new_count_map[key] = subst_into_guarded_pwqpolynomial( + space, value, subst_dict) elif isinstance(value, isl.PwQPolynomial): - result[key] = subst_into_pwqpolynomial(space, value, subst_dict) + new_count_map[key] = subst_into_pwqpolynomial(space, value, subst_dict) elif isinstance(value, int): - result[key] = value + new_count_map[key] = value else: raise ValueError("unexpected value type") - return ToCountMap(result, val_type=isl.PwQPolynomial) + return tcm.copy(space=space, count_map=new_count_map) # }}} -def stringify_stats_mapping(m): - result = "" - for key in sorted(m.keys(), key=lambda k: str(k)): - result += ("%s : %s\n" % (key, m[key])) - return result - +# {{{ CountGranularity -class CountGranularity: +class CountGranularity(object): """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. @@ -558,10 +603,12 @@ class CountGranularity: WORKGROUP = "workgroup" ALL = [WORKITEM, SUBGROUP, WORKGROUP] +# }}} + # {{{ Op descriptor -class Op(Record): +class Op(ImmutableRecord): """A descriptor for a type of arithmetic operation. .. 
attribute:: dtype @@ -599,18 +646,14 @@ class Op(Record): raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - if dtype is None: - Record.__init__(self, dtype=dtype, name=name, - count_granularity=count_granularity, - kernel_name=kernel_name) - else: + + if dtype is not None: from loopy.types import to_loopy_type - Record.__init__(self, dtype=to_loopy_type(dtype), name=name, - count_granularity=count_granularity, - kernel_name=kernel_name) + dtype = to_loopy_type(dtype) - def __hash__(self): - return hash(repr(self)) + super(Op, self).__init__(dtype=dtype, name=name, + count_granularity=count_granularity, + kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness @@ -625,7 +668,7 @@ class Op(Record): # {{{ MemAccess descriptor -class MemAccess(Record): +class MemAccess(ImmutableRecord): """A descriptor for a type of memory access. .. attribute:: mtype @@ -698,24 +741,19 @@ class MemAccess(Record): "not allowed. count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - if dtype is None: - Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, - gid_strides=gid_strides, direction=direction, - variable=variable, variable_tag=variable_tag, - count_granularity=count_granularity, - kernel_name=kernel_name) - else: + if dtype is not None: from loopy.types import to_loopy_type - Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), - lid_strides=lid_strides, gid_strides=gid_strides, - direction=direction, variable=variable, - variable_tag=variable_tag, - count_granularity=count_granularity, - kernel_name=kernel_name) + dtype = to_loopy_type(dtype) + + super(MemAccess, self).__init__(mtype=mtype, dtype=dtype, + lid_strides=lid_strides, gid_strides=gid_strides, + direction=direction, variable=variable, + variable_tag=variable_tag, + count_granularity=count_granularity, + kernel_name=kernel_name) def __hash__(self): - # Note that this means lid_strides and gid_strides must be sorted - # in self.__repr__() + # dicts in gid_strides and lid_strides aren't natively hashable return hash(repr(self)) def __repr__(self): @@ -736,29 +774,97 @@ class MemAccess(Record): # }}} -# {{{ counter base +# {{{ Sync descriptor + +class Sync(ImmutableRecord): + """A descriptor for a type of synchronization. + + .. attribute:: kind + + A string describing the synchronization kind, e.g. ``"barrier_global"`` or + ``"barrier_local"`` or ``"kernel_launch"``. + + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. 
+ """ + + def __init__(self, kind=None, kernel_name=None): + super(Sync, self).__init__(kind=kind, kernel_name=kernel_name) + + def __repr__(self): + # Record.__repr__ overridden for consistent ordering and conciseness + return "Sync(%s, %s)" % (self.kind, self.kernel_name) + +# }}} + + +# {{{ CounterBase class CounterBase(CombineMapper): - def __init__(self, knl, callables_table): + def __init__(self, knl, callables_table, kernel_rec): self.knl = knl self.callables_table = callables_table + self.kernel_rec = kernel_rec + from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl, callables_table) + self.zero = get_kernel_zero_pwqpolynomial(self.knl) + self.one = self.zero + 1 + + @property + @memoize_method + def param_space(self): + return get_kernel_parameter_space(self.knl) + + def new_poly_map(self, count_map): + return ToCountPolynomialMap(self.param_space, count_map) + + def new_zero_poly_map(self): + return self.new_poly_map({}) + def combine(self, values): return sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() def map_call(self, expr): - return self.rec(expr.parameters) + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + from loopy.kernel.data import ValueArg + if isinstance(clbl, CallableKernel): + sub_result = self.kernel_rec(clbl.subkernel) + + assert len(clbl.subkernel.args) == len(expr.parameters) + arg_dict = dict( + (arg.name, value) + for arg, value in zip( + clbl.subkernel.args, + expr.parameters) + if isinstance(arg, ValueArg)) + + return subst_into_to_count_map( + self.param_space, + sub_result, arg_dict) \ + + self.rec(expr.parameters) + + else: + raise NotImplementedError() + + def map_call_with_kwargs(self, expr): + # FIXME + raise NotImplementedError() def map_sum(self, expr): if expr.children: return sum(self.rec(child) for child in expr.children) else: - return ToCountMap() + return self.new_zero_poly_map() map_product = map_sum @@ -798,68 +904,82 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, callables_table, count_within_subscripts=True): - self.knl = knl - self.callables_table = callables_table + def __init__(self, knl, callables_table, kernel_rec, + count_within_subscripts=True): + super(ExpressionOpCounter, self).__init__( + knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl, callables_table) + + # FIXME: Revert to SUBGROUP + arithmetic_count_granularity = CountGranularity.WORKITEM def combine(self, values): return sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() map_tagged_variable = map_constant map_variable = map_constant def map_call(self, expr): from loopy.symbolic import ResolvedFunction - if isinstance(expr.function, ResolvedFunction): - function_identifier = self.callables_table[ - expr.function.name].name + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.new_poly_map( + {Op(dtype=self.type_inf(expr), + name='func:'+clbl.name, + 
count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.parameters) else: - function_identifier = expr.function.name - - return ToCountMap( - {Op(dtype=self.type_inf(expr), - name='func:'+function_identifier, - count_granularity=CountGranularity.SUBGROUP): 1} - ) + self.rec(expr.parameters) + return super(ExpressionOpCounter, self).map_call(expr) def map_subscript(self, expr): if self.count_within_subscripts: return self.rec(expr.index) else: - return ToCountMap() + return self.new_zero_poly_map() + + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() def map_sum(self, expr): assert expr.children - return ToCountMap( + return self.new_poly_map( {Op(dtype=self.type_inf(expr), name='add', - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1} + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({Op(dtype=self.type_inf(expr), + return sum(self.new_poly_map({Op(dtype=self.type_inf(expr), name='mul', - count_granularity=CountGranularity.SUBGROUP): 1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): self.one}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({Op(dtype=self.type_inf(expr), + self.new_poly_map({Op(dtype=self.type_inf(expr), name='mul', - count_granularity=CountGranularity.SUBGROUP): -1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): -self.one}) def map_quotient(self, expr, *args): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='div', - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -867,32 +987,36 @@ class ExpressionOpCounter(CounterBase): map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='pow', - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='shift', - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='bw', - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='bw', - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1}) \ + 
count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)}) \ + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or @@ -913,9 +1037,10 @@ class ExpressionOpCounter(CounterBase): + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name='maxmin', - count_granularity=CountGranularity.SUBGROUP): + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -956,6 +1081,8 @@ class _IndexStrideCoefficientCollector(CoefficientCollector): # }}} +# {{{ _get_lid_and_gid_strides + def _get_lid_and_gid_strides(knl, array, index): # find all local and global index tags and corresponding inames from loopy.symbolic import get_dependencies @@ -1024,28 +1151,50 @@ def _get_lid_and_gid_strides(knl, array, index): return get_iname_strides(lid_to_iname), get_iname_strides(gid_to_iname) +# }}} + + +# {{{ MemAccessCounterBase + +class MemAccessCounterBase(CounterBase): + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() + + def map_call(self, expr): + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.rec(expr.parameters) + else: + return super(MemAccessCounterBase, self).map_call(expr) -class MemAccessCounter(CounterBase): - pass +# }}} # {{{ LocalMemAccessCounter -class LocalMemAccessCounter(MemAccessCounter): +class LocalMemAccessCounter(MemAccessCounterBase): + # FIXME: Revert to SUBGROUP + local_mem_count_granularity = CountGranularity.WORKITEM + def count_var_access(self, dtype, name, index): - sub_map = ToCountMap() + count_map = {} if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( array.address_space == AddressSpace.LOCAL): if index is None: # no subscript - sub_map[MemAccess( + count_map[MemAccess( mtype='local', dtype=dtype, - count_granularity=CountGranularity.SUBGROUP) - ] = 1 - return sub_map + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one + return self.new_poly_map(count_map) array = self.knl.temporary_variables[name] @@ -1057,15 +1206,16 @@ class LocalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - sub_map[MemAccess( + count_map[MemAccess( mtype='local', dtype=dtype, lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), variable=name, - count_granularity=CountGranularity.SUBGROUP)] = 1 + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one - return sub_map + return self.new_poly_map(count_map) def map_variable(self, expr): return self.count_var_access( @@ -1084,7 +1234,7 @@ class LocalMemAccessCounter(MemAccessCounter): # {{{ GlobalMemAccessCounter -class GlobalMemAccessCounter(MemAccessCounter): +class GlobalMemAccessCounter(MemAccessCounterBase): def map_variable(self, expr): name = expr.name @@ -1092,17 +1242,18 @@ class GlobalMemAccessCounter(MemAccessCounter): array = self.knl.arg_dict[name] else: # this is a temporary variable - return 
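Memory accesses, in turn, are keyed by `MemAccess` records; `lid_strides` and `gid_strides` map hardware-axis numbers to the stride (in array elements) that the index expression takes along that axis, as computed by `_get_lid_and_gid_strides` above. As a rough, hand-made illustration, an access like `a[i + 16*j]` with `i` tagged `l.0` and `j` tagged `g.0` would be keyed approximately as:

    import numpy as np
    import loopy as lp
    from loopy.statistics import CountGranularity

    key = lp.MemAccess(
            mtype="global",
            dtype=np.float32,
            lid_strides={0: 1},     # unit stride along local axis 0
            gid_strides={0: 16},    # stride 16 along group axis 0
            variable="a",
            direction="load",
            count_granularity=CountGranularity.WORKITEM,
            kernel_name="loopy_kernel")
    print(key)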
ToCountMap() + return self.new_zero_poly_map() if not isinstance(array, lp.ArrayArg): # this array is not in global memory - return ToCountMap() + return self.new_zero_poly_map() - return ToCountMap({MemAccess(mtype='global', - dtype=self.type_inf(expr), lid_strides={}, - gid_strides={}, variable=name, - count_granularity=CountGranularity.WORKITEM): 1} - ) + self.rec(expr.index) + return self.new_poly_map({MemAccess(mtype='global', + dtype=self.type_inf(expr), lid_strides={}, + gid_strides={}, variable=name, + count_granularity=CountGranularity.WORKITEM, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.index) def map_subscript(self, expr): name = expr.aggregate.name @@ -1128,19 +1279,28 @@ class GlobalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - count_granularity = CountGranularity.WORKITEM if ( - 0 in lid_strides and lid_strides[0] != 0 - ) else CountGranularity.SUBGROUP + # FIXME: Revert to subgroup + global_access_count_granularity = CountGranularity.WORKITEM - return ToCountMap({MemAccess( + # Account for broadcasts once per subgroup + count_granularity = CountGranularity.WORKITEM if ( + # if the stride in lid.0 is known + 0 in lid_strides + and + # it is nonzero + lid_strides[0] != 0 + ) else global_access_count_granularity + + return self.new_poly_map({MemAccess( mtype='global', dtype=self.type_inf(expr), lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), variable=name, variable_tag=var_tag, - count_granularity=count_granularity - ): 1} + count_granularity=count_granularity, + kernel_name=self.knl.name, + ): self.one} ) + self.rec(expr.index_tuple) # }}} @@ -1216,7 +1376,9 @@ class AccessFootprintGatherer(CombineMapper): # {{{ count def add_assumptions_guard(kernel, pwqpolynomial): - return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions) + return GuardedPwQPolynomial( + pwqpolynomial, + kernel.assumptions.align_params(pwqpolynomial.space)) def count(kernel, set, space=None): @@ -1319,7 +1481,7 @@ def count(kernel, set, space=None): def get_unused_hw_axes_factor(knl, callables_table, insn, - disregard_local_axes, space=None): + disregard_local_axes): # FIXME: Multi-kernel support gsize, lsize = knl.get_grid_size_upper_bounds(callables_table) @@ -1338,12 +1500,12 @@ def get_unused_hw_axes_factor(knl, callables_table, insn, g_used.add(tag.axis) def mult_grid_factor(used_axes, size): - result = 1 + result = get_kernel_zero_pwqpolynomial(knl) + 1 + for iaxis, size in enumerate(size): if iaxis not in used_axes: if not isinstance(size, int): - if space is not None: - size = size.align_params(space) + size = size.align_params(result.space) size = isl.PwQPolynomial.from_pw_aff(size) @@ -1359,6 +1521,16 @@ def get_unused_hw_axes_factor(knl, callables_table, insn, return add_assumptions_guard(knl, result) +def count_inames_domain(knl, inames): + space = get_kernel_parameter_space(knl) + if not inames: + return get_kernel_zero_pwqpolynomial(knl) + 1 + + inames_domain = knl.get_inames_domain(inames) + domain = inames_domain.project_out_except(inames, [dim_type.set]) + return count(knl, domain, space=space) + + def count_insn_runs(knl, callables_table, insn, count_redundant_work, disregard_local_axes=False): @@ -1370,18 +1542,11 @@ def count_insn_runs(knl, callables_table, insn, count_redundant_work, [iname for iname in insn_inames if not knl.iname_tags_of_type(iname, LocalIndexTag)]) - inames_domain = knl.get_inames_domain(insn_inames) - domain = 
(inames_domain.project_out_except( - insn_inames, [dim_type.set])) - - space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, - set=[], params=knl.outer_params()) - - c = count(knl, domain, space=space) + c = count_inames_domain(knl, insn_inames) if count_redundant_work: unused_fac = get_unused_hw_axes_factor(knl, callables_table, - insn, disregard_local_axes=disregard_local_axes, space=space) + insn, disregard_local_axes=disregard_local_axes) return c * unused_fac else: return c @@ -1412,7 +1577,8 @@ def _get_insn_count(knl, callables_table, insn_id, subgroup_size, if count_granularity == CountGranularity.WORKGROUP: return ct_disregard_local elif count_granularity == CountGranularity.SUBGROUP: - # get the group size + # {{{ compute workgroup_size + from loopy.symbolic import aff_to_expr _, local_size = knl.get_grid_size_upper_bounds(callables_table) workgroup_size = 1 @@ -1425,15 +1591,18 @@ def _get_insn_count(knl, callables_table, insn_id, subgroup_size, % (CountGranularity.SUBGROUP, local_size)) workgroup_size *= s + # }}} + warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=%s, using upper bound for work-group size " "(%d work-items) to compute sub-groups per work-group. When " - "multiple device programs present, actual sub-group count may be" + "multiple device programs present, actual sub-group count may be " "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size)) from pytools import div_ceil return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) + else: # this should not happen since this is enforced in Op/MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" @@ -1445,9 +1614,9 @@ def _get_insn_count(knl, callables_table, insn_id, subgroup_size, # {{{ get_op_map -def get_op_map_for_single_kernel(knl, callables_table, - numpy_types=True, count_redundant_work=False, - count_within_subscripts=True, subgroup_size=None): +def _get_op_map_for_single_kernel(knl, callables_table, + count_redundant_work, + count_within_subscripts, subgroup_size): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " @@ -1455,9 +1624,15 @@ def get_op_map_for_single_kernel(knl, callables_table, subgroup_size = _process_subgroup_size(knl, subgroup_size) - op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl, callables_table, + kernel_rec = partial(_get_op_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) + + op_counter = ExpressionOpCounter(knl, callables_table, kernel_rec, count_within_subscripts) + op_map = op_counter.new_zero_poly_map() from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1465,14 +1640,12 @@ def get_op_map_for_single_kernel(knl, callables_table, for insn in knl.instructions: if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - ops = op_counter(insn.assignee) + op_counter(insn.expression) + ops = op_counter(insn.assignees) + op_counter(insn.expression) for key, val in six.iteritems(ops.count_map): - op_map = ( - op_map - + ToCountMap({key: val}) - * _get_insn_count(knl, callables_table, insn.id, + count = _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, - key.count_granularity)) + key.count_granularity) + op_map = op_map + ToCountMap({key: val}) * count elif isinstance(insn, 
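To make the sub-group scaling concrete: for SUBGROUP-granularity keys, the count with local axes projected out is multiplied by the number of sub-groups per work-group, i.e. the work-group size divided by the sub-group size, rounded up. A small numerical sketch (both sizes are hypothetical):

    from pytools import div_ceil

    workgroup_size = 16 * 16   # e.g. a 16x16 work-group
    subgroup_size = 32         # e.g. warp/wavefront-like sub-groups
    subgroups_per_group = div_ceil(workgroup_size, subgroup_size)
    print(subgroups_per_group)  # 8: each per-sub-group count is scaled by this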
(NoOpInstruction, BarrierInstruction)): pass @@ -1480,15 +1653,7 @@ def get_op_map_for_single_kernel(knl, callables_table, raise NotImplementedError("unexpected instruction item type: '%s'" % type(insn).__name__) - if numpy_types: - return ToCountMap( - init_dict=dict( - (op.copy(dtype=op.dtype.numpy_dtype), ct) - for op, ct in six.iteritems(op_map.count_map)), - val_type=op_map.val_type - ) - else: - return op_map + return op_map def get_op_map(program, numpy_types=True, count_redundant_work=False, @@ -1498,10 +1663,6 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. @@ -1519,7 +1680,7 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, specifies that it should only be counted once per sub-group. If set to *None* an attempt to find the sub-group size using the device will be made, if this fails an error will be raised. If a :class:`str` - ``'guess'`` is passed as the subgroup_size, get_mem_access_map will + ``'guess'`` is passed as the subgroup_size, :func:`get_op_map` will attempt to find the sub-group size using the device and, if unsuccessful, will make a wild guess. @@ -1556,34 +1717,28 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, program = make_program(program) from loopy.preprocess import preprocess_program, infer_unknown_types - program = infer_unknown_types(program, expect_completion=True) program = preprocess_program(program) - op_map = ToCountMap() - - callables_count = ( - program.callables_table.callables_count) - - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - knl = in_knl_callable.subkernel - knl_op_map = get_op_map_for_single_kernel(knl, - program.callables_table, numpy_types, count_redundant_work, - count_within_subscripts, subgroup_size) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) - for i in range(callables_count[func_id]): - op_map += knl_op_map - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callabke types %s." % ( - type(in_knl_callable).__name__)) + if numpy_types is not None: + from warnings import warn + warn("numpy_types is being ignored and will be removed in 2020.", + DeprecationWarning, stacklevel=2) - return op_map + return _get_op_map_for_single_kernel( + program[program.name], program.callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) # }}} +# {{{ subgoup size finding + def _find_subgroup_size_for_knl(knl): from loopy.target.pyopencl import PyOpenCLTarget if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None: @@ -1635,11 +1790,13 @@ def _process_subgroup_size(knl, subgroup_size_requested): "must be integer, 'guess', or, if you're feeling " "lucky, None." 
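A short usage sketch for the reworked single-program entry point (the kernel, its arguments and the parameter values are made up for illustration):

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i] + 7",
            [lp.GlobalArg("a", np.float32, shape=("n",)),
             lp.GlobalArg("out", np.float32, shape=("n",)),
             "..."])

    op_map = lp.get_op_map(knl, subgroup_size=32, count_redundant_work=True)
    params = {"n": 1024}
    f32_ops = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    print(f32_ops)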
% (subgroup_size_requested)) +# }}} + # {{{ get_mem_access_map -def get_mem_access_map_for_single_kernel(knl, callables_table, - numpy_types=True, count_redundant_work=False, subgroup_size=None): +def _get_mem_access_map_for_single_kernel(knl, callables_table, + count_redundant_work, subgroup_size): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " @@ -1647,9 +1804,16 @@ def get_mem_access_map_for_single_kernel(knl, callables_table, subgroup_size = _process_subgroup_size(knl, subgroup_size) - access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl, callables_table) - access_counter_l = LocalMemAccessCounter(knl, callables_table) + kernel_rec = partial(_get_mem_access_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) + + access_counter_g = GlobalMemAccessCounter( + knl, callables_table, kernel_rec) + access_counter_l = LocalMemAccessCounter( + knl, callables_table, kernel_rec) + access_map = access_counter_g.new_zero_poly_map() from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, @@ -1657,62 +1821,39 @@ def get_mem_access_map_for_single_kernel(knl, callables_table, for insn in knl.instructions: if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - access_expr = ( - access_counter_g(insn.expression) - + access_counter_l(insn.expression) - ).with_set_attributes(direction="load") - - access_assignee = ( - access_counter_g(insn.assignee) - + access_counter_l(insn.assignee) - ).with_set_attributes(direction="store") - - for key, val in six.iteritems(access_expr.count_map): - - access_map = ( - access_map - + ToCountMap({key: val}) - * _get_insn_count(knl, callables_table, insn.id, - subgroup_size, count_redundant_work, - key.count_granularity)) - - for key, val in six.iteritems(access_assignee.count_map): - - access_map = ( - access_map - + ToCountMap({key: val}) - * _get_insn_count(knl, callables_table, insn.id, + insn_access_map = ( + access_counter_g(insn.expression) + + access_counter_l(insn.expression) + ).with_set_attributes(direction="load") + for assignee in insn.assignees: + insn_access_map = insn_access_map + ( + access_counter_g(insn.assignee) + + access_counter_l(insn.assignee) + ).with_set_attributes(direction="store") + + for key, val in six.iteritems(insn_access_map.count_map): + count = _get_insn_count(knl, callables_table, insn.id, subgroup_size, count_redundant_work, - key.count_granularity)) + key.count_granularity) + access_map = access_map + ToCountMap({key: val}) * count elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): pass + else: raise NotImplementedError("unexpected instruction item type: '%s'" % type(insn).__name__) - if numpy_types: - return ToCountMap( - init_dict=dict( - (mem_access.copy(dtype=mem_access.dtype.numpy_dtype), ct) - for mem_access, ct in six.iteritems(access_map.count_map)), - val_type=access_map.val_type - ) - else: - return access_map + return access_map -def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, +def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, subgroup_size=None): """Count the number of memory accesses in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.LoopyType`. 
- :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. @@ -1790,62 +1931,46 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False, """ from loopy.preprocess import preprocess_program, infer_unknown_types - program = infer_unknown_types(program, expect_completion=True) program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) - access_map = ToCountMap() - - callables_count = program.callables_table.callables_count - - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - knl = in_knl_callable.subkernel - knl_access_map = get_mem_access_map_for_single_kernel(knl, - program.callables_table, numpy_types, - count_redundant_work, subgroup_size) - - # FIXME: didn't see any easy way to multiply - for i in range(callables_count[func_id]): - access_map += knl_access_map - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callabke types %s." % ( - type(in_knl_callable).__name__)) + if numpy_types is not None: + from warnings import warn + warn("numpy_types is being ignored and will be removed in 2020.", + DeprecationWarning, stacklevel=2) - return access_map + return _get_mem_access_map_for_single_kernel( + program[program.name], program.callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) # }}} # {{{ get_synchronization_map -def get_synchronization_map_for_single_kernel(knl, callables_table, +def _get_synchronization_map_for_single_kernel(knl, callables_table, subgroup_size=None): if not knl.options.ignore_boostable_into: raise LoopyError("Kernel '%s': Using operation counting requires the option " "ignore_boostable_into to be set." % knl.name) + knl = lp.get_one_scheduled_kernel(knl, callables_table) + from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) - from operator import mul - knl = lp.get_one_scheduled_kernel(knl, callables_table) - iname_list = [] - result = ToCountMap() + kernel_rec = partial(_get_synchronization_map_for_single_kernel, + callables_table=callables_table, + subgroup_size=subgroup_size) - one = isl.PwQPolynomial('{ 1 }') + sync_counter = CounterBase(knl, callables_table, kernel_rec) + sync_map = sync_counter.new_zero_poly_map() - def get_count_poly(iname_list): - if iname_list: - ct = (count(knl, ( - knl.get_inames_domain(iname_list). 
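And analogously for memory access counting, using the same made-up kernel as in the op-counting sketch above, with loads and stores separated via `filter_by`:

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i] + 7",
            [lp.GlobalArg("a", np.float32, shape=("n",)),
             lp.GlobalArg("out", np.float32, shape=("n",)),
             "..."])

    mem_map = lp.get_mem_access_map(knl, subgroup_size=32,
            count_redundant_work=True)
    params = {"n": 1024}
    loads = mem_map.filter_by(mtype=["global"], direction=["load"]
            ).eval_and_sum(params)
    stores = mem_map.filter_by(mtype=["global"], direction=["store"]
            ).eval_and_sum(params)
    print(loads, stores)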
- project_out_except(iname_list, [dim_type.set]) - )), ) - return reduce(mul, ct) - else: - return one + iname_list = [] for sched_item in knl.schedule: if isinstance(sched_item, EnterLoop): @@ -1856,22 +1981,27 @@ def get_synchronization_map_for_single_kernel(knl, callables_table, iname_list.pop() elif isinstance(sched_item, Barrier): - result = result + ToCountMap({"barrier_%s" % - sched_item.synchronization_kind: - get_count_poly(iname_list)}) + sync_map = sync_map + ToCountMap( + {Sync( + "barrier_%s" % sched_item.synchronization_kind, + knl.name): count_inames_domain(knl, frozenset(iname_list))}) + + elif isinstance(sched_item, RunInstruction): + pass elif isinstance(sched_item, CallKernel): - result = result + ToCountMap( - {"kernel_launch": get_count_poly(iname_list)}) + sync_map = sync_map + ToCountMap( + {Sync("kernel_launch", knl.name): + count_inames_domain(knl, frozenset(iname_list))}) - elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)): + elif isinstance(sched_item, ReturnFromKernel): pass else: raise LoopyError("unexpected schedule item: %s" % type(sched_item).__name__) - return result + return sync_map def get_synchronization_map(program, subgroup_size=None): @@ -1913,45 +2043,21 @@ def get_synchronization_map(program, subgroup_size=None): from loopy.preprocess import preprocess_program, infer_unknown_types - program = infer_unknown_types(program, expect_completion=True) program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) - sync_map = ToCountMap() - callables_count = program.callables_table.callables_count - - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - knl = in_knl_callable.subkernel - knl_sync_map = get_synchronization_map_for_single_kernel(knl, - program.callables_table, subgroup_size) - - # FIXME: didn't see any easy way to multiply - for i in range(callables_count[func_id]): - sync_map += knl_sync_map - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callabke types %s." % ( - type(in_knl_callable).__name__)) - - return sync_map + return _get_synchronization_map_for_single_kernel( + program[program.name], program.callables_table, + subgroup_size=subgroup_size) # }}} # {{{ gather_access_footprints -def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False): - """Return a dictionary mapping ``(var_name, direction)`` to - :class:`islpy.Set` instances capturing which indices of each the array - *var_name* are read/written (where *direction* is either ``read`` or - ``write``. - - :arg ignore_uncountable: If *False*, an error will be raised for accesses - on which the footprint cannot be determined (e.g. data-dependent or - nonlinear indices) - """ - +def _gather_access_footprints_for_single_kernel(kernel, ignore_uncountable): write_footprints = [] read_footprints = [] @@ -1978,6 +2084,16 @@ def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False) def gather_access_footprints(program, ignore_uncountable=False): + """Return a dictionary mapping ``(var_name, direction)`` to + :class:`islpy.Set` instances capturing which indices of each the array + *var_name* are read/written (where *direction* is either ``read`` or + ``write``. 
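With this change the synchronization map is keyed by `Sync` records rather than plain strings, so lookups go through `filter_by(kind=...)`. A usage sketch, mirroring the `test_barrier_count_single` test added further below:

    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<128}",
            """
            <> c[i] = 15*i {id=yoink}
            c[i+1] = c[i] {dep=yoink}
            """)
    knl = lp.tag_inames(knl, {"i": "l.0"})

    sync_map = lp.get_synchronization_map(knl)
    print(sync_map.filter_by(kind="kernel_launch").eval_and_sum())  # one launch
    print(sync_map.filter_by(kind="barrier_local").eval_and_sum())  # one barrier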
+ + :arg ignore_uncountable: If *False*, an error will be raised for accesses + on which the footprint cannot be determined (e.g. data-dependent or + nonlinear indices) + """ + # FIMXE: works only for one callable kernel till now. if len([in_knl_callable for in_knl_callable in program.callables_table.values() if isinstance(in_knl_callable, @@ -1987,31 +2103,16 @@ def gather_access_footprints(program, ignore_uncountable=False): from loopy.preprocess import preprocess_program, infer_unknown_types - program = infer_unknown_types(program, expect_completion=True) program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) write_footprints = [] read_footprints = [] - callables_count = program.callables_table.callables_count - - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - knl = in_knl_callable.subkernel - knl_write_footprints, knl_read_footprints = ( - gather_access_footprints_for_single_kernel(knl, - ignore_uncountable)) - - # FIXME: didn't see any easy way to multiply - for i in range(callables_count[func_id]): - write_footprints.extend(knl_write_footprints) - read_footprints.extend(knl_read_footprints) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callabke types %s." % ( - type(in_knl_callable).__name__)) + write_footprints, read_footprints = _gather_access_footprints_for_single_kernel( + program[program.name], ignore_uncountable) write_footprints = AccessFootprintGatherer.combine(write_footprints) read_footprints = AccessFootprintGatherer.combine(read_footprints) diff --git a/test/test_statistics.py b/test/test_statistics.py index 41a88b386..cadca9fc1 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -218,16 +218,25 @@ def test_op_counter_bitwise(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(params) - i32bw = op_map[lp.Op(np.int32, 'bw', CG.SUBGROUP)].eval_with_dict(params) - i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP) - ].eval_with_dict(params) - i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP) - ].eval_with_dict(params) - i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP) - ].eval_with_dict(params) - i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP) - ].eval_with_dict(params) + print(op_map) + i32add = op_map[ + lp.Op(np.int32, 'add', CG.SUBGROUP, 'bitwise') + ].eval_with_dict(params) + i32bw = op_map[ + lp.Op(np.int32, 'bw', CG.SUBGROUP, 'bitwise') + ].eval_with_dict(params) + i64bw = op_map[ + lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP, 'bitwise') + ].eval_with_dict(params) + i64mul = op_map[ + lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP, 'bitwise') + ].eval_with_dict(params) + i64add = op_map[ + lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP, 'bitwise') + ].eval_with_dict(params) + i64shift = op_map[ + lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP, 'bitwise') + ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert i32add == n*m*ell*n_subgroups assert i32bw == 2*n*m*ell*n_subgroups @@ -922,11 +931,10 @@ def test_barrier_counter_nobarriers(): ell = 128 params = {'n': n, 'm': m, 'ell': ell} assert len(sync_map) == 1 - assert sync_map["kernel_launch"].eval_with_dict(params) == 1 + assert 
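A brief sketch of the footprint interface in its single-root-kernel form (the kernel and dtypes are invented for illustration; note that `count` is applied to the root kernel, as in the updated tests below):

    import numpy as np
    import loopy as lp
    from loopy.statistics import gather_access_footprints, count

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = a[i] + a[i+1]",
            [lp.GlobalArg("a", np.float64, shape=("n+1",)),
             lp.GlobalArg("out", np.float64, shape=("n",)),
             "..."])

    fp = gather_access_footprints(knl)  # keys are (variable_name, direction)
    for key, footprint in fp.items():
        print(key, count(knl.root_kernel, footprint))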
sync_map.filter_by(kind="kernel_launch").eval_and_sum(params) == 1 def test_barrier_counter_barriers(): - knl = lp.make_kernel( "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", [ @@ -948,10 +956,25 @@ def test_barrier_counter_barriers(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - barrier_count = sync_map["barrier_local"].eval_with_dict(params) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum(params) assert barrier_count == 50*10*2 +def test_barrier_count_single(): + knl = lp.make_kernel( + "{[i]: 0<=i<128}", + """ + <> c[i] = 15*i {id=yoink} + c[i+1] = c[i] {dep=yoink} + """) + + knl = lp.tag_inames(knl, {"i": "l.0"}) + sync_map = lp.get_synchronization_map(knl) + print(sync_map) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum() + assert barrier_count == 1 + + def test_all_counters_parallel_matmul(): bsize = 16 knl = lp.make_kernel( @@ -978,8 +1001,8 @@ def test_all_counters_parallel_matmul(): sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 - assert sync_map["kernel_launch"].eval_with_dict(params) == 1 - assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize + assert sync_map.filter_by(kind="kernel_launch").eval_and_sum(params) == 1 + assert sync_map.filter_by(kind="barrier_local").eval_and_sum(params) == 2*m/bsize op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ @@ -1096,9 +1119,8 @@ def test_floor_div_coefficient_collector(): n_subgroups = n_workgroups*subgroups_per_group # count local f32 accesses - f32_local = lp.get_mem_access_map( - knl, count_redundant_work=True, subgroup_size=SGS - ).filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) + m = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) + f32_local = m.filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) # (count-per-sub-group)*n_subgroups assert f32_local == 2*(rept+1)*n_subgroups @@ -1176,7 +1198,7 @@ def test_gather_access_footprint(): fp = gather_access_footprints(knl) for key, footprint in six.iteritems(fp): - print(key, count(knl, footprint)) + print(key, count(knl.root_kernel, footprint)) def test_gather_access_footprint_2(): @@ -1191,8 +1213,8 @@ def test_gather_access_footprint_2(): params = {"n": 200} for key, footprint in six.iteritems(fp): - assert count(knl, footprint).eval_with_dict(params) == 200 - print(key, count(knl, footprint)) + assert count(knl.root_kernel, footprint).eval_with_dict(params) == 200 + print(key, count(knl.root_kernel, footprint)) def test_summations_and_filters(): @@ -1316,8 +1338,8 @@ def test_strided_footprint(): x_l_foot = footprints[('x', 'read')] from loopy.statistics import count - num = count(knl, x_l_foot).eval_with_dict(param_dict) - denom = count(knl, x_l_foot.remove_divs()).eval_with_dict(param_dict) + num = count(knl.root_kernel, x_l_foot).eval_with_dict(param_dict) + denom = count(knl.root_kernel, x_l_foot.remove_divs()).eval_with_dict(param_dict) assert 2*num < denom -- GitLab From 88ea1329f6157e8fb6444dd62b635b5c08902612 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 11 Jun 2019 13:21:29 -0500 Subject: [PATCH 544/916] move dump_as_python to loopy.tools --- loopy/__init__.py | 4 +- loopy/tools.py | 107 ++++++++++++++++++++++++++++- loopy/transform/write_to_python.py | 104 ---------------------------- 3 files changed, 108 insertions(+), 107 deletions(-) delete mode 100644 loopy/transform/write_to_python.py diff --git a/loopy/__init__.py b/loopy/__init__.py index 
7dddf612e..fdfda32c7 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -120,7 +120,6 @@ from loopy.transform.batch import to_batched, save_temporaries_in_loop from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier -from loopy.transform.write_to_python import write_to_python from loopy.transform.callable import (register_callable_kernel, register_function_id_to_in_knl_callable_mapper, inline_callable_kernel) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call @@ -157,6 +156,7 @@ from loopy.target.ispc import ISPCTarget from loopy.target.numba import NumbaTarget, NumbaCudaTarget from loopy.tools import Optional +from loopy.tools import dump_as_python __all__ = [ @@ -241,7 +241,7 @@ __all__ = [ "add_barrier", - "write_to_python", + "dump_as_python", "register_callable_kernel", "register_function_id_to_in_knl_callable_mapper", diff --git a/loopy/tools.py b/loopy/tools.py index 56942820d..4000904fb 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -38,7 +38,9 @@ from pymbolic.mapper.persistent_hash import ( PersistentHashWalkMapper as PersistentHashWalkMapperBase) import six # noqa from six.moves import intern - +import re +from mako.template import Template +import loopy as lp if six.PY2: def is_integer(obj): @@ -704,4 +706,107 @@ def natorder(key): def natsorted(seq, key=lambda x: x): return sorted(seq, key=lambda y: natorder(key(y))) + +def dump_as_python(kernel, filename=None): + """ + Generates a python code for generating *kernel* for sharing kernels. + + :arg kernel: An instance of :class:`loopy.LoopKernel` + :arg filename: An instance of :class:`str`. If *None*, then prints the + python file to *stdout*. + """ + + options = [] + + printed_insn_ids = set() + printed_insn_order = [] + + def insert_insn_into_order(insn): + if insn.id in printed_insn_ids: + return + printed_insn_ids.add(insn.id) + + for dep_id in natsorted(insn.depends_on): + insert_insn_into_order(kernel.id_to_insn[dep_id]) + + printed_insn_order.append(insn) + + for insn in kernel.instructions: + insert_insn_into_order(insn) + + for insn in printed_insn_order: + option = 'id=%s, ' % insn.id + if insn.depends_on: + option += ("dep="+":".join(insn.depends_on)+", ") + if insn.tags: + option += ("tags="+":".join(insn.tags)+", ") + if insn.within_inames: + option += ("inames="+":".join(insn.within_inames)+", ") + if isinstance(insn, lp.MultiAssignmentBase): + if insn.atomicity: + option += "atomic, " + elif isinstance(insn, lp.BarrierInstruction): + option += ("mem_kind=%s, " % insn.mem_kind) + options.append(option[:-2]) + + insn_x_options = zip(printed_insn_order, options) + + python_code = r'''<%! import loopy as lp %>import loopy as lp + import numpy as np + <%! tv_scope = {0: 'lp.AddressSpace.PRIVATE', 1: 'lp.AddressSpace.LOCAL', + 2: 'lp.AddressSpace.GLOBAL', lp.auto: 'lp.auto' } %> + knl = lp.make_kernel( + [ + % for dom in domains: + "${str(dom)}", + % endfor + ], + """ + % for insn, opts in insn_x_opts: + % if isinstance(insn, lp.Assignment): + ${insn.assignee} = ${insn.expression} {${opts}} + % elif isinstance(insn, lp.BarrierInstruction): + ... ${insn.synchronization_kind[0]}barrier{${opts}} + % elif isinstance(insn, lp.NoOpInstruction): + ... 
nop {${opts}} + % else: + **Not implemented for ${type(insn)}** + % endif + %endfor + """, [ + % for arg in args: + % if isinstance(arg, lp.ValueArg): + lp.ValueArg( + name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}), + % else: + lp.GlobalArg( + name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}, + shape=${arg.shape}, for_atomic=${arg.for_atomic}), + % endif + % endfor + % for tv in temp_vars: + lp.TemporaryVariable( + name='${tv.name}', dtype=np.${tv.dtype.numpy_dtype.name}, + shape=${tv.shape}, for_atomic=${tv.for_atomic}, + address_space=${tv_scope[tv.address_space]}, + read_only=${tv.read_only}, + % if tv.initializer is not None: + initializer=${"np."+str((tv.initializer).__repr__())}, + % endif + ), + % endfor + ], lang_version=${lp.VERSION})''' + + python_code = Template(python_code).render(insn_x_opts=insn_x_options, + domains=kernel.domains, args=kernel.args, + temp_vars=[k for k in kernel.temporary_variables.values()]) + + python_code = re.sub("\\n ", "\n", python_code) + if filename: + with open(filename, 'w') as f: + f.write(python_code) + else: + print(python_code) + + # vim: foldmethod=marker diff --git a/loopy/transform/write_to_python.py b/loopy/transform/write_to_python.py deleted file mode 100644 index 9a863bcd7..000000000 --- a/loopy/transform/write_to_python.py +++ /dev/null @@ -1,104 +0,0 @@ -import re -from mako.template import Template -import loopy as lp -from loopy.tools import natsorted - - -def write_to_python(kernel, filename=None): - """ - Generates a python code for generating *kernel* for sharing kernels. - - :arg kernel: An instance of :class:`loopy.LoopKernel` - :arg filename: An instance of :class:`str`. If *None*, then prints the - python file to *stdout*. - """ - - options = [] - - printed_insn_ids = set() - printed_insn_order = [] - - def insert_insn_into_order(insn): - if insn.id in printed_insn_ids: - return - printed_insn_ids.add(insn.id) - - for dep_id in natsorted(insn.depends_on): - insert_insn_into_order(kernel.id_to_insn[dep_id]) - - printed_insn_order.append(insn) - - for insn in kernel.instructions: - insert_insn_into_order(insn) - - for insn in printed_insn_order: - option = 'id=%s, ' % insn.id - if insn.depends_on: - option += ("dep="+":".join(insn.depends_on)+", ") - if insn.tags: - option += ("tags="+":".join(insn.tags)+", ") - if insn.within_inames: - option += ("inames="+":".join(insn.within_inames)+", ") - if isinstance(insn, lp.MultiAssignmentBase): - if insn.atomicity: - option += "atomic, " - elif isinstance(insn, lp.BarrierInstruction): - option += ("mem_kind=%s, " % insn.mem_kind) - options.append(option[:-2]) - - insn_x_options = zip(printed_insn_order, options) - - python_code = r'''<%! import loopy as lp %>import loopy as lp - import numpy as np - <%! tv_scope = {0: 'lp.AddressSpace.PRIVATE', 1: 'lp.AddressSpace.LOCAL', - 2: 'lp.AddressSpace.GLOBAL', lp.auto: 'lp.auto' } %> - knl = lp.make_kernel( - [ - % for dom in domains: - "${str(dom)}", - % endfor - ], - """ - % for insn, opts in insn_x_opts: - % if isinstance(insn, lp.Assignment): - ${insn.assignee} = ${insn.expression} {${opts}} - % elif isinstance(insn, lp.BarrierInstruction): - ... 
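The relocated helper keeps its behavior: given a kernel with fully typed arguments it emits Python source that rebuilds the kernel. A usage sketch (the kernel and file name are invented; `root_kernel` is passed because the helper documents a `LoopKernel` argument):

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<16}",
            "out[i] = 2*i",
            [lp.GlobalArg("out", np.int32, shape=(16,))])

    # write a Python file that regenerates the kernel; filename=None prints it
    lp.dump_as_python(knl.root_kernel, filename="regenerate_kernel.py")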
${insn.synchronization_kind[0]}barrier{${opts}} - % else: - **Not implemented for ${type(insn)}** - % endif - %endfor - """, [ - % for arg in args: - % if isinstance(arg, lp.ValueArg): - lp.ValueArg( - name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}), - % else: - lp.GlobalArg( - name='${arg.name}', dtype=np.${arg.dtype.numpy_dtype.name}, - shape=${arg.shape}, for_atomic=${arg.for_atomic}), - % endif - % endfor - % for tv in temp_vars: - lp.TemporaryVariable( - name='${tv.name}', dtype=np.${tv.dtype.numpy_dtype.name}, - shape=${tv.shape}, for_atomic=${tv.for_atomic}, - address_space=${tv_scope[tv.address_space]}, - read_only=${tv.read_only}, - % if tv.initializer is not None: - initializer=${"np."+str((tv.initializer).__repr__())}, - % endif - ), - % endfor - ], lang_version=${lp.VERSION})''' - - python_code = Template(python_code).render(insn_x_opts=insn_x_options, - domains=kernel.domains, args=kernel.args, - temp_vars=[k for k in kernel.temporary_variables.values()]) - - python_code = re.sub("\\n ", "\n", python_code) - if filename: - with open(filename, 'w') as f: - f.write(python_code) - else: - print(python_code) -- GitLab From 7023664f021825e4db83db60a43d31af993a19c7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 7 Jul 2019 23:41:36 -0500 Subject: [PATCH 545/916] type inference should walk through comparison expressions to resolve the types of functions --- loopy/type_inference.py | 9 +++++++-- test/test_loopy.py | 43 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index c305e483e..f943c0ffc 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -642,8 +642,13 @@ class TypeInferenceMapper(CombineMapper): def map_logical_not(self, expr): return [NumpyType(np.dtype(np.int32))] - map_logical_and = map_logical_not - map_logical_or = map_logical_not + def map_logical_and(self, expr): + for child in expr.children: + self.rec(child) + + return [NumpyType(np.dtype(np.int32))] + + map_logical_or = map_logical_and def map_group_hw_index(self, expr, *args): return [self.kernel.index_dtype] diff --git a/test/test_loopy.py b/test/test_loopy.py index 16ec6c1d3..50ec99061 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2955,6 +2955,49 @@ def test_temp_var_type_deprecated_usage(): temp_var_types=(np.dtype(np.int32),)) +def test_type_inference_walks_fn_in_comparison(): + # Reported by Lawrence Mitchell + # See: https://gitlab.tiker.net/inducer/loopy/issues/180 + + knl = lp.make_kernel( + [ + "{ [p] : 0 <= p <= 2 }", + "{ [i] : 0 <= i <= 2 }", + ], + """ + t2 = 0.0 {id=insn} + t1 = 0.0 {id=insn_0, dep=insn} + t1 = t1 + t0[p, i]*w_0[1 + i*2] {id=insn_1, dep=insn_0} + t2 = t2 + t0[p, i]*w_0[i*2] {id=insn_2, dep=insn_1} + A[p] = A[p]+(0.2 if abs(-1.2+t2) <= 0.1 and abs(-0.15+t1) <= 0.05 else 0.0 + ) {dep=insn_2} + """, [ + lp.GlobalArg( + name='A', dtype=np.float64, + shape=(3)), + lp.GlobalArg( + name='w_0', dtype=np.float64, + shape=(6),), + lp.TemporaryVariable( + name='t0', dtype=np.float64, + shape=(3, 3), + read_only=True, + address_space=lp.AddressSpace.LOCAL, + initializer=np.array([[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]),), + lp.TemporaryVariable( + name='t1', dtype=np.float64, + shape=()), + lp.TemporaryVariable( + name='t2', dtype=np.float64, + shape=()), + ], + target=lp.CTarget()) + + print(lp.generate_code_v2(knl).device_code()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 
cfd5e958d8cbbbcae8680b9ad21b729c01727d0b Mon Sep 17 00:00:00 2001 From: "Timothy A. Smith" Date: Wed, 14 Aug 2019 08:43:40 -0500 Subject: [PATCH 546/916] change some syntax so Fortran test code will parse successfully --- test/test_fortran.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index 2b62148a9..a94be0232 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -60,15 +60,15 @@ def test_fill(ctx_factory): !$loopy begin ! - ! fill, = lp.parse_fortran(SOURCE) + ! fill = lp.parse_fortran(SOURCE) ! fill = lp.split_iname(fill, "i", split_amount, ! outer_tag="g.0", inner_tag="l.0") - ! RESULT = [fill] + ! RESULT = fill ! !$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src, + knl = lp.parse_transformed_fortran(fortran_src, pre_transform_code="split_amount = 128") assert "i_inner" in knl.root_kernel.all_inames() @@ -92,7 +92,7 @@ def test_fill_const(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() @@ -115,7 +115,7 @@ def test_asterisk_in_shape(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -139,7 +139,7 @@ def test_assignment_to_subst(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -166,7 +166,7 @@ def test_assignment_to_subst_two_defs(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -194,7 +194,7 @@ def test_assignment_to_subst_indices(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.fix_parameters(knl, n=5) @@ -231,7 +231,7 @@ def test_if(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -265,7 +265,7 @@ def test_tagged(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2 @@ -293,7 +293,7 @@ def test_matmul(ctx_factory, buffer_inames): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert len(knl.root_kernel.domains) == 1 @@ -355,7 +355,7 @@ def test_batched_sparse(): """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.split_iname(knl, "i", 128) knl = lp.tag_inames(knl, {"i_outer": "g.0"}) @@ -399,11 +399,11 @@ def test_fuse_kernels(ctx_factory): result(e,i,j) = prev + d(i,k)*q(e,k,j) """ - xderiv, = lp.parse_fortran( + xderiv = lp.parse_fortran( fortran_template.format(inner=xd_line, name="xderiv")) - yderiv, = lp.parse_fortran( + yderiv = lp.parse_fortran( fortran_template.format(inner=yd_line, name="yderiv")) - xyderiv, = lp.parse_fortran( + xyderiv = lp.parse_fortran( fortran_template.format( inner=(xd_line + "\n" + yd_line), name="xyderiv")) @@ -442,15 +442,17 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! fill, twice = lp.parse_fortran(SOURCE) + ! prg = lp.parse_fortran(SOURCE) + ! fill = prg["fill"] + ! twice = prg["twice"] ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) - ! RESULT = [knl] + ! RESULT = knl ! 
!$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src) + knl = lp.parse_transformed_fortran(fortran_src) def test_precompute_some_exist(ctx_factory): @@ -470,7 +472,7 @@ def test_precompute_some_exist(ctx_factory): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert len(knl.root_kernel.domains) == 1 -- GitLab From ced3617c7635bd9d41a9c30ec4c45a73f1a7dea3 Mon Sep 17 00:00:00 2001 From: "Timothy A. Smith" Date: Wed, 14 Aug 2019 08:46:24 -0500 Subject: [PATCH 547/916] mark Fortran test as xfail since example seems to be broken --- test/test_fortran.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_fortran.py b/test/test_fortran.py index a94be0232..42911e097 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -416,6 +416,7 @@ def test_fuse_kernels(ctx_factory): lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) +@pytest.mark.xfail def test_parse_and_fuse_two_kernels(): fortran_src = """ subroutine fill(out, a, n) -- GitLab From 555e212c6fafdc94f567cf98d6ec9831118a2d80 Mon Sep 17 00:00:00 2001 From: "Timothy A. Smith" Date: Wed, 14 Aug 2019 10:09:15 -0500 Subject: [PATCH 548/916] added a sane default for index_dtype when a Fortran subroutine doesn't have a loop --- loopy/frontend/fortran/translator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 66961ce70..aa635eebf 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -797,13 +797,17 @@ class F2LoopyTranslator(FTreeWalkerBase): # }}} + index_dtype = self.index_dtype + if index_dtype is None: + index_dtype = np.int32 + knl = lp.make_function( sub.index_sets, sub.instructions, kernel_data, name=sub.subprogram_name, default_order="F", - index_dtype=self.index_dtype, + index_dtype=index_dtype, target=self.target, seq_dependencies=seq_dependencies, ) -- GitLab From 6b86c327ab899efe3648acb5704d898bc8401078 Mon Sep 17 00:00:00 2001 From: "Timothy A. Smith" Date: Wed, 14 Aug 2019 22:18:29 -0500 Subject: [PATCH 549/916] Revert "mark Fortran test as xfail since example seems to be broken" This reverts commit ced3617c7635bd9d41a9c30ec4c45a73f1a7dea3. --- test/test_fortran.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index 42911e097..a94be0232 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -416,7 +416,6 @@ def test_fuse_kernels(ctx_factory): lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) -@pytest.mark.xfail def test_parse_and_fuse_two_kernels(): fortran_src = """ subroutine fill(out, a, n) -- GitLab From acd70b141be841bad9287750a84e663b9572daed Mon Sep 17 00:00:00 2001 From: "Timothy A. Smith" Date: Wed, 14 Aug 2019 22:18:44 -0500 Subject: [PATCH 550/916] Revert "change some syntax so Fortran test code will parse successfully" This reverts commit cfd5e958d8cbbbcae8680b9ad21b729c01727d0b. --- test/test_fortran.py | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index a94be0232..2b62148a9 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -60,15 +60,15 @@ def test_fill(ctx_factory): !$loopy begin ! - ! fill = lp.parse_fortran(SOURCE) + ! fill, = lp.parse_fortran(SOURCE) ! fill = lp.split_iname(fill, "i", split_amount, ! outer_tag="g.0", inner_tag="l.0") - ! RESULT = fill + ! 
RESULT = [fill] ! !$loopy end """ - knl = lp.parse_transformed_fortran(fortran_src, + knl, = lp.parse_transformed_fortran(fortran_src, pre_transform_code="split_amount = 128") assert "i_inner" in knl.root_kernel.all_inames() @@ -92,7 +92,7 @@ def test_fill_const(ctx_factory): end """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) ctx = ctx_factory() @@ -115,7 +115,7 @@ def test_asterisk_in_shape(ctx_factory): end """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -139,7 +139,7 @@ def test_assignment_to_subst(ctx_factory): end """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) ref_knl = knl @@ -166,7 +166,7 @@ def test_assignment_to_subst_two_defs(ctx_factory): end """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) ref_knl = knl @@ -194,7 +194,7 @@ def test_assignment_to_subst_indices(ctx_factory): end """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) knl = lp.fix_parameters(knl, n=5) @@ -231,7 +231,7 @@ def test_if(ctx_factory): end """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) ref_knl = knl @@ -265,7 +265,7 @@ def test_tagged(ctx_factory): end """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2 @@ -293,7 +293,7 @@ def test_matmul(ctx_factory, buffer_inames): end subroutine """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) assert len(knl.root_kernel.domains) == 1 @@ -355,7 +355,7 @@ def test_batched_sparse(): """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) knl = lp.split_iname(knl, "i", 128) knl = lp.tag_inames(knl, {"i_outer": "g.0"}) @@ -399,11 +399,11 @@ def test_fuse_kernels(ctx_factory): result(e,i,j) = prev + d(i,k)*q(e,k,j) """ - xderiv = lp.parse_fortran( + xderiv, = lp.parse_fortran( fortran_template.format(inner=xd_line, name="xderiv")) - yderiv = lp.parse_fortran( + yderiv, = lp.parse_fortran( fortran_template.format(inner=yd_line, name="yderiv")) - xyderiv = lp.parse_fortran( + xyderiv, = lp.parse_fortran( fortran_template.format( inner=(xd_line + "\n" + yd_line), name="xyderiv")) @@ -442,17 +442,15 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! prg = lp.parse_fortran(SOURCE) - ! fill = prg["fill"] - ! twice = prg["twice"] + ! fill, twice = lp.parse_fortran(SOURCE) ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) - ! RESULT = knl + ! RESULT = [knl] ! !$loopy end """ - knl = lp.parse_transformed_fortran(fortran_src) + knl, = lp.parse_transformed_fortran(fortran_src) def test_precompute_some_exist(ctx_factory): @@ -472,7 +470,7 @@ def test_precompute_some_exist(ctx_factory): end subroutine """ - knl = lp.parse_fortran(fortran_src) + knl, = lp.parse_fortran(fortran_src) assert len(knl.root_kernel.domains) == 1 -- GitLab From ac17838678136c8b47d4521f0c9b258eb7c5f79b Mon Sep 17 00:00:00 2001 From: "Timothy A. 
Smith" Date: Thu, 15 Aug 2019 11:33:52 -0500 Subject: [PATCH 551/916] refactor how index_dtype default is set in LoopKernel constructor --- loopy/frontend/fortran/translator.py | 6 +----- loopy/kernel/__init__.py | 4 +++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index aa635eebf..66961ce70 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -797,17 +797,13 @@ class F2LoopyTranslator(FTreeWalkerBase): # }}} - index_dtype = self.index_dtype - if index_dtype is None: - index_dtype = np.int32 - knl = lp.make_function( sub.index_sets, sub.instructions, kernel_data, name=sub.subprogram_name, default_order="F", - index_dtype=index_dtype, + index_dtype=self.index_dtype, target=self.target, seq_dependencies=seq_dependencies, ) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 5836b20cb..3168f6d8e 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -248,7 +248,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): applied_iname_rewrites=None, cache_manager=None, - index_dtype=np.int32, + index_dtype=None, options=None, state=KernelState.INITIAL, @@ -292,6 +292,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): if cache_manager is None: from loopy.kernel.tools import SetOperationCacheManager cache_manager = SetOperationCacheManager() + if index_dtype is None: + index_dtype = np.int32 # }}} -- GitLab From 510122864ae48c3dbfa069d939ab394871248f34 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 17 Aug 2019 23:48:52 -0500 Subject: [PATCH 552/916] Fix missing merge conflict --- loopy/symbolic.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index ad61520f1..6f3c6f2be 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -335,7 +335,6 @@ class DependencyMapper(DependencyMapperBase): return self.combine( self.rec(child, *args, **kwargs) for child in expr.parameters) -<<<<<<< HEAD def map_call_with_kwargs(self, expr, *args): # Loopy does not have first-class functions. Do not descend # into 'function' attribute of Call. @@ -343,15 +342,9 @@ class DependencyMapper(DependencyMapperBase): self.rec(child, *args) for child in expr.parameters+tuple( expr.kw_parameters.values())) - def map_reduction(self, expr): - deps = self.rec(expr.expr) -||||||| merged common ancestors - def map_reduction(self, expr): - deps = self.rec(expr.expr) -======= def map_reduction(self, expr, *args, **kwargs): deps = self.rec(expr.expr, *args, **kwargs) ->>>>>>> master + return deps - set(p.Variable(iname) for iname in expr.inames) def map_tagged_variable(self, expr, *args, **kwargs): -- GitLab From 3b07c1d97f663bd75e62fcd46deaf2900d954dbb Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 18 Aug 2019 00:08:46 -0500 Subject: [PATCH 553/916] Revert "Revert "change some syntax so Fortran test code will parse successfully"" This reverts commit acd70b141be841bad9287750a84e663b9572daed. --- test/test_fortran.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index 2b62148a9..a94be0232 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -60,15 +60,15 @@ def test_fill(ctx_factory): !$loopy begin ! - ! fill, = lp.parse_fortran(SOURCE) + ! fill = lp.parse_fortran(SOURCE) ! fill = lp.split_iname(fill, "i", split_amount, ! outer_tag="g.0", inner_tag="l.0") - ! 
RESULT = [fill] + ! RESULT = fill ! !$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src, + knl = lp.parse_transformed_fortran(fortran_src, pre_transform_code="split_amount = 128") assert "i_inner" in knl.root_kernel.all_inames() @@ -92,7 +92,7 @@ def test_fill_const(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() @@ -115,7 +115,7 @@ def test_asterisk_in_shape(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -139,7 +139,7 @@ def test_assignment_to_subst(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -166,7 +166,7 @@ def test_assignment_to_subst_two_defs(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -194,7 +194,7 @@ def test_assignment_to_subst_indices(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.fix_parameters(knl, n=5) @@ -231,7 +231,7 @@ def test_if(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -265,7 +265,7 @@ def test_tagged(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2 @@ -293,7 +293,7 @@ def test_matmul(ctx_factory, buffer_inames): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert len(knl.root_kernel.domains) == 1 @@ -355,7 +355,7 @@ def test_batched_sparse(): """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.split_iname(knl, "i", 128) knl = lp.tag_inames(knl, {"i_outer": "g.0"}) @@ -399,11 +399,11 @@ def test_fuse_kernels(ctx_factory): result(e,i,j) = prev + d(i,k)*q(e,k,j) """ - xderiv, = lp.parse_fortran( + xderiv = lp.parse_fortran( fortran_template.format(inner=xd_line, name="xderiv")) - yderiv, = lp.parse_fortran( + yderiv = lp.parse_fortran( fortran_template.format(inner=yd_line, name="yderiv")) - xyderiv, = lp.parse_fortran( + xyderiv = lp.parse_fortran( fortran_template.format( inner=(xd_line + "\n" + yd_line), name="xyderiv")) @@ -442,15 +442,17 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! fill, twice = lp.parse_fortran(SOURCE) + ! prg = lp.parse_fortran(SOURCE) + ! fill = prg["fill"] + ! twice = prg["twice"] ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) - ! RESULT = [knl] + ! RESULT = knl ! 
!$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src) + knl = lp.parse_transformed_fortran(fortran_src) def test_precompute_some_exist(ctx_factory): @@ -470,7 +472,7 @@ def test_precompute_some_exist(ctx_factory): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert len(knl.root_kernel.domains) == 1 -- GitLab From 1140b5ff323be590ca61bd4da5d1d3ae63c40bdb Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 18 Aug 2019 00:12:01 -0500 Subject: [PATCH 554/916] Add Fortran data type preservation tests (contributed by Timothy Smith) --- test/test_fortran.py | 93 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index a94be0232..437199810 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -45,6 +45,97 @@ __all__ = [ pytestmark = pytest.mark.importorskip("fparser") +def test_fp_prec_comparison(): + # FIXME: This test should succeed even when the number is exactly + # representable in single precision. + # + # https://gitlab.tiker.net/inducer/loopy/issues/187 + + fortran_src_dp = """ + subroutine assign_scalar(a) + real*8 a(1) + + a(1) = 1.1d0 + end + """ + + prg_dp = lp.parse_fortran(fortran_src_dp) + + fortran_src_sp = """ + subroutine assign_scalar(a) + real*8 a(1) + + a(1) = 1.1 + end + """ + + prg_sp = lp.parse_fortran(fortran_src_sp) + + assert prg_sp != prg_dp + + +def test_assign_double_precision_scalar(ctx_factory): + fortran_src = """ + subroutine assign_scalar(a) + real*8 a(1) + + a(1) = 1.1d0 + end + """ + + prg = lp.parse_fortran(fortran_src) + print(lp.generate_code_v2(prg).device_code()) + assert "1.1;" in lp.generate_code_v2(prg).device_code() + queue = cl.CommandQueue(ctx_factory()) + + a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F") + prg(queue, a=a_dev) + + abs_err = abs(a_dev.get()[0] - 1.1) + assert abs_err < 1e-15 + + +def test_assign_double_precision_scalar_as_rational(ctx_factory): + fortran_src = """ + subroutine assign_scalar(a) + real*8 a(1) + + a(1) = 11 + a(1) = a(1) / 10 + end + """ + + prg = lp.parse_fortran(fortran_src) + queue = cl.CommandQueue(ctx_factory()) + + a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F") + prg(queue, a=a_dev) + + abs_err = abs(a_dev.get()[0] - 1.1) + assert abs_err < 1e-15 + + +def test_assign_single_precision_scalar(ctx_factory): + fortran_src = """ + subroutine assign_scalar(a) + real*8 a(1) + + a(1) = 1.1 + end + """ + + prg = lp.parse_fortran(fortran_src) + assert "1.1f" in lp.generate_code_v2(prg).device_code() + queue = cl.CommandQueue(ctx_factory()) + + a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F") + prg(queue, a=a_dev) + + abs_err = abs(a_dev.get()[0] - 1.1) + assert abs_err > 1e-15 + assert abs_err < 1e-6 + + def test_fill(ctx_factory): fortran_src = """ subroutine fill(out, a, n) @@ -452,7 +543,7 @@ def test_parse_and_fuse_two_kernels(): !$loopy end """ - knl = lp.parse_transformed_fortran(fortran_src) + lp.parse_transformed_fortran(fortran_src) def test_precompute_some_exist(ctx_factory): -- GitLab From abb17729de7add966006c036a4d84a0d24005aee Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Aug 2019 21:54:05 +0530 Subject: [PATCH 555/916] CallInstruction := instruction with RHS=function call --- loopy/kernel/instruction.py | 42 +++++++++++++++---------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 
a17740d28..a245e49b7 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1242,19 +1242,15 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): if temp_var_types is None: temp_var_types = (Optional(),) * len(assignees) - if len(assignees) > 1 or len(assignees) == 0 or is_array_call(assignees, - expression): + from pymbolic.primitives import Call, CallWithKwargs + from loopy.symbolic import Reduction + + if isinstance(expression, (Call, CallWithKwargs, Reduction)): atomicity = kwargs.pop("atomicity", ()) if atomicity: raise LoopyError("atomic operations with more than one " "left-hand side not supported") - from pymbolic.primitives import Call, CallWithKwargs - from loopy.symbolic import Reduction - if not isinstance(expression, (Call, CallWithKwargs, Reduction)): - raise LoopyError("right-hand side in multiple assignment must be " - "function call or reduction, got: '%s'" % expression) - if not is_array_call(assignees, expression): return CallInstruction( assignees=assignees, @@ -1272,29 +1268,25 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): temp_var_types=temp_var_types, **kwargs) else: + from loopy.symbolic import DependencyMapper, SubArrayRef + if len(assignees) != 1: + raise LoopyError("right-hand side in multiple assignment must be" + " function call or reduction, got: '%s'" % expression) + if is_array_call(assignees, expression): + raise LoopyError("right-hand side in array calls must be" + " function, got: '%s'" % expression) + + if any(isinstance(var, SubArrayRef) for var in + DependencyMapper()((expression, assignees[0]))): + raise LoopyError("RHS in an instruction using SubArrayRefs can" + " only be function calls") + return Assignment( assignee=assignees[0], expression=expression, temp_var_type=temp_var_types[0], **kwargs) - atomicity = kwargs.pop("atomicity", ()) - if atomicity: - raise LoopyError("atomic operations with more than one " - "left-hand side not supported") - - from pymbolic.primitives import Call - from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)): - raise LoopyError("right-hand side in multiple assignment must be " - "function call or reduction, got: '%s'" % expression) - - return CallInstruction( - assignees=assignees, - expression=expression, - temp_var_types=temp_var_types, - **kwargs) - # {{{ c instruction -- GitLab From 41efa740f81178657545655255b9c052a7928a07 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Aug 2019 21:55:06 +0530 Subject: [PATCH 556/916] ... -> '...' 
for py2 --- test/test_callables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_callables.py b/test/test_callables.py index 9739ca496..3f8fbc9b4 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -575,7 +575,7 @@ def test_unknown_stride_to_callee(): """, [ lp.ValueArg('N', dtype=np.int32), lp.ValueArg('Nvar', dtype=np.int32), lp.GlobalArg('x', shape=lp.auto, - dtype=np.float64), ...]) + dtype=np.float64), '...']) prog = lp.register_callable_kernel(prog, twice) -- GitLab From 02af75ee848eb92f36c2eab58890f18c9599052c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Aug 2019 22:02:03 +0530 Subject: [PATCH 557/916] removes minor redundancy --- loopy/kernel/instruction.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index a245e49b7..c44d3adab 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1272,10 +1272,6 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): if len(assignees) != 1: raise LoopyError("right-hand side in multiple assignment must be" " function call or reduction, got: '%s'" % expression) - if is_array_call(assignees, expression): - raise LoopyError("right-hand side in array calls must be" - " function, got: '%s'" % expression) - if any(isinstance(var, SubArrayRef) for var in DependencyMapper()((expression, assignees[0]))): raise LoopyError("RHS in an instruction using SubArrayRefs can" -- GitLab From 980725baf2b92b281d8a386c36200113ca5a907a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 23 Aug 2019 14:08:46 -0500 Subject: [PATCH 558/916] Do not ignore slice start when processing slices --- loopy/kernel/creation.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index fe34d0a30..e7ce880c5 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1892,7 +1892,7 @@ class SliceToInameReplacer(IdentityMapper): subscript_iname_bounds = {} self.subarray_ref_bounds.append(subscript_iname_bounds) - updated_index = [] + new_index = [] swept_inames = [] for i, index in enumerate(expr.index_tuple): if isinstance(index, Slice): @@ -1910,19 +1910,16 @@ class SliceToInameReplacer(IdentityMapper): index, domain_length) subscript_iname_bounds[unique_var_name] = (start, stop, step) - if step > 0: - updated_index.append(step*Variable(unique_var_name)) - else: - updated_index.append(start+step*Variable(unique_var_name)) + new_index.append(start+step*Variable(unique_var_name)) swept_inames.append(Variable(unique_var_name)) else: - updated_index.append(index) + new_index.append(index) if swept_inames: return SubArrayRef(tuple(swept_inames), Subscript( self.rec(expr.aggregate), - self.rec(tuple(updated_index)))) + self.rec(tuple(new_index)))) else: return IdentityMapper.map_subscript(self, expr) -- GitLab From 708fff07445af8a30621adf3537f6eb877617b82 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 1 Sep 2019 11:39:39 -0500 Subject: [PATCH 559/916] use ctx_factory() --- test/test_callables.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_callables.py b/test/test_callables.py index 3f8fbc9b4..aa3420ba7 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -44,7 +44,6 @@ def test_register_function_lookup(ctx_factory): from testlib import register_log2_lookup x = np.random.rand(10) - ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) prog = lp.make_kernel( -- GitLab From 
5f070adf4f57b433e8df3e6291acd9209e1b4e48 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 1 Sep 2019 11:41:13 -0500 Subject: [PATCH 560/916] changes towards the new loopy spec. that all written variables should be assignees --- loopy/kernel/function_interface.py | 51 ++++++++++++++++++------------ loopy/kernel/instruction.py | 9 +----- loopy/target/c/__init__.py | 2 ++ loopy/transform/callable.py | 5 +-- 4 files changed, 37 insertions(+), 30 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 1195fc995..f63c992ae 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -217,14 +217,19 @@ def get_kw_pos_association(kernel): write_count = -1 for arg in kernel.args: - if not arg.is_output_only: - kw_to_pos[arg.name] = read_count - pos_to_kw[read_count] = arg.name - read_count += 1 - else: + if arg.name in kernel.get_written_variables(): kw_to_pos[arg.name] = write_count pos_to_kw[write_count] = arg.name write_count -= 1 + if arg.name in kernel.get_read_variables(): + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + if not (arg.name in kernel.get_read_variables() or arg.name in + kernel.get_written_variables()): + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 return kw_to_pos, pos_to_kw @@ -513,18 +518,23 @@ class ScalarCallable(InKernelCallable): def emit_call_insn(self, insn, target, expression_to_code_mapper): """ - Returns a pymbolic call for C-based targets, when the instructions - involve multiple return values along with the required type casting. - The first assignee is returned, but the rest of them are appended to - the parameters and passed by reference. - - *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` - :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`. :arg target: An instance of :class:`loopy.target.TargetBase`. :arg expression_to_code_mapper: An instance of :class:`IdentityMapper` responsible for code mapping from :mod:`loopy` syntax to the **target syntax**. + + :returns: A tuple of the call to be generated and an instance of + :class:`bool` whether the first assignee is a part of the LHS in + the assignment instruction. + + .. note:: + + The default implementation returns the first assignees and the + references of the rest of the assignees are appended to the + arguments of the call. + + *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` """ # Currently this is formulated such that the first argument is returned @@ -569,9 +579,12 @@ class ScalarCallable(InKernelCallable): tgt_dtype).expr)) # assignee is returned whenever the size of assignees is non zero. - assignee_is_returned = len(assignees) > 0 + first_assignee_is_returned = len(insn.assignees) > 0 - return var(self.name_in_target)(*c_parameters), assignee_is_returned + # TODO: Maybe this interface a bit confusing. Should we allow this + # method to directly return a cgen.Assign or cgen.ExpressionStatement? + + return var(self.name_in_target)(*c_parameters), first_assignee_is_returned def generate_preambles(self, target): return @@ -660,11 +673,9 @@ class CallableKernel(InKernelCallable): expect_completion=True)) new_arg_id_to_dtype = {} - for arg in specialized_kernel.args: - # associate the updated_arg_id_to_dtype with keyword as well as - # positional id. 
- new_arg_id_to_dtype[arg.name] = arg.dtype - new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype + for pos, kw in pos_to_kw.items(): + new_arg_id_to_dtype[kw] = specialized_kernel.arg_dict[kw].dtype + new_arg_id_to_dtype[pos] = specialized_kernel.arg_dict[kw].dtype # Return the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype @@ -839,7 +850,7 @@ class CallableKernel(InKernelCallable): parameters.append(kw_parameters[pos_to_kw[i]]) par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) - # insert the assigness at the required positions + # insert the assignees at the required positions assignee_write_count = -1 for i, arg in enumerate(self.subkernel.args): if arg.is_output_only: diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index c44d3adab..3be7132c0 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1119,14 +1119,7 @@ class CallInstruction(MultiAssignmentBase): @memoize_method def assignee_var_names(self): - #FIXME: This needs to be smarter, instead of just making all - # as written - from loopy.symbolic import SubArrayRef - return ( - tuple(_get_assignee_var_name(a) for a in self.assignees) + - tuple(par.subscript.aggregate.name for par in - self.expression.parameters if isinstance(par, - SubArrayRef))) + return tuple(_get_assignee_var_name(a) for a in self.assignees) def assignee_subscript_deps(self): return tuple( diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 7b6d68711..559857693 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -908,6 +908,8 @@ class CASTBuilder(ASTBuilderBase): in_knl_callable.name_in_target == 'loopy_make_tuple'): return self.emit_tuple_assignment(codegen_state, insn) + # takes "is_returned" to infer whether insn.assignees[0] is a part of + # LHS. in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( insn=insn, target=self.target, diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 6c43dd508..f020235eb 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -172,8 +172,9 @@ def register_callable_kernel(program, callee_kernel): # check to make sure that the variables with 'out' direction is equal to # the number of assigness in the callee kernel intructions. expected_num_assignees = len([arg for arg in callee_kernel.args if - arg.is_output_only]) - expected_num_parameters = len(callee_kernel.args) - expected_num_assignees + arg.name in callee_kernel.get_written_variables()]) + expected_num_parameters = len([arg for arg in callee_kernel.args if + arg.name in callee_kernel.get_read_variables()]) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel -- GitLab From 920fd17730b1661622461595dcdcca1263a41d71 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 1 Sep 2019 15:22:20 -0500 Subject: [PATCH 561/916] makes the logic of creating arrays->slices more safer --- loopy/kernel/creation.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e7ce880c5..1f896bb97 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1925,16 +1925,26 @@ class SliceToInameReplacer(IdentityMapper): def map_call(self, expr): def _convert_array_to_slices(arg): + # FIXME: We do not support something like A[1] should point to the + # second row if 'A' is 3 x 3 array. 
if isinstance(arg, Variable): + from loopy.kernel.data import auto if (arg.name in self.knl.temporary_variables): - array_arg_shape = ( - self.knl.temporary_variables[arg.name].shape) - else: - assert arg.name in self.knl.arg_dict + if self.knl.temporary_variables[arg.name].shape in [ + auto, None]: + # do not convert arrays with unknown shapes to slices. + array_arg_shape = () + else: + array_arg_shape = ( + self.knl.temporary_variables[arg.name].shape) + elif arg.name in self.knl.arg_dict: if isinstance(self.knl.arg_dict[arg.name], ValueArg): array_arg_shape = () else: array_arg_shape = self.knl.arg_dict[arg.name].shape + else: + assert arg.name in self.knl.all_inames() + array_arg_shape = () if array_arg_shape != (): return Subscript(arg, tuple(Slice(()) for _ in -- GitLab From e5359f5430c1c14377365f7f9c22106e87f2979c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Sep 2019 13:26:08 -0500 Subject: [PATCH 562/916] changes according to the enforcement that all written variables are assignees --- loopy/transform/callable.py | 5 ++++- test/test_callables.py | 13 ++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index f020235eb..7bc31d09a 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -174,7 +174,10 @@ def register_callable_kernel(program, callee_kernel): expected_num_assignees = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_written_variables()]) expected_num_parameters = len([arg for arg in callee_kernel.args if - arg.name in callee_kernel.get_read_variables()]) + arg.name in callee_kernel.get_read_variables()]) + len( + [arg for arg in callee_kernel.args if arg.name not in + (callee_kernel.get_read_variables() | + callee_kernel.get_written_variables())]) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel diff --git a/test/test_callables.py b/test/test_callables.py index aa3420ba7..f2f3acbd6 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -217,8 +217,8 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): n = 2 ** 5 - x_dev = cl.random.rand(queue, (n, n, n, n, n), np.float64) - y_dev = cl.random.rand(queue, (n, n, n, n, n), np.float64) + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) callee_knl = lp.make_function( "{[i, j]:0<=i, j < 32}", @@ -410,25 +410,24 @@ def test_packing_unpacking(ctx_factory, inline): def test_non_sub_array_refs_arguments(ctx_factory): - import loopy as lp from loopy.transform.callable import _match_caller_callee_argument_dimension_ callee = lp.make_function("{[i] : 0 <= i < 6}", "a[i] = a[i] + j", [lp.GlobalArg("a", dtype="double", shape=(6,), is_output_only=False), lp.ValueArg("j", dtype="int")], name="callee") - caller1 = lp.make_kernel("{[j] : 0 <= j < 2}", "callee(a[:], b[0])", + caller1 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:] = callee(a[:], b[0])", [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output_only=False), lp.GlobalArg("b", dtype="double", shape=(1, ), is_output_only=False)], name="caller", target=lp.CTarget()) - caller2 = lp.make_kernel("{[j] : 0 <= j < 2}", "callee(a[:], 3.1415926)", + caller2 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], 3.1415926)", [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output_only=False)], name="caller", target=lp.CTarget()) - caller3 = 
lp.make_kernel("{[j] : 0 <= j < 2}", "callee(a[:], kappa)", + caller3 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], kappa)", [lp.GlobalArg("a", dtype="double", shape=(6, ), - is_output_only=False)], + is_output_only=False), '...'], name="caller", target=lp.CTarget()) registered = lp.register_callable_kernel(caller1, callee) -- GitLab From 15bded39f25c2615461e8e4f906b5bf23fab27b6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Sep 2019 13:27:21 -0500 Subject: [PATCH 563/916] revamps _match_caller_callee_args with get_arg_descriptor_for_expression --- loopy/transform/callable.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 7bc31d09a..479843697 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -641,12 +641,18 @@ def _match_caller_callee_argument_dimension_for_single_kernel( return shape from loopy.kernel.function_interface import ( - ArrayArgDescriptor, get_arg_descriptor_for_expression) + ArrayArgDescriptor, get_arg_descriptor_for_expression, + get_kw_pos_association) + _, pos_to_kw = get_kw_pos_association(callee_knl) arg_id_to_shape = {} for arg_id, arg in six.iteritems(insn.arg_id_to_val()): + arg_id = pos_to_kw[arg_id] + arg_descr = get_arg_descriptor_for_expression(caller_knl, arg) if isinstance(arg_descr, ArrayArgDescriptor): - arg_id_to_shape[arg_id] = _shape_1_if_empty(arg_descr) + arg_id_to_shape[arg_id] = _shape_1_if_empty(arg_descr.shape) + else: + arg_id_to_shape[arg_id] = (1, ) dim_changer = DimChanger( callee_knl.arg_dict, -- GitLab From 84b4bade8594a88a7649e4113e44e62eb13c2d94 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Sep 2019 13:28:29 -0500 Subject: [PATCH 564/916] reuses simplify_using_aff and adds comment why is it necessary --- loopy/kernel/function_interface.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index f63c992ae..fe915bde3 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -158,10 +158,18 @@ def get_arg_descriptor_for_expression(kernel, expr): # FIXME This blindly assumes that dim_tag has a stride and # will not work for non-stride dim tags (e.g. vec or sep). - # FIXME: This will almost always be nonlinear--when does this + # (AK) FIXME: This will almost always be nonlinear--when does this # actually help? 
Maybe the - linearized_index = sum(dim_tag.stride*iname for dim_tag, iname in - zip(arg.dim_tags, expr.subscript.index_tuple)) + # (KK) Reply: This helps in identifying identities like + # "2*(i//2) + i%2" := "i" + # See the kernel in + # test_callables.py::test_shape_translation_through_sub_array_refs + + from loopy.symbolic import simplify_using_aff + linearized_index = simplify_using_aff( + kernel, + sum(dim_tag.stride*iname for dim_tag, iname in + zip(arg.dim_tags, expr.subscript.index_tuple))) strides_as_dict = SweptInameStrideCollector( tuple(iname.name for iname in expr.swept_inames) -- GitLab From 6ec220fe1e8c327f4c8f1c2386dde3997a88b778 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Sep 2019 16:22:48 -0500 Subject: [PATCH 565/916] moves the codegen part of indexof to IndexOfCallable --- loopy/library/function.py | 49 ++++++++++++++++++++++++++++ loopy/target/c/codegen/expression.py | 43 ------------------------ 2 files changed, 49 insertions(+), 43 deletions(-) diff --git a/loopy/library/function.py b/loopy/library/function.py index 5e7dfbaf6..c7f3db3d3 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -23,6 +23,7 @@ THE SOFTWARE. """ from loopy.kernel.function_interface import ScalarCallable +from loopy.diagnostic import LoopyError class MakeTupleCallable(ScalarCallable): @@ -54,6 +55,54 @@ class IndexOfCallable(ScalarCallable): return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), callables_table) + def emit_call(self, expression_to_code_mapper, expression, target): + from pymbolic.primitives import Subscript + + if len(expression.parameters) != 1: + raise LoopyError("%s takes exactly one argument" % self.name) + arg, = expression.parameters + if not isinstance(arg, Subscript): + raise LoopyError( + "argument to %s must be a subscript" % self.name) + + ary = expression_to_code_mapper.find_array(arg) + + from loopy.kernel.array import get_access_info + from pymbolic import evaluate + access_info = get_access_info(expression_to_code_mapper.kernel.target, + ary, arg.index, lambda expr: evaluate(expr, + expression_to_code_mapper.codegen_state.var_subst_map), + expression_to_code_mapper.codegen_state.vectorization_info) + + from loopy.kernel.data import ImageArg + if isinstance(ary, ImageArg): + raise LoopyError("%s does not support images" % self.name) + + if self.name == "indexof": + return access_info.subscripts[0] + elif self.name == "indexof_vec": + from loopy.kernel.array import VectorArrayDimTag + ivec = None + for iaxis, dim_tag in enumerate(ary.dim_tags): + if isinstance(dim_tag, VectorArrayDimTag): + ivec = iaxis + + if ivec is None: + return access_info.subscripts[0] + else: + return ( + access_info.subscripts[0]*ary.shape[ivec] + + access_info.vector_index) + + else: + raise RuntimeError("should not get here") + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + return self.emit_call( + expression_to_code_mapper, + insn.expression, + target), True + def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier): """ diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 9a0f292cd..b8bf7eb11 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -427,52 +427,9 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Subscript - - # {{{ implement indexof, indexof_vec identifier_name = ( 
self.codegen_state.callables_table[expr.function.name].name) - if identifier_name in ["indexof", "indexof_vec"]: - if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier_name) - arg, = expr.parameters - if not isinstance(arg, Subscript): - raise LoopyError( - "argument to %s must be a subscript" % identifier_name) - - ary = self.find_array(arg) - - from loopy.kernel.array import get_access_info - from pymbolic import evaluate - access_info = get_access_info(self.kernel.target, ary, arg.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) - - from loopy.kernel.data import ImageArg - if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier_name) - - if identifier_name == "indexof": - return access_info.subscripts[0] - elif identifier_name == "indexof_vec": - from loopy.kernel.array import VectorArrayDimTag - ivec = None - for iaxis, dim_tag in enumerate(ary.dim_tags): - if isinstance(dim_tag, VectorArrayDimTag): - ivec = iaxis - - if ivec is None: - return access_info.subscripts[0] - else: - return ( - access_info.subscripts[0]*ary.shape[ivec] - + access_info.vector_index) - - else: - raise RuntimeError("should not get here") - - # }}} from loopy.kernel.function_interface import ManglerCallable if isinstance(self.codegen_state.callables_table[expr.function.name], -- GitLab From b3d1e40bef014d6289b0951fcd0725d02c16ad72 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Sep 2019 16:23:19 -0500 Subject: [PATCH 566/916] puts in a patch for singleton assignee CallInstruction --- loopy/type_inference.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 281dcb43d..2f4b9abeb 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -726,9 +726,15 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if isinstance(writer_insn, lp.Assignment): result = type_inf_mapper(expr, return_dtype_set=True) elif isinstance(writer_insn, lp.CallInstruction): - return_dtype_set = type_inf_mapper(expr, return_tuple=True, + # FIXME: Unnecessary separation of logic between CallInstruction + # and Assignment. 
+ return_dtype_set = type_inf_mapper(expr, + return_tuple=len(writer_insn.assignees) != 1, return_dtype_set=True) + if len(writer_insn.assignees) == 1: + return_dtype_set = (return_dtype_set, ) + result = [] for return_dtype_set in return_dtype_set: result_i = None -- GitLab From d9465a2e1c5fc04be820c9bd0e075cad58b634fc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Sep 2019 17:56:41 -0500 Subject: [PATCH 567/916] iteritems -> items --- loopy/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 1bbd2fe04..1fb691531 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -401,7 +401,7 @@ class Program(ImmutableRecord): return "\n".join( strify_callable(clbl) - for name, clbl in six.iteritems(self.callables_table)) + for name, clbl in self.callables_table.items()) # }}} -- GitLab From 3ceddff26429cdb98a87bd3f03d4d31a338e8534 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Sep 2019 19:00:01 -0500 Subject: [PATCH 568/916] interpret mangled symbols and inames in var_descr --- loopy/kernel/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 3168f6d8e..d79308241 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -500,6 +500,21 @@ class LoopKernel(ImmutableRecordWithoutPickling): except KeyError: pass + if name in self.all_inames(): + from loopy import TemporaryVariable + return TemporaryVariable( + name=name, + dtype=self.index_dtype, + shape=()) + + try: + dtype, name = self.mangle_symbol(self.target.get_device_ast_builder(), + name) + from loopy import ValueArg + return ValueArg(name, dtype) + except TypeError: + pass + raise ValueError("nothing known about variable '%s'" % name) @property -- GitLab From cf88a61c0fe9cdd9c4f720d7e39a7085a41299e8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Sep 2019 19:00:30 -0500 Subject: [PATCH 569/916] INT_MAX and INT_MIN to mangled symbols --- loopy/target/c/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 559857693..efde8c401 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -351,6 +351,10 @@ def c_symbol_mangler(kernel, name): # float NAN as defined in C99 standard if name == "NAN": return NumpyType(np.dtype(np.float32)), name + + if name in ["INT_MAX", "INT_MIN"]: + return NumpyType(np.dtype(np.int32)), name + return None # }}} -- GitLab From 2b599802f13ab83ed792c7c2031bca7ad1353fd0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Sep 2019 19:01:32 -0500 Subject: [PATCH 570/916] changes according to the new signature of InKernelCalable.with_descrs() --- loopy/library/function.py | 2 +- loopy/library/reduction.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/library/function.py b/loopy/library/function.py index c7f3db3d3..378b7de58 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -36,7 +36,7 @@ class MakeTupleCallable(ScalarCallable): return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target="loopy_make_tuple"), callables_table) - def with_descrs(self, arg_id_to_descr, callables_table, expr): + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) diff --git 
a/loopy/library/reduction.py b/loopy/library/reduction.py index 213836840..6c6a0dd9b 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -455,7 +455,7 @@ class ReductionCallable(ScalarCallable): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=name_in_target), callables_table - def with_descrs(self, arg_id_to_descr, callables_table, expr): + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() -- GitLab From 8e35d26a9c7312f982b94369ad1c8a551065f30c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 4 Sep 2019 14:25:59 -0500 Subject: [PATCH 571/916] Call Instruction := multiassignment call/no assignee call --- loopy/kernel/instruction.py | 34 ++++++++++++++++++++++------------ loopy/type_inference.py | 8 +------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 3be7132c0..fb33d4c7a 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1235,15 +1235,18 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): if temp_var_types is None: temp_var_types = (Optional(),) * len(assignees) - from pymbolic.primitives import Call, CallWithKwargs - from loopy.symbolic import Reduction - - if isinstance(expression, (Call, CallWithKwargs, Reduction)): + if len(assignees) != 1 or is_array_call(assignees, expression): atomicity = kwargs.pop("atomicity", ()) if atomicity: raise LoopyError("atomic operations with more than one " "left-hand side not supported") + from pymbolic.primitives import Call, CallWithKwargs + from loopy.symbolic import Reduction + if not isinstance(expression, (Call, CallWithKwargs, Reduction)): + raise LoopyError("right-hand side in multiple assignment must be " + "function call or reduction, got: '%s'" % expression) + if not is_array_call(assignees, expression): return CallInstruction( assignees=assignees, @@ -1261,14 +1264,21 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): temp_var_types=temp_var_types, **kwargs) else: - from loopy.symbolic import DependencyMapper, SubArrayRef - if len(assignees) != 1: - raise LoopyError("right-hand side in multiple assignment must be" - " function call or reduction, got: '%s'" % expression) - if any(isinstance(var, SubArrayRef) for var in - DependencyMapper()((expression, assignees[0]))): - raise LoopyError("RHS in an instruction using SubArrayRefs can" - " only be function calls") + def _is_array(expr): + from loopy.symbolic import SubArrayRef + from pymbolic.primitives import (Subscript, Slice) + if isinstance(expr, SubArrayRef): + return True + if isinstance(expr, Subscript): + return any(isinstance(idx, Slice) for idx in + expr.index_tuple) + return False + + from loopy.symbolic import DependencyMapper + if any(_is_array(dep) for dep in DependencyMapper()((assignees, + expression))): + raise LoopyError("Array calls only supported as instructions" + " with function call as RHS for now.") return Assignment( assignee=assignees[0], diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 2f4b9abeb..281dcb43d 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -726,15 +726,9 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if isinstance(writer_insn, lp.Assignment): result = type_inf_mapper(expr, 
return_dtype_set=True) elif isinstance(writer_insn, lp.CallInstruction): - # FIXME: Unnecessary separation of logic between CallInstruction - # and Assignment. - return_dtype_set = type_inf_mapper(expr, - return_tuple=len(writer_insn.assignees) != 1, + return_dtype_set = type_inf_mapper(expr, return_tuple=True, return_dtype_set=True) - if len(writer_insn.assignees) == 1: - return_dtype_set = (return_dtype_set, ) - result = [] for return_dtype_set in return_dtype_set: result_i = None -- GitLab From 2171aa5df91c8c48757376b2881115dd9e88dfe6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 4 Sep 2019 15:25:29 -0500 Subject: [PATCH 572/916] ArrayArgs can also be called without indexing when shape==() --- loopy/kernel/function_interface.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index fe915bde3..d8c120db8 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -192,8 +192,9 @@ def get_arg_descriptor_for_expression(kernel, expr): elif isinstance(expr, Variable): arg = kernel.get_var_descriptor(expr.name) + from loopy.kernel.array import ArrayBase - if isinstance(arg, ValueArg) or (isinstance(arg, TemporaryVariable) + if isinstance(arg, ValueArg) or (isinstance(arg, ArrayBase) and arg.shape == ()): return ValueArgDescriptor() elif isinstance(arg, (ArrayArg, TemporaryVariable)): -- GitLab From 47f60c3ec535c5785d378d8839e62a0828716a6d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Sep 2019 10:53:22 -0500 Subject: [PATCH 573/916] Stats part of the changes --- doc/tutorial.rst | 82 +++++++-------- loopy/statistics.py | 60 ++++++++--- test/test_statistics.py | 217 +++++++++++++++++++++++++--------------- 3 files changed, 224 insertions(+), 135 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 2a9756b20..c98fe8d0c 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1581,12 +1581,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... 
(f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1643,15 +1643,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1686,13 +1686,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, knl.name) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1710,13 +1710,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, None, load, None, None, None) : ... - MemAccess(None, None, None, None, store, None, None, None) : ... + MemAccess(None, None, None, None, load, None, None, None, None) : ... + MemAccess(None, None, None, None, store, None, None, None, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1753,12 +1753,12 @@ this time. 
... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, loopy_kernel) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1768,13 +1768,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1794,12 +1794,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ... 
+ MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, loopy_kernel) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1808,13 +1808,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1848,14 +1848,14 @@ kernel from the previous example: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - kernel_launch : { 1 } + Sync(kernel_launch, loopy_kernel) : [l, m, n] -> { 1 } We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. 
doctest:: - >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map[lp.Sync("kernel_launch", knl.name)].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1908,8 +1908,8 @@ count the barriers using :func:`loopy.get_synchronization_map`: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - barrier_local : { 1000 } - kernel_launch : { 1 } + Sync(barrier_local, loopy_kernel) : { 1000 } + Sync(kernel_launch, loopy_kernel) : { 1 } Based on the kernel code printed above, we would expect each work-item to diff --git a/loopy/statistics.py b/loopy/statistics.py index 2c3d4f36f..92ea5f696 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -83,7 +83,7 @@ __doc__ = """ def get_kernel_parameter_space(kernel): return isl.Space.create_from_names(kernel.isl_context, - set=[], params=kernel.outer_params()).params() + set=[], params=sorted(list(kernel.outer_params()))).params() def get_kernel_zero_pwqpolynomial(kernel): @@ -160,7 +160,7 @@ class GuardedPwQPolynomial(object): return str(self.pwqpolynomial) def __repr__(self): - return repr(self.pwqpolynomial) + return "Guarded" + repr(self.pwqpolynomial) # }}} @@ -218,7 +218,7 @@ class ToCountMap(object): def __mul__(self, other): return self.copy(dict( - (index, value*other) + (index, other*value) for index, value in six.iteritems(self.count_map))) __rmul__ = __mul__ @@ -232,7 +232,8 @@ class ToCountMap(object): def __str__(self): return "\n".join( "%s: %s" % (k, v) - for k, v in six.iteritems(self.count_map)) + for k, v in sorted(six.iteritems(self.count_map), + key=lambda k: str(k))) def __len__(self): return len(self.count_map) @@ -501,11 +502,13 @@ class ToCountPolynomialMap(ToCountMap): #TODO test and document def eval(self, params): - result = self.copy() - for key, val in self.items(): - result[key] = val.eval_with_dict(params) - result.val_type = int - return result + raise NotImplementedError() + # FIXME: Not sure what you are trying to achieve here. + # result = self.copy() + # for key, val in self.items(): + # result[key] = val.eval_with_dict(params) + # result.val_type = int + # return result def eval_and_sum(self, params=None): """Add all counts and evaluate with provided parameter dict *params* @@ -575,6 +578,18 @@ def subst_into_to_count_map(space, tcm, subst_dict): # }}} +def stringify_stats_mapping(m): + + from warnings import warn + warn("stringify_stats_mapping is deprecated and will be removed in 2020." 
+ " Use ToCountMap.__str__() instead.", DeprecationWarning, stacklevel=2) + + result = "" + for key in sorted(m.keys(), key=lambda k: str(k)): + result += ("%s : %s\n" % (key, m[key])) + return result + + # {{{ CountGranularity class CountGranularity(object): @@ -810,8 +825,10 @@ class CounterBase(CombineMapper): from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl, callables_table) - self.zero = get_kernel_zero_pwqpolynomial(self.knl) - self.one = self.zero + 1 + zero_qpoly = isl.QPolynomial.zero_on_domain(self.param_space) + one_qpoly = zero_qpoly + 1 + self.zero = isl.PwQPolynomial.from_qpolynomial(zero_qpoly) + self.one = isl.PwQPolynomial.from_qpolynomial(one_qpoly) @property @memoize_method @@ -840,7 +857,6 @@ class CounterBase(CombineMapper): if isinstance(clbl, CallableKernel): sub_result = self.kernel_rec(clbl.subkernel) - assert len(clbl.subkernel.args) == len(expr.parameters) arg_dict = dict( (arg.name, value) for arg, value in zip( @@ -911,7 +927,8 @@ class ExpressionOpCounter(CounterBase): self.count_within_subscripts = count_within_subscripts # FIXME: Revert to SUBGROUP - arithmetic_count_granularity = CountGranularity.WORKITEM + # KK: Trying that now... + arithmetic_count_granularity = CountGranularity.SUBGROUP def combine(self, values): return sum(values) @@ -1179,7 +1196,9 @@ class MemAccessCounterBase(CounterBase): class LocalMemAccessCounter(MemAccessCounterBase): # FIXME: Revert to SUBGROUP - local_mem_count_granularity = CountGranularity.WORKITEM + # KK: Trying that now... + # local_mem_count_granularity = CountGranularity.WORKITEM + local_mem_count_granularity = CountGranularity.SUBGROUP def count_var_access(self, dtype, name, index): count_map = {} @@ -1280,7 +1299,8 @@ class GlobalMemAccessCounter(MemAccessCounterBase): self.knl, array, index_tuple) # FIXME: Revert to subgroup - global_access_count_granularity = CountGranularity.WORKITEM + # global_access_count_granularity = CountGranularity.WORKITEM + global_access_count_granularity = CountGranularity.SUBGROUP # Account for broadcasts once per subgroup count_granularity = CountGranularity.WORKITEM if ( @@ -1734,6 +1754,16 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, count_within_subscripts=count_within_subscripts, subgroup_size=subgroup_size) + # FIXME: Maybe we want this, but the current structure of + # ToCountPolynomialMap doesn't allow it. 
+ return sum(_get_op_map_for_single_kernel( + clbl.subkernel, program.callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) for clbl in + program.callables_table.values() if isinstance(clbl, + CallableKernel)) + # }}} diff --git a/test/test_statistics.py b/test/test_statistics.py index cadca9fc1..ef5450599 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -67,12 +67,15 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == f32div == n*m*ell*n_subgroups @@ -99,8 +102,9 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == n*m*ell*n_subgroups @@ -134,11 +138,13 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) - f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == n*m*n_subgroups @@ -172,17 +178,21 @@ def test_op_counter_specialops(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params) - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params) - f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params) - f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP) + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, 
knl.name)].eval_with_dict( + params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP, knl.name)].eval_with_dict( + params) + f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP) + f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP, knl.name) ].eval_with_dict(params) - f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP) + f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32div == 2*n*m*ell*n_subgroups @@ -270,7 +280,7 @@ def test_op_counter_triangular_domain(): knl, subgroup_size=SGS, count_redundant_work=True - )[lp.Op(np.float64, 'mul', CG.SUBGROUP)] + )[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -316,22 +326,26 @@ def test_mem_access_counter_basic(): f32l = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64l = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64l += mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -341,12 +355,14 @@ def test_mem_access_counter_basic(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={}, gid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64), lid_strides={}, gid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -380,12 +396,14 @@ def test_mem_access_counter_reduction(): f32l = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32l += mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -394,7 +412,8 @@ def test_mem_access_counter_reduction(): f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={}, gid_strides={}, direction='store', 
variable='c', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -483,22 +502,26 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32 += mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64), lid_strides={}, gid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64), lid_strides={}, gid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -508,12 +531,14 @@ def test_mem_access_counter_specialops(): f32 = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64 = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -560,22 +585,26 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='load', variable='a', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='load', variable='b', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='load', variable='g', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32), lid_strides={}, gid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -584,12 +613,14 @@ def test_mem_access_counter_bitwise(): i32 = mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='store', variable='c', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) i32 += mem_map[lp.MemAccess('global', np.int32, lid_strides={}, gid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -631,31 +662,36 @@ def test_mem_access_counter_mixed(): f64uniform = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='g', - 
count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f64uniform += mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='load', variable='h', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32uniform = mem_map[lp.MemAccess('global', np.float32, lid_strides={}, gid_strides={}, direction='load', variable='x', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*group_size_0}, direction='load', variable='a', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32), lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*group_size_0}, direction='load', variable='b', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -682,14 +718,16 @@ def test_mem_access_counter_mixed(): f64uniform = mem_map[lp.MemAccess('global', np.float64, lid_strides={}, gid_strides={}, direction='store', variable='e', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess('global', np.float32, lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*group_size_0}, direction='store', variable='c', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) # uniform: (count-per-sub-group)*n_subgroups @@ -732,30 +770,32 @@ def test_mem_access_counter_nonconsec(): gid_strides={0: Variable('m')*lsize0}, direction='load', variable='g', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f64nonconsec += mem_map[lp.MemAccess('global', np.float64, lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*lsize0}, direction='load', variable='h', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( 'global', np.dtype(np.float32), lid_strides={0: Variable('m')*Variable('ell')}, gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='a', - count_granularity=CG.WORKITEM - ) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map[lp.MemAccess( 'global', np.dtype(np.float32), lid_strides={0: Variable('m')*Variable('ell')}, gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='b', - count_granularity=CG.WORKITEM - ) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -765,15 +805,16 @@ def test_mem_access_counter_nonconsec(): gid_strides={0: Variable('m')*lsize0}, direction='store', variable='e', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map[lp.MemAccess( 'global', np.float32, lid_strides={0: Variable('m')*Variable('ell')}, gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='store', variable='c', - count_granularity=CG.WORKITEM - ) + 
count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64nonconsec == n*m assert f32nonconsec == n*m*ell @@ -786,7 +827,8 @@ def test_mem_access_counter_nonconsec(): lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*lsize0}, direction='load', variable='g', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f64nonconsec += mem_map64[lp.MemAccess( 'global', @@ -794,7 +836,8 @@ def test_mem_access_counter_nonconsec(): lid_strides={0: Variable('m')}, gid_strides={0: Variable('m')*lsize0}, direction='load', variable='h', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec = mem_map64[lp.MemAccess( 'global', @@ -803,7 +846,8 @@ def test_mem_access_counter_nonconsec(): gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='a', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32nonconsec += mem_map64[lp.MemAccess( 'global', @@ -812,7 +856,8 @@ def test_mem_access_counter_nonconsec(): gid_strides={0: Variable('m')*Variable('ell')*lsize0}, direction='load', variable='b', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64nonconsec == 2*n*m assert f32nonconsec == 3*n*m*ell @@ -844,27 +889,31 @@ def test_mem_access_counter_consec(): 'global', np.float64, lid_strides={0: 1}, gid_strides={0: Variable('m')}, direction='load', variable='g', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f64consec += mem_map[lp.MemAccess( 'global', np.float64, lid_strides={0: 1}, gid_strides={0: Variable('m')}, direction='load', variable='h', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( 'global', np.float32, lid_strides={0: 1}, gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, direction='load', variable='a', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32consec += mem_map[lp.MemAccess( 'global', np.dtype(np.float32), lid_strides={0: 1}, gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, direction='load', variable='b', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64consec == 2*n*m*ell assert f32consec == 3*n*m*ell @@ -873,14 +922,16 @@ def test_mem_access_counter_consec(): 'global', np.float64, lid_strides={0: 1}, gid_strides={0: Variable('m')}, direction='store', variable='e', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32consec = mem_map[lp.MemAccess( 'global', np.float32, lid_strides={0: 1}, gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')}, direction='store', variable='c', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f64consec == n*m*ell assert f32consec == n*m*ell @@ -1006,16 +1057,16 @@ def test_all_counters_parallel_matmul(): op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, 'mul', CG.SUBGROUP) + lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, 
'add', CG.SUBGROUP) + lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, 'add', CG.SUBGROUP) + lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name) ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP) + lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP, knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1028,13 +1079,15 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable('ell')}, gid_strides={1: bsize}, direction='load', variable='b', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, lid_strides={0: 1, 1: Variable('m')}, gid_strides={0: Variable('m')*bsize}, direction='load', - variable='a', count_granularity=CG.WORKITEM) + variable='a', count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -1044,7 +1097,8 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable('ell')}, gid_strides={0: Variable('ell')*bsize, 1: bsize}, direction='store', variable='c', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f32coal == n*ell @@ -1063,14 +1117,16 @@ def test_all_counters_parallel_matmul(): lid_strides={1: 16}, gid_strides={}, variable='a_fetch', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load', lid_strides={0: 1}, gid_strides={}, variable='b_fetch', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1158,7 +1214,8 @@ def test_mem_access_tagged_variables(): gid_strides={1: bsize}, direction='load', variable='b', variable_tag='mmbload', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess('global', np.float32, lid_strides={1: Variable('m')}, @@ -1166,7 +1223,8 @@ def test_mem_access_tagged_variables(): direction='load', variable='a', variable_tag='mmaload', - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name=knl.name) ].eval_with_dict(params) assert f32s1lb == n*m*ell @@ -1179,7 +1237,8 @@ def test_mem_access_tagged_variables(): gid_strides={0: Variable('ell')*bsize, 1: bsize}, direction='store', variable='c', variable_tag='mmresult', - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name=knl.name) ].eval_with_dict(params) assert f32coal == n*ell -- GitLab From 20d9310fc2faa35c2f6fd483a21f98b9b9b94a01 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Sep 2019 11:05:47 -0500 Subject: [PATCH 574/916] removes unnecessary comments --- loopy/statistics.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 92ea5f696..f9a4b62bc 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -218,7 +218,7 @@ class ToCountMap(object): def __mul__(self, other): return self.copy(dict( - (index, other*value) + (index, value*other) for index, value in six.iteritems(self.count_map))) __rmul__ = __mul__ @@ -503,7 +503,7 @@ class ToCountPolynomialMap(ToCountMap): #TODO test and document def eval(self, params): raise 
NotImplementedError() - # FIXME: Not sure what you are trying to achieve here. + # FIXME: Not sure what's the goal here, I get a PyLint error. # result = self.copy() # for key, val in self.items(): # result[key] = val.eval_with_dict(params) @@ -926,7 +926,7 @@ class ExpressionOpCounter(CounterBase): knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts - # FIXME: Revert to SUBGROUP + # FIXME(AK): Revert to SUBGROUP # KK: Trying that now... arithmetic_count_granularity = CountGranularity.SUBGROUP @@ -1195,7 +1195,7 @@ class MemAccessCounterBase(CounterBase): # {{{ LocalMemAccessCounter class LocalMemAccessCounter(MemAccessCounterBase): - # FIXME: Revert to SUBGROUP + # FIXME(AK): Revert to SUBGROUP # KK: Trying that now... # local_mem_count_granularity = CountGranularity.WORKITEM local_mem_count_granularity = CountGranularity.SUBGROUP @@ -1298,7 +1298,7 @@ class GlobalMemAccessCounter(MemAccessCounterBase): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - # FIXME: Revert to subgroup + # FIXME(AK): Revert to subgroup # global_access_count_granularity = CountGranularity.WORKITEM global_access_count_granularity = CountGranularity.SUBGROUP @@ -1754,16 +1754,6 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, count_within_subscripts=count_within_subscripts, subgroup_size=subgroup_size) - # FIXME: Maybe we want this, but the current structure of - # ToCountPolynomialMap doesn't allow it. - return sum(_get_op_map_for_single_kernel( - clbl.subkernel, program.callables_table, - count_redundant_work=count_redundant_work, - count_within_subscripts=count_within_subscripts, - subgroup_size=subgroup_size) for clbl in - program.callables_table.values() if isinstance(clbl, - CallableKernel)) - # }}} -- GitLab From 1f90b5590cdf4e3eca32cbbfb1926ff7fc65dba9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Sep 2019 20:01:42 -0500 Subject: [PATCH 575/916] removes unhelpful comments --- loopy/statistics.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index f9a4b62bc..39f43ef5d 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -926,8 +926,6 @@ class ExpressionOpCounter(CounterBase): knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts - # FIXME(AK): Revert to SUBGROUP - # KK: Trying that now... arithmetic_count_granularity = CountGranularity.SUBGROUP def combine(self, values): @@ -1195,9 +1193,6 @@ class MemAccessCounterBase(CounterBase): # {{{ LocalMemAccessCounter class LocalMemAccessCounter(MemAccessCounterBase): - # FIXME(AK): Revert to SUBGROUP - # KK: Trying that now... 
- # local_mem_count_granularity = CountGranularity.WORKITEM local_mem_count_granularity = CountGranularity.SUBGROUP def count_var_access(self, dtype, name, index): @@ -1298,8 +1293,6 @@ class GlobalMemAccessCounter(MemAccessCounterBase): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - # FIXME(AK): Revert to subgroup - # global_access_count_granularity = CountGranularity.WORKITEM global_access_count_granularity = CountGranularity.SUBGROUP # Account for broadcasts once per subgroup -- GitLab From e86a16d4cfb26c79f01fe2c7a4ec244f04c3cfc0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Sep 2019 00:10:05 -0500 Subject: [PATCH 576/916] removes `eval`, since no one uses it and its not documented --- loopy/statistics.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 39f43ef5d..06ca06283 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -500,16 +500,6 @@ class ToCountPolynomialMap(ToCountMap): return type(self)(space, count_map) - #TODO test and document - def eval(self, params): - raise NotImplementedError() - # FIXME: Not sure what's the goal here, I get a PyLint error. - # result = self.copy() - # for key, val in self.items(): - # result[key] = val.eval_with_dict(params) - # result.val_type = int - # return result - def eval_and_sum(self, params=None): """Add all counts and evaluate with provided parameter dict *params* -- GitLab From b7e98ffa321b9f6063ecb8d518c6b11d6f675056 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Sep 2019 15:14:25 -0500 Subject: [PATCH 577/916] reverts back pwqpolynomial initialization --- loopy/statistics.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 06ca06283..86f39e55b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -814,11 +814,8 @@ class CounterBase(CombineMapper): from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl, callables_table) - - zero_qpoly = isl.QPolynomial.zero_on_domain(self.param_space) - one_qpoly = zero_qpoly + 1 - self.zero = isl.PwQPolynomial.from_qpolynomial(zero_qpoly) - self.one = isl.PwQPolynomial.from_qpolynomial(one_qpoly) + self.zero = get_kernel_zero_pwqpolynomial(self.knl) + self.one = self.zero + 1 @property @memoize_method -- GitLab From a8aa6521358255d3e5ede0bfb5968552e66503f0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Sep 2019 23:25:40 -0500 Subject: [PATCH 578/916] Merge 'kernel_callables_v3' into 'kernel_callables_v3-edit1' --- doc/tutorial.rst | 4 +- .../fortran/ipython-integration-demo.ipynb | 17 +- examples/fortran/matmul.floopy | 4 +- examples/fortran/sparse.floopy | 4 +- examples/fortran/tagging.floopy | 4 +- examples/fortran/volumeKernel.floopy | 4 +- loopy/__init__.py | 14 +- loopy/check.py | 8 +- loopy/frontend/fortran/__init__.py | 53 ++++- loopy/ipython_ext.py | 2 +- loopy/kernel/creation.py | 94 ++++---- loopy/kernel/instruction.py | 4 +- loopy/library/reduction.py | 193 ++++++++++++---- loopy/preprocess.py | 216 ++++++++++-------- loopy/program.py | 64 +++--- loopy/symbolic.py | 12 +- loopy/target/opencl.py | 16 +- loopy/transform/callable.py | 32 ++- loopy/transform/fusion.py | 5 + loopy/type_inference.py | 2 +- test/test_callables.py | 71 +++--- test/test_fortran.py | 8 +- test/test_numa_diff.py | 20 +- 23 files changed, 520 insertions(+), 331 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index befa5e30b..e6ef54b66 
100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1157,7 +1157,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) + loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) The syntax for a inserting a global barrier instruction is ``... gbarrier``. :mod:`loopy` also supports manually inserting local @@ -1554,7 +1554,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup) : ... + Op(np:dtype('float32'), add, subgroup, loopy_kernel) : ... Each line of output will look roughly like:: diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index 7a5c8257b..1b0a9df8d 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -62,9 +62,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "split_amount = 128" @@ -91,7 +89,7 @@ "\n", "!$loopy begin\n", "!\n", - "! tr_fill, = lp.parse_fortran(SOURCE)\n", + "! tr_fill = lp.parse_fortran(SOURCE)\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", "! RESULT = [tr_fill]\n", @@ -107,15 +105,6 @@ "source": [ "print(tr_fill)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -134,7 +123,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 4b3552204..a8377bedd 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -13,7 +13,7 @@ subroutine dgemm(m,n,l,alpha,a,b,c) end subroutine !$loopy begin -! dgemm, = lp.parse_fortran(SOURCE, FILENAME) +! dgemm = lp.parse_fortran(SOURCE, FILENAME) ! dgemm = lp.split_iname(dgemm, "i", 16, ! outer_tag="g.0", inner_tag="l.1") ! dgemm = lp.split_iname(dgemm, "j", 8, @@ -24,5 +24,5 @@ end subroutine ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") ! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") ! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") -! RESULT = [dgemm] +! RESULT = dgemm !$loopy end diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy index 18542e6b0..2b156bdd7 100644 --- a/examples/fortran/sparse.floopy +++ b/examples/fortran/sparse.floopy @@ -23,11 +23,11 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) end !$loopy begin -! sparse, = lp.parse_fortran(SOURCE, FILENAME) +! sparse = lp.parse_fortran(SOURCE, FILENAME) ! sparse = lp.split_iname(sparse, "i", 128) ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"}) ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"}) ! 
sparse = lp.split_iname(sparse, "j", 4) ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"}) -! RESULT = [sparse] +! RESULT = sparse !$loopy end diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy index 87aacba68..c7ebb7566 100644 --- a/examples/fortran/tagging.floopy +++ b/examples/fortran/tagging.floopy @@ -23,13 +23,13 @@ end ! "factor 4.0", ! "real_type real*8", ! ]) -! fill, = lp.parse_fortran(SOURCE, FILENAME) +! fill = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1") ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") -! RESULT = [fill] +! RESULT = fill ! !$loopy end diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy index c5784b634..211c38049 100644 --- a/examples/fortran/volumeKernel.floopy +++ b/examples/fortran/volumeKernel.floopy @@ -67,7 +67,7 @@ end subroutine volumeKernel !$loopy begin ! -! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) +! volumeKernel = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.split_iname(volumeKernel, ! "e", 32, outer_tag="g.1", inner_tag="g.0") ! volumeKernel = lp.fix_parameters(volumeKernel, @@ -76,6 +76,6 @@ end subroutine volumeKernel ! i="l.0", j="l.1", k="l.2", ! i_1="l.0", j_1="l.1", k_1="l.2" ! )) -! RESULT = [volumeKernel] +! RESULT = volumeKernel ! !$loopy end diff --git a/loopy/__init__.py b/loopy/__init__.py index 1439cb1ff..058bc93ef 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -130,10 +130,10 @@ from loopy.type_inference import infer_unknown_types from loopy.preprocess import (preprocess_kernel, realize_reduction, preprocess_program) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (ToCountMap, CountGranularity, - Op, MemAccess, get_op_map, get_mem_access_map, - get_synchronization_map, - gather_access_footprints, gather_access_footprint_bytes) +from loopy.statistics import (ToCountMap, ToCountPolynomialMap, + CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_map, + get_mem_access_map, get_synchronization_map, + gather_access_footprints, gather_access_footprint_bytes, Sync) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -269,9 +269,11 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "CountGranularity", "Op", - "MemAccess", "get_op_map", "get_mem_access_map", "get_synchronization_map", + "ToCountMap", "ToCountPolynomialMap", "CountGranularity", + "stringify_stats_mapping", "Op", "MemAccess", "get_op_map", + "get_mem_access_map", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", + "Sync", "CompiledKernel", diff --git a/loopy/check.py b/loopy/check.py index d1ee125df..83e4fd0af 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -33,8 +33,6 @@ from loopy.type_inference import TypeInferenceMapper from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) -from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction) from functools import reduce import logging @@ -145,9 +143,9 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper): return self.rec(expr.aggregate) -def check_for_integer_subscript_indices(kernel): +def check_for_integer_subscript_indices(kernel, callables_table): from pymbolic.primitives 
import Subscript - idx_int_checker = SubscriptIndicesIsIntChecker(kernel) + idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): idx_int_checker(insn.expression, return_tuple=isinstance(insn, @@ -763,7 +761,7 @@ def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) - check_for_integer_subscript_indices(kernel) + check_for_integer_subscript_indices(kernel, callables_table) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel, callables_table) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 3516ca29a..74c1ebf54 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -241,10 +241,54 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] +def _add_assignees_to_calls(knl, all_kernels): + new_insns = [] + subroutine_dict = dict((kernel.name, kernel) for kernel in all_kernels) + from loopy.kernel.instruction import (Assignment, CallInstruction, + CInstruction, _DataObliviousInstruction, + modify_assignee_for_array_call) + from pymbolic.primitives import Call, Variable + + for insn in knl.instructions: + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + insn.expression.function.name in subroutine_dict): + assignees = [] + new_params = [] + subroutine = subroutine_dict[insn.expression.function.name] + for par, arg in zip(insn.expression.parameters, subroutine.args): + if arg.name in subroutine.get_written_variables(): + par = modify_assignee_for_array_call(par) + assignees.append(par) + if arg.name in subroutine.get_read_variables(): + new_params.append(par) + if arg.name not in (subroutine.get_written_variables() | + subroutine.get_read_variables()): + new_params.append(par) + + new_insns.append( + insn.copy( + assignees=tuple(assignees), + expression=Variable( + insn.expression.function.name)(*new_params))) + else: + new_insns.append(insn) + pass + elif isinstance(insn, (Assignment, CInstruction, + _DataObliviousInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError(type(insn).__name__) + + return knl.copy(instructions=new_insns) + + def parse_fortran(source, filename="", free_form=None, strict=None, - seq_dependencies=None, auto_dependencies=None, target=None): + seq_dependencies=None, auto_dependencies=None, target=None, + return_list_of_knls=False): """ - :returns: a :class:`loopy.Program` + :returns: an instance of :class:`list` of :class:`loopy.LoopKernel`s if + *return_list_of_knls* is True else a :class:`loopy.Program`. 
""" parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) @@ -286,6 +330,11 @@ def parse_fortran(source, filename="", free_form=None, strict=None, kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + if return_list_of_knls: + return kernels + + kernels = [_add_assignees_to_calls(knl, kernels) for knl in kernels] + from loopy.kernel.tools import identify_root_kernel from loopy.program import make_program from loopy.transform.callable import register_callable_kernel diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index ec1b10f1f..e44b183ed 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -9,7 +9,7 @@ import loopy as lp class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): - result = lp.parse_fortran(cell) + result = lp.parse_fortran(cell, return_list_of_knls=True) for knl in result: self.shell.user_ns[knl.name] = knl diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1f896bb97..f36a90575 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -37,6 +37,7 @@ from loopy.kernel.data import ( SubstitutionRule, AddressSpace, ValueArg) from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, CallInstruction) +from loopy.program import iterate_over_kernels_if_given_program from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -1753,6 +1754,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic +@iterate_over_kernels_if_given_program def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) @@ -2175,56 +2177,55 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - if not is_callee_kernel: - from loopy.version import LANGUAGE_VERSION_SYMBOLS + from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = dict( + (getattr(loopy.version, lvs), lvs) + for lvs in LANGUAGE_VERSION_SYMBOLS) - lang_version = kwargs.pop("lang_version", None) - if lang_version is None: - # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION + lang_version = kwargs.pop("lang_version", None) + if lang_version is None: + # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION - # This *is* gross. But it seems like the right thing interface-wise. - import inspect - caller_globals = inspect.currentframe().f_back.f_globals + # This *is* gross. But it seems like the right thing interface-wise. + import inspect + caller_globals = inspect.currentframe().f_back.f_globals - for ver_sym in LANGUAGE_VERSION_SYMBOLS: - try: - lang_version = caller_globals[ver_sym] - break - except KeyError: - pass + for ver_sym in LANGUAGE_VERSION_SYMBOLS: + try: + lang_version = caller_globals[ver_sym] + break + except KeyError: + pass - # }}} + # }}} - if lang_version is None: - from warnings import warn - from loopy.diagnostic import LoopyWarning - from loopy.version import ( - MOST_RECENT_LANGUAGE_VERSION, - FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " - "To avoid this warning, pass " - "lang_version={ver} in this invocation. 
" - "(Or say 'from loopy.version import " - "{sym_ver}' in " - "the global scope of the calling frame.)" - .format( - ver=MOST_RECENT_LANGUAGE_VERSION, - sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] - ), - LoopyWarning, stacklevel=2) - - lang_version = FALLBACK_LANGUAGE_VERSION - - if lang_version not in version_to_symbol: - raise LoopyError("Language version '%s' is not known." % (lang_version,)) - if lang_version >= (2018, 1): - options = options.copy(enforce_variable_access_ordered=True) - if lang_version >= (2018, 2): - options = options.copy(ignore_boostable_into=True) + if lang_version is None: + from warnings import warn + from loopy.diagnostic import LoopyWarning + from loopy.version import ( + MOST_RECENT_LANGUAGE_VERSION, + FALLBACK_LANGUAGE_VERSION) + warn("'lang_version' was not passed to make_kernel(). " + "To avoid this warning, pass " + "lang_version={ver} in this invocation. " + "(Or say 'from loopy.version import " + "{sym_ver}' in " + "the global scope of the calling frame.)" + .format( + ver=MOST_RECENT_LANGUAGE_VERSION, + sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] + ), + LoopyWarning, stacklevel=2) + + lang_version = FALLBACK_LANGUAGE_VERSION + + if lang_version not in version_to_symbol: + raise LoopyError("Language version '%s' is not known." % (lang_version,)) + if lang_version >= (2018, 1): + options = options.copy(enforce_variable_access_ordered=True) + if lang_version >= (2018, 2): + options = options.copy(ignore_boostable_into=True) # }}} @@ -2382,11 +2383,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): - lang_version = kwargs.pop('lang_version', None) - if lang_version: - raise LoopyError("lang_version should be set for program, not " - "functions.") - kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 9d85f5e84..1ba0dc7ec 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1208,7 +1208,7 @@ def is_array_call(assignees, expression): return False -def modify_assignee_assignee_for_array_call(assignee): +def modify_assignee_for_array_call(assignee): """ Converts the assignee subscript or variable as a SubArrayRef. """ @@ -1258,7 +1258,7 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): # assignee as an instance of SubArrayRef. 
If not given as a # SubArrayRef return CallInstruction( - assignees=tuple(modify_assignee_assignee_for_array_call( + assignees=tuple(modify_assignee_for_array_call( assignee) for assignee in assignees), expression=expression, temp_var_types=temp_var_types, diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6c6a0dd9b..504493f4d 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -51,7 +51,7 @@ class ReductionOperation(object): def arg_count(self): raise NotImplementedError - def neutral_element(self, *dtypes): + def neutral_element(self, dtypes, callables_table, target): raise NotImplementedError def __hash__(self): @@ -84,9 +84,6 @@ class ReductionOperation(object): raise LoopyError("unable to parse reduction type: '%s'" % op_type) - def get_scalar_callables(self): - return frozenset() - class ScalarReductionOperation(ReductionOperation): def __init__(self, forced_result_type=None): @@ -128,29 +125,43 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. - return 0 + from loopy import auto + if dtype not in [None, auto] and dtype.numpy_dtype.kind == 'f': + return 0.0, callables_table - def __call__(self, dtype, operand1, operand2): - return operand1 + operand2 + return 0, callables_table + + def __call__(self, dtype, operand1, operand2, callables_table, target): + return operand1 + operand2, callables_table class ProductReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. 
- return 1 + from loopy import auto + if dtype not in [None, auto] and dtype.numpy_dtype.kind == 'f': + return 1.0, callables_table - def __call__(self, dtype, operand1, operand2): - return operand1 * operand2 + return 1, callables_table + + def __call__(self, dtype, operand1, operand2, callables_table, target): + return operand1 * operand2, callables_table def get_le_neutral(dtype): """Return a number y that satisfies (x <= y) for all y.""" if dtype.numpy_dtype.kind == "f": - # OpenCL 1.1, section 6.11.2 - return var("INFINITY") + # OpenCL 1.2, section 6.12.2 + if dtype.numpy_dtype.itemsize == 4: + #float + return var("INFINITY") + elif dtype.numpy_dtype.itemsize == 8: + #double + return var("HUGE_VAL") + elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: @@ -167,8 +178,13 @@ def get_ge_neutral(dtype): """Return a number y that satisfies (x >= y) for all y.""" if dtype.numpy_dtype.kind == "f": - # OpenCL 1.1, section 6.11.2 - return -var("INFINITY") + # OpenCL 1.2, section 6.12.2 + if dtype.numpy_dtype.itemsize == 4: + #float + return -var("INFINITY") + elif dtype.numpy_dtype.itemsize == 8: + #double + return -var("HUGE_VAL") elif dtype.numpy_dtype.kind == "i": # OpenCL 1.1, section 6.11.3 if dtype.numpy_dtype.itemsize == 4: @@ -182,25 +198,53 @@ def get_ge_neutral(dtype): class MaxReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): - return get_ge_neutral(dtype) + def neutral_element(self, dtype, callables_table, target): + return get_ge_neutral(dtype), callables_table - def __call__(self, dtype, operand1, operand2): - return ResolvedFunction("max")(operand1, operand2) + def __call__(self, dtype, operand1, operand2, callables_table, target): + dtype, = dtype + + # getting the callable 'max' from target + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + max_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, "max") + + # type specialize the callable + max_scalar_callable, callables_table = max_scalar_callable.with_types( + {0: dtype, 1: dtype}, None, callables_table) - def get_scalar_callables(self): - return frozenset(["max"]) + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + 'max', max_scalar_callable) + + return ResolvedFunction(func_id)(operand1, operand2), callables_table class MinReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): - return get_le_neutral(dtype) + def neutral_element(self, dtype, callables_table, target): + return get_le_neutral(dtype), callables_table - def __call__(self, dtype, operand1, operand2): - return ResolvedFunction("min")(operand1, operand2) + def __call__(self, dtype, operand1, operand2, callables_table, target): + dtype, = dtype + + # getting the callable 'max' from target + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + min_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, "min") + + # type specialize the callable + min_scalar_callable, callables_table = min_scalar_callable.with_types( + {0: dtype, 1: dtype}, None, callables_table) - def get_scalar_callables(self): - return frozenset(["min"]) + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + 'min', min_scalar_callable) + + return 
ResolvedFunction(func_id)(operand1, operand2), callables_table # {{{ base class for symbolic reduction ops @@ -259,10 +303,26 @@ class _SegmentedScalarReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, segment_flag_dtype.numpy_dtype.type.__name__) - def neutral_element(self, scalar_dtype, segment_flag_dtype): - scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype) - return ResolvedFunction("make_tuple")(scalar_neutral_element, - segment_flag_dtype.numpy_dtype.type(0)) + def neutral_element(self, scalar_dtype, segment_flag_dtype, + callables_table, target): + scalar_neutral_element, calables_table = ( + self.inner_reduction.neutral_element( + scalar_dtype, callables_table, target)) + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + make_tuple_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, "make_tuple") + make_tuple_scalar_callable, _ = ( + make_tuple_scalar_callable.with_types( + dict(enumerate([scalar_dtype, segment_flag_dtype])), None, + None)) + callables_table, func_id = callables_table.with_added_callable( + "make_tuple", make_tuple_scalar_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + segment_flag_dtype.numpy_dtype.type(0)), callables_table def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) @@ -277,11 +337,27 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def __eq__(self, other): return type(self) == type(other) - def __call__(self, dtypes, operand1, operand2): - return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2)) + def __call__(self, dtypes, operand1, operand2, callables_table, target): + # getting the callable 'max' from target + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + segmented_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, SegmentedOp(self)) + + # type specialize the callable + segmented_scalar_callable, callables_table = ( + segmented_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + None, callables_table)) - def get_scalar_callables(self): - return frozenset(["make_tuple", SegmentedOp(self)]) + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + SegmentedOp(self), segmented_scalar_callable) + + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): @@ -335,12 +411,27 @@ class _ArgExtremumReductionOperation(ReductionOperation): def result_dtypes(self, kernel, scalar_dtype, index_dtype): return (scalar_dtype, index_dtype) - def neutral_element(self, scalar_dtype, index_dtype): + def neutral_element(self, scalar_dtype, index_dtype, callables_table, + target): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return ResolvedFunction("make_tuple")(scalar_neutral_element, - index_dtype.numpy_dtype.type(-1)) + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + + make_tuple_scalar_callable = find_in_knl_callable_from_identifier( + 
_default_func_id_to_kernel_callable_mappers(target), + target, "make_tuple") + make_tuple_scalar_callable, _ = ( + make_tuple_scalar_callable.with_types( + dict(enumerate([scalar_dtype, index_dtype])), None, + None)) + callables_table, func_id = callables_table.with_added_callable( + "make_tuple", make_tuple_scalar_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + index_dtype.numpy_dtype.type(-1)), callables_table def __str__(self): return self.which @@ -355,11 +446,27 @@ class _ArgExtremumReductionOperation(ReductionOperation): def arg_count(self): return 2 - def __call__(self, dtypes, operand1, operand2): - return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2)) + def __call__(self, dtypes, operand1, operand2, callables_table, target): + # getting the callable 'max' from target + + from loopy.program import (find_in_knl_callable_from_identifier, + _default_func_id_to_kernel_callable_mappers) + arg_ext_scalar_callable = find_in_knl_callable_from_identifier( + _default_func_id_to_kernel_callable_mappers(target), + target, ArgExtOp(self)) + + # type specialize the callable + arg_ext_scalar_callable, callables_table = ( + arg_ext_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + None, callables_table)) + + # populate callables_table + callables_table, func_id = callables_table.with_added_callable( + ArgExtOp(self), arg_ext_scalar_callable) - def get_scalar_callables(self): - return frozenset([self.which, "make_tuple", ArgExtOp(self)]) + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index de620ef9a..c6b69da83 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -38,8 +38,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.symbolic import RuleAwareIdentityMapper - +from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) from loopy.program import Program, iterate_over_kernels_if_given_program @@ -899,6 +898,18 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} +class RealizeReductionCallbackMapper(ReductionCallbackMapper): + def __init__(self, callback, callables_table): + super(RealizeReductionCallbackMapper, self).__init__( + callback) + self.callables_table = callables_table + + def map_reduction(self, expr, **kwargs): + result, self.callables_table = self.callback(expr, self.rec, + **kwargs) + return result + + def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): @@ -1046,13 +1057,16 @@ def realize_reduction_for_single_kernel(kernel, callables_table, init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - 
expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, predicates=insn.predicates,) generated_insns.append(init_insn) @@ -1087,13 +1101,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table, else: reduction_expr = expr.expr + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + reduction_expr, + callables_table, + kernel.target) + reduction_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - reduction_expr), + expression=expression, depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final, @@ -1105,9 +1123,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} @@ -1190,7 +1208,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, base_iname_deps = outer_insn_inames - frozenset(expr.inames) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element(*arg_dtypes, + callables_table=callables_table, target=kernel.target) init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) init_insn = make_assignment( id=init_id, @@ -1243,17 +1262,20 @@ def realize_reduction_for_single_kernel(kernel, callables_table, reduction_expr = expr.expr transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname)) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar( + neutral_var_names, + tuple(var(nvn) for nvn in neutral_var_names)), + reduction_expr, + callables_table, + kernel.target) transfer_insn = make_assignment( id=transfer_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar( - neutral_var_names, - tuple(var(nvn) for nvn in neutral_var_names)), - reduction_expr), + expression=expression, within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), @@ -1282,22 +1304,26 @@ def realize_reduction_for_single_kernel(kernel, callables_table, new_iname_tags[stage_exec_iname] = kernel.iname_tags(red_iname) stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + ( + var(stage_exec_iname) + new_size,)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + stage_insn = make_assignment( id=stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + ( - var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars))), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1318,9 +1344,10 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if 
nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (0,)] + return acc_vars[0][outer_local_iname_vars + (0,)], callables_table else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + return [acc_var[outer_local_iname_vars + (0,)] for acc_var in + acc_vars], callables_table # }}} # {{{ utils (stateful) @@ -1414,6 +1441,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if global_barrier is not None: init_insn_depends_on |= frozenset([global_barrier]) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, @@ -1421,7 +1451,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, predicates=insn.predicates, ) @@ -1440,13 +1470,17 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if insn.within_inames_is_final: update_insn_iname_deps = insn.within_inames | set([track_iname]) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs), + callables_table, + kernel.target) + scan_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - _strip_if_scalar(acc_vars, updated_inner_exprs)), + expression=expression, depends_on=frozenset(update_insn_depends_on), within_inames=update_insn_iname_deps, no_sync_with=insn.no_sync_with, @@ -1460,9 +1494,9 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} @@ -1536,7 +1570,8 @@ def realize_reduction_for_single_kernel(kernel, callables_table, base_iname_deps = (outer_insn_inames - frozenset(expr.inames) - frozenset([sweep_iname])) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) init_insn_depends_on = insn.depends_on @@ -1635,19 +1670,23 @@ def realize_reduction_for_single_kernel(kernel, callables_table, write_stage_id = insn_id_gen( "scan_%s_write_stage_%d" % (scan_iname, istage)) + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, read_vars), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + write_stage_insn = make_assignment( id=write_stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, read_vars), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)) - ), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1668,10 +1707,11 @@ def realize_reduction_for_single_kernel(kernel, callables_table, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars 
+ (output_idx,)] + return (acc_vars[0][outer_local_iname_vars + (output_idx,)], + callables_table) else: return [acc_var[outer_local_iname_vars + (output_idx,)] - for acc_var in acc_vars] + for acc_var in acc_vars], callables_table # }}} @@ -1765,7 +1805,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # to reduce over. It's rather similar to an array with () shape in # numpy.) - return expr.expr + return expr.expr, callables_table # }}} @@ -1833,8 +1873,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # }}} - from loopy.symbolic import ReductionCallbackMapper - cb_mapper = ReductionCallbackMapper(map_reduction) + cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table) insn_queue = kernel.instructions[:] insn_id_replacements = {} @@ -1862,13 +1901,14 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # Run reduction expansion. from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: + # FIXME[KK]: With the new mapper emitting callables_table + # something should be done. new_expressions = cb_mapper(insn.expression, callables_table=callables_table, nresults=nresults) else: - new_expressions = ( - cb_mapper(insn.expression, - callables_table=callables_table),) + new_expressions = cb_mapper(insn.expression, + callables_table=callables_table), if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1955,32 +1995,28 @@ def realize_reduction_for_single_kernel(kernel, callables_table, _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) - return kernel + return kernel, cb_mapper.callables_table def realize_reduction(program, *args, **kwargs): assert isinstance(program, Program) - new_resolved_functions = {} - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = realize_reduction_for_single_kernel( - in_knl_callable.subkernel, program.callables_table, - *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable + callables_table = program.callables_table.copy() + kernels_to_scan = [in_knl_callable.subkernel for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel)] + + for knl in kernels_to_scan: + new_knl, callables_table = realize_reduction_for_single_kernel( + knl, callables_table, *args, **kwargs) + in_knl_callable = callables_table[knl.name].copy( + subkernel=new_knl) + resolved_functions = callables_table.resolved_functions.copy() + resolved_functions[knl.name] = in_knl_callable + callables_table = callables_table.copy( + resolved_functions=resolved_functions) - new_callables_table = program.callables_table.copy( - resolved_functions=new_resolved_functions) - return program.copy(callables_table=new_callables_table) + return program.copy(callables_table=callables_table) # }}} @@ -2338,9 +2374,6 @@ def preprocess_single_kernel(kernel, callables_table, device=None): # }}} - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) - # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. 
@@ -2348,20 +2381,6 @@ def preprocess_single_kernel(kernel, callables_table, device=None): check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) - from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel) - - # Ordering restrictions: - # - # - realize_reduction must happen after type inference because it needs - # to be able to determine the types of the reduced expressions. - # - # - realize_reduction must happen after default dependencies are added - # because it manipulates the depends_on field, which could prevent - # defaults from being applied. - kernel = realize_reduction_for_single_kernel(kernel, - callables_table, unknown_types_ok=False) - # Ordering restriction: # add_axes_to_temporaries_for_ilp because reduction accumulators # need to be duplicated by this. @@ -2451,6 +2470,23 @@ def preprocess_program(program, device=None): program = infer_unknown_types(program, expect_completion=False) + from loopy.transform.subst import expand_subst + program = expand_subst(program) + + from loopy.kernel.creation import apply_single_writer_depencency_heuristic + program = apply_single_writer_depencency_heuristic(program) + + # Ordering restrictions: + # + # - realize_reduction must happen after type inference because it needs + # to be able to determine the types of the reduced expressions. + # + # - realize_reduction must happen after default dependencies are added + # because it manipulates the depends_on field, which could prevent + # defaults from being applied. + + program = realize_reduction(program, unknown_types_ok=False) + # {{{ preprocess callable kernels # Callable editing restrictions: diff --git a/loopy/program.py b/loopy/program.py index 1fb691531..191a13fa1 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -56,6 +56,25 @@ __doc__ = """ """ +def find_in_knl_callable_from_identifier( + function_id_to_in_knl_callable_mappers, target, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if the + :arg:`identifier` is known to any kernel function scoper, otherwise returns + *None*. + """ + for func_id_to_in_knl_callable_mapper in ( + function_id_to_in_knl_callable_mappers): + # fixme: do we really need to given target for the function + in_knl_callable = func_id_to_in_knl_callable_mapper( + target, identifier) + if in_knl_callable is not None: + return in_knl_callable + + return None + + class ResolvedFunctionMarker(RuleAwareIdentityMapper): """ Mapper to convert the ``function`` attribute of a @@ -82,23 +101,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): self.function_id_to_in_knl_callable_mappers = ( function_id_to_in_knl_callable_mappers) - def find_in_knl_callable_from_identifier(self, identifier): - """ - Returns an instance of - :class:`loopy.kernel.function_interface.InKernelCallable` if the - :arg:`identifier` is known to any kernel function scoper, otherwise returns - *None*. 
- """ - for func_id_to_in_knl_callable_mapper in ( - self.function_id_to_in_knl_callable_mappers): - # fixme: do we really need to given target for the function - in_knl_callable = func_id_to_in_knl_callable_mapper( - self.kernel.target, identifier) - if in_knl_callable is not None: - return in_knl_callable - - return None - def map_call(self, expr, expn_state): from loopy.symbolic import parse_tagged_name @@ -117,7 +119,9 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): if not isinstance(expr.function, ResolvedFunction): # search the kernel for the function. - in_knl_callable = self.find_in_knl_callable_from_identifier( + in_knl_callable = find_in_knl_callable_from_identifier( + self.function_id_to_in_knl_callable_mappers, + self.kernel.target, expr.function.name) if in_knl_callable: @@ -140,16 +144,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, expn_state) - def map_reduction(self, expr, expn_state): - for func_id in ( - expr.operation.get_scalar_callables()): - in_knl_callable = self.find_in_knl_callable_from_identifier(func_id) - assert in_knl_callable is not None - self.callables_table, _ = ( - self.callables_table.with_added_callable(func_id, - in_knl_callable)) - return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) - def _default_func_id_to_kernel_callable_mappers(target): """ @@ -525,8 +519,7 @@ class CallablesCountingMapper(CombineMapper): map_call_with_kwargs = map_call def map_reduction(self, expr): - return Counter(expr.operation.get_scalar_callables()) + ( - super(CallablesCountingMapper, self).map_reduction(expr)) + return super(CallablesCountingMapper, self).map_reduction(expr) def map_constant(self, expr): return Counter() @@ -774,13 +767,18 @@ class CallablesTable(ImmutableRecord): # {{{ non-edit mode if not self.is_being_edited: - if function.name in self.resolved_functions and ( - self.resolved_functions[function.name] == in_kernel_callable): + if isinstance(function, ReductionOpFunction): + function_name = function + else: + function_name = function.name + + if function_name in self.resolved_functions and ( + self.resolved_functions[function_name] == in_kernel_callable): # if not being edited, check that the given function is # equal to the old version of the callable. return self, function else: - print('Old: ', self.resolved_functions[function.name]) + print('Old: ', self.resolved_functions[function_name]) print('New: ', in_kernel_callable) raise LoopyError("Use 'with_enter_edit_callables_mode' first.") diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 6f3c6f2be..870f9fc2c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -719,7 +719,7 @@ class RuleArgument(LoopyExpressionBase): mapper_method = intern("map_rule_argument") -class ResolvedFunction(p.Expression): +class ResolvedFunction(LoopyExpressionBase): """ A function invocation whose definition is known in a :mod:`loopy` kernel. 
Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression @@ -758,8 +758,8 @@ class ResolvedFunction(p.Expression): def __getinitargs__(self): return (self.function, ) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_resolved_function") @@ -807,7 +807,7 @@ class SweptInameStrideCollector(CoefficientCollectorBase): return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) -class SubArrayRef(p.Expression): +class SubArrayRef(LoopyExpressionBase): """ An algebraic expression to map an affine memory layout pattern (known as sub-arary) as consecutive elements of the sweeping axes which are defined @@ -871,8 +871,8 @@ class SubArrayRef(p.Expression): and other.subscript == self.subscript and other.swept_inames == self.swept_inames) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_sub_array_ref") diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 10161378b..82478a268 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -25,6 +25,7 @@ THE SOFTWARE. """ import numpy as np +import six from loopy.target.c import CTarget, CASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper @@ -183,14 +184,17 @@ class OpenCLCallable(ScalarCallable): return ( self.copy(arg_id_to_dtype=arg_id_to_dtype), callables_table) - dtype = np.find_common_type( + common_dtype = np.find_common_type( [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() if (id >= 0 and dtype is not None)]) - if dtype.kind in ['u', 'i', 'f']: - if dtype.kind == 'f': + if common_dtype.kind in ['u', 'i', 'f']: + if common_dtype.kind == 'f': name = 'f'+name - dtype = NumpyType(dtype) + + target = [dtype.target for dtype in six.itervalues(arg_id_to_dtype) + if (id >= 0 and dtype is not None)][0] + dtype = NumpyType(common_dtype, target) return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), @@ -198,7 +202,7 @@ class OpenCLCallable(ScalarCallable): else: # Unsupported type. 
raise LoopyError("%s function not supported for the types %s" % - (name, dtype)) + (name, common_dtype)) if name == "dot": for id in arg_id_to_dtype: @@ -319,6 +323,8 @@ def opencl_symbol_mangler(kernel, name): return NumpyType(np.dtype(np.int32)), name elif name.startswith("LONG_"): return NumpyType(np.dtype(np.int64)), name + elif name == "HUGE_VAL": + return NumpyType(np.dtype(np.float64)), name else: return None diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 479843697..7534818d7 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -50,7 +50,7 @@ __doc__ = """ # {{{ register function lookup -def _resolved_callables_from_function_lookup(program, +def _resolve_callables_from_function_lookup(program, func_id_to_in_kernel_callable_mapper): """ Returns a copy of *program* with the expression nodes marked "Resolved" @@ -124,7 +124,7 @@ def register_function_id_to_in_knl_callable_mapper(program, new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( [func_id_to_in_knl_callable_mapper]) - program = _resolved_callables_from_function_lookup(program, + program = _resolve_callables_from_function_lookup(program, func_id_to_in_knl_callable_mapper) new_program = program.copy( @@ -173,11 +173,17 @@ def register_callable_kernel(program, callee_kernel): # the number of assigness in the callee kernel intructions. expected_num_assignees = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_written_variables()]) - expected_num_parameters = len([arg for arg in callee_kernel.args if + expected_max_num_parameters = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_read_variables()]) + len( [arg for arg in callee_kernel.args if arg.name not in (callee_kernel.get_read_variables() | callee_kernel.get_written_variables())]) + expected_min_num_parameters = len([arg for arg in callee_kernel.args if + arg.name in callee_kernel.get_read_variables() and arg.name not in + callee_kernel.get_written_variables()]) + len( + [arg for arg in callee_kernel.args if arg.name not in + (callee_kernel.get_read_variables() | + callee_kernel.get_written_variables())]) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel @@ -195,11 +201,21 @@ def register_callable_kernel(program, callee_kernel): "match." % ( callee_kernel.name, insn.id)) if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of " - "parameters in instruction %s do not match." - % (callee_kernel.name, insn.id)) + kw_parameters.values())) > expected_max_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' exceed" + " the max. number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." + % (insn.id, callee_kernel.name)) + if len(insn.expression.parameters+tuple( + kw_parameters.values())) < expected_min_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' is less than" + " the min. number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." 
+ % (insn.id, callee_kernel.name)) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 9b83f242b..45e9c0a06 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -419,6 +419,11 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + from loopy.program import make_program + + programs = [make_program(knl) if isinstance(knl, LoopKernel) else knl for + knl in programs] + # all the resolved functions in programs must be registered in # main_callables_table main_prog_callables_info = ( diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 281dcb43d..2101fd2fc 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -998,7 +998,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, # functions if _instruction_missed_during_inference(insn): type_inf_mapper(insn.expression, - return_tuple=len(insn.assignees) > 1, + return_tuple=len(insn.assignees) != 1, return_dtype_set=True) elif isinstance(insn, (_DataObliviousInstruction, lp.CInstruction)): diff --git a/test/test_callables.py b/test/test_callables.py index f2f3acbd6..731593ea3 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -63,38 +63,35 @@ def test_register_function_lookup(ctx_factory): def test_register_knl(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) grandchild_knl = lp.make_function( - "{[i, j]:0<= i, j< 16}", + "{[i, j]:0<= i, j< 4}", """ c[i, j] = 2*a[i, j] + 3*b[i, j] """, name='linear_combo1') child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) """, name='linear_combo2') parent_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """, kernel_data=[ lp.GlobalArg( - name='x', + name='x, y', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -115,36 +112,29 @@ def test_register_knl(ctx_factory, inline): def test_slices_with_negative_step(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") parent_knl = lp.make_kernel( - "{[i, k, m]: 0<=i, k, m<16}", + "{[i, k, m]: 0<=i, k, m<4}", """ - z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], y[i, :, k, :, m]) """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -163,7 +153,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 2 + n = 4 a_dev = 
cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) @@ -215,27 +205,27 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 5 + n = 4 x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) callee_knl = lp.make_function( - "{[i, j]:0<=i, j < 32}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name='linear_combo') - callee_knl = lp.split_iname(callee_knl, "i", 2, inner_tag="l.0", outer_tag="g.0") + callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0", outer_tag="g.0") caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<32}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """ ) - caller_knl = lp.split_iname(caller_knl, "i", 8, inner_tag="l.1", outer_tag="g.1") + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") knl = lp.register_callable_kernel( caller_knl, callee_knl) @@ -252,8 +242,8 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): x_host = x_dev.get() y_host = y_dev.get() - assert gsize == (16, 4) - assert lsize == (2, 8) + assert gsize == (4, 1) + assert lsize == (1, 4) assert np.linalg.norm(2*x_host+3*y_host-out['z'].get())/np.linalg.norm( 2*x_host+3*y_host) < 1e-15 @@ -484,13 +474,13 @@ def test_empty_sub_array_refs(ctx_factory, inline): def test_array_inputs_to_callee_kernels(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 2 ** 3 x = np.random.rand(n, n) y = np.random.rand(n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 8}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") @@ -502,17 +492,10 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline): """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16)), '...'], + shape=(n, n)), + '...'] ) knl = lp.register_callable_kernel( diff --git a/test/test_fortran.py b/test/test_fortran.py index 437199810..1ab28409b 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -533,9 +533,11 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! prg = lp.parse_fortran(SOURCE) - ! fill = prg["fill"] - ! twice = prg["twice"] + ! # FIXME: correct this after the "Module" is done. + ! # prg = lp.parse_fortran(SOURCE) + ! # fill = prg["fill"] + ! # twice = prg["twice"] + ! fill, twice = lp.parse_fortran(SOURCE, return_list_of_knls=True) ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) ! 
RESULT = knl diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 1ba44e77e..55a2d2e11 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -60,7 +60,8 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa source = source.replace("datafloat", "real*4") hsv_r, hsv_s = [ - knl for knl in lp.parse_fortran(source, filename, seq_dependencies=False) + knl for knl in lp.parse_fortran(source, filename, + seq_dependencies=False, return_list_of_knls=True) if "KernelR" in knl.name or "KernelS" in knl.name ] hsv_r = lp.tag_instructions(hsv_r, "rknl") @@ -229,6 +230,15 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa hsv = tap_hsv + hsv = lp.set_options(hsv, + ignore_boostable_into=True, + cl_build_options=[ + "-cl-denorms-are-zero", + "-cl-fast-relaxed-math", + "-cl-finite-math-only", + "-cl-mad-enable", + "-cl-no-signed-zeros"]) + if 1: print("OPS") op_map = lp.get_op_map(hsv, subgroup_size=32) @@ -238,14 +248,6 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes() print(lp.stringify_stats_mapping(gmem_map)) - hsv = lp.set_options(hsv, cl_build_options=[ - "-cl-denorms-are-zero", - "-cl-fast-relaxed-math", - "-cl-finite-math-only", - "-cl-mad-enable", - "-cl-no-signed-zeros", - ]) - # FIXME: renaming's a bit tricky in this program model. # add a simple transformation for it # hsv = hsv.copy(name="horizontalStrongVolumeKernel") -- GitLab From e4b58f04b9b941c3b27b3f9bf02bcfb142ad27c0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 18 Sep 2019 23:30:46 -0500 Subject: [PATCH 579/916] leftovers from merge conflict --- loopy/check.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index d1ee125df..83e4fd0af 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -33,8 +33,6 @@ from loopy.type_inference import TypeInferenceMapper from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) -from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction) from functools import reduce import logging @@ -145,9 +143,9 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper): return self.rec(expr.aggregate) -def check_for_integer_subscript_indices(kernel): +def check_for_integer_subscript_indices(kernel, callables_table): from pymbolic.primitives import Subscript - idx_int_checker = SubscriptIndicesIsIntChecker(kernel) + idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): idx_int_checker(insn.expression, return_tuple=isinstance(insn, @@ -763,7 +761,7 @@ def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) - check_for_integer_subscript_indices(kernel) + check_for_integer_subscript_indices(kernel, callables_table) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel, callables_table) -- GitLab From 581d15cb2abcf161ddd882e77bcb15c19bb302c1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 Sep 2019 00:06:04 -0500 Subject: [PATCH 580/916] picks callables and fortran related diff --- doc/tutorial.rst | 4 +- .../fortran/ipython-integration-demo.ipynb | 17 +--- examples/fortran/matmul.floopy | 4 +- examples/fortran/sparse.floopy | 4 +- 
examples/fortran/tagging.floopy | 4 +- examples/fortran/volumeKernel.floopy | 4 +- loopy/__init__.py | 14 +-- loopy/frontend/fortran/__init__.py | 53 ++++++++++- loopy/ipython_ext.py | 2 +- loopy/kernel/creation.py | 94 +++++++++---------- loopy/kernel/instruction.py | 4 +- loopy/symbolic.py | 12 +-- loopy/transform/callable.py | 32 +++++-- loopy/transform/fusion.py | 5 + test/test_callables.py | 71 ++++++-------- test/test_fortran.py | 8 +- test/test_numa_diff.py | 20 ++-- 17 files changed, 198 insertions(+), 154 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index befa5e30b..e6ef54b66 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1157,7 +1157,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) + loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) The syntax for a inserting a global barrier instruction is ``... gbarrier``. :mod:`loopy` also supports manually inserting local @@ -1554,7 +1554,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup) : ... + Op(np:dtype('float32'), add, subgroup, loopy_kernel) : ... Each line of output will look roughly like:: diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index 7a5c8257b..1b0a9df8d 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -62,9 +62,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "split_amount = 128" @@ -91,7 +89,7 @@ "\n", "!$loopy begin\n", "!\n", - "! tr_fill, = lp.parse_fortran(SOURCE)\n", + "! tr_fill = lp.parse_fortran(SOURCE)\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", "! RESULT = [tr_fill]\n", @@ -107,15 +105,6 @@ "source": [ "print(tr_fill)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -134,7 +123,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 4b3552204..a8377bedd 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -13,7 +13,7 @@ subroutine dgemm(m,n,l,alpha,a,b,c) end subroutine !$loopy begin -! dgemm, = lp.parse_fortran(SOURCE, FILENAME) +! dgemm = lp.parse_fortran(SOURCE, FILENAME) ! dgemm = lp.split_iname(dgemm, "i", 16, ! outer_tag="g.0", inner_tag="l.1") ! dgemm = lp.split_iname(dgemm, "j", 8, @@ -24,5 +24,5 @@ end subroutine ! dgemm = lp.extract_subst(dgemm, "b_acc", "b[i1,i2]", parameters="i1, i2") ! dgemm = lp.precompute(dgemm, "a_acc", "k_inner,i_inner", default_tag="l.auto") ! 
dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", default_tag="l.auto") -! RESULT = [dgemm] +! RESULT = dgemm !$loopy end diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy index 18542e6b0..2b156bdd7 100644 --- a/examples/fortran/sparse.floopy +++ b/examples/fortran/sparse.floopy @@ -23,11 +23,11 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) end !$loopy begin -! sparse, = lp.parse_fortran(SOURCE, FILENAME) +! sparse = lp.parse_fortran(SOURCE, FILENAME) ! sparse = lp.split_iname(sparse, "i", 128) ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"}) ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"}) ! sparse = lp.split_iname(sparse, "j", 4) ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"}) -! RESULT = [sparse] +! RESULT = sparse !$loopy end diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy index 87aacba68..c7ebb7566 100644 --- a/examples/fortran/tagging.floopy +++ b/examples/fortran/tagging.floopy @@ -23,13 +23,13 @@ end ! "factor 4.0", ! "real_type real*8", ! ]) -! fill, = lp.parse_fortran(SOURCE, FILENAME) +! fill = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1") ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") -! RESULT = [fill] +! RESULT = fill ! !$loopy end diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy index c5784b634..211c38049 100644 --- a/examples/fortran/volumeKernel.floopy +++ b/examples/fortran/volumeKernel.floopy @@ -67,7 +67,7 @@ end subroutine volumeKernel !$loopy begin ! -! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) +! volumeKernel = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.split_iname(volumeKernel, ! "e", 32, outer_tag="g.1", inner_tag="g.0") ! volumeKernel = lp.fix_parameters(volumeKernel, @@ -76,6 +76,6 @@ end subroutine volumeKernel ! i="l.0", j="l.1", k="l.2", ! i_1="l.0", j_1="l.1", k_1="l.2" ! )) -! RESULT = [volumeKernel] +! RESULT = volumeKernel ! 
!$loopy end diff --git a/loopy/__init__.py b/loopy/__init__.py index 1439cb1ff..058bc93ef 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -130,10 +130,10 @@ from loopy.type_inference import infer_unknown_types from loopy.preprocess import (preprocess_kernel, realize_reduction, preprocess_program) from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel -from loopy.statistics import (ToCountMap, CountGranularity, - Op, MemAccess, get_op_map, get_mem_access_map, - get_synchronization_map, - gather_access_footprints, gather_access_footprint_bytes) +from loopy.statistics import (ToCountMap, ToCountPolynomialMap, + CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_map, + get_mem_access_map, get_synchronization_map, + gather_access_footprints, gather_access_footprint_bytes, Sync) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -269,9 +269,11 @@ __all__ = [ "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "CountGranularity", "Op", - "MemAccess", "get_op_map", "get_mem_access_map", "get_synchronization_map", + "ToCountMap", "ToCountPolynomialMap", "CountGranularity", + "stringify_stats_mapping", "Op", "MemAccess", "get_op_map", + "get_mem_access_map", "get_synchronization_map", "gather_access_footprints", "gather_access_footprint_bytes", + "Sync", "CompiledKernel", diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 3516ca29a..74c1ebf54 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -241,10 +241,54 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] +def _add_assignees_to_calls(knl, all_kernels): + new_insns = [] + subroutine_dict = dict((kernel.name, kernel) for kernel in all_kernels) + from loopy.kernel.instruction import (Assignment, CallInstruction, + CInstruction, _DataObliviousInstruction, + modify_assignee_for_array_call) + from pymbolic.primitives import Call, Variable + + for insn in knl.instructions: + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + insn.expression.function.name in subroutine_dict): + assignees = [] + new_params = [] + subroutine = subroutine_dict[insn.expression.function.name] + for par, arg in zip(insn.expression.parameters, subroutine.args): + if arg.name in subroutine.get_written_variables(): + par = modify_assignee_for_array_call(par) + assignees.append(par) + if arg.name in subroutine.get_read_variables(): + new_params.append(par) + if arg.name not in (subroutine.get_written_variables() | + subroutine.get_read_variables()): + new_params.append(par) + + new_insns.append( + insn.copy( + assignees=tuple(assignees), + expression=Variable( + insn.expression.function.name)(*new_params))) + else: + new_insns.append(insn) + pass + elif isinstance(insn, (Assignment, CInstruction, + _DataObliviousInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError(type(insn).__name__) + + return knl.copy(instructions=new_insns) + + def parse_fortran(source, filename="", free_form=None, strict=None, - seq_dependencies=None, auto_dependencies=None, target=None): + seq_dependencies=None, auto_dependencies=None, target=None, + return_list_of_knls=False): """ - :returns: a :class:`loopy.Program` + :returns: an instance of :class:`list` of :class:`loopy.LoopKernel`s if + *return_list_of_knls* is True else a :class:`loopy.Program`. 
""" parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) @@ -286,6 +330,11 @@ def parse_fortran(source, filename="", free_form=None, strict=None, kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + if return_list_of_knls: + return kernels + + kernels = [_add_assignees_to_calls(knl, kernels) for knl in kernels] + from loopy.kernel.tools import identify_root_kernel from loopy.program import make_program from loopy.transform.callable import register_callable_kernel diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index ec1b10f1f..e44b183ed 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -9,7 +9,7 @@ import loopy as lp class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): - result = lp.parse_fortran(cell) + result = lp.parse_fortran(cell, return_list_of_knls=True) for knl in result: self.shell.user_ns[knl.name] = knl diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 1f896bb97..f36a90575 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -37,6 +37,7 @@ from loopy.kernel.data import ( SubstitutionRule, AddressSpace, ValueArg) from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, CallInstruction) +from loopy.program import iterate_over_kernels_if_given_program from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -1753,6 +1754,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic +@iterate_over_kernels_if_given_program def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) @@ -2175,56 +2177,55 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # {{{ handle kernel language version - if not is_callee_kernel: - from loopy.version import LANGUAGE_VERSION_SYMBOLS + from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = dict( + (getattr(loopy.version, lvs), lvs) + for lvs in LANGUAGE_VERSION_SYMBOLS) - lang_version = kwargs.pop("lang_version", None) - if lang_version is None: - # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION + lang_version = kwargs.pop("lang_version", None) + if lang_version is None: + # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION - # This *is* gross. But it seems like the right thing interface-wise. - import inspect - caller_globals = inspect.currentframe().f_back.f_globals + # This *is* gross. But it seems like the right thing interface-wise. + import inspect + caller_globals = inspect.currentframe().f_back.f_globals - for ver_sym in LANGUAGE_VERSION_SYMBOLS: - try: - lang_version = caller_globals[ver_sym] - break - except KeyError: - pass + for ver_sym in LANGUAGE_VERSION_SYMBOLS: + try: + lang_version = caller_globals[ver_sym] + break + except KeyError: + pass - # }}} + # }}} - if lang_version is None: - from warnings import warn - from loopy.diagnostic import LoopyWarning - from loopy.version import ( - MOST_RECENT_LANGUAGE_VERSION, - FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " - "To avoid this warning, pass " - "lang_version={ver} in this invocation. 
" - "(Or say 'from loopy.version import " - "{sym_ver}' in " - "the global scope of the calling frame.)" - .format( - ver=MOST_RECENT_LANGUAGE_VERSION, - sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] - ), - LoopyWarning, stacklevel=2) - - lang_version = FALLBACK_LANGUAGE_VERSION - - if lang_version not in version_to_symbol: - raise LoopyError("Language version '%s' is not known." % (lang_version,)) - if lang_version >= (2018, 1): - options = options.copy(enforce_variable_access_ordered=True) - if lang_version >= (2018, 2): - options = options.copy(ignore_boostable_into=True) + if lang_version is None: + from warnings import warn + from loopy.diagnostic import LoopyWarning + from loopy.version import ( + MOST_RECENT_LANGUAGE_VERSION, + FALLBACK_LANGUAGE_VERSION) + warn("'lang_version' was not passed to make_kernel(). " + "To avoid this warning, pass " + "lang_version={ver} in this invocation. " + "(Or say 'from loopy.version import " + "{sym_ver}' in " + "the global scope of the calling frame.)" + .format( + ver=MOST_RECENT_LANGUAGE_VERSION, + sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] + ), + LoopyWarning, stacklevel=2) + + lang_version = FALLBACK_LANGUAGE_VERSION + + if lang_version not in version_to_symbol: + raise LoopyError("Language version '%s' is not known." % (lang_version,)) + if lang_version >= (2018, 1): + options = options.copy(enforce_variable_access_ordered=True) + if lang_version >= (2018, 2): + options = options.copy(ignore_boostable_into=True) # }}} @@ -2382,11 +2383,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): - lang_version = kwargs.pop('lang_version', None) - if lang_version: - raise LoopyError("lang_version should be set for program, not " - "functions.") - kwargs['is_callee_kernel'] = True return make_kernel(*args, **kwargs) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 9d85f5e84..1ba0dc7ec 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1208,7 +1208,7 @@ def is_array_call(assignees, expression): return False -def modify_assignee_assignee_for_array_call(assignee): +def modify_assignee_for_array_call(assignee): """ Converts the assignee subscript or variable as a SubArrayRef. """ @@ -1258,7 +1258,7 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): # assignee as an instance of SubArrayRef. If not given as a # SubArrayRef return CallInstruction( - assignees=tuple(modify_assignee_assignee_for_array_call( + assignees=tuple(modify_assignee_for_array_call( assignee) for assignee in assignees), expression=expression, temp_var_types=temp_var_types, diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 6f3c6f2be..870f9fc2c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -719,7 +719,7 @@ class RuleArgument(LoopyExpressionBase): mapper_method = intern("map_rule_argument") -class ResolvedFunction(p.Expression): +class ResolvedFunction(LoopyExpressionBase): """ A function invocation whose definition is known in a :mod:`loopy` kernel. 
Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression @@ -758,8 +758,8 @@ class ResolvedFunction(p.Expression): def __getinitargs__(self): return (self.function, ) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_resolved_function") @@ -807,7 +807,7 @@ class SweptInameStrideCollector(CoefficientCollectorBase): return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) -class SubArrayRef(p.Expression): +class SubArrayRef(LoopyExpressionBase): """ An algebraic expression to map an affine memory layout pattern (known as sub-arary) as consecutive elements of the sweeping axes which are defined @@ -871,8 +871,8 @@ class SubArrayRef(p.Expression): and other.subscript == self.subscript and other.swept_inames == self.swept_inames) - def stringifier(self): - return StringifyMapper + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() mapper_method = intern("map_sub_array_ref") diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 479843697..7534818d7 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -50,7 +50,7 @@ __doc__ = """ # {{{ register function lookup -def _resolved_callables_from_function_lookup(program, +def _resolve_callables_from_function_lookup(program, func_id_to_in_kernel_callable_mapper): """ Returns a copy of *program* with the expression nodes marked "Resolved" @@ -124,7 +124,7 @@ def register_function_id_to_in_knl_callable_mapper(program, new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( [func_id_to_in_knl_callable_mapper]) - program = _resolved_callables_from_function_lookup(program, + program = _resolve_callables_from_function_lookup(program, func_id_to_in_knl_callable_mapper) new_program = program.copy( @@ -173,11 +173,17 @@ def register_callable_kernel(program, callee_kernel): # the number of assigness in the callee kernel intructions. expected_num_assignees = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_written_variables()]) - expected_num_parameters = len([arg for arg in callee_kernel.args if + expected_max_num_parameters = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_read_variables()]) + len( [arg for arg in callee_kernel.args if arg.name not in (callee_kernel.get_read_variables() | callee_kernel.get_written_variables())]) + expected_min_num_parameters = len([arg for arg in callee_kernel.args if + arg.name in callee_kernel.get_read_variables() and arg.name not in + callee_kernel.get_written_variables()]) + len( + [arg for arg in callee_kernel.args if arg.name not in + (callee_kernel.get_read_variables() | + callee_kernel.get_written_variables())]) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel @@ -195,11 +201,21 @@ def register_callable_kernel(program, callee_kernel): "match." % ( callee_kernel.name, insn.id)) if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_parameters: - raise LoopyError("The number of expected arguments " - "for the callee kernel %s and the number of " - "parameters in instruction %s do not match." - % (callee_kernel.name, insn.id)) + kw_parameters.values())) > expected_max_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' exceed" + " the max. 
number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." + % (insn.id, callee_kernel.name)) + if len(insn.expression.parameters+tuple( + kw_parameters.values())) < expected_min_num_parameters: + raise LoopyError("The number of" + " parameters in instruction '%s' is less than" + " the min. number of arguments possible" + " for the callee kernel '%s' => arg matching" + " not possible." + % (insn.id, callee_kernel.name)) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 9b83f242b..45e9c0a06 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -419,6 +419,11 @@ def fuse_kernels(programs, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + from loopy.program import make_program + + programs = [make_program(knl) if isinstance(knl, LoopKernel) else knl for + knl in programs] + # all the resolved functions in programs must be registered in # main_callables_table main_prog_callables_info = ( diff --git a/test/test_callables.py b/test/test_callables.py index f2f3acbd6..731593ea3 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -63,38 +63,35 @@ def test_register_function_lookup(ctx_factory): def test_register_knl(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) grandchild_knl = lp.make_function( - "{[i, j]:0<= i, j< 16}", + "{[i, j]:0<= i, j< 4}", """ c[i, j] = 2*a[i, j] + 3*b[i, j] """, name='linear_combo1') child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) """, name='linear_combo2') parent_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """, kernel_data=[ lp.GlobalArg( - name='x', + name='x, y', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -115,36 +112,29 @@ def test_register_knl(ctx_factory, inline): def test_slices_with_negative_step(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 4 x = np.random.rand(n, n, n, n, n) y = np.random.rand(n, n, n, n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") parent_knl = lp.make_kernel( - "{[i, k, m]: 0<=i, k, m<16}", + "{[i, k, m]: 0<=i, k, m<4}", """ - z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], y[i, :, k, :, m]) """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16, 16, 16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(n, n, n, n, n)), + '...'] ) knl = lp.register_callable_kernel( @@ -163,7 +153,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 2 + n = 4 a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) b_dev = cl.clrandom.rand(queue, (n, n, n, 
n, n), np.float32) @@ -215,27 +205,27 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 5 + n = 4 x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) callee_knl = lp.make_function( - "{[i, j]:0<=i, j < 32}", + "{[i, j]:0<=i, j < 4}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name='linear_combo') - callee_knl = lp.split_iname(callee_knl, "i", 2, inner_tag="l.0", outer_tag="g.0") + callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0", outer_tag="g.0") caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<32}", + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", """ [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) """ ) - caller_knl = lp.split_iname(caller_knl, "i", 8, inner_tag="l.1", outer_tag="g.1") + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") knl = lp.register_callable_kernel( caller_knl, callee_knl) @@ -252,8 +242,8 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): x_host = x_dev.get() y_host = y_dev.get() - assert gsize == (16, 4) - assert lsize == (2, 8) + assert gsize == (4, 1) + assert lsize == (1, 4) assert np.linalg.norm(2*x_host+3*y_host-out['z'].get())/np.linalg.norm( 2*x_host+3*y_host) < 1e-15 @@ -484,13 +474,13 @@ def test_empty_sub_array_refs(ctx_factory, inline): def test_array_inputs_to_callee_kernels(ctx_factory, inline): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - n = 2 ** 4 + n = 2 ** 3 x = np.random.rand(n, n) y = np.random.rand(n, n) child_knl = lp.make_function( - "{[i, j]:0<=i, j < 16}", + "{[i, j]:0<=i, j < 8}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] """, name="linear_combo") @@ -502,17 +492,10 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline): """, kernel_data=[ lp.GlobalArg( - name='x', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='y', - dtype=np.float64, - shape=(16, 16)), - lp.GlobalArg( - name='z', + name='x, y, z', dtype=np.float64, - shape=(16, 16)), '...'], + shape=(n, n)), + '...'] ) knl = lp.register_callable_kernel( diff --git a/test/test_fortran.py b/test/test_fortran.py index 437199810..1ab28409b 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -533,9 +533,11 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! prg = lp.parse_fortran(SOURCE) - ! fill = prg["fill"] - ! twice = prg["twice"] + ! # FIXME: correct this after the "Module" is done. + ! # prg = lp.parse_fortran(SOURCE) + ! # fill = prg["fill"] + ! # twice = prg["twice"] + ! fill, twice = lp.parse_fortran(SOURCE, return_list_of_knls=True) ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) ! 
RESULT = knl diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 1ba44e77e..55a2d2e11 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -60,7 +60,8 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa source = source.replace("datafloat", "real*4") hsv_r, hsv_s = [ - knl for knl in lp.parse_fortran(source, filename, seq_dependencies=False) + knl for knl in lp.parse_fortran(source, filename, + seq_dependencies=False, return_list_of_knls=True) if "KernelR" in knl.name or "KernelS" in knl.name ] hsv_r = lp.tag_instructions(hsv_r, "rknl") @@ -229,6 +230,15 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa hsv = tap_hsv + hsv = lp.set_options(hsv, + ignore_boostable_into=True, + cl_build_options=[ + "-cl-denorms-are-zero", + "-cl-fast-relaxed-math", + "-cl-finite-math-only", + "-cl-mad-enable", + "-cl-no-signed-zeros"]) + if 1: print("OPS") op_map = lp.get_op_map(hsv, subgroup_size=32) @@ -238,14 +248,6 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes() print(lp.stringify_stats_mapping(gmem_map)) - hsv = lp.set_options(hsv, cl_build_options=[ - "-cl-denorms-are-zero", - "-cl-fast-relaxed-math", - "-cl-finite-math-only", - "-cl-mad-enable", - "-cl-no-signed-zeros", - ]) - # FIXME: renaming's a bit tricky in this program model. # add a simple transformation for it # hsv = hsv.copy(name="horizontalStrongVolumeKernel") -- GitLab From 6857c4ba818ac896ee677ac4dd4c69c90bb20108 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 Sep 2019 15:32:08 -0500 Subject: [PATCH 581/916] adds some helpful comments --- loopy/frontend/fortran/__init__.py | 12 ++++++++++++ loopy/transform/callable.py | 3 +++ 2 files changed, 15 insertions(+) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 74c1ebf54..bc360b996 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -242,6 +242,18 @@ def parse_transformed_fortran(source, free_form=True, strict=True, def _add_assignees_to_calls(knl, all_kernels): + """ + Returns a copy of *knl* coming from the fortran parser adjusted to the + loopy specification that written variables of a call must appear in the + assignee. + + :param knl: An instance of :class:`loopy.LoopKernel`, which have incorrect + calls to the kernels in *all_kernels* by stuffing both the input and + output arguments into parameters. + + :param all_kernels: An instance of :class:`list` of loopy kernels which + may be called by *kernel*. + """ new_insns = [] subroutine_dict = dict((kernel.name, kernel) for kernel in all_kernels) from loopy.kernel.instruction import (Assignment, CallInstruction, diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 7534818d7..e0f4a79d7 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -173,6 +173,9 @@ def register_callable_kernel(program, callee_kernel): # the number of assigness in the callee kernel intructions. 
expected_num_assignees = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_written_variables()]) + + # can only predict the range of actual number of parameters to a kernel + # call, as a variable intended for pure output can be read expected_max_num_parameters = len([arg for arg in callee_kernel.args if arg.name in callee_kernel.get_read_variables()]) + len( [arg for arg in callee_kernel.args if arg.name not in -- GitLab From 4d5f37e001c63de2f3adcae79b2c19fabbc3df2d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 Sep 2019 15:32:28 -0500 Subject: [PATCH 582/916] adds in-place update test --- test/test_callables.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/test_callables.py b/test/test_callables.py index 731593ea3..ce6b89e36 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -564,6 +564,29 @@ def test_unknown_stride_to_callee(): print(lp.generate_code_v2(prog).device_code()) +def test_argument_matching_for_inplace_update(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + twice = lp.make_function( + "{[i]: 0<=i<10}", + """ + x[i] = 2*x[i] + """, name='twice') + + knl = lp.make_kernel( + "{:}", + """ + x[:] = twice(x[:]) + """, [lp.GlobalArg('x', shape=(10,), dtype=np.float64)]) + + knl = lp.register_callable_kernel(knl, twice) + + x = np.random.randn(10) + evt, (out, ) = knl(queue, np.copy(x)) + + assert np.allclose(2*x, out) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From df475fcf3c0c1ef57c26ee769d99a7e080b2f022 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 Sep 2019 17:08:46 -0500 Subject: [PATCH 583/916] KernelArgument.is_output_only -> KernelArgument.is_output --- loopy/auto_test.py | 2 +- loopy/frontend/fortran/translator.py | 2 +- loopy/target/execution.py | 2 +- loopy/transform/make_scalar.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 4bca7ebdb..b5039bd2c 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -118,7 +118,7 @@ def make_ref_args(program, impl_arg_info, queue, parameters): shape = evaluate_shape(arg.unvec_shape, parameters) dtype = kernel_arg.dtype - is_output = kernel_arg.is_output_only + is_output = kernel_arg.is_output if arg.arg_class is ImageArg: storage_array = ary = cl_array.empty( diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 66961ce70..949a3d4cc 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -763,7 +763,7 @@ class F2LoopyTranslator(FTreeWalkerBase): arg_name, dtype=sub.get_type(arg_name), shape=sub.get_loopy_shape(arg_name), - is_output_only=False, + is_output=False, )) else: kernel_data.append( diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 9d1d14376..96f6e065c 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -725,7 +725,7 @@ class KernelExecutorBase(object): self.packing_controller = SeparateArrayPackingController(program) self.output_names = tuple(arg.name for arg in self.program.args - if arg.is_output_only) + if arg.is_output) self.has_runtime_typed_args = any( arg.dtype is None diff --git a/loopy/transform/make_scalar.py b/loopy/transform/make_scalar.py index ab91fdf78..d0e7d1bc2 100644 --- a/loopy/transform/make_scalar.py +++ b/loopy/transform/make_scalar.py @@ -23,7 +23,7 @@ def make_scalar(kernel, var_name): kernel = ScalarChanger(rule_mapping_context, 
var_name).map_kernel(kernel) new_args = [ValueArg(arg.name, arg.dtype, target=arg.target, - is_output_only=arg.is_output_only) if arg.name == var_name else arg for + is_output=arg.is_output) if arg.name == var_name else arg for arg in kernel.args] new_temps = dict((tv.name, tv.copy(shape=(), dim_tags=None)) if tv.name == var_name else (tv.name, tv) for tv in -- GitLab From 71d7541dc55f5a2f2e1fefa83628543fe634ef53 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 Sep 2019 17:08:51 -0500 Subject: [PATCH 584/916] Adds a kernel argument attribute is_input - Transmits changes in the function interface so that they also use is_input while performing caller<->callee argument matching - Makes changes in the test cases so that they set is_output, is_input correctly --- loopy/kernel/creation.py | 4 ++-- loopy/kernel/data.py | 27 ++++++++++++++------- loopy/kernel/function_interface.py | 13 ++++------ loopy/kernel/tools.py | 38 +++++++++++++++++++++--------- loopy/transform/callable.py | 36 ++++++---------------------- test/test_callables.py | 16 ++++++++----- 6 files changed, 70 insertions(+), 64 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f36a90575..4be7e06b8 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2367,8 +2367,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) - from loopy.kernel.tools import infer_args_are_output_only - knl = infer_args_are_output_only(knl) + from loopy.kernel.tools import infer_args_are_input_output + knl = infer_args_are_input_output(knl) from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 4c0959111..15a77b809 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -338,7 +338,8 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype - kwargs["is_output_only"] = kwargs.pop("is_output_only", None) + kwargs["is_output"] = kwargs.pop("is_output", None) + kwargs["is_input"] = kwargs.pop("is_input", None) ImmutableRecord.__init__(self, **kwargs) @@ -351,20 +352,27 @@ class ArrayArg(ArrayBase, KernelArgument): An attribute of :class:`AddressSpace` defining the address space in which the array resides. - .. attribute:: is_output_only + .. attribute:: is_output An instance of :class:`bool`. If set to *True*, recorded to be returned from the kernel. + + .. attribute:: is_input + + An instance of :class:`bool`. If set to *True*, expected to be + provided by the user. """) allowed_extra_kwargs = [ "address_space", - "is_output_only"] + "is_output", + "is_input"] def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - kwargs["is_output_only"] = kwargs.pop("is_output_only", None) + kwargs["is_output"] = kwargs.pop("is_output", None) + kwargs["is_input"] = kwargs.pop("is_input", None) super(ArrayArg, self).__init__(*args, **kwargs) @@ -392,7 +400,8 @@ class ArrayArg(ArrayBase, KernelArgument): """ super(ArrayArg, self).update_persistent_hash(key_hash, key_builder) key_builder.rec(key_hash, self.address_space) - key_builder.rec(key_hash, self.is_output_only) + key_builder.rec(key_hash, self.is_output) + key_builder.rec(key_hash, self.is_input) # Making this a function prevents incorrect use in isinstance. 
@@ -413,7 +422,8 @@ class ConstantArg(ArrayBase, KernelArgument): max_target_axes = 1 # Constant Arg cannot be an output - is_output_only = False + is_output = False + is_input = True def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape, @@ -436,13 +446,14 @@ class ImageArg(ArrayBase, KernelArgument): class ValueArg(KernelArgument): def __init__(self, name, dtype=None, approximately=1000, target=None, - is_output_only=False): + is_output=False, is_input=True): KernelArgument.__init__(self, name=name, dtype=dtype, approximately=approximately, target=target, - is_output_only=is_output_only) + is_output=is_output, + is_input=is_input) def __str__(self): import loopy as lp diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index d8c120db8..4b2d18ec5 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -226,16 +226,13 @@ def get_kw_pos_association(kernel): write_count = -1 for arg in kernel.args: - if arg.name in kernel.get_written_variables(): + if arg.is_output: kw_to_pos[arg.name] = write_count pos_to_kw[write_count] = arg.name write_count -= 1 - if arg.name in kernel.get_read_variables(): - kw_to_pos[arg.name] = read_count - pos_to_kw[read_count] = arg.name - read_count += 1 - if not (arg.name in kernel.get_read_variables() or arg.name in - kernel.get_written_variables()): + if arg.is_input: + # if an argument is both input and output then the input is given + # more significance in kw_to_pos kw_to_pos[arg.name] = read_count pos_to_kw[read_count] = arg.name read_count += 1 @@ -862,7 +859,7 @@ class CallableKernel(InKernelCallable): # insert the assignees at the required positions assignee_write_count = -1 for i, arg in enumerate(self.subkernel.args): - if arg.is_output_only: + if arg.is_output and not arg.is_input: assignee = assignees[-assignee_write_count-1] parameters.insert(i, assignee) par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index e311fcc0f..46d70c054 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1923,34 +1923,50 @@ def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,): # {{{ direction helper tools -def infer_args_are_output_only(kernel): +def infer_args_are_input_output(kernel): """ - Returns a copy of *kernel* with the attribute ``is_output_only`` set. + Returns a copy of *kernel* with the attributes ``is_input`` and + ``is_output`` of the arguments set. .. note:: - If the attribute ``is_output_only`` is not supplied from an user, then - infers it as an output argument if it is written at some point in the - kernel. + If the attribute ``is_output`` of an argument is not supplied from an + user, then it is inferred as an output argument if it is written at + some point in the kernel. + + If the attribute ``is_input`` of an argument of is not supplied from + an user, then it is inferred as an input argument if it is either read + at some point in the kernel or it is neither read nor written. 
""" from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] for arg in kernel.args: if isinstance(arg, (ArrayArg, ImageArg, ValueArg)): - if arg.is_output_only is not None: - assert isinstance(arg.is_output_only, bool) - new_args.append(arg) + if arg.is_output is not None: + assert isinstance(arg.is_output, bool) else: if arg.name in kernel.get_written_variables(): - new_args.append(arg.copy(is_output_only=True)) + arg = arg.copy(is_output=True) + else: + arg = arg.copy(is_output=False) + + if arg.is_input is not None: + assert isinstance(arg.is_input, bool) + else: + if arg.name in kernel.get_read_variables() or ( + (arg.name not in kernel.get_read_variables()) and ( + arg.name not in kernel.get_written_variables())): + arg = arg.copy(is_input=True) else: - new_args.append(arg.copy(is_output_only=False)) + arg = arg.copy(is_input=False) elif isinstance(arg, ConstantArg): - new_args.append(arg) + pass else: raise NotImplementedError("Unkonwn argument type %s." % type(arg)) + new_args.append(arg) + return kernel.copy(args=new_args) # }}} diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index e0f4a79d7..05866a105 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -171,22 +171,8 @@ def register_callable_kernel(program, callee_kernel): # check to make sure that the variables with 'out' direction is equal to # the number of assigness in the callee kernel intructions. - expected_num_assignees = len([arg for arg in callee_kernel.args if - arg.name in callee_kernel.get_written_variables()]) - - # can only predict the range of actual number of parameters to a kernel - # call, as a variable intended for pure output can be read - expected_max_num_parameters = len([arg for arg in callee_kernel.args if - arg.name in callee_kernel.get_read_variables()]) + len( - [arg for arg in callee_kernel.args if arg.name not in - (callee_kernel.get_read_variables() | - callee_kernel.get_written_variables())]) - expected_min_num_parameters = len([arg for arg in callee_kernel.args if - arg.name in callee_kernel.get_read_variables() and arg.name not in - callee_kernel.get_written_variables()]) + len( - [arg for arg in callee_kernel.args if arg.name not in - (callee_kernel.get_read_variables() | - callee_kernel.get_written_variables())]) + expected_num_assignees = sum(arg.is_output for arg in callee_kernel.args) + expected_num_arguments = sum(arg.is_input for arg in callee_kernel.args) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel @@ -204,19 +190,11 @@ def register_callable_kernel(program, callee_kernel): "match." % ( callee_kernel.name, insn.id)) if len(insn.expression.parameters+tuple( - kw_parameters.values())) > expected_max_num_parameters: + kw_parameters.values())) != expected_num_arguments: raise LoopyError("The number of" - " parameters in instruction '%s' exceed" - " the max. number of arguments possible" - " for the callee kernel '%s' => arg matching" - " not possible." - % (insn.id, callee_kernel.name)) - if len(insn.expression.parameters+tuple( - kw_parameters.values())) < expected_min_num_parameters: - raise LoopyError("The number of" - " parameters in instruction '%s' is less than" - " the min. number of arguments possible" - " for the callee kernel '%s' => arg matching" + " arguments in instruction '%s' do match" + " the number of input arguments in" + " the callee kernel '%s' => arg matching" " not possible." 
% (insn.id, callee_kernel.name)) @@ -409,7 +387,7 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): assignee_pos = 0 parameter_pos = 0 for i, arg in enumerate(callee_knl.args): - if arg.is_output_only: + if arg.is_output: arg_map[arg.name] = assignees[assignee_pos] assignee_pos += 1 else: diff --git a/test/test_callables.py b/test/test_callables.py index ce6b89e36..a241b21f2 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -327,6 +327,9 @@ def test_multi_arg_array_call(ctx_factory): lp.Assignment(id="update", assignee=acc_i, expression=p.Variable("min")(acc_i, a_i), depends_on="init1,init2")], + [ + lp.GlobalArg('acc_i, index', is_input=False, is_output=True), + "..."], name="custom_argmin") argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) @@ -403,21 +406,22 @@ def test_non_sub_array_refs_arguments(ctx_factory): from loopy.transform.callable import _match_caller_callee_argument_dimension_ callee = lp.make_function("{[i] : 0 <= i < 6}", "a[i] = a[i] + j", - [lp.GlobalArg("a", dtype="double", shape=(6,), is_output_only=False), + [lp.GlobalArg("a", dtype="double", shape=(6,), is_output=True, + is_input=True), lp.ValueArg("j", dtype="int")], name="callee") caller1 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:] = callee(a[:], b[0])", - [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output_only=False), - lp.GlobalArg("b", dtype="double", shape=(1, ), is_output_only=False)], + [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output=False), + lp.GlobalArg("b", dtype="double", shape=(1, ), is_output=False)], name="caller", target=lp.CTarget()) caller2 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], 3.1415926)", [lp.GlobalArg("a", dtype="double", shape=(6, ), - is_output_only=False)], + is_output=False)], name="caller", target=lp.CTarget()) caller3 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], kappa)", [lp.GlobalArg("a", dtype="double", shape=(6, ), - is_output_only=False), '...'], + is_output=False), '...'], name="caller", target=lp.CTarget()) registered = lp.register_callable_kernel(caller1, callee) @@ -582,7 +586,7 @@ def test_argument_matching_for_inplace_update(ctx_factory): knl = lp.register_callable_kernel(knl, twice) x = np.random.randn(10) - evt, (out, ) = knl(queue, np.copy(x)) + evt, (out, ) = knl(queue, x=np.copy(x)) assert np.allclose(2*x, out) -- GitLab From a37db7a463cbf32ee88a94a06283175aecb6f933 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 Sep 2019 20:21:01 -0500 Subject: [PATCH 585/916] fixes minor error in argument matching --- loopy/kernel/function_interface.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 4b2d18ec5..2b50a2dc9 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -859,10 +859,12 @@ class CallableKernel(InKernelCallable): # insert the assignees at the required positions assignee_write_count = -1 for i, arg in enumerate(self.subkernel.args): - if arg.is_output and not arg.is_input: - assignee = assignees[-assignee_write_count-1] - parameters.insert(i, assignee) - par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) + if arg.is_output: + if not arg.is_input: + assignee = assignees[-assignee_write_count-1] + parameters.insert(i, assignee) + par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count]) + assignee_write_count -= 1 # no type casting in array calls -- GitLab From ddbe1c97045b70446dab340b4a98ecaf139e3165 Mon 
Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 Sep 2019 20:22:33 -0500 Subject: [PATCH 586/916] check the validity of a kernel call more diligenltly --- loopy/transform/callable.py | 80 +++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 05866a105..2b888c21b 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -154,6 +154,84 @@ class _RegisterCalleeKernel(ImmutableRecord): return None +def subarrayrefs_are_equiv(sar1, sar2): + """ + Compares if two instance of :class:`loopy.symbolic.SubArrayRef`s point + to the same array region. + """ + if len(sar1.swept_inames) != len(sar2.swept_inames): + return False + + iname_map = dict(zip(sar1.swept_inames, sar2.swept_inames)) + + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + sar1_substed = SubstitutionMapper(make_subst_func(iname_map))(sar1) + + return sar1_substed == sar2 + + +def _check_correctness_of_args_and_assignees(insn, callee_kernel): + from loopy.kernel.function_interface import get_kw_pos_association + kw_to_pos, pos_to_kw = get_kw_pos_association(callee_kernel) + callee_args_to_insn_params = [[] for _ in callee_kernel.args] + expr = insn.expression + from pymbolic.primitives import Call, CallWithKwargs + if isinstance(expr, Call): + expr = CallWithKwargs(expr.function, expr.parameters, kw_parameters={}) + for i, param in enumerate(expr.parameters): + pos = kw_to_pos[callee_kernel.args[i].name] + if pos < 0: + raise LoopyError("#{} argument meant for output obtained as an" + " input in '{}'.".format(i, insn)) + + assert pos == i + + callee_args_to_insn_params[i].append(param) + + for kw, param in six.iteritems(expr.kw_parameters): + pos = kw_to_pos[kw] + if pos < 0: + raise LoopyError("KW-argument '{}' meant for output obtained as an" + " input in '{}'.".format(kw, insn)) + callee_args_to_insn_params[pos].append(param) + + num_pure_assignees = 0 + for i, assignee in enumerate(insn.assignees): + pos = kw_to_pos[pos_to_kw[-i-1]] + + if pos < 0: + pos = (len(expr.parameters) + + len(expr.kw_parameters)+num_pure_assignees) + num_pure_assignees += 1 + + callee_args_to_insn_params[pos].append(assignee) + + # TODO: Some of the checks might be redundant. + + for arg, insn_params in zip(callee_kernel.args, + callee_args_to_insn_params): + if len(insn_params) == 1: + # making sure that the argument is either only input or output + if arg.is_input == arg.is_output: + raise LoopyError("Argument '{}' in '{}' should be passed in" + " both assignees and parameters in Call.".format( + insn_params[0], insn)) + elif len(insn_params) == 2: + if arg.is_input != arg.is_output: + raise LoopyError("Found multiple parameters mapping to an" + " argument which is not both input and output in" + " ''.".format()) + if not subarrayrefs_are_equiv(insn_params[0], insn_params[1]): + raise LoopyError("'{}' and '{}' point to the same argument in" + " the callee, but are unequal.".format( + insn_params[0], insn_params[1])) + else: + raise LoopyError("Multiple(>2) arguments pointing to the same" + " argument for '{}' in '{}'.".format(callee_kernel.name, + insn)) + + def register_callable_kernel(program, callee_kernel): """Returns a copy of *caller_kernel*, which would resolve *function_name* in an expression as a call to *callee_kernel*. @@ -198,6 +276,8 @@ def register_callable_kernel(program, callee_kernel): " not possible." 
% (insn.id, callee_kernel.name)) + _check_correctness_of_args_and_assignees(insn, callee_kernel) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass -- GitLab From ed9697621aec711d8d6b2b8c0e0b38a5699a34d9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 Sep 2019 23:36:28 -0500 Subject: [PATCH 587/916] new enforcement of argument matching find some bugs in the tests! --- test/test_callables.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_callables.py b/test/test_callables.py index a241b21f2..4fe8735dc 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -260,19 +260,19 @@ def test_shape_translation_through_sub_array_ref(ctx_factory, inline): callee1 = lp.make_function( "{[i]: 0<=i<6}", """ - a[i] = 2*abs(b[i]) + b[i] = 2*abs(a[i]) """, name="callee_fn1") callee2 = lp.make_function( "{[i, j]: 0<=i<3 and 0 <= j < 2}", """ - a[i, j] = 3*b[i, j] + b[i, j] = 3*a[i, j] """, name="callee_fn2") callee3 = lp.make_function( "{[i]: 0<=i<6}", """ - a[i] = 5*b[i] + b[i] = 5*a[i] """, name="callee_fn3") knl = lp.make_kernel( @@ -328,6 +328,7 @@ def test_multi_arg_array_call(ctx_factory): expression=p.Variable("min")(acc_i, a_i), depends_on="init1,init2")], [ + lp.GlobalArg('a'), lp.GlobalArg('acc_i, index', is_input=False, is_output=True), "..."], name="custom_argmin") -- GitLab From 89efdfc96376c4bb9786f7464b5868e47447a918 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 20 Sep 2019 02:32:37 -0500 Subject: [PATCH 588/916] Fixes SubArrayRef.get_begin_subscript(..) - Fixed all the places where it was invoked. - get_begin_subscript(..) should be only called when generating code, so made sure that it is not being called at unnecessary places in :mod:`loopy`. 
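As a rough sketch of the intended behaviour (the helper name below is made up
for illustration; the bound lookup mirrors the change to loopy/symbolic.py in
this patch), the begin subscript now evaluates each swept iname at its loop
lower bound rather than at zero:

    # illustrative sketch only, not part of the diff
    from loopy.symbolic import pw_aff_to_expr

    def begin_index_of(kernel, iname):
        # inclusive lower bound of the iname's domain, as a piecewise affine
        pwaff = kernel.get_iname_bounds(iname).lower_bound_pw_aff
        return int(pw_aff_to_expr(pwaff))

    # e.g. for "[i, k]: a[i, j, k, l]" with -5 <= i < 5 and 0 <= k < 3,
    # the begin subscript becomes a[-5, j, 0, l] instead of a[0, j, 0, l].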
--- loopy/kernel/instruction.py | 3 ++- loopy/symbolic.py | 21 ++++++++++++++------- loopy/target/c/codegen/expression.py | 2 +- loopy/transform/callable.py | 3 ++- loopy/type_inference.py | 2 +- 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 1ba0dc7ec..97d0931bd 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -543,7 +543,8 @@ def _get_assignee_subscript_deps(expr): elif isinstance(expr, LinearSubscript): return get_dependencies(expr.index) elif isinstance(expr, SubArrayRef): - return get_dependencies(expr.get_begin_subscript().index) + return get_dependencies(expr.subscript.index) - ( + frozenset(iname.name for iname in expr.swept_inames)) else: raise RuntimeError("invalid lvalue '%s'" % expr) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 870f9fc2c..53d8d4431 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -198,7 +198,9 @@ class CombineMapper(CombineMapperBase): return self.rec(expr.expr, *args, **kwargs) def map_sub_array_ref(self, expr): - return self.rec(expr.get_begin_subscript()) + return self.combine(( + self.rec(expr.subscript), + self.combine(tuple(self.rec(idx) for idx in expr.swept_inames)))) map_linear_subscript = CombineMapperBase.map_subscript @@ -353,9 +355,9 @@ class DependencyMapper(DependencyMapperBase): def map_loopy_function_identifier(self, expr, *args, **kwargs): return set() - def map_sub_array_ref(self, expr, *args): - deps = self.rec(expr.subscript, *args) - return deps - set(iname for iname in expr.swept_inames) + def map_sub_array_ref(self, expr, *args, **kwargs): + deps = self.rec(expr.subscript, *args, **kwargs) + return deps - set(expr.swept_inames) map_linear_subscript = DependencyMapperBase.map_subscript @@ -845,7 +847,7 @@ class SubArrayRef(LoopyExpressionBase): self.swept_inames = swept_inames self.subscript = subscript - def get_begin_subscript(self): + def get_begin_subscript(self, kernel): """ Returns an instance of :class:`pymbolic.primitives.Subscript`, the beginning subscript of the array swept by the *SubArrayRef*. @@ -853,9 +855,14 @@ class SubArrayRef(LoopyExpressionBase): **Example:** Consider ``[i, k]: a[i, j, k, l]``. The beginning subscript would be ``a[0, j, 0, l]`` """ - # TODO: Set the zero to the minimum value of the iname. 
+ + def _get_lower_bound(iname): + pwaff = kernel.get_iname_bounds(iname).lower_bound_pw_aff + return int(pw_aff_to_expr(pwaff)) + swept_inames_to_zeros = dict( - (swept_iname.name, 0) for swept_iname in self.swept_inames) + (swept_iname.name, _get_lower_bound(swept_iname.name)) for + swept_iname in self.swept_inames) return EvaluatorWithDeficientContext(swept_inames_to_zeros)( self.subscript) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index c970901b1..5a066ddfb 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -167,7 +167,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): return var(expr.name) def map_sub_array_ref(self, expr, type_context): - return var("&")(self.rec(expr.get_begin_subscript(), + return var("&")(self.rec(expr.get_begin_subscript(self.kernel), type_context)) def map_subscript(self, expr, type_context): diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 2b888c21b..56fab7561 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -368,7 +368,8 @@ class KernelInliner(SubstitutionMapper): "constant shape.".format(callee_arg)) flatten_index = 0 - for i, idx in enumerate(sar.get_begin_subscript().index_tuple): + for i, idx in enumerate(sar.get_begin_subscript( + self.caller).index_tuple): flatten_index += idx*caller_arg.dim_tags[i].stride flatten_index += sum( diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 281dcb43d..0d4430e0d 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -692,7 +692,7 @@ class TypeInferenceMapper(CombineMapper): for rec_result in rec_results] def map_sub_array_ref(self, expr): - return self.rec(expr.get_begin_subscript()) + return self.rec(expr.subscript) # }}} -- GitLab From 50250d247d38606cf33c3948c474d063d407d034 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 20 Sep 2019 02:36:13 -0500 Subject: [PATCH 589/916] minor fixes in the tests; test for a bug when the start of the swept iname is non zero --- test/test_callables.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/test/test_callables.py b/test/test_callables.py index 4fe8735dc..04eeae666 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -364,13 +364,13 @@ def test_packing_unpacking(ctx_factory, inline): callee1 = lp.make_function( "{[i]: 0<=i<6}", """ - a[i] = 2*b[i] + b[i] = 2*a[i] """, name="callee_fn1") callee2 = lp.make_function( "{[i, j]: 0<=i<2 and 0 <= j < 3}", """ - a[i, j] = 3*b[i, j] + b[i, j] = 3*a[i, j] """, name="callee_fn2") knl = lp.make_kernel( @@ -456,8 +456,7 @@ def test_empty_sub_array_refs(ctx_factory, inline): callee = lp.make_function( "{[d]:0<=d<1}", """ - a[d] = b[d] - c[d] - + c[d] = a[d] - b[d] """, name='wence_function') caller = lp.make_kernel("{[i]: 0<=i<10}", @@ -592,6 +591,29 @@ def test_argument_matching_for_inplace_update(ctx_factory): assert np.allclose(2*x, out) +def test_non_zero_start_in_subarray_ref(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + twice = lp.make_function( + "{[i]: 0<=i<10}", + """ + b[i] = 2*a[i] + """, name='twice') + + knl = lp.make_kernel( + "{[i, j]: -5<=i<5 and 0<=j<10}", + """ + [i]:y[i+5] = twice([j]: x[j]) + """, [lp.GlobalArg('x, y', shape=(10,), dtype=np.float64)]) + + knl = lp.register_callable_kernel(knl, twice) + + x = np.random.randn(10) + evt, (out, ) = knl(queue, x=np.copy(x)) + + assert np.allclose(2*x, out) + + if __name__ == "__main__": if 
len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 74c049694a0e76ff0980cb1fa6595cdfe3c6516f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 20 Sep 2019 02:38:08 -0500 Subject: [PATCH 590/916] correctly checks if 2 sub array refs refer to the same part of arrays --- loopy/transform/callable.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 56fab7561..9c05dc97f 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -154,24 +154,20 @@ class _RegisterCalleeKernel(ImmutableRecord): return None -def subarrayrefs_are_equiv(sar1, sar2): +def subarrayrefs_are_equiv(sar1, sar2, knl): """ Compares if two instance of :class:`loopy.symbolic.SubArrayRef`s point to the same array region. """ - if len(sar1.swept_inames) != len(sar2.swept_inames): - return False - - iname_map = dict(zip(sar1.swept_inames, sar2.swept_inames)) - - from pymbolic.mapper.substitutor import make_subst_func - from loopy.symbolic import SubstitutionMapper - sar1_substed = SubstitutionMapper(make_subst_func(iname_map))(sar1) + from loopy.kernel.function_interface import get_arg_descriptor_for_expression - return sar1_substed == sar2 + return get_arg_descriptor_for_expression(knl, sar1) == ( + get_arg_descriptor_for_expression(knl, sar2)) and ( + sar1.get_begin_subscript(knl) == + sar2.get_begin_subscript(knl)) -def _check_correctness_of_args_and_assignees(insn, callee_kernel): +def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): from loopy.kernel.function_interface import get_kw_pos_association kw_to_pos, pos_to_kw = get_kw_pos_association(callee_kernel) callee_args_to_insn_params = [[] for _ in callee_kernel.args] @@ -222,7 +218,8 @@ def _check_correctness_of_args_and_assignees(insn, callee_kernel): raise LoopyError("Found multiple parameters mapping to an" " argument which is not both input and output in" " ''.".format()) - if not subarrayrefs_are_equiv(insn_params[0], insn_params[1]): + if not subarrayrefs_are_equiv(insn_params[0], insn_params[1], + caller_knl): raise LoopyError("'{}' and '{}' point to the same argument in" " the callee, but are unequal.".format( insn_params[0], insn_params[1])) @@ -276,7 +273,8 @@ def register_callable_kernel(program, callee_kernel): " not possible." 
% (insn.id, callee_kernel.name)) - _check_correctness_of_args_and_assignees(insn, callee_kernel) + _check_correctness_of_args_and_assignees(insn, + callee_kernel, caller_kernel) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): -- GitLab From 94e115a766373a801ef8350ee40281a9827e2f7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Sun, 22 Sep 2019 03:13:45 +0200 Subject: [PATCH 591/916] =?UTF-8?q?Romanize=20"Kl=C3=B6ckner"=20in=20funct?= =?UTF-8?q?ion=5Finterface.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- loopy/kernel/function_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index d8c120db8..0cb610074 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -1,6 +1,6 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2018 Andreas Klöckner, Kaushik Kulkarni" +__copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy -- GitLab From 51b25d2a029bfa7d554a83f5d0f286b2dc476aaa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 23 Sep 2019 05:11:22 -0500 Subject: [PATCH 592/916] minor fixes from the review --- loopy/kernel/data.py | 2 +- loopy/kernel/tools.py | 5 +++++ loopy/transform/callable.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 15a77b809..51367e64e 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -360,7 +360,7 @@ class ArrayArg(ArrayBase, KernelArgument): .. attribute:: is_input An instance of :class:`bool`. If set to *True*, expected to be - provided by the user. + provided by the caller. """) allowed_extra_kwargs = [ diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 46d70c054..d0e4ef084 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1965,6 +1965,11 @@ def infer_args_are_input_output(kernel): else: raise NotImplementedError("Unkonwn argument type %s." % type(arg)) + if not (arg.is_input or arg.is_output): + raise LoopyError("Kernel argument must be either input or output." + " '{}' in '{}' does not follow it.".format(arg.name, + kernel.name)) + new_args.append(arg) return kernel.copy(args=new_args) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 9c05dc97f..a87a43f4e 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -267,7 +267,7 @@ def register_callable_kernel(program, callee_kernel): if len(insn.expression.parameters+tuple( kw_parameters.values())) != expected_num_arguments: raise LoopyError("The number of" - " arguments in instruction '%s' do match" + " arguments in instruction '%s' do not match" " the number of input arguments in" " the callee kernel '%s' => arg matching" " not possible." -- GitLab From 7b4771017af6ba16b2198b01b17d66d97c528573 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 23 Sep 2019 05:26:39 -0500 Subject: [PATCH 593/916] rephrasing is_output docs --- loopy/kernel/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 51367e64e..f0d7b3789 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -354,8 +354,8 @@ class ArrayArg(ArrayBase, KernelArgument): .. 
attribute:: is_output - An instance of :class:`bool`. If set to *True*, recorded to be - returned from the kernel. + An instance of :class:`bool`. If set to *True*, the argument is used + to return information to the caller .. attribute:: is_input -- GitLab From b98f296617ff12de3365e519bf85c75baf9b19f7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 26 Sep 2019 19:38:54 -0500 Subject: [PATCH 594/916] Interface changes for registering kernel / callable - register_callable_kernel -> fuse_translation_units - register_func_id_to_in_knl_callable_mappers->register_callable --- loopy/__init__.py | 10 +- loopy/transform/callable.py | 299 +++++------------------------------- 2 files changed, 45 insertions(+), 264 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 058bc93ef..15a670583 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -51,7 +51,7 @@ from loopy.kernel.data import ( from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.program import ( - CallablesTable, Program, make_program) + Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -120,8 +120,8 @@ from loopy.transform.batch import to_batched, save_temporaries_in_loop from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier -from loopy.transform.callable import (register_callable_kernel, - register_function_id_to_in_knl_callable_mapper, inline_callable_kernel) +from loopy.transform.callable import (register_callable, + fuse_translation_units, inline_callable_kernel) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call # }}} @@ -242,8 +242,8 @@ __all__ = [ "dump_as_python", - "register_callable_kernel", - "register_function_id_to_in_knl_callable_mapper", + "register_callable", + "fuse_translation_units", "inline_callable_kernel", "pack_and_unpack_args_for_call", diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index a87a43f4e..c9baa741f 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -44,286 +44,67 @@ __doc__ = """ .. autofunction:: register_function_id_to_in_knl_callable_mapper -.. autofunction:: register_callable_kernel +.. autofunction:: fuse_translation_units """ -# {{{ register function lookup - -def _resolve_callables_from_function_lookup(program, - func_id_to_in_kernel_callable_mapper): +def register_callable(translation_unit, function_identifier, callable_, + redefining_not_ok=True): """ - Returns a copy of *program* with the expression nodes marked "Resolved" - if any match is found through the given - *func_id_to_in_kernel_callable_mapper*. - - :arg func_id_to_in_kernel_callable_mapper: A function with signature - ``(target, identifier)`` that returns either an instance of - :class:`loopy.InKernelCallable` or *None*. + :param translation_unit: A :class:`loopy.Program`. + :param callable_: A :class:`loopy.InKernelCallable`. 
""" - callables_table = program.callables_table - callable_knls = dict( - (func_id, in_knl_callable) for func_id, in_knl_callable in - callables_table.items() if isinstance(in_knl_callable, - CallableKernel)) - edited_callable_knls = {} + if isinstance(callable_, LoopKernel): + callable_ = CallableKernel(callable_) - for func_id, in_knl_callable in callable_knls.items(): - kernel = in_knl_callable.subkernel + from loopy.kernel.function_interface import InKernelCallable + assert isinstance(callable_, InKernelCallable) - from loopy.symbolic import SubstitutionRuleMappingContext - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, kernel.get_var_name_generator()) + if (function_identifier in translation_unit.callables) and ( + redefining_not_ok): + raise LoopyError("Redifining function identifier not allowed. Set the" + " option 'redefining_not_ok=False' to bypass this error.") - resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, kernel, callables_table, - [func_id_to_in_kernel_callable_mapper]) + callables = translation_unit.copy() + callables[function_identifier] = callable_ - new_subkernel = rule_mapping_context.finish_kernel( - resolved_function_marker.map_kernel(kernel)) - callables_table = resolved_function_marker.callables_table + return translation_unit.copy( + callables=callables) - edited_callable_knls[func_id] = in_knl_callable.copy( - subkernel=new_subkernel) - new_resolved_functions = {} - - for func_id, in_knl_callable in callables_table.items(): - if func_id in edited_callable_knls: - new_resolved_functions[func_id] = edited_callable_knls[func_id] - else: - new_resolved_functions[func_id] = in_knl_callable - - callables_table = callables_table.copy( - resolved_functions=new_resolved_functions) - - return program.copy(callables_table=callables_table) - - -def register_function_id_to_in_knl_callable_mapper(program, - func_id_to_in_knl_callable_mapper): +def fuse_translation_units(translation_units, collision_not_ok=True): """ - Returns a copy of *program* with the *function_lookup* registered. + :param translation_units: A list of :class:`loopy.Program`. + :param collision_not_ok: An instance of :class:`bool`. - :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target, - identifier)`` returning a - :class:`loopy.kernel.function_interface.InKernelCallable` or *None* if - the *function_identifier* is not known. + :returns: An instance of :class:`loopy.Program` which contains all the + callables from each of the *translation_units. """ - # adding the function lookup to the set of function lookers in the kernel. - if func_id_to_in_knl_callable_mapper not in ( - program.func_id_to_in_knl_callable_mappers): - from loopy.tools import unpickles_equally - if not unpickles_equally(func_id_to_in_knl_callable_mapper): - raise LoopyError("function '%s' does not " - "compare equally after being upickled " - "and would disrupt loopy's caches" - % func_id_to_in_knl_callable_mapper) - new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + ( - [func_id_to_in_knl_callable_mapper]) - - program = _resolve_callables_from_function_lookup(program, - func_id_to_in_knl_callable_mapper) - - new_program = program.copy( - func_id_to_in_knl_callable_mappers=new_func_id_mappers) - - return new_program - -# }}} - - -# {{{ register_callable_kernel - -class _RegisterCalleeKernel(ImmutableRecord): - """ - Helper class to make the function scoper from - :func:`loopy.transform.register_callable_kernel` picklable. 
As python - cannot pickle lexical closures. - """ - fields = set(['callable_kernel']) + for i in range(1, len(translation_units)): + if translation_units[i].target != translation_units[i-1].target: + raise LoopyError("fuse_translation_units should have" + " translation_units to be of the same target to be able to" + " fuse.") + callables_table = {} + for trans_unit in translation_units: + callables_table.update(trans_unit.callables_table.copy()) - def __init__(self, callable_kernel): - self.callable_kernel = callable_kernel + # {{{ - def __call__(self, target, identifier): - if identifier == self.callable_kernel.subkernel.name: - return self.callable_kernel - return None - - -def subarrayrefs_are_equiv(sar1, sar2, knl): - """ - Compares if two instance of :class:`loopy.symbolic.SubArrayRef`s point - to the same array region. - """ - from loopy.kernel.function_interface import get_arg_descriptor_for_expression - - return get_arg_descriptor_for_expression(knl, sar1) == ( - get_arg_descriptor_for_expression(knl, sar2)) and ( - sar1.get_begin_subscript(knl) == - sar2.get_begin_subscript(knl)) - - -def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): - from loopy.kernel.function_interface import get_kw_pos_association - kw_to_pos, pos_to_kw = get_kw_pos_association(callee_kernel) - callee_args_to_insn_params = [[] for _ in callee_kernel.args] - expr = insn.expression - from pymbolic.primitives import Call, CallWithKwargs - if isinstance(expr, Call): - expr = CallWithKwargs(expr.function, expr.parameters, kw_parameters={}) - for i, param in enumerate(expr.parameters): - pos = kw_to_pos[callee_kernel.args[i].name] - if pos < 0: - raise LoopyError("#{} argument meant for output obtained as an" - " input in '{}'.".format(i, insn)) - - assert pos == i - - callee_args_to_insn_params[i].append(param) - - for kw, param in six.iteritems(expr.kw_parameters): - pos = kw_to_pos[kw] - if pos < 0: - raise LoopyError("KW-argument '{}' meant for output obtained as an" - " input in '{}'.".format(kw, insn)) - callee_args_to_insn_params[pos].append(param) - - num_pure_assignees = 0 - for i, assignee in enumerate(insn.assignees): - pos = kw_to_pos[pos_to_kw[-i-1]] - - if pos < 0: - pos = (len(expr.parameters) + - len(expr.kw_parameters)+num_pure_assignees) - num_pure_assignees += 1 - - callee_args_to_insn_params[pos].append(assignee) - - # TODO: Some of the checks might be redundant. - - for arg, insn_params in zip(callee_kernel.args, - callee_args_to_insn_params): - if len(insn_params) == 1: - # making sure that the argument is either only input or output - if arg.is_input == arg.is_output: - raise LoopyError("Argument '{}' in '{}' should be passed in" - " both assignees and parameters in Call.".format( - insn_params[0], insn)) - elif len(insn_params) == 2: - if arg.is_input != arg.is_output: - raise LoopyError("Found multiple parameters mapping to an" - " argument which is not both input and output in" - " ''.".format()) - if not subarrayrefs_are_equiv(insn_params[0], insn_params[1], - caller_knl): - raise LoopyError("'{}' and '{}' point to the same argument in" - " the callee, but are unequal.".format( - insn_params[0], insn_params[1])) - else: - raise LoopyError("Multiple(>2) arguments pointing to the same" - " argument for '{}' in '{}'.".format(callee_kernel.name, - insn)) - - -def register_callable_kernel(program, callee_kernel): - """Returns a copy of *caller_kernel*, which would resolve *function_name* in an - expression as a call to *callee_kernel*. 
- - :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - :arg function_name: An instance of :class:`str`. - :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`. - """ - - # {{{ sanity checks - - assert isinstance(program, Program) - assert isinstance(callee_kernel, LoopKernel), ('{0} !=' - '{1}'.format(type(callee_kernel), LoopKernel)) - - # check to make sure that the variables with 'out' direction is equal to - # the number of assigness in the callee kernel intructions. - expected_num_assignees = sum(arg.is_output for arg in callee_kernel.args) - expected_num_arguments = sum(arg.is_input for arg in callee_kernel.args) - for in_knl_callable in program.callables_table.values(): - if isinstance(in_knl_callable, CallableKernel): - caller_kernel = in_knl_callable.subkernel - for insn in caller_kernel.instructions: - if isinstance(insn, CallInstruction) and ( - insn.expression.function.name == callee_kernel.name): - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - else: - kw_parameters = {} - if len(insn.assignees) != expected_num_assignees: - raise LoopyError("The number of arguments with 'out' " - "direction " "in callee kernel %s and the number " - "of assignees in " "instruction %s do not " - "match." % ( - callee_kernel.name, insn.id)) - if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_arguments: - raise LoopyError("The number of" - " arguments in instruction '%s' do not match" - " the number of input arguments in" - " the callee kernel '%s' => arg matching" - " not possible." - % (insn.id, callee_kernel.name)) - - _check_correctness_of_args_and_assignees(insn, - callee_kernel, caller_kernel) - - elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("unknown instruction %s" % type(insn)) - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callable type %s." % - type(in_knl_callable).__name__) + if len(callables_table) != sum(len(trans_unit.callables_table) for trans_unit in + translation_units) and collision_not_ok: + raise LoopyError("translation units in fuse_translation_units cannot" + " not contain callables with same names.") # }}} - # take the function resolvers from the Program and resolve the functions in - # the callee kernel - from loopy.symbolic import SubstitutionRuleMappingContext - rule_mapping_context = SubstitutionRuleMappingContext( - callee_kernel.substitutions, - callee_kernel.get_var_name_generator()) - - resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, callee_kernel, program.callables_table, - program.func_id_to_in_knl_callable_mappers) - - callee_kernel = rule_mapping_context.finish_kernel( - resolved_function_marker.map_kernel(callee_kernel)) - callables_table = resolved_function_marker.callables_table.copy() - - program = program.copy(callables_table=callables_table) - - # making the target of the child kernel to be same as the target of parent - # kernel. - callable_kernel = CallableKernel(subkernel=callee_kernel.copy( - target=program.target, - is_called_from_host=False)) - - # FIXME disabling global barriers for callee kernel (for now) - from loopy import set_options - callee_kernel = set_options(callee_kernel, "disable_global_barriers") - - # FIXME: the number of callables is wrong. This is horrible please - # compensate. 
- - return register_function_id_to_in_knl_callable_mapper( - program, - _RegisterCalleeKernel(callable_kernel)) - -# }}} + return Program( + entrypoints=frozenset().union(*( + t.entrypoints for t in translation_units)), + callables_table=callables_table, + target=translation_units[0].target) # {{{ kernel inliner mapper -- GitLab From 18ae117215d857d2f12217ecea74494394ab8a4d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 29 Sep 2019 16:13:32 -0500 Subject: [PATCH 595/916] Moves the mangler style function inference to CallableKernel style - Made changes on the ASTBuilderBase end to return the correct type - Made changes to the loopy callables to return the correct type --- loopy/library/function.py | 26 +++++++++++++------------- loopy/library/reduction.py | 13 ++++++++----- loopy/target/__init__.py | 12 +++++++----- loopy/target/c/__init__.py | 18 ++++++++++-------- loopy/target/cuda.py | 17 ++++++++--------- loopy/target/opencl.py | 16 ++++++++-------- loopy/target/pyopencl.py | 23 +++++++++++------------ loopy/target/python.py | 9 +++++---- 8 files changed, 70 insertions(+), 64 deletions(-) diff --git a/loopy/library/function.py b/loopy/library/function.py index 378b7de58..247d5b231 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -104,25 +104,25 @@ class IndexOfCallable(ScalarCallable): target), True -def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier): +def get_loopy_callables(): """ - Returns an instance of :class:`InKernelCallable` for the *idenitifer* - which is not present in *target*, but whose interface is given by - :mod:`loo.py`. Callables that fall in this category are -- + Returns a mapping from function ids to corresponding + :class:`loopy.kernel.function_interface.InKernelCallable` for functions + whose interface is provided by :mod:`loopy`. Callables that fall in this + category are -- - reductions leading to function calls like ``argmin``, ``argmax``. - callables that have a predefined meaning in :mod:`loo.py` like ``make_tuple``, ``index_of``, ``indexof_vec``. 
""" - if identifier == "make_tuple": - return MakeTupleCallable(name="make_tuple") - - if identifier in ["indexof", "indexof_vec"]: - return IndexOfCallable(name=identifier) - - from loopy.library.reduction import ( - reduction_func_id_to_in_knl_callable_mapper) - return reduction_func_id_to_in_knl_callable_mapper(target, identifier) + known_callables = { + "make_tuple": MakeTupleCallable(name="make_tuple"), + "indexof": IndexOfCallable(name="indexof"), + "indexof_vec": IndexOfCallable(name="indexof_vec"), + } + + from loopy.library.reduction import get_reduction_callables + return known_callables.update(get_reduction_callables()) # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 504493f4d..675db0485 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -626,11 +626,14 @@ class ReductionCallable(ScalarCallable): return -def reduction_func_id_to_in_knl_callable_mapper(target, identifier): - if isinstance(identifier, ReductionOpFunction): - return ReductionCallable(name=identifier) - - return None +def get_reduction_callables(target, identifier): + + return dict((id_, ReductionCallable(id_)) for id_ in [ + ReductionOpFunction(SegmentedSumReductionOperation), + ReductionOpFunction(SegmentedProductReductionOperation), + ReductionOpFunction(ArgMaxReductionOperation), + ReductionOpFunction(ArgMinReductionOperation), + ]) # }}} diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index f27ee4e96..fa76d4251 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -150,13 +150,15 @@ class ASTBuilderBase(object): # {{{ library - def function_id_in_knl_callable_mapper(self): + @property + def known_callables(self): """ - Returns an instance of list of the functions of signature - ``(target, identifiers)`` returning either an instance of - :class:`InKernelCallable` if a match is found or *None*. + Returns a mapping from function ids to corresponding + :class:`loopy.kernel.function_interface.InKernelCallable` for the + function ids known to *self.target*. """ - return [] + # FIXME: @inducer: Do we need to move this to TargetBase? + return {} def symbol_manglers(self): return [] diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 55125371f..5cabc796e 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -523,17 +523,17 @@ class CMathCallable(ScalarCallable): callables_table) -def scope_c_math_functions(target, identifier): +def get_c_callables(): """ Returns an instance of :class:`InKernelCallable` if the function represented by :arg:`identifier` is known in C, otherwise returns *None*. 
""" - if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", + cmath_ids = ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", - "fabs", "tan", "erf", "erfc"]: - return CMathCallable(name=identifier) - return None + "fabs", "tan", "erf", "erfc"] + + return dict((id_, CMathCallable(id_)) for id_ in cmath_ids) # }}} @@ -553,10 +553,12 @@ class CASTBuilder(ASTBuilderBase): _preamble_generator, ]) - def function_id_in_knl_callable_mapper(self): + @property + def known_callables(self): return ( - super(CASTBuilder, self).function_id_in_knl_callable_mapper() + [ - scope_c_math_functions]) + super(CASTBuilder, + self).known_callables.update( + get_c_callables())) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index dfa94f71b..b8f644ddd 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -184,12 +184,9 @@ class CudaCallable(ScalarCallable): callables_table) -def scope_cuda_functions(target, identifier): - if identifier in set(["dot"]) | set( - _CUDA_SPECIFIC_FUNCTIONS): - return CudaCallable(name=identifier) - - return None +def get_cuda_callables(): + cuda_func_ids = set(["dot"]) | set(_CUDA_SPECIFIC_FUNCTIONS) + return dict((id_, CudaCallable(name=id_)) for id_ in cuda_func_ids) # }}} @@ -312,9 +309,11 @@ class CUDACASTBuilder(CASTBuilder): # {{{ library - def function_id_in_knl_callable_mapper(self): - return [scope_cuda_functions] + ( - super(CUDACASTBuilder, self).function_id_in_knl_callable_mapper()) + @property + def known_callables(self): + return ( + super(CUDACASTBuilder, self).known_callables().update( + get_cuda_callables())) # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 82478a268..66f2c67c3 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -288,7 +288,7 @@ class OpenCLCallable(ScalarCallable): callables_table) -def scope_opencl_functions(target, identifier): +def get_opencl_callables(): """ Returns an instance of :class:`InKernelCallable` if the function defined by *identifier* is known in OpenCL. 
@@ -296,10 +296,8 @@ def scope_opencl_functions(target, identifier): opencl_function_ids = set(["max", "min", "dot"]) | set( _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) - if identifier in opencl_function_ids: - return OpenCLCallable(name=identifier) - - return None + return dict((id_, OpenCLCallable(name=id_)) for id_ in + opencl_function_ids) # }}} @@ -447,10 +445,12 @@ class OpenCLTarget(CTarget): class OpenCLCASTBuilder(CASTBuilder): # {{{ library - def function_id_in_knl_callable_mapper(self): + @property + def known_callables(self): return ( - [scope_opencl_functions] + super( - OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) + super( + OpenCLCASTBuilder, self).known_callables).update( + get_opencl_callables()) def symbol_manglers(self): return ( diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 9624a7d41..c042812e7 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -275,12 +275,10 @@ class PyOpenCLCallable(ScalarCallable): callables_table) -def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier): - if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", - "tanh", "conj", "real", "imag", "abs"]: - return PyOpenCLCallable(name=identifier) - - return None +def get_pyopencl_callables(): + pyopencl_ids = ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"] + return dict((id_, PyOpenCLCallable(name=id_)) for id_ in pyopencl_ids) # }}} @@ -796,13 +794,14 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_id_in_knl_callable_mapper(self): - from loopy.library.random123 import ( - random123_function_id_to_in_knl_callable_mapper) + @property + def known_callables(self): + from loopy.library.random123 import get_random123_callables return ( - [pyopencl_function_id_to_in_knl_callable_mapper, - random123_function_id_to_in_knl_callable_mapper] + super( - PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper()) + super( + PyOpenCLCASTBuilder, self).known_callables).update( + get_pyopencl_callables()).update( + get_random123_callables()) def preamble_generators(self): return ([ diff --git a/loopy/target/python.py b/loopy/target/python.py index 1f83112ff..b88830ab0 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -180,12 +180,13 @@ class PythonASTBuilderBase(ASTBuilderBase): # {{{ code generation guts - def function_id_in_knl_callable_mapper(self): - from loopy.target.c import scope_c_math_functions + @property + def known_callables(self): + from loopy.target.c import get_c_callables return ( super(PythonASTBuilderBase, - self).function_id_in_knl_callable_mapper() + - [scope_c_math_functions]) + self).known_callables.update( + get_c_callables())) def preamble_generators(self): return ( -- GitLab From cf8c27c63d4145242f150031482d8ebd7cf46308 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Sep 2019 18:13:41 -0500 Subject: [PATCH 596/916] correct minor error in updating dict --- loopy/library/function.py | 5 +++-- loopy/library/random123.py | 7 ++----- loopy/library/reduction.py | 3 +-- loopy/target/c/__init__.py | 7 +++---- loopy/target/cuda.py | 6 +++--- loopy/target/opencl.py | 7 +++---- 6 files changed, 15 insertions(+), 20 deletions(-) diff --git a/loopy/library/function.py b/loopy/library/function.py index 247d5b231..118b9dcc5 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -115,14 +115,15 @@ def get_loopy_callables(): - callables that have a 
predefined meaning in :mod:`loo.py` like ``make_tuple``, ``index_of``, ``indexof_vec``. """ + from loopy.library.reduction import get_reduction_callables known_callables = { "make_tuple": MakeTupleCallable(name="make_tuple"), "indexof": IndexOfCallable(name="indexof"), "indexof_vec": IndexOfCallable(name="indexof_vec"), } + known_callables.update(get_reduction_callables()) - from loopy.library.reduction import get_reduction_callables - return known_callables.update(get_reduction_callables()) + return known_callables # vim: foldmethod=marker diff --git a/loopy/library/random123.py b/loopy/library/random123.py index e59a892bb..f6fad2faa 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -231,10 +231,7 @@ class Random123Callable(ScalarCallable): return -def random123_function_id_to_in_knl_callable_mapper(target, identifier): - if identifier in FUNC_NAMES_TO_RNG: - return Random123Callable(name=identifier) - - return None +def get_random123_callables(): + return dict((id_, Random123Callable(id_)) for id_ in FUNC_NAMES_TO_RNG) # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 675db0485..9418ee282 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -626,8 +626,7 @@ class ReductionCallable(ScalarCallable): return -def get_reduction_callables(target, identifier): - +def get_reduction_callables(): return dict((id_, ReductionCallable(id_)) for id_ in [ ReductionOpFunction(SegmentedSumReductionOperation), ReductionOpFunction(SegmentedProductReductionOperation), diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 5cabc796e..82f18e56c 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -555,10 +555,9 @@ class CASTBuilder(ASTBuilderBase): @property def known_callables(self): - return ( - super(CASTBuilder, - self).known_callables.update( - get_c_callables())) + callables = super(CASTBuilder, self).known_callables + callables.update(get_c_callables()) + return callables # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index b8f644ddd..b47e6f7b2 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -311,9 +311,9 @@ class CUDACASTBuilder(CASTBuilder): @property def known_callables(self): - return ( - super(CUDACASTBuilder, self).known_callables().update( - get_cuda_callables())) + callables = super(CUDACASTBuilder, self).known_callables + callables.update(get_cuda_callables()) + return callables # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 66f2c67c3..704ad25b1 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -447,10 +447,9 @@ class OpenCLCASTBuilder(CASTBuilder): @property def known_callables(self): - return ( - super( - OpenCLCASTBuilder, self).known_callables).update( - get_opencl_callables()) + callables = super(OpenCLCASTBuilder, self).known_callables + callables.update(get_opencl_callables()) + return callables def symbol_manglers(self): return ( -- GitLab From df844889f41502bf433482a8da1e820dc7e00893 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Sep 2019 18:14:12 -0500 Subject: [PATCH 597/916] corrects strify for resolved functions --- loopy/symbolic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 53d8d4431..0397a083b 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -261,7 +261,7 @@ class StringifyMapper(StringifyMapperBase): return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE)) 
def map_resolved_function(self, expr, prec): - return expr.name + return "Resolved(%s)" % expr.name def map_sub_array_ref(self, expr, prec): return "[{inames}]: {subscr}".format( -- GitLab From 5a45df73274a7116c280eee1c0af8b3302a3d3f3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Sep 2019 18:15:07 -0500 Subject: [PATCH 598/916] Starts working on translation units with multiple entrypoints - Changes the position of resolving in the codegen pipeline - Execution objects now take "entrypoint" to identify which kernel to execute in a program. --- loopy/codegen/__init__.py | 6 +- loopy/kernel/__init__.py | 30 ++ loopy/kernel/creation.py | 10 +- loopy/program.py | 460 ++++++++++++----------------- loopy/target/execution.py | 26 +- loopy/target/pyopencl.py | 14 +- loopy/target/pyopencl_execution.py | 4 +- loopy/target/python.py | 7 +- loopy/transform/callable.py | 2 +- loopy/type_inference.py | 18 +- 10 files changed, 281 insertions(+), 296 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 70cd7cc95..4acf2ce0a 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -587,7 +587,11 @@ def generate_code_v2(program): program = make_program(program) from loopy.kernel import KernelState - if program.root_kernel.state == KernelState.INITIAL: + if program.state == KernelState.INITIAL: + # Note that we cannot have preprocessing separately for everyone. + # Since, now the preprocessing of each one depends on the other. + # So we check if any one of the callable kernels are not preprocesses + # then, we have to do the preprocessing of every other kernel. from loopy.preprocess import preprocess_program program = preprocess_program(program) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index d79308241..8c441c35e 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1433,6 +1433,36 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} + # {{{ implementation arguments + + @property + @memoize_method + def impl_arg_to_arg(self): + from loopy.kernel.array import ArrayBase + + result = {} + + for arg in self.args: + if not isinstance(arg, ArrayBase): + result[arg.name] = arg + continue + + if arg.shape is None or arg.dim_tags is None: + result[arg.name] = arg + continue + + subscripts_and_names = arg.subscripts_and_names() + if subscripts_and_names is None: + result[arg.name] = arg + continue + + for index, sub_arg_name in subscripts_and_names: + result[sub_arg_name] = arg + + return result + + # }}} + # {{{ direct execution def __call__(self, *args, **kwargs): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 4be7e06b8..c6081156f 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2151,7 +2151,6 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): target = kwargs.pop("target", None) seq_dependencies = kwargs.pop("seq_dependencies", False) fixed_parameters = kwargs.pop("fixed_parameters", {}) - is_callee_kernel = kwargs.pop("is_callee_kernel", False) if defines: from warnings import warn @@ -2375,15 +2374,12 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): creation_plog.done() - if is_callee_kernel: - return knl - else: - from loopy.program import make_program - return make_program(knl) + from loopy.program import make_program + return make_program(knl) def make_function(*args, **kwargs): - kwargs['is_callee_kernel'] = True + #FIXME: Do we need this anymore?? 
return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/program.py b/loopy/program.py index 191a13fa1..13d2ff9fd 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -30,7 +30,7 @@ from pymbolic.primitives import Variable from functools import wraps from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, - CombineMapper, SubstitutionRuleExpander) + CombineMapper, SubstitutionRuleMappingContext) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.kernel.instruction import ( @@ -40,8 +40,8 @@ from loopy.library.reduction import ReductionOpFunction from loopy.kernel import LoopKernel from loopy.tools import update_persistent_hash -from collections import Counter from pymbolic.primitives import Call, CallWithKwargs +from functools import reduce __doc__ = """ @@ -75,7 +75,8 @@ def find_in_knl_callable_from_identifier( return None -class ResolvedFunctionMarker(RuleAwareIdentityMapper): +class CallableResolver(RuleAwareIdentityMapper): + #FIXME: Recheck this! """ Mapper to convert the ``function`` attribute of a :class:`pymbolic.primitives.Call` known in the kernel as instances of @@ -93,13 +94,10 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): :arg function_ids: A container with instances of :class:`str` indicating the function identifiers to look for while scoping functions. """ - def __init__(self, rule_mapping_context, kernel, callables_table, - function_id_to_in_knl_callable_mappers): - super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) - self.kernel = kernel - self.callables_table = callables_table - self.function_id_to_in_knl_callable_mappers = ( - function_id_to_in_knl_callable_mappers) + def __init__(self, rule_mapping_context, known_callables): + super(CallableResolver, self).__init__(rule_mapping_context) + self.resolved_functions = {} + self.known_callables = known_callables def map_call(self, expr, expn_state): from loopy.symbolic import parse_tagged_name @@ -117,31 +115,27 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): def map_call_with_kwargs(self, expr, expn_state): if not isinstance(expr.function, ResolvedFunction): - - # search the kernel for the function. - in_knl_callable = find_in_knl_callable_from_identifier( - self.function_id_to_in_knl_callable_mappers, - self.kernel.target, - expr.function.name) + # FIXME: Do we need to care about ReductionOpFunctions over here? + in_knl_callable = self.known_callables.get(expr.function.name) if in_knl_callable: - # associate the newly created ResolvedFunction with the - # resolved in-kernel callable - - self.callables_table, new_func_id = ( - self.callables_table.with_added_callable( - expr.function, in_knl_callable)) + if expr.function.name in self.resolved_functions: + assert self.resolved_functions[expr.function.name] == ( + in_knl_callable) + self.resolved_functions[expr.function.name] = in_knl_callable return type(expr)( - ResolvedFunction(new_func_id), + ResolvedFunction(expr.function.name), tuple(self.rec(child, expn_state) for child in expr.parameters), dict( (key, self.rec(val, expn_state)) for key, val in six.iteritems(expr.kw_parameters)) ) + else: + # FIXME: Once function mangler is completely deprecated raise here. 
+ pass - # this is an unknown function as of yet, do not modify it - return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, + return super(CallableResolver, self).map_call_with_kwargs(expr, expn_state) @@ -157,53 +151,22 @@ def _default_func_id_to_kernel_callable_mappers(target): ))) -def initialize_callables_table_from_kernel(kernel): - """ - Returns an instance of :class:`loopy.CallablesTable`, by resolving - the functions based on :mod:`loopy`'s default function resolvers. - """ - # collect the default function resolvers - func_id_to_kernel_callable_mappers = ( - _default_func_id_to_kernel_callable_mappers(kernel.target)) - callables_table = CallablesTable({}) - - from loopy.symbolic import SubstitutionRuleMappingContext - rule_mapping_context = SubstitutionRuleMappingContext( - kernel.substitutions, kernel.get_var_name_generator()) - - resolved_function_marker = ResolvedFunctionMarker( - rule_mapping_context, kernel, callables_table, - func_id_to_kernel_callable_mappers) - - # mark the functions as "Resolved" in the expression nodes. - kernel_with_functions_resolved = rule_mapping_context.finish_kernel( - resolved_function_marker.map_kernel(kernel)) - # collect the update callables_table - callables_table = resolved_function_marker.callables_table - - callable_kernel = CallableKernel(kernel_with_functions_resolved) - - # add the callable kernel to the callables_table - callables_table, _ = callables_table.with_added_callable( - Variable(kernel.name), callable_kernel) - - return callables_table - - # {{{ program class Program(ImmutableRecord): """ Records the information about all the callables in a :mod:`loopy` program. - .. attribute:: name + .. attribute:: entrypoints - An instance of :class:`str`, also the name of the top-most level - :class:`loopy.LoopKernel`. + A :class:`frozenset` of the names of the kernels which + could be called from the host. .. attribute:: callables_table - An instance of :class:`loopy.program.CallablesTable`. + An instance of :class:`dict` mapping the function identifiers in a + kernel to their associated instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. .. attribute:: target @@ -211,9 +174,9 @@ class Program(ImmutableRecord): .. attribute:: func_id_to_in_knl_callables_mappers - A list of functions of the signature ``(target: TargetBase, - function_indentifier: str)`` that would return an instance of - :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + A :class:`frozenset` of functions of the signature ``(target: + TargetBase, function_indentifier: str)`` that would return an instance + of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. .. note:: @@ -229,16 +192,19 @@ class Program(ImmutableRecord): Look up the resolved callable with identifier *name*. 
""" def __init__(self, - name, - callables_table, - target, - func_id_to_in_knl_callable_mappers): + entrypoints=None, + callables_table={}, + target=None, + func_id_to_in_knl_callable_mappers=[]): + + # {{{ sanity checks + assert isinstance(callables_table, CallablesTable) - assert name in callables_table + # }}} super(Program, self).__init__( - name=name, + entrypoints=entrypoints, callables_table=callables_table, target=target, func_id_to_in_knl_callable_mappers=( @@ -247,7 +213,7 @@ class Program(ImmutableRecord): self._program_executor_cache = {} hash_fields = ( - "name", + "entrypoints", "callables_table", "target",) @@ -255,26 +221,28 @@ class Program(ImmutableRecord): def copy(self, **kwargs): if 'target' in kwargs: - # target attribute of all the callable kernels should be updated. - target = kwargs['target'] - new_self = super(Program, self).copy(**kwargs) - new_resolved_functions = {} - for func_id, in_knl_callable in ( - new_self.callables_table.items()): - if isinstance(in_knl_callable, CallableKernel): - subkernel = in_knl_callable.subkernel - new_resolved_functions[func_id] = in_knl_callable.copy( - subkernel=subkernel.copy(target=target)) - else: - new_resolved_functions[func_id] = in_knl_callable + from loopy.kernel import KernelState + if max(callable_knl.subkernel.state for callable_knl in + six.itervalues(self.callables_table) if + isinstance(callable_knl, CallableKernel)) > ( + KernelState.INITIAL): + raise LoopyError("One of the kenels in the program has been " + "preprocessed, cannot modify target now.") - callables_table = new_self.callables_table.copy( - resolved_functions=new_resolved_functions) + return super(Program, self).copy(**kwargs) - return super(Program, new_self).copy( - callables_table=callables_table) - else: - return super(Program, self).copy(**kwargs) + def with_entrypoints(self, entrypoints): + """ + :param entrypoints: Either a comma-separated :class:`str` or + :class:`frozenset`. + """ + if isinstance(entrypoints, str): + entrypoints = frozenset([e.strip() for e in + entrypoints.split(',')]) + + assert isinstance(entrypoints, frozenset) + + return self.copy(entrypoints=entrypoints) def get_grid_size_upper_bounds(self, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that @@ -282,6 +250,9 @@ class Program(ImmutableRecord): *global_size* and *local_size* are :class:`islpy.PwAff` objects. """ + # This should take in an input of an entrypoint. + raise NotImplementedError() + return self.root_kernel.get_grid_size_upper_bounds( self.callables_table, ignore_auto=ignore_auto) @@ -292,66 +263,19 @@ class Program(ImmutableRecord): *global_size* and *local_size* are :mod:`pymbolic` expressions """ + # This should take in an input of an entrypoint. 
+ raise NotImplementedError() + return self.root_kernel.get_grid_size_upper_bounds_as_exprs( self.callables_table, ignore_auto=ignore_auto) - # {{{ implementation arguments - - @property - @memoize_method - def impl_arg_to_arg(self): - from loopy.kernel.array import ArrayBase - - result = {} - - for arg in self.args: - if not isinstance(arg, ArrayBase): - result[arg.name] = arg - continue - - if arg.shape is None or arg.dim_tags is None: - result[arg.name] = arg - continue - - subscripts_and_names = arg.subscripts_and_names() - if subscripts_and_names is None: - result[arg.name] = arg - continue - - for index, sub_arg_name in subscripts_and_names: - result[sub_arg_name] = arg - - return result - - # }}} - - @property - def root_kernel(self): - """ - Returns an instance of :class:`loopy.LoopKernel` denoting the topmost - level kernel. - """ - return self.callables_table[self.name].subkernel - - @property - def arg_dict(self): - """ - Returns ``arg_dict`` of the ``root_kernel``. - """ - return self.root_kernel.arg_dict - @property - def args(self): - """Returns ``args`` of the ``root_kernel``.""" - return self.root_kernel.args[:] - - def with_root_kernel(self, root_kernel): - """:returns: a copy of *self* with the topmost level kernel as - *root_kernel*. - """ - assert self.name == root_kernel.name - return self.with_kernel(root_kernel) + def state(self): + """ Returns an instance of :class:`loopy.kernel.KernelState`. """ + return min(callable_knl.subkernel.state for callable_knl in + six.itervalues(self.callables_table) if + isinstance(callable_knl, CallableKernel)) def with_kernel(self, kernel): # FIXME: Currently only replaces kernel. Should also work for adding. @@ -364,7 +288,48 @@ class Program(ImmutableRecord): callables_table=self.callables_table.copy( resolved_functions=new_resolved_functions)) + def with_resolved_callables(self): + + from loopy.library.function import get_loopy_callables + known_callables = self.target.get_device_ast_builder().known_callables + known_callables.update(get_loopy_callables()) + known_callables.update(self.callables_table.resolved_functions) + # update the known callables from the target. 
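        # Rough shape of ``known_callables`` at this point (the names below
        # are only illustrative):
        #
        #     {"sin": ScalarCallable("sin"),          # from the target
        #      "make_tuple": ScalarCallable(...),     # from loopy's library
        #      "other_knl": CallableKernel(...)}      # already in this program
        #
        # i.e. a plain name -> InKernelCallable mapping that the traversal
        # below consults while walking the entrypoints.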
+ resolved_functions = dict((e, self.callables_table[e]) for e in + self.entrypoints) + + # start a traversal to collect all the callables + queue = list(self.entrypoints) + + while queue: + top = queue[0] + assert top in resolved_functions + queue = queue[1:] + + knl = resolved_functions[top].subkernel + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator()) + callables_collector = CallableResolver( + rule_mapping_context, + known_callables) + knl = rule_mapping_context.finish_kernel( + callables_collector.map_kernel(knl)) + resolved_functions[top] = resolved_functions[top].copy(subkernel=knl) + + for func, clbl in six.iteritems(callables_collector.resolved_functions): + if func not in resolved_functions: + if isinstance(clbl, CallableKernel): + queue.append(func) + resolved_functions[func] = clbl + else: + assert resolved_functions[func] == clbl + + new_callables_table = CallablesTable(resolved_functions=resolved_functions) + + return self.copy(callables_table=new_callables_table) + def __iter__(self): + #FIXME: Document return six.iterkeys(self.callables_table.resolved_functions) def __getitem__(self, name): @@ -375,6 +340,33 @@ class Program(ImmutableRecord): return result def __call__(self, *args, **kwargs): + entrypoint = kwargs.get('entrypoint', None) + + if self.entrypoints is None: + if len([clbl for clbl in self.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + self.entrypoints = frozenset([clbl.subkernel.name for + clbl in self.callables_table.values() if isinstance(clbl, + CallableKernel)]) + else: + raise LoopyError("entrypoint attribute unset. Use" + " 'with_entrypoints' before calling.") + + if entrypoint is None: + # did not receive an entrypoint for the program to execute + if len(self.entrypoints) == 1: + entrypoint, = list(self.entrypoints) + else: + raise TypeError("Program.__call__() missing 1 required" + " keyword argument: 'entrypoint'") + + if entrypoint not in self.entrypoints: + raise LoopyError("'{}' not in list possible entrypoints supplied to" + " the program. Maybe you want to invoke 'with_entrypoints'" + " before calling the program.".format(entrypoint)) + + kwargs['entrypoint'] = entrypoint + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) try: pex = self._program_executor_cache[key] @@ -464,65 +456,40 @@ def rename_resolved_functions_in_a_single_kernel(kernel, resolved_function_renamer.map_kernel(kernel))) -# {{{ counting helpers - -class CallablesCountingMapper(CombineMapper): +class CallablesIDCollector(CombineMapper): """ - Returns an instance of :class:`collections.Counter` with the count of - callables registered in *callables_table*. - - .. attribute:: callables_table - - An instance of :class:`loopy.program.CallablesTable`. + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. 
""" - def __init__(self, callables_table): - self.callables_table = callables_table - def combine(self, values): - return sum(values, Counter()) + import operator + return reduce(operator.or_, values, frozenset()) - def map_call(self, expr): + def map_resolved_function(self, expr): + return frozenset([self.kernel.scoped_functions[ + expr.name]]) - if isinstance(expr, CallWithKwargs): - kw_parameters = expr.kw_parameters - else: - assert isinstance(expr, Call) - kw_parameters = {} - - if isinstance(expr.function, (ResolvedFunction)): - in_knl_callable = self.callables_table[expr.function.name] - if isinstance(in_knl_callable, ScalarCallable): - return (Counter([expr.function.name]) + - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) - - elif isinstance(in_knl_callable, CallableKernel): - - # callable kernels have more callables in them. - callables_count_in_subkernel = ( - count_callables_in_kernel( - in_knl_callable.subkernel, - self.callables_table)) - - return (Counter([expr.function.name]) + - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + ( - callables_count_in_subkernel) - else: - raise NotImplementedError("Unknown callable type %s." % ( - type)) - else: - return ( - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + def map_constant(self, expr): + return frozenset() + + def map_kernel(self, kernel): + callables_in_insn = frozenset() - map_call_with_kwargs = map_call + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_in_insn = callables_in_insn | ( + self(insn.expression)) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError(type(insn).__name__) - def map_reduction(self, expr): - return super(CallablesCountingMapper, self).map_reduction(expr) + for rule in six.itervalues(kernel.substitutions): + callables_in_insn = callables_in_insn | ( + self(rule.expression)) - def map_constant(self, expr): - return Counter() + return callables_in_insn map_variable = map_constant map_function_symbol = map_constant @@ -530,40 +497,9 @@ class CallablesCountingMapper(CombineMapper): map_type_cast = map_constant -@memoize_method -def count_callables_in_kernel(kernel, callables_table): - """ - Returns an instance of :class:`collections.Counter` representing the number - of callables in the *kernel* that are registered in - *callables_table*. - """ - assert isinstance(kernel, LoopKernel) - callables_count = Counter() - callables_counting_mapper = CallablesCountingMapper( - callables_table) - subst_expander = SubstitutionRuleExpander(kernel.substitutions) - - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - callables_count += ( - callables_counting_mapper(subst_expander( - insn.expression))) - elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): - pass - else: - raise NotImplementedError("Unknown instruction type %s." % ( - type(insn))) - - return callables_count - -# }}} - - -# {{{ program callables info +# {{{ callables table class CallablesTable(ImmutableRecord): - # FIXME: is CallablesTable a better name?(similar to symbol table in - # compilers.) """ Records the information of all the callables called in a :class:`loopy.Program`. @@ -573,19 +509,21 @@ class CallablesTable(ImmutableRecord): identifier to instances of :class:`loopy.kernel.function_interface.InKernelCallable` - .. 
attribute:: history - - An instance of :class:`dict` that contains a mapping from function - identifier to and instance of :class:`list`that would contain all the - names taken by a function before the current name.(For example: one - possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``) - .. attribute:: is_being_edited An instance of :class:`bool` which is intended to aid the working of :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and :meth:`with_exit_edit_callables_mode`. + .. attribute:: history + + An instance of :class:`dict` that contains a mapping from function + identifier to and instance of :class:`list`that would contain all the + names taken by a function before the current name.(For example: one + possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``). This + attribute is ephemeral i.e. should be only active when + *is_being_edited*=True. + .. automethod:: __init__ .. automethod:: callables_count .. automethod:: with_added_callable @@ -594,11 +532,14 @@ class CallablesTable(ImmutableRecord): .. automethod:: with_exit_edit_callables_mode """ def __init__(self, resolved_functions, - history=None, is_being_edited=False): + is_being_edited=False, + history=None): + + # FIXME: Maybe resolved_functions is an unnecessary name, how about + # just callables? - if history is None: - history = dict((func_id, frozenset([func_id])) for func_id in - resolved_functions) + if history is not None: + assert is_being_edited super(CallablesTable, self).__init__( resolved_functions=resolved_functions, @@ -621,23 +562,14 @@ class CallablesTable(ImmutableRecord): @property @memoize_method - def callables_count(self): + def get_callable_ids(self): """ - Returns an instance of :class:`collection.Counter` representing the number - of times the callables is called in callables_table. + Returns a :class:`frozenset` of the callable identfiers throughout all + the kernels in *self*. """ - root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable - in self.values() if - isinstance(in_knl_callable, CallableKernel) and - in_knl_callable.subkernel.is_called_from_host] - - from collections import Counter - callables_count = Counter([root_kernel_name]) - callables_count += ( - count_callables_in_kernel(self[ - root_kernel_name].subkernel, self)) - - return callables_count + clbl_id_collector = CallablesIDCollector() + return frozenset().union(*(clbl_id_collector.map_kernel(clbl.subkernel) + for clbl in self.values() if isinstance(clbl, CallableKernel))) # {{{ interface to perform edits on callables @@ -915,7 +847,7 @@ class CallablesTable(ImmutableRecord): # }}} - # {{{ behave like a dict(syntactic sugar) + # {{{ behave like a dict def __getitem__(self, item): return self.resolved_functions[item] @@ -941,19 +873,18 @@ class CallablesTable(ImmutableRecord): def make_program(kernel): """ - Returns an instance of :class:`loopy.Program` with the *kernel* as the root - kernel. + Returns an instance of :class:`loopy.Program` with *kernel* as the only + callable kernel. """ - # get the program callables info - callables_table = initialize_callables_table_from_kernel(kernel) - # get the program from program callables info + #FIXME:(For KK): do we need to register the current kernel in + # func_id_to_in_knl_callable_mappers + #FIXME(For inducer): Deriving the target of this program from the kernel's + # target. 
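    # Hypothetical usage sketch (assuming the entrypoint interface above): a
    # freshly made program has no entrypoints yet, so a caller would
    # typically write
    #
    #     prg = make_program(knl).with_entrypoints(knl.name)
    #     evt, (out,) = prg(queue, a=a_dev, entrypoint=knl.name)
    #
    # with a single callable kernel, __call__ can also infer the entrypoint
    # on its own, as implemented above.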
program = Program( - name=kernel.name, - callables_table=callables_table, - func_id_to_in_knl_callable_mappers=( - _default_func_id_to_kernel_callable_mappers(kernel.target)), + callables_table=CallablesTable({kernel.name: + CallableKernel(kernel)}), target=kernel.target) return program @@ -976,7 +907,6 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): in_knl_callable.subkernel, *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) - elif isinstance(in_knl_callable, ScalarCallable): pass else: diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 96f6e065c..02a5baabf 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -61,12 +61,13 @@ class SeparateArrayPackingController(object): It also repacks outgoing arrays of this type back into an object array. """ - def __init__(self, program): + def __init__(self, program, entrypoint): + # map from arg name self.packing_info = {} from loopy.kernel.array import ArrayBase - for arg in program.args: + for arg in program[entrypoint].args: if not isinstance(arg, ArrayBase): continue @@ -715,26 +716,31 @@ class KernelExecutorBase(object): .. automethod:: __call__ """ - def __init__(self, program): + def __init__(self, program, entrypoint): """ :arg kernel: a loopy.LoopKernel """ self.program = program + self.entrypoint = entrypoint - self.packing_controller = SeparateArrayPackingController(program) + self.packing_controller = SeparateArrayPackingController(program, + entrypoint) - self.output_names = tuple(arg.name for arg in self.program.args + self.output_names = tuple(arg.name for arg in self.program[entrypoint].args if arg.is_output) self.has_runtime_typed_args = any( arg.dtype is None - for arg in program.args) + for arg in program[entrypoint].args) def get_typed_and_scheduled_program_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes program = self.program + program = program.with_resolved_callables() + print(program) + 1/0 if arg_to_dtype_set: var_to_dtype = {} @@ -782,7 +788,8 @@ class KernelExecutorBase(object): except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.program.name) + logger.debug("%s: typed-and-scheduled cache miss" % + self.program.entrypoints) kernel = self.get_typed_and_scheduled_program_uncached(arg_to_dtype_set) @@ -792,10 +799,13 @@ class KernelExecutorBase(object): return kernel def arg_to_dtype_set(self, kwargs): + kwargs = kwargs.copy() if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.program.impl_arg_to_arg + entrypoint = kwargs.pop('entrypoint') + + impl_arg_to_arg = self.program[entrypoint].impl_arg_to_arg arg_to_dtype = {} for arg_name, val in six.iteritems(kwargs): arg = impl_arg_to_arg.get(arg_name, None) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index c042812e7..2919cb8ee 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -458,9 +458,10 @@ class PyOpenCLTarget(OpenCLTarget): def get_kernel_executor_cache_key(self, queue, **kwargs): return queue.context - def get_kernel_executor(self, kernel, queue, **kwargs): + def get_kernel_executor(self, program, queue, **kwargs): from loopy.target.pyopencl_execution import PyOpenCLKernelExecutor - return PyOpenCLKernelExecutor(queue.context, kernel) + return PyOpenCLKernelExecutor(queue.context, program, + entrypoint=kwargs.pop('entrypoint')) def with_device(self, device): return type(self)(device) @@ -797,11 +798,10 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): 
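    # The property below merges three name -> callable dicts: the
    # superclass's, the pyopencl-specific ones and the random123 ones.  Note
    # that dict.update() mutates in place and returns None, which is why the
    # merge needs an explicit intermediate dict rather than a chained call.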
@property def known_callables(self): from loopy.library.random123 import get_random123_callables - return ( - super( - PyOpenCLCASTBuilder, self).known_callables).update( - get_pyopencl_callables()).update( - get_random123_callables()) + callables = super(PyOpenCLCASTBuilder, self).known_callables + callables.update(get_pyopencl_callables()) + callables.update(get_random123_callables()) + return callables def preamble_generators(self): return ([ diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index b7006575b..1b40e3f2a 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -255,7 +255,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, context, program): + def __init__(self, context, program, entrypoint): """ :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels @@ -264,7 +264,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. """ - super(PyOpenCLKernelExecutor, self).__init__(program) + super(PyOpenCLKernelExecutor, self).__init__(program, entrypoint) self.context = context diff --git a/loopy/target/python.py b/loopy/target/python.py index b88830ab0..d174504fa 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -183,10 +183,9 @@ class PythonASTBuilderBase(ASTBuilderBase): @property def known_callables(self): from loopy.target.c import get_c_callables - return ( - super(PythonASTBuilderBase, - self).known_callables.update( - get_c_callables())) + callables = super(PythonASTBuilderBase, self).known_callables + callables.update(get_c_callables()) + return callables def preamble_generators(self): return ( diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index c9baa741f..2a1dd1115 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -36,7 +36,7 @@ from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) -from loopy.program import Program, ResolvedFunctionMarker +from loopy.program import Program from loopy.symbolic import SubArrayRef __doc__ = """ diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 0f280f6d9..8a0bf9e24 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -35,7 +35,6 @@ from loopy.diagnostic import ( TypeInferenceFailure, DependencyTypeInferenceFailure) from loopy.kernel.instruction import _DataObliviousInstruction -from loopy.program import CallablesTable from loopy.symbolic import ( LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper, SubstitutionRuleExpander, ResolvedFunction, @@ -1036,9 +1035,26 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" + 1/0 + + from loopy.kernel.data import auto callables_table = program.callables_table + history_of_callable_ids = initialize_history(callables_table) + + for e in program.entrypoints: + arg_id_to_dtype = dict((arg.name, arg.dtype) for arg in + callables_table[e].args if arg.dtype not in (None, auto)) + new_callable, callables_table = callables_table[e].with_types( + arg_id_to_dtype, None, callables_table) + callables_table, _ = add_to_callables(e, callables_table, + history_of_callable_ids, + is_entrypoint=True) + + # FIXME: Just a temporary_check... 
Remove before MR. + assert callables_table[e] == new_callable + type_uninferred_knl_callable = ( callables_table[program.name]) type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel -- GitLab From 8ef066140cf23c1cf823edf62c26a1a72638ede8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Sep 2019 23:59:05 -0500 Subject: [PATCH 599/916] changes the interface of add_dtypes to bring back adding dtypes to kernels instead of program --- loopy/kernel/tools.py | 11 +++-------- loopy/target/execution.py | 3 ++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index d0e4ef084..c468a2201 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -48,25 +48,20 @@ logger = logging.getLogger(__name__) # {{{ add and infer argument dtypes -def add_dtypes(program, dtype_dict): +def add_dtypes(kernel, dtype_dict): """Specify remaining unspecified argument/temporary variable types. :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ - root_kernel = program.root_kernel dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes( - root_kernel, dtype_dict) + kernel, dtype_dict) if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) - root_kernel - root_kernel_with_added_dtypes = ( - root_kernel.copy(args=new_args, temporary_variables=new_temp_vars)) - - return program.with_root_kernel(root_kernel_with_added_dtypes) + return kernel.copy(args=new_args, temporary_variables=new_temp_vars) def _add_dtypes_overdetermined(knl, dtype_dict): diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 02a5baabf..3f2b02f3f 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -757,7 +757,8 @@ class KernelExecutorBase(object): "no known variable/argument with that name" % var) - program = add_dtypes(program, var_to_dtype) + program = program.with_kernel(add_dtypes(program[entrypoint], + var_to_dtype)) from loopy.type_inference import infer_unknown_types program = infer_unknown_types(program, expect_completion=True) -- GitLab From fff5fd2f76b94aef426febb9b38bff7a00051528 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Oct 2019 00:01:36 -0500 Subject: [PATCH 600/916] adds an entrypoint info. 
to the process of generating code --- loopy/target/execution.py | 19 +++++++++---------- loopy/target/pyopencl_execution.py | 8 +++++--- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3f2b02f3f..da5f32546 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -734,19 +734,17 @@ class KernelExecutorBase(object): arg.dtype is None for arg in program[entrypoint].args) - def get_typed_and_scheduled_program_uncached(self, arg_to_dtype_set): + def get_typed_and_scheduled_program_uncached(self, entrypoint, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes program = self.program program = program.with_resolved_callables() - print(program) - 1/0 if arg_to_dtype_set: var_to_dtype = {} for var, dtype in arg_to_dtype_set: try: - dest_name = program.impl_arg_to_arg[var].name + dest_name = program[entrypoint].impl_arg_to_arg[var].name except KeyError: dest_name = var @@ -774,7 +772,7 @@ class KernelExecutorBase(object): return program - def get_typed_and_scheduled_program(self, arg_to_dtype_set): + def get_typed_and_scheduled_program(self, entrypoint, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching @@ -792,7 +790,8 @@ class KernelExecutorBase(object): logger.debug("%s: typed-and-scheduled cache miss" % self.program.entrypoints) - kernel = self.get_typed_and_scheduled_program_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_scheduled_program_uncached(entrypoint, + arg_to_dtype_set) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -827,12 +826,12 @@ class KernelExecutorBase(object): # {{{ debugging aids - def get_highlighted_code(self, arg_to_dtype=None, code=None): + def get_highlighted_code(self, entrypoint, arg_to_dtype=None, code=None): if code is None: - code = self.get_code(arg_to_dtype) + code = self.get_code(entrypoint, arg_to_dtype) return get_highlighted_code(code) - def get_code(self, arg_to_dtype=None): + def get_code(self, entrypoint, arg_to_dtype=None): def process_dtype(dtype): if isinstance(dtype, type) and issubclass(dtype, np.generic): dtype = np.dtype(dtype) @@ -846,7 +845,7 @@ class KernelExecutorBase(object): arg_to_dtype = frozenset( (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype)) - kernel = self.get_typed_and_scheduled_program(arg_to_dtype) + kernel = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 1b40e3f2a..65e0f4bca 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -278,8 +278,9 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): return generator(kernel, codegen_result) @memoize_method - def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - program = self.get_typed_and_scheduled_program(arg_to_dtype_set) + def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), + all_kwargs=None): + program = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype_set) from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code @@ -351,7 +352,8 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - program_info = self.program_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(kwargs['entrypoint'], + 
self.arg_to_dtype_set(kwargs)) return program_info.invoker( program_info.cl_kernels, queue, allocator, wait_for, -- GitLab From 4c97e2c0a22dfcfddba3c7e5a1ae6370b64e1b9a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Oct 2019 11:35:04 -0500 Subject: [PATCH 601/916] adds outline to make callables table a dict The changes haven't been propagated completely yet --- loopy/program.py | 219 +++-------------------------------------------- 1 file changed, 13 insertions(+), 206 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 13d2ff9fd..8c475b67a 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -48,7 +48,6 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: Program -.. autoclass:: CallablesTable .. autofunction:: make_program .. autofunction:: iterate_over_kernels_if_given_program @@ -199,7 +198,7 @@ class Program(ImmutableRecord): # {{{ sanity checks - assert isinstance(callables_table, CallablesTable) + assert isinstance(callables_table, dict) # }}} @@ -497,225 +496,33 @@ class CallablesIDCollector(CombineMapper): map_type_cast = map_constant -# {{{ callables table +class CallablesInferenceContext(ImmutableRecord): + def __init__(self, callables, history=None): + assert isinstance(callables, dict) + if history is None: + history = dict((func_id, frozenset([func_id])) for func_id in + callables) -class CallablesTable(ImmutableRecord): - """ - Records the information of all the callables called in a :class:`loopy.Program`. - - .. attribute:: resolved_functions - - An instance of :class:`dict` that contains a mapping from function - identifier to instances of - :class:`loopy.kernel.function_interface.InKernelCallable` - - .. attribute:: is_being_edited - - An instance of :class:`bool` which is intended to aid the working of - :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and - :meth:`with_exit_edit_callables_mode`. - - .. attribute:: history - - An instance of :class:`dict` that contains a mapping from function - identifier to and instance of :class:`list`that would contain all the - names taken by a function before the current name.(For example: one - possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``). This - attribute is ephemeral i.e. should be only active when - *is_being_edited*=True. - - .. automethod:: __init__ - .. automethod:: callables_count - .. automethod:: with_added_callable - .. automethod:: with_edit_callables_mode - .. automethod:: with_callable - .. automethod:: with_exit_edit_callables_mode - """ - def __init__(self, resolved_functions, - is_being_edited=False, - history=None): - - # FIXME: Maybe resolved_functions is an unnecessary name, how about - # just callables? - - if history is not None: - assert is_being_edited - - super(CallablesTable, self).__init__( - resolved_functions=resolved_functions, - history=history, - is_being_edited=is_being_edited) + super(CallablesTable, self).__init__(callables, history) - hash_fields = ( - "resolved_functions", - "is_being_edited", - "history") - - def __hash__(self): - return hash(( - frozenset(six.iteritems(self.resolved_functions)), - frozenset(six.iteritems(self.history)), - self.is_being_edited - )) - - update_persistent_hash = update_persistent_hash - - @property - @memoize_method - def get_callable_ids(self): - """ - Returns a :class:`frozenset` of the callable identfiers throughout all - the kernels in *self*. 
- """ clbl_id_collector = CallablesIDCollector() - return frozenset().union(*(clbl_id_collector.map_kernel(clbl.subkernel) - for clbl in self.values() if isinstance(clbl, CallableKernel))) + self.old_callables_ids = frozenset().union(*( + clbl_id_collector.map_kernel(clbl.subkernel) for clbl in + self.values() if isinstance(clbl, CallableKernel))) # {{{ interface to perform edits on callables - def with_added_callable(self, function, in_kernel_callable): - """ - Returns an instance of :class:`tuple` of ``(new_self, new_function)``. - ``new_self`` is a copy of *self* with the *function* associated with the - *in_kernel_callable*. ``new_function`` is the function identifier that - should be noted in the expression node so that it could be associated - with an instance of :class:`InKernelCallable`. - - .. note:: - - - Always checks whether the - :attr:``loopy.CallablesTable.resolved_functions` has - *in_kernel_callable*, does not introduce copies. - - - The difference between - :meth:`loopy.CallablesTable.with_added_callable` - and :meth:`CallablesTable.with_callable` being that - the former has no support for renaming the callable back i.e. - ``with_callable`` supports renaming from ``sin_0`` to ``sin``, - if possible, through the member method - ``loopy.CallablesTable.with_exit_edit_callables_mode`` - - This subtle difference makes -- - - - :meth:`loopy.CallablesTable.with_added_callable` suitable - for usage while resolving the functions first time, where no - renaming is needed. - - - :meth:`loopy.CallablesTable.with_callable` suitable for - implementing edits in callables during inference-walks. - """ - - # {{{ sanity checks - - if isinstance(function, str): - function = Variable(function) - - assert isinstance(function, (Variable, ReductionOpFunction)) - - # }}} - - history = self.history.copy() - - if in_kernel_callable in self.resolved_functions.values(): - # the callable already exists, implies return the function - # identifier corresponding to that callable. 
- for func_id, in_knl_callable in self.resolved_functions.items(): - if in_knl_callable == in_kernel_callable: - history[func_id] = history[func_id] | frozenset([function.name]) - return ( - self.copy( - history=history), - func_id) - else: - - # {{{ handle ReductionOpFunction - - if isinstance(function, ReductionOpFunction): - unique_function_identifier = function.copy() - updated_resolved_functions = self.resolved_functions.copy() - updated_resolved_functions[unique_function_identifier] = ( - in_kernel_callable) - history[unique_function_identifier] = frozenset( - [unique_function_identifier]) - - return ( - self.copy( - history=history, - resolved_functions=updated_resolved_functions), - unique_function_identifier) - - # }}} - - unique_function_identifier = function.name - - if isinstance(in_kernel_callable, CallableKernel) and ( - in_kernel_callable.subkernel.is_called_from_host): - # do not rename root kernel - pass - else: - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) - - updated_resolved_functions = self.resolved_functions.copy() - updated_resolved_functions[unique_function_identifier] = ( - in_kernel_callable) - - history[unique_function_identifier] = frozenset( - [unique_function_identifier]) - - return ( - self.copy( - history=history, - resolved_functions=updated_resolved_functions), - Variable(unique_function_identifier)) - - def with_edit_callables_mode(self): - """ - Returns a copy of *self* for a walk traversal through all the callables. - """ - return self.copy( - is_being_edited=True) - def with_callable(self, function, in_kernel_callable): """ Returns an instance of :class:`tuple` ``(new_self, new_function)``. - Also refer -- :meth:`loopy.CallablesTable.with_added_callable` - :arg function: An instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.library.reduction.ReductionOpFunction`. :arg in_kernel_callable: An instance of :class:`loopy.InKernelCallable`. - - .. note:: - - - Use :meth:`with_added_callable` if a callable is being resolved for the - first time. """ - # {{{ non-edit mode - - if not self.is_being_edited: - if isinstance(function, ReductionOpFunction): - function_name = function - else: - function_name = function.name - - if function_name in self.resolved_functions and ( - self.resolved_functions[function_name] == in_kernel_callable): - # if not being edited, check that the given function is - # equal to the old version of the callable. - return self, function - else: - print('Old: ', self.resolved_functions[function_name]) - print('New: ', in_kernel_callable) - raise LoopyError("Use 'with_enter_edit_callables_mode' first.") - - # }}} - # {{{ sanity checks if isinstance(function, str): @@ -883,8 +690,8 @@ def make_program(kernel): #FIXME(For inducer): Deriving the target of this program from the kernel's # target. 
program = Program( - callables_table=CallablesTable({kernel.name: - CallableKernel(kernel)}), + callables_table={ + kernel.name: CallableKernel(kernel)}, target=kernel.target) return program -- GitLab From f28e9d4bcb239ef55c4fe1b5770784e01cacaf7e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Oct 2019 11:36:18 -0500 Subject: [PATCH 602/916] corrects the equality check for ReductionOpFunctions --- loopy/library/reduction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 9418ee282..2d27d24ec 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -335,7 +335,8 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return hash(type(self)) def __eq__(self, other): - return type(self) == type(other) + return type(self) == type(other) and (self.inner_reduction == + other.inner_reduction) def __call__(self, dtypes, operand1, operand2, callables_table, target): # getting the callable 'max' from target -- GitLab From d21e61f4abd7c7b6695b6bbb797ffbaba661e440 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Oct 2019 11:38:04 -0500 Subject: [PATCH 603/916] Outlines the design for CallablesInferenceContext - Design inspired from SubstitutionRuleMappingContext --- loopy/program.py | 63 +++++++++++++---------------------------- loopy/type_inference.py | 50 +++++++------------------------- 2 files changed, 29 insertions(+), 84 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 8c475b67a..f4d7003e2 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -534,46 +534,45 @@ class CallablesInferenceContext(ImmutableRecord): history = self.history.copy() - if in_kernel_callable in self.resolved_functions.values(): - + if in_kernel_callable in self.callables.values(): # the callable already exists, hence return the function # identifier corresponding to that callable. - for func_id, in_knl_callable in self.resolved_functions.items(): + for func_id, in_knl_callable in self.callables.items(): if in_knl_callable == in_kernel_callable: history[func_id] = history[func_id] | frozenset([function.name]) return ( self.copy( history=history), func_id) + + assert False else: # {{{ handle ReductionOpFunction if isinstance(function, ReductionOpFunction): + # FIXME: Check what happens if we have 2 same ArgMax functions + # with different types in the same kernel! 
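            # Illustration of the concern (hypothetical): argmax over float32
            # and argmax over float64 would both reach this branch as
            # ReductionOpFunctions wrapping an ArgMax operation, and since
            # the copy below is used as the dict key, their type-specialized
            # callables must not be conflated.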
unique_function_identifier = function.copy() - updated_resolved_functions = self.resolved_functions.copy() - updated_resolved_functions[unique_function_identifier] = ( + updated_callables = self.callables.copy() + updated_callables[unique_function_identifier] = ( in_kernel_callable) return ( self.copy( - resolved_functions=updated_resolved_functions), + callables=updated_callables), unique_function_identifier) # }}} + unique_function_identifier = function.name - if isinstance(in_kernel_callable, CallableKernel) and ( - in_kernel_callable.subkernel.is_called_from_host): - # do not rename root kernel - pass - else: - while unique_function_identifier in self.resolved_functions: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + while unique_function_identifier in self.resolved_functions: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) - updated_resolved_functions = self.resolved_functions.copy() - updated_resolved_functions[unique_function_identifier] = ( + updated_callables = self.callables.copy() + updated_callables[unique_function_identifier] = ( in_kernel_callable) history[unique_function_identifier] = ( @@ -582,10 +581,10 @@ class CallablesInferenceContext(ImmutableRecord): return ( self.copy( history=history, - resolved_functions=updated_resolved_functions), + callables=updated_callables), Variable(unique_function_identifier)) - def with_exit_edit_callables_mode(self, old_callables_count): + def finish_program(self, program): """ Returns a copy of *self* with renaming of the callables done whenever possible. @@ -647,34 +646,10 @@ class CallablesInferenceContext(ImmutableRecord): new_resolved_functions[func_id] = in_knl_callable new_history[func_id] = self.history[func_id] - return self.copy( - is_being_edited=False, - resolved_functions=new_resolved_functions, - history=new_history) - - # }}} - - # {{{ behave like a dict - - def __getitem__(self, item): - return self.resolved_functions[item] - - def __contains__(self, item): - return item in self.resolved_functions - - def items(self): - return six.iteritems(self.resolved_functions) - - def values(self): - return six.itervalues(self.resolved_functions) - - def keys(self): - return six.iterkeys(self.resolved_functions) + return program.copy(callables_table=new_callables_table) # }}} -# }}} - # {{{ helper functions diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 8a0bf9e24..ccf614843 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1035,51 +1035,21 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" - 1/0 - from loopy.kernel.data import auto + from loopy.program import CallablesInferenceContext - callables_table = program.callables_table - - history_of_callable_ids = initialize_history(callables_table) + clbl_inf_ctx = CallablesInferenceContext(program.callables_table) for e in program.entrypoints: + # FIXME: Need to add docs which say that we need not add the current + # callable to the clbl_inf_ctx while writing the "with_types" arg_id_to_dtype = dict((arg.name, arg.dtype) for arg in - callables_table[e].args if arg.dtype not in (None, auto)) - new_callable, callables_table = callables_table[e].with_types( - arg_id_to_dtype, None, callables_table) - callables_table, _ = add_to_callables(e, callables_table, - history_of_callable_ids, - is_entrypoint=True) - - # FIXME: 
Just a temporary_check... Remove before MR. - assert callables_table[e] == new_callable - - type_uninferred_knl_callable = ( - callables_table[program.name]) - type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel - - old_callables_count = callables_table.callables_count - callables_table = ( - program.callables_table.with_edit_callables_mode()) - root_kernel, callables_table = ( - infer_unknown_types_for_a_single_kernel( - type_uninferred_root_kernel, - callables_table, expect_completion)) - - type_inferred_knl_callable = type_uninferred_knl_callable.copy( - subkernel=root_kernel) - - callables_table, _ = ( - callables_table.with_callable( - program.name, - type_inferred_knl_callable)) - - callables_table = ( - callables_table.with_exit_edit_callables_mode( - old_callables_count)) - - return program.copy(callables_table=callables_table) + program[e].args if arg.dtype not in (None, auto)) + new_callable, clbl_inf_ctx = program[e].with_types( + arg_id_to_dtype, None, clbl_inf_ctx) + clbl_inf_ctx, _ = clbl_inf_ctx.with_callable(e, new_callable) + + return clbl_inf_ctx.finish_program(program) # }}} -- GitLab From 8cb584132d260d20df774df69cd2a18d025aeb24 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Oct 2019 11:58:31 -0500 Subject: [PATCH 604/916] Minor changes to go with the changes in CallablesTable - Functions are getting resolved - dtypes not yet inferred --- loopy/program.py | 46 ++++++++++++++++++----------------------- loopy/type_inference.py | 2 +- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index f4d7003e2..e0c2b5032 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -281,20 +281,18 @@ class Program(ImmutableRecord): # FIXME: Document new_in_knl_callable = self.callables_table[kernel.name].copy( subkernel=kernel) - new_resolved_functions = self.callables_table.resolved_functions.copy() - new_resolved_functions[kernel.name] = new_in_knl_callable - return self.copy( - callables_table=self.callables_table.copy( - resolved_functions=new_resolved_functions)) + new_callables = self.callables_table.copy() + new_callables[kernel.name] = new_in_knl_callable + return self.copy(callables_table=new_callables) def with_resolved_callables(self): from loopy.library.function import get_loopy_callables known_callables = self.target.get_device_ast_builder().known_callables known_callables.update(get_loopy_callables()) - known_callables.update(self.callables_table.resolved_functions) + known_callables.update(self.callables_table) # update the known callables from the target. 
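        # The traversal below is effectively a breadth-first walk seeded with
        # the entrypoints -- a standalone sketch of the idea (the helper name
        # ``callees_of`` is assumed):
        #
        #     queue = list(self.entrypoints)
        #     while queue:
        #         top, queue = queue[0], queue[1:]
        #         for callee in callees_of(callables_table[top]):
        #             if callee not in callables_table:
        #                 callables_table[callee] = known_callables[callee]
        #                 queue.append(callee)
        #
        # so only callables reachable from an entrypoint end up resolved.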
- resolved_functions = dict((e, self.callables_table[e]) for e in + callables_table = dict((e, self.callables_table[e]) for e in self.entrypoints) # start a traversal to collect all the callables @@ -302,10 +300,10 @@ class Program(ImmutableRecord): while queue: top = queue[0] - assert top in resolved_functions + assert top in callables_table queue = queue[1:] - knl = resolved_functions[top].subkernel + knl = callables_table[top].subkernel rule_mapping_context = SubstitutionRuleMappingContext( knl.substitutions, knl.get_var_name_generator()) callables_collector = CallableResolver( @@ -313,19 +311,17 @@ class Program(ImmutableRecord): known_callables) knl = rule_mapping_context.finish_kernel( callables_collector.map_kernel(knl)) - resolved_functions[top] = resolved_functions[top].copy(subkernel=knl) + callables_table[top] = callables_table[top].copy(subkernel=knl) for func, clbl in six.iteritems(callables_collector.resolved_functions): - if func not in resolved_functions: + if func not in callables_table: if isinstance(clbl, CallableKernel): queue.append(func) - resolved_functions[func] = clbl + callables_table[func] = clbl else: - assert resolved_functions[func] == clbl + assert callables_table[func] == clbl - new_callables_table = CallablesTable(resolved_functions=resolved_functions) - - return self.copy(callables_table=new_callables_table) + return self.copy(callables_table=callables_table) def __iter__(self): #FIXME: Document @@ -466,8 +462,7 @@ class CallablesIDCollector(CombineMapper): return reduce(operator.or_, values, frozenset()) def map_resolved_function(self, expr): - return frozenset([self.kernel.scoped_functions[ - expr.name]]) + return frozenset([expr.name]) def map_constant(self, expr): return frozenset() @@ -503,12 +498,12 @@ class CallablesInferenceContext(ImmutableRecord): history = dict((func_id, frozenset([func_id])) for func_id in callables) - super(CallablesTable, self).__init__(callables, history) + super(CallablesInferenceContext, self).__init__(callables, history) clbl_id_collector = CallablesIDCollector() self.old_callables_ids = frozenset().union(*( clbl_id_collector.map_kernel(clbl.subkernel) for clbl in - self.values() if isinstance(clbl, CallableKernel))) + callables.values() if isinstance(clbl, CallableKernel))) # {{{ interface to perform edits on callables @@ -593,6 +588,7 @@ class CallablesInferenceContext(ImmutableRecord): then all the renaming is done such that one of flavors of the callable is renamed back to ``sin``. """ + 1/0 assert self.is_being_edited @@ -682,8 +678,8 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): def _collective_transform(program_or_kernel, *args, **kwargs): if isinstance(program_or_kernel, Program): program = program_or_kernel - new_resolved_functions = {} - for func_id, in_knl_callable in program.callables_table.items(): + new_callables = {} + for func_id, in_knl_callable in six.iteritems(program.callables_table): if isinstance(in_knl_callable, CallableKernel): new_subkernel = transform_for_single_kernel( in_knl_callable.subkernel, *args, **kwargs) @@ -695,11 +691,9 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): raise NotImplementedError("Unknown type of callable %s." 
% ( type(in_knl_callable).__name__)) - new_resolved_functions[func_id] = in_knl_callable + new_callables[func_id] = in_knl_callable - new_callables_table = program.callables_table.copy( - resolved_functions=new_resolved_functions) - return program.copy(callables_table=new_callables_table) + return program.copy(callables_table=new_callables) else: assert isinstance(program_or_kernel, LoopKernel) kernel = program_or_kernel diff --git a/loopy/type_inference.py b/loopy/type_inference.py index ccf614843..5ff7bbb92 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1045,7 +1045,7 @@ def infer_unknown_types(program, expect_completion=False): # callable to the clbl_inf_ctx while writing the "with_types" arg_id_to_dtype = dict((arg.name, arg.dtype) for arg in program[e].args if arg.dtype not in (None, auto)) - new_callable, clbl_inf_ctx = program[e].with_types( + new_callable, clbl_inf_ctx = program.callables_table[e].with_types( arg_id_to_dtype, None, clbl_inf_ctx) clbl_inf_ctx, _ = clbl_inf_ctx.with_callable(e, new_callable) -- GitLab From 11f28f992c9a6b1109722fa5f521c373eb9bb238 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Oct 2019 15:37:49 -0500 Subject: [PATCH 605/916] callables_table -> clbl_inf_ctx --- loopy/type_inference.py | 45 +++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 5ff7bbb92..e3091171d 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -40,6 +40,7 @@ from loopy.symbolic import ( SubstitutionRuleExpander, ResolvedFunction, SubstitutionRuleMappingContext, SubArrayRef) from pymbolic.primitives import Variable, Subscript, Lookup +from loopy.program import CallablesInferenceContext import logging logger = logging.getLogger(__name__) @@ -196,7 +197,7 @@ def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, callables_table, new_assignments=None): + def __init__(self, kernel, clbl_inf_ctx, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -205,12 +206,12 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel - assert isinstance(callables_table, CallablesTable) + assert isinstance(clbl_inf_ctx, CallablesInferenceContext) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() - self.callables_table = callables_table + self.clbl_inf_ctx = clbl_inf_ctx self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): @@ -244,16 +245,16 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). 
- def copy(self, callables_table=None): - if callables_table is None: - callables_table = self.callables_table - return type(self)(self.kernel, callables_table, + def copy(self, clbl_inf_ctx=None): + if clbl_inf_ctx is None: + clbl_inf_ctx = self.clbl_inf_ctx + return type(self)(self.kernel, clbl_inf_ctx, self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, self.callables_table, new_ass) + return type(self)(self.kernel, self.clbl_inf_ctx, new_ass) @staticmethod def combine(dtype_sets): @@ -430,7 +431,7 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type if isinstance(expr.function, ResolvedFunction): - in_knl_callable = self.callables_table[expr.function.name] + in_knl_callable = self.clbl_inf_ctx[expr.function.name] # {{{ checking that there is no overwriting of types of in_knl_callable @@ -467,17 +468,18 @@ class TypeInferenceMapper(CombineMapper): "InKernelCallable?") # }}} - in_knl_callable, self.callables_table = ( + + in_knl_callable, self.clbl_inf_ctx = ( in_knl_callable.with_types( arg_id_to_dtype, self.kernel, - self.callables_table)) + self.clbl_inf_ctx)) in_knl_callable = in_knl_callable.with_target(self.kernel.target) # storing the type specialized function so that it can be used for # later use - self.callables_table, new_function_id = ( - self.callables_table.with_callable( + self.clbl_inf_ctx, new_function_id = ( + self.clbl_inf_ctx.with_callable( expr.function.function, in_knl_callable)) @@ -750,13 +752,13 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if not dtype_sets: return ( None, type_inf_mapper.symbols_with_unknown_types, None, - type_inf_mapper.callables_table) + type_inf_mapper.clbl_inf_ctx) result = type_inf_mapper.combine(dtype_sets) return (result, type_inf_mapper.symbols_with_unknown_types, type_inf_mapper.old_calls_to_new_calls, - type_inf_mapper.callables_table) + type_inf_mapper.clbl_inf_ctx) # }}} @@ -783,7 +785,7 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types_for_a_single_kernel(kernel, callables_table, +def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx, expect_completion=False): """Infer types on temporaries and arguments.""" @@ -846,7 +848,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, callables_table, + type_inf_mapper = TypeInferenceMapper(kernel, clbl_inf_ctx, item_lookup) from loopy.symbolic import SubstitutionRuleExpander @@ -882,13 +884,13 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, debug("inferring type for %s %s", type(item).__name__, item.name) try: (result, symbols_with_unavailable_types, - new_old_calls_to_new_calls, callables_table) = ( + new_old_calls_to_new_calls, clbl_inf_ctx) = ( _infer_var_type( kernel, item.name, type_inf_mapper, subst_expander)) except DependencyTypeInferenceFailure: result = tuple() type_inf_mapper = type_inf_mapper.copy( - callables_table=callables_table) + clbl_inf_ctx=clbl_inf_ctx) failed = not result if not failed: @@ -1006,7 +1008,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, raise NotImplementedError("Unknown instructions type %s." 
% ( type(insn).__name__)) - callables_table = type_inf_mapper.callables_table + clbl_inf_ctx = type_inf_mapper.clbl_inf_ctx old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) end_time = time.time() @@ -1030,13 +1032,12 @@ def infer_unknown_types_for_a_single_kernel(kernel, callables_table, from loopy.check import check_functions_are_resolved check_functions_are_resolved(type_specialized_kernel) - return type_specialized_kernel, callables_table + return type_specialized_kernel, clbl_inf_ctx def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" from loopy.kernel.data import auto - from loopy.program import CallablesInferenceContext clbl_inf_ctx = CallablesInferenceContext(program.callables_table) -- GitLab From b15abd765c9bd7349d7cd9a2f3546333217c1a65 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Oct 2019 15:38:17 -0500 Subject: [PATCH 606/916] minor fixes and code readjustments --- loopy/program.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index e0c2b5032..4356fcbdc 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -25,7 +25,7 @@ THE SOFTWARE. import six import re -from pytools import ImmutableRecord, memoize_method +from pytools import ImmutableRecord from pymbolic.primitives import Variable from functools import wraps @@ -491,19 +491,28 @@ class CallablesIDCollector(CombineMapper): map_type_cast = map_constant +def _get_callables_ids(callables): + clbl_id_collector = CallablesIDCollector() + + return frozenset().union(*( + clbl_id_collector.map_kernel(clbl.subkernel) for clbl in + callables.values() if isinstance(clbl, CallableKernel))) + + class CallablesInferenceContext(ImmutableRecord): - def __init__(self, callables, history=None): + def __init__(self, callables, old_callables_id=None, history=None): assert isinstance(callables, dict) if history is None: history = dict((func_id, frozenset([func_id])) for func_id in callables) - super(CallablesInferenceContext, self).__init__(callables, history) + if old_callables_id is None: + self.old_callables_ids = _get_callables_ids(callables) - clbl_id_collector = CallablesIDCollector() - self.old_callables_ids = frozenset().union(*( - clbl_id_collector.map_kernel(clbl.subkernel) for clbl in - callables.values() if isinstance(clbl, CallableKernel))) + super(CallablesInferenceContext, self).__init__( + callables=callables, + old_callables_id=old_callables_id, + history=history) # {{{ interface to perform edits on callables @@ -561,7 +570,7 @@ class CallablesInferenceContext(ImmutableRecord): unique_function_identifier = function.name - while unique_function_identifier in self.resolved_functions: + while unique_function_identifier in self.callables: unique_function_identifier = ( next_indexed_function_identifier( unique_function_identifier)) @@ -588,8 +597,6 @@ class CallablesInferenceContext(ImmutableRecord): then all the renaming is done such that one of flavors of the callable is renamed back to ``sin``. 
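
        For instance (an illustrative sketch only, not the actual
        implementation): if type inference split calls to ``sin`` into
        ``sin_0`` and ``sin_1``, the history records
        ``{"sin_0": "sin", "sin_1": "sin"}`` and the final pass maps one
        flavor back to the original identifier::

            history = {"sin_0": "sin", "sin_1": "sin"}

            renames = {}
            taken = set()
            for new_name, old_name in history.items():
                renames[new_name] = old_name if old_name not in taken else new_name
                taken.add(old_name)
            # e.g. {"sin_0": "sin", "sin_1": "sin_1"} (iteration-order dependent)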
""" - 1/0 - assert self.is_being_edited new_callables_count = self.callables_count @@ -646,6 +653,13 @@ class CallablesInferenceContext(ImmutableRecord): # }}} + def __getitem__(self, name): + result = self.callables[name] + if isinstance(result, CallableKernel): + return result.subkernel + else: + return result + # {{{ helper functions -- GitLab From 266fea05eeb4bf3c082cad5d313f8a9d97684c28 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Oct 2019 22:09:35 -0500 Subject: [PATCH 607/916] Removes support for "return_list_of_knl" in parse_fortran --- loopy/frontend/fortran/__init__.py | 9 ++------- loopy/ipython_ext.py | 2 +- test/test_fortran.py | 7 +++---- test/test_numa_diff.py | 2 +- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index bc360b996..9b63c10f8 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -296,11 +296,9 @@ def _add_assignees_to_calls(knl, all_kernels): def parse_fortran(source, filename="", free_form=None, strict=None, - seq_dependencies=None, auto_dependencies=None, target=None, - return_list_of_knls=False): + seq_dependencies=None, auto_dependencies=None, target=None): """ - :returns: an instance of :class:`list` of :class:`loopy.LoopKernel`s if - *return_list_of_knls* is True else a :class:`loopy.Program`. + :returns: A :class:`loopy.Program`. """ parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) @@ -342,9 +340,6 @@ def parse_fortran(source, filename="", free_form=None, strict=None, kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) - if return_list_of_knls: - return kernels - kernels = [_add_assignees_to_calls(knl, kernels) for knl in kernels] from loopy.kernel.tools import identify_root_kernel diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index e44b183ed..ec1b10f1f 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -9,7 +9,7 @@ import loopy as lp class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): - result = lp.parse_fortran(cell, return_list_of_knls=True) + result = lp.parse_fortran(cell) for knl in result: self.shell.user_ns[knl.name] = knl diff --git a/test/test_fortran.py b/test/test_fortran.py index 1ab28409b..856d85c49 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -534,10 +534,9 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! ! # FIXME: correct this after the "Module" is done. - ! # prg = lp.parse_fortran(SOURCE) - ! # fill = prg["fill"] - ! # twice = prg["twice"] - ! fill, twice = lp.parse_fortran(SOURCE, return_list_of_knls=True) + ! prg = lp.parse_fortran(SOURCE) + ! fill = prg["fill"] + ! twice = prg["twice"] ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) ! 
RESULT = knl diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 55a2d2e11..de0bcf70a 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -61,7 +61,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa hsv_r, hsv_s = [ knl for knl in lp.parse_fortran(source, filename, - seq_dependencies=False, return_list_of_knls=True) + seq_dependencies=False) if "KernelR" in knl.name or "KernelS" in knl.name ] hsv_r = lp.tag_instructions(hsv_r, "rknl") -- GitLab From 435155d5b0a1134adc0cd93f678489a506bcd6c6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Oct 2019 22:35:15 -0500 Subject: [PATCH 608/916] deprecates is_output_only --- loopy/kernel/data.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index f0d7b3789..c1acd5069 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -371,8 +371,16 @@ class ArrayArg(ArrayBase, KernelArgument): def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - kwargs["is_output"] = kwargs.pop("is_output", None) - kwargs["is_input"] = kwargs.pop("is_input", None) + + is_output_only = kwargs.pop("is_output_only", None) + if is_output_only is not None: + warn("'is_output_only' is deprecated. Use 'is_output', 'is_input'" + " instead.", DeprecationWarning, stacklevel=2) + kwargs["is_output"] = is_output_only + kwargs["is_input"] = not is_output_only + else: + kwargs["is_output"] = kwargs.pop("is_output", None) + kwargs["is_input"] = kwargs.pop("is_input", None) super(ArrayArg, self).__init__(*args, **kwargs) -- GitLab From 63979735f675e2d76033cb1e6177ee9d0187cd87 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Oct 2019 11:30:04 -0500 Subject: [PATCH 609/916] handles minor docs issues --- loopy/kernel/data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index c1acd5069..0d74b7248 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -355,12 +355,14 @@ class ArrayArg(ArrayBase, KernelArgument): .. attribute:: is_output An instance of :class:`bool`. If set to *True*, the argument is used - to return information to the caller + to return information to the caller. If set to *False*, then the + callee should not write the array during execution. .. attribute:: is_input An instance of :class:`bool`. If set to *True*, expected to be - provided by the caller. + provided by the caller. If *False* then the callee should not depend + on the state of the array on entry to a function. """) allowed_extra_kwargs = [ -- GitLab From 6f177eb923b01e7e1e3c789f83fe2ce347387e9b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Oct 2019 12:30:51 -0500 Subject: [PATCH 610/916] minor rewording in comments/error strings --- loopy/transform/callable.py | 36 +++++++++++------------------------- test/test_fortran.py | 2 +- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index a87a43f4e..2cde66767 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -154,19 +154,6 @@ class _RegisterCalleeKernel(ImmutableRecord): return None -def subarrayrefs_are_equiv(sar1, sar2, knl): - """ - Compares if two instance of :class:`loopy.symbolic.SubArrayRef`s point - to the same array region. 
- """ - from loopy.kernel.function_interface import get_arg_descriptor_for_expression - - return get_arg_descriptor_for_expression(knl, sar1) == ( - get_arg_descriptor_for_expression(knl, sar2)) and ( - sar1.get_begin_subscript(knl) == - sar2.get_begin_subscript(knl)) - - def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): from loopy.kernel.function_interface import get_kw_pos_association kw_to_pos, pos_to_kw = get_kw_pos_association(callee_kernel) @@ -178,8 +165,8 @@ def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): for i, param in enumerate(expr.parameters): pos = kw_to_pos[callee_kernel.args[i].name] if pos < 0: - raise LoopyError("#{} argument meant for output obtained as an" - " input in '{}'.".format(i, insn)) + raise LoopyError("#{}(1-based) argument meant for output obtained as an" + " input in '{}'.".format(i+1, insn)) assert pos == i @@ -188,7 +175,7 @@ def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): for kw, param in six.iteritems(expr.kw_parameters): pos = kw_to_pos[kw] if pos < 0: - raise LoopyError("KW-argument '{}' meant for output obtained as an" + raise LoopyError("Keyword argument '{}' meant for output obtained as an" " input in '{}'.".format(kw, insn)) callee_args_to_insn_params[pos].append(param) @@ -203,8 +190,6 @@ def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): callee_args_to_insn_params[pos].append(assignee) - # TODO: Some of the checks might be redundant. - for arg, insn_params in zip(callee_kernel.args, callee_args_to_insn_params): if len(insn_params) == 1: @@ -218,14 +203,15 @@ def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): raise LoopyError("Found multiple parameters mapping to an" " argument which is not both input and output in" " ''.".format()) - if not subarrayrefs_are_equiv(insn_params[0], insn_params[1], - caller_knl): - raise LoopyError("'{}' and '{}' point to the same argument in" - " the callee, but are unequal.".format( - insn_params[0], insn_params[1])) + if insn_params[0] != insn_params[1]: + raise LoopyError("Unequal SubArrayRefs '{}', '{}' passed as '{}'" + " to '{}'.".format(insn_params[0], insn_params[1], + arg.name, callee_kernel.name)) else: - raise LoopyError("Multiple(>2) arguments pointing to the same" - " argument for '{}' in '{}'.".format(callee_kernel.name, + # repitition due incorrect usage of kwargs and + # positional args + raise LoopyError("Multiple(>2) arguments obtained for" + " '{}' in '{}'.".format(callee_kernel.name, insn)) diff --git a/test/test_fortran.py b/test/test_fortran.py index 856d85c49..c6b7e8e37 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -533,7 +533,7 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! # FIXME: correct this after the "Module" is done. + ! # FIXME: correct this after the "TranslationUnit" is done. ! prg = lp.parse_fortran(SOURCE) ! fill = prg["fill"] ! 
twice = prg["twice"] -- GitLab From 2dadb47d8c45c1a068316bdcbefdedbd1ca4071d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Oct 2019 12:31:42 -0500 Subject: [PATCH 611/916] cache the results of slice->SAR during the processing of an instruction --- loopy/kernel/creation.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 4be7e06b8..5582b0c63 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1888,9 +1888,18 @@ class SliceToInameReplacer(IdentityMapper): self.var_name_gen = var_name_gen self.knl = knl + # caching to map equivalent slices to equivalent SubArrayRefs + self.cache = {} + self.subarray_ref_bounds = [] + def clear_cache(self): + self.cache = {} + def map_subscript(self, expr): + if expr in self.cache: + return self.cache[expr] + subscript_iname_bounds = {} self.subarray_ref_bounds.append(subscript_iname_bounds) @@ -1919,11 +1928,15 @@ class SliceToInameReplacer(IdentityMapper): new_index.append(index) if swept_inames: - return SubArrayRef(tuple(swept_inames), Subscript( + result = SubArrayRef(tuple(swept_inames), Subscript( self.rec(expr.aggregate), self.rec(tuple(new_index)))) else: - return IdentityMapper.map_subscript(self, expr) + result = IdentityMapper.map_subscript(self, expr) + + self.cache[expr] = result + + return result def map_call(self, expr): def _convert_array_to_slices(arg): @@ -2014,6 +2027,8 @@ def realize_slices_array_inputs_as_sub_array_refs(kernel): raise NotImplementedError("Unknown type of instruction -- %s" % type(insn)) + slice_replacer.clear_cache() + return kernel.copy( domains=( kernel.domains -- GitLab From 584c4d0de273295c320694ced999f7bf01ba4301 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Oct 2019 13:16:16 -0500 Subject: [PATCH 612/916] minor docs fix --- loopy/kernel/tools.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index d0e4ef084..7dfe4f48b 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1930,13 +1930,13 @@ def infer_args_are_input_output(kernel): .. note:: - If the attribute ``is_output`` of an argument is not supplied from an - user, then it is inferred as an output argument if it is written at + If the :attr:`~loopy.ArrayArg.is_output` is not supplied from a user, + then the array is inferred as an output argument if it is written at some point in the kernel. - If the attribute ``is_input`` of an argument of is not supplied from - an user, then it is inferred as an input argument if it is either read - at some point in the kernel or it is neither read nor written. + If the :attr:`~loopy.ArrayArg.is_input` is not supplied from a user, + then the array is inferred as an input argument if it is either read at + some point in the kernel or it is neither read nor written. 
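
        For example (a sketch; the kernel body is immaterial)::

            import loopy as lp

            knl = lp.make_kernel(
                "{[i]: 0 <= i < n}",
                "out[i] = 2 * a[i]")

        Here ``out`` is only written, so it would be inferred with
        ``is_output=True`` and ``is_input=False``, while ``a`` is only read,
        so it would be inferred with ``is_input=True`` and ``is_output=False``.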
""" from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] -- GitLab From 44d4c497b3aa22f07ca004b7c97e7860297bbf6e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Oct 2019 13:31:47 -0500 Subject: [PATCH 613/916] fuse_kernel should take in LoopKernels --- loopy/transform/fusion.py | 150 ++++++++++++-------------------------- 1 file changed, 45 insertions(+), 105 deletions(-) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 45e9c0a06..287c810e2 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -32,8 +32,6 @@ from loopy.diagnostic import LoopyError from pymbolic import var from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel -from loopy.program import rename_resolved_functions_in_a_single_kernel def _apply_renames_in_exprs(kernel, var_renames): @@ -291,7 +289,51 @@ def _fuse_two_kernels(knla, knlb): # }}} -def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): +def fuse_kernels(kernels, suffixes=None, data_flow=None): + """Return a kernel that performs all the operations in all entries + of *kernels*. + + :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. + :arg suffixes: If given, must be a list of strings of a length matching + that of *kernels*. This will be used to disambiguate the names + of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. + *from_kernel* and *to_kernel* are indices into *kernels*. + + The components of the kernels are fused as follows: + + * The resulting kernel will have a domain involving all the inames + and parameters occurring across *kernels*. + Inames with matching names across *kernels* are fused in such a way + that they remain a single iname in the fused kernel. + Use :func:`loopy.rename_iname` if this is not desired. + + * The projection of the domains of each pair of kernels onto their + common subset of inames must match in order for fusion to + succeed. + + * Assumptions are fused by taking their conjunction. + + * If kernel arguments with matching names are encountered across + *kernels*, their declarations must match in order for fusion to + succeed. + + * Temporaries are automatically renamed to remain uniquely associated + with each instruction stream. + + * The resulting kernel will contain all instructions from each entry + of *kernels*. Clashing instruction IDs will be renamed to ensure + uniqueness. + + .. versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 + """ + assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) @@ -373,106 +415,4 @@ def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): return result - -def fuse_kernels(programs, suffixes=None, data_flow=None): - """Return a kernel that performs all the operations in all entries - of *kernels*. - - :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. - :arg suffixes: If given, must be a list of strings of a length matching - that of *kernels*. This will be used to disambiguate the names - of temporaries, as described below. - :arg data_flow: A list of data dependencies - ``[(var_name, from_kernel, to_kernel), ...]``. 
- Based on this, the fuser will create dependencies between all - writers of *var_name* in ``kernels[from_kernel]`` to - readers of *var_name* in ``kernels[to_kernel]``. - *from_kernel* and *to_kernel* are indices into *kernels*. - - The components of the kernels are fused as follows: - - * The resulting kernel will have a domain involving all the inames - and parameters occurring across *kernels*. - Inames with matching names across *kernels* are fused in such a way - that they remain a single iname in the fused kernel. - Use :func:`loopy.rename_iname` if this is not desired. - - * The projection of the domains of each pair of kernels onto their - common subset of inames must match in order for fusion to - succeed. - - * Assumptions are fused by taking their conjunction. - - * If kernel arguments with matching names are encountered across - *kernels*, their declarations must match in order for fusion to - succeed. - - * Temporaries are automatically renamed to remain uniquely associated - with each instruction stream. - - * The resulting kernel will contain all instructions from each entry - of *kernels*. Clashing instruction IDs will be renamed to ensure - uniqueness. - - .. versionchanged:: 2016.2 - - *data_flow* was added in version 2016.2 - """ - - from loopy.program import make_program - - programs = [make_program(knl) if isinstance(knl, LoopKernel) else knl for - knl in programs] - - # all the resolved functions in programs must be registered in - # main_callables_table - main_prog_callables_info = ( - programs[0].callables_table) - old_root_kernel_callable = ( - programs[0].callables_table[programs[0].name]) - kernels = [programs[0].root_kernel] - - # removing the callable collisions that maybe present - for prog in programs[1:]: - root_kernel = prog.root_kernel - renames_needed = {} - for old_func_id, in_knl_callable in prog.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - # Fusing programs with multiple callable kernels is tough. - # Reason: Need to first figure out the order in which the - # callable kernels must be resolved into - # main_callables_table, because of renaming is - # needed to be done in the callable kernels before registering. - # Hence disabling it until required. - if in_knl_callable.subkernel.name != prog.name: - raise LoopyError("fuse_kernels cannot fuse programs with " - "multiple callable kernels.") - - # root kernel are dealt at the end after performing all the - # renaming. - continue - main_prog_callables_info, new_func_id = ( - main_prog_callables_info.with_added_callable(var(old_func_id), - in_knl_callable)) - - if old_func_id != new_func_id: - renames_needed[old_func_id] = new_func_id - - if renames_needed: - root_kernel = rename_resolved_functions_in_a_single_kernel( - root_kernel, renames_needed) - - kernels.append(root_kernel) - - new_root_kernel = fuse_loop_kernels(kernels, suffixes, data_flow) - new_root_kernel_callable = old_root_kernel_callable.copy( - subkernel=new_root_kernel.copy(name=programs[0].name)) - - # TODO: change the name of the final root kernel. 
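
Under the new interface, callers obtain plain LoopKernels (for example by
indexing into a parsed program, as in the updated Fortran test above) and pass
those to fuse_kernels directly. A hypothetical sketch, assuming a program with
entries "fill" and "twice":

    import loopy as lp

    prg = lp.parse_fortran(SOURCE)       # SOURCE: some Fortran text (not shown)
    fill, twice = prg["fill"], prg["twice"]   # LoopKernel instances
    fused = lp.fuse_kernels((fill, twice))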
- main_prog_callables_info, _ = main_prog_callables_info.with_added_callable( - var(programs[0].name), new_root_kernel_callable) - - return programs[0].copy( - callables_table=main_prog_callables_info) - # vim: foldmethod=marker -- GitLab From 71b05d5be15c38b4534dfdc92d056ebb6bfbf44a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Oct 2019 14:36:51 -0500 Subject: [PATCH 614/916] way better docs for _check_correctness_of_args_and_assignees --- loopy/transform/callable.py | 72 +++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 2cde66767..2fb9168ec 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -154,14 +154,29 @@ class _RegisterCalleeKernel(ImmutableRecord): return None -def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): +def _check_correctness_of_args_and_assignees(insn, callee_kernel): + """ + Checks that -- + 1. the call in *insn* agrees the :attr:`~loopy.ArrayArg.is_input` and + :attr:`~loopy.ArrayArg.is_output` for the corresponding arguments in + *callee_kernel*, + 2. the call does not get multiple values for a keyword argument, + 3. only the arguments that are both output and input appear in the + assignees as well as parameters in *insn*'s call. + """ from loopy.kernel.function_interface import get_kw_pos_association kw_to_pos, pos_to_kw = get_kw_pos_association(callee_kernel) + + # mapping from argument index in callee to the assignees/paramters mapping + # to it callee_args_to_insn_params = [[] for _ in callee_kernel.args] expr = insn.expression - from pymbolic.primitives import Call, CallWithKwargs + from pymbolic.primitives import Call if isinstance(expr, Call): expr = CallWithKwargs(expr.function, expr.parameters, kw_parameters={}) + + # {{{ check that call parameters are input arguments in callee + for i, param in enumerate(expr.parameters): pos = kw_to_pos[callee_kernel.args[i].name] if pos < 0: @@ -179,6 +194,20 @@ def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): " input in '{}'.".format(kw, insn)) callee_args_to_insn_params[pos].append(param) + # }}} + + # {{{ check that positional and Keyword arguments and positional do not map + # to the same callee arg + + if any(len(pars) >= 2 for pars in callee_args_to_insn_params): + raise LoopyError("{}() got multiple values for keyword argument" + " '{}'".format(callee_kernel.name, callee_kernel.args[i].name)) + + # }}} + + # {{{ check that only the args which are both input and output appear both + # in assignees and parameters + num_pure_assignees = 0 for i, assignee in enumerate(insn.assignees): pos = kw_to_pos[pos_to_kw[-i-1]] @@ -195,7 +224,7 @@ def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): if len(insn_params) == 1: # making sure that the argument is either only input or output if arg.is_input == arg.is_output: - raise LoopyError("Argument '{}' in '{}' should be passed in" + raise LoopyError("Parameter '{}' in '{}' should be passed in" " both assignees and parameters in Call.".format( insn_params[0], insn)) elif len(insn_params) == 2: @@ -208,11 +237,10 @@ def _check_correctness_of_args_and_assignees(insn, callee_kernel, caller_knl): " to '{}'.".format(insn_params[0], insn_params[1], arg.name, callee_kernel.name)) else: - # repitition due incorrect usage of kwargs and - # positional args - raise LoopyError("Multiple(>2) arguments obtained for" - " '{}' in 
'{}'.".format(callee_kernel.name, - insn)) + # should not reach here + assert False + + # }}} def register_callable_kernel(program, callee_kernel): @@ -230,37 +258,13 @@ def register_callable_kernel(program, callee_kernel): assert isinstance(callee_kernel, LoopKernel), ('{0} !=' '{1}'.format(type(callee_kernel), LoopKernel)) - # check to make sure that the variables with 'out' direction is equal to - # the number of assigness in the callee kernel intructions. - expected_num_assignees = sum(arg.is_output for arg in callee_kernel.args) - expected_num_arguments = sum(arg.is_input for arg in callee_kernel.args) for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): caller_kernel = in_knl_callable.subkernel for insn in caller_kernel.instructions: if isinstance(insn, CallInstruction) and ( insn.expression.function.name == callee_kernel.name): - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - else: - kw_parameters = {} - if len(insn.assignees) != expected_num_assignees: - raise LoopyError("The number of arguments with 'out' " - "direction " "in callee kernel %s and the number " - "of assignees in " "instruction %s do not " - "match." % ( - callee_kernel.name, insn.id)) - if len(insn.expression.parameters+tuple( - kw_parameters.values())) != expected_num_arguments: - raise LoopyError("The number of" - " arguments in instruction '%s' do not match" - " the number of input arguments in" - " the callee kernel '%s' => arg matching" - " not possible." - % (insn.id, callee_kernel.name)) - - _check_correctness_of_args_and_assignees(insn, - callee_kernel, caller_kernel) + _check_correctness_of_args_and_assignees(insn, callee_kernel) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): @@ -439,8 +443,6 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): parameters = instruction.expression.parameters # reads # add keyword parameters - from pymbolic.primitives import CallWithKwargs - if isinstance(instruction.expression, CallWithKwargs): from loopy.kernel.function_interface import get_kw_pos_association -- GitLab From 1363a694946cad14db8d085eb3bb5bb709fa4bec Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Oct 2019 14:45:19 -0500 Subject: [PATCH 615/916] SubArrayRef.begin_subscript -> get_start_subscript_from_sar --- loopy/symbolic.py | 41 ++++++++++++++-------------- loopy/target/c/codegen/expression.py | 3 +- loopy/transform/callable.py | 3 +- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 53d8d4431..6a664f60e 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -809,6 +809,27 @@ class SweptInameStrideCollector(CoefficientCollectorBase): return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) +def get_start_subscript_from_sar(sar, kernel): + """ + Returns an instance of :class:`pymbolic.primitives.Subscript`, the + beginning subscript of the array swept by the *SubArrayRef*. + + **Example:** Consider ``[i, k]: a[i, j, k, l]``. 
The beginning + subscript would be ``a[0, j, 0, l]`` + """ + + def _get_lower_bound(iname): + pwaff = kernel.get_iname_bounds(iname).lower_bound_pw_aff + return int(pw_aff_to_expr(pwaff)) + + swept_inames_to_zeros = dict( + (swept_iname.name, _get_lower_bound(swept_iname.name)) for + swept_iname in sar.swept_inames) + + return EvaluatorWithDeficientContext(swept_inames_to_zeros)( + sar.subscript) + + class SubArrayRef(LoopyExpressionBase): """ An algebraic expression to map an affine memory layout pattern (known as @@ -847,26 +868,6 @@ class SubArrayRef(LoopyExpressionBase): self.swept_inames = swept_inames self.subscript = subscript - def get_begin_subscript(self, kernel): - """ - Returns an instance of :class:`pymbolic.primitives.Subscript`, the - beginning subscript of the array swept by the *SubArrayRef*. - - **Example:** Consider ``[i, k]: a[i, j, k, l]``. The beginning - subscript would be ``a[0, j, 0, l]`` - """ - - def _get_lower_bound(iname): - pwaff = kernel.get_iname_bounds(iname).lower_bound_pw_aff - return int(pw_aff_to_expr(pwaff)) - - swept_inames_to_zeros = dict( - (swept_iname.name, _get_lower_bound(swept_iname.name)) for - swept_iname in self.swept_inames) - - return EvaluatorWithDeficientContext(swept_inames_to_zeros)( - self.subscript) - def __getinitargs__(self): return (self.swept_inames, self.subscript) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 5a066ddfb..b0bc187eb 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -167,7 +167,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): return var(expr.name) def map_sub_array_ref(self, expr, type_context): - return var("&")(self.rec(expr.get_begin_subscript(self.kernel), + from loopy.symbolic import get_start_subscript_from_sar + return var("&")(self.rec(get_start_subscript_from_sar(expr, self.kernel), type_context)) def map_subscript(self, expr, type_context): diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 2fb9168ec..1bbdb1201 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -356,7 +356,8 @@ class KernelInliner(SubstitutionMapper): "constant shape.".format(callee_arg)) flatten_index = 0 - for i, idx in enumerate(sar.get_begin_subscript( + from loopy.symbolic import get_start_subscript_from_sar + for i, idx in enumerate(get_start_subscript_from_sar(sar, self.caller).index_tuple): flatten_index += idx*caller_arg.dim_tags[i].stride -- GitLab From 65c25393a2f8741dc39da9a7a34c85f70bd576c2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Oct 2019 14:50:37 -0500 Subject: [PATCH 616/916] better phrasing of comment --- loopy/kernel/function_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 2b50a2dc9..38beeaf44 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -231,8 +231,8 @@ def get_kw_pos_association(kernel): pos_to_kw[write_count] = arg.name write_count -= 1 if arg.is_input: - # if an argument is both input and output then the input is given - # more significance in kw_to_pos + # if an argument is both input and output then kw_to_pos is + # overwritten with its expected position in the parameters kw_to_pos[arg.name] = read_count pos_to_kw[read_count] = arg.name read_count += 1 -- GitLab From 80377b01173ef9b73412bf77ff9f7043addd4a5c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 6 Oct 2019 21:24:56 
-0500 Subject: [PATCH 617/916] Completed CallableInferenceCollector.finish_program - Did some changes in type inference to account for some changes due to minor interfacial changes in CallableInferenceCollector - Type inference works for simple program --- loopy/program.py | 113 +++++++++++++++++++++++++--------------- loopy/type_inference.py | 12 +++-- 2 files changed, 78 insertions(+), 47 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 4356fcbdc..26c2aa7cf 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -491,27 +491,33 @@ class CallablesIDCollector(CombineMapper): map_type_cast = map_constant -def _get_callables_ids(callables): +def _get_callable_ids_for_knl(knl, callables): clbl_id_collector = CallablesIDCollector() + return frozenset().union(( + _get_callable_ids_for_knl(callables[clbl].subkernel) if + isinstance(callables[clbl], CallableKernel) else clbl + for clbl in clbl_id_collector.map_kernel(knl))) + + +def _get_callable_ids(callables, entrypoints): return frozenset().union(*( - clbl_id_collector.map_kernel(clbl.subkernel) for clbl in - callables.values() if isinstance(clbl, CallableKernel))) + _get_callable_ids_for_knl(callables[e].subkernel, callables) for e in + entrypoints)) + + +def make_clbl_inf_ctx(callables, entrypoints): + return CallablesInferenceContext(callables, _get_callable_ids(callables, + entrypoints)) class CallablesInferenceContext(ImmutableRecord): - def __init__(self, callables, old_callables_id=None, history=None): + def __init__(self, callables, old_callable_ids, history={}): assert isinstance(callables, dict) - if history is None: - history = dict((func_id, frozenset([func_id])) for func_id in - callables) - - if old_callables_id is None: - self.old_callables_ids = _get_callables_ids(callables) super(CallablesInferenceContext, self).__init__( callables=callables, - old_callables_id=old_callables_id, + old_callable_ids=old_callable_ids, history=history) # {{{ interface to perform edits on callables @@ -543,7 +549,7 @@ class CallablesInferenceContext(ImmutableRecord): # identifier corresponding to that callable. for func_id, in_knl_callable in self.callables.items(): if in_knl_callable == in_kernel_callable: - history[func_id] = history[func_id] | frozenset([function.name]) + history[func_id] = function.name return ( self.copy( history=history), @@ -554,8 +560,9 @@ class CallablesInferenceContext(ImmutableRecord): # {{{ handle ReductionOpFunction if isinstance(function, ReductionOpFunction): - # FIXME: Check what happens if we have 2 same ArgMax functions - # with different types in the same kernel! + # FIXME: Check if we have 2 ArgMax functions + # with different types in the same kernel the generated code + # does not mess up the types. unique_function_identifier = function.copy() updated_callables = self.callables.copy() updated_callables[unique_function_identifier] = ( @@ -579,8 +586,7 @@ class CallablesInferenceContext(ImmutableRecord): updated_callables[unique_function_identifier] = ( in_kernel_callable) - history[unique_function_identifier] = ( - history[function.name] | frozenset([unique_function_identifier])) + history[unique_function_identifier] = function.name return ( self.copy( @@ -588,42 +594,66 @@ class CallablesInferenceContext(ImmutableRecord): callables=updated_callables), Variable(unique_function_identifier)) - def finish_program(self, program): + def finish_program(self, program, renamed_entrypoints): """ - Returns a copy of *self* with renaming of the callables done whenever - possible. 
+ Returns a copy of *program* with renaming of the callables done whenever + needed. *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``, then all the renaming is done such that one of flavors of the callable is renamed back to ``sin``. + + :param renamed_entrypoints: A :class:`frozenset` of the names of the + renamed callable kernels which correspond to the entrypoints in + *self.callables_table*. """ - assert self.is_being_edited + assert len(renamed_entrypoints) == len(program.entrypoints) + new_callable_ids = _get_callable_ids(self.callables, renamed_entrypoints) + + callees_with_entrypoint_names = (program.entrypoints & + new_callable_ids) - renamed_entrypoints + + renames = {} + new_callables = {} + + for c in callees_with_entrypoint_names: + unique_function_identifier = c - new_callables_count = self.callables_count + while unique_function_identifier in self.callables: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + renames[c] = unique_function_identifier + + # we should perform a rewrite here. + + for e in renamed_entrypoints: + renames[e] = self.history[e] + assert renames[e] in program.entrypoints + new_subkernel = self.callables[e].subkernel.copy(name=self.history[e]) + new_callables[self.history[e]] = self.callables[e].copy( + subkernel=new_subkernel) # {{{ calculate the renames needed - renames_needed = {} - for old_func_id in old_callables_count-new_callables_count: - # this implies that all the function instances having the name - # "func_id" have been renamed to something else. - for new_func_id in ( - six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): - if old_func_id in self.history[new_func_id]: - renames_needed[new_func_id] = old_func_id + for old_func_id in ((self.old_callable_ids-new_callable_ids) - + program.entrypoints): + # at this point we should not rename anything to the names of + # entrypoints + for new_func_id in (new_callable_ids-six.viewkeys(renames)): + if old_func_id == self.history[new_func_id]: + renames[new_func_id] = old_func_id break # }}} - new_resolved_functions = {} - new_history = {} - - for func_id in new_callables_count: - in_knl_callable = self.resolved_functions[func_id] + for func_id in new_callable_ids-renamed_entrypoints: + in_knl_callable = self.callables[func_id] if isinstance(in_knl_callable, CallableKernel): # if callable kernel, perform renames inside its expressions. old_subkernel = in_knl_callable.subkernel new_subkernel = rename_resolved_functions_in_a_single_kernel( - old_subkernel, renames_needed) + old_subkernel, renames) in_knl_callable = ( in_knl_callable.copy(subkernel=new_subkernel)) elif isinstance(in_knl_callable, ScalarCallable): @@ -632,24 +662,21 @@ class CallablesInferenceContext(ImmutableRecord): raise NotImplementedError("Unknown callable type %s." 
% type(in_knl_callable).__name__) - if func_id in renames_needed: - new_func_id = renames_needed[func_id] + if func_id in renames: + new_func_id = renames[func_id] if isinstance(in_knl_callable, CallableKernel): in_knl_callable = (in_knl_callable.copy( subkernel=in_knl_callable.subkernel.copy( name=new_func_id))) - new_resolved_functions[new_func_id] = ( - in_knl_callable) - new_history[new_func_id] = self.history[func_id] + new_callables[new_func_id] = in_knl_callable else: if isinstance(in_knl_callable, CallableKernel): in_knl_callable = in_knl_callable.copy( subkernel=in_knl_callable.subkernel.copy( name=func_id)) - new_resolved_functions[func_id] = in_knl_callable - new_history[func_id] = self.history[func_id] + new_callables[func_id] = in_knl_callable - return program.copy(callables_table=new_callables_table) + return program.copy(callables_table=new_callables) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index e3091171d..6205d219b 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -40,7 +40,7 @@ from loopy.symbolic import ( SubstitutionRuleExpander, ResolvedFunction, SubstitutionRuleMappingContext, SubArrayRef) from pymbolic.primitives import Variable, Subscript, Lookup -from loopy.program import CallablesInferenceContext +from loopy.program import CallablesInferenceContext, make_clbl_inf_ctx import logging logger = logging.getLogger(__name__) @@ -1039,7 +1039,10 @@ def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" from loopy.kernel.data import auto - clbl_inf_ctx = CallablesInferenceContext(program.callables_table) + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, + program.entrypoints) + + renamed_entrypoints = set() for e in program.entrypoints: # FIXME: Need to add docs which say that we need not add the current @@ -1048,9 +1051,10 @@ def infer_unknown_types(program, expect_completion=False): program[e].args if arg.dtype not in (None, auto)) new_callable, clbl_inf_ctx = program.callables_table[e].with_types( arg_id_to_dtype, None, clbl_inf_ctx) - clbl_inf_ctx, _ = clbl_inf_ctx.with_callable(e, new_callable) + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) + renamed_entrypoints.add(new_name.name) - return clbl_inf_ctx.finish_program(program) + return clbl_inf_ctx.finish_program(program, renamed_entrypoints) # }}} -- GitLab From 2fa9bd7808b8d121ba230e1d9419baf944dd2557 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 6 Oct 2019 22:32:02 -0500 Subject: [PATCH 618/916] rectified minor error in CallablesInferenceContext.finish_program --- loopy/program.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 26c2aa7cf..234247bf6 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -631,9 +631,6 @@ class CallablesInferenceContext(ImmutableRecord): for e in renamed_entrypoints: renames[e] = self.history[e] assert renames[e] in program.entrypoints - new_subkernel = self.callables[e].subkernel.copy(name=self.history[e]) - new_callables[self.history[e]] = self.callables[e].copy( - subkernel=new_subkernel) # {{{ calculate the renames needed @@ -647,6 +644,13 @@ class CallablesInferenceContext(ImmutableRecord): break # }}} + for e in renamed_entrypoints: + new_subkernel = self.callables[e].subkernel.copy(name=self.history[e]) + new_subkernel = rename_resolved_functions_in_a_single_kernel( + new_subkernel, renames) + new_callables[self.history[e]] = self.callables[e].copy( + 
subkernel=new_subkernel) + for func_id in new_callable_ids-renamed_entrypoints: in_knl_callable = self.callables[func_id] if isinstance(in_knl_callable, CallableKernel): -- GitLab From ee0bb92c1ab24333bf5f03b567136696bf491e24 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 7 Oct 2019 00:53:38 -0500 Subject: [PATCH 619/916] make the passing of expr to with_descrs optional --- loopy/kernel/function_interface.py | 44 +++++++++++++++++------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 2b50a2dc9..b58e05b6c 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -688,7 +688,8 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=specialized_kernel, arg_id_to_dtype=new_arg_id_to_dtype), callables_table - def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, + expr=None): # tune the subkernel so that we have the matching shapes and # dim_tags @@ -708,32 +709,37 @@ class CallableKernel(InKernelCallable): import numbers substs = {} assumptions = {} - for arg, par in zip(self.subkernel.args, expr.parameters): - if isinstance(arg, ValueArg) and arg.name in domain_dependent_vars: - if isinstance(par, Variable): - if par in substs: - assumptions[arg.name] = substs[par].name - else: - substs[par] = Variable(arg.name) - elif isinstance(par, numbers.Number): - assumptions[arg.name] = par - - def subst_func(expr): - if expr in substs: - return substs[expr] - else: - return expr - subst_mapper = SubstitutionMapper(subst_func) + if expr: + for arg, par in zip(self.subkernel.args, expr.parameters): + if isinstance(arg, ValueArg) and arg.name in domain_dependent_vars: + if isinstance(par, Variable): + if par in substs: + assumptions[arg.name] = substs[par].name + else: + substs[par] = Variable(arg.name) + elif isinstance(par, numbers.Number): + assumptions[arg.name] = par + + def subst_func(expr): + if expr in substs: + return substs[expr] + else: + return expr - arg_id_to_descr = dict((arg_id, descr.map_expr(subst_mapper)) for - arg_id, descr in arg_id_to_descr.items()) + subst_mapper = SubstitutionMapper(subst_func) + + arg_id_to_descr = dict((arg_id, descr.map_expr(subst_mapper)) for + arg_id, descr in arg_id_to_descr.items()) # }}} dependents = frozenset().union(*(descr.depends_on() for descr in arg_id_to_descr.values())) unknown_deps = dependents - self.subkernel.all_variable_names() + + if expr is None: + assert dependents == frozenset() # FIXME: Need to make sure that we make the name of the variables # unique, and then run a subst_mapper -- GitLab From 36c69b331e3cd0e6a67b6700d7432f8d432398e9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 7 Oct 2019 00:54:18 -0500 Subject: [PATCH 620/916] minor fixes --- loopy/program.py | 2 +- loopy/target/execution.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 234247bf6..8cb251386 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -553,7 +553,7 @@ class CallablesInferenceContext(ImmutableRecord): return ( self.copy( history=history), - func_id) + Variable(func_id)) assert False else: diff --git a/loopy/target/execution.py b/loopy/target/execution.py index da5f32546..cfc7a50d7 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -759,9 +759,10 @@ class KernelExecutorBase(object): var_to_dtype)) from 
loopy.type_inference import infer_unknown_types + from loopy.kernel import KernelState program = infer_unknown_types(program, expect_completion=True) - if program.root_kernel.schedule is None: + if program.state < KernelState.SCHEDULED: from loopy.preprocess import preprocess_program program = preprocess_program(program) -- GitLab From 271c41f3d96ddde45c73ba0778a8a75cf832521d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 7 Oct 2019 00:56:47 -0500 Subject: [PATCH 621/916] changes to arg_id_to_descr according to the new renaming interface --- loopy/preprocess.py | 57 +++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c6b69da83..c38dea62d 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2011,10 +2011,7 @@ def realize_reduction(program, *args, **kwargs): knl, callables_table, *args, **kwargs) in_knl_callable = callables_table[knl.name].copy( subkernel=new_knl) - resolved_functions = callables_table.resolved_functions.copy() - resolved_functions[knl.name] = in_knl_callable - callables_table = callables_table.copy( - resolved_functions=resolved_functions) + callables_table[knl.name] = in_knl_callable return program.copy(callables_table=callables_table) @@ -2312,23 +2309,35 @@ def infer_arg_descr(program): :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the callables. """ - root_kernel_callable = program.callables_table[program.name] - old_callables_count = program.callables_table.callables_count - callables_table = ( - program.callables_table.with_edit_callables_mode()) - root_kernel = program.root_kernel - - new_root_kernel, callables_table = traverse_to_infer_arg_descr( - root_kernel, callables_table) - new_root_kernel_callable = root_kernel_callable.copy( - subkernel=new_root_kernel) - callables_table, _ = callables_table.with_callable(program.name, - new_root_kernel_callable) - - callables_table = callables_table.with_exit_edit_callables_mode( - old_callables_count) - return program.copy(callables_table=callables_table) + from loopy.program import make_clbl_inf_ctx + from loopy.kernel.array import ArrayBase + from loopy.kernel.function_interface import (ArrayArgDescriptor, + ValueArgDescriptor) + + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, + program.entrypoints) + + renamed_entrypoints = set() + + for e in program.entrypoints: + # FIXME: Need to add docs which say that we need not add the current + # callable to the clbl_inf_ctx while writing the "with_types" + # This is treacherous, we should use traverse... instead. 
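        # Illustration (hypothetical argument list): for an entrypoint taking
        #     a: float64 array of shape (n, m) in global memory
        #     n: int32 value argument
        # the mapping assembled below comes out roughly as
        #     {"a": ArrayArgDescriptor((n, m), AddressSpace.GLOBAL, a.dim_tags),
        #      "n": ValueArgDescriptor()}
        # and is then handed to with_descrs() for that entrypoint.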
+ def _tuple_if_int(s): + if isinstance(s, int): + return s, + return s + arg_id_to_descr = dict((arg.name, ArrayArgDescriptor( + _tuple_if_int(arg.shape), arg.address_space, arg.dim_tags) if + isinstance(arg, ArrayBase) else ValueArgDescriptor()) for arg in + program[e].args) + new_callable, clbl_inf_ctx, _ = program.callables_table[e].with_descrs( + arg_id_to_descr, None, clbl_inf_ctx) + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) + renamed_entrypoints.add(new_name.name) + + return clbl_inf_ctx.finish_program(program, renamed_entrypoints) # }}} @@ -2496,7 +2505,7 @@ def preprocess_program(program, device=None): # # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects - new_resolved_functions = {} + new_callables = {} for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = preprocess_single_kernel( @@ -2510,11 +2519,9 @@ def preprocess_program(program, device=None): raise NotImplementedError("Unknown callable type %s." % ( type(in_knl_callable).__name__)) - new_resolved_functions[func_id] = in_knl_callable + new_callables[func_id] = in_knl_callable - new_callables_table = program.callables_table.copy( - resolved_functions=new_resolved_functions) - program = program.copy(callables_table=new_callables_table) + program = program.copy(callables_table=new_callables) # }}} -- GitLab From 9c660247cf42d095ebc02994d9661e797d617cd9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 7 Oct 2019 02:34:44 -0500 Subject: [PATCH 622/916] no assumptions about is_output of args in fortran frontend --- loopy/frontend/fortran/translator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 949a3d4cc..caa8fa681 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -763,7 +763,6 @@ class F2LoopyTranslator(FTreeWalkerBase): arg_name, dtype=sub.get_type(arg_name), shape=sub.get_loopy_shape(arg_name), - is_output=False, )) else: kernel_data.append( -- GitLab From acdf35d8dfec907ccce2e1806e286a1719b17f40 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 7 Oct 2019 17:11:28 -0500 Subject: [PATCH 623/916] removes the unnecessary infer_hw_axes --- loopy/preprocess.py | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c38dea62d..ad26efc68 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2433,42 +2433,6 @@ def preprocess_single_kernel(kernel, callables_table, device=None): return kernel -# {{{ hw axes inference - -def infer_hw_axes_sizes(program): - """ - Returns copy of *program* with the hardware axes sizes inferred. - - .. note:: - - - Firstly, computes the collective hardware axes sizes from all the - callable kernels. - - Then, overrides the grid sizes of all the callable kernels to the - collective value. 
- """ - - global_size, local_size = program.get_grid_size_upper_bounds() - - resolved_function_with_hw_axes_sizes_inferred = {} - - for func_id, in_knl_callable in ( - program.callables_table.items()): - if func_id == program.name: - resolved_function_with_hw_axes_sizes_inferred[func_id] = ( - in_knl_callable) - else: - resolved_function_with_hw_axes_sizes_inferred[func_id] = ( - in_knl_callable.with_hw_axes_sizes(global_size, local_size)) - - new_callables_table = ( - program.callables_table.copy( - resolved_functions=resolved_function_with_hw_axes_sizes_inferred)) - - return program.copy(callables_table=new_callables_table) - -# }}} - - def preprocess_program(program, device=None): if device is not None: @@ -2528,8 +2492,6 @@ def preprocess_program(program, device=None): # infer arg descrs of the callables program = infer_arg_descr(program) - program = infer_hw_axes_sizes(program) - return program -- GitLab From 236e6418fa7e18d309e1603876a92bffdb8323f2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 7 Oct 2019 17:13:36 -0500 Subject: [PATCH 624/916] adds some comments to take care while dealing with fdecl --- loopy/target/opencl.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 704ad25b1..6dced9ad9 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -468,13 +468,15 @@ class OpenCLCASTBuilder(CASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index): + schedule_index, is_entrypoint): + raise NotImplementedError("this should probably take is is_entrypoint" + " or something equivalent.") fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( codegen_state, codegen_result, schedule_index) from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) - if not codegen_state.kernel.is_called_from_host: + if not is_entrypoint: # auxiliary kernels need not mention opencl speicific qualifiers # for a functions signature return fdecl @@ -485,6 +487,8 @@ class OpenCLCASTBuilder(CASTBuilder): fdecl = CLKernel(fdecl) from loopy.schedule import get_insn_ids_for_block_at + raise NotImplementedError("this should pll the grid size from the" + "translation unit?") _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( codegen_state.kernel.schedule, schedule_index), -- GitLab From be16786b66654a295ab34acdc16a8f899f3a1978 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 7 Oct 2019 17:18:59 -0500 Subject: [PATCH 625/916] changes the interface to get grid sizes --- loopy/kernel/__init__.py | 54 +++++++------------------------------ loopy/program.py | 57 +++++++++++++++++++++++++++++++--------- 2 files changed, 55 insertions(+), 56 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 8c441c35e..9ebcf2bcb 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -221,11 +221,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): A subclass of :class:`loopy.TargetBase`. - .. attribute:: is_called_from_host - An instance of :class:`bool`. Will be set *False* for the kernel which - would be called from other top level kernels. Default value is - *True*. 
- """ # {{{ constructor @@ -253,7 +248,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): state=KernelState.INITIAL, target=None, - is_called_from_host=True, overridden_get_grid_sizes_for_insn_ids=None, _cached_written_variables=None): @@ -373,7 +367,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): options=options, state=state, target=target, - is_called_from_host=is_called_from_host, overridden_get_grid_sizes_for_insn_ids=( overridden_get_grid_sizes_for_insn_ids), _cached_written_variables=_cached_written_variables) @@ -1057,9 +1050,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) - @memoize_method def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, callables_table, ignore_auto=False): + # FIXME: re-add the memoization? + # FIXME: docs """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. @@ -1144,9 +1138,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): return global_sizes, local_sizes - @memoize_method def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, ignore_auto=False): + #Fixme: Re-add the memoize wrap here? + # Fixme: docs """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. @@ -1163,43 +1158,14 @@ class LoopKernel(ImmutableRecordWithoutPickling): callables_table=callables_table, ignore_auto=ignore_auto) - assert self.is_called_from_host, ("Callee kernels do not have sufficient " - "information to compute grid sizes.") - global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( insn_ids, callables_table, ignore_auto=ignore_auto) - def to_dim_tuple(size_dict, which, forced_sizes={}): - forced_sizes = forced_sizes.copy() - - size_list = [] - sorted_axes = sorted(six.iterkeys(size_dict)) - - while sorted_axes or forced_sizes: - if sorted_axes: - cur_axis = sorted_axes.pop(0) - else: - cur_axis = None - - if len(size_list) in forced_sizes: - size_list.append(forced_sizes.pop(len(size_list))) - continue - - assert cur_axis is not None - - if cur_axis > len(size_list): - raise LoopyError("%s axis %d unused for %s" % ( - which, len(size_list), self.name)) - - size_list.append(size_dict[cur_axis]) - - return tuple(size_list) - - return (to_dim_tuple(global_sizes, "global"), - to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + return global_sizes, local_sizes def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, callables_table, ignore_auto=False): + # FIXME docs """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. 
@@ -1213,11 +1179,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): grid_size, group_size = self.get_grid_sizes_for_insn_ids( insn_ids, callables_table, ignore_auto) - def tup_to_exprs(tup): + def dict_to_exprs(d): from loopy.symbolic import pw_aff_to_expr - return tuple(pw_aff_to_expr(i, int_ok=True) for i in tup) + return dict((k, pw_aff_to_expr(v, int_ok=True)) for k, v in + six.iteritems(d)) - return tup_to_exprs(grid_size), tup_to_exprs(group_size) + return dict_to_exprs(grid_size), dict_to_exprs(group_size) def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that @@ -1552,7 +1519,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): "silenced_warnings", "options", "state", - "is_called_from_host", "target", ) diff --git a/loopy/program.py b/loopy/program.py index 8cb251386..5b0089da4 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -243,31 +243,64 @@ class Program(ImmutableRecord): return self.copy(entrypoints=entrypoints) - def get_grid_size_upper_bounds(self, ignore_auto=False): + def get_grid_size_upper_bounds(self, entrypoint, ignore_auto=False): + #FIXME: docs """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. *global_size* and *local_size* are :class:`islpy.PwAff` objects. """ - # This should take in an input of an entrypoint. - raise NotImplementedError() + # do the check over here, get the thing as a dict. + def to_dim_tuple(size_dict, which, forced_sizes={}): + forced_sizes = forced_sizes.copy() - return self.root_kernel.get_grid_size_upper_bounds( - self.callables_table, - ignore_auto=ignore_auto) + size_list = [] + sorted_axes = sorted(six.iterkeys(size_dict)) - def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + while sorted_axes or forced_sizes: + if sorted_axes: + cur_axis = sorted_axes.pop(0) + else: + cur_axis = None + + if len(size_list) in forced_sizes: + size_list.append(forced_sizes.pop(len(size_list))) + continue + + assert cur_axis is not None + + if cur_axis > len(size_list): + raise LoopyError("%s axis %d unused for %s" % ( + which, len(size_list), self.name)) + + size_list.append(size_dict[cur_axis]) + + return tuple(size_list) + + global_sizes, local_sizes = (self.callables_table[entrypoint] + .subkernel + .get_grid_size_upper_bounds( + self.callables_table, ignore_auto=ignore_auto)) + + return (to_dim_tuple(global_sizes, "global"), + to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + + def get_grid_size_upper_bounds_as_exprs(self, entrypoint, ignore_auto=False): + #FIXME: docs """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. *global_size* and *local_size* are :mod:`pymbolic` expressions """ - # This should take in an input of an entrypoint. - raise NotImplementedError() + # do the check over here, get the thing as a dict. 
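(Illustration only, not part of the commit: the axis-keyed dicts handed back by get_grid_sizes_for_insn_ids_as_dicts turn into the familiar launch tuples roughly as below; the real to_dim_tuple above additionally honors forced_sizes and complains about unused axes. Sizes are made up.)

    global_sizes = {0: 128, 1: 4}    # axis -> upper bound
    local_sizes = {0: 16}

    def to_dim_tuple_sketch(size_dict):
        # lay the axes out in increasing order
        return tuple(size_dict[axis] for axis in sorted(size_dict))

    assert to_dim_tuple_sketch(global_sizes) == (128, 4)
    assert to_dim_tuple_sketch(local_sizes) == (16,)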
+ grid_size, group_size = self.get_grid_sizes_for_insn_ids( + entrypoint, ignore_auto) + + def tup_to_exprs(tup): + from loopy.symbolic import pw_aff_to_expr + return tuple(pw_aff_to_expr(i, int_ok=True) for i in tup) - return self.root_kernel.get_grid_size_upper_bounds_as_exprs( - self.callables_table, - ignore_auto=ignore_auto) + return tup_to_exprs(grid_size), tup_to_exprs(group_size) @property def state(self): -- GitLab From 7ba0f6400ec1f6e707ca0c147a9298f245ccb167 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 7 Oct 2019 17:22:07 -0500 Subject: [PATCH 626/916] root kernel -> entrypoints --- loopy/target/execution.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index cfc7a50d7..4530000a3 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -767,9 +767,9 @@ class KernelExecutorBase(object): program = preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel - program = program.with_root_kernel( - get_one_scheduled_kernel(program.root_kernel, - program.callables_table)) + for e in program.entrypoints: + program = program.with_kernel( + get_one_scheduled_kernel(program[e], program.callables_table)) return program -- GitLab From fbfc2fc6458a0425bb8ab73bf8ca1634b88784ae Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 12 Oct 2019 23:51:25 -0500 Subject: [PATCH 627/916] saving a bunch of stuff --- loopy/check.py | 4 +- loopy/codegen/__init__.py | 108 +++++++++++++-------------- loopy/codegen/control.py | 2 +- loopy/codegen/result.py | 35 ++++++--- loopy/kernel/__init__.py | 66 ++++++++++++---- loopy/program.py | 59 --------------- loopy/target/c/__init__.py | 2 +- loopy/target/c/codegen/expression.py | 4 +- loopy/target/execution.py | 2 + loopy/target/opencl.py | 8 +- loopy/target/pyopencl_execution.py | 15 +++- loopy/target/python.py | 4 +- loopy/type_inference.py | 44 +++++++++++ 13 files changed, 196 insertions(+), 157 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 83e4fd0af..e77d009f7 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -29,7 +29,7 @@ from islpy import dim_type import islpy as isl from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel -from loopy.type_inference import TypeInferenceMapper +from loopy.type_inference import TypeReader from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) @@ -133,7 +133,7 @@ def check_functions_are_resolved(kernel): VALID_NOSYNC_SCOPES = frozenset(["local", "global", "any"]) -class SubscriptIndicesIsIntChecker(TypeInferenceMapper): +class SubscriptIndicesIsIntChecker(TypeReader): def map_subscript(self, expr): for idx in expr.index_tuple: if not self.rec(idx)[0].is_integral(): diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 4acf2ce0a..083664c13 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -40,9 +40,8 @@ from loopy.symbolic import CombineMapper from functools import reduce from loopy.kernel.function_interface import CallableKernel -from cgen import Collection -from pytools import ProcessLogger +from pytools import ProcessLogger, memoize_method # {{{ implemented data info @@ -201,6 +200,11 @@ class CodeGenerationState(object): .. attribute:: callables_table An instance of :class:`loopy.CallablesTable`. + + .. 
attribute:: is_entrypoint + + A :class:`bool` to indicate if the code is being generated for an + entrypoint kernel """ def __init__(self, kernel, target, @@ -208,6 +212,7 @@ class CodeGenerationState(object): seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, callables_table, + is_entrypoint, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, @@ -223,6 +228,7 @@ class CodeGenerationState(object): self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex self.callables_table = callables_table + self.is_entrypoint = is_entrypoint self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -233,9 +239,8 @@ class CodeGenerationState(object): def copy(self, kernel=None, target=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), - var_subst_map=None, vectorization_info=None, - is_generating_device_code=None, - gen_program_name=None, + var_subst_map=None, is_entrypoint=None, vectorization_info=None, + is_generating_device_code=None, gen_program_name=None, schedule_index_end=None): if kernel is None: @@ -247,6 +252,9 @@ class CodeGenerationState(object): if implemented_data_info is None: implemented_data_info = self.implemented_data_info + if is_entrypoint is None: + is_entrypoint = self.is_entrypoint + if vectorization_info is False: vectorization_info = None @@ -275,6 +283,7 @@ class CodeGenerationState(object): var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, callables_table=self.callables_table, + is_entrypoint=is_entrypoint, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -422,7 +431,8 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_for_a_single_kernel(kernel, callables_table, target): +def generate_code_for_a_single_kernel(kernel, callables_table, target, + is_entrypoint): """ :returns: a :class:`CodeGenerationResult` @@ -518,7 +528,8 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): + kernel.name + kernel.target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), - callables_table=callables_table) + callables_table=callables_table, + is_entrypoint=is_entrypoint) from loopy.codegen.result import generate_host_or_device_program @@ -573,6 +584,7 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target): return codegen_result +@memoize_method def generate_code_v2(program): """ Returns an instance of :class:`CodeGenerationResult`. 
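(For orientation, not part of the commit: the user-facing entry point is unchanged by this refactor. A minimal sketch, with made-up kernel contents and assuming the branch is far enough along for code generation to run:)

    import numpy as np
    import loopy as lp

    prog = lp.make_kernel(
            "{[i]: 0 <= i < n}",
            "out[i] = 2*a[i]",
            [lp.GlobalArg("a", np.float64, shape="n"),
             lp.GlobalArg("out", np.float64, shape="n"),
             lp.ValueArg("n", np.int32)])

    cgr = lp.generate_code_v2(prog)   # a CodeGenerationResult
    print(cgr.device_code())          # all device programs
    print(cgr.host_code())            # host wrappers, one per entrypoint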
@@ -581,7 +593,7 @@ def generate_code_v2(program): """ from loopy.kernel import LoopKernel from loopy.program import make_program - from cgen import FunctionBody + from loopy.codegen.result import CodeGenerationResult if isinstance(program, LoopKernel): program = make_program(program) @@ -598,56 +610,44 @@ def generate_code_v2(program): from loopy.type_inference import infer_unknown_types program = infer_unknown_types(program, expect_completion=True) - codegen_results = {} + host_programs = [] + device_programs = [] + device_preambles = [] + implemented_data_infos = [] for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): - codegen_results[func_id] = ( - generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.callables_table, program.target)) - if not in_knl_callable.subkernel.is_called_from_host: - assert codegen_results[func_id].host_program is None - - device_preambles = [] - for cgr in codegen_results.values(): - device_preambles.extend(cgr.device_preambles) - - # collecting the function declarations of callee kernels - for in_knl_callable in program.callables_table.values(): - for preamble in in_knl_callable.generate_preambles(program.target): - device_preambles.append(preamble) - - collective_device_program = codegen_results[program.name].device_programs[0] - callee_fdecls = [] - - for func_id, callee_cgr in codegen_results.items(): - if func_id != program.name: - assert len(callee_cgr.device_programs) == 1 - callee_prog_ast = callee_cgr.device_programs[0].ast - collective_device_program = collective_device_program.copy( - ast=Collection([callee_prog_ast, collective_device_program.ast])) - if isinstance(callee_prog_ast, Collection): - # if there is a read only constant in the kernel - for entry in callee_prog_ast.contents: - if isinstance(entry, FunctionBody): - callee_fdecls.append(entry.fdecl) - elif isinstance(callee_prog_ast, FunctionBody): - callee_fdecls.append(callee_prog_ast.fdecl) + #FIXME: + # 1. Diverge the kernels which are both entrypoint and callees at this + # point. By diverge we should rename the callees in kernels. + # 2. Then pass the callee versions by saying is_entrypoint=False + cgr = generate_code_for_a_single_kernel(in_knl_callable.subkernel, + program.callables_table, program.target, True) + if func_id in program.entrypoints: + host_programs.extend(cgr.host_programs) + implemented_data_infos.append(cgr.implemented_data_info) else: - raise NotImplementedError("Do not know how to add forward" - " declarations for %r." 
% type(callee_prog_ast)) - - # collecting the function declarations of callee kernels - for callee_fdecl in callee_fdecls: - collective_device_program = collective_device_program.copy( - ast=Collection([callee_fdecl, collective_device_program.ast])) - - collective_device_programs = [collective_device_program] + ( - codegen_results[program.name].device_programs[1:]) - - return codegen_results[program.name].copy( - device_programs=collective_device_programs, - device_preambles=device_preambles) + assert cgr.host_programs == [] + assert len(cgr.device_programs) == 1 + #FIXME: + # if isinstance(callee_prog_ast, Collection): + # for entry in callee_prog_ast.contents: + # if isinstance(entry, FunctionBody): + # callee_fdecls.append(entry.fdecl) + + device_programs.insert( + cgr.device_programs[0].ast.fdecl, 0) + + device_programs.extend(cgr.device_programs) + device_preambles.extend(cgr.device_preambles) + + device_preambles.extend(list(in_knl_callable.generate_preambles( + program.target))) + + return CodeGenerationResult( + host_programs=host_programs, + device_programs=device_programs, + implemented_data_infos=implemented_data_infos) def generate_code(kernel, device=None): diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e17dd55b8..81959032a 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -117,7 +117,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index), codegen_state.callables_table) - if kernel.is_called_from_host: + if codegen_state.is_entrypoint: return merge_codegen_results(codegen_state, [ codegen_result, diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 7950c56b3..e53f25835 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -66,7 +66,11 @@ class GeneratedProgram(ImmutableRecord): class CodeGenerationResult(ImmutableRecord): """ - .. attribute:: host_program + .. attribute:: host_programs + + A list of :class:`GeneratedProgram` instances + intended to run on the host. + .. 
attribute:: device_programs A list of :class:`GeneratedProgram` instances @@ -99,12 +103,12 @@ class CodeGenerationResult(ImmutableRecord): if codegen_state.is_generating_device_code: kwargs = { - "host_program": None, + "host_programs": [], "device_programs": [prg], } else: kwargs = { - "host_program": prg, + "host_programs": [prg], "device_programs": [], } @@ -118,8 +122,8 @@ class CodeGenerationResult(ImmutableRecord): return ( "".join(preamble_codes) - + - str(self.host_program.ast)) + + "\n" + + "\n\n".join(str(hp.ast) for hp in self.host_programs)) def device_code(self): preamble_codes = process_preambles(getattr(self, "device_preambles", [])) @@ -141,7 +145,7 @@ class CodeGenerationResult(ImmutableRecord): + "\n" + "\n\n".join(str(dp.ast) for dp in self.device_programs) + "\n\n" - + str(self.host_program.ast)) + + "\n\n".join(str(hp.ast) for hp in self.host_programs)) def current_program(self, codegen_state): if codegen_state.is_generating_device_code: @@ -150,7 +154,10 @@ class CodeGenerationResult(ImmutableRecord): else: result = None else: - result = self.host_program + if self.host_programs: + result = self.host_programs[-1] + else: + result = None if result is None: ast = codegen_state.ast_builder.ast_block_class([]) @@ -174,7 +181,11 @@ class CodeGenerationResult(ImmutableRecord): else: assert program.name == codegen_state.gen_program_name assert not program.is_device_program - return self.copy(host_program=program) + return self.copy( + host_programs=( + self.host_programs[:-1] + + + [program])) def current_ast(self, codegen_state): return self.current_program(codegen_state).ast @@ -195,7 +206,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): if not elements: return CodeGenerationResult( - host_program=None, + host_programs=[], device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) @@ -293,7 +304,7 @@ def generate_host_or_device_program(codegen_state, schedule_index): codegen_result = build_loop_nest(codegen_state, schedule_index) if (codegen_state.is_generating_device_code) or ( - codegen_state.kernel.is_called_from_host): + codegen_state.is_entrypoint): codegen_result = merge_codegen_results( codegen_state, ast_builder.generate_top_of_body(codegen_state) @@ -317,8 +328,10 @@ def generate_host_or_device_program(codegen_state, schedule_index): body_ast=ast_builder.process_ast(body_ast))) else: codegen_result = codegen_result.copy( - host_program=None) + host_programs=[]) return codegen_result # }}} + +# vim: foldmethod=marker diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 9ebcf2bcb..0cc1cce37 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1139,7 +1139,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return global_sizes, local_sizes def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, - ignore_auto=False): + ignore_auto=False, return_dict=False): #Fixme: Re-add the memoize wrap here? 
# Fixme: docs """Return a tuple (global_size, local_size) containing a grid that @@ -1161,10 +1161,40 @@ class LoopKernel(ImmutableRecordWithoutPickling): global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( insn_ids, callables_table, ignore_auto=ignore_auto) - return global_sizes, local_sizes + if return_dict: + return global_sizes, local_sizes + + def to_dim_tuple(size_dict, which, forced_sizes={}): + forced_sizes = forced_sizes.copy() + + size_list = [] + sorted_axes = sorted(six.iterkeys(size_dict)) + + while sorted_axes or forced_sizes: + if sorted_axes: + cur_axis = sorted_axes.pop(0) + else: + cur_axis = None + + if len(size_list) in forced_sizes: + size_list.append(forced_sizes.pop(len(size_list))) + continue + + assert cur_axis is not None + + if cur_axis > len(size_list): + raise LoopyError("%s axis %d unused for %s" % ( + which, len(size_list), self.name)) + + size_list.append(size_dict[cur_axis]) + + return tuple(size_list) + + return (to_dim_tuple(global_sizes, "global"), + to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, - callables_table, ignore_auto=False): + callables_table, ignore_auto=False, return_dict=False): # FIXME docs """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given @@ -1177,16 +1207,24 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, callables_table, ignore_auto) + insn_ids, callables_table, ignore_auto, return_dict) - def dict_to_exprs(d): + if return_dict: + def dict_to_exprs(d): + from loopy.symbolic import pw_aff_to_expr + return dict((k, pw_aff_to_expr(v, int_ok=True)) for k, v in + six.iteritems(d)) + + return dict_to_exprs(grid_size), dict_to_exprs(group_size) + + def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr - return dict((k, pw_aff_to_expr(v, int_ok=True)) for k, v in - six.iteritems(d)) + return tuple(pw_aff_to_expr(i, int_ok=True) for i in tup) - return dict_to_exprs(grid_size), dict_to_exprs(group_size) + return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False): + def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False, + return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1194,11 +1232,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), - callables_table, - ignore_auto=ignore_auto) + callables_table, ignore_auto=ignore_auto) def get_grid_size_upper_bounds_as_exprs(self, callables_table, - ignore_auto=False): + ignore_auto=False, return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. 
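(To make the new return_dict switch concrete, with invented values: for a kernel with one parallel axis running 128 groups of 16 work items,)

    # default (tuples, axes laid out in order):
    #     ((128,), (16,))
    # return_dict=True (dicts mapping axis index -> size):
    #     ({0: 128}, {0: 16})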
@@ -1206,11 +1243,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): *global_size* and *local_size* are :mod:`pymbolic` expressions """ - return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), - callables_table, - ignore_auto=ignore_auto) + callables_table, ignore_auto=ignore_auto, + return_dict=return_dict) # }}} diff --git a/loopy/program.py b/loopy/program.py index 5b0089da4..9cfafe1ba 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -243,65 +243,6 @@ class Program(ImmutableRecord): return self.copy(entrypoints=entrypoints) - def get_grid_size_upper_bounds(self, entrypoint, ignore_auto=False): - #FIXME: docs - """Return a tuple (global_size, local_size) containing a grid that - could accommodate execution of *all* instructions in the kernel. - - *global_size* and *local_size* are :class:`islpy.PwAff` objects. - """ - # do the check over here, get the thing as a dict. - def to_dim_tuple(size_dict, which, forced_sizes={}): - forced_sizes = forced_sizes.copy() - - size_list = [] - sorted_axes = sorted(six.iterkeys(size_dict)) - - while sorted_axes or forced_sizes: - if sorted_axes: - cur_axis = sorted_axes.pop(0) - else: - cur_axis = None - - if len(size_list) in forced_sizes: - size_list.append(forced_sizes.pop(len(size_list))) - continue - - assert cur_axis is not None - - if cur_axis > len(size_list): - raise LoopyError("%s axis %d unused for %s" % ( - which, len(size_list), self.name)) - - size_list.append(size_dict[cur_axis]) - - return tuple(size_list) - - global_sizes, local_sizes = (self.callables_table[entrypoint] - .subkernel - .get_grid_size_upper_bounds( - self.callables_table, ignore_auto=ignore_auto)) - - return (to_dim_tuple(global_sizes, "global"), - to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - - def get_grid_size_upper_bounds_as_exprs(self, entrypoint, ignore_auto=False): - #FIXME: docs - """Return a tuple (global_size, local_size) containing a grid that - could accommodate execution of *all* instructions in the kernel. - - *global_size* and *local_size* are :mod:`pymbolic` expressions - """ - # do the check over here, get the thing as a dict. - grid_size, group_size = self.get_grid_sizes_for_insn_ids( - entrypoint, ignore_auto) - - def tup_to_exprs(tup): - from loopy.symbolic import pw_aff_to_expr - return tuple(pw_aff_to_expr(i, int_ok=True) for i in tup) - - return tup_to_exprs(grid_size), tup_to_exprs(group_size) - @property def state(self): """ Returns an instance of :class:`loopy.kernel.KernelState`. 
""" diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 82f18e56c..04bfbe10a 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -645,7 +645,7 @@ class CASTBuilder(ASTBuilderBase): if self.target.fortran_abi: name += "_" - if codegen_state.kernel.is_called_from_host: + if codegen_state.is_entrypoint: name = Value("void", name) else: name = Value("static void", name) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 5a066ddfb..ab484d6ca 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -39,7 +39,7 @@ from pymbolic import var from loopy.expression import dtype_to_type_context -from loopy.type_inference import TypeInferenceMapper +from loopy.type_inference import TypeReader from loopy.diagnostic import LoopyError from loopy.tools import is_integer @@ -54,7 +54,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel, + type_inf_mapper = TypeReader(self.kernel, self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 4530000a3..d6e78a5ad 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -866,6 +866,8 @@ class KernelExecutorBase(object): except KeyError: pass + import pudb; pu.db + logger.debug("%s: invoker cache miss" % kernel.name) invoker = self.get_invoker_uncached(kernel, *args) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 6dced9ad9..6d1194bad 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -468,15 +468,13 @@ class OpenCLCASTBuilder(CASTBuilder): # {{{ top-level codegen def get_function_declaration(self, codegen_state, codegen_result, - schedule_index, is_entrypoint): - raise NotImplementedError("this should probably take is is_entrypoint" - " or something equivalent.") + schedule_index): fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( codegen_state, codegen_result, schedule_index) from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) - if not is_entrypoint: + if not codegen_state.is_entrypoint: # auxiliary kernels need not mention opencl speicific qualifiers # for a functions signature return fdecl @@ -487,8 +485,6 @@ class OpenCLCASTBuilder(CASTBuilder): fdecl = CLKernel(fdecl) from loopy.schedule import get_insn_ids_for_block_at - raise NotImplementedError("this should pll the grid size from the" - "translation unit?") _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( codegen_state.kernel.schedule, schedule_index), diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 65e0f4bca..c7fce36a2 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -288,7 +288,8 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): dev_code = codegen_result.device_code() - if self.program.root_kernel.options.write_cl: + if program[entrypoint].options.write_cl: + #FIXME: redirect to "translation unit" level option as well. 
output = dev_code if self.program.root_kernel.options.highlight_cl: output = get_highlighted_code(output) @@ -299,15 +300,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): with open(self.program.root_kernel.options.write_cl, "w") as outf: outf.write(output) - if self.program.root_kernel.options.edit_cl: + if program[entrypoint].options.edit_cl: + #FIXME: redirect to "translation unit" level option as well. from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") import pyopencl as cl + #FIXME: redirect to "translation unit" level option as well. cl_program = ( cl.Program(self.context, dev_code) - .build(options=program.root_kernel.options.cl_build_options)) + .build(options=program[entrypoint].options.cl_build_options)) cl_kernels = _Kernels() for dp in codegen_result.device_programs: @@ -316,7 +319,11 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): return _KernelInfo( program=program, cl_kernels=cl_kernels, - implemented_data_info=codegen_result.implemented_data_info, + implemented_data_info=[i for i, h in + zip(codegen_result.implemented_data_infos, + codegen_result.host_programs) if + h.name.endswith(entrypoint)][0], + # implemented_data_info=codegen_result.implemented_data_info[0], invoker=self.get_invoker(program, codegen_result)) def __call__(self, queue, **kwargs): diff --git a/loopy/target/python.py b/loopy/target/python.py index d174504fa..a72e9c272 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -29,7 +29,7 @@ import numpy as np from pymbolic.mapper import Mapper from pymbolic.mapper.stringifier import StringifyMapper -from loopy.type_inference import TypeInferenceMapper +from loopy.type_inference import TypeReader from loopy.kernel.data import ValueArg from loopy.diagnostic import LoopyError # noqa from loopy.target import ASTBuilderBase @@ -44,7 +44,7 @@ class ExpressionToPythonMapper(StringifyMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel, + type_inf_mapper = TypeReader(self.kernel, self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 6205d219b..b646f2d21 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -695,6 +695,50 @@ class TypeInferenceMapper(CombineMapper): def map_sub_array_ref(self, expr): return self.rec(expr.subscript) +# }}} + + +# {{{ TypeReader + +class TypeReader(TypeInferenceMapper): + def __init__(self, kernel, callables, new_assignments={}): + self.kernel = kernel + self.callables = callables + self.new_assignments = new_assignments + + # {{{ disabled interface + + def copy(self, *args, **kwargs): + raise ValueError("Not allowed in TypeReader") + + # }}} + + def map_call(self, expr, return_tuple=False): + identifier = expr.function + if isinstance(identifier, (Variable, ResolvedFunction)): + identifier = identifier.name + + # specializing the known function wrt type + if isinstance(expr.function, ResolvedFunction): + in_knl_callable = self.callables[expr.function.name] + + arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in arg_id_to_dtype and arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(arg_id_to_dtype)] + else: + return [arg_id_to_dtype[-1]] + else: + raise NotImplementedError() + + return [] + + map_call_with_kwargs = map_call # }}} -- GitLab From 
a208d0b7c9a21aeafd76e346b0b1b36e4f718069 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 14 Oct 2019 02:30:46 -0500 Subject: [PATCH 628/916] introduces a "CALLS_RESOLVED" state --- loopy/codegen/__init__.py | 2 +- loopy/kernel/__init__.py | 6 ++++-- loopy/program.py | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 083664c13..b764615b2 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -599,7 +599,7 @@ def generate_code_v2(program): program = make_program(program) from loopy.kernel import KernelState - if program.state == KernelState.INITIAL: + if program.state < KernelState.PREPROCESSED: # Note that we cannot have preprocessing separately for everyone. # Since, now the preprocessing of each one depends on the other. # So we check if any one of the callable kernels are not preprocesses diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 0cc1cce37..df5c40d41 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -97,8 +97,9 @@ class _UniqueVarNameGenerator(UniqueNameGenerator): class KernelState: # noqa INITIAL = 0 - PREPROCESSED = 1 - SCHEDULED = 2 + CALLS_RESOLVED = 1 + PREPROCESSED = 2 + SCHEDULED = 3 # {{{ kernel_state, KernelState compataibility @@ -327,6 +328,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if state not in [ KernelState.INITIAL, + KernelState.CALLS_RESOLVED, KernelState.PREPROCESSED, KernelState.SCHEDULED, ]: diff --git a/loopy/program.py b/loopy/program.py index 9cfafe1ba..1441190e7 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -260,8 +260,9 @@ class Program(ImmutableRecord): return self.copy(callables_table=new_callables) def with_resolved_callables(self): - from loopy.library.function import get_loopy_callables + from loopy.kernel import KernelState + known_callables = self.target.get_device_ast_builder().known_callables known_callables.update(get_loopy_callables()) known_callables.update(self.callables_table) @@ -285,6 +286,7 @@ class Program(ImmutableRecord): known_callables) knl = rule_mapping_context.finish_kernel( callables_collector.map_kernel(knl)) + knl = knl.copy(state=KernelState.CALLS_RESOLVED) callables_table[top] = callables_table[top].copy(subkernel=knl) for func, clbl in six.iteritems(callables_collector.resolved_functions): -- GitLab From 255a3da4a4c434434e885cd23e1f20d4df19cb1a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 14 Oct 2019 02:31:23 -0500 Subject: [PATCH 629/916] minor fixes --- loopy/preprocess.py | 17 ++++++++++++++++- loopy/program.py | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ad26efc68..475ca8df7 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2314,6 +2314,7 @@ def infer_arg_descr(program): from loopy.kernel.array import ArrayBase from loopy.kernel.function_interface import (ArrayArgDescriptor, ValueArgDescriptor) + from loopy import auto clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, program.entrypoints) @@ -2331,7 +2332,7 @@ def infer_arg_descr(program): arg_id_to_descr = dict((arg.name, ArrayArgDescriptor( _tuple_if_int(arg.shape), arg.address_space, arg.dim_tags) if isinstance(arg, ArrayBase) else ValueArgDescriptor()) for arg in - program[e].args) + program[e].args if arg.shape not in (None, auto)) new_callable, clbl_inf_ctx, _ = program.callables_table[e].with_descrs( arg_id_to_descr, None, clbl_inf_ctx) clbl_inf_ctx, new_name = 
clbl_inf_ctx.with_callable(e, new_callable) @@ -2435,6 +2436,20 @@ def preprocess_single_kernel(kernel, callables_table, device=None): def preprocess_program(program, device=None): + if len([clbl for clbl in six.itervalues(program.callables_table) if + isinstance(clbl, CallableKernel)]) == 1: + program = program.with_entrypoints(','.join(clbl.name for clbl in + six.itervalues(program.callables_table) if isinstance(clbl, + CallableKernel))) + + if not program.entrypoints: + raise LoopyError("Translation unit did not receive any entrypoints") + + from loopy.kernel import KernelState + + if program.state < KernelState.CALLS_RESOLVED: + program = program.with_resolved_callables() + if device is not None: # FIXME: Time to remove this? (Git blame shows 5 years ago) from warnings import warn diff --git a/loopy/program.py b/loopy/program.py index 1441190e7..5edb8a716 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -316,6 +316,7 @@ class Program(ImmutableRecord): if self.entrypoints is None: if len([clbl for clbl in self.callables_table.values() if isinstance(clbl, CallableKernel)]) == 1: + #FIXME: in place update, can we do any better? self.entrypoints = frozenset([clbl.subkernel.name for clbl in self.callables_table.values() if isinstance(clbl, CallableKernel)]) -- GitLab From 77d2dda58d7eb60926ccfc80ae78d5337a0ee3b6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 14 Oct 2019 02:32:18 -0500 Subject: [PATCH 630/916] changes in the near-target codegen pipeline to take kernels instead of programs --- loopy/target/execution.py | 106 ++++++++++++++--------------- loopy/target/pyopencl_execution.py | 27 ++++---- 2 files changed, 68 insertions(+), 65 deletions(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index d6e78a5ad..1dafd440c 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -217,9 +217,9 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from offsets - def generate_integer_arg_finding_from_offsets(self, gen, program, - implemented_data_info): - options = program.root_kernel.options + def generate_integer_arg_finding_from_offsets(self, gen, kernel, + implemented_data_info): + options = kernel.options gen("# {{{ find integer arguments from offsets") gen("") @@ -242,7 +242,7 @@ class ExecutionWrapperGeneratorBase(object): else: gen("_lpy_offset = %s.offset" % impl_array_name) - base_arg = program.impl_arg_to_arg[impl_array_name] + base_arg = kernel.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" @@ -267,8 +267,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ integer arg finding from strides def generate_integer_arg_finding_from_strides( - self, gen, program, implemented_data_info): - options = program.root_kernel.options + self, gen, kernel, implemented_data_info): + options = kernel.options gen("# {{{ find integer arguments from strides") gen("") @@ -287,7 +287,7 @@ class ExecutionWrapperGeneratorBase(object): "passed array\")" % (arg.name, impl_array_name)) - base_arg = program.impl_arg_to_arg[impl_array_name] + base_arg = kernel.impl_arg_to_arg[impl_array_name] if not options.skip_arg_checks: gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" @@ -310,8 +310,8 @@ class ExecutionWrapperGeneratorBase(object): # {{{ check that value args are present def generate_value_arg_check( - self, gen, program, implemented_data_info): - if program.root_kernel.options.skip_arg_checks: + self, gen, kernel, implemented_data_info): + if 
kernel.options.skip_arg_checks: return from loopy.kernel.data import ValueArg @@ -364,7 +364,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ arg setup def generate_arg_setup( - self, gen, program, implemented_data_info, options): + self, gen, kernel, implemented_data_info, options): import loopy as lp from loopy.kernel.data import KernelArgument @@ -387,8 +387,8 @@ class ExecutionWrapperGeneratorBase(object): expect_no_more_arguments = False for arg_idx, arg in enumerate(implemented_data_info): - is_written = arg.base_name in program.root_kernel.get_written_variables() - program_arg = program.impl_arg_to_arg.get(arg.name) + is_written = arg.base_name in kernel.get_written_variables() + kernel_arg = kernel.impl_arg_to_arg.get(arg.name) if not issubclass(arg.arg_class, KernelArgument): expect_no_more_arguments = True @@ -450,7 +450,7 @@ class ExecutionWrapperGeneratorBase(object): gen("if %s is None:" % arg.name) with Indentation(gen): self.handle_alloc( - gen, arg, program_arg, strify, options.skip_arg_checks) + gen, arg, kernel_arg, strify, options.skip_arg_checks) gen("_lpy_made_by_loopy = True") gen("") @@ -468,7 +468,7 @@ class ExecutionWrapperGeneratorBase(object): with Indentation(gen): gen("if %s.dtype != %s:" % (arg.name, self.python_dtype_str( - program_arg.dtype.numpy_dtype))) + kernel_arg.dtype.numpy_dtype))) with Indentation(gen): gen("raise TypeError(\"dtype mismatch on argument '%s' " "(got: %%s, expected: %s)\" %% %s.dtype)" @@ -496,10 +496,10 @@ class ExecutionWrapperGeneratorBase(object): "%% (%s.shape, %s))" % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - if program_arg.shape is None: + if kernel_arg.shape is None: pass - elif any(shape_axis is None for shape_axis in program_arg.shape): + elif any(shape_axis is None for shape_axis in kernel_arg.shape): gen("if len(%s.shape) != %s:" % (arg.name, len(arg.unvec_shape))) with Indentation(gen): @@ -522,8 +522,8 @@ class ExecutionWrapperGeneratorBase(object): # }}} - if arg.unvec_strides and program_arg.dim_tags: - itemsize = program_arg.dtype.numpy_dtype.itemsize + if arg.unvec_strides and kernel_arg.dim_tags: + itemsize = kernel_arg.dtype.numpy_dtype.itemsize sym_strides = tuple( itemsize*s_i for s_i in arg.unvec_strides) @@ -620,7 +620,7 @@ class ExecutionWrapperGeneratorBase(object): def generate_host_code(self, gen, codegen_result): raise NotImplementedError - def __call__(self, program, codegen_result): + def __call__(self, program, entrypoint, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -632,12 +632,16 @@ class ExecutionWrapperGeneratorBase(object): kernel """ - options = program.root_kernel.options - implemented_data_info = codegen_result.implemented_data_info + options = program[entrypoint].options + #FIXME: endswith is ugly maybe make + # codegen_result.implemented_data_infos a dict? 
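(A stand-in picture of the suffix matching this FIXME refers to; names are made up and not part of the patch:)

    host_program_names = ["invoke_tunit_outer", "invoke_tunit_inner"]
    infos = ["idi-for-outer", "idi-for-inner"]   # implemented_data_infos
    entrypoint = "inner"

    idi, = [i for i, name in zip(infos, host_program_names)
            if name.endswith(entrypoint)]
    assert idi == "idi-for-inner"
    # a dict keyed by entrypoint, as the FIXME suggests, would avoid
    # accidental matches between entrypoints whose names share a suffix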
+ implemented_data_info = [i for i, h in + zip(codegen_result.implemented_data_infos, + codegen_result.host_programs) if h.name.endswith(entrypoint)][0] from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % program.name, + "invoke_%s_loopy_kernel" % entrypoint, self.system_args + [ "%s=None" % idi.name for idi in implemented_data_info @@ -654,21 +658,25 @@ class ExecutionWrapperGeneratorBase(object): self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( - gen, program, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_integer_arg_finding_from_offsets( - gen, program, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_integer_arg_finding_from_strides( - gen, program, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_value_arg_check( - gen, program, implemented_data_info) - + gen, program[entrypoint], implemented_data_info) args = self.generate_arg_setup( - gen, program, implemented_data_info, options) + gen, program[entrypoint], implemented_data_info, options) + + #FIXME: should we make this as a dict as well. + host_program_name, = [h.name for h in codegen_result.host_programs if + h.name.endswith(entrypoint)] - self.generate_invocation(gen, codegen_result.host_program.name, args, - program, implemented_data_info) + self.generate_invocation(gen, host_program_name, args, + program[entrypoint], implemented_data_info) - self.generate_output_handler(gen, options, program, implemented_data_info) + self.generate_output_handler(gen, options, program[entrypoint], + implemented_data_info) if options.write_wrapper: output = gen.get() @@ -740,23 +748,17 @@ class KernelExecutorBase(object): program = self.program program = program.with_resolved_callables() - if arg_to_dtype_set: - var_to_dtype = {} - for var, dtype in arg_to_dtype_set: - try: - dest_name = program[entrypoint].impl_arg_to_arg[var].name - except KeyError: - dest_name = var + var_to_dtype = {} + entry_knl = program[entrypoint] + for var, dtype in arg_to_dtype_set: + if var in entry_knl.impl_arg_to_arg: + dest_name = entry_knl.impl_arg_to_arg[var].name + else: + dest_name = var - try: - var_to_dtype[dest_name] = dtype - except KeyError: - raise LoopyError("cannot set type for '%s': " - "no known variable/argument with that name" - % var) + var_to_dtype[dest_name] = dtype - program = program.with_kernel(add_dtypes(program[entrypoint], - var_to_dtype)) + program = program.with_kernel(add_dtypes(entry_knl, var_to_dtype)) from loopy.type_inference import infer_unknown_types from loopy.kernel import KernelState @@ -852,13 +854,13 @@ class KernelExecutorBase(object): code = generate_code_v2(kernel) return code.device_code() - def get_invoker_uncached(self, kernel, *args): + def get_invoker_uncached(self, program, entrypoint, *args): raise NotImplementedError() - def get_invoker(self, kernel, *args): + def get_invoker(self, program, entrypoint, *args): from loopy import CACHING_ENABLED - cache_key = (self.__class__.__name__, kernel) + cache_key = (self.__class__.__name__, (program, entrypoint)) if CACHING_ENABLED: try: @@ -866,11 +868,9 @@ class KernelExecutorBase(object): except KeyError: pass - import pudb; pu.db - - logger.debug("%s: invoker cache miss" % kernel.name) + logger.debug("%s: invoker cache miss" % entrypoint) - invoker = self.get_invoker_uncached(kernel, *args) + invoker = self.get_invoker_uncached(program, entrypoint, *args) if 
CACHING_ENABLED: invoker_cache.store_if_not_present(cache_key, invoker) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index c7fce36a2..aa61ea3bc 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -152,8 +152,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation def generate_invocation(self, gen, program_name, args, - program, implemented_data_info): - if program.root_kernel.options.cl_exec_manage_array_events: + kernel, implemented_data_info): + if kernel.options.cl_exec_manage_array_events: gen(""" if wait_for is None: wait_for = [] @@ -177,13 +177,13 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): + args + ["wait_for=wait_for", "allocator=allocator"]))) - if program.root_kernel.options.cl_exec_manage_array_events: + if kernel.options.cl_exec_manage_array_events: gen("") from loopy.kernel.data import ArrayArg for arg in implemented_data_info: if (issubclass(arg.arg_class, ArrayArg) and arg.base_name in ( - program.root_kernel.get_written_variables())): + kernel.get_written_variables())): gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name)) # }}} @@ -191,7 +191,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, program, implemented_data_info): + self, gen, options, kernel, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -209,7 +209,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): continue is_written = arg.base_name in ( - program.root_kernel.get_written_variables()) + kernel.get_written_variables()) if is_written: gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) @@ -221,12 +221,12 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) if arg.base_name in - program.root_kernel.get_written_variables())) + kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in program.root_kernel.get_written_variables()] + if arg.base_name in kernel.get_written_variables()] if out_args: gen("return _lpy_evt, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -273,15 +273,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): self.program = program.copy(target=( program.target.with_device(context.devices[0]))) - def get_invoker_uncached(self, kernel, codegen_result): + def get_invoker_uncached(self, program, entrypoint, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() - return generator(kernel, codegen_result) + return generator(program, entrypoint, codegen_result) @memoize_method def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), all_kwargs=None): - program = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype_set) + program = self.get_typed_and_scheduled_program(entrypoint, + arg_to_dtype_set) + # FIXME: now just need to add the types to the arguments from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code codegen_result = generate_code_v2(program) @@ -324,7 +326,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): codegen_result.host_programs) if h.name.endswith(entrypoint)][0], # implemented_data_info=codegen_result.implemented_data_info[0], - invoker=self.get_invoker(program, codegen_result)) + 
invoker=self.get_invoker(program, entrypoint, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -361,6 +363,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): program_info = self.program_info(kwargs['entrypoint'], self.arg_to_dtype_set(kwargs)) + kwargs.pop('entrypoint') return program_info.invoker( program_info.cl_kernels, queue, allocator, wait_for, -- GitLab From a42c21f5c98086e30f3616dbe3883e6860694c76 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 14 Oct 2019 02:58:02 -0500 Subject: [PATCH 631/916] dotted way to call entrypoints --- loopy/program.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/loopy/program.py b/loopy/program.py index 5edb8a716..adeb8a5e0 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -310,6 +310,13 @@ class Program(ImmutableRecord): else: return result + def __getattr__(self, attr): + if attr in self.entrypoints: + return lambda *args, **kwargs: self(*args, entrypoint=attr, + **kwargs) + + return super(Program, self).__getattr__(attr) + def __call__(self, *args, **kwargs): entrypoint = kwargs.get('entrypoint', None) -- GitLab From 5cd9ad21d81f79dbdb26351f51ab5ad6b27fbebd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 14 Oct 2019 04:39:56 -0500 Subject: [PATCH 632/916] fixes to misc. minor errors --- loopy/program.py | 11 ++++------- loopy/transform/callable.py | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index adeb8a5e0..74b961dc5 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -478,10 +478,10 @@ class CallablesIDCollector(CombineMapper): def _get_callable_ids_for_knl(knl, callables): clbl_id_collector = CallablesIDCollector() - return frozenset().union(( - _get_callable_ids_for_knl(callables[clbl].subkernel) if + return frozenset().union(*( + _get_callable_ids_for_knl(callables[clbl].subkernel, callables) if isinstance(callables[clbl], CallableKernel) else clbl - for clbl in clbl_id_collector.map_kernel(knl))) + for clbl in clbl_id_collector.map_kernel(knl))) | frozenset([knl.name]) def _get_callable_ids(callables, entrypoints): @@ -670,10 +670,7 @@ class CallablesInferenceContext(ImmutableRecord): def __getitem__(self, name): result = self.callables[name] - if isinstance(result, CallableKernel): - return result.subkernel - else: - return result + return result # {{{ helper functions diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 2a1dd1115..c96a51778 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -102,7 +102,7 @@ def fuse_translation_units(translation_units, collision_not_ok=True): return Program( entrypoints=frozenset().union(*( - t.entrypoints for t in translation_units)), + t.entrypoints or frozenset() for t in translation_units)), callables_table=callables_table, target=translation_units[0].target) -- GitLab From 43c48f964cae36de54d4296df0f88c89ea8a4245 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 14 Oct 2019 04:48:07 -0500 Subject: [PATCH 633/916] minor error: fixes an error which was leading to divergece of logic between scalar and knl-callable --- loopy/program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 74b961dc5..9b71f9d25 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -480,7 +480,7 @@ def _get_callable_ids_for_knl(knl, callables): return frozenset().union(*( _get_callable_ids_for_knl(callables[clbl].subkernel, callables) if - isinstance(callables[clbl], CallableKernel) else clbl + 
isinstance(callables[clbl], CallableKernel) else frozenset([clbl]) for clbl in clbl_id_collector.map_kernel(knl))) | frozenset([knl.name]) -- GitLab From e66f10ee6c3c74c25db9e0fd6426991a3ed12cdc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 14 Oct 2019 07:47:22 -0500 Subject: [PATCH 634/916] do not add entrypoint in get_callable_id --- loopy/program.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 9b71f9d25..4a1225a44 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -479,9 +479,10 @@ def _get_callable_ids_for_knl(knl, callables): clbl_id_collector = CallablesIDCollector() return frozenset().union(*( - _get_callable_ids_for_knl(callables[clbl].subkernel, callables) if - isinstance(callables[clbl], CallableKernel) else frozenset([clbl]) - for clbl in clbl_id_collector.map_kernel(knl))) | frozenset([knl.name]) + _get_callable_ids_for_knl(callables[clbl].subkernel, callables) | + frozenset([clbl]) if isinstance(callables[clbl], CallableKernel) else + frozenset([clbl]) + for clbl in clbl_id_collector.map_kernel(knl))) def _get_callable_ids(callables, entrypoints): -- GitLab From d3d10b2509cdd4e8dba8cbb80594813743d5de7b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 14 Oct 2019 07:53:34 -0500 Subject: [PATCH 635/916] diverge entrypoint and a callee kernel after scheduling --- loopy/codegen/__init__.py | 75 ++++++++++++++++++++++++++---- loopy/target/pyopencl_execution.py | 5 +- 2 files changed, 68 insertions(+), 12 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index b764615b2..0f8028e43 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -39,7 +39,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.symbolic import CombineMapper from functools import reduce -from loopy.kernel.function_interface import CallableKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pytools import ProcessLogger, memoize_method @@ -442,10 +442,6 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, """ from loopy.kernel import KernelState - if kernel.schedule is None: - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel, callables_table) - if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") @@ -584,6 +580,40 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, return codegen_result +def diverge_callee_entrypoints(program): + from loopy.program import _get_callable_ids + from pytools import UniqueNameGenerator + callable_ids = _get_callable_ids(program.callables_table, + program.entrypoints) + + new_callables = {} + renames = {} + + vng = UniqueNameGenerator(list(six.iterkeys(program.callables_table))) + + for clbl_id in callable_ids & program.entrypoints: + renames[clbl_id] = vng(based_on=clbl_id) + + for name, clbl in six.iteritems(program.callables_table): + if isinstance(clbl, CallableKernel): + from loopy.program import ( + rename_resolved_functions_in_a_single_kernel) + knl = rename_resolved_functions_in_a_single_kernel( + clbl.subkernel, renames) + new_callables[name] = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + new_callables[name] = clbl + else: + raise NotImplementedError(type(clbl)) + + for clbl_id in callable_ids & program.entrypoints: + knl = new_callables[clbl_id].subkernel.copy(name=renames[clbl_id]) + 
new_callables[renames[clbl_id]] = new_callables[clbl_id].copy( + subkernel=knl) + + return program.copy(callables_table=new_callables) + + @memoize_method def generate_code_v2(program): """ @@ -610,9 +640,29 @@ def generate_code_v2(program): from loopy.type_inference import infer_unknown_types program = infer_unknown_types(program, expect_completion=True) + new_callables = {} + + for name, clbl in six.iteritems(program.callables_table): + if isinstance(clbl, CallableKernel): + from loopy.schedule import get_one_scheduled_kernel + knl = clbl.subkernel + if knl.schedule is None: + knl = get_one_scheduled_kernel( + knl, program.callables_table) + new_callables[name] = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + new_callables[name] = clbl + else: + raise NotImplementedError(type(clbl)) + + program = program.copy(callables_table=new_callables) + + program = diverge_callee_entrypoints(program) + host_programs = [] device_programs = [] device_preambles = [] + callee_fdecls = [] implemented_data_infos = [] for func_id, in_knl_callable in program.callables_table.items(): @@ -622,21 +672,21 @@ def generate_code_v2(program): # point. By diverge we should rename the callees in kernels. # 2. Then pass the callee versions by saying is_entrypoint=False cgr = generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.callables_table, program.target, True) + program.callables_table, program.target, func_id in + program.entrypoints) if func_id in program.entrypoints: host_programs.extend(cgr.host_programs) implemented_data_infos.append(cgr.implemented_data_info) else: - assert cgr.host_programs == [] + # FIXME: This assertion should be valid + # assert cgr.host_programs == [] assert len(cgr.device_programs) == 1 #FIXME: # if isinstance(callee_prog_ast, Collection): # for entry in callee_prog_ast.contents: # if isinstance(entry, FunctionBody): # callee_fdecls.append(entry.fdecl) - - device_programs.insert( - cgr.device_programs[0].ast.fdecl, 0) + callee_fdecls.append(cgr.device_programs[0].ast.fdecl) device_programs.extend(cgr.device_programs) device_preambles.extend(cgr.device_preambles) @@ -644,6 +694,11 @@ def generate_code_v2(program): device_preambles.extend(list(in_knl_callable.generate_preambles( program.target))) + # adding the callee fdecls to the device_programs + from cgen import Collection + device_programs = ([device_programs[0].copy( + ast=Collection(callee_fdecls+[device_programs[0].ast]))] + + device_programs[1:]) return CodeGenerationResult( host_programs=host_programs, device_programs=device_programs, diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index aa61ea3bc..475e6d1c8 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -315,8 +315,9 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .build(options=program[entrypoint].options.cl_build_options)) cl_kernels = _Kernels() - for dp in codegen_result.device_programs: - setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) + for dp in program.entrypoints: + #FIXME: This will fail for barriers, use a better option here. 
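    # Sketch of the invocation flow these executor changes build towards
    # (callee_prog, caller_prog, queue and x are assumed to exist; the
    # entrypoint name "caller" is made up; the dotted shorthand is the
    # __getattr__ forwarding added a few patches earlier in this series):
    #
    #     prog = lp.fuse_translation_units([callee_prog, caller_prog])
    #     prog = prog.with_entrypoints("caller")
    #
    #     evt, (out,) = prog(queue, x=x, entrypoint="caller")
    #     evt, (out,) = prog.caller(queue, x=x)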
+ setattr(cl_kernels, dp, getattr(cl_program, dp)) return _KernelInfo( program=program, -- GitLab From 87c9fa7e1199d821b61b629b40193623fb3530bf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 21 Oct 2019 08:03:24 -0500 Subject: [PATCH 636/916] miscellaneous minor fixes: --- loopy/__init__.py | 2 +- loopy/codegen/__init__.py | 7 +++++-- loopy/kernel/creation.py | 26 +++++++++++++++++--------- loopy/transform/callable.py | 8 ++++---- test/test_callables.py | 11 ++++------- test/testlib.py | 5 ----- 6 files changed, 31 insertions(+), 28 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 15a670583..8f21cac56 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -178,7 +178,7 @@ __all__ = [ "ScalarCallable", "CallableKernel", - "CallablesTable", "Program", "make_program", + "Program", "make_program", "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 0f8028e43..8d5bd14f4 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -199,7 +199,8 @@ class CodeGenerationState(object): .. attribute:: callables_table - An instance of :class:`loopy.CallablesTable`. + A mapping from callable names to instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. .. attribute:: is_entrypoint @@ -699,11 +700,13 @@ def generate_code_v2(program): device_programs = ([device_programs[0].copy( ast=Collection(callee_fdecls+[device_programs[0].ast]))] + device_programs[1:]) - return CodeGenerationResult( + cgr = CodeGenerationResult( host_programs=host_programs, device_programs=device_programs, implemented_data_infos=implemented_data_infos) + return cgr + def generate_code(kernel, device=None): if device is not None: diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index c6081156f..242389384 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1900,14 +1900,18 @@ class SliceToInameReplacer(IdentityMapper): if isinstance(index, Slice): unique_var_name = self.var_name_gen(based_on="i") if expr.aggregate.name in self.knl.arg_dict: - domain_length = self.knl.arg_dict[expr.aggregate.name].shape[i] - elif expr.aggregate.name in self.knl.temporary_variables: - domain_length = self.knl.temporary_variables[ - expr.aggregate.name].shape[i] + shape = self.knl.arg_dict[expr.aggregate.name].shape else: + assert expr.aggregate.name in self.knl.temporary_variables + shape = self.knl.temporary_variables[ + expr.aggregate.name].shape + if shape is None or shape[i] is None: raise LoopyError("Slice notation is only supported for " "variables whose shapes are known at creation time " - "-- maybe add the shape for the sliced argument.") + "-- maybe add the shape for '{}'.".format( + expr.aggregate.name)) + + domain_length = shape[i] start, stop, step = get_slice_params( index, domain_length) subscript_iname_bounds[unique_var_name] = (start, stop, step) @@ -2025,7 +2029,7 @@ def realize_slices_array_inputs_as_sub_array_refs(kernel): # {{{ kernel creation top-level -def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): +def make_function(domains, instructions, kernel_data=["..."], **kwargs): """User-facing kernel creation entrypoint. :arg domains: @@ -2378,9 +2382,13 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): return make_program(knl) -def make_function(*args, **kwargs): - #FIXME: Do we need this anymore?? 
- return make_kernel(*args, **kwargs) +def make_kernel(*args, **kwargs): + tunit = make_function(*args, **kwargs) + name, = [name for name in tunit.callables_table] + return tunit.with_entrypoints(name) + + +make_kernel.__doc__ = make_function.__doc__ # }}} diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index c96a51778..f2e1bead5 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -42,7 +42,7 @@ from loopy.symbolic import SubArrayRef __doc__ = """ .. currentmodule:: loopy -.. autofunction:: register_function_id_to_in_knl_callable_mapper +.. autofunction:: register_callable .. autofunction:: fuse_translation_units """ @@ -61,16 +61,16 @@ def register_callable(translation_unit, function_identifier, callable_, from loopy.kernel.function_interface import InKernelCallable assert isinstance(callable_, InKernelCallable) - if (function_identifier in translation_unit.callables) and ( + if (function_identifier in translation_unit.callables_table) and ( redefining_not_ok): raise LoopyError("Redifining function identifier not allowed. Set the" " option 'redefining_not_ok=False' to bypass this error.") - callables = translation_unit.copy() + callables = translation_unit.callables_table.copy() callables[function_identifier] = callable_ return translation_unit.copy( - callables=callables) + callables_table=callables) def fuse_translation_units(translation_units, collision_not_ok=True): diff --git a/test/test_callables.py b/test/test_callables.py index 04eeae666..17f9a3c0a 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -41,7 +41,7 @@ def test_register_function_lookup(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - from testlib import register_log2_lookup + from testlib import Log2Callable x = np.random.rand(10) queue = cl.CommandQueue(ctx) @@ -51,8 +51,7 @@ def test_register_function_lookup(ctx_factory): """ y[i] = log2(x[i]) """) - prog = lp.register_function_id_to_in_knl_callable_mapper(prog, - register_log2_lookup) + prog = lp.register_callable(prog, 'log2', Log2Callable('log2')) evt, (out, ) = prog(queue, x=x) @@ -94,10 +93,8 @@ def test_register_knl(ctx_factory, inline): '...'] ) - knl = lp.register_callable_kernel( - parent_knl, child_knl) - knl = lp.register_callable_kernel( - knl, grandchild_knl) + knl = lp.fuse_translation_units([grandchild_knl, child_knl, parent_knl]) + if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo2') knl = lp.inline_callable_kernel(knl, 'linear_combo1') diff --git a/test/testlib.py b/test/testlib.py index 853e2584a..4f45e69b5 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -171,11 +171,6 @@ class Log2Callable(lp.ScalarCallable): callables_table) -def register_log2_lookup(target, identifier): - if identifier == 'log2': - return Log2Callable(name='log2') - return None - # }}} # vim: foldmethod=marker -- GitLab From 87d856fe6b4a5e532fb0dd318962b0675f066af8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 21 Oct 2019 12:50:08 -0500 Subject: [PATCH 637/916] saving state --- loopy/__init__.py | 4 +-- loopy/program.py | 9 +++-- loopy/target/execution.py | 25 +++++++------- loopy/transform/callable.py | 68 ++++++++++++++----------------------- test/test_callables.py | 50 +++++++++++++-------------- 5 files changed, 70 insertions(+), 86 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 8f21cac56..9a0791948 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -121,7 +121,7 @@ from loopy.transform.parameter import assume, fix_parameters from 
loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.callable import (register_callable, - fuse_translation_units, inline_callable_kernel) + merge, inline_callable_kernel) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call # }}} @@ -243,7 +243,7 @@ __all__ = [ "dump_as_python", "register_callable", - "fuse_translation_units", + "merge", "inline_callable_kernel", "pack_and_unpack_args_for_call", diff --git a/loopy/program.py b/loopy/program.py index 4a1225a44..61556df9a 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -132,7 +132,11 @@ class CallableResolver(RuleAwareIdentityMapper): ) else: # FIXME: Once function mangler is completely deprecated raise here. + # Oh function mangler I loathe you so much! pass + else: + self.resolved_functions[expr.function.name] = ( + self.known_callables[expr.function.name]) return super(CallableResolver, self).map_call_with_kwargs(expr, expn_state) @@ -225,8 +229,9 @@ class Program(ImmutableRecord): six.itervalues(self.callables_table) if isinstance(callable_knl, CallableKernel)) > ( KernelState.INITIAL): - raise LoopyError("One of the kenels in the program has been " - "preprocessed, cannot modify target now.") + if not isinstance(kwargs['target'], type(self.target)): + raise LoopyError("One of the kenels in the program has been " + "preprocessed, cannot modify target now.") return super(Program, self).copy(**kwargs) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 1dafd440c..2888462a4 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -744,25 +744,26 @@ class KernelExecutorBase(object): def get_typed_and_scheduled_program_uncached(self, entrypoint, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes + from loopy.kernel import KernelState program = self.program program = program.with_resolved_callables() - var_to_dtype = {} - entry_knl = program[entrypoint] - for var, dtype in arg_to_dtype_set: - if var in entry_knl.impl_arg_to_arg: - dest_name = entry_knl.impl_arg_to_arg[var].name - else: - dest_name = var + if arg_to_dtype_set: + var_to_dtype = {} + entry_knl = program[entrypoint] + for var, dtype in arg_to_dtype_set: + if var in entry_knl.impl_arg_to_arg: + dest_name = entry_knl.impl_arg_to_arg[var].name + else: + dest_name = var - var_to_dtype[dest_name] = dtype + var_to_dtype[dest_name] = dtype - program = program.with_kernel(add_dtypes(entry_knl, var_to_dtype)) + program = program.with_kernel(add_dtypes(entry_knl, var_to_dtype)) - from loopy.type_inference import infer_unknown_types - from loopy.kernel import KernelState - program = infer_unknown_types(program, expect_completion=True) + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) if program.state < KernelState.SCHEDULED: from loopy.preprocess import preprocess_program diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index f2e1bead5..cac0ea9fa 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -25,10 +25,8 @@ THE SOFTWARE. import six import islpy as isl -from pymbolic.primitives import CallWithKwargs from loopy.kernel import LoopKernel -from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, Assignment, CInstruction, _DataObliviousInstruction) @@ -44,7 +42,7 @@ __doc__ = """ .. autofunction:: register_callable -.. 
autofunction:: fuse_translation_units +.. autofunction:: merge """ @@ -73,7 +71,7 @@ def register_callable(translation_unit, function_identifier, callable_, callables_table=callables) -def fuse_translation_units(translation_units, collision_not_ok=True): +def merge(translation_units, collision_not_ok=True): """ :param translation_units: A list of :class:`loopy.Program`. :param collision_not_ok: An instance of :class:`bool`. @@ -84,7 +82,7 @@ def fuse_translation_units(translation_units, collision_not_ok=True): for i in range(1, len(translation_units)): if translation_units[i].target != translation_units[i-1].target: - raise LoopyError("fuse_translation_units should have" + raise LoopyError("merge() should have" " translation_units to be of the same target to be able to" " fuse.") callables_table = {} @@ -95,7 +93,7 @@ def fuse_translation_units(translation_units, collision_not_ok=True): if len(callables_table) != sum(len(trans_unit.callables_table) for trans_unit in translation_units) and collision_not_ok: - raise LoopyError("translation units in fuse_translation_units cannot" + raise LoopyError("translation units in merge() cannot" " not contain callables with same names.") # }}} @@ -362,23 +360,15 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): # {{{ inline callable kernel -def _inline_single_callable_kernel(caller_kernel, function_name, +def _inline_single_callable_kernel(caller_kernel, callee_kernel, callables_table): - old_insns = caller_kernel.instructions - for insn in old_insns: + for insn in caller_kernel.instructions: if isinstance(insn, CallInstruction): # FIXME This seems to use identifiers across namespaces. Why not # check whether the function is a scoped function first? ~AK - if insn.expression.function.name in callables_table: - history_of_identifier = callables_table.history[ - insn.expression.function.name] - - if function_name in history_of_identifier: - in_knl_callable = callables_table[ - insn.expression.function.name] - assert isinstance(in_knl_callable, CallableKernel) - caller_kernel = _inline_call_instruction( - caller_kernel, in_knl_callable.subkernel, insn) + if insn.expression.function.name == callee_kernel.name: + caller_kernel = _inline_call_instruction( + caller_kernel, callee_kernel, insn) elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass @@ -387,7 +377,7 @@ def _inline_single_callable_kernel(caller_kernel, function_name, "Unknown instruction type %s" % type(insn).__name__) - return caller_kernel, callables_table + return caller_kernel # FIXME This should take a 'within' parameter to be able to only inline @@ -398,34 +388,26 @@ def inline_callable_kernel(program, function_name): (scoped) name *function_name* inlined. 
""" from loopy.preprocess import infer_arg_descr + program = program.with_resolved_callables() program = infer_arg_descr(program) callables_table = program.callables_table - old_callables_table = callables_table.copy() - - edited_callable_kernels = {} - - for func_id, in_knl_callable in old_callables_table.items(): - if function_name not in old_callables_table.history[func_id] and ( - isinstance(in_knl_callable, CallableKernel)): - caller_kernel = in_knl_callable.subkernel - caller_kernel, callables_table = ( - _inline_single_callable_kernel(caller_kernel, - function_name, - callables_table)) - edited_callable_kernels[func_id] = in_knl_callable.copy( - subkernel=caller_kernel) - - new_resolved_functions = {} - for func_id, in_knl_callable in callables_table.items(): - if func_id in edited_callable_kernels: - new_resolved_functions[func_id] = edited_callable_kernels[func_id] + new_callables = {} + callee = program[function_name] + + for func_id, in_knl_callable in six.iteritems(callables_table): + if isinstance(in_knl_callable, CallableKernel): + caller = in_knl_callable.subkernel + in_knl_callable = in_knl_callable.copy( + subkernel=_inline_single_callable_kernel(caller, + callee, program.callables_table)) + elif isinstance(in_knl_callable, ScalarCallable): + pass else: - new_resolved_functions[func_id] = in_knl_callable + raise NotImplementedError() - callables_table = callables_table.copy( - resolved_functions=new_resolved_functions) + new_callables[func_id] = in_knl_callable - return program.copy(callables_table=callables_table) + return program.copy(callables_table=new_callables) # }}} diff --git a/test/test_callables.py b/test/test_callables.py index 17f9a3c0a..1457da3af 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -93,7 +93,7 @@ def test_register_knl(ctx_factory, inline): '...'] ) - knl = lp.fuse_translation_units([grandchild_knl, child_knl, parent_knl]) + knl = lp.merge([grandchild_knl, child_knl, parent_knl]) if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo2') @@ -134,8 +134,7 @@ def test_slices_with_negative_step(ctx_factory, inline): '...'] ) - knl = lp.register_callable_kernel( - parent_knl, child_knl) + knl = lp.merge([parent_knl, child_knl]) if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo') @@ -177,8 +176,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): e=[j, l]: c[i, j, k, l, m]) """) - knl = lp.register_callable_kernel( - caller_knl, callee_knl) + knl = lp.merge([caller_knl, callee_knl]) if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo') @@ -220,16 +218,15 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): """ [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) - """ - ) + """, name='caller') caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") - knl = lp.register_callable_kernel( - caller_knl, callee_knl) + knl = lp.merge([caller_knl, callee_knl]) knl = lp.set_options(knl, 'return_dict') - gsize, lsize = knl.get_grid_size_upper_bounds_as_exprs() + gsize, lsize = knl['caller'].get_grid_size_upper_bounds_as_exprs( + knl.callables_table) if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo') @@ -280,9 +277,9 @@ def test_shape_translation_through_sub_array_ref(ctx_factory, inline): [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) """) - knl = lp.register_callable_kernel(knl, callee1) - knl = lp.register_callable_kernel(knl, callee2) - knl = lp.register_callable_kernel(knl, callee3) + knl = lp.merge([knl, callee1]) + 
knl = lp.merge([knl, callee2]) + knl = lp.merge([knl, callee3]) if inline: knl = lp.inline_callable_kernel(knl, 'callee_fn1') @@ -341,7 +338,7 @@ def test_multi_arg_array_call(ctx_factory): knl = lp.fix_parameters(knl, n=n) knl = lp.set_options(knl, return_dict=True) - knl = lp.register_callable_kernel(knl, argmin_kernel) + knl = lp.merge([knl, argmin_kernel]) b = np.random.randn(n) evt, out_dict = knl(queue, b=b) tol = 1e-15 @@ -377,8 +374,8 @@ def test_packing_unpacking(ctx_factory, inline): [k]: y2[k] = callee_fn2([k]: x2[k]) """) - knl = lp.register_callable_kernel(knl, callee1) - knl = lp.register_callable_kernel(knl, callee2) + knl = lp.merge([knl, callee1]) + knl = lp.merge([knl, callee2]) knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn1') knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn2') @@ -422,19 +419,19 @@ def test_non_sub_array_refs_arguments(ctx_factory): is_output=False), '...'], name="caller", target=lp.CTarget()) - registered = lp.register_callable_kernel(caller1, callee) + registered = lp.merge([caller1, callee]) inlined = _match_caller_callee_argument_dimension_(registered, callee.name) inlined = lp.inline_callable_kernel(inlined, callee.name) print(inlined) - registered = lp.register_callable_kernel(caller2, callee) + registered = lp.merge([caller2, callee]) inlined = _match_caller_callee_argument_dimension_(registered, callee.name) inlined = lp.inline_callable_kernel(inlined, callee.name) print(inlined) - registered = lp.register_callable_kernel(caller3, callee) + registered = lp.merge([caller3, callee]) inlined = _match_caller_callee_argument_dimension_(registered, callee.name) inlined = lp.inline_callable_kernel(inlined, callee.name) @@ -462,7 +459,7 @@ def test_empty_sub_array_refs(ctx_factory, inline): """, [lp.GlobalArg('x, y', dtype=np.float64, shape=(10, )), '...']) - caller = lp.register_callable_kernel(caller, callee) + caller = lp.merge([caller, callee]) if inline: caller = lp.inline_callable_kernel(caller, callee.name) @@ -499,8 +496,7 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline): '...'] ) - knl = lp.register_callable_kernel( - parent_knl, child_knl) + knl = lp.merge([parent_knl, child_knl]) if inline: knl = lp.inline_callable_kernel(knl, 'linear_combo') @@ -535,8 +531,8 @@ def test_stride_depending_on_args(): lp.ValueArg('N', dtype=np.int32), lp.GlobalArg('x', shape=lp.auto, dtype=np.float64), '...']) - prog = lp.register_callable_kernel(prog, twice) - prog = lp.register_callable_kernel(prog, thrice) + prog = lp.merge([prog, twice]) + prog = lp.merge([prog, thrice]) # FIXME: actually test something print(lp.generate_code_v2(prog).device_code()) @@ -559,7 +555,7 @@ def test_unknown_stride_to_callee(): dtype=np.int32), lp.GlobalArg('x', shape=lp.auto, dtype=np.float64), '...']) - prog = lp.register_callable_kernel(prog, twice) + prog = lp.merge([prog, twice]) # FIXME: actually test something print(lp.generate_code_v2(prog).device_code()) @@ -580,7 +576,7 @@ def test_argument_matching_for_inplace_update(ctx_factory): x[:] = twice(x[:]) """, [lp.GlobalArg('x', shape=(10,), dtype=np.float64)]) - knl = lp.register_callable_kernel(knl, twice) + knl = lp.merge([knl, twice]) x = np.random.randn(10) evt, (out, ) = knl(queue, x=np.copy(x)) @@ -603,7 +599,7 @@ def test_non_zero_start_in_subarray_ref(ctx_factory): [i]:y[i+5] = twice([j]: x[j]) """, [lp.GlobalArg('x, y', shape=(10,), dtype=np.float64)]) - knl = lp.register_callable_kernel(knl, twice) + knl = lp.merge([knl, twice]) x = np.random.randn(10) evt, (out, ) = knl(queue, 
x=np.copy(x)) -- GitLab From 6ad17dd9a19671fedb6944b44dfdb2b0cf20196c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 21 Oct 2019 17:33:39 -0500 Subject: [PATCH 638/916] miscellaneous minor fixes --- loopy/codegen/__init__.py | 1 + loopy/kernel/function_interface.py | 2 +- loopy/preprocess.py | 17 ++++++++++++----- loopy/program.py | 7 ++++--- loopy/target/pyopencl_execution.py | 6 +++--- loopy/transform/callable.py | 2 +- loopy/transform/pack_and_unpack_args.py | 9 +++------ test/test_callables.py | 19 +++++++++++-------- 8 files changed, 36 insertions(+), 27 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 8d5bd14f4..3a3b88de5 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -703,6 +703,7 @@ def generate_code_v2(program): cgr = CodeGenerationResult( host_programs=host_programs, device_programs=device_programs, + device_preambles=device_preambles, implemented_data_infos=implemented_data_infos) return cgr diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index b58e05b6c..3584440fb 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -739,7 +739,7 @@ class CallableKernel(InKernelCallable): unknown_deps = dependents - self.subkernel.all_variable_names() if expr is None: - assert dependents == frozenset() + assert unknown_deps == frozenset() # FIXME: Need to make sure that we make the name of the variables # unique, and then run a subst_mapper diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 475ca8df7..4037229aa 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2314,7 +2314,7 @@ def infer_arg_descr(program): from loopy.kernel.array import ArrayBase from loopy.kernel.function_interface import (ArrayArgDescriptor, ValueArgDescriptor) - from loopy import auto + from loopy import auto, ValueArg clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, program.entrypoints) @@ -2329,10 +2329,17 @@ def infer_arg_descr(program): if isinstance(s, int): return s, return s - arg_id_to_descr = dict((arg.name, ArrayArgDescriptor( - _tuple_if_int(arg.shape), arg.address_space, arg.dim_tags) if - isinstance(arg, ArrayBase) else ValueArgDescriptor()) for arg in - program[e].args if arg.shape not in (None, auto)) + arg_id_to_descr = {} + for arg in program[e].args: + if isinstance(arg, ArrayBase): + if arg.shape not in (None, auto): + arg_id_to_descr[arg.name] = ArrayArgDescriptor( + _tuple_if_int(arg.shape), arg.address_space, + arg.dim_tags) + elif isinstance(arg, ValueArg): + arg_id_to_descr[arg.name] = ValueArgDescriptor() + else: + raise NotImplementedError() new_callable, clbl_inf_ctx, _ = program.callables_table[e].with_descrs( arg_id_to_descr, None, clbl_inf_ctx) clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) diff --git a/loopy/program.py b/loopy/program.py index 61556df9a..75fd0d77d 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -316,9 +316,10 @@ class Program(ImmutableRecord): return result def __getattr__(self, attr): - if attr in self.entrypoints: - return lambda *args, **kwargs: self(*args, entrypoint=attr, - **kwargs) + if self.entrypoints: + if attr in self.entrypoints: + return lambda *args, **kwargs: self(*args, entrypoint=attr, + **kwargs) return super(Program, self).__getattr__(attr) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 475e6d1c8..0af40a1fe 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -293,13 
+293,13 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): if program[entrypoint].options.write_cl: #FIXME: redirect to "translation unit" level option as well. output = dev_code - if self.program.root_kernel.options.highlight_cl: + if self.program[entrypoint].options.highlight_cl: output = get_highlighted_code(output) - if self.program.root_kernel.options.write_cl is True: + if self.program[entrypoint].options.write_cl is True: print(output) else: - with open(self.program.root_kernel.options.write_cl, "w") as outf: + with open(self.program[entrypoint].options.write_cl, "w") as outf: outf.write(output) if program[entrypoint].options.edit_cl: diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index cac0ea9fa..84537164f 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -554,7 +554,7 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name): .. note:: - The callee kernel addressed by *callee_funciton_name*, should be + The callee kernel addressed by *callee_function_name*, should be called only once. """ assert isinstance(program, Program) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index a18326187..33830d4ab 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -321,7 +321,7 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, def pack_and_unpack_args_for_call(program, *args, **kwargs): assert isinstance(program, Program) - new_resolved_functions = {} + new_callables = {} for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = pack_and_unpack_args_for_call_for_single_kernel( @@ -329,17 +329,14 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs): *args, **kwargs) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) - elif isinstance(in_knl_callable, ScalarCallable): pass else: raise NotImplementedError("Unknown type of callable %s." 
% ( type(in_knl_callable).__name__)) - new_resolved_functions[func_id] = in_knl_callable + new_callables[func_id] = in_knl_callable - new_callables_table = program.callables_table.copy( - resolved_functions=new_resolved_functions) - return program.copy(callables_table=new_callables_table) + return program.copy(callables_table=new_callables) # vim: foldmethod=marker diff --git a/test/test_callables.py b/test/test_callables.py index 1457da3af..111861f4e 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -403,7 +403,8 @@ def test_non_sub_array_refs_arguments(ctx_factory): callee = lp.make_function("{[i] : 0 <= i < 6}", "a[i] = a[i] + j", [lp.GlobalArg("a", dtype="double", shape=(6,), is_output=True, is_input=True), - lp.ValueArg("j", dtype="int")], name="callee") + lp.ValueArg("j", dtype="int")], name="callee", + target=lp.CTarget()) caller1 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:] = callee(a[:], b[0])", [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output=False), lp.GlobalArg("b", dtype="double", shape=(1, ), is_output=False)], @@ -420,20 +421,22 @@ def test_non_sub_array_refs_arguments(ctx_factory): name="caller", target=lp.CTarget()) registered = lp.merge([caller1, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, callee.name) - inlined = lp.inline_callable_kernel(inlined, callee.name) + inlined = _match_caller_callee_argument_dimension_(registered, 'callee') + inlined = lp.inline_callable_kernel(inlined, 'callee') print(inlined) registered = lp.merge([caller2, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, callee.name) - inlined = lp.inline_callable_kernel(inlined, callee.name) + inlined = _match_caller_callee_argument_dimension_(registered, 'callee') + inlined = lp.inline_callable_kernel(inlined, 'callee') print(inlined) registered = lp.merge([caller3, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, callee.name) - inlined = lp.inline_callable_kernel(inlined, callee.name) + inlined = _match_caller_callee_argument_dimension_(registered, 'callee') + inlined = lp.inline_callable_kernel(inlined, 'callee') + + print(inlined) print(inlined) @@ -462,7 +465,7 @@ def test_empty_sub_array_refs(ctx_factory, inline): caller = lp.merge([caller, callee]) if inline: - caller = lp.inline_callable_kernel(caller, callee.name) + caller = lp.inline_callable_kernel(caller, 'wence_function') evt, (out, ) = caller(queue, x=x, y=y) assert np.allclose(out, x-y) -- GitLab From 75dae8648071086106ec88979235b5d6e3b85440 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 22 Oct 2019 13:05:24 -0500 Subject: [PATCH 639/916] removing root kernel --- loopy/auto_test.py | 51 ++++++++++++++++++++++++++------------- loopy/kernel/tools.py | 39 ++++++++++++++++++++++++------ loopy/target/execution.py | 2 +- test/test_loopy.py | 38 ++++++++++++++++------------- 4 files changed, 88 insertions(+), 42 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index b5039bd2c..8b09aead7 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -76,7 +76,7 @@ class TestArgInfo(Record): # {{{ "reference" arguments -def make_ref_args(program, impl_arg_info, queue, parameters): +def make_ref_args(kernel, impl_arg_info, queue, parameters): import pyopencl as cl import pyopencl.array as cl_array @@ -89,7 +89,7 @@ def make_ref_args(program, impl_arg_info, queue, parameters): ref_arg_data = [] for arg in impl_arg_info: - kernel_arg = program.impl_arg_to_arg.get(arg.name) + kernel_arg = kernel.impl_arg_to_arg.get(arg.name) 
if arg.arg_class is ValueArg: if arg.offset_for_name: @@ -370,7 +370,8 @@ def auto_test_vs_ref( dump_binary=False, fills_entire_output=None, do_check=True, check_result=None, max_test_kernel_count=1, - quiet=False, blacklist_ref_vendors=[]): + quiet=False, blacklist_ref_vendors=[], ref_entrypoint=None, + test_entrypoint=None): """Compare results of `ref_knl` to the kernels generated by scheduling *test_knl*. @@ -386,14 +387,25 @@ def auto_test_vs_ref( test_prog = ref_prog do_check = False + if ref_entrypoint is None: + if len(ref_prog.entrypoints) != 1: + raise LoopyError("Unable to guess entrypoint for ref_prog.") + ref_entrypoint = list(ref_prog.entrypoints)[0] + + if test_entrypoint is None: + if len(test_prog.entrypoints) != 1: + raise LoopyError("Unable to guess entrypoint for ref_prog.") + test_entrypoint = list(test_prog.entrypoints)[0] + ref_prog = lp.preprocess_kernel(ref_prog) test_prog = lp.preprocess_kernel(test_prog) - if len(ref_prog.args) != len(test_prog.args): + if len(ref_prog[ref_entrypoint].args) != len(test_prog[test_entrypoint].args): raise LoopyError("ref_prog and test_prog do not have the same number " "of arguments") - for i, (ref_arg, test_arg) in enumerate(zip(ref_prog.args, test_prog.args)): + for i, (ref_arg, test_arg) in enumerate(zip(ref_prog[ref_entrypoint].args, + test_prog[test_entrypoint].args)): if ref_arg.name != test_arg.name: raise LoopyError("ref_prog and test_prog argument lists disagree at " "index %d (1-based)" % (i+1)) @@ -434,10 +446,13 @@ def auto_test_vs_ref( properties=cl.command_queue_properties.PROFILING_ENABLE) ref_codegen_result = lp.generate_code_v2(ref_prog) - ref_implemented_data_info = ref_codegen_result.implemented_data_info + #FIXME: This is not correct, but I am thinking of moving to a dict of + #implemented_data_info anyway. That should make it more elegant. 
+ assert len(ref_prog.entrypoints) == 1 + ref_implemented_data_info = ref_codegen_result.implemented_data_infos[0] logger.info("%s (ref): trying %s for the reference calculation" % ( - ref_prog.name, dev)) + ref_entrypoint, dev)) if not quiet and print_ref_code: print(75*"-") @@ -449,7 +464,7 @@ def auto_test_vs_ref( try: ref_args, ref_arg_data = \ - make_ref_args(ref_prog, + make_ref_args(ref_prog[ref_entrypoint], ref_implemented_data_info, ref_queue, parameters) ref_args["out_host"] = False @@ -475,8 +490,8 @@ def auto_test_vs_ref( ref_queue.finish() logger.info("%s (ref): using %s for the reference calculation" % ( - ref_prog.name, dev)) - logger.info("%s (ref): run" % ref_prog.name) + ref_entrypoint, dev)) + logger.info("%s (ref): run" % ref_entrypoint) ref_start = time() @@ -489,7 +504,7 @@ def auto_test_vs_ref( ref_stop = time() ref_elapsed_wall = ref_stop-ref_start - logger.info("%s (ref): run done" % ref_prog.name) + logger.info("%s (ref): run done" % ref_entrypoint) ref_evt.wait() ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START) @@ -515,8 +530,10 @@ def auto_test_vs_ref( test_prog = infer_unknown_types(test_prog, expect_completion=True) test_prog_codegen_result = lp.generate_code_v2(test_prog) - args = make_args(test_prog, - test_prog_codegen_result.implemented_data_info, + assert len(test_prog.entrypoints) == 1 + + args = make_args(test_prog[test_entrypoint], + test_prog_codegen_result.implemented_data_infos[0], queue, ref_arg_data, parameters) args["out_host"] = False @@ -533,7 +550,7 @@ def auto_test_vs_ref( print(test_prog_codegen_result.cl_program.binaries[0]) print(75*"-") - logger.info("%s: run warmup" % (test_prog.name)) + logger.info("%s: run warmup" % (test_entrypoint)) for i in range(warmup_rounds): if not AUTO_TEST_SKIP_RUN: @@ -568,9 +585,9 @@ def auto_test_vs_ref( events = [] queue.finish() - logger.info("%s: warmup done" % (test_prog.name)) + logger.info("%s: warmup done" % (test_entrypoint)) - logger.info("%s: timing run" % (test_prog.name)) + logger.info("%s: timing run" % (test_entrypoint)) timing_rounds = max(warmup_rounds, 1) @@ -614,7 +631,7 @@ def auto_test_vs_ref( else: break - logger.info("%s: timing run done" % (test_prog.name)) + logger.info("%s: timing run done" % (test_entrypoint)) rates = "" for cnt, lbl in zip(op_count, op_label): diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index c468a2201..27b1efe8a 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -39,6 +39,7 @@ from loopy.tools import natsorted from loopy.symbolic import CombineMapper from loopy.kernel import LoopKernel from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import CallableKernel from loopy.kernel.instruction import (MultiAssignmentBase, _DataObliviousInstruction) from functools import reduce @@ -48,20 +49,36 @@ logger = logging.getLogger(__name__) # {{{ add and infer argument dtypes -def add_dtypes(kernel, dtype_dict): +def add_dtypes(prog_or_kernel, dtype_dict): """Specify remaining unspecified argument/temporary variable types. :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ + if isinstance(prog_or_kernel, Program): + kernel_names = [clbl.subkernel.name for clbl in + six.itervalues(prog_or_kernel.callables_table) if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError("add_dtypes may not take a Program with more than" + " one callable kernels. 
Please provide individual kernels" + " instead.") + + kernel_name, = kernel_names + + return prog_or_kernel.with_kernel( + add_dtypes(prog_or_kernel[kernel_name], dtype_dict)) + + assert isinstance(prog_or_kernel, LoopKernel) + dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes( - kernel, dtype_dict) + prog_or_kernel, dtype_dict) if dtype_dict_remainder: raise RuntimeError("unused argument dtypes: %s" % ", ".join(dtype_dict_remainder)) - return kernel.copy(args=new_args, temporary_variables=new_temp_vars) + return prog_or_kernel.copy(args=new_args, temporary_variables=new_temp_vars) def _add_dtypes_overdetermined(knl, dtype_dict): @@ -113,8 +130,18 @@ def get_arguments_with_incomplete_dtype(knl): if arg.dtype is None] -def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): +def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False, + kernel_name=None): assert isinstance(prog, Program) + if kernel_name is None: + kernel_names = [clbl.subkernel.name for clbl in + six.itervalues(prog.callables_table) if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError("Provide 'kernel_name' argument.") + + kernel_name, = kernel_names + processed_dtype_dict = {} for k, v in six.iteritems(dtype_dict): @@ -123,7 +150,7 @@ def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False): if subkey: processed_dtype_dict[subkey] = v - prog = add_dtypes(prog, processed_dtype_dict) + prog = prog.with_kernel(add_dtypes(prog[kernel_name], processed_dtype_dict)) from loopy.type_inference import infer_unknown_types return infer_unknown_types(prog, expect_completion=expect_completion) @@ -1883,8 +1910,6 @@ def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,): if insn_ids is None: insn_ids = frozenset(insn.id for insn in kernel.instructions) - from loopy.kernel.function_interface import CallableKernel - def _get_callee_kernel_if_insn_has_callable_kernel(insn_id): """Returns callee kernel if the instruction has a call to a :class:`loopy.kernel.function_interface.CallableKernel`. 
Otherwise diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 2888462a4..ee2390ab7 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -84,7 +84,7 @@ class SeparateArrayPackingController(object): sep_shape=arg.sep_shape(), subscripts_and_names=subscripts_and_names, is_written=arg.name in - program.root_kernel.get_written_variables()) + program[entrypoint].get_written_variables()) def unpack(self, kernel_kwargs): if not self.packing_info: diff --git a/test/test_loopy.py b/test/test_loopy.py index c762b84bc..799b415ed 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -97,7 +97,7 @@ def test_complicated_subst(ctx_factory): print(knl) - sr_keys = list(knl.root_kernel.substitutions.keys()) + sr_keys = list(knl['loopy_kernel'].substitutions.keys()) for letter, how_many in [ ("f", 1), ("g", 1), @@ -145,13 +145,13 @@ def test_type_inference_with_type_dependencies(): prog = lp.infer_unknown_types(prog) from loopy.types import to_loopy_type - assert prog.root_kernel.temporary_variables["a"].dtype == to_loopy_type( + assert prog['loopy_kernel'].temporary_variables["a"].dtype == to_loopy_type( np.int32) - assert prog.root_kernel.temporary_variables["b"].dtype == to_loopy_type( + assert prog['loopy_kernel'].temporary_variables["b"].dtype == to_loopy_type( np.float32) - assert prog.root_kernel.temporary_variables["c"].dtype == to_loopy_type( + assert prog['loopy_kernel'].temporary_variables["c"].dtype == to_loopy_type( np.float32) - assert prog.root_kernel.temporary_variables["d"].dtype == to_loopy_type( + assert prog['loopy_kernel'].temporary_variables["d"].dtype == to_loopy_type( np.complex128) @@ -268,9 +268,8 @@ def test_bare_data_dependency(ctx_factory): lp.ValueArg("n", np.int32), ]) - cknl = lp.CompiledKernel(ctx, knl) n = 20000 - evt, (a,) = cknl(queue, n=n, out_host=True) + evt, (a,) = knl(queue, n=n, out_host=True) assert a.shape == (n,) assert (a == 1).all() @@ -291,7 +290,8 @@ def test_ilp_write_race_detection_global(ctx_factory): lp.ValueArg("n", np.int32, approximately=1000), ], assumptions="n>=1", - target=lp.PyOpenCLTarget(ctx.devices[0])) + target=lp.PyOpenCLTarget(ctx.devices[0]), + name="loopy_kernel") knl = lp.tag_inames(knl, dict(j="ilp")) @@ -301,7 +301,7 @@ def test_ilp_write_race_detection_global(ctx_factory): from loopy.diagnostic import WriteRaceConditionWarning from warnings import catch_warnings with catch_warnings(record=True) as warn_list: - list(lp.generate_loop_schedules(knl.root_kernel, + list(lp.generate_loop_schedules(knl["loopy_kernel"], knl.callables_table)) assert any(isinstance(w.message, WriteRaceConditionWarning) @@ -317,12 +317,13 @@ def test_ilp_write_race_avoidance_local(ctx_factory): "<> a[i] = 5+i+j", ], [], - target=lp.PyOpenCLTarget(ctx.devices[0])) + target=lp.PyOpenCLTarget(ctx.devices[0]), + name="loopy_kernel") knl = lp.tag_inames(knl, dict(i="l.0", j="ilp")) knl = lp.preprocess_kernel(knl, ctx.devices[0]) - assert knl.root_kernel.temporary_variables["a"].shape == (16, 17) + assert knl["loopy_kernel"].temporary_variables["a"].shape == (16, 17) def test_ilp_write_race_avoidance_private(ctx_factory): @@ -334,12 +335,13 @@ def test_ilp_write_race_avoidance_private(ctx_factory): "<> a = 5+j", ], [], - target=lp.PyOpenCLTarget(ctx.devices[0])) + target=lp.PyOpenCLTarget(ctx.devices[0]), + name="loopy_kernel") knl = lp.tag_inames(knl, dict(j="ilp")) knl = lp.preprocess_kernel(knl) - assert knl.root_kernel.temporary_variables["a"].shape == (16,) + assert knl["loopy_kernel"].temporary_variables["a"].shape == 
(16,) # }}} @@ -581,10 +583,11 @@ def test_dependent_domain_insn_iname_finding(ctx_factory): lp.GlobalArg("strengths", None, shape="nsources"), "..."], - target=lp.PyOpenCLTarget(ctx.devices[0])) + target=lp.PyOpenCLTarget(ctx.devices[0]), + name="loopy_kernel") print(prog) - assert "isrc_box" in prog.root_kernel.insn_inames("set_strength") + assert "isrc_box" in prog["loopy_kernel"].insn_inames("set_strength") prog = lp.add_dtypes(prog, dict( @@ -607,10 +610,11 @@ def test_inames_deps_from_write_subscript(ctx_factory): [ lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", None, shape=None), - "..."]) + "..."], + name="loopy_kernel") print(prog) - assert "i" in prog.root_kernel.insn_inames("myred") + assert "i" in prog['loopy_kernel'].insn_inames("myred") def test_modulo_indexing(ctx_factory): -- GitLab From 188b38aba7f800fb9253a19a779fa06f3651c9c4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 22 Oct 2019 13:05:53 -0500 Subject: [PATCH 640/916] misc. minor error fixes --- loopy/codegen/control.py | 2 +- loopy/kernel/array.py | 20 ++++++++++++++++++++ loopy/kernel/function_interface.py | 11 ++++++----- loopy/preprocess.py | 2 +- loopy/target/pyopencl_execution.py | 3 +-- loopy/transform/data.py | 8 +++----- loopy/type_inference.py | 14 ++++++++++---- 7 files changed, 42 insertions(+), 18 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 81959032a..e3c558916 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -179,7 +179,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): if sched_item.synchronization_kind in ["global", "local"]: # host code is assumed globally and locally synchronous return CodeGenerationResult( - host_program=None, + host_programs=[], device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index d079aebe5..94d867f8d 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -74,6 +74,9 @@ class _StrideArrayDimTagBase(ArrayDimImplementationTag): occur. 
""" + def depends_on(self): + raise NotImplementedError() + class FixedStrideArrayDimTag(_StrideArrayDimTagBase): """An arg dimension implementation tag for a fixed (potentially @@ -125,6 +128,14 @@ class FixedStrideArrayDimTag(_StrideArrayDimTagBase): def map_expr(self, mapper): return self.copy(stride=mapper(self.stride)) + def depends_on(self): + from loopy.kernel.data import auto + from loopy.symbolic import DependencyMapper + if self.stride is auto: + return frozenset() + + return DependencyMapper(composite_leaves=auto)(self.stride) + class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): """ @@ -159,6 +170,9 @@ class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + class SeparateArrayArrayDimTag(ArrayDimImplementationTag): def stringify(self, include_target_axis): @@ -170,6 +184,9 @@ class SeparateArrayArrayDimTag(ArrayDimImplementationTag): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + class VectorArrayDimTag(ArrayDimImplementationTag): def stringify(self, include_target_axis): @@ -181,6 +198,9 @@ class VectorArrayDimTag(ArrayDimImplementationTag): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + NESTING_LEVEL_RE = re.compile(r"^N([-0-9]+)(?::(.*)|)$") PADDED_STRIDE_TAG_RE = re.compile(r"^([a-zA-Z]*)\(pad=(.*)\)$") diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 3584440fb..cbd948ef8 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -79,7 +79,8 @@ class ArrayArgDescriptor(ImmutableRecord): .. attribute:: dim_tags - A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` + A tuple of instances of + :class:`loopy.kernel.array.ArrayDimImplementationTag` """ fields = set(['shape', 'address_space', 'dim_tags']) @@ -88,13 +89,13 @@ class ArrayArgDescriptor(ImmutableRecord): # {{{ sanity checks - from loopy.kernel.array import FixedStrideArrayDimTag + from loopy.kernel.array import ArrayDimImplementationTag assert isinstance(shape, tuple) assert isinstance(dim_tags, tuple) # FIXME at least vector dim tags should be supported - assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in + assert all(isinstance(dim_tag, ArrayDimImplementationTag) for dim_tag in dim_tags) # }}} @@ -117,8 +118,8 @@ class ArrayArgDescriptor(ImmutableRecord): def depends_on(self): result = DependencyMapper(composite_leaves=False)(self.shape) | ( - DependencyMapper(composite_leaves=False)(tuple(dim_tag.stride for - dim_tag in self.dim_tags))) + frozenset().union(*(dim_tag.depends_on() for dim_tag in + self.dim_tags))) return frozenset(var.name for var in result) # FIXME ArrayArgDescriptor should never need to be persisted, remove diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 4037229aa..4db499dd4 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1723,7 +1723,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, from loopy.type_inference import ( infer_arg_and_reduction_dtypes_for_reduction_expression) - arg_dtypes, reduction_dtypes, callables_table = ( + arg_dtypes, reduction_dtypes = ( infer_arg_and_reduction_dtypes_for_reduction_expression( temp_kernel, expr, callables_table, unknown_types_ok)) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 0af40a1fe..d41fe7006 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py 
@@ -315,8 +315,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .build(options=program[entrypoint].options.cl_build_options)) cl_kernels = _Kernels() - for dp in program.entrypoints: - #FIXME: This will fail for barriers, use a better option here. + for dp in cl_program.kernel_names.split(';'): setattr(cl_kernels, dp, getattr(cl_program, dp)) return _KernelInfo( diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 2c9499d9d..cd8656aee 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -372,7 +372,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, def add_prefetch(program, *args, **kwargs): assert isinstance(program, Program) - new_resolved_functions = {} + new_callables = {} for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = add_prefetch_for_single_kernel( @@ -387,11 +387,9 @@ def add_prefetch(program, *args, **kwargs): raise NotImplementedError("Unknown type of callable %s." % ( type(in_knl_callable).__name__)) - new_resolved_functions[func_id] = in_knl_callable + new_callables[func_id] = in_knl_callable - new_callables_table = program.callables_table.copy( - resolved_functions=new_resolved_functions) - return program.copy(callables_table=new_callables_table) + return program.copy(callables_table=new_callables) # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index b646f2d21..e56a0f2af 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -542,6 +542,8 @@ class TypeInferenceMapper(CombineMapper): in_knl_callable = ManglerCallable( identifier, function_mangler, arg_id_to_dtype, arg_id_to_descr, mangle_result.target_name) + # FIXME: we have not tested how it works with mangler callable + # yet. 
self.callables_table, new_function_id = ( self.callables_table.with_added_callable( expr.function, in_knl_callable)) @@ -713,6 +715,11 @@ class TypeReader(TypeInferenceMapper): # }}} + def with_assignments(self, names_to_vars): + new_ass = self.new_assignments.copy() + new_ass.update(names_to_vars) + return type(self)(self.kernel, self.callables, new_ass) + def map_call(self, expr, return_tuple=False): identifier = expr.function if isinstance(identifier, (Variable, ResolvedFunction)): @@ -749,7 +756,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): if var_name in kernel.all_params(): return [kernel.index_dtype], [], {}, ( - type_inf_mapper.callables_table) + type_inf_mapper.clbl_inf_ctx) from functools import partial debug = partial(_debug, kernel) @@ -1107,7 +1114,7 @@ def infer_unknown_types(program, expect_completion=False): def infer_arg_and_reduction_dtypes_for_reduction_expression( kernel, expr, callables_table, unknown_types_ok): - type_inf_mapper = TypeInferenceMapper(kernel, callables_table) + type_inf_mapper = TypeReader(kernel, callables_table) import loopy as lp if expr.is_tuple_typed: @@ -1138,8 +1145,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( if dt is not lp.auto else dt for dt in reduction_dtypes) - return tuple(arg_dtypes), reduction_dtypes, ( - type_inf_mapper.callables_table) + return tuple(arg_dtypes), reduction_dtypes # }}} -- GitLab From fb17ad8488b018fd83fec0c92f21de3d32bb93db Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 22 Oct 2019 15:50:20 -0500 Subject: [PATCH 641/916] some more root_kernel -> entrypoint --- loopy/transform/save.py | 12 +++++--- test/test_loopy.py | 63 +++++++++++++++++++++-------------------- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index e463353ef..138d83573 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -724,7 +724,7 @@ class TemporarySaver(object): # {{{ auto save and reload across kernel calls -def save_and_reload_temporaries(program): +def save_and_reload_temporaries(program, entrypoint=None): """ Add instructions to save and reload temporary variables that are live across kernel calls. 
@@ -747,13 +747,17 @@ def save_and_reload_temporaries(program): :returns: The resulting kernel """ + if entrypoint is None: + if len(program.entrypoints) != 1: + raise LoopyError("Missing argument 'entrypoint'.") + entrypoint = list(program.entrypoints)[0] - knl = program.root_kernel + knl = program[entrypoint] if not knl.schedule: program = lp.preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel - knl = get_one_scheduled_kernel(program.root_kernel, + knl = get_one_scheduled_kernel(program[entrypoint], program.callables_table) assert knl.schedule is not None @@ -797,7 +801,7 @@ def save_and_reload_temporaries(program): .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) - return program.with_root_kernel(saver.finish()) + return program.with_kernel(saver.finish()) # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 799b415ed..6a780eaab 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -993,12 +993,13 @@ def test_within_inames_and_reduction(): lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)), ], target=lp.CTarget(), + name="loopy_kernel" ) prog = lp.preprocess_kernel(prog) - assert 'i' not in prog.root_kernel.insn_inames("insn_0_j_update") - print(prog.root_kernel.stringify(with_dependencies=True)) + assert 'i' not in prog["loopy_kernel"].insn_inames("insn_0_j_update") + print(prog["loopy_kernel"].stringify(with_dependencies=True)) def test_literal_local_barrier(ctx_factory): @@ -1112,7 +1113,7 @@ def save_and_reload_temporaries_test(queue, prog, out_expect, debug=False): from loopy.transform.save import save_and_reload_temporaries prog = save_and_reload_temporaries(prog) - prog = prog.with_root_kernel(lp.get_one_scheduled_kernel(prog.root_kernel, + prog = prog.with_kernel(lp.get_one_scheduled_kernel(prog["loopy_kernel"], prog.callables_table)) if debug: @@ -1615,7 +1616,7 @@ def test_regression_no_ret_call_removal(ctx_factory): "f(sum(i, x[i]))") prog = lp.add_and_infer_dtypes(prog, {"x": np.float32}) prog = lp.preprocess_kernel(prog) - assert len(prog.root_kernel.instructions) == 3 + assert len(prog["loopy_kernel"].instructions) == 3 def test_regression_persistent_hash(): @@ -1628,8 +1629,8 @@ def test_regression_persistent_hash(): "cse_exprvar = d[0]*d[0]") from loopy.tools import LoopyKeyBuilder lkb = LoopyKeyBuilder() - assert (lkb(knl1.root_kernel.instructions[0]) != - lkb(knl2.root_kernel.instructions[0])) + assert (lkb(knl1["loopy_kernel"].instructions[0]) != + lkb(knl2["loopy_kernel"].instructions[0])) assert lkb(knl1) != lkb(knl2) @@ -1648,7 +1649,7 @@ def test_sequential_dependencies(ctx_factory): end """, seq_dependencies=True) - print(prog.root_kernel.stringify(with_dependencies=True)) + print(prog["loopy_kernel"].stringify(with_dependencies=True)) lp.auto_test_vs_ref(prog, ctx, prog, parameters=dict(n=5)) @@ -1706,10 +1707,10 @@ def test_global_barrier(ctx_factory): knl = lp.preprocess_kernel(knl) assert ( - knl.root_kernel.temporary_variables["z"].address_space == + knl["loopy_kernel"].temporary_variables["z"].address_space == lp.AddressSpace.GLOBAL) assert ( - knl.root_kernel.temporary_variables["v"].address_space == + knl["loopy_kernel"].temporary_variables["v"].address_space == lp.AddressSpace.GLOBAL) print(knl) @@ -1873,7 +1874,7 @@ def test_const_temp_with_initializer_not_saved(): prog = lp.save_and_reload_temporaries(prog) # This ensures no save slot was added. 
- assert len(prog.root_kernel.temporary_variables) == 1 + assert len(prog["loopy_kernel"].temporary_variables) == 1 def test_header_extract(): @@ -2066,12 +2067,12 @@ def test_unscheduled_insn_detection(): "...") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) - prog = prog.with_root_kernel(knl) + knl = lp.get_one_scheduled_kernel(prog["loopy_kernel"], prog.callables_table) + prog = prog.with_kernel(knl) insn1, = lp.find_instructions(prog, "id:insn1") - insns = prog.root_kernel.instructions[:] + insns = prog["loopy_kernel"].instructions[:] insns.append(insn1.copy(id="insn2")) - prog = prog.with_root_kernel(prog.root_kernel.copy(instructions=insns)) + prog = prog.with_kernel(prog["loopy_kernel"].copy(instructions=insns)) from loopy.diagnostic import UnscheduledInstructionError with pytest.raises(UnscheduledInstructionError): @@ -2236,7 +2237,7 @@ def test_barrier_insertion_near_top_of_loop(): prog = lp.set_temporary_scope(prog, "a", "local") prog = lp.set_temporary_scope(prog, "b", "local") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) + knl = lp.get_one_scheduled_kernel(prog["loopy_kernel"], prog.callables_table) print(knl) @@ -2264,7 +2265,7 @@ def test_barrier_insertion_near_bottom_of_loop(): prog = lp.set_temporary_scope(prog, "a", "local") prog = lp.set_temporary_scope(prog, "b", "local") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) + knl = lp.get_one_scheduled_kernel(prog["loopy_kernel"], prog.callables_table) print(knl) @@ -2294,10 +2295,10 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): from testlib import GridOverride # artifically expand via overridden_get_grid_sizes_for_insn_ids - knl = prog.root_kernel + knl = prog["loopy_kernel"] knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) - prog = prog.with_root_kernel(knl) + prog = prog.with_kernel(knl) # make sure we can generate the code lp.generate_code_v2(prog) @@ -2322,7 +2323,7 @@ def test_multi_argument_reduction_type_inference(): allow_simultaneous=True), allow_simultaneous=True) - t_inf_mapper = TypeInferenceMapper(prog.root_kernel, + t_inf_mapper = TypeInferenceMapper(prog["loopy_kernel"], prog.callables_table) assert ( @@ -2356,7 +2357,7 @@ def test_global_barrier_order_finding(): end """) - assert (lp.get_global_barrier_order(prog.root_kernel) == ("top", "yoink", + assert (lp.get_global_barrier_order(prog["loopy_kernel"]) == ("top", "yoink", "postloop")) for insn, barrier in ( @@ -2367,7 +2368,7 @@ def test_global_barrier_order_finding(): ("yoink", "top"), ("postloop", "yoink"), ("zzzv", "postloop")): - assert lp.find_most_recent_global_barrier(prog.root_kernel, insn) == barrier + assert lp.find_most_recent_global_barrier(prog["loopy_kernel"], insn) == barrier def test_global_barrier_error_if_unordered(): @@ -2380,7 +2381,7 @@ def test_global_barrier_error_if_unordered(): from loopy.diagnostic import LoopyError with pytest.raises(LoopyError): - lp.get_global_barrier_order(prog.root_kernel) + lp.get_global_barrier_order(prog["loopy_kernel"]) def test_struct_assignment(ctx_factory): @@ -2449,7 +2450,7 @@ def test_kernel_var_name_generator(): <>b_s0 = 0 """) - vng = prog.root_kernel.get_var_name_generator() + vng = prog["loopy_kernel"].get_var_name_generator() assert vng("a_s0") != "a_s0" assert vng("b") != "b" @@ -2472,7 +2473,7 @@ def test_fixed_parameters(ctx_factory): def 
test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") - assert knl.root_kernel.all_params() == set(["n"]) + assert knl["loopy_kernel"].all_params() == set(["n"]) def test_execution_backend_can_cache_dtypes(ctx_factory): @@ -2504,14 +2505,14 @@ def test_wildcard_dep_matching(): all_insns = set("insn%d" % i for i in range(1, 6)) - assert prog.root_kernel.id_to_insn["insn1"].depends_on == set() - assert (prog.root_kernel.id_to_insn["insn2"].depends_on == all_insns - + assert prog["loopy_kernel"].id_to_insn["insn1"].depends_on == set() + assert (prog["loopy_kernel"].id_to_insn["insn2"].depends_on == all_insns - set(["insn2"])) - assert (prog.root_kernel.id_to_insn["insn3"].depends_on == all_insns - + assert (prog["loopy_kernel"].id_to_insn["insn3"].depends_on == all_insns - set(["insn3"])) - assert (prog.root_kernel.id_to_insn["insn4"].depends_on == set(["insn1", + assert (prog["loopy_kernel"].id_to_insn["insn4"].depends_on == set(["insn1", "insn2"])) - assert (prog.root_kernel.id_to_insn["insn5"].depends_on == all_insns - + assert (prog["loopy_kernel"].id_to_insn["insn5"].depends_on == all_insns - set(["insn1", "insn5"])) @@ -2625,7 +2626,7 @@ def test_add_prefetch_works_in_lhs_index(): prog = lp.add_prefetch(prog, "a1_map", "k", default_tag="l.auto") from loopy.symbolic import get_dependencies - for insn in prog.root_kernel.instructions: + for insn in prog["loopy_kernel"].instructions: assert "a1_map" not in get_dependencies(insn.assignees) @@ -2679,7 +2680,7 @@ def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier): prog = lp.tag_inames(prog, "i:l.0") prog = lp.preprocess_kernel(prog) - knl = lp.get_one_scheduled_kernel(prog.root_kernel, + knl = lp.get_one_scheduled_kernel(prog["loopy_kernel"], prog.callables_table) assert barrier_between(knl, "first", "second") == expect_barrier -- GitLab From 3d5efe6c4b3fea49419ebc6396e7cd6a3d31b089 Mon Sep 17 00:00:00 2001 From: "[6~" Date: Wed, 23 Oct 2019 20:34:43 -0500 Subject: [PATCH 642/916] Program.__setstate__: reinstate _program_executor_cache --- loopy/program.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/loopy/program.py b/loopy/program.py index 1fb691531..c874d7b39 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -403,6 +403,12 @@ class Program(ImmutableRecord): strify_callable(clbl) for name, clbl in self.callables_table.items()) + + def __setstate__(self, state_obj): + super(Program, self).__setstate__(state_obj) + + self._program_executor_cache = {} + # }}} -- GitLab From 842b25f759883c0beacc02be44e6c7197e1c6928 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 00:10:59 -0500 Subject: [PATCH 643/916] Fixes lang_version for make_kernel --- loopy/kernel/creation.py | 51 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 242389384..591a73483 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2209,7 +2209,7 @@ def make_function(domains, instructions, kernel_data=["..."], **kwargs): from loopy.version import ( MOST_RECENT_LANGUAGE_VERSION, FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " + warn("'lang_version' was not passed to make_function(). " "To avoid this warning, pass " "lang_version={ver} in this invocation. 
" "(Or say 'from loopy.version import " @@ -2383,6 +2383,55 @@ def make_function(domains, instructions, kernel_data=["..."], **kwargs): def make_kernel(*args, **kwargs): + # {{{ handle kernel language version + + from loopy.version import LANGUAGE_VERSION_SYMBOLS + + version_to_symbol = dict( + (getattr(loopy.version, lvs), lvs) + for lvs in LANGUAGE_VERSION_SYMBOLS) + + lang_version = kwargs.get("lang_version", None) + if lang_version is None: + # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION + + # This *is* gross. But it seems like the right thing interface-wise. + import inspect + caller_globals = inspect.currentframe().f_back.f_globals + + for ver_sym in LANGUAGE_VERSION_SYMBOLS: + try: + lang_version = caller_globals[ver_sym] + break + except KeyError: + pass + + # }}} + + if lang_version is None: + from warnings import warn + from loopy.diagnostic import LoopyWarning + from loopy.version import ( + MOST_RECENT_LANGUAGE_VERSION, + FALLBACK_LANGUAGE_VERSION) + warn("'lang_version' was not passed to make_kernel(). " + "To avoid this warning, pass " + "lang_version={ver} in this invocation. " + "(Or say 'from loopy.version import " + "{sym_ver}' in " + "the global scope of the calling frame.)" + .format( + ver=MOST_RECENT_LANGUAGE_VERSION, + sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] + ), + LoopyWarning, stacklevel=2) + + lang_version = FALLBACK_LANGUAGE_VERSION + + kwargs['lang_version'] = lang_version + + # }}} + tunit = make_function(*args, **kwargs) name, = [name for name in tunit.callables_table] return tunit.with_entrypoints(name) -- GitLab From f86aa4948846ae9fecea95c4bda5d9cc135f7931 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 00:20:04 -0500 Subject: [PATCH 644/916] duplicate_iname must take in a kernel and not a translation unit --- loopy/transform/iname.py | 43 ++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index c431fd45f..c2e268302 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -34,9 +34,9 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError -from loopy.program import iterate_over_kernels_if_given_program +from loopy.program import Program, iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.kernel.function_interface import CallableKernel __doc__ = """ @@ -1019,7 +1019,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False): +def get_iname_duplication_options(knl, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1049,6 +1049,11 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. 
""" + if isinstance(knl, Program): + if len([clbl for clbl in six.itervalues(knl.callables_table) if + isinstance(clbl, CallableKernel)]) == 1: + knl = knl[list(knl.entrypoints)[0]] + from loopy.kernel.data import ConcurrentTag concurrent_inames = set( @@ -1085,7 +1090,7 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals # If we find a duplication option and to not use boostable_into # information, we restart this generator with use_boostable_into=True if not use_boostable_into and not knl.options.ignore_boostable_into: - for option in get_iname_duplication_options_for_single_kernel(knl, True): + for option in get_iname_duplication_options(knl, True): yield option # Emit a warning that we needed boostable_into @@ -1113,36 +1118,18 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals yield iname, within -def get_iname_duplication_options(program, use_boostable_into=False): - for in_knl_callable in program.callables_table.values(): - if isinstance(in_knl_callable, CallableKernel): - for option in get_iname_duplication_options_for_single_kernel( - in_knl_callable.subkernel, use_boostable_into): - yield option - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of in kernel callable %s." - % (type(in_knl_callable))) - - return - - -def has_schedulable_iname_nesting_for_single_kernel(knl): +def has_schedulable_iname_nesting(knl): """ :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. """ - return not bool(next(get_iname_duplication_options_for_single_kernel(knl), + if isinstance(knl, Program): + if len([clbl for clbl in six.itervalues(knl.callables_table) if + isinstance(clbl, CallableKernel)]) == 1: + knl = knl[list(knl.entrypoints)[0]] + return not bool(next(get_iname_duplication_options(knl), False)) - -def has_schedulable_iname_nesting(program): - return all(has_schedulable_iname_nesting_for_single_kernel( - in_knl_callable.subkernel) for in_knl_callable in - program.callables_table.values() if isinstance(in_knl_callable, - CallableKernel)) - # }}} -- GitLab From 0bc0f6c3812b20d9cf4fdd49821092e142cef670 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 01:21:53 -0500 Subject: [PATCH 645/916] more appropriate to enforce expect_completion only at the entrypoint level --- loopy/kernel/function_interface.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index cbd948ef8..ff7faa006 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -676,8 +676,7 @@ class CallableKernel(InKernelCallable): specialized_kernel, callables_table = ( infer_unknown_types_for_a_single_kernel( pre_specialized_subkernel, - callables_table, - expect_completion=True)) + callables_table)) new_arg_id_to_dtype = {} for pos, kw in pos_to_kw.items(): -- GitLab From 6b23e7e80794c15f323b0b822da1e7cd4410165e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 02:20:25 -0500 Subject: [PATCH 646/916] Fix the address space of ConstantArg --- loopy/kernel/data.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index f0d7b3789..a717a8ced 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -418,12 +418,19 @@ def GlobalArg(*args, **kwargs): class ConstantArg(ArrayBase, KernelArgument): __doc__ = 
ArrayBase.__doc__ - min_target_axes = 0 - max_target_axes = 1 + + def __init__(self, *args, **kwargs): + if kwargs.pop('address_space', AddressSpace.GLOBAL) != AddressSpace.GLOBAL: + raise LoopyError("'address_space' for ConstantArg must be GLOBAL.") + super(ConstantArg, self).__init__(*args, **kwargs) # Constant Arg cannot be an output is_output = False is_input = True + address_space = AddressSpace.GLOBAL + + min_target_axes = 0 + max_target_axes = 1 def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written): return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape, -- GitLab From fba2c96c9f9fc720af8c839f6e4d80cd4dcd8abc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 02:20:59 -0500 Subject: [PATCH 647/916] minor fixes --- loopy/check.py | 9 ++++----- loopy/kernel/function_interface.py | 5 +++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index e77d009f7..cdce785e3 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -512,12 +512,11 @@ def check_write_destinations(kernel): # {{{ check_has_schedulable_iname_nesting def check_has_schedulable_iname_nesting(kernel): - from loopy.transform.iname import ( - has_schedulable_iname_nesting_for_single_kernel, - get_iname_duplication_options_for_single_kernel) - if not has_schedulable_iname_nesting_for_single_kernel(kernel): + from loopy.transform.iname import (has_schedulable_iname_nesting, + get_iname_duplication_options) + if not has_schedulable_iname_nesting(kernel): import itertools as it - opt = get_iname_duplication_options_for_single_kernel(kernel) + opt = get_iname_duplication_options(kernel) opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w) for i, w in it.islice(opt, 3)) raise LoopyError("Kernel does not have a schedulable iname nesting. " diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index ff7faa006..dfafe3c94 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -30,7 +30,7 @@ from loopy.diagnostic import LoopyError from loopy.tools import update_persistent_hash from loopy.kernel import LoopKernel -from loopy.kernel.data import ValueArg, ArrayArg +from loopy.kernel.data import ValueArg, ArrayArg, ConstantArg from loopy.symbolic import (SubstitutionMapper, DependencyMapper) from pymbolic.primitives import Variable @@ -752,7 +752,8 @@ class CallableKernel(InKernelCallable): assert isinstance(arg_id, str) if isinstance(descr, ArrayArgDescriptor): - if not isinstance(self.subkernel.arg_dict[arg_id], ArrayArg): + if not isinstance(self.subkernel.arg_dict[arg_id], (ArrayArg, + ConstantArg)): raise LoopyError("Array passed to scalar argument " "'%s' of the function '%s' (in '%s')." 
% ( arg_id, self.subkernel.name, -- GitLab From f11298047e4ab078cb884fd3895b679fa72897bc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 22:50:55 -0500 Subject: [PATCH 648/916] Changes to take into account ReductionOpFunction as callables keys --- loopy/library/function.py | 2 - loopy/library/reduction.py | 104 +++++++++++++++---------------------- loopy/program.py | 46 ++++++++++------ 3 files changed, 70 insertions(+), 82 deletions(-) diff --git a/loopy/library/function.py b/loopy/library/function.py index 118b9dcc5..a22ed3d7b 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -115,13 +115,11 @@ def get_loopy_callables(): - callables that have a predefined meaning in :mod:`loo.py` like ``make_tuple``, ``index_of``, ``indexof_vec``. """ - from loopy.library.reduction import get_reduction_callables known_callables = { "make_tuple": MakeTupleCallable(name="make_tuple"), "indexof": IndexOfCallable(name="indexof"), "indexof_vec": IndexOfCallable(name="indexof_vec"), } - known_callables.update(get_reduction_callables()) return known_callables diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 2d27d24ec..d21cbdca9 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -203,21 +203,18 @@ class MaxReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, operand2, callables_table, target): dtype, = dtype + from loopy.program import update_table # getting the callable 'max' from target - from loopy.program import (find_in_knl_callable_from_identifier, - _default_func_id_to_kernel_callable_mappers) - max_scalar_callable = find_in_knl_callable_from_identifier( - _default_func_id_to_kernel_callable_mappers(target), - target, "max") + max_scalar_callable = target.get_device_ast_builder().known_callables["max"] # type specialize the callable max_scalar_callable, callables_table = max_scalar_callable.with_types( {0: dtype, 1: dtype}, None, callables_table) # populate callables_table - callables_table, func_id = callables_table.with_added_callable( - 'max', max_scalar_callable) + func_id, callables_table = update_table(callables_table, "max", + max_scalar_callable) return ResolvedFunction(func_id)(operand1, operand2), callables_table @@ -228,21 +225,18 @@ class MinReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, operand2, callables_table, target): dtype, = dtype + from loopy.program import update_table - # getting the callable 'max' from target - from loopy.program import (find_in_knl_callable_from_identifier, - _default_func_id_to_kernel_callable_mappers) - min_scalar_callable = find_in_knl_callable_from_identifier( - _default_func_id_to_kernel_callable_mappers(target), - target, "min") + # getting the callable 'min' from target + min_scalar_callable = target.get_device_ast_builder().known_callables["min"] # type specialize the callable min_scalar_callable, callables_table = min_scalar_callable.with_types( {0: dtype, 1: dtype}, None, callables_table) # populate callables_table - callables_table, func_id = callables_table.with_added_callable( - 'min', min_scalar_callable) + func_id, callables_table = update_table(callables_table, "min", + min_scalar_callable) return ResolvedFunction(func_id)(operand1, operand2), callables_table @@ -305,21 +299,22 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype, callables_table, target): + from loopy.library.function import MakeTupleCallable + from 
loopy.program import update_table + scalar_neutral_element, calables_table = ( self.inner_reduction.neutral_element( scalar_dtype, callables_table, target)) - from loopy.program import (find_in_knl_callable_from_identifier, - _default_func_id_to_kernel_callable_mappers) - make_tuple_scalar_callable = find_in_knl_callable_from_identifier( - _default_func_id_to_kernel_callable_mappers(target), - target, "make_tuple") - make_tuple_scalar_callable, _ = ( - make_tuple_scalar_callable.with_types( - dict(enumerate([scalar_dtype, segment_flag_dtype])), None, - None)) - callables_table, func_id = callables_table.with_added_callable( - "make_tuple", make_tuple_scalar_callable) + make_tuple_callable = MakeTupleCallable( + name="make_tuple") + + make_tuple_callable, callables_table = make_tuple_callable.with_types( + dict(enumerate([scalar_dtype, segment_flag_dtype])), + None, callables_table) + + func_id, callables_table = update_table( + callables_table, "make_tuple", make_tuple_callable) return ResolvedFunction(func_id)(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)), callables_table @@ -339,13 +334,8 @@ class _SegmentedScalarReductionOperation(ReductionOperation): other.inner_reduction) def __call__(self, dtypes, operand1, operand2, callables_table, target): - # getting the callable 'max' from target - - from loopy.program import (find_in_knl_callable_from_identifier, - _default_func_id_to_kernel_callable_mappers) - segmented_scalar_callable = find_in_knl_callable_from_identifier( - _default_func_id_to_kernel_callable_mappers(target), - target, SegmentedOp(self)) + segmented_scalar_callable = ReductionCallable( + SegmentedOp(self)) # type specialize the callable segmented_scalar_callable, callables_table = ( @@ -354,8 +344,9 @@ class _SegmentedScalarReductionOperation(ReductionOperation): None, callables_table)) # populate callables_table - callables_table, func_id = callables_table.with_added_callable( - SegmentedOp(self), segmented_scalar_callable) + from loopy.program import update_table + func_id, callables_table = update_table( + callables_table, SegmentedOp(self), segmented_scalar_callable) return (ResolvedFunction(func_id)(*(operand1 + operand2)), callables_table) @@ -418,18 +409,18 @@ class _ArgExtremumReductionOperation(ReductionOperation): get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - from loopy.program import (find_in_knl_callable_from_identifier, - _default_func_id_to_kernel_callable_mappers) + from loopy.library.function import MakeTupleCallable + from loopy.program import update_table + make_tuple_callable = MakeTupleCallable( + name="make_tuple") - make_tuple_scalar_callable = find_in_knl_callable_from_identifier( - _default_func_id_to_kernel_callable_mappers(target), - target, "make_tuple") - make_tuple_scalar_callable, _ = ( - make_tuple_scalar_callable.with_types( - dict(enumerate([scalar_dtype, index_dtype])), None, - None)) - callables_table, func_id = callables_table.with_added_callable( - "make_tuple", make_tuple_scalar_callable) + make_tuple_callable, callables_table = make_tuple_callable.with_types( + dict(enumerate([scalar_dtype, index_dtype])), + None, callables_table) + + # populate callables_table + func_id, callables_table = update_table(callables_table, "make_tuple", + make_tuple_callable) return ResolvedFunction(func_id)(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)), callables_table @@ -448,13 +439,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): 
return 2 def __call__(self, dtypes, operand1, operand2, callables_table, target): - # getting the callable 'max' from target - - from loopy.program import (find_in_knl_callable_from_identifier, - _default_func_id_to_kernel_callable_mappers) - arg_ext_scalar_callable = find_in_knl_callable_from_identifier( - _default_func_id_to_kernel_callable_mappers(target), - target, ArgExtOp(self)) + arg_ext_scalar_callable = ReductionCallable(ArgExtOp(self)) # type specialize the callable arg_ext_scalar_callable, callables_table = ( @@ -463,8 +448,9 @@ class _ArgExtremumReductionOperation(ReductionOperation): None, callables_table)) # populate callables_table - callables_table, func_id = callables_table.with_added_callable( - ArgExtOp(self), arg_ext_scalar_callable) + from loopy.program import update_table + func_id, callables_table = update_table( + callables_table, ArgExtOp(self), arg_ext_scalar_callable) return (ResolvedFunction(func_id)(*(operand1 + operand2)), callables_table) @@ -627,14 +613,6 @@ class ReductionCallable(ScalarCallable): return -def get_reduction_callables(): - return dict((id_, ReductionCallable(id_)) for id_ in [ - ReductionOpFunction(SegmentedSumReductionOperation), - ReductionOpFunction(SegmentedProductReductionOperation), - ReductionOpFunction(ArgMaxReductionOperation), - ReductionOpFunction(ArgMinReductionOperation), - ]) - # }}} # vim: fdm=marker diff --git a/loopy/program.py b/loopy/program.py index 75fd0d77d..f0dce3846 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -142,18 +142,6 @@ class CallableResolver(RuleAwareIdentityMapper): expn_state) -def _default_func_id_to_kernel_callable_mappers(target): - """ - Returns a list of functions that are provided through *target* by deafault. - """ - from loopy.library.function import ( - loopy_specific_callable_func_id_to_knl_callable_mappers) - return ( - [loopy_specific_callable_func_id_to_knl_callable_mappers] + ( - target.get_device_ast_builder().function_id_in_knl_callable_mapper( - ))) - - # {{{ program class Program(ImmutableRecord): @@ -541,10 +529,17 @@ class CallablesInferenceContext(ImmutableRecord): for func_id, in_knl_callable in self.callables.items(): if in_knl_callable == in_kernel_callable: history[func_id] = function.name - return ( - self.copy( - history=history), - Variable(func_id)) + if isinstance(func_id, str): + return ( + self.copy( + history=history), + Variable(func_id)) + else: + assert isinstance(func_id, ReductionOpFunction) + return ( + self.copy( + history=history), + func_id) assert False else: @@ -629,7 +624,8 @@ class CallablesInferenceContext(ImmutableRecord): program.entrypoints): # at this point we should not rename anything to the names of # entrypoints - for new_func_id in (new_callable_ids-six.viewkeys(renames)): + for new_func_id in (new_callable_ids-six.viewkeys(renames)) & set( + six.iterkeys(self.history)): if old_func_id == self.history[new_func_id]: renames[new_func_id] = old_func_id break @@ -734,6 +730,22 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): return wraps(transform_for_single_kernel)(_collective_transform) + +def update_table(callables_table, clbl_id, clbl): + from loopy.kernel.function_interface import InKernelCallable + assert isinstance(clbl, InKernelCallable) + + for i, c in six.iteritems(callables_table): + if c == clbl: + return i, callables_table + + while clbl_id in callables_table: + clbl_id = next_indexed_function_identifier(clbl_id) + + callables_table[clbl_id] = clbl + + return clbl_id, callables_table + # }}} -- GitLab From 
d80e940598d7366edbbeec194ef67966c6b41ce3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 23:32:28 -0500 Subject: [PATCH 649/916] do not perform checks on type identities if all variable types are not resolved --- loopy/check.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index cdce785e3..4ed8de3d1 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -760,7 +760,12 @@ def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) - check_for_integer_subscript_indices(kernel, callables_table) + from loopy.kernel.data import auto + if all(arg.dtype not in [None, auto] for arg in kernel.args) and ( + all(tv.dtype not in [None, auto] for tv in + six.itervalues(kernel.temporary_variables))): + # only check if all types are known + check_for_integer_subscript_indices(kernel, callables_table) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel, callables_table) -- GitLab From dbe1b61c081c1dce8e75a35fa4082cd4178922f7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 23:56:06 -0500 Subject: [PATCH 650/916] implement target changing of a translation unit --- loopy/program.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index f0dce3846..f5118dd1b 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -211,17 +211,33 @@ class Program(ImmutableRecord): update_persistent_hash = update_persistent_hash def copy(self, **kwargs): - if 'target' in kwargs: + target = kwargs.pop('target', None) + program = super(Program, self).copy(**kwargs) + if target: from loopy.kernel import KernelState if max(callable_knl.subkernel.state for callable_knl in six.itervalues(self.callables_table) if isinstance(callable_knl, CallableKernel)) > ( KernelState.INITIAL): - if not isinstance(kwargs['target'], type(self.target)): + if not isinstance(target, type(self.target)): raise LoopyError("One of the kenels in the program has been " "preprocessed, cannot modify target now.") + callables = {} + for func_id, clbl in six.iteritems(program.callables_table): + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + knl = knl.copy(target=target) + clbl = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + callables[func_id] = clbl + + program = super(Program, program).copy( + callables_table=callables, target=target) - return super(Program, self).copy(**kwargs) + return program def with_entrypoints(self, entrypoints): """ -- GitLab From d24fb2e70a0de736463900a585511c80907ef251 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 23:57:29 -0500 Subject: [PATCH 651/916] do not resolve already resolved program --- loopy/preprocess.py | 10 ++++------ loopy/program.py | 7 +++++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 4db499dd4..264a49803 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2357,8 +2357,6 @@ preprocess_cache = WriteOncePersistentDict( def preprocess_single_kernel(kernel, callables_table, device=None): from loopy.kernel import KernelState - if kernel.state >= KernelState.PREPROCESSED: - return kernel # {{{ cache retrieval @@ -2442,6 +2440,9 @@ def preprocess_single_kernel(kernel, callables_table, device=None): def preprocess_program(program, device=None): + 
from loopy.kernel import KernelState + if program.state >= KernelState.PREPROCESSED: + return program if len([clbl for clbl in six.itervalues(program.callables_table) if isinstance(clbl, CallableKernel)]) == 1: @@ -2452,10 +2453,7 @@ def preprocess_program(program, device=None): if not program.entrypoints: raise LoopyError("Translation unit did not receive any entrypoints") - from loopy.kernel import KernelState - - if program.state < KernelState.CALLS_RESOLVED: - program = program.with_resolved_callables() + program = program.with_resolved_callables() if device is not None: # FIXME: Time to remove this? (Git blame shows 5 years ago) diff --git a/loopy/program.py b/loopy/program.py index f5118dd1b..0a20851da 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -272,9 +272,12 @@ class Program(ImmutableRecord): from loopy.library.function import get_loopy_callables from loopy.kernel import KernelState - known_callables = self.target.get_device_ast_builder().known_callables + if self.state >= KernelState.CALLS_RESOLVED: + return self + + known_callables = self.callables_table + known_callables.update(self.target.get_device_ast_builder().known_callables) known_callables.update(get_loopy_callables()) - known_callables.update(self.callables_table) # update the known callables from the target. callables_table = dict((e, self.callables_table[e]) for e in self.entrypoints) -- GitLab From 54f0439244ab56148c7dbe24aa8d455d9ede5823 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 24 Oct 2019 23:58:36 -0500 Subject: [PATCH 652/916] changes in tests to accomodate minor interfacial changes --- test/test_loopy.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 6a780eaab..42a2aa890 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2304,7 +2304,7 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): def test_multi_argument_reduction_type_inference(): - from loopy.type_inference import TypeInferenceMapper + from loopy.type_inference import TypeReader from loopy.library.reduction import SegmentedSumReductionOperation from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() @@ -2323,7 +2323,7 @@ def test_multi_argument_reduction_type_inference(): allow_simultaneous=True), allow_simultaneous=True) - t_inf_mapper = TypeInferenceMapper(prog["loopy_kernel"], + t_inf_mapper = TypeReader(prog["loopy_kernel"], prog.callables_table) assert ( @@ -2368,7 +2368,8 @@ def test_global_barrier_order_finding(): ("yoink", "top"), ("postloop", "yoink"), ("zzzv", "postloop")): - assert lp.find_most_recent_global_barrier(prog["loopy_kernel"], insn) == barrier + assert lp.find_most_recent_global_barrier(prog["loopy_kernel"], + insn) == barrier def test_global_barrier_error_if_unordered(): @@ -2577,12 +2578,14 @@ def test_preamble_with_separate_temporaries(ctx_factory): def test_arg_inference_for_predicates(): - knl = lp.make_kernel("{[i]: 0 <= i < 10}", + prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ if incr[i] a = a + 1 end - """) + """, name="loopy_kernel") + + knl = prog["loopy_kernel"] assert "incr" in knl.arg_dict assert knl.arg_dict["incr"].shape == (10,) -- GitLab From c19a0c125c4ea5a8cb9dc7dd796404b017163c30 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 25 Oct 2019 01:12:21 -0500 Subject: [PATCH 653/916] revamp the code handling mangler callables --- loopy/type_inference.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/loopy/type_inference.py 
b/loopy/type_inference.py index e56a0f2af..939f34087 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -522,8 +522,7 @@ class TypeInferenceMapper(CombineMapper): break if mangle_result is not None: - from loopy.kernel.function_interface import (ManglerCallable, - ValueArgDescriptor) + from loopy.kernel.function_interface import ManglerCallable # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) @@ -531,21 +530,16 @@ class TypeInferenceMapper(CombineMapper): arg_id_to_dtype.update(dict((-i-1, dtype.with_target(self.kernel.target)) for i, dtype in enumerate( mangle_result.result_dtypes))) - arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in - enumerate(mangle_result.arg_dtypes)) - res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in - enumerate(mangle_result.result_dtypes)) - arg_id_to_descr = dict(arg_descrs+res_descrs) # creating the ManglerCallable object corresponding to the # function. in_knl_callable = ManglerCallable( identifier, function_mangler, arg_id_to_dtype, - arg_id_to_descr, mangle_result.target_name) + name_in_target=mangle_result.target_name) # FIXME: we have not tested how it works with mangler callable # yet. - self.callables_table, new_function_id = ( - self.callables_table.with_added_callable( + self.clbl_inf_ctx, new_function_id = ( + self.clbl_inf_ctx.with_callable( expr.function, in_knl_callable)) if isinstance(expr, Call): -- GitLab From 2044bfebcecd0295ab44c8098ff96950d9bda7ee Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 25 Oct 2019 01:23:58 -0500 Subject: [PATCH 654/916] dict.items() -> six.iteritems(dict) spree --- loopy/codegen/__init__.py | 2 +- loopy/kernel/creation.py | 4 ++-- loopy/kernel/function_interface.py | 17 +++++++++-------- loopy/library/function.py | 3 ++- loopy/preprocess.py | 2 +- loopy/program.py | 4 ++-- 6 files changed, 17 insertions(+), 15 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 3a3b88de5..dadc2222e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -666,7 +666,7 @@ def generate_code_v2(program): callee_fdecls = [] implemented_data_infos = [] - for func_id, in_knl_callable in program.callables_table.items(): + for func_id, in_knl_callable in six.iteritems(program.callables_table): if isinstance(in_knl_callable, CallableKernel): #FIXME: # 1. 
Diverge the kernels which are both entrypoint and callees at this diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 591a73483..5ab1aa488 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1975,7 +1975,7 @@ class SliceToInameReplacer(IdentityMapper): set=list(sar_bounds.keys())) from loopy.symbolic import DependencyMapper args_as_params_for_domains = set() - for _, (start, stop, step) in sar_bounds.items(): + for _, (start, stop, step) in six.iteritems(sar_bounds): args_as_params_for_domains |= DependencyMapper()(start) args_as_params_for_domains |= DependencyMapper()(stop) args_as_params_for_domains |= DependencyMapper()(step) @@ -1987,7 +1987,7 @@ class SliceToInameReplacer(IdentityMapper): iname_set = isl.BasicSet.universe(space) from loopy.isl_helpers import make_slab - for iname, (start, stop, step) in sar_bounds.items(): + for iname, (start, stop, step) in six.iteritems(sar_bounds): iname_set = iname_set & make_slab(space, iname, start, stop, step) subarray_ref_domains.append(iname_set) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index dfafe3c94..8809ac61a 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -22,7 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - +import six from six.moves import zip from pytools import ImmutableRecord @@ -393,7 +393,7 @@ class InKernelCallable(ImmutableRecord): new_arg_id_to_dtype = None if self.arg_id_to_dtype is not None: new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, - dtype in self.arg_id_to_dtype.items()) + dtype in six.iteritems(self.arg_id_to_dtype)) return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) @@ -679,7 +679,7 @@ class CallableKernel(InKernelCallable): callables_table)) new_arg_id_to_dtype = {} - for pos, kw in pos_to_kw.items(): + for pos, kw in six.iteritems(pos_to_kw): new_arg_id_to_dtype[kw] = specialized_kernel.arg_dict[kw].dtype new_arg_id_to_dtype[pos] = specialized_kernel.arg_dict[kw].dtype @@ -730,7 +730,7 @@ class CallableKernel(InKernelCallable): subst_mapper = SubstitutionMapper(subst_func) arg_id_to_descr = dict((arg_id, descr.map_expr(subst_mapper)) for - arg_id, descr in arg_id_to_descr.items()) + arg_id, descr in six.iteritems(arg_id_to_descr)) # }}} @@ -746,7 +746,7 @@ class CallableKernel(InKernelCallable): new_args = self.subkernel.args[:] kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - for arg_id, descr in arg_id_to_descr.items(): + for arg_id, descr in six.iteritems(arg_id_to_descr): if isinstance(arg_id, int): arg_id = pos_to_kw[arg_id] assert isinstance(arg_id, str) @@ -798,7 +798,8 @@ class CallableKernel(InKernelCallable): if assumptions: args_added_knl = assume(args_added_knl, ' and '.join([ - '{0}={1}'.format(key, val) for key, val in assumptions.items()])) + '{0}={1}'.format(key, val) for key, val in + six.iteritems(assumptions)])) return ( self.copy( @@ -812,7 +813,7 @@ class CallableKernel(InKernelCallable): arg_id_to_descr = {} - for pos, kw in pos_to_kw.items(): + for pos, kw in six.iteritems(pos_to_kw): arg = self.subkernel.arg_dict[kw] arg_id_to_descr[pos] = ArrayArgDescriptor( shape=arg.shape, @@ -931,7 +932,7 @@ class ManglerCallable(ScalarCallable): def with_types(self, arg_id_to_dtype, kernel, callables_table): if self.arg_id_to_dtype is not None: # specializing an already specialized function. 
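The mechanical change applied throughout this commit is the usual six compatibility idiom; as a standalone sketch on a throwaway dict:

    import six

    d = {"a": 1, "b": 2}
    for key, val in six.iteritems(d):  # lazy iteration on both Python 2 and 3
        print(key, val)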
- for arg_id, dtype in arg_id_to_dtype.items(): + for arg_id, dtype in six.iteritems(arg_id_to_dtype): # only checking for the ones which have been provided # if does not match, returns an error. if self.arg_id_to_dtype[arg_id] != arg_id_to_dtype[arg_id]: diff --git a/loopy/library/function.py b/loopy/library/function.py index a22ed3d7b..607ebb316 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -22,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six from loopy.kernel.function_interface import ScalarCallable from loopy.diagnostic import LoopyError @@ -49,7 +50,7 @@ class MakeTupleCallable(ScalarCallable): class IndexOfCallable(ScalarCallable): def with_types(self, arg_id_to_dtype, kernel, callables_table): new_arg_id_to_dtype = dict((i, dtype) for i, dtype in - arg_id_to_dtype.items() if dtype is not None) + six.iteritems(arg_id_to_dtype) if dtype is not None) new_arg_id_to_dtype[-1] = kernel.index_dtype return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 264a49803..b47b9e1fa 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2490,7 +2490,7 @@ def preprocess_program(program, device=None): # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects new_callables = {} - for func_id, in_knl_callable in program.callables_table.items(): + for func_id, in_knl_callable in six.iteritems(program.callables_table): if isinstance(in_knl_callable, CallableKernel): new_subkernel = preprocess_single_kernel( in_knl_callable.subkernel, program.callables_table, diff --git a/loopy/program.py b/loopy/program.py index 0a20851da..2a4a548e5 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -379,7 +379,7 @@ class Program(ImmutableRecord): return "\n".join( strify_callable(clbl) - for name, clbl in self.callables_table.items()) + for name, clbl in six.iteritems(self.callables_table)) # }}} @@ -545,7 +545,7 @@ class CallablesInferenceContext(ImmutableRecord): if in_kernel_callable in self.callables.values(): # the callable already exists, hence return the function # identifier corresponding to that callable. - for func_id, in_knl_callable in self.callables.items(): + for func_id, in_knl_callable in six.iteritems(self.callables): if in_knl_callable == in_kernel_callable: history[func_id] = function.name if isinstance(func_id, str): -- GitLab From 8ebf1f640dd058295654e55b6e4c700b18b96ccb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 25 Oct 2019 03:32:36 -0500 Subject: [PATCH 655/916] make fortran return a program --- loopy/frontend/fortran/__init__.py | 38 ++++++++++------------------ loopy/frontend/fortran/translator.py | 1 - 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index bc360b996..aaa5962b5 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -22,6 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six import logging logger = logging.getLogger(__name__) @@ -296,12 +297,7 @@ def _add_assignees_to_calls(knl, all_kernels): def parse_fortran(source, filename="", free_form=None, strict=None, - seq_dependencies=None, auto_dependencies=None, target=None, - return_list_of_knls=False): - """ - :returns: an instance of :class:`list` of :class:`loopy.LoopKernel`s if - *return_list_of_knls* is True else a :class:`loopy.Program`. 
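With the list return path removed, parse_fortran always hands back a translation unit; a short sketch of the caller side (the kernel name "fill" is illustrative, taken from the tests later in this series):

    prog = lp.parse_fortran(fortran_src)  # always a Program now
    fill = prog["fill"]                   # look up an individual kernel by name
    print(fill.all_inames())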
- """ + seq_dependencies=None, auto_dependencies=None, target=None): parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) @@ -342,25 +338,17 @@ def parse_fortran(source, filename="", free_form=None, strict=None, kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) - if return_list_of_knls: - return kernels - - kernels = [_add_assignees_to_calls(knl, kernels) for knl in kernels] - - from loopy.kernel.tools import identify_root_kernel - from loopy.program import make_program - from loopy.transform.callable import register_callable_kernel - - root_knl_name = identify_root_kernel(kernels) - root_knl = [knl for knl in kernels if knl.name == - root_knl_name][0].copy(is_called_from_host=True) - callee_kernels = [knl for knl in kernels if knl.name != root_knl_name] - prog = make_program(root_knl) - for callee_knl in callee_kernels: - #FIXME: This would need some sort of traversal to be valid - # for all cases - # THIS IS A VERY IMPORTANT FIXME!! - prog = register_callable_kernel(prog, callee_knl) + from loopy.transform.callable import merge + prog = merge(kernels) + all_kernels = [clbl.subkernel for clbl in + six.itervalues(prog.callables_table)] + + for knl in all_kernels: + prog.with_kernel(_add_assignees_to_calls(knl, all_kernels)) + + if len(all_kernels) == 1: + # guesssing in the case of only one function + prog = prog.with_entrypoints(all_kernels[0].name) parse_plog.done() diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 949a3d4cc..caa8fa681 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -763,7 +763,6 @@ class F2LoopyTranslator(FTreeWalkerBase): arg_name, dtype=sub.get_type(arg_name), shape=sub.get_loopy_shape(arg_name), - is_output=False, )) else: kernel_data.append( -- GitLab From 57c4f7e29fd0939ac45ecffc8cf070550b7de462 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 25 Oct 2019 03:33:21 -0500 Subject: [PATCH 656/916] do not consider deps with auto/None --- loopy/kernel/function_interface.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 8809ac61a..5ed292bb2 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -117,7 +117,9 @@ class ArrayArgDescriptor(ImmutableRecord): return self.copy(shape=new_shape, dim_tags=new_dim_tags) def depends_on(self): - result = DependencyMapper(composite_leaves=False)(self.shape) | ( + from loopy.kernel.data import auto + result = DependencyMapper(composite_leaves=False)([lngth for lngth in + self.shape if lngth not in [None, auto]]) | ( frozenset().union(*(dim_tag.depends_on() for dim_tag in self.dim_tags))) return frozenset(var.name for var in result) -- GitLab From eb588fb86a5c1d9ce2997422479ce0c60474a2bf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 25 Oct 2019 03:33:42 -0500 Subject: [PATCH 657/916] removes unnecessary code --- loopy/program.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 2a4a548e5..5c79edec7 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -333,17 +333,6 @@ class Program(ImmutableRecord): def __call__(self, *args, **kwargs): entrypoint = kwargs.get('entrypoint', None) - if self.entrypoints is None: - if len([clbl for clbl in self.callables_table.values() if - isinstance(clbl, CallableKernel)]) == 1: - #FIXME: in place update, can we do any better? 
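With that fallback gone, the entrypoint is expected to be established beforehand; roughly (argument and variable names illustrative):

    prog = prog.with_entrypoints("loopy_kernel")
    evt, (out,) = prog(queue, a=a_dev)  # or pass entrypoint="loopy_kernel" here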
- self.entrypoints = frozenset([clbl.subkernel.name for - clbl in self.callables_table.values() if isinstance(clbl, - CallableKernel)]) - else: - raise LoopyError("entrypoint attribute unset. Use" - " 'with_entrypoints' before calling.") - if entrypoint is None: # did not receive an entrypoint for the program to execute if len(self.entrypoints) == 1: -- GitLab From aaeffe41454b82169fedd37b9e50b46928b4181f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 25 Oct 2019 03:34:45 -0500 Subject: [PATCH 658/916] changes interface to some transforms, as iterating through all callalbles may not be the best idea --- loopy/transform/buffer.py | 50 +++++------- loopy/transform/fusion.py | 150 +++++++++++----------------------- loopy/transform/iname.py | 1 - loopy/transform/precompute.py | 44 ++++------ loopy/transform/subst.py | 13 ++- 5 files changed, 93 insertions(+), 165 deletions(-) diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 6849e40c3..0121fb49c 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -23,6 +23,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six + from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) from loopy.symbolic import (get_dependencies, @@ -33,9 +35,9 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError -from loopy.program import Program from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import ScalarCallable, CallableKernel +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel from pymbolic import var @@ -133,10 +135,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array_for_single_kernel(kernel, callables_table, var_name, - buffer_inames, init_expression=None, store_expression=None, - within=None, default_tag="l.auto", temporary_scope=None, - temporary_is_local=None, fetch_bounding_box=False): +def buffer_array(kernel, var_name, buffer_inames, init_expression=None, + store_expression=None, within=None, default_tag="l.auto", + temporary_scope=None, temporary_is_local=None, + fetch_bounding_box=False, callables_table=None): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -172,6 +174,18 @@ def buffer_array_for_single_kernel(kernel, callables_table, var_name, fetched. 
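A representative call, mirroring the Fortran matmul test later in this series:

    prog = lp.buffer_array(prog, "c", buffer_inames="i_inner,j_inner",
            init_expression="0", store_expression="base+buffer")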
""" + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in + six.iteritems(kernel.callables_table) if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(buffer_array(kernel[kernel_names[0]], + var_name, buffer_inames, init_expression, store_expression, within, + default_tag, temporary_scope, temporary_is_local, + fetch_bounding_box, kernel.callables_table)) + assert isinstance(kernel, LoopKernel) # {{{ unify temporary_scope / temporary_is_local @@ -544,28 +558,4 @@ def buffer_array_for_single_kernel(kernel, callables_table, var_name, return kernel -def buffer_array(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = buffer_array_for_single_kernel( - in_knl_callable.subkernel, program.callables_table, - *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_callables_table = program.callables_table.copy( - resolved_functions=new_resolved_functions) - return program.copy(callables_table=new_callables_table) - # vim: foldmethod=marker diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 45e9c0a06..20b24793e 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -32,8 +32,6 @@ from loopy.diagnostic import LoopyError from pymbolic import var from loopy.kernel import LoopKernel -from loopy.kernel.function_interface import CallableKernel -from loopy.program import rename_resolved_functions_in_a_single_kernel def _apply_renames_in_exprs(kernel, var_renames): @@ -291,7 +289,51 @@ def _fuse_two_kernels(knla, knlb): # }}} -def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): +def fuse_kernels(kernels, suffixes=None, data_flow=None): + """Return a kernel that performs all the operations in all entries + of *kernels*. + + :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. + :arg suffixes: If given, must be a list of strings of a length matching + that of *kernels*. This will be used to disambiguate the names + of temporaries, as described below. + :arg data_flow: A list of data dependencies + ``[(var_name, from_kernel, to_kernel), ...]``. + Based on this, the fuser will create dependencies between all + writers of *var_name* in ``kernels[from_kernel]`` to + readers of *var_name* in ``kernels[to_kernel]``. + *from_kernel* and *to_kernel* are indices into *kernels*. + + The components of the kernels are fused as follows: + + * The resulting kernel will have a domain involving all the inames and + parameters occurring across *kernels*. Inames with matching names + across *kernels* are fused in such a way that they remain a single + iname in the fused kernel. Use :func:`loopy.rename_iname` if this is + not desired. + + * The projection of the domains of each pair of kernels onto their + common subset of inames must match in order for fusion to + succeed. + + * Assumptions are fused by taking their conjunction. + + * If kernel arguments with matching names are encountered across + *kernels*, their declarations must match in order for fusion to + succeed. 
+ + * Temporaries are automatically renamed to remain uniquely associated + with each instruction stream. + + * The resulting kernel will contain all instructions from each entry + of *kernels*. Clashing instruction IDs will be renamed to ensure + uniqueness. + + .. versionchanged:: 2016.2 + + *data_flow* was added in version 2016.2 + """ + assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) @@ -371,108 +413,8 @@ def fuse_loop_kernels(kernels, suffixes=None, data_flow=None): # }}} - return result - - -def fuse_kernels(programs, suffixes=None, data_flow=None): - """Return a kernel that performs all the operations in all entries - of *kernels*. - - :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused. - :arg suffixes: If given, must be a list of strings of a length matching - that of *kernels*. This will be used to disambiguate the names - of temporaries, as described below. - :arg data_flow: A list of data dependencies - ``[(var_name, from_kernel, to_kernel), ...]``. - Based on this, the fuser will create dependencies between all - writers of *var_name* in ``kernels[from_kernel]`` to - readers of *var_name* in ``kernels[to_kernel]``. - *from_kernel* and *to_kernel* are indices into *kernels*. - - The components of the kernels are fused as follows: - - * The resulting kernel will have a domain involving all the inames - and parameters occurring across *kernels*. - Inames with matching names across *kernels* are fused in such a way - that they remain a single iname in the fused kernel. - Use :func:`loopy.rename_iname` if this is not desired. - - * The projection of the domains of each pair of kernels onto their - common subset of inames must match in order for fusion to - succeed. - - * Assumptions are fused by taking their conjunction. - - * If kernel arguments with matching names are encountered across - *kernels*, their declarations must match in order for fusion to - succeed. - - * Temporaries are automatically renamed to remain uniquely associated - with each instruction stream. - - * The resulting kernel will contain all instructions from each entry - of *kernels*. Clashing instruction IDs will be renamed to ensure - uniqueness. - - .. versionchanged:: 2016.2 - - *data_flow* was added in version 2016.2 - """ - from loopy.program import make_program + return make_program(result).with_entrypoints(result.name) - programs = [make_program(knl) if isinstance(knl, LoopKernel) else knl for - knl in programs] - - # all the resolved functions in programs must be registered in - # main_callables_table - main_prog_callables_info = ( - programs[0].callables_table) - old_root_kernel_callable = ( - programs[0].callables_table[programs[0].name]) - kernels = [programs[0].root_kernel] - - # removing the callable collisions that maybe present - for prog in programs[1:]: - root_kernel = prog.root_kernel - renames_needed = {} - for old_func_id, in_knl_callable in prog.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - # Fusing programs with multiple callable kernels is tough. - # Reason: Need to first figure out the order in which the - # callable kernels must be resolved into - # main_callables_table, because of renaming is - # needed to be done in the callable kernels before registering. - # Hence disabling it until required. 
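The replacement convention is to fuse plain kernels and get a program back; a sketch following the updated test_fuse_kernels (source variable names are illustrative):

    xknl = lp.parse_fortran(xderiv_src)["xderiv"]
    yknl = lp.parse_fortran(yderiv_src)["yderiv"]
    prog = lp.fuse_kernels((xknl, yknl), data_flow=[("result", 0, 1)])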
- if in_knl_callable.subkernel.name != prog.name: - raise LoopyError("fuse_kernels cannot fuse programs with " - "multiple callable kernels.") - - # root kernel are dealt at the end after performing all the - # renaming. - continue - main_prog_callables_info, new_func_id = ( - main_prog_callables_info.with_added_callable(var(old_func_id), - in_knl_callable)) - - if old_func_id != new_func_id: - renames_needed[old_func_id] = new_func_id - - if renames_needed: - root_kernel = rename_resolved_functions_in_a_single_kernel( - root_kernel, renames_needed) - - kernels.append(root_kernel) - - new_root_kernel = fuse_loop_kernels(kernels, suffixes, data_flow) - new_root_kernel_callable = old_root_kernel_callable.copy( - subkernel=new_root_kernel.copy(name=programs[0].name)) - - # TODO: change the name of the final root kernel. - main_prog_callables_info, _ = main_prog_callables_info.with_added_callable( - var(programs[0].name), new_root_kernel_callable) - - return programs[0].copy( - callables_table=main_prog_callables_info) # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index c2e268302..50a6a505d 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -95,7 +95,6 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) -@iterate_over_kernels_if_given_program def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. If the diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index acc2496ac..b322c3b28 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -31,6 +31,8 @@ from loopy.symbolic import (get_dependencies, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel import numpy as np from pymbolic import var @@ -38,9 +40,6 @@ from pymbolic import var from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) -from loopy.program import Program -from loopy.kernel.function_interface import CallableKernel, ScalarCallable - class RuleAccessDescriptor(AccessDescriptor): __slots__ = ["args", "expansion_stack"] @@ -261,7 +260,7 @@ class _not_provided(object): # noqa: N801 pass -def precompute_for_single_kernel(kernel, callables_table, subst_use, +def precompute(kernel, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, @@ -273,6 +272,7 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, fetch_bounding_box=False, temporary_address_space=None, compute_insn_id=None, + callables_table=None, **kwargs): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. A precomputation needs two @@ -358,6 +358,18 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are eliminated. 
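In practice the substitution rule is created with extract_subst and then materialized by precompute; a two-line sketch lifted from the matmul test in this series:

    prog = lp.extract_subst(prog, "a_acc", "a[i1,i2]", parameters="i1, i2")
    prog = lp.precompute(prog, "a_acc", "k_inner,i_inner", default_tag="l.auto")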
""" + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in + six.iteritems(kernel.callables_table) if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(precompute(kernel[kernel_names[0]], + subst_use, sweep_inames, within, storage_axes, temporary_name, + precompute_inames, precompute_outer_inames, storage_axis_to_tag, + default_tag, dtype, fetch_bounding_box, temporary_address_space, + compute_insn_id, kernel.callables_table, **kwargs)) # {{{ unify temporary_address_space / temporary_scope @@ -1052,28 +1064,4 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, return kernel -def precompute(program, *args, **kwargs): - assert isinstance(program, Program) - - new_resolved_functions = {} - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = precompute_for_single_kernel( - in_knl_callable.subkernel, program.callables_table, - *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) - - new_resolved_functions[func_id] = in_knl_callable - - new_callables_table = program.callables_table.copy( - resolved_functions=new_resolved_functions) - return program.copy(callables_table=new_callables_table) - # vim: foldmethod=marker diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 26252de86..09e2b268c 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -33,7 +33,7 @@ from pymbolic.mapper.substitutor import make_subst_func from pytools import ImmutableRecord from pymbolic import var -from loopy.program import iterate_over_kernels_if_given_program +from loopy.program import iterate_over_kernels_if_given_program, Program from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging @@ -46,7 +46,6 @@ class ExprDescriptor(ImmutableRecord): # {{{ extract_subst -@iterate_over_kernels_if_given_program def extract_subst(kernel, subst_name, template, parameters=()): """ :arg subst_name: The name of the substitution rule to be created. @@ -59,6 +58,16 @@ def extract_subst(kernel, subst_name, template, parameters=()): unifications. 
""" + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in + six.iteritems(kernel.callables_table) if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(extract_subst(kernel[kernel_names[0]], + subst_name, template, parameters)) + if isinstance(template, str): from pymbolic import parse template = parse(template) -- GitLab From f7143d3e3d1fa49a61e72a7c3920a572412500a9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 25 Oct 2019 03:37:55 -0500 Subject: [PATCH 659/916] minor changes to tests to adapt to the new interface --- test/test_fortran.py | 55 ++++++++++++++++++++++---------------------- test/test_loopy.py | 3 ++- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index 1ab28409b..92e3c2a82 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -162,7 +162,7 @@ def test_fill(ctx_factory): knl = lp.parse_transformed_fortran(fortran_src, pre_transform_code="split_amount = 128") - assert "i_inner" in knl.root_kernel.all_inames() + assert "i_inner" in knl["fill"].all_inames() ctx = ctx_factory() @@ -291,9 +291,9 @@ def test_assignment_to_subst_indices(ctx_factory): ref_knl = knl - assert "a" in knl.root_kernel.temporary_variables + assert "a" in knl['fill'].temporary_variables knl = lp.assignment_to_subst(knl, "a") - assert "a" not in knl.root_kernel.temporary_variables + assert "a" not in knl['fill'].temporary_variables ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -384,31 +384,31 @@ def test_matmul(ctx_factory, buffer_inames): end subroutine """ - knl = lp.parse_fortran(fortran_src) + prog = lp.parse_fortran(fortran_src) - assert len(knl.root_kernel.domains) == 1 + assert len(prog['dgemm'].domains) == 1 - ref_knl = knl + ref_prog = prog - knl = lp.split_iname(knl, "i", 16, + prog = lp.split_iname(prog, "i", 16, outer_tag="g.0", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 8, + prog = lp.split_iname(prog, "j", 8, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_iname(knl, "k", 32) - knl = lp.assume(knl, "n mod 32 = 0") - knl = lp.assume(knl, "m mod 32 = 0") - knl = lp.assume(knl, "ell mod 16 = 0") + prog = lp.split_iname(prog, "k", 32) + prog = lp.assume(prog, "n mod 32 = 0") + prog = lp.assume(prog, "m mod 32 = 0") + prog = lp.assume(prog, "ell mod 16 = 0") - knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") - knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto") + prog = lp.extract_subst(prog, "a_acc", "a[i1,i2]", parameters="i1, i2") + prog = lp.extract_subst(prog, "b_acc", "b[i1,i2]", parameters="i1, i2") + prog = lp.precompute(prog, "a_acc", "k_inner,i_inner", default_tag="l.auto") + prog = lp.precompute(prog, "b_acc", "j_inner,k_inner", default_tag="l.auto") - knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, + prog = lp.buffer_array(prog, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) + lp.auto_test_vs_ref(ref_prog, ctx, prog, parameters=dict(n=128, m=128, ell=128)) @pytest.mark.xfail @@ -498,10 +498,11 @@ def test_fuse_kernels(ctx_factory): fortran_template.format( inner=(xd_line + "\n" + yd_line), name="xyderiv")) - knl = lp.fuse_kernels((xderiv, yderiv), 
data_flow=[("result", 0, 1)]) - knl = lp.prioritize_loops(knl, "e,i,j,k") + knl = lp.fuse_kernels((xderiv["xderiv"], yderiv["yderiv"]), + data_flow=[("result", 0, 1)]) + knl = knl.with_kernel(lp.prioritize_loops(knl["xderiv_and_yderiv"], "e,i,j,k")) - assert len(knl.root_kernel.temporary_variables) == 2 + assert len(knl["xderiv_and_yderiv"].temporary_variables) == 2 ctx = ctx_factory() lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) @@ -533,11 +534,9 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! # FIXME: correct this after the "Module" is done. - ! # prg = lp.parse_fortran(SOURCE) - ! # fill = prg["fill"] - ! # twice = prg["twice"] - ! fill, twice = lp.parse_fortran(SOURCE, return_list_of_knls=True) + ! prg = lp.parse_fortran(SOURCE) + ! fill = prg["fill"] + ! twice = prg["twice"] ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) ! RESULT = knl @@ -567,7 +566,7 @@ def test_precompute_some_exist(ctx_factory): knl = lp.parse_fortran(fortran_src) - assert len(knl.root_kernel.domains) == 1 + assert len(knl['dgemm'].domains) == 1 knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.1") @@ -614,7 +613,7 @@ def test_fortran_subroutines(): call twice(n, a(i, 1:n)) end subroutine """ - prg = lp.parse_fortran(fortran_src) + prg = lp.parse_fortran(fortran_src).with_entrypoints("twice_cross") print(lp.generate_code_v2(prg).device_code()) diff --git a/test/test_loopy.py b/test/test_loopy.py index 42a2aa890..420af56ce 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1508,7 +1508,8 @@ def test_finite_difference_expr_subst(ctx_factory): lp.GlobalArg("u", shape="n+2"), ]) - fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl], + fused_knl = lp.fuse_kernels( + [fin_diff_knl["loopy_kernel"], flux_knl["loopy_kernel"]], data_flow=[ ("f", 1, 0) ]) -- GitLab From 244f8d40ff0566e8f11cfac419486f719077085f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 26 Oct 2019 12:51:18 -0500 Subject: [PATCH 660/916] comes back to the earlier interface of iterating over kernels when supplied with a program --- loopy/transform/buffer.py | 30 +++++++++++++++++++++++++----- loopy/transform/iname.py | 4 ++++ loopy/transform/precompute.py | 23 ++++++++++++++++++++--- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 0121fb49c..96e7b8d2c 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -37,7 +37,7 @@ from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.program import Program -from loopy.kernel.function_interface import CallableKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pymbolic import var @@ -135,10 +135,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array(kernel, var_name, buffer_inames, init_expression=None, - store_expression=None, within=None, default_tag="l.auto", - temporary_scope=None, temporary_is_local=None, - fetch_bounding_box=False, callables_table=None): +def buffer_array_for_single_kernel(kernel, callables_table, var_name, + buffer_inames, init_expression=None, store_expression=None, + within=None, default_tag="l.auto", temporary_scope=None, + temporary_is_local=None, fetch_bounding_box=False): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. 
To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -558,4 +558,24 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, return kernel +def buffer_array(program, *args, **kwargs): + assert isinstance(program, Program) + + new_callables = {} + + for func_id, clbl in six.iteritems(program.callables_table): + if isinstance(clbl, CallableKernel): + clbl = clbl.copy( + subkernel=buffer_array_for_single_kernel(clbl.subkernel, + program.callables_table, *args, **kwargs)) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = clbl + + return program.copy(callables_table=new_callables) + + # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 50a6a505d..4093215ba 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -81,6 +81,7 @@ __doc__ = """ # {{{ set loop priority +@iterate_over_kernels_if_given_program def set_loop_priority(kernel, loop_priority): from warnings import warn warn("set_loop_priority is deprecated. Use prioritize_loops instead. " @@ -95,6 +96,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) +@iterate_over_kernels_if_given_program def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. If the @@ -1053,6 +1055,8 @@ def get_iname_duplication_options(knl, use_boostable_into=False): isinstance(clbl, CallableKernel)]) == 1: knl = knl[list(knl.entrypoints)[0]] + assert isinstance(knl, LoopKernel) + from loopy.kernel.data import ConcurrentTag concurrent_inames = set( diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index b322c3b28..87696a36f 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -32,7 +32,7 @@ from loopy.symbolic import (get_dependencies, from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func from loopy.program import Program -from loopy.kernel.function_interface import CallableKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import numpy as np from pymbolic import var @@ -260,7 +260,7 @@ class _not_provided(object): # noqa: N801 pass -def precompute(kernel, subst_use, +def precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames=[], within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, @@ -272,7 +272,6 @@ def precompute(kernel, subst_use, fetch_bounding_box=False, temporary_address_space=None, compute_insn_id=None, - callables_table=None, **kwargs): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. 
A precomputation needs two @@ -1064,4 +1063,22 @@ def precompute(kernel, subst_use, return kernel +def precompute(program, *args, **kwargs): + assert isinstance(program, Program) + new_callables = {} + + for func_id, clbl in six.iteritems(program.callables_table): + if isinstance(clbl, CallableKernel): + knl = precompute_for_single_kernel(clbl.subkernel, + program.callables_table, *args, **kwargs) + clbl = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = clbl + + return program.copy(callables_table=new_callables) + # vim: foldmethod=marker -- GitLab From fa58bf078e16ce2cfc62019c544ff2591ee358d9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 26 Oct 2019 12:51:57 -0500 Subject: [PATCH 661/916] restrict generate_code() for multiple entrypoints --- loopy/codegen/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index dadc2222e..48d4761bd 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -720,8 +720,11 @@ def generate_code(kernel, device=None): if len(codegen_result.device_programs) > 1: raise LoopyError("kernel passed to generate_code yielded multiple " "device programs. Use generate_code_v2.") + if len(codegen_result.host_programs) > 1: + raise LoopyError("kernel passed to generate_code yielded multiple " + "host programs. Use generate_code_v2.") - return codegen_result.device_code(), codegen_result.implemented_data_info + return codegen_result.device_code(), codegen_result.implemented_data_infos[0] # }}} -- GitLab From c5117847a25c0bc046772388d3b326cf32064019 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 26 Oct 2019 13:03:49 -0500 Subject: [PATCH 662/916] fixes padding for multi-entrypoint --- loopy/transform/padding.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 2ee3bd9b1..073e1a74c 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -28,8 +28,10 @@ THE SOFTWARE. 
from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext -from loopy.program import iterate_over_kernels_if_given_program +from loopy.program import iterate_over_kernels_if_given_program, Program from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from loopy.diagnostic import LoopyError class ArrayAxisSplitHelper(RuleAwareIdentityMapper): @@ -410,6 +412,15 @@ def split_array_axis(kernel, array_names, axis_nr, count, # {{{ find_padding_multiple def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1): + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in six.iteritems(kernel.callables_table) + if isinstance(clbl, CallableKernel)] + if len(kernel_names) > 1: + raise LoopyError() + return find_padding_multiple(kernel[kernel_names[0]], variable, axis, + align_bytes, allowed_waste) + assert isinstance(kernel, LoopKernel) + arg = kernel.arg_dict[variable] if arg.dim_tags is None: -- GitLab From a8e6e94f0d53d938e202a264605af879d55a4649 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 26 Oct 2019 16:57:46 -0500 Subject: [PATCH 663/916] gets rid of return_list_of_kernels --- test/test_numa_diff.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 55a2d2e11..e9d0acd2e 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -59,11 +59,10 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa source = source.replace("datafloat", "real*4") - hsv_r, hsv_s = [ - knl for knl in lp.parse_fortran(source, filename, - seq_dependencies=False, return_list_of_knls=True) - if "KernelR" in knl.name or "KernelS" in knl.name - ] + program = lp.parse_fortran(source, filename, seq_dependencies=False) + + hsv_r, hsv_s = program["strongVolumeKernelR"], program["strongVolumeKernelS"] + hsv_r = lp.tag_instructions(hsv_r, "rknl") hsv_s = lp.tag_instructions(hsv_s, "sknl") hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"]) -- GitLab From df53dfbd0a78b5d9adbea2b33102a33401048b91 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 26 Oct 2019 16:59:14 -0500 Subject: [PATCH 664/916] revamps statistics post root_kernel removal --- loopy/statistics.py | 70 +++++++++++---- test/test_statistics.py | 188 ++++++++++++++++++++-------------------- 2 files changed, 146 insertions(+), 112 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 86f39e55b..c8670e19f 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -39,8 +39,7 @@ from loopy.diagnostic import warn_with_kernel, LoopyError from loopy.symbolic import CoefficientCollector from pytools import ImmutableRecord, memoize_method from loopy.kernel.function_interface import CallableKernel -from loopy.kernel import LoopKernel -from loopy.program import make_program +from loopy.program import Program __doc__ = """ @@ -812,8 +811,8 @@ class CounterBase(CombineMapper): self.callables_table = callables_table self.kernel_rec = kernel_rec - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl, callables_table) + from loopy.type_inference import TypeReader + self.type_inf = TypeReader(knl, callables_table) self.zero = get_kernel_zero_pwqpolynomial(self.knl) self.one = self.zero + 1 @@ -1382,6 +1381,13 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): + if isinstance(kernel, Program): + 
kernel_names = [i for i, clbl in six.iteritems(kernel.callables_table) + if isinstance(clbl, CallableKernel)] + if len(kernel_names) > 1: + raise LoopyError() + return count(kernel[kernel_names[0]], set, space) + try: if space is not None: set = set.align_params(space) @@ -1390,7 +1396,7 @@ def count(kernel, set, space=None): except AttributeError: pass - count = isl.PwQPolynomial.zero( + total_count = isl.PwQPolynomial.zero( set.space .drop_dims(dim_type.set, 0, set.dim(dim_type.set)) .add_dims(dim_type.set, 1)) @@ -1452,7 +1458,7 @@ def count(kernel, set, space=None): # }}} if bset_count is not None: - count += bset_count + total_count += bset_count is_subset = bset <= bset_rebuilt is_superset = bset >= bset_rebuilt @@ -1477,7 +1483,7 @@ def count(kernel, set, space=None): "number of integer points in your loop " "domain.") - return add_assumptions_guard(kernel, count) + return add_assumptions_guard(kernel, total_count) def get_unused_hw_axes_factor(knl, callables_table, insn, @@ -1552,7 +1558,6 @@ def count_insn_runs(knl, callables_table, insn, count_redundant_work, return c -@memoize_method def _get_insn_count(knl, callables_table, insn_id, subgroup_size, count_redundant_work, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] @@ -1657,7 +1662,8 @@ def _get_op_map_for_single_kernel(knl, callables_table, def get_op_map(program, numpy_types=True, count_redundant_work=False, - count_within_subscripts=True, subgroup_size=None): + count_within_subscripts=True, subgroup_size=None, + entrypoint=None): """Count the number of operations in a loopy kernel. @@ -1713,8 +1719,13 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, """ - if isinstance(program, LoopKernel): - program = make_program(program) + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") + + entrypoint = list(program.entrypoints)[0] + + assert entrypoint in program.entrypoints from loopy.preprocess import preprocess_program, infer_unknown_types program = preprocess_program(program) @@ -1729,7 +1740,7 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False, DeprecationWarning, stacklevel=2) return _get_op_map_for_single_kernel( - program[program.name], program.callables_table, + program[entrypoint], program.callables_table, count_redundant_work=count_redundant_work, count_within_subscripts=count_within_subscripts, subgroup_size=subgroup_size) @@ -1848,7 +1859,7 @@ def _get_mem_access_map_for_single_kernel(knl, callables_table, def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, - subgroup_size=None): + subgroup_size=None, entrypoint=None): """Count the number of memory accesses in a loopy kernel. 
:arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be @@ -1929,6 +1940,15 @@ def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, # (now use these counts to, e.g., predict performance) """ + + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") + + entrypoint = list(program.entrypoints)[0] + + assert entrypoint in program.entrypoints + from loopy.preprocess import preprocess_program, infer_unknown_types program = preprocess_program(program) @@ -1942,7 +1962,7 @@ def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, DeprecationWarning, stacklevel=2) return _get_mem_access_map_for_single_kernel( - program[program.name], program.callables_table, + program[entrypoint], program.callables_table, count_redundant_work=count_redundant_work, subgroup_size=subgroup_size) @@ -2004,7 +2024,7 @@ def _get_synchronization_map_for_single_kernel(knl, callables_table, return sync_map -def get_synchronization_map(program, subgroup_size=None): +def get_synchronization_map(program, subgroup_size=None, entrypoint=None): """Count the number of synchronization events each work-item encounters in a loopy kernel. @@ -2040,7 +2060,13 @@ def get_synchronization_map(program, subgroup_size=None): # (now use this count to, e.g., predict performance) """ + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") + + entrypoint = list(program.entrypoints)[0] + assert entrypoint in program.entrypoints from loopy.preprocess import preprocess_program, infer_unknown_types program = preprocess_program(program) @@ -2049,7 +2075,7 @@ def get_synchronization_map(program, subgroup_size=None): program = infer_unknown_types(program, expect_completion=True) return _get_synchronization_map_for_single_kernel( - program[program.name], program.callables_table, + program[entrypoint], program.callables_table, subgroup_size=subgroup_size) # }}} @@ -2083,7 +2109,7 @@ def _gather_access_footprints_for_single_kernel(kernel, ignore_uncountable): return write_footprints, read_footprints -def gather_access_footprints(program, ignore_uncountable=False): +def gather_access_footprints(program, ignore_uncountable=False, entrypoint=None): """Return a dictionary mapping ``(var_name, direction)`` to :class:`islpy.Set` instances capturing which indices of each the array *var_name* are read/written (where *direction* is either ``read`` or @@ -2094,6 +2120,14 @@ def gather_access_footprints(program, ignore_uncountable=False): nonlinear indices) """ + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") + + entrypoint = list(program.entrypoints)[0] + + assert entrypoint in program.entrypoints + # FIMXE: works only for one callable kernel till now. 
if len([in_knl_callable for in_knl_callable in program.callables_table.values() if isinstance(in_knl_callable, @@ -2112,7 +2146,7 @@ def gather_access_footprints(program, ignore_uncountable=False): read_footprints = [] write_footprints, read_footprints = _gather_access_footprints_for_single_kernel( - program[program.name], ignore_uncountable) + program[entrypoint], ignore_uncountable) write_footprints = AccessFootprintGatherer.combine(write_footprints) read_footprints = AccessFootprintGatherer.combine(read_footprints) diff --git a/test/test_statistics.py b/test/test_statistics.py index ef5450599..a1ee67a8d 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -67,15 +67,15 @@ def test_op_counter_basic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, "basic")].eval_with_dict( params) - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "basic")].eval_with_dict( params) - f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict( + f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, "basic")].eval_with_dict( params) - f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP, knl.name) + f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP, "basic") ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, "basic") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == f32div == n*m*ell*n_subgroups @@ -102,10 +102,10 @@ def test_op_counter_reduction(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( - params) - f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP, knl.name) - ].eval_with_dict(params) + f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, + "matmul_serial")].eval_with_dict(params) + f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP, + "matmul_serial")].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32add == f32mul == n*m*ell*n_subgroups @@ -138,13 +138,13 @@ def test_op_counter_logic(): m = 256 ell = 128 params = {'n': n, 'm': m, 'ell': ell} - f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict( + f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "logic")].eval_with_dict( params) - f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict( + f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, "logic")].eval_with_dict( params) - f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP, knl.name) + f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP, "logic") ].eval_with_dict(params) - i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, knl.name) + i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP, "logic") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups assert f32mul == n*m*n_subgroups @@ -153,7 +153,7 @@ def test_op_counter_logic(): assert i32add == n*m*n_subgroups -def test_op_counter_specialops(): +def test_op_counter_special_ops(): knl = lp.make_kernel( "{[i,k,j]: 0<=i Date: Sat, 26 Oct 2019 18:47:54 -0500 Subject: [PATCH 665/916] formalizes type reader --- loopy/type_inference.py | 39 
+++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 939f34087..a5436baf4 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -739,6 +739,45 @@ class TypeReader(TypeInferenceMapper): return [] + def map_variable(self, expr): + if expr.name in self.kernel.all_inames(): + return [self.kernel.index_dtype] + + result = self.kernel.mangle_symbol( + self.kernel.target.get_device_ast_builder(), + expr.name) + + if result is not None: + result_dtype, _ = result + return [result_dtype] + + obj = self.new_assignments.get(expr.name) + + if obj is None: + obj = self.kernel.arg_dict.get(expr.name) + + if obj is None: + obj = self.kernel.temporary_variables.get(expr.name) + + if obj is None: + raise TypeInferenceFailure("name not known in type inference: %s" + % expr.name) + + from loopy.kernel.data import TemporaryVariable, KernelArgument + import loopy as lp + if isinstance(obj, (KernelArgument, TemporaryVariable)): + assert obj.dtype is not lp.auto + result = [obj.dtype] + if result[0] is None: + raise DependencyTypeInferenceFailure( + ", ".join(sorted(expr.name))) + else: + return result + + else: + raise RuntimeError("unexpected type inference " + "object type for '%s'" % expr.name) + map_call_with_kwargs = map_call # }}} -- GitLab From 6b01c62b005d37509275d7f68d089717aa60b879 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 26 Oct 2019 19:32:53 -0500 Subject: [PATCH 666/916] accept translation units with single kernels for fusion --- loopy/transform/fusion.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 20b24793e..c9f426dbf 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -32,6 +32,8 @@ from loopy.diagnostic import LoopyError from pymbolic import var from loopy.kernel import LoopKernel +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel def _apply_renames_in_exprs(kernel, var_renames): @@ -333,6 +335,16 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + if all(isinstance(kernel, Program) for kernel in kernels): + new_kernels = [] + for knl in kernels: + kernel_names = [i for i, clbl in + six.iteritems(knl.callables_table) if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + new_kernels.append(knl[kernel_names[0]]) + kernels = new_kernels[:] assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) -- GitLab From 1f5921f244947846243b45ae4ab4ac105d8eb24d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 26 Oct 2019 19:33:18 -0500 Subject: [PATCH 667/916] remove mentions of root_kernel --- test/test_apps.py | 5 ++-- test/test_transform.py | 53 +++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/test/test_apps.py b/test/test_apps.py index a9c3bf2a7..b2a64c808 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -659,9 +659,10 @@ def test_domain_tree_nesting(): TV('num_vals_offset', initializer=num_vals_offset, read_only=True, scope=scopes.PRIVATE), lp.GlobalArg('B', shape=(100, 31), dtype=np.float64), - lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)]) + lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)], + name="nested_domain") - parents_per_domain = knl.root_kernel.parents_per_domain() + parents_per_domain = 
knl["nested_domain"].parents_per_domain() def depth(i): if parents_per_domain[i] is None: diff --git a/test/test_transform.py b/test/test_transform.py index 180c0fa76..f49efbc3f 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -136,7 +136,8 @@ def test_to_batched_temp(ctx_factory): "cnst", dtype=np.float32, shape=(), - scope=lp.temp_var_scope.PRIVATE), '...']) + scope=lp.temp_var_scope.PRIVATE), '...'], + name="test_to_batch") prog = lp.add_and_infer_dtypes(prog, dict(out=np.float32, x=np.float32, a=np.float32)) @@ -151,7 +152,7 @@ def test_to_batched_temp(ctx_factory): bref_prog = lp.to_batched(ref_prog, "nbatches", "out,x") # checking that cnst is not being bathced - assert bprog.root_kernel.temporary_variables['cnst'].shape == () + assert bprog["test_to_batch"].temporary_variables['cnst'].shape == () a = np.random.randn(5, 5) x = np.random.randn(7, 5) @@ -168,10 +169,10 @@ def test_save_temporaries_in_loop(ctx_factory): "{[i, j]: 0 <= i, j < 4}", """ <> a[j] = j {inames=i:j} - """) + """, name="save_temps") prog = lp.save_temporaries_in_loop(prog, 'i', ['a']) - assert prog.root_kernel.temporary_variables['a'].shape == (4, 4) + assert prog["save_temps"].temporary_variables['a'].shape == (4, 4) def test_add_barrier(ctx_factory): @@ -291,7 +292,7 @@ def test_extract_subst(ctx_factory): "{[i]: 0<=itmp5[i] = 0 {id=insn5,groups=g1} tmp5[i] = 1 {id=insn6,conflicts=g1} - """) + """, name="nosync") orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local") orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local") @@ -514,27 +515,27 @@ def test_add_nosync(): prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) assert frozenset() == ( - prog.root_kernel.id_to_insn["insn2"].no_sync_with) + prog["nosync"].id_to_insn["insn2"].no_sync_with) # Dependency present prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") assert frozenset() == ( - prog.root_kernel.id_to_insn["insn3"].no_sync_with) + prog["nosync"].id_to_insn["insn3"].no_sync_with) assert frozenset([("insn3", "local")]) == ( - prog.root_kernel.id_to_insn["insn4"].no_sync_with) + prog["nosync"].id_to_insn["insn4"].no_sync_with) # Bidirectional prog = lp.add_nosync( orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) assert frozenset([("insn4", "local")]) == ( - prog.root_kernel.id_to_insn["insn3"].no_sync_with) + prog["nosync"].id_to_insn["insn3"].no_sync_with) assert frozenset([("insn3", "local")]) == ( - prog.root_kernel.id_to_insn["insn4"].no_sync_with) + prog["nosync"].id_to_insn["insn4"].no_sync_with) # Groups prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") assert frozenset([("insn5", "local")]) == ( - prog.root_kernel.id_to_insn["insn6"].no_sync_with) + prog["nosync"].id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): @@ -543,14 +544,14 @@ def test_uniquify_instruction_ids(): i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) - prog = lp.make_kernel("{[i]: i = 1}", []) - new_root_kernel = prog.root_kernel.copy(instructions=[i1, i2, i3, i4]) - prog = prog.with_root_kernel(new_root_kernel) + prog = lp.make_kernel("{[i]: i = 1}", [], name="lpy_knl") + new_root_kernel = prog["lpy_knl"].copy(instructions=[i1, i2, i3, i4]) + prog = prog.with_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids prog = uniquify_instruction_ids(prog) - insn_ids = set(insn.id for insn in prog.root_kernel.instructions) + insn_ids = set(insn.id for insn in 
prog["lpy_knl"].instructions) assert len(insn_ids) == 4 assert all(isinstance(id, str) for id in insn_ids) @@ -562,11 +563,11 @@ def test_split_iname_only_if_in_within(): """ c[i] = 3*d[i] {id=to_split} a[i] = 2*b[i] {id=not_to_split} - """) + """, name="splitter") prog = lp.split_iname(prog, "i", 4, within='id:to_split') - for insn in prog.root_kernel.instructions: + for insn in prog["splitter"].instructions: if insn.id == 'to_split': assert insn.within_inames == frozenset({'i_outer', 'i_inner'}) if insn.id == 'not_to_split': @@ -590,7 +591,7 @@ def test_nested_substs_in_insns(ctx_factory): prg = lp.expand_subst(ref_prg) assert not any( cknl.subkernel.substitutions - for cknl in six.itervalues(prg.callables_table.resolved_functions)) + for cknl in six.itervalues(prg.callables_table)) lp.auto_test_vs_ref(ref_prg, ctx, prg) -- GitLab From de6a988432cdb0cb40bd2a734959c6a71a96b10d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 26 Oct 2019 19:42:52 -0500 Subject: [PATCH 668/916] minor fix in input to unique name generator --- loopy/codegen/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 48d4761bd..281f0154e 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -590,7 +590,7 @@ def diverge_callee_entrypoints(program): new_callables = {} renames = {} - vng = UniqueNameGenerator(list(six.iterkeys(program.callables_table))) + vng = UniqueNameGenerator(set(six.iterkeys(program.callables_table))) for clbl_id in callable_ids & program.entrypoints: renames[clbl_id] = vng(based_on=clbl_id) -- GitLab From 3cd04890d163b0d08e3696b847057fda7ca78c13 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 27 Oct 2019 12:59:43 -0500 Subject: [PATCH 669/916] target agnostic way of creating a Collection --- loopy/codegen/__init__.py | 4 ++-- loopy/target/__init__.py | 4 ++++ loopy/target/c/__init__.py | 5 +++++ loopy/target/python.py | 22 ++++++++-------------- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 281f0154e..16792219b 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -696,9 +696,9 @@ def generate_code_v2(program): program.target))) # adding the callee fdecls to the device_programs - from cgen import Collection device_programs = ([device_programs[0].copy( - ast=Collection(callee_fdecls+[device_programs[0].ast]))] + + ast=program.target.get_device_ast_builder().ast_module.Collection( + callee_fdecls+[device_programs[0].ast]))] + device_programs[1:]) cgr = CodeGenerationResult( host_programs=host_programs, diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index fa76d4251..91b888c6f 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -170,6 +170,10 @@ class ASTBuilderBase(object): # {{{ code generation guts + @property + def ast_module(self): + raise NotImplementedError() + def get_function_definition(self, codegen_state, codegen_result, schedule_index, function_decl, function_body): raise NotImplementedError diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 04bfbe10a..4ea6feec1 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -802,6 +802,11 @@ class CASTBuilder(ASTBuilderBase): # {{{ code generation guts + @property + def ast_module(self): + import cgen + return cgen + def get_expression_to_code_mapper(self, codegen_state): return self.get_expression_to_c_expression_mapper(codegen_state) diff 
--git a/loopy/target/python.py b/loopy/target/python.py index a72e9c272..78bed2cde 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -33,7 +33,7 @@ from loopy.type_inference import TypeReader from loopy.kernel.data import ValueArg from loopy.diagnostic import LoopyError # noqa from loopy.target import ASTBuilderBase -from genpy import Suite +from genpy import Suite, Collection # {{{ expression to code @@ -139,17 +139,6 @@ class ExpressionToPythonMapper(StringifyMapper): # }}} -# {{{ genpy extensions - -class Collection(Suite): - def generate(self): - for item in self.contents: - for item_line in item.generate(): - yield item_line - -# }}} - - # {{{ ast builder def _numpy_single_arg_function_mangler(kernel, name, arg_dtypes): @@ -178,8 +167,6 @@ class PythonASTBuilderBase(ASTBuilderBase): """A Python host AST builder for integration with PyOpenCL. """ - # {{{ code generation guts - @property def known_callables(self): from loopy.target.c import get_c_callables @@ -193,6 +180,13 @@ class PythonASTBuilderBase(ASTBuilderBase): _base_python_preamble_generator ]) + # {{{ code generation guts + + @property + def ast_module(self): + import genpy + return genpy + def get_function_declaration(self, codegen_state, codegen_result, schedule_index): return None -- GitLab From a1b4ae0137a31775cdd98415cc7641ba6963f7ee Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 27 Oct 2019 13:26:39 -0500 Subject: [PATCH 670/916] minor type inference fixes --- loopy/type_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index a5436baf4..068721a48 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -734,8 +734,6 @@ class TypeReader(TypeInferenceMapper): return [get_return_types_as_tuple(arg_id_to_dtype)] else: return [arg_id_to_dtype[-1]] - else: - raise NotImplementedError() return [] @@ -1123,6 +1121,8 @@ def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" from loopy.kernel.data import auto + program = program.with_resolved_callables() + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, program.entrypoints) -- GitLab From 02b040eeb84862c20a624d49bff1d35d0552c9b1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 27 Oct 2019 13:36:18 -0500 Subject: [PATCH 671/916] remove root_kernel usage from test_domain --- test/test_domain.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/test_domain.py b/test/test_domain.py index dd789d2cd..bc64c086e 100644 --- a/test/test_domain.py +++ b/test/test_domain.py @@ -198,9 +198,10 @@ def test_dependent_loop_bounds_3(ctx_factory): lp.GlobalArg("a", dtype, shape=("n,n"), order="C"), lp.ValueArg("n", np.int32), ], - target=lp.PyOpenCLTarget(ctx.devices[0])) + target=lp.PyOpenCLTarget(ctx.devices[0]), + name="loopy_kernel") - assert knl.root_kernel.parents_per_domain()[1] == 0 + assert knl["loopy_kernel"].parents_per_domain()[1] == 0 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") @@ -267,13 +268,14 @@ def test_independent_multi_domain(ctx_factory): lp.GlobalArg("a", dtype, shape=("n"), order="C"), lp.GlobalArg("b", dtype, shape=("n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + name="loopy_kernel") knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl.root_kernel.parents_per_domain() == 2*[None] + assert 
knl["loopy_kernel"].parents_per_domain() == 2*[None] n = 50 evt, (a, b) = knl(queue, n=n, out_host=True) -- GitLab From da8983fa58f5d3265c4b2fc8ab58b05068965b5a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 27 Oct 2019 18:28:00 -0500 Subject: [PATCH 672/916] passes diff transform tets --- loopy/transform/diff.py | 2 ++ test/test_diff.py | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 54d06605a..1bca61d4b 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -342,6 +342,8 @@ class DifferentiationContext(object): arg.dtype, shape=shape, dim_tags=dim_tags, + is_input=arg.is_input, + is_output=arg.is_output )) elif var_name in self.kernel.temporary_variables: diff --git a/test/test_diff.py b/test/test_diff.py index d001233c0..ef005c70a 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -55,18 +55,20 @@ def test_diff(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - knl = lp.make_function( + knl = lp.make_kernel( """{ [i,j]: 0<=i,j a = 1/(1+sinh(x[i] + y[j])**2) z[i] = sum(j, exp(a * x[j])) - """) + """, name="diff") knl = lp.fix_parameters(knl, n=50) from loopy.transform.diff import diff_kernel - dknl, diff_map = diff_kernel(knl, "z", "x") - dknl = lp.make_program(dknl) + #FIXME Is this the correct interface. Does it make sense to take the entire + #translation unit? + dknl, diff_map = diff_kernel(knl["diff"], "z", "x") + dknl = knl.with_kernel(dknl) dknl = lp.remove_unused_arguments(dknl) dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a") -- GitLab From 82d139d4c2a541981cae9aa07111e8e3455fb85d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Oct 2019 01:23:09 -0500 Subject: [PATCH 673/916] host_programs, implemented_data_infos now OrderedDicts instead of lists --- loopy/codegen/__init__.py | 12 +++++---- loopy/codegen/result.py | 40 +++++++++++++++++++----------- loopy/target/execution.py | 7 ++---- loopy/target/pyopencl_execution.py | 7 ++---- 4 files changed, 36 insertions(+), 30 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 16792219b..d96062226 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -26,10 +26,11 @@ import logging logger = logging.getLogger(__name__) import six +import islpy as isl +from collections import OrderedDict from loopy.diagnostic import LoopyError, warn from pytools import ImmutableRecord -import islpy as isl from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder @@ -660,11 +661,11 @@ def generate_code_v2(program): program = diverge_callee_entrypoints(program) - host_programs = [] + host_programs = OrderedDict() device_programs = [] device_preambles = [] callee_fdecls = [] - implemented_data_infos = [] + implemented_data_infos = OrderedDict() for func_id, in_knl_callable in six.iteritems(program.callables_table): if isinstance(in_knl_callable, CallableKernel): @@ -676,8 +677,9 @@ def generate_code_v2(program): program.callables_table, program.target, func_id in program.entrypoints) if func_id in program.entrypoints: - host_programs.extend(cgr.host_programs) - implemented_data_infos.append(cgr.implemented_data_info) + assert len(cgr.host_programs) == 1 + host_programs[func_id] = cgr.host_programs[func_id] + implemented_data_infos[func_id] = cgr.implemented_data_info else: # FIXME: This assertion should be valid # assert cgr.host_programs == [] diff --git a/loopy/codegen/result.py 
b/loopy/codegen/result.py index e53f25835..ac1fbfa6d 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -24,6 +24,7 @@ THE SOFTWARE. import six from pytools import ImmutableRecord +from collections import OrderedDict def process_preambles(preambles): @@ -68,8 +69,8 @@ class CodeGenerationResult(ImmutableRecord): """ .. attribute:: host_programs - A list of :class:`GeneratedProgram` instances - intended to run on the host. + A mapping from entrypoints of a translation unit to instances of + :class:`GeneratedProgram` intended to be run on host. .. attribute:: device_programs @@ -88,14 +89,15 @@ class CodeGenerationResult(ImmutableRecord): .. automethod:: device_code .. automethod:: all_code - .. attribute:: implemented_data_info + .. attribute:: implemented_data_infos - a list of :class:`loopy.codegen.ImplementedDataInfo` objects. - Only added at the very end of code generation. + A mapping from entrypoints to a list of + :class:`loopy.codegen.ImplementedDataInfo` objects. Only added at the + very end of code generation. """ @staticmethod - def new(codegen_state, insn_id, ast, implemented_domain): + def new(codegen_state, insn_id, ast, implemented_domain, entrypoint=None): prg = GeneratedProgram( name=codegen_state.gen_program_name, is_device_program=codegen_state.is_generating_device_code, @@ -103,12 +105,12 @@ class CodeGenerationResult(ImmutableRecord): if codegen_state.is_generating_device_code: kwargs = { - "host_programs": [], "device_programs": [prg], + "host_programs": OrderedDict() } else: kwargs = { - "host_programs": [prg], + "host_programs": OrderedDict({codegen_state.kernel.name: prg}), "device_programs": [], } @@ -123,7 +125,8 @@ class CodeGenerationResult(ImmutableRecord): return ( "".join(preamble_codes) + "\n" - + "\n\n".join(str(hp.ast) for hp in self.host_programs)) + + "\n\n".join(str(hp.ast) for hp in + six.itervalues(self.host_programs))) def device_code(self): preamble_codes = process_preambles(getattr(self, "device_preambles", [])) @@ -145,7 +148,8 @@ class CodeGenerationResult(ImmutableRecord): + "\n" + "\n\n".join(str(dp.ast) for dp in self.device_programs) + "\n\n" - + "\n\n".join(str(hp.ast) for hp in self.host_programs)) + + "\n\n".join(str(hp.ast) for hp in + six.itervalues(self.host_programs))) def current_program(self, codegen_state): if codegen_state.is_generating_device_code: @@ -155,7 +159,8 @@ class CodeGenerationResult(ImmutableRecord): result = None else: if self.host_programs: - result = self.host_programs[-1] + host_programs = self.host_programs.copy() + _, result = host_programs.popitem() else: result = None @@ -181,11 +186,16 @@ class CodeGenerationResult(ImmutableRecord): else: assert program.name == codegen_state.gen_program_name assert not program.is_device_program + host_programs = self.host_programs.copy() + if host_programs: + e, _ = host_programs.popitem() + assert codegen_state.kernel.name == e + host_programs[e] = program + else: + host_programs[codegen_state.kernel.name] = program + pass return self.copy( - host_programs=( - self.host_programs[:-1] - + - [program])) + host_programs=host_programs) def current_ast(self, codegen_state): return self.current_program(codegen_state).ast diff --git a/loopy/target/execution.py b/loopy/target/execution.py index ee2390ab7..1fc7d26b0 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -635,9 +635,7 @@ class ExecutionWrapperGeneratorBase(object): options = program[entrypoint].options #FIXME: endswith is ugly maybe make # codegen_result.implemented_data_infos a 
dict? - implemented_data_info = [i for i, h in - zip(codegen_result.implemented_data_infos, - codegen_result.host_programs) if h.name.endswith(entrypoint)][0] + implemented_data_info = codegen_result.implemented_data_infos[entrypoint] from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( @@ -669,8 +667,7 @@ class ExecutionWrapperGeneratorBase(object): gen, program[entrypoint], implemented_data_info, options) #FIXME: should we make this as a dict as well. - host_program_name, = [h.name for h in codegen_result.host_programs if - h.name.endswith(entrypoint)] + host_program_name = codegen_result.host_programs[entrypoint].name self.generate_invocation(gen, host_program_name, args, program[entrypoint], implemented_data_info) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index d41fe7006..dad66c3c3 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -321,11 +321,8 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): return _KernelInfo( program=program, cl_kernels=cl_kernels, - implemented_data_info=[i for i, h in - zip(codegen_result.implemented_data_infos, - codegen_result.host_programs) if - h.name.endswith(entrypoint)][0], - # implemented_data_info=codegen_result.implemented_data_info[0], + implemented_data_info=codegen_result.implemented_data_infos[ + entrypoint], invoker=self.get_invoker(program, entrypoint, codegen_result)) def __call__(self, queue, **kwargs): -- GitLab From c4e7735a834615e8875765ce93aacb8e68f7b8a7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Oct 2019 02:03:00 -0500 Subject: [PATCH 674/916] removes incorrect usage of implemented_data_infos --- loopy/auto_test.py | 11 ++++------- loopy/codegen/__init__.py | 4 +++- loopy/codegen/control.py | 3 ++- loopy/codegen/result.py | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 8b09aead7..9727def24 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -446,10 +446,8 @@ def auto_test_vs_ref( properties=cl.command_queue_properties.PROFILING_ENABLE) ref_codegen_result = lp.generate_code_v2(ref_prog) - #FIXME: This is not correct, but I am thinking of moving to a dict of - #implemented_data_info anyway. That should make it more elegant. - assert len(ref_prog.entrypoints) == 1 - ref_implemented_data_info = ref_codegen_result.implemented_data_infos[0] + ref_implemented_data_info = ref_codegen_result.implemented_data_infos[ + ref_entrypoint] logger.info("%s (ref): trying %s for the reference calculation" % ( ref_entrypoint, dev)) @@ -530,10 +528,9 @@ def auto_test_vs_ref( test_prog = infer_unknown_types(test_prog, expect_completion=True) test_prog_codegen_result = lp.generate_code_v2(test_prog) - assert len(test_prog.entrypoints) == 1 - args = make_args(test_prog[test_entrypoint], - test_prog_codegen_result.implemented_data_infos[0], + test_prog_codegen_result.implemented_data_infos[ + test_entrypoint], queue, ref_arg_data, parameters) args["out_host"] = False diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d96062226..fae88b584 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -726,7 +726,9 @@ def generate_code(kernel, device=None): raise LoopyError("kernel passed to generate_code yielded multiple " "host programs. 
Use generate_code_v2.") - return codegen_result.device_code(), codegen_result.implemented_data_infos[0] + _, implemented_data_info = codegen_result.implemented_data_infos.popitem() + + return codegen_result.device_code(), implemented_data_info # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index e3c558916..198a60011 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -25,6 +25,7 @@ THE SOFTWARE. """ import six +from collections import OrderedDict from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl from loopy.schedule import ( @@ -179,7 +180,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): if sched_item.synchronization_kind in ["global", "local"]: # host code is assumed globally and locally synchronous return CodeGenerationResult( - host_programs=[], + host_programs=OrderedDict(), device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index ac1fbfa6d..36132a883 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -216,7 +216,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): if not elements: return CodeGenerationResult( - host_programs=[], + host_programs=OrderedDict(), device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) @@ -338,7 +338,7 @@ def generate_host_or_device_program(codegen_state, schedule_index): body_ast=ast_builder.process_ast(body_ast))) else: codegen_result = codegen_result.copy( - host_programs=[]) + host_programs=OrderedDict()) return codegen_result -- GitLab From 24b17b77b9fa79290368044c14467c701f6d3feb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Oct 2019 02:17:53 -0500 Subject: [PATCH 675/916] make c execution adjust to the new multientrypoint execution pipeline --- loopy/target/c/__init__.py | 3 ++- loopy/target/c/c_execution.py | 43 +++++++++++++++++++---------------- test/test_c_execution.py | 7 +++--- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 4ea6feec1..cefc80ee8 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -384,7 +384,8 @@ class ExecutableCTarget(CTarget): def get_kernel_executor(self, knl, *args, **kwargs): from loopy.target.c.c_execution import CKernelExecutor - return CKernelExecutor(knl, compiler=self.compiler) + return CKernelExecutor(knl, entrypoint=kwargs.pop('entrypoint'), + compiler=self.compiler) def get_host_ast_builder(self): # enable host code generation diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index dde37739d..23f38ee60 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -157,7 +157,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ def generate_output_handler( - self, gen, options, program, implemented_data_info): + self, gen, options, kernel, implemented_data_info): from loopy.kernel.data import KernelArgument @@ -167,12 +167,12 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) if arg.base_name in - program.root_kernel.get_written_variables())) + kernel.get_written_variables())) else: out_args = [arg for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in program.root_kernel.get_written_variables()] + if 
arg.base_name in kernel.get_written_variables()] if out_args: gen("return None, (%s,)" % ", ".join(arg.name for arg in out_args)) @@ -379,7 +379,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, program, compiler=None): + def __init__(self, program, entrypoint, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the @@ -388,15 +388,16 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(program) + super(CKernelExecutor, self).__init__(program, entrypoint) - def get_invoker_uncached(self, kernel, codegen_result): + def get_invoker_uncached(self, kernel, entrypoint, codegen_result): generator = CExecutionWrapperGenerator() - return generator(kernel, codegen_result) + return generator(kernel, entrypoint, codegen_result) @memoize_method - def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - program = self.get_typed_and_scheduled_program(arg_to_dtype_set) + def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), + all_kwargs=None): + program = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype_set) from loopy.codegen import generate_code_v2 codegen_result = generate_code_v2(program) @@ -405,34 +406,36 @@ class CKernelExecutor(KernelExecutorBase): host_code = codegen_result.host_code() all_code = '\n'.join([dev_code, '', host_code]) - if self.program.root_kernel.options.write_cl: + if self.program[entrypoint].options.write_cl: output = all_code - if self.program.root_kernel.options.highlight_cl: + if self.program[entrypoint].options.highlight_cl: output = get_highlighted_code(output) - if self.program.root_kernel.options.write_cl is True: + if self.program[entrypoint].options.write_cl is True: print(output) else: - with open(self.program.root_kernel.options.write_cl, "w") as outf: + with open(self.program[entrypoint].options.write_cl, "w") as outf: outf.write(output) - if self.program.root_kernel.options.edit_cl: + if self.program[entrypoint].options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor all_code = '\n'.join([dev_code, '', host_code]) c_kernels = [] + for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_info, all_code, self.program.target, - self.compiler)) + codegen_result.implemented_data_infos[entrypoint], all_code, + self.program.target, self.compiler)) return _KernelInfo( program=program, c_kernels=c_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(program, codegen_result)) + implemented_data_info=codegen_result.implemented_data_infos[ + entrypoint], + invoker=self.get_invoker(program, entrypoint, codegen_result)) # }}} @@ -449,7 +452,9 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - program_info = self.program_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(kwargs['entrypoint'], + self.arg_to_dtype_set(kwargs)) + kwargs.pop('entrypoint') return program_info.invoker( program_info.c_kernels, *args, **kwargs) diff --git a/test/test_c_execution.py b/test/test_c_execution.py index d996230a5..b6be1d18d 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -115,11 +115,12 @@ def test_c_target_strides_nonsquare(): lp.GlobalArg("a", np.float32, shape=sizes, 
order=order), "..." ], - target=ExecutableCTarget()) + target=ExecutableCTarget(), + name="nonsquare_strides") # test with C-order knl = __get_kernel('C') - a_lp = next(x for x in knl.args if x.name == 'a') + a_lp = next(x for x in knl["nonsquare_strides"].args if x.name == 'a') a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), a_lp.shape, order='C') @@ -129,7 +130,7 @@ def test_c_target_strides_nonsquare(): # test with F-order knl = __get_kernel('F') - a_lp = next(x for x in knl.args if x.name == 'a') + a_lp = next(x for x in knl["nonsquare_strides"].args if x.name == 'a') a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), a_lp.shape, order='F') -- GitLab From 8dd0ef13a1ce5748f9b5466012f2927799875ac6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Oct 2019 02:48:21 -0500 Subject: [PATCH 676/916] diff mapper updates for pymbolic updates --- loopy/transform/diff.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 1bca61d4b..647fabb85 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -38,7 +38,7 @@ from loopy.kernel import LoopKernel # {{{ diff mapper -def func_map(i, func, args): +def func_map(i, func, args, allowed_nonsmoothness): if func.name == "exp": return var("exp")(*args) elif func.name == "log": @@ -63,8 +63,17 @@ def func_map(i, func, args): class LoopyDiffMapper(DifferentiationMapper, RuleAwareIdentityMapper): - def __init__(self, rule_mapping_context, diff_context, diff_inames): + def __init__(self, rule_mapping_context, diff_context, diff_inames, + allow_nonsmoothness=None): RuleAwareIdentityMapper.__init__(self, rule_mapping_context) + DifferentiationMapper.__init__( + self, + + # This is actually ignored because we + # override map_variable below. 
+ variable=None, + + allowed_nonsmoothness=None) self.diff_context = diff_context self.diff_inames = diff_inames self.diff_iname_exprs = tuple(var(diname) for diname in diff_inames) -- GitLab From 036995f79ee7fce2fbeb29c5825b38bb47300d27 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Oct 2019 04:16:58 -0500 Subject: [PATCH 677/916] corrects mapping in inline --- loopy/transform/callable.py | 56 ++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 84537164f..1b0e791c0 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -234,23 +234,23 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): # add keyword parameters from pymbolic.primitives import CallWithKwargs + from loopy.kernel.function_interface import get_kw_pos_association + kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) if isinstance(instruction.expression, CallWithKwargs): - from loopy.kernel.function_interface import get_kw_pos_association - - _, pos_to_kw = get_kw_pos_association(callee_knl) kw_parameters = instruction.expression.kw_parameters - for i in range(len(parameters), len(parameters) + len(kw_parameters)): - parameters = parameters + (kw_parameters[pos_to_kw[i]],) - - assignee_pos = 0 - parameter_pos = 0 - for i, arg in enumerate(callee_knl.args): - if arg.is_output: - arg_map[arg.name] = assignees[assignee_pos] - assignee_pos += 1 - else: - arg_map[arg.name] = parameters[parameter_pos] - parameter_pos += 1 + else: + kw_parameters = {} + + for kw, par in six.iteritems(kw_parameters): + arg_map[kw] = par + + for i, par in enumerate(parameters): + arg_map[pos_to_kw[i]] = par + + for i, assignee in enumerate(assignees): + arg_map[pos_to_kw[-i-1]] = assignee + + print(arg_map) # }}} @@ -555,10 +555,19 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name): .. note:: The callee kernel addressed by *callee_function_name*, should be - called only once. + called at only one location throughout the program, as multiple + invocations would demand complex renaming logic which is not + implemented yet. 
""" + + # {{{ sanity checks + assert isinstance(program, Program) assert isinstance(callee_function_name, str) + assert callee_function_name not in program.entrypoints + assert callee_function_name in program.callables_table + + # }}} is_invoking_callee = _FunctionCalledChecker( callee_function_name).map_kernel @@ -568,16 +577,13 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name): CallableKernel) and is_invoking_callee(in_knl_callable.subkernel)] - old_callee_knl = program.callables_table[ - callee_function_name].subkernel + from pymbolic.primitives import Call + assert len([insn for insn in caller_knl.instructions if (isinstance(insn, + CallInstruction) and isinstance(insn.expression, Call) and + insn.expression.function.name == callee_function_name)]) == 1 new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel( - caller_knl, old_callee_knl) - - new_callables_table = program.callables_table.copy() - new_callables_table.resolved_functions[callee_function_name] = ( - new_callables_table[callee_function_name].copy( - subkernel=new_callee_kernel)) - return program.copy(callables_table=new_callables_table) + caller_knl, program[callee_function_name]) + return program.with_kernel(new_callee_kernel) # }}} -- GitLab From 9bf5677fbb4ccd6ac2c8dafddab574ac8e090dbe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Oct 2019 04:34:00 -0500 Subject: [PATCH 678/916] minor changes in docs --- doc/tutorial.rst | 98 +++++++++++++++++++++--------------------- test/test_callables.py | 2 - 2 files changed, 49 insertions(+), 51 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index e6ef54b66..708d0520f 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -325,7 +325,7 @@ an explicit dependency: ... """ ... out[j,i] = a[i,j] {id=transpose} ... out[i,j] = 2*out[i,j] {dep=transpose} - ... """) + ... """, name="transpose_and_dbl") ``{id=transpose}`` assigns the identifier *transpose* to the first instruction, and ``{dep=transpose}`` declares a dependency of the second @@ -334,9 +334,9 @@ that these dependencies show up there, too: .. doctest:: - >>> print(knl.root_kernel.stringify(with_dependencies=True)) + >>> print(knl["transpose_and_dbl"].stringify(with_dependencies=True)) --------------------------------------------------------------------------- - KERNEL: loopy_kernel + KERNEL: transpose_and_dbl --------------------------------------------------------------------------- ... --------------------------------------------------------------------------- @@ -386,7 +386,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -735,7 +735,7 @@ those for us: .. doctest:: - >>> glob, loc = knl.get_grid_size_upper_bounds() + >>> glob, loc = knl["loopy_kernel"].get_grid_size_upper_bounds(knl.callables_table) >>> print(glob) (Aff("[n] -> { [(floor((127 + n)/128))] }"),) >>> print(loc) @@ -1207,8 +1207,8 @@ happens when the instruction schedule is generated. 
To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: >>> prog = lp.preprocess_kernel(prog) - >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) - >>> prog = prog.with_root_kernel(knl) + >>> knl = lp.get_one_scheduled_kernel(prog["rotate_v2"], prog.callables_table) + >>> prog = prog.with_kernel(knl) >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 @@ -1239,8 +1239,8 @@ that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. >>> prog = lp.save_and_reload_temporaries(prog) - >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) # Schedule added instructions - >>> prog = prog.with_root_kernel(knl) + >>> knl = lp.get_one_scheduled_kernel(prog["rotate_v2"], prog.callables_table) # Schedule added instructions + >>> prog = prog.with_kernel(knl) >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 @@ -1306,7 +1306,7 @@ Now we can execute the kernel. >>> arr = cl.array.arange(queue, 16, dtype=np.int32) >>> print(arr) [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] - >>> evt, (out,) = knl(queue, arr=arr) + >>> evt, (out,) = prog(queue, arr=arr) >>> print(arr) [15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] @@ -1543,7 +1543,7 @@ containing different types of data: ... """ ... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] ... e[i, k] = g[i,k]*(2+h[i,k+1]) - ... """) + ... """, name="stats_knl") >>> knl = lp.add_and_infer_dtypes(knl, ... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) @@ -1554,7 +1554,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup, loopy_kernel) : ... + Op(np:dtype('float32'), add, subgroup, stats_knl) : ... Each line of output will look roughly like:: @@ -1580,12 +1580,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... 
(f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1642,15 +1642,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1685,13 +1685,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, knl.name) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, knl.name) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, knl.name) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, knl.name) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1709,7 +1709,7 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') @@ -1752,12 +1752,12 @@ this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, loopy_kernel) : ... 
- MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1767,13 +1767,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1793,12 +1793,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, stats_knl) : ... 
+ MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1807,13 +1807,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1847,14 +1847,14 @@ kernel from the previous example: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - Sync(kernel_launch, loopy_kernel) : [l, m, n] -> { 1 } + Sync(kernel_launch, stats_knl) : [l, m, n] -> { 1 } We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. 
doctest::

-    >>> launch_count = sync_map[lp.Sync("kernel_launch", knl.name)].eval_with_dict(param_dict)
+    >>> launch_count = sync_map[lp.Sync("kernel_launch", "stats_knl")].eval_with_dict(param_dict)
     >>> print("Kernel launch count: %s" % launch_count)
     Kernel launch count: 1
 
diff --git a/test/test_callables.py b/test/test_callables.py
index 111861f4e..32e12ded0 100644
--- a/test/test_callables.py
+++ b/test/test_callables.py
@@ -438,8 +438,6 @@ def test_non_sub_array_refs_arguments(ctx_factory):
 
     print(inlined)
 
-    print(inlined)
-
 
 @pytest.mark.parametrize("inline", [False, True])
 def test_empty_sub_array_refs(ctx_factory, inline):
--
GitLab

From 81656b353a5c98f8ee703866fd6c048d3cc15f2d Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 28 Oct 2019 05:09:11 -0500
Subject: [PATCH 679/916] slight changes in examples to account for changes in loopy

---
 examples/python/call-external.py | 9 +--------
 examples/python/global_barrier_removal.py | 4 +++-
 examples/python/sparse.py | 4 ++--
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/examples/python/call-external.py b/examples/python/call-external.py
index c13d99bd0..37579fdd8 100644
--- a/examples/python/call-external.py
+++ b/examples/python/call-external.py
@@ -83,12 +83,6 @@ class BLASCallable(lp.ScalarCallable):
         yield("99_cblas", "#include <cblas.h>")
         return
 
-
-def blas_fn_lookup(target, identifier):
-    if identifier == 'gemv':
-        return BLASCallable(name='gemv')
-    return None
-
 # }}}
 
 
@@ -105,7 +99,6 @@ knl = lp.make_kernel(
         target=CTarget(),
         lang_version=(2018, 2))
 
-knl = lp.register_function_id_to_in_knl_callable_mapper(
-        knl, blas_fn_lookup)
+knl = lp.register_callable(knl, "gemv", BLASCallable(name="gemv"))
 
 print(lp.generate_code_v2(knl).device_code())
diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
index be22e268c..e09c0d2cb 100644
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
@@ -22,7 +22,9 @@ from loopy.preprocess import preprocess_kernel
 knl = preprocess_kernel(knl)
 
 from loopy.schedule import get_one_scheduled_kernel
-knl = get_one_scheduled_kernel(knl.root_kernel, knl.callables_table)
+knl = knl.with_kernel(get_one_scheduled_kernel(knl["loopy_kernel"],
+    knl.callables_table))
+
 
 # map schedule onto host or device
 print(knl)
diff --git a/examples/python/sparse.py b/examples/python/sparse.py
index 7791f41ba..b4dd07df4 100644
--- a/examples/python/sparse.py
+++ b/examples/python/sparse.py
@@ -11,9 +11,9 @@ k = lp.make_kernel([
     <> length = rowend - rowstart
     y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]])
     end
-    """)
+    """, name="spmv")
 k = lp.add_and_infer_dtypes(k, {
-    "values,x": np.float64, "rowstarts,colindices": k.root_kernel.index_dtype
+    "values,x": np.float64, "rowstarts,colindices": k["spmv"].index_dtype
 })
 
 print(lp.generate_code_v2(k).device_code())
--
GitLab

From 45288fb9dfb4f935d5160aaf1f98b66dca298bec Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 28 Oct 2019 05:10:23 -0500
Subject: [PATCH 680/916] slight changes to the ipython interface to account for changes in loo.py

---
 .../fortran/ipython-integration-demo.ipynb | 93 ++++++++++++++++---
 loopy/ipython_ext.py | 9 +-
 loopy/program.py | 4 -
 3 files changed, 84 insertions(+), 22 deletions(-)

diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb
index 1b0a9df8d..8fe25780b 100644
--- a/examples/fortran/ipython-integration-demo.ipynb
+++ b/examples/fortran/ipython-integration-demo.ipynb
@@
-9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -25,9 +25,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/kaushikggg/pack/loopy_kc_env/src/loopy/loopy/frontend/fortran/translator.py:807: LoopyWarning: 'lang_version' was not passed to make_function(). To avoid this warning, pass lang_version=(2018, 2) in this invocation. (Or say 'from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2' in the global scope of the calling frame.)\n", + " seq_dependencies=seq_dependencies,\n" + ] + } + ], "source": [ "%%fortran_kernel\n", "\n", @@ -45,11 +54,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: fill\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "a: ValueArg, type: np:dtype('float64')\n", + "n: ValueArg, type: np:dtype('int32')\n", + "out: type: np:dtype('float64'), shape: (n), dim_tags: (N0:stride:1) aspace: global\n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[n] -> { [i] : 0 <= i < n }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i: None\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "for i\n", + " \u001b[36mout[i]\u001b[0m = \u001b[35ma\u001b[0m {id=\u001b[32minsn0\u001b[0m}\n", + "end i\n", + "---------------------------------------------------------------------------\n" + ] + } + ], "source": [ - "print(fill)" + "print(prog)" ] }, { @@ -61,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -70,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -92,19 +127,53 @@ "! tr_fill = lp.parse_fortran(SOURCE)\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", - "! RESULT = [tr_fill]\n", + "! 
RESULT = tr_fill\n", "!\n", "!$loopy end" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------\n", + "KERNEL: tr_fill\n", + "---------------------------------------------------------------------------\n", + "ARGUMENTS:\n", + "a: ValueArg, type: np:dtype('float64')\n", + "n: ValueArg, type: np:dtype('int32')\n", + "out: type: np:dtype('float64'), shape: (n), dim_tags: (N0:stride:1) aspace: global\n", + "---------------------------------------------------------------------------\n", + "DOMAINS:\n", + "[n] -> { [i_outer, i_inner] : i_inner >= 0 and -128i_outer <= i_inner <= 127 and i_inner < n - 128i_outer }\n", + "---------------------------------------------------------------------------\n", + "INAME IMPLEMENTATION TAGS:\n", + "i_inner: l.0\n", + "i_outer: g.0\n", + "---------------------------------------------------------------------------\n", + "INSTRUCTIONS:\n", + "for i_inner, i_outer\n", + " \u001b[36mout[i_inner + i_outer*128]\u001b[0m = \u001b[35ma\u001b[0m {id=\u001b[32minsn0\u001b[0m}\n", + "end i_inner, i_outer\n", + "---------------------------------------------------------------------------\n" + ] + } + ], "source": [ - "print(tr_fill)" + "print(prog)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index e44b183ed..c0c74913b 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -9,10 +9,8 @@ import loopy as lp class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): - result = lp.parse_fortran(cell, return_list_of_knls=True) - - for knl in result: - self.shell.user_ns[knl.name] = knl + result = lp.parse_fortran(cell) + self.shell.user_ns['prog'] = result @cell_magic def transformed_fortran_kernel(self, line, cell): @@ -20,8 +18,7 @@ class LoopyMagics(Magics): cell, transform_code_context=self.shell.user_ns) - for knl in result: - self.shell.user_ns[knl.name] = knl + self.shell.user_ns['prog'] = result def load_ipython_extension(ip): diff --git a/loopy/program.py b/loopy/program.py index 5c79edec7..76568cafa 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -311,10 +311,6 @@ class Program(ImmutableRecord): return self.copy(callables_table=callables_table) - def __iter__(self): - #FIXME: Document - return six.iterkeys(self.callables_table.resolved_functions) - def __getitem__(self, name): result = self.callables_table[name] if isinstance(result, CallableKernel): -- GitLab From c82b2a59f14c78023db94739ffd990466d1edb84 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Oct 2019 05:31:08 -0500 Subject: [PATCH 681/916] correct persistent hashing for ArrayArgDescriptor --- loopy/kernel/function_interface.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 5ed292bb2..a9d3ec59f 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -127,7 +127,11 @@ class ArrayArgDescriptor(ImmutableRecord): # FIXME ArrayArgDescriptor should never need to be persisted, remove # this method when that is so. 
def update_persistent_hash(self, key_hash, key_builder): - key_builder.update_for_pymbolic_expression(key_hash, self.shape) + for shape_i in self.shape: + if shape_i is None: + key_builder.rec(key_hash, shape_i) + else: + key_builder.update_for_pymbolic_expression(key_hash, shape_i) key_builder.rec(key_hash, self.address_space) key_builder.rec(key_hash, self.dim_tags) -- GitLab From 6af42e0ca240fe6f5f0acc1f4af28987b76beba4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Apr 2020 16:34:15 -0500 Subject: [PATCH 682/916] handle merge leftover bugs --- loopy/__init__.py | 16 +++++++--------- loopy/auto_test.py | 2 +- loopy/schedule/__init__.py | 9 +++++++++ loopy/target/c/__init__.py | 5 +++-- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 7faa67879..78bfd70a0 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -131,11 +131,10 @@ from loopy.preprocess import (preprocess_kernel, realize_reduction, preprocess_program) from loopy.schedule import ( generate_loop_schedules, get_one_scheduled_kernel, get_one_linearized_kernel) -from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping, - Op, MemAccess, get_op_poly, get_op_map, get_lmem_access_poly, - get_DRAM_access_poly, get_gmem_access_poly, get_mem_access_map, - get_synchronization_poly, get_synchronization_map, - gather_access_footprints, gather_access_footprint_bytes) +from loopy.statistics import (ToCountMap, CountGranularity, + stringify_stats_mapping, Op, MemAccess, get_op_map, get_mem_access_map, + get_synchronization_map, gather_access_footprints, + gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -273,10 +272,9 @@ __all__ = [ "generate_code", "generate_code_v2", "generate_body", "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op", - "MemAccess", "get_op_poly", "get_op_map", "get_lmem_access_poly", - "get_DRAM_access_poly", "get_gmem_access_poly", "get_mem_access_map", - "get_synchronization_poly", "get_synchronization_map", - "gather_access_footprints", "gather_access_footprint_bytes", + "MemAccess", "get_op_map", "get_mem_access_map", + "get_synchronization_map", "gather_access_footprints", + "gather_access_footprint_bytes", "CompiledKernel", diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 9a4a749c4..a079795bd 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -440,7 +440,7 @@ def auto_test_vs_ref( ref_errors = [] from loopy.kernel.data import ImageArg - need_ref_image_support = any(isinstance(arg, ImageArg) for arg in ref_knl.args) + need_ref_image_support = any(isinstance(arg, ImageArg) for arg in ref_prog.args) for dev in _enumerate_cl_devices_for_ref_test( blacklist_ref_vendors, need_ref_image_support): diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 1a2dac401..5348443c6 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -2033,6 +2033,15 @@ def _get_one_scheduled_kernel_inner(kernel, callables_table): return next(iter(generate_loop_schedules(kernel, callables_table))) +def get_one_scheduled_kernel(kernel, callables_table): + warn_with_kernel( + kernel, "get_one_scheduled_kernel_deprecated", + "get_one_scheduled_kernel is deprecated. 
" + "Use get_one_linearized_kernel instead.", + DeprecationWarning) + return get_one_linearized_kernel(kernel, callables_table) + + def get_one_linearized_kernel(kernel, callables_table): from loopy import CACHING_ENABLED diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 9eb45cf5b..c8aa041da 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -542,8 +542,9 @@ class CFamilyASTBuilder(ASTBuilderBase): def function_id_in_knl_callable_mapper(self): return ( - super(CASTBuilder, self).function_id_in_knl_callable_mapper() + [ - scope_c_math_functions]) + super(CFamilyASTBuilder, + self).function_id_in_knl_callable_mapper() + [ + scope_c_math_functions]) # }}} -- GitLab From 09052c072768684d0d4f870d553728f4c58db872 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 6 Apr 2020 19:40:42 -0500 Subject: [PATCH 683/916] merge leftover: handle is_input/is_output correctly --- loopy/kernel/tools.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 6120b41a1..ead996445 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1942,9 +1942,8 @@ def infer_args_are_input_output(kernel): for arg in kernel.args: if isinstance(arg, ArrayArg): - if arg.is_output_only is not None: - assert isinstance(arg.is_output_only, bool) - new_args.append(arg) + if arg.is_output is not None: + assert isinstance(arg.is_output, bool) else: if arg.name in kernel.get_written_variables(): arg = arg.copy(is_output=True) @@ -1959,9 +1958,9 @@ def infer_args_are_input_output(kernel): arg.name not in kernel.get_written_variables())): arg = arg.copy(is_input=True) else: - new_args.append(arg.copy(is_output_only=False)) + arg = arg.copy(is_input=False) elif isinstance(arg, (ConstantArg, ImageArg, ValueArg)): - new_args.append(arg) + pass else: raise NotImplementedError("Unkonwn argument type %s." % type(arg)) -- GitLab From 7648ac5e386a0c322be4840f1df62d18a872323e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 8 Oct 2020 00:25:05 -0500 Subject: [PATCH 684/916] Avoid using set_dim_id to preserve pickle-unpickle-round-trip-equality --- loopy/kernel/creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 5582b0c63..e0834ba9d 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1991,7 +1991,7 @@ class SliceToInameReplacer(IdentityMapper): space = space.add_dims(dim_type.param, len(args_as_params_for_domains)) for i, arg in enumerate(args_as_params_for_domains): - space = space.set_dim_id(dim_type.param, i, isl.Id(arg.name)) + space = space.set_dim_name(dim_type.param, i, arg.name) iname_set = isl.BasicSet.universe(space) -- GitLab From 596b741b495d51adb7243cbc21d84ce0655f891c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 8 Oct 2020 00:44:16 -0500 Subject: [PATCH 685/916] islpy won't accept literal constants for enum values any more: replace 1 with dim_type.param (why was there a literal 1 in the first place?) --- loopy/kernel/function_interface.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 9c520ce96..3f5102887 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -24,6 +24,7 @@ THE SOFTWARE. 
from six.moves import zip +import islpy as isl from pytools import ImmutableRecord from loopy.diagnostic import LoopyError @@ -696,7 +697,7 @@ class CallableKernel(InKernelCallable): # perspective domain_dependent_vars = frozenset().union( - *(frozenset(dom.get_var_names(1)) for dom in + *(frozenset(dom.get_var_names(isl.dim_type.param)) for dom in self.subkernel.domains)) # FIXME: This is ill-formed, because par can be an expression, e.g. -- GitLab From 614b050166c5f34488a28686e54fb92579d2527d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 8 Oct 2020 14:03:29 -0500 Subject: [PATCH 686/916] Do not drop un-written temporaries in find_temporary_address_space, to avoid creating confusion --- loopy/preprocess.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d3b8ef8a3..504b361fb 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -256,15 +256,11 @@ def find_temporary_address_space(kernel): desired_aspace_per_insn.append(desired_aspace) if not desired_aspace_per_insn: - if temp_var.initializer is None: - warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, - "temporary variable '%s' never written, eliminating" - % temp_var.name, LoopyAdvisory) - else: - raise LoopyError("temporary variable '%s': never written, " - "cannot automatically determine address space" - % temp_var.name) + warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, + "cannot automatically determine address space of '%s'" + % temp_var.name, LoopyAdvisory) + new_temp_vars[temp_var.name] = temp_var continue overall_aspace = max(desired_aspace_per_insn) -- GitLab From e6fba05fb2eb25ec35469778dcfaf9bb57874e45 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 8 Oct 2020 14:05:35 -0500 Subject: [PATCH 687/916] get_arg_descriptor_for_expression: Do not assume all swept inames in a SubArrayRef occur in the expression --- loopy/kernel/function_interface.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 3f5102887..a1e221302 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -176,7 +176,11 @@ def get_arg_descriptor_for_expression(kernel, expr): tuple(iname.name for iname in expr.swept_inames) )(linearized_index) sub_dim_tags = tuple( - DimTag(strides_as_dict[iname]) for iname in expr.swept_inames) + # Not all swept inames necessarily occur in the expression. + # Also, some may have been simplified away by simplify_using_aff. 
+ DimTag(strides_as_dict.get(iname, 0)) + + for iname in expr.swept_inames) sub_shape = tuple( pw_aff_to_expr( kernel.get_iname_bounds(iname.name).upper_bound_pw_aff -- GitLab From cdf8ad6d59fa6c18a9a2cb1ed1a80cd0dcee38ae Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Nov 2020 14:31:06 -0600 Subject: [PATCH 688/916] run pyupgrade --py36-plus --- examples/python/call-external.py | 14 ++--- loopy/auto_test.py | 2 +- loopy/check.py | 12 ++-- loopy/kernel/creation.py | 4 +- loopy/kernel/function_interface.py | 49 +++++++--------- loopy/kernel/tools.py | 4 +- loopy/library/function.py | 8 +-- loopy/library/reduction.py | 60 +++++++++---------- loopy/preprocess.py | 8 +-- loopy/program.py | 77 ++++++++++++------------- loopy/statistics.py | 40 ++++++------- loopy/symbolic.py | 12 ++-- loopy/target/c/__init__.py | 5 +- loopy/target/c/compyte | 2 +- loopy/target/cuda.py | 10 ++-- loopy/target/opencl.py | 16 ++--- loopy/target/pyopencl.py | 4 +- loopy/target/python.py | 3 +- loopy/transform/callable.py | 43 ++++++-------- loopy/transform/diff.py | 2 +- loopy/transform/iname.py | 5 +- loopy/transform/make_scalar.py | 4 +- loopy/transform/pack_and_unpack_args.py | 26 ++++----- loopy/type_inference.py | 30 +++++----- test/test_callables.py | 2 - test/test_transform.py | 2 +- 26 files changed, 213 insertions(+), 231 deletions(-) diff --git a/examples/python/call-external.py b/examples/python/call-external.py index c13d99bd0..104d12f38 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -68,8 +68,8 @@ class BLASCallable(lp.ScalarCallable): par_dtype).expr for par, par_dtype in zip( parameters, par_dtypes)] - c_parameters.insert(0, var('CblasRowMajor')) - c_parameters.insert(1, var('CblasNoTrans')) + c_parameters.insert(0, var("CblasRowMajor")) + c_parameters.insert(1, var("CblasNoTrans")) c_parameters.insert(2, mat_descr.shape[0]) c_parameters.insert(3, mat_descr.shape[1]) c_parameters.insert(4, 1) @@ -85,8 +85,8 @@ class BLASCallable(lp.ScalarCallable): def blas_fn_lookup(target, identifier): - if identifier == 'gemv': - return BLASCallable(name='gemv') + if identifier == "gemv": + return BLASCallable(name="gemv") return None # }}} @@ -99,9 +99,9 @@ knl = lp.make_kernel( """ y[:] = gemv(A[:, :], x[:]) """, [ - lp.GlobalArg('A', dtype=np.float64, shape=(n, n)), - lp.GlobalArg('x', dtype=np.float64, shape=(n, )), - lp.GlobalArg('y', shape=(n, )), ...], + lp.GlobalArg("A", dtype=np.float64, shape=(n, n)), + lp.GlobalArg("x", dtype=np.float64, shape=(n, )), + lp.GlobalArg("y", shape=(n, )), ...], target=CTarget(), lang_version=(2018, 2)) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index ff2bda7ef..dfcfe2a2f 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -641,7 +641,7 @@ def auto_test_vs_ref( rates = "" for cnt, lbl in zip(op_count, op_label): - rates += " %g %s/s" % (cnt/elapsed_wall, lbl) + rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) if not quiet: def format_float_or_none(v): diff --git a/loopy/check.py b/loopy/check.py index 32db02b65..44fbfe155 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -111,11 +111,11 @@ class UnscopedCallCollector(CombineMapper): def map_call_with_kwargs(self, expr): if not isinstance(expr.function, ResolvedFunction): return (frozenset([expr.function.name]) | - self.combine((self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values()))) else: - return 
self.combine((self.rec(child) for child in - expr.parameters+tuple(expr.kw_parameters.values()))) + return self.combine(self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values())) def map_constant(self, expr): return frozenset() @@ -262,9 +262,9 @@ def _get_all_unique_iname_tags(kernel): from itertools import chain iname_tags = list(chain(*(kernel.iname_to_tags.get(iname, []) for iname in kernel.all_inames()))) - return set( + return { tag for tag in iname_tags if - isinstance(tag, UniqueTag)) + isinstance(tag, UniqueTag)} def check_multiple_tags_allowed(kernel): diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index f73bf278f..a9665f354 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2375,12 +2375,12 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): def make_function(*args, **kwargs): - lang_version = kwargs.pop('lang_version', None) + lang_version = kwargs.pop("lang_version", None) if lang_version: raise LoopyError("lang_version should be set for program, not " "functions.") - kwargs['is_callee_kernel'] = True + kwargs["is_callee_kernel"] = True return make_kernel(*args, **kwargs) # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 0cb610074..58f5f4db7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" __license__ = """ @@ -22,9 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - -from six.moves import zip - from pytools import ImmutableRecord from loopy.diagnostic import LoopyError @@ -82,7 +77,7 @@ class ArrayArgDescriptor(ImmutableRecord): A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase` """ - fields = set(['shape', 'address_space', 'dim_tags']) + fields = {"shape", "address_space", "dim_tags"} def __init__(self, shape, address_space, dim_tags): @@ -99,7 +94,7 @@ class ArrayArgDescriptor(ImmutableRecord): # }}} - super(ArrayArgDescriptor, self).__init__( + super().__init__( shape=shape, address_space=address_space, dim_tags=dim_tags) @@ -264,7 +259,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord): This class acts as a pseudo-callable and its significance lies in solving picklability issues. """ - fields = set(["local_size", "global_size"]) + fields = {"local_size", "global_size"} def __init__(self, global_size, local_size): self.global_size = global_size @@ -317,12 +312,12 @@ class InKernelCallable(ImmutableRecord): .. automethod:: is_ready_for_codegen """ - fields = set(["arg_id_to_dtype", "arg_id_to_descr"]) + fields = {"arg_id_to_dtype", "arg_id_to_descr"} init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): - super(InKernelCallable, self).__init__( + super().__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -394,8 +389,8 @@ class InKernelCallable(ImmutableRecord): new_arg_id_to_dtype = None if self.arg_id_to_dtype is not None: - new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id, - dtype in self.arg_id_to_dtype.items()) + new_arg_id_to_dtype = {id: with_target_if_not_None(dtype) for id, + dtype in self.arg_id_to_dtype.items()} return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) @@ -461,7 +456,7 @@ class ScalarCallable(InKernelCallable): derived subclasses. 
""" - fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"]) + fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"} init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") hash_fields = fields @@ -469,7 +464,7 @@ class ScalarCallable(InKernelCallable): def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): - super(ScalarCallable, self).__init__( + super().__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -627,7 +622,7 @@ class CallableKernel(InKernelCallable): sizes for the :attr:`subkernel` of the callable. """ - fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr"]) + fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"} init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") hash_fields = fields @@ -635,7 +630,7 @@ class CallableKernel(InKernelCallable): arg_id_to_descr=None): assert isinstance(subkernel, LoopKernel) - super(CallableKernel, self).__init__( + super().__init__( arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) @@ -729,8 +724,8 @@ class CallableKernel(InKernelCallable): subst_mapper = SubstitutionMapper(subst_func) - arg_id_to_descr = dict((arg_id, descr.map_expr(subst_mapper)) for - arg_id, descr in arg_id_to_descr.items()) + arg_id_to_descr = {arg_id: descr.map_expr(subst_mapper) for + arg_id, descr in arg_id_to_descr.items()} # }}} @@ -793,8 +788,8 @@ class CallableKernel(InKernelCallable): callables_table)) if assumptions: - args_added_knl = assume(args_added_knl, ' and '.join([ - '{0}={1}'.format(key, val) for key, val in assumptions.items()])) + args_added_knl = assume(args_added_knl, " and ".join([ + f"{key}={val}" for key, val in assumptions.items()])) return ( self.copy( @@ -900,19 +895,19 @@ class ManglerCallable(ScalarCallable): A function of signature ``(kernel, name , arg_dtypes)`` and returns an instance of ``loopy.CallMangleInfo``. 
""" - fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) + fields = {"name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"} init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") - hash_fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"]) + hash_fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target"} def __init__(self, name, function_mangler, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): self.function_mangler = function_mangler - super(ManglerCallable, self).__init__( + super().__init__( name=name, arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr, @@ -941,8 +936,8 @@ class ManglerCallable(ScalarCallable): arg_dtypes) if mangle_result: new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) - new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in - enumerate(mangle_result.result_dtypes))) + new_arg_id_to_dtype.update({-i-1: dtype for i, dtype in + enumerate(mangle_result.result_dtypes)}) return ( self.copy(name_in_target=mangle_result.target_name, arg_id_to_dtype=new_arg_id_to_dtype), diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 725566c36..6f76f0144 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1956,8 +1956,8 @@ class CallCollector(CombineMapper): def map_call_with_kwargs(self, expr): return (frozenset([expr.function.name]) | - self.combine((self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values()))) def map_constant(self, expr): return frozenset() diff --git a/loopy/library/function.py b/loopy/library/function.py index f0914189a..291f0c372 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -36,8 +36,8 @@ class MakeTupleCallable(ScalarCallable): def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): from loopy.kernel.function_interface import ValueArgDescriptor - new_arg_id_to_descr = dict(((id, ValueArgDescriptor()), - (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys()) + new_arg_id_to_descr = {(id, ValueArgDescriptor()): + (-id-1, ValueArgDescriptor()) for id in arg_id_to_descr.keys()} return ( self.copy(arg_id_to_descr=new_arg_id_to_descr), @@ -46,8 +46,8 @@ class MakeTupleCallable(ScalarCallable): class IndexOfCallable(ScalarCallable): def with_types(self, arg_id_to_dtype, kernel, callables_table): - new_arg_id_to_dtype = dict((i, dtype) for i, dtype in - arg_id_to_dtype.items() if dtype is not None) + new_arg_id_to_dtype = {i: dtype for i, dtype in + arg_id_to_dtype.items() if dtype is not None} new_arg_id_to_dtype[-1] = kernel.index_dtype return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 28cfb8ba2..f44d24323 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -486,28 +486,28 @@ class ReductionCallable(ScalarCallable): prefix = op.prefix(scalar_dtype, index_dtype) yield (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if (op2 %(comp)s op1) - { + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {index_t} index1, + {scalar_t} op2, {index_t} index2, + {index_t} *index_out) + {{ + if (op2 {comp} op1) + {{ *index_out = index2; 
return op2; - } + }} else - { + {{ *index_out = index1; return op1; - } - } - """ % { - "scalar_t": target.dtype_to_typename(scalar_dtype), - "prefix": prefix, - "index_t": target.dtype_to_typename(index_dtype), - "comp": op.update_comparison, - }) + }} + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) elif isinstance(self.name, SegmentedOp): op = self.name.reduction_op scalar_dtype = self.arg_id_to_dtype[-1] @@ -515,20 +515,20 @@ class ReductionCallable(ScalarCallable): prefix = op.prefix(scalar_dtype, segment_flag_dtype) yield (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {segment_flag_t} segment_flag1, + {scalar_t} op2, {segment_flag_t} segment_flag2, + {segment_flag_t} *segment_flag_out) + {{ *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? op2 : %(combined)s; - } - """ % { - "scalar_t": target.dtype_to_typename(scalar_dtype), - "prefix": prefix, - "segment_flag_t": target.dtype_to_typename(segment_flag_dtype), - "combined": op.op % ("op1", "op2"), - }) + return segment_flag2 ? op2 : {combined}; + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) return diff --git a/loopy/preprocess.py b/loopy/preprocess.py index b70be0816..365c30d7f 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2049,7 +2049,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, caller_kernel, callables_table): - super(ArgDescrInferenceMapper, self).__init__( + super().__init__( rule_mapping_context) self.caller_kernel = caller_kernel self.callables_table = callables_table @@ -2060,15 +2060,15 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): if not isinstance(expr.function, ResolvedFunction): # ignore if the call is not to a ResolvedFunction - return super(ArgDescrInferenceMapper, self).map_call(expr, expn_state) + return super().map_call(expr, expn_state) arg_id_to_val = dict(enumerate(expr.parameters)) if isinstance(expr, CallWithKwargs): arg_id_to_val.update(expr.kw_parameters) - if 'assignees' in kwargs: + if "assignees" in kwargs: # If supplied with assignees then this is a CallInstruction - assignees = kwargs['assignees'] + assignees = kwargs["assignees"] for i, arg in enumerate(assignees): arg_id_to_val[-i-1] = arg diff --git a/loopy/program.py b/loopy/program.py index 1fb691531..7224a7bbe 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ @@ -22,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six import re from pytools import ImmutableRecord, memoize_method @@ -76,7 +73,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): """ def __init__(self, rule_mapping_context, kernel, callables_table, function_id_to_in_knl_callable_mappers): - super(ResolvedFunctionMarker, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.kernel = kernel self.callables_table = callables_table self.function_id_to_in_knl_callable_mappers = ( @@ -131,13 +128,13 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} ) # this is an unknown function as of yet, do not modify it - return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr, + return super().map_call_with_kwargs(expr, expn_state) def map_reduction(self, expr, expn_state): @@ -148,7 +145,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper): self.callables_table, _ = ( self.callables_table.with_added_callable(func_id, in_knl_callable)) - return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state) + return super().map_reduction(expr, expn_state) def _default_func_id_to_kernel_callable_mappers(target): @@ -243,7 +240,7 @@ class Program(ImmutableRecord): assert name in callables_table - super(Program, self).__init__( + super().__init__( name=name, callables_table=callables_table, target=target, @@ -260,10 +257,10 @@ class Program(ImmutableRecord): update_persistent_hash = update_persistent_hash def copy(self, **kwargs): - if 'target' in kwargs: + if "target" in kwargs: # target attribute of all the callable kernels should be updated. - target = kwargs['target'] - new_self = super(Program, self).copy(**kwargs) + target = kwargs["target"] + new_self = super().copy(**kwargs) new_resolved_functions = {} for func_id, in_knl_callable in ( new_self.callables_table.items()): @@ -280,7 +277,7 @@ class Program(ImmutableRecord): return super(Program, new_self).copy( callables_table=callables_table) else: - return super(Program, self).copy(**kwargs) + return super().copy(**kwargs) def get_grid_size_upper_bounds(self, ignore_auto=False): """Return a tuple (global_size, local_size) containing a grid that @@ -371,7 +368,7 @@ class Program(ImmutableRecord): resolved_functions=new_resolved_functions)) def __iter__(self): - return six.iterkeys(self.callables_table.resolved_functions) + return self.callables_table.resolved_functions.keys() def __getitem__(self, name): result = self.callables_table[name] @@ -427,13 +424,13 @@ def next_indexed_function_identifier(function_id): match = func_name.match(function_id) if match is None: - if function_id[-1] == '_': - return "{old_name}0".format(old_name=function_id) + if function_id[-1] == "_": + return f"{function_id}0" else: - return "{old_name}_0".format(old_name=function_id) + return f"{function_id}_0" - return "{alpha}_{num}".format(alpha=match.group('alpha'), - num=int(match.group('num'))+1) + return "{alpha}_{num}".format(alpha=match.group("alpha"), + num=int(match.group("num"))+1) class ResolvedFunctionRenamer(RuleAwareIdentityMapper): @@ -442,7 +439,7 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): *renaming_dict*. 
""" def __init__(self, rule_mapping_context, renaming_dict): - super(ResolvedFunctionRenamer, self).__init__( + super().__init__( rule_mapping_context) self.renaming_dict = renaming_dict @@ -450,7 +447,7 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper): if expr.name in self.renaming_dict: return ResolvedFunction(self.renaming_dict[expr.name]) else: - return super(ResolvedFunctionRenamer, self).map_resolved_function( + return super().map_resolved_function( expr, expn_state) @@ -499,8 +496,8 @@ class CallablesCountingMapper(CombineMapper): in_knl_callable = self.callables_table[expr.function.name] if isinstance(in_knl_callable, ScalarCallable): return (Counter([expr.function.name]) + - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values()))) elif isinstance(in_knl_callable, CallableKernel): @@ -511,22 +508,22 @@ class CallablesCountingMapper(CombineMapper): self.callables_table)) return (Counter([expr.function.name]) + - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + ( + self.combine(self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values()))) + ( callables_count_in_subkernel) else: raise NotImplementedError("Unknown callable type %s." % ( type)) else: return ( - self.combine((self.rec(child) for child in expr.parameters - + tuple(kw_parameters.values())))) + self.combine(self.rec(child) for child in expr.parameters + + tuple(kw_parameters.values()))) map_call_with_kwargs = map_call def map_reduction(self, expr): return Counter(expr.operation.get_scalar_callables()) + ( - super(CallablesCountingMapper, self).map_reduction(expr)) + super().map_reduction(expr)) def map_constant(self, expr): return Counter() @@ -604,10 +601,10 @@ class CallablesTable(ImmutableRecord): history=None, is_being_edited=False): if history is None: - history = dict((func_id, frozenset([func_id])) for func_id in - resolved_functions) + history = {func_id: frozenset([func_id]) for func_id in + resolved_functions} - super(CallablesTable, self).__init__( + super().__init__( resolved_functions=resolved_functions, history=history, is_being_edited=is_being_edited) @@ -619,8 +616,8 @@ class CallablesTable(ImmutableRecord): def __hash__(self): return hash(( - frozenset(six.iteritems(self.resolved_functions)), - frozenset(six.iteritems(self.history)), + frozenset(self.resolved_functions.items()), + frozenset(self.history.items()), self.is_being_edited )) @@ -780,8 +777,8 @@ class CallablesTable(ImmutableRecord): # equal to the old version of the callable. return self, function else: - print('Old: ', self.resolved_functions[function.name]) - print('New: ', in_kernel_callable) + print("Old: ", self.resolved_functions[function.name]) + print("New: ", in_kernel_callable) raise LoopyError("Use 'with_enter_edit_callables_mode' first.") # }}} @@ -869,7 +866,7 @@ class CallablesTable(ImmutableRecord): # this implies that all the function instances having the name # "func_id" have been renamed to something else. 
for new_func_id in ( - six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)): + new_callables_count.keys()-renames_needed.keys()): if old_func_id in self.history[new_func_id]: renames_needed[new_func_id] = old_func_id break @@ -926,13 +923,13 @@ class CallablesTable(ImmutableRecord): return item in self.resolved_functions def items(self): - return six.iteritems(self.resolved_functions) + return self.resolved_functions.items() def values(self): - return six.itervalues(self.resolved_functions) + return self.resolved_functions.values() def keys(self): - return six.iterkeys(self.resolved_functions) + return self.resolved_functions.keys() # }}} diff --git a/loopy/statistics.py b/loopy/statistics.py index 20b936ceb..a1c86d88b 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -164,7 +164,7 @@ class GuardedPwQPolynomial: # {{{ ToCountMap -class ToCountMap(object): +class ToCountMap: """A map from work descriptors like :class:`Op` and :class:`MemAccess` to any arithmetic type. @@ -215,9 +215,9 @@ class ToCountMap(object): def __mul__(self, other): if isinstance(other, GuardedPwQPolynomial): - return self.copy(dict( - (index, value*other) - for index, value in self.count_map.items())) + return self.copy({ + index: value*other + for index, value in self.count_map.items()}) else: raise ValueError("ToCountMap: Attempted to multiply " "ToCountMap by {} {}." @@ -233,7 +233,7 @@ class ToCountMap(object): def __str__(self): return "\n".join( - "%s: %s" % (k, v) + f"{k}: {v}" for k, v in sorted(self.count_map.items(), key=lambda k: str(k))) @@ -400,9 +400,9 @@ class ToCountMap(object): for self_key, self_val in self.count_map.items(): new_key = key_type( - **dict( - (field, getattr(self_key, field)) - for field in args)) + **{ + field: getattr(self_key, field) + for field in args}) new_count_map[new_key] = new_count_map.get(new_key, 0) + self_val @@ -487,7 +487,7 @@ class ToCountPolynomialMap(ToCountMap): assert _get_param_tuple(val.space) == space_param_tuple - super(ToCountPolynomialMap, self).__init__(count_map) + super().__init__(count_map) def _zero(self): space = self.space.insert_dims(dim_type.out, 0, 1) @@ -584,7 +584,7 @@ def stringify_stats_mapping(m): # {{{ CountGranularity -class CountGranularity(object): +class CountGranularity: """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. 
@@ -658,7 +658,7 @@ class Op(ImmutableRecord): from loopy.types import to_loopy_type dtype = to_loopy_type(dtype) - super(Op, self).__init__(dtype=dtype, name=name, + super().__init__(dtype=dtype, name=name, count_granularity=count_granularity, kernel_name=kernel_name) @@ -752,7 +752,7 @@ class MemAccess(ImmutableRecord): from loopy.types import to_loopy_type dtype = to_loopy_type(dtype) - super(MemAccess, self).__init__(mtype=mtype, dtype=dtype, + super().__init__(mtype=mtype, dtype=dtype, lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, variable_tag=variable_tag, @@ -797,11 +797,11 @@ class Sync(ImmutableRecord): """ def __init__(self, kind=None, kernel_name=None): - super(Sync, self).__init__(kind=kind, kernel_name=kernel_name) + super().__init__(kind=kind, kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "Sync(%s, %s)" % (self.kind, self.kernel_name) + return f"Sync({self.kind}, {self.kernel_name})" # }}} @@ -846,12 +846,12 @@ class CounterBase(CombineMapper): if isinstance(clbl, CallableKernel): sub_result = self.kernel_rec(clbl.subkernel) - arg_dict = dict( - (arg.name, value) + arg_dict = { + arg.name: value for arg, value in zip( clbl.subkernel.args, expr.parameters) - if isinstance(arg, ValueArg)) + if isinstance(arg, ValueArg)} return subst_into_to_count_map( self.param_space, @@ -911,7 +911,7 @@ class CounterBase(CombineMapper): class ExpressionOpCounter(CounterBase): def __init__(self, knl, callables_table, kernel_rec, count_within_subscripts=True): - super(ExpressionOpCounter, self).__init__( + super().__init__( knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts @@ -940,7 +940,7 @@ class ExpressionOpCounter(CounterBase): kernel_name=self.knl.name): self.one} ) + self.rec(expr.parameters) else: - return super(ExpressionOpCounter, self).map_call(expr) + return super().map_call(expr) def map_subscript(self, expr): if self.count_within_subscripts: @@ -1190,7 +1190,7 @@ class MemAccessCounterBase(CounterBase): if not isinstance(clbl, CallableKernel): return self.rec(expr.parameters) else: - return super(MemAccessCounterBase, self).map_call(expr) + return super().map_call(expr) # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index a9c8ab172..0c9f8307b 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -289,7 +289,7 @@ class StringifyMapper(StringifyMapperBase): def map_sub_array_ref(self, expr, prec): return "[{inames}]: {subscr}".format( - inames=','.join(self.rec(iname, prec) for iname in + inames=",".join(self.rec(iname, prec) for iname in expr.swept_inames), subscr=self.rec(expr.subscript, prec)) @@ -386,7 +386,7 @@ class DependencyMapper(DependencyMapperBase): def map_sub_array_ref(self, expr, *args): deps = self.rec(expr.subscript, *args) - return deps - set(iname for iname in expr.swept_inames) + return deps - {iname for iname in expr.swept_inames} map_linear_subscript = DependencyMapperBase.map_subscript @@ -838,7 +838,7 @@ class SweptInameStrideCollector(CoefficientCollectorBase): or expr.aggregate.name not in self.target_names): return {1: expr} - return super(SweptInameStrideCollector, self).map_algebraic_leaf(expr) + return super().map_algebraic_leaf(expr) class SubArrayRef(p.Expression): @@ -888,8 +888,8 @@ class SubArrayRef(p.Expression): subscript would be ``a[0, j, 0, l]`` """ # TODO: Set the zero to the minimum value of the iname. 
- swept_inames_to_zeros = dict( - (swept_iname.name, 0) for swept_iname in self.swept_inames) + swept_inames_to_zeros = { + swept_iname.name: 0 for swept_iname in self.swept_inames} return EvaluatorWithDeficientContext(swept_inames_to_zeros)( self.subscript) @@ -2215,7 +2215,7 @@ class BatchedAccessRangeMapper(WalkMapper): return self.rec(expr.child, inames) def map_sub_array_ref(self, expr, inames): - total_inames = inames | set([iname.name for iname in expr.swept_inames]) + total_inames = inames | {iname.name for iname in expr.swept_inames} return self.rec(expr.subscript, total_inames) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index e618d75a1..37997d7ab 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -464,7 +464,7 @@ class CMathCallable(ScalarCallable): elif dtype == np.float128: # pylint:disable=no-member name = name + "l" # fabsl else: - raise LoopyTypeError("%s does not support type %s" % (name, + raise LoopyTypeError("{} does not support type {}".format(name, dtype)) return ( @@ -553,8 +553,7 @@ class CFamilyASTBuilder(ASTBuilderBase): def function_id_in_knl_callable_mapper(self): return ( - super(CFamilyASTBuilder, - self).function_id_in_knl_callable_mapper() + [ + super().function_id_in_knl_callable_mapper() + [ scope_c_math_functions]) # }}} diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 7e48e1166..d1f993dae 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 7e48e1166a13cfbb7b60f909b071f088034ffda1 +Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 64b401b8b..83697e601 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -170,8 +170,8 @@ class CudaCallable(ScalarCallable): raise LoopyError("%s does not support complex numbers" % name) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1, - num_args)) + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in range(-1, + num_args)} return ( self.copy(name_in_target=name, @@ -184,7 +184,7 @@ class CudaCallable(ScalarCallable): def scope_cuda_functions(target, identifier): - if identifier in set(["dot"]) | set( + if identifier in {"dot"} | set( _CUDA_SPECIFIC_FUNCTIONS): return CudaCallable(name=identifier) @@ -355,7 +355,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): def preamble_generators(self): return ( - super(CUDACASTBuilder, self).preamble_generators() + [ + super().preamble_generators() + [ cuda_preamble_generator]) # }}} @@ -455,7 +455,7 @@ class CUDACASTBuilder(CFamilyASTBuilder): lhs_expr_code = ecm(lhs_expr) rhs_expr_code = ecm(new_rhs_expr) - return Statement("atomicAdd(&{0}, {1})".format( + return Statement("atomicAdd(&{}, {})".format( lhs_expr_code, rhs_expr_code)) else: from cgen import Block, DoWhile, Assign diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 6455cacc9..0cc93ca28 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -186,9 +186,9 @@ class OpenCLCallable(ScalarCallable): [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() if (id >= 0 and dtype is not None)]) - if dtype.kind in ['u', 'i', 'f']: - if dtype.kind == 'f': - name = 'f'+name + if dtype.kind in ["u", "i", "f"]: + if dtype.kind == "f": + name = "f"+name dtype = NumpyType(dtype) return ( self.copy(name_in_target=name, @@ -242,8 +242,8 @@ class OpenCLCallable(ScalarCallable): raise LoopyError("%s does not support complex numbers" % name) - updated_arg_id_to_dtype = dict((id, 
NumpyType(dtype)) for id in range(-1, - num_args)) + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in range(-1, + num_args)} return ( self.copy(name_in_target=name, @@ -266,8 +266,8 @@ class OpenCLCallable(ScalarCallable): self.copy(arg_id_to_dtype=arg_id_to_dtype), callables_table) - updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in - range(count)) + updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in + range(count)} updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype( NumpyType(dtype), count) @@ -288,7 +288,7 @@ def scope_opencl_functions(target, identifier): Returns an instance of :class:`InKernelCallable` if the function defined by *identifier* is known in OpenCL. """ - opencl_function_ids = set(["max", "min", "dot"]) | set( + opencl_function_ids = {"max", "min", "dot"} | set( _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) if identifier in opencl_function_ids: diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 898d1323e..2008c9224 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -229,7 +229,7 @@ class PyOpenCLCallable(ScalarCallable): raise LoopyTypeError("unexpected complex type '%s'" % dtype) return ( - self.copy(name_in_target="%s_%s" % (tpname, name), + self.copy(name_in_target=f"{tpname}_{name}", arg_id_to_dtype={0: dtype, -1: NumpyType( np.dtype(dtype.numpy_dtype.type(0).real))}), callables_table) @@ -248,7 +248,7 @@ class PyOpenCLCallable(ScalarCallable): raise LoopyTypeError("unexpected complex type '%s'" % dtype) return ( - self.copy(name_in_target="%s_%s" % (tpname, name), + self.copy(name_in_target=f"{tpname}_{name}", arg_id_to_dtype={0: dtype, -1: dtype}), callables_table) else: diff --git a/loopy/target/python.py b/loopy/target/python.py index c02943fd6..c27b4484d 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -111,7 +111,8 @@ class ExpressionToPythonMapper(StringifyMapper): str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters)) + return "{}({})".format(in_knl_callable.name_in_target, + ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 479843697..6195f0b4c 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ @@ -22,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six - import islpy as isl from pymbolic.primitives import CallWithKwargs @@ -63,10 +59,10 @@ def _resolved_callables_from_function_lookup(program, """ callables_table = program.callables_table - callable_knls = dict( - (func_id, in_knl_callable) for func_id, in_knl_callable in + callable_knls = { + func_id: in_knl_callable for func_id, in_knl_callable in callables_table.items() if isinstance(in_knl_callable, - CallableKernel)) + CallableKernel)} edited_callable_knls = {} for func_id, in_knl_callable in callable_knls.items(): @@ -143,7 +139,7 @@ class _RegisterCalleeKernel(ImmutableRecord): :func:`loopy.transform.register_callable_kernel` picklable. As python cannot pickle lexical closures. 
""" - fields = set(['callable_kernel']) + fields = {"callable_kernel"} def __init__(self, callable_kernel): self.callable_kernel = callable_kernel @@ -166,8 +162,7 @@ def register_callable_kernel(program, callee_kernel): # {{{ sanity checks assert isinstance(program, Program) - assert isinstance(callee_kernel, LoopKernel), ('{0} !=' - '{1}'.format(type(callee_kernel), LoopKernel)) + assert isinstance(callee_kernel, LoopKernel) # check to make sure that the variables with 'out' direction is equal to # the number of assigness in the callee kernel intructions. @@ -263,7 +258,7 @@ class KernelInliner(SubstitutionMapper): """ def __init__(self, subst_func, caller, arg_map, arg_dict): - super(KernelInliner, self).__init__(subst_func) + super().__init__(subst_func) self.caller = caller self.arg_map = arg_map self.arg_dict = arg_dict @@ -287,7 +282,7 @@ class KernelInliner(SubstitutionMapper): from numbers import Integral if not all(isinstance(d, Integral) for d in callee_arg.shape): raise LoopyError( - "Argument: {0} in callee kernel does not have " + "Argument: {} in callee kernel does not have " "constant shape.".format(callee_arg)) flatten_index = 0 @@ -311,7 +306,7 @@ class KernelInliner(SubstitutionMapper): return aggregate.index(tuple(new_indices)) else: - return super(KernelInliner, self).map_subscript(expr) + return super().map_subscript(expr) # }}} @@ -360,7 +355,7 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): temp_map = {} new_temps = kernel.temporary_variables.copy() - for name, temp in six.iteritems(callee_knl.temporary_variables): + for name, temp in callee_knl.temporary_variables.items(): new_name = vng(callee_label+name) temp_map[name] = new_name new_temps[new_name] = temp.copy(name=new_name) @@ -404,11 +399,11 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): import pymbolic.primitives as p from pymbolic.mapper.substitutor import make_subst_func - var_map = dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(iname_map)) - var_map.update(dict((p.Variable(k), p.Variable(v)) - for k, v in six.iteritems(temp_map))) - for k, v in six.iteritems(arg_map): + var_map = {p.Variable(k): p.Variable(v) + for k, v in iname_map.items()} + var_map.update({p.Variable(k): p.Variable(v) + for k, v in temp_map.items()}) + for k, v in arg_map.items(): if isinstance(v, SubArrayRef): var_map[p.Variable(k)] = v.subscript.aggregate else: @@ -425,10 +420,10 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): dep_map = callee_knl.recursive_insn_dep_map() # roots depend on nothing - heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps) + heads = {insn for insn, deps in dep_map.items() if not deps} # leaves have nothing that depends on them tails = set(dep_map.keys()) - for insn, deps in six.iteritems(dep_map): + for insn, deps in dep_map.items(): tails = tails - deps # }}} @@ -458,7 +453,7 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( instruction.depends_on) if insn.id in heads: - depends_on = depends_on | set([noop_start.id]) + depends_on = depends_on | {noop_start.id} new_atomicity = tuple( type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) @@ -598,7 +593,7 @@ class DimChanger(IdentityMapper): def map_subscript(self, expr): if expr.aggregate.name not in self.callee_arg_dict: - return super(DimChanger, self).map_subscript(expr) + return super().map_subscript(expr) callee_arg_dim_tags = 
self.callee_arg_dict[expr.aggregate.name].dim_tags flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in zip(callee_arg_dim_tags, expr.index_tuple)) @@ -645,7 +640,7 @@ def _match_caller_callee_argument_dimension_for_single_kernel( get_kw_pos_association) _, pos_to_kw = get_kw_pos_association(callee_knl) arg_id_to_shape = {} - for arg_id, arg in six.iteritems(insn.arg_id_to_val()): + for arg_id, arg in insn.arg_id_to_val().items(): arg_id = pos_to_kw[arg_id] arg_descr = get_arg_descriptor_for_expression(caller_knl, arg) diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index a85a8aa29..5a4297352 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -378,7 +378,7 @@ def diff_kernel(kernel, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. """ - assert isinstance(knl, LoopKernel) + assert isinstance(kernel, LoopKernel) from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=True) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 378b4f2f7..473dbbca7 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1090,9 +1090,8 @@ def get_iname_duplication_options_for_single_kernel(kernel, def get_iname_duplication_options(program, use_boostable_into=False): for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): - for option in get_iname_duplication_options_for_single_kernel( - in_knl_callable.subkernel, use_boostable_into): - yield option + yield from get_iname_duplication_options_for_single_kernel( + in_knl_callable.subkernel, use_boostable_into) elif isinstance(in_knl_callable, ScalarCallable): pass else: diff --git a/loopy/transform/make_scalar.py b/loopy/transform/make_scalar.py index ab91fdf78..9f33e8394 100644 --- a/loopy/transform/make_scalar.py +++ b/loopy/transform/make_scalar.py @@ -7,13 +7,13 @@ from loopy.transform.iname import remove_unused_inames class ScalarChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, var_name): self.var_name = var_name - super(ScalarChanger, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) def map_subscript(self, expr, expn_state): if expr.aggregate.name == self.var_name: return Variable(self.var_name) - return super(ScalarChanger, self).map_subscript(expr, expn_state) + return super().map_subscript(expr, expn_state) def make_scalar(kernel, var_name): diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index a18326187..6fb4988f0 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - __copyright__ = "Copyright (C) 2018 Tianjiao Sun, Kaushik Kulkarni" __license__ = """ @@ -121,9 +119,9 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, from pymbolic import var dim_type = isl.dim_type.set - ilp_inames = set(iname for iname in insn.within_inames + ilp_inames = {iname for iname in insn.within_inames if all(isinstance(tag, (IlpBaseTag, VectorizeTag)) - for tag in kernel.iname_to_tags.get(iname, []))) + for tag in kernel.iname_to_tags.get(iname, []))} new_ilp_inames = set() ilp_inames_map = {} for iname in ilp_inames: @@ -156,10 +154,10 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, new_pack_inames = ilp_inames_map.copy() # packing-specific inames new_unpack_inames = 
ilp_inames_map.copy() # unpacking-specific iname - new_pack_inames = dict((iname, var(vng(iname.name + - "_pack"))) for iname in p.swept_inames) - new_unpack_inames = dict((iname, var(vng(iname.name + - "_unpack"))) for iname in p.swept_inames) + new_pack_inames = {iname: var(vng(iname.name + + "_pack")) for iname in p.swept_inames} + new_unpack_inames = {iname: var(vng(iname.name + + "_unpack")) for iname in p.swept_inames} # Updating the domains corresponding to the new inames. for iname in p.swept_inames: @@ -228,8 +226,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, packing_insns.append(Assignment( assignee=pack_lhs_assignee, expression=pack_subst_mapper.map_subscript(p.subscript), - within_inames=insn.within_inames - ilp_inames | set( - new_pack_inames[i].name for i in p.swept_inames) | ( + within_inames=insn.within_inames - ilp_inames | { + new_pack_inames[i].name for i in p.swept_inames} | ( new_ilp_inames), depends_on=insn.depends_on, id=ing(insn.id+"_pack"), @@ -240,8 +238,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, unpacking_insns.append(Assignment( expression=unpack_rhs, assignee=unpack_subst_mapper.map_subscript(p.subscript), - within_inames=insn.within_inames - ilp_inames | set( - new_unpack_inames[i].name for i in p.swept_inames) | ( + within_inames=insn.within_inames - ilp_inames | { + new_unpack_inames[i].name for i in p.swept_inames} | ( new_ilp_inames), id=ing(insn.id+"_unpack"), depends_on=frozenset([insn.id]), @@ -282,8 +280,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) for i, _ in enumerate(insn.assignees)) new_call_insn = new_call_insn.copy( - depends_on=new_call_insn.depends_on | set( - pack.id for pack in packing_insns), + depends_on=new_call_insn.depends_on | { + pack.id for pack in packing_insns}, within_inames=new_call_insn.within_inames - ilp_inames | ( new_ilp_inames), expression=new_call_insn.expression.function(*new_params), diff --git a/loopy/type_inference.py b/loopy/type_inference.py index e95146349..ac4afaac7 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -54,8 +54,8 @@ def get_return_types_as_tuple(arg_id_to_dtype): :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a mapping from the arguments to their inferred types. 
""" - return_arg_id_to_dtype = dict((id, dtype) for id, dtype in - arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)) + return_arg_id_to_dtype = {id: dtype for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)} return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) @@ -71,7 +71,7 @@ class FunctionNameChanger(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, calls_to_new_names, subst_expander): - super(FunctionNameChanger, self).__init__(rule_mapping_context) + super().__init__(rule_mapping_context) self.calls_to_new_names = calls_to_new_names self.subst_expander = subst_expander @@ -94,7 +94,7 @@ class FunctionNameChanger(RuleAwareIdentityMapper): tuple(self.rec(child, expn_state) for child in expanded_expr.parameters)) else: - return super(FunctionNameChanger, self).map_call( + return super().map_call( expr, expn_state) else: return self.map_substitution(name, tag, expr.parameters, expn_state) @@ -106,12 +106,12 @@ class FunctionNameChanger(RuleAwareIdentityMapper): ResolvedFunction(self.calls_to_new_names[expr]), tuple(self.rec(child, expn_state) for child in expr.parameters), - dict( - (key, self.rec(val, expn_state)) - for key, val in six.iteritems(expr.kw_parameters)) + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} ) else: - return super(FunctionNameChanger, self).map_call_with_kwargs( + return super().map_call_with_kwargs( expr, expn_state) @@ -422,8 +422,8 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in - tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())) + arg_id_to_dtype = {i: none_if_empty(self.rec(par)) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())} # specializing the known function wrt type if isinstance(expr.function, ResolvedFunction): @@ -521,11 +521,11 @@ class TypeInferenceMapper(CombineMapper): ValueArgDescriptor) # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes - arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target)) - for i, dt in enumerate(mangle_result.arg_dtypes)) - arg_id_to_dtype.update(dict((-i-1, - dtype.with_target(self.kernel.target)) for i, dtype in enumerate( - mangle_result.result_dtypes))) + arg_id_to_dtype = {i: dt.with_target(self.kernel.target) + for i, dt in enumerate(mangle_result.arg_dtypes)} + arg_id_to_dtype.update({-i-1: + dtype.with_target(self.kernel.target) for i, dtype in enumerate( + mangle_result.result_dtypes)}) arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in enumerate(mangle_result.arg_dtypes)) res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in diff --git a/test/test_callables.py b/test/test_callables.py index f2f3acbd6..efb1e5e72 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import, print_function - __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ diff --git a/test/test_transform.py b/test/test_transform.py index 684381c52..ff593a0c8 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -588,7 +588,7 @@ def test_nested_substs_in_insns(ctx_factory): prg = lp.expand_subst(ref_prg) assert not any( cknl.subkernel.substitutions - for cknl in six.itervalues(prg.callables_table.resolved_functions)) + for cknl in prg.callables_table.resolved_functions.values()) lp.auto_test_vs_ref(ref_prg, ctx, 
prg) -- GitLab From 467db1e85f75bd3be8e2ca98dce4d7c327c7ad0e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Nov 2020 14:40:48 -0600 Subject: [PATCH 689/916] run pyupgrade --py36-plus --- loopy/frontend/fortran/__init__.py | 2 +- loopy/program.py | 2 +- loopy/symbolic.py | 6 +++--- loopy/transform/callable.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index c8fda36d0..a434b3dc0 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -253,7 +253,7 @@ def _add_assignees_to_calls(knl, all_kernels): may be called by *kernel*. """ new_insns = [] - subroutine_dict = dict((kernel.name, kernel) for kernel in all_kernels) + subroutine_dict = {kernel.name: kernel for kernel in all_kernels} from loopy.kernel.instruction import (Assignment, CallInstruction, CInstruction, _DataObliviousInstruction, modify_assignee_for_array_call) diff --git a/loopy/program.py b/loopy/program.py index a8bdf91a2..aef3fc45c 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -401,7 +401,7 @@ class Program(ImmutableRecord): for name, clbl in self.callables_table.items()) def __setstate__(self, state_obj): - super(Program, self).__setstate__(state_obj) + super().__setstate__(state_obj) self._program_executor_cache = {} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 165b8ea44..49da656a7 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -856,9 +856,9 @@ def get_start_subscript_from_sar(sar, kernel): pwaff = kernel.get_iname_bounds(iname).lower_bound_pw_aff return int(pw_aff_to_expr(pwaff)) - swept_inames_to_zeros = dict( - (swept_iname.name, _get_lower_bound(swept_iname.name)) for - swept_iname in sar.swept_inames) + swept_inames_to_zeros = { + swept_iname.name: _get_lower_bound(swept_iname.name) for + swept_iname in sar.swept_inames} return EvaluatorWithDeficientContext(swept_inames_to_zeros)( sar.subscript) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 461a4cb5f..aa7f917eb 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -183,7 +183,7 @@ def _check_correctness_of_args_and_assignees(insn, callee_kernel): callee_args_to_insn_params[i].append(param) - for kw, param in six.iteritems(expr.kw_parameters): + for kw, param in expr.kw_parameters.items(): pos = kw_to_pos[kw] if pos < 0: raise LoopyError("Keyword argument '{}' meant for output obtained as an" -- GitLab From 1914ad9f0898068a58d3a3d016de52d3aa2ddabf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Nov 2020 14:56:29 -0600 Subject: [PATCH 690/916] removes bad quotes --- loopy/__init__.py | 4 +- test/test_callables.py | 106 ++++++++++++++++++++--------------------- test/test_loopy.py | 12 ++--- test/test_transform.py | 13 ----- test/testlib.py | 6 +-- 5 files changed, 64 insertions(+), 77 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 107019022..a9251da3b 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -111,7 +111,7 @@ from loopy.transform.padding import ( add_padding) from loopy.transform.privatize import privatize_temporaries_with_inames -from loopy.transform.batch import to_batched, save_temporaries_in_loop +from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier @@ -228,7 +228,7 @@ __all__ = [ "privatize_temporaries_with_inames", - 
"to_batched", "save_temporaries_in_loop", + "to_batched", "assume", "fix_parameters", diff --git a/test/test_callables.py b/test/test_callables.py index efb1e5e72..62c2a797e 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -70,13 +70,13 @@ def test_register_knl(ctx_factory, inline): "{[i, j]:0<= i, j< 16}", """ c[i, j] = 2*a[i, j] + 3*b[i, j] - """, name='linear_combo1') + """, name="linear_combo1") child_knl = lp.make_function( "{[i, j]:0<=i, j < 16}", """ [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) - """, name='linear_combo2') + """, name="linear_combo2") parent_knl = lp.make_kernel( "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}", @@ -86,13 +86,13 @@ def test_register_knl(ctx_factory, inline): """, kernel_data=[ lp.GlobalArg( - name='x', + name="x", dtype=np.float64, shape=(16, 16, 16, 16, 16)), lp.GlobalArg( - name='y', + name="y", dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(16, 16, 16, 16, 16)), ...], ) knl = lp.register_callable_kernel( @@ -100,8 +100,8 @@ def test_register_knl(ctx_factory, inline): knl = lp.register_callable_kernel( knl, grandchild_knl) if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo2') - knl = lp.inline_callable_kernel(knl, 'linear_combo1') + knl = lp.inline_callable_kernel(knl, "linear_combo2") + knl = lp.inline_callable_kernel(knl, "linear_combo1") evt, (out, ) = knl(queue, x=x, y=y) @@ -132,23 +132,23 @@ def test_slices_with_negative_step(ctx_factory, inline): """, kernel_data=[ lp.GlobalArg( - name='x', + name="x", dtype=np.float64, shape=(16, 16, 16, 16, 16)), lp.GlobalArg( - name='y', + name="y", dtype=np.float64, shape=(16, 16, 16, 16, 16)), lp.GlobalArg( - name='z', + name="z", dtype=np.float64, - shape=(16, 16, 16, 16, 16)), '...'], + shape=(16, 16, 16, 16, 16)), ...], ) knl = lp.register_callable_kernel( parent_knl, child_knl) if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') + knl = lp.inline_callable_kernel(knl, "linear_combo") evt, (out, ) = knl(queue, x=x, y=y) @@ -175,8 +175,8 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] """, [ - lp.GlobalArg('f, e, h, g'), '...'], - name='linear_combo') + lp.GlobalArg("f, e, h, g"), ...], + name="linear_combo") caller_knl = lp.make_kernel( "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, @@ -191,7 +191,7 @@ def test_register_knl_with_call_with_kwargs(ctx_factory, inline): knl = lp.register_callable_kernel( caller_knl, callee_knl) if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') + knl = lp.inline_callable_kernel(knl, "linear_combo") evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) @@ -222,7 +222,7 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): "{[i, j]:0<=i, j < 32}", """ g[i, j] = 2*e[i, j] + 3*f[i, j] - """, name='linear_combo') + """, name="linear_combo") callee_knl = lp.split_iname(callee_knl, "i", 2, inner_tag="l.0", outer_tag="g.0") @@ -238,12 +238,12 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): knl = lp.register_callable_kernel( caller_knl, callee_knl) - knl = lp.set_options(knl, 'return_dict') + knl = lp.set_options(knl, "return_dict") gsize, lsize = knl.get_grid_size_upper_bounds_as_exprs() if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') + knl = lp.inline_callable_kernel(knl, "linear_combo") evt, out = knl(queue, x=x_dev, y=y_dev) @@ -252,7 +252,7 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): assert gsize == (16, 4) assert lsize == (2, 8) - assert 
np.linalg.norm(2*x_host+3*y_host-out['z'].get())/np.linalg.norm( + assert np.linalg.norm(2*x_host+3*y_host-out["z"].get())/np.linalg.norm( 2*x_host+3*y_host) < 1e-15 @@ -296,17 +296,17 @@ def test_shape_translation_through_sub_array_ref(ctx_factory, inline): knl = lp.register_callable_kernel(knl, callee3) if inline: - knl = lp.inline_callable_kernel(knl, 'callee_fn1') - knl = lp.inline_callable_kernel(knl, 'callee_fn2') - knl = lp.inline_callable_kernel(knl, 'callee_fn3') + knl = lp.inline_callable_kernel(knl, "callee_fn1") + knl = lp.inline_callable_kernel(knl, "callee_fn2") + knl = lp.inline_callable_kernel(knl, "callee_fn3") knl = lp.set_options(knl, "write_cl") knl = lp.set_options(knl, "return_dict") evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) - y1 = out_dict['y1'].get() - y2 = out_dict['y2'].get() - y3 = out_dict['y3'].get() + y1 = out_dict["y1"].get() + y2 = out_dict["y2"].get() + y3 = out_dict["y3"].get() assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 @@ -353,8 +353,8 @@ def test_multi_arg_array_call(ctx_factory): evt, out_dict = knl(queue, b=b) tol = 1e-15 from numpy.linalg import norm - assert(norm(out_dict['min_val'][0] - np.min(b)) < tol) - assert(norm(out_dict['min_index'][0] - np.argmin(b)) < tol) + assert(norm(out_dict["min_val"][0] - np.min(b)) < tol) + assert(norm(out_dict["min_index"][0] - np.argmin(b)) < tol) @pytest.mark.parametrize("inline", [False, True]) @@ -387,19 +387,19 @@ def test_packing_unpacking(ctx_factory, inline): knl = lp.register_callable_kernel(knl, callee1) knl = lp.register_callable_kernel(knl, callee2) - knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn1') - knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn2') + knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn1") + knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn2") if inline: - knl = lp.inline_callable_kernel(knl, 'callee_fn1') - knl = lp.inline_callable_kernel(knl, 'callee_fn2') + knl = lp.inline_callable_kernel(knl, "callee_fn1") + knl = lp.inline_callable_kernel(knl, "callee_fn2") knl = lp.set_options(knl, "write_cl") knl = lp.set_options(knl, "return_dict") evt, out_dict = knl(queue, x1=x1, x2=x2) - y1 = out_dict['y1'].get() - y2 = out_dict['y2'].get() + y1 = out_dict["y1"].get() + y2 = out_dict["y2"].get() assert np.linalg.norm(2*x1.get()-y1)/np.linalg.norm( 2*x1.get()) < 1e-15 @@ -425,7 +425,7 @@ def test_non_sub_array_refs_arguments(ctx_factory): caller3 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], kappa)", [lp.GlobalArg("a", dtype="double", shape=(6, ), - is_output_only=False), '...'], + is_output_only=False), ...], name="caller", target=lp.CTarget()) registered = lp.register_callable_kernel(caller1, callee) @@ -461,13 +461,13 @@ def test_empty_sub_array_refs(ctx_factory, inline): """ a[d] = b[d] - c[d] - """, name='wence_function') + """, name="wence_function") caller = lp.make_kernel("{[i]: 0<=i<10}", """ []:z[i] = wence_function([]:x[i], []:y[i]) """, - [lp.GlobalArg('x, y', dtype=np.float64, shape=(10, )), '...']) + [lp.GlobalArg("x, y", dtype=np.float64, shape=(10, )), ...]) caller = lp.register_callable_kernel(caller, callee) @@ -500,23 +500,23 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline): """, kernel_data=[ lp.GlobalArg( - name='x', + name="x", dtype=np.float64, shape=(16, 16)), lp.GlobalArg( - name='y', + name="y", dtype=np.float64, shape=(16, 16)), lp.GlobalArg( - name='z', + name="z", dtype=np.float64, - shape=(16, 16)), '...'], + shape=(16, 16)), ...], ) knl = 
lp.register_callable_kernel( parent_knl, child_knl) if inline: - knl = lp.inline_callable_kernel(knl, 'linear_combo') + knl = lp.inline_callable_kernel(knl, "linear_combo") evt, (out, ) = knl(queue, x=x, y=y) @@ -529,16 +529,16 @@ def test_stride_depending_on_args(): "{[i, j]: 0<=i, j < n}", """ b[i, j] = 2*a[i, j] - """, [lp.ValueArg('n'), lp.GlobalArg('a'), lp.GlobalArg('b')], - name='twice') + """, [lp.ValueArg("n"), lp.GlobalArg("a"), lp.GlobalArg("b")], + name="twice") thrice = lp.make_function( "{[i, j]: 0<=i, j < n}", """ b[i, j] = 3*a[i, j] - """, [lp.ValueArg('n'), lp.GlobalArg('a', shape=lp.auto), - lp.GlobalArg('b', shape=lp.auto)], - name='thrice') + """, [lp.ValueArg("n"), lp.GlobalArg("a", shape=lp.auto), + lp.GlobalArg("b", shape=lp.auto)], + name="thrice") prog = lp.make_kernel( "{[i0,i1,i2,i3,i4,i5,i6,i7]: 0<=i0, i1, i2, i3, i4, i5, i6, i7< N}", @@ -546,8 +546,8 @@ def test_stride_depending_on_args(): [i0, i1]: y[i0, i1] = twice(N, [i2, i3]: x[2*i2, i3]) [i4, i5]: z[i4, i5] = thrice(N, [i6, i7]: x[2*i6+1, i7]) """, [ - lp.ValueArg('N', dtype=np.int32), lp.GlobalArg('x', - shape=lp.auto, dtype=np.float64), '...']) + lp.ValueArg("N", dtype=np.int32), lp.GlobalArg("x", + shape=lp.auto, dtype=np.float64), ...]) prog = lp.register_callable_kernel(prog, twice) prog = lp.register_callable_kernel(prog, thrice) @@ -561,17 +561,17 @@ def test_unknown_stride_to_callee(): "{[i, j]: 0<=i, j < n}", """ b[i, j] = 2*a[i, j] - """, [lp.ValueArg('n'), lp.GlobalArg('a'), lp.GlobalArg('b')], - name='twice') + """, [lp.ValueArg("n"), lp.GlobalArg("a"), lp.GlobalArg("b")], + name="twice") prog = lp.make_kernel( "{[i,i0,i1,i2,i3]: 0<=i0, i1, i2, i3< N and 0<=i a[j] = j {inames=i:j} - """) - - prog = lp.save_temporaries_in_loop(prog, 'i', ['a']) - assert prog.root_kernel.temporary_variables['a'].shape == (4, 4) - - def test_add_barrier(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) diff --git a/test/testlib.py b/test/testlib.py index 2d2a535fb..5f9b68893 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -149,7 +149,7 @@ class Log2Callable(lp.ScalarCallable): dtype = arg_id_to_dtype[0].numpy_dtype - if dtype.kind in ('u', 'i'): + if dtype.kind in ("u", "i"): # ints and unsigned casted to float32 dtype = np.float32 @@ -171,8 +171,8 @@ class Log2Callable(lp.ScalarCallable): def register_log2_lookup(target, identifier): - if identifier == 'log2': - return Log2Callable(name='log2') + if identifier == "log2": + return Log2Callable(name="log2") return None # }}} -- GitLab From d263461eb182be2eab8d30de38ec24596fd18f8f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Nov 2020 15:02:57 -0600 Subject: [PATCH 691/916] merge leftovers --- loopy/__init__.py | 5 +---- test/test_target.py | 26 -------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index a9251da3b..0b8382bba 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -146,12 +146,11 @@ from loopy.target import TargetBase, ASTBuilderBase from loopy.target.c import CFamilyTarget, CTarget, ExecutableCTarget, generate_header from loopy.target.cuda import CudaTarget from loopy.target.opencl import OpenCLTarget -from loopy.target.pyopencl import PyOpenCLTarget, NvidiaPyOpenCLTarget +from loopy.target.pyopencl import PyOpenCLTarget from loopy.target.ispc import ISPCTarget from loopy.target.numba import NumbaTarget, NumbaCudaTarget from loopy.tools import Optional -from loopy.tools import dump_as_python __all__ = [ @@ -236,8 +235,6 @@ __all__ = [ 
"add_barrier", - "dump_as_python", - "register_callable_kernel", "register_function_id_to_in_knl_callable_mapper", "inline_callable_kernel", diff --git a/test/test_target.py b/test/test_target.py index e5b743d37..505a6b70b 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -368,32 +368,6 @@ def test_cuda_short_vector(): print(lp.generate_code_v2(knl).device_code()) -def test_nvidia_pyopencl_target(ctx_factory): - ctx = ctx_factory() - if ctx.devices[0].vendor != "NVIDIA Corporation": - # do not test for non-Nvidia devices - return - - queue = cl.CommandQueue(ctx) - a = np.random.randn(16) - - knl = lp.make_kernel( - "{[i]: 0<=i<16}", - """ - res[0] = res[0] + a[i] {id=update, atomic} - """, - [ - lp.GlobalArg("res", for_atomic=True), - lp.GlobalArg("a", for_atomic=True, dtype=np.float64), - "..."]) - - knl = lp.split_iname(knl, "i", 4, inner_tag="l.0", outer_tag="g.0") - knl = knl.copy(target=lp.NvidiaPyOpenCLTarget(ctx.devices[0])) - - evt, (out, ) = knl(queue, a=a) - assert np.isclose(out, a.sum()) - - def test_pyopencl_execution_numpy_handling(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From 8cc0b6bd8d99c812198b3c43b6df498bb029762d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Nov 2020 16:14:39 -0600 Subject: [PATCH 692/916] merge leftover: account for changes to InstructionBase.with_transformed_expressions --- loopy/preprocess.py | 15 ++++++++------- loopy/program.py | 10 +++++++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 365c30d7f..ab813953a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -44,6 +44,7 @@ from loopy.program import Program, iterate_over_kernels_if_given_program from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pytools import ProcessLogger +from functools import partial # {{{ prepare for caching @@ -2054,7 +2055,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): self.caller_kernel = caller_kernel self.callables_table = callables_table - def map_call(self, expr, expn_state, **kwargs): + def map_call(self, expr, expn_state, assignees=None): from pymbolic.primitives import Call, CallWithKwargs from loopy.symbolic import ResolvedFunction @@ -2066,9 +2067,8 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): if isinstance(expr, CallWithKwargs): arg_id_to_val.update(expr.kw_parameters) - if "assignees" in kwargs: + if assignees is not None: # If supplied with assignees then this is a CallInstruction - assignees = kwargs["assignees"] for i, arg in enumerate(assignees): arg_id_to_val[-i-1] = arg @@ -2117,11 +2117,12 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): if isinstance(insn, CallInstruction): # In call instructions the assignees play an important in # determining the arg_id_to_descr - new_insns.append(insn.with_transformed_expressions( - self, kernel, insn, assignees=insn.assignees)) + mapper = partial(self, kernel=kernel, insn=insn, + assignees=insn.assignees) + new_insns.append(insn.with_transformed_expressions(mapper)) elif isinstance(insn, MultiAssignmentBase): - new_insns.append(insn.with_transformed_expressions( - self, kernel, insn)) + mapper = partial(self, kernel=kernel, insn=insn) + new_insns.append(insn.with_transformed_expressions(mapper)) elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): new_insns.append(insn) else: diff --git a/loopy/program.py b/loopy/program.py index 7224a7bbe..dffce5d86 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ 
-965,7 +965,15 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): ``transform`` being implemented on all of the callable kernels in a :class:`loopy.Program`. """ - def _collective_transform(program_or_kernel, *args, **kwargs): + def _collective_transform(*args, **kwargs): + if "program" in kwargs: + program_or_kernel = kwargs.pop("program") + elif "kernel" in kwargs: + program_or_kernel = kwargs.pop("kernel") + else: + program_or_kernel = args[0] + args = args[1:] + if isinstance(program_or_kernel, Program): program = program_or_kernel new_resolved_functions = {} -- GitLab From 9e8aa01ceb0228cfb8d71cdd649d2ac677ec9e97 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 2 Nov 2020 16:20:11 -0600 Subject: [PATCH 693/916] query root_kernel's state --- loopy/auto_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index dfcfe2a2f..6b9b27294 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -525,7 +525,7 @@ def auto_test_vs_ref( from loopy.kernel import KernelState from loopy.target.pyopencl import PyOpenCLTarget - if test_prog.state not in [ + if test_prog.root_kernel.state not in [ KernelState.PREPROCESSED, KernelState.LINEARIZED]: if isinstance(test_prog.target, PyOpenCLTarget): -- GitLab From 1f0b750b25d668d251bfab812c760eeda7769699 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 3 Nov 2020 00:37:30 -0600 Subject: [PATCH 694/916] run pyupgrade --py36-plus --- loopy/codegen/result.py | 4 ++-- loopy/kernel/creation.py | 10 +++++----- loopy/kernel/data.py | 2 +- loopy/kernel/tools.py | 4 ++-- loopy/library/random123.py | 2 +- loopy/program.py | 10 +++++----- loopy/target/c/__init__.py | 4 ++-- loopy/target/cuda.py | 2 +- loopy/target/opencl.py | 6 +++--- loopy/target/pyopencl.py | 2 +- loopy/target/python.py | 2 +- loopy/transform/buffer.py | 4 ++-- loopy/transform/padding.py | 2 +- loopy/transform/precompute.py | 4 ++-- loopy/transform/subst.py | 2 +- loopy/type_inference.py | 4 ++-- 16 files changed, 32 insertions(+), 32 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 2cc8197e6..0ffd117d1 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -136,7 +136,7 @@ class CodeGenerationResult(ImmutableRecord): "".join(preamble_codes) + "\n" + "\n\n".join(str(hp.ast) for hp in - six.itervalues(self.host_programs))) + self.host_programs.values())) def device_code(self): preamble_codes = process_preambles(getattr(self, "device_preambles", [])) @@ -159,7 +159,7 @@ class CodeGenerationResult(ImmutableRecord): + "\n\n".join(str(dp.ast) for dp in self.device_programs) + "\n\n" + "\n\n".join(str(hp.ast) for hp in - six.itervalues(self.host_programs))) + self.host_programs.values())) def current_program(self, codegen_state): if codegen_state.is_generating_device_code: diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 4e4f25596..337ac67e5 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1985,7 +1985,7 @@ class SliceToInameReplacer(IdentityMapper): set=list(sar_bounds.keys())) from loopy.symbolic import DependencyMapper args_as_params_for_domains = set() - for _, (start, stop, step) in six.iteritems(sar_bounds): + for _, (start, stop, step) in sar_bounds.items(): args_as_params_for_domains |= DependencyMapper()(start) args_as_params_for_domains |= DependencyMapper()(stop) args_as_params_for_domains |= DependencyMapper()(step) @@ -1997,7 +1997,7 @@ class SliceToInameReplacer(IdentityMapper): iname_set = 
isl.BasicSet.universe(space) from loopy.isl_helpers import make_slab - for iname, (start, stop, step) in six.iteritems(sar_bounds): + for iname, (start, stop, step) in sar_bounds.items(): iname_set = iname_set & make_slab(space, iname, start, stop, step) subarray_ref_domains.append(iname_set) @@ -2395,9 +2395,9 @@ def make_kernel(*args, **kwargs): from loopy.version import LANGUAGE_VERSION_SYMBOLS - version_to_symbol = dict( - (getattr(loopy.version, lvs), lvs) - for lvs in LANGUAGE_VERSION_SYMBOLS) + version_to_symbol = { + getattr(loopy.version, lvs): lvs + for lvs in LANGUAGE_VERSION_SYMBOLS} lang_version = kwargs.get("lang_version", None) if lang_version is None: diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index b41e4b574..3e95fcb66 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -453,7 +453,7 @@ class ConstantArg(ArrayBase, KernelArgument): def __init__(self, *args, **kwargs): if kwargs.pop('address_space', AddressSpace.GLOBAL) != AddressSpace.GLOBAL: raise LoopyError("'address_space' for ConstantArg must be GLOBAL.") - super(ConstantArg, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) # Constant Arg cannot be an output is_output = False diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 53627db45..95bcbbba0 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -53,7 +53,7 @@ def add_dtypes(prog_or_kernel, dtype_dict): """ if isinstance(prog_or_kernel, Program): kernel_names = [clbl.subkernel.name for clbl in - six.itervalues(prog_or_kernel.callables_table) if isinstance(clbl, + prog_or_kernel.callables_table.values() if isinstance(clbl, CallableKernel)] if len(kernel_names) != 1: raise LoopyError("add_dtypes may not take a Program with more than" @@ -131,7 +131,7 @@ def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False, assert isinstance(prog, Program) if kernel_name is None: kernel_names = [clbl.subkernel.name for clbl in - six.itervalues(prog.callables_table) if isinstance(clbl, + prog.callables_table.values() if isinstance(clbl, CallableKernel)] if len(kernel_names) != 1: raise LoopyError("Provide 'kernel_name' argument.") diff --git a/loopy/library/random123.py b/loopy/library/random123.py index ea7824300..c2e64fc55 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -231,6 +231,6 @@ class Random123Callable(ScalarCallable): def get_random123_callables(): - return dict((id_, Random123Callable(id_)) for id_ in FUNC_NAMES_TO_RNG) + return {id_: Random123Callable(id_) for id_ in FUNC_NAMES_TO_RNG} # vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py index bb4c0ba32..e2a003c64 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -209,7 +209,7 @@ class Program(ImmutableRecord): def copy(self, **kwargs): target = kwargs.pop("target", None) - program = super(Program, self).copy(**kwargs) + program = super().copy(**kwargs) if target: from loopy.kernel import KernelState if max(callable_knl.subkernel.state for callable_knl in @@ -276,8 +276,8 @@ class Program(ImmutableRecord): known_callables.update(self.target.get_device_ast_builder().known_callables) known_callables.update(get_loopy_callables()) # update the known callables from the target. 
- callables_table = dict((e, self.callables_table[e]) for e in - self.entrypoints) + callables_table = {e: self.callables_table[e] for e in + self.entrypoints} # start a traversal to collect all the callables queue = list(self.entrypoints) @@ -321,7 +321,7 @@ class Program(ImmutableRecord): return lambda *args, **kwargs: self(*args, entrypoint=attr, **kwargs) - return super(Program, self).__getattr__(attr) + return super().__getattr__(attr) def __call__(self, *args, **kwargs): entrypoint = kwargs.get("entrypoint", None) @@ -500,7 +500,7 @@ class CallablesInferenceContext(ImmutableRecord): def __init__(self, callables, old_callable_ids, history={}): assert isinstance(callables, dict) - super(CallablesInferenceContext, self).__init__( + super().__init__( callables=callables, old_callable_ids=old_callable_ids, history=history) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index a08ca447c..fdc46570d 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -531,7 +531,7 @@ def get_c_callables(): "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", "fabs", "tan", "erf", "erfc"] - return dict((id_, CMathCallable(id_)) for id_ in cmath_ids) + return {id_: CMathCallable(id_) for id_ in cmath_ids} # }}} @@ -1132,7 +1132,7 @@ class ExecutableCTarget(CTarget): An executable CFamilyTarget that uses (by default) JIT compilation of C-code """ def __init__(self, compiler=None, fortran_abi=False): - super(ExecutableCTarget, self).__init__(fortran_abi=fortran_abi) + super().__init__(fortran_abi=fortran_abi) from loopy.target.c.c_execution import CCompiler self.compiler = compiler or CCompiler() diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index d84cc01bd..7aff36118 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -185,7 +185,7 @@ class CudaCallable(ScalarCallable): def get_cuda_callables(): cuda_func_ids = {"dot"} | set(_CUDA_SPECIFIC_FUNCTIONS) - return dict((id_, CudaCallable(name=id_)) for id_ in cuda_func_ids) + return {id_: CudaCallable(name=id_) for id_ in cuda_func_ids} # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 33c32d48b..5008c0146 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -191,7 +191,7 @@ class OpenCLCallable(ScalarCallable): if common_dtype.kind == "f": name = "f"+name - target = [dtype.target for dtype in six.itervalues(arg_id_to_dtype) + target = [dtype.target for dtype in arg_id_to_dtype.values() if (id >= 0 and dtype is not None)][0] dtype = NumpyType(common_dtype, target) return ( @@ -295,8 +295,8 @@ def get_opencl_callables(): opencl_function_ids = {"max", "min", "dot"} | set( _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) - return dict((id_, OpenCLCallable(name=id_)) for id_ in - opencl_function_ids) + return {id_: OpenCLCallable(name=id_) for id_ in + opencl_function_ids} # }}} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 90b73f801..59b90ef90 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -271,7 +271,7 @@ class PyOpenCLCallable(ScalarCallable): def get_pyopencl_callables(): pyopencl_ids = ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", "tanh", "conj", "real", "imag", "abs"] - return dict((id_, PyOpenCLCallable(name=id_)) for id_ in pyopencl_ids) + return {id_: PyOpenCLCallable(name=id_) for id_ in pyopencl_ids} # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index ef4a9f364..8162dbb8f 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -169,7 +169,7 @@ 
class PythonASTBuilderBase(ASTBuilderBase): @property def known_callables(self): from loopy.target.c import get_c_callables - callables = super(PythonASTBuilderBase, self).known_callables + callables = super().known_callables callables.update(get_c_callables()) return callables diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 0f09f98f1..787517468 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -173,7 +173,7 @@ def buffer_array_for_single_kernel(kernel, callables_table, var_name, if isinstance(kernel, Program): kernel_names = [i for i, clbl in - six.iteritems(kernel.callables_table) if isinstance(clbl, + kernel.callables_table.items() if isinstance(clbl, CallableKernel)] if len(kernel_names) != 1: raise LoopyError() @@ -560,7 +560,7 @@ def buffer_array(program, *args, **kwargs): new_callables = {} - for func_id, clbl in six.iteritems(program.callables_table): + for func_id, clbl in program.callables_table.items(): if isinstance(clbl, CallableKernel): clbl = clbl.copy( subkernel=buffer_array_for_single_kernel(clbl.subkernel, diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index ec4017a1f..455ce31d0 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -410,7 +410,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1): if isinstance(kernel, Program): - kernel_names = [i for i, clbl in six.iteritems(kernel.callables_table) + kernel_names = [i for i, clbl in kernel.callables_table.items() if isinstance(clbl, CallableKernel)] if len(kernel_names) > 1: raise LoopyError() diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index dc8ef6c26..438c07339 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -356,7 +356,7 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, """ if isinstance(kernel, Program): kernel_names = [i for i, clbl in - six.iteritems(kernel.callables_table) if isinstance(clbl, + kernel.callables_table.items() if isinstance(clbl, CallableKernel)] if len(kernel_names) != 1: raise LoopyError() @@ -1060,7 +1060,7 @@ def precompute(program, *args, **kwargs): assert isinstance(program, Program) new_callables = {} - for func_id, clbl in six.iteritems(program.callables_table): + for func_id, clbl in program.callables_table.items(): if isinstance(clbl, CallableKernel): knl = precompute_for_single_kernel(clbl.subkernel, program.callables_table, *args, **kwargs) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 9a316326d..066cf326c 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -55,7 +55,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): if isinstance(kernel, Program): kernel_names = [i for i, clbl in - six.iteritems(kernel.callables_table) if isinstance(clbl, + kernel.callables_table.items() if isinstance(clbl, CallableKernel)] if len(kernel_names) != 1: raise LoopyError() diff --git a/loopy/type_inference.py b/loopy/type_inference.py index fb65a6559..4c3a3b224 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1127,8 +1127,8 @@ def infer_unknown_types(program, expect_completion=False): for e in program.entrypoints: # FIXME: Need to add docs which say that we need not add the current # callable to the clbl_inf_ctx while writing the "with_types" - arg_id_to_dtype = dict((arg.name, arg.dtype) for arg in - program[e].args if arg.dtype not in (None, auto)) + 
arg_id_to_dtype = {arg.name: arg.dtype for arg in + program[e].args if arg.dtype not in (None, auto)} new_callable, clbl_inf_ctx = program.callables_table[e].with_types( arg_id_to_dtype, None, clbl_inf_ctx) clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) -- GitLab From 443143a82b3460bf6df75e7c722085090fffb3ed Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 3 Nov 2020 00:40:19 -0600 Subject: [PATCH 695/916] formatting: remove bad quotes --- loopy/kernel/creation.py | 2 +- loopy/kernel/data.py | 2 +- loopy/target/opencl.py | 1 - loopy/transform/buffer.py | 2 -- test/test_callables.py | 20 ++++++++++---------- 5 files changed, 12 insertions(+), 15 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 337ac67e5..0b757593d 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2436,7 +2436,7 @@ def make_kernel(*args, **kwargs): lang_version = FALLBACK_LANGUAGE_VERSION - kwargs['lang_version'] = lang_version + kwargs["lang_version"] = lang_version # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 3e95fcb66..b4e783f86 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -451,7 +451,7 @@ class ConstantArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ def __init__(self, *args, **kwargs): - if kwargs.pop('address_space', AddressSpace.GLOBAL) != AddressSpace.GLOBAL: + if kwargs.pop("address_space", AddressSpace.GLOBAL) != AddressSpace.GLOBAL: raise LoopyError("'address_space' for ConstantArg must be GLOBAL.") super().__init__(*args, **kwargs) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 5008c0146..3aa23cd4a 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -24,7 +24,6 @@ THE SOFTWARE. """ import numpy as np -import six from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 787517468..e8c4bc2e9 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -20,8 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -import six - from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) from loopy.symbolic import (get_dependencies, diff --git a/test/test_callables.py b/test/test_callables.py index 1c521821f..d7a808047 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -49,7 +49,7 @@ def test_register_function_lookup(ctx_factory): """ y[i] = log2(x[i]) """) - prog = lp.register_callable(prog, 'log2', Log2Callable('log2')) + prog = lp.register_callable(prog, "log2", Log2Callable("log2")) evt, (out, ) = prog(queue, x=x) @@ -216,14 +216,14 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): """ [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]) - """, name='caller') + """, name="caller") caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") knl = lp.merge([caller_knl, callee_knl]) knl = lp.set_options(knl, "return_dict") - gsize, lsize = knl['caller'].get_grid_size_upper_bounds_as_exprs( + gsize, lsize = knl["caller"].get_grid_size_upper_bounds_as_exprs( knl.callables_table) if inline: @@ -419,20 +419,20 @@ def test_non_sub_array_refs_arguments(ctx_factory): name="caller", target=lp.CTarget()) registered = lp.merge([caller1, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, 'callee') - inlined = lp.inline_callable_kernel(inlined, 'callee') + inlined = _match_caller_callee_argument_dimension_(registered, "callee") + inlined = lp.inline_callable_kernel(inlined, "callee") print(inlined) registered = lp.merge([caller2, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, 'callee') - inlined = lp.inline_callable_kernel(inlined, 'callee') + inlined = _match_caller_callee_argument_dimension_(registered, "callee") + inlined = lp.inline_callable_kernel(inlined, "callee") print(inlined) registered = lp.merge([caller3, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, 'callee') - inlined = lp.inline_callable_kernel(inlined, 'callee') + inlined = _match_caller_callee_argument_dimension_(registered, "callee") + inlined = lp.inline_callable_kernel(inlined, "callee") print(inlined) @@ -461,7 +461,7 @@ def test_empty_sub_array_refs(ctx_factory, inline): caller = lp.merge([caller, callee]) if inline: - caller = lp.inline_callable_kernel(caller, 'wence_function') + caller = lp.inline_callable_kernel(caller, "wence_function") evt, (out, ) = caller(queue, x=x, y=y) assert np.allclose(out, x-y) -- GitLab From 48daa6f1835b970dc2dfc30a709f0e97b159325c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 3 Nov 2020 01:49:58 -0600 Subject: [PATCH 696/916] fixes bugs accumulated during merge --- loopy/auto_test.py | 5 +++-- loopy/preprocess.py | 4 ++-- loopy/program.py | 2 +- loopy/transform/save.py | 10 +++++----- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 87d660fe2..91ef62d78 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -449,7 +449,8 @@ def auto_test_vs_ref( ref_errors = [] from loopy.kernel.data import ImageArg - need_ref_image_support = any(isinstance(arg, ImageArg) for arg in ref_prog.args) + need_ref_image_support = any(isinstance(arg, ImageArg) + for arg in ref_prog[ref_entrypoint].args) for dev in _enumerate_cl_devices_for_ref_test( blacklist_ref_vendors, need_ref_image_support): @@ -538,7 +539,7 @@ def auto_test_vs_ref( from loopy.kernel import KernelState from loopy.target.pyopencl import PyOpenCLTarget - if 
test_prog.root_kernel.state not in [ + if test_prog[test_entrypoint].state not in [ KernelState.PREPROCESSED, KernelState.LINEARIZED]: if isinstance(test_prog.target, PyOpenCLTarget): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index b4baf5877..d732d2696 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -36,7 +36,7 @@ from loopy.kernel.data import make_assignment, filter_iname_tags_by_type # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper -from loopy.transform.iname import remove_any_newly_unused_inames +# from loopy.transform.iname import remove_any_newly_unused_inames from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) @@ -901,7 +901,7 @@ class RealizeReductionCallbackMapper(ReductionCallbackMapper): return result -@remove_any_newly_unused_inames +# @remove_any_newly_unused_inames def realize_reduction_for_single_kernel(kernel, callables_table, insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, force_scan=False, force_outer_iname_for_scan=None): diff --git a/loopy/program.py b/loopy/program.py index e2a003c64..aefec0366 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -361,7 +361,7 @@ class Program(ImmutableRecord): return "\n".join( strify_callable(clbl) - for name, clbl in self.callables_table).items() + for name, clbl in self.callables_table.items()) def __setstate__(self, state_obj): super().__setstate__(state_obj) diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 515a2e3b5..884e17f77 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -763,7 +763,7 @@ def save_and_reload_temporaries(program, entrypoint=None): from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) - for sched_idx, sched_item in enumerate(program.root_kernel.schedule): + for sched_idx, sched_item in enumerate(knl.schedule): if isinstance(sched_item, CallKernel): # Any written temporary that is live-out needs to be read into @@ -774,8 +774,8 @@ def save_and_reload_temporaries(program, entrypoint=None): else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_read_in_subkernel(program.root_kernel, subkernel) - | temporaries_written_in_subkernel(program.root_kernel, + temporaries_read_in_subkernel(knl, subkernel) + | temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: @@ -784,13 +784,13 @@ def save_and_reload_temporaries(program, entrypoint=None): saver.reload(temporary, sched_item.kernel_name) elif isinstance(sched_item, ReturnFromKernel): - if sched_idx == len(program.root_kernel.schedule) - 1: + if sched_idx == len(knl.schedule) - 1: # Kernel exit: nothing live interesting_temporaries = set() else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_written_in_subkernel(program.root_kernel, subkernel)) + temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: logger.info("saving {} before return of {}" -- GitLab From a5b564b7a627daf09de0003520255b15d3ca0117 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 3 Nov 2020 02:14:00 -0600 Subject: [PATCH 697/916] fix typo --- loopy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 
0b8382bba..1aa3a8907 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -282,7 +282,7 @@ __all__ = [ "TargetBase", "CFamilyTarget", "CTarget", "ExecutableCTarget", "generate_header", "CudaTarget", "OpenCLTarget", - "PyOpenCLTarget", "NvidiaPyOpenCLTarget", "ISPCTarget", + "PyOpenCLTarget", "ISPCTarget", "NumbaTarget", "NumbaCudaTarget", "ASTBuilderBase", -- GitLab From 5857a2714ce751400918dd43d58c7bba32fa92e3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 3 Nov 2020 02:21:25 -0600 Subject: [PATCH 698/916] ignore_boostable_into was dropped --- loopy/statistics.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index a1c86d88b..fcfa31ae4 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1636,10 +1636,6 @@ def _get_op_map_for_single_kernel(knl, callables_table, count_redundant_work, count_within_subscripts, subgroup_size): - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - subgroup_size = _process_subgroup_size(knl, subgroup_size) kernel_rec = partial(_get_op_map_for_single_kernel, @@ -1816,10 +1812,6 @@ def _process_subgroup_size(knl, subgroup_size_requested): def _get_mem_access_map_for_single_kernel(knl, callables_table, count_redundant_work, subgroup_size): - if not knl.options.ignore_boostable_into: - raise LoopyError("Kernel '%s': Using operation counting requires the option " - "ignore_boostable_into to be set." % knl.name) - subgroup_size = _process_subgroup_size(knl, subgroup_size) kernel_rec = partial(_get_mem_access_map_for_single_kernel, -- GitLab From 8c387ba40efc77b0ed1640d6c1aec3f9acd60279 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 3 Nov 2020 08:57:53 -0600 Subject: [PATCH 699/916] fixes to fuse_kernels --- loopy/transform/fusion.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 26a92eb3e..dc8e66787 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -331,15 +331,22 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + # FIXME: This should take in inputs as (prog1, knlname1) and (prog2, + # knlname2). if prog1 == prog2 then the callable names belong to the same + # namespace, otherwise the kernel names should be uniquified. + # We should also somehow be able to know that callables like "sin"/"cos" + # belong to the global namespace and need not be uniquified. 
if all(isinstance(kernel, Program) for kernel in kernels): new_kernels = [] for knl in kernels: kernel_names = [i for i, clbl in knl.callables_table.items() if isinstance(clbl, CallableKernel)] - if len(kernel_names) != 1: - raise LoopyError() - new_kernels.append(knl[kernel_names[0]]) + if len(kernel_names) != 1: + raise NotImplementedError("Kernel containing more than one" + " callable kernel, not allowed for now.") + new_kernels.append(knl[kernel_names[0]]) + kernels = new_kernels[:] assert all(isinstance(knl, LoopKernel) for knl in kernels) -- GitLab From 3dbff55d6c8744d44842f27e9195b254567f74f5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Nov 2020 18:17:11 -0600 Subject: [PATCH 700/916] each callee can (and should) have its own lang_version --- loopy/kernel/creation.py | 49 ---------------------------------------- 1 file changed, 49 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 0b757593d..e89599528 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2391,55 +2391,6 @@ def make_function(domains, instructions, kernel_data=["..."], **kwargs): def make_kernel(*args, **kwargs): - # {{{ handle kernel language version - - from loopy.version import LANGUAGE_VERSION_SYMBOLS - - version_to_symbol = { - getattr(loopy.version, lvs): lvs - for lvs in LANGUAGE_VERSION_SYMBOLS} - - lang_version = kwargs.get("lang_version", None) - if lang_version is None: - # {{{ peek into caller's module to look for LOOPY_KERNEL_LANGUAGE_VERSION - - # This *is* gross. But it seems like the right thing interface-wise. - import inspect - caller_globals = inspect.currentframe().f_back.f_globals - - for ver_sym in LANGUAGE_VERSION_SYMBOLS: - try: - lang_version = caller_globals[ver_sym] - break - except KeyError: - pass - - # }}} - - if lang_version is None: - from warnings import warn - from loopy.diagnostic import LoopyWarning - from loopy.version import ( - MOST_RECENT_LANGUAGE_VERSION, - FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " - "To avoid this warning, pass " - "lang_version={ver} in this invocation. 
" - "(Or say 'from loopy.version import " - "{sym_ver}' in " - "the global scope of the calling frame.)" - .format( - ver=MOST_RECENT_LANGUAGE_VERSION, - sym_ver=version_to_symbol[MOST_RECENT_LANGUAGE_VERSION] - ), - LoopyWarning, stacklevel=2) - - lang_version = FALLBACK_LANGUAGE_VERSION - - kwargs["lang_version"] = lang_version - - # }}} - tunit = make_function(*args, **kwargs) name, = [name for name in tunit.callables_table] return tunit.with_entrypoints(name) -- GitLab From a9456cdebf9c0c6dce89742e89cf0a808581717b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Nov 2020 18:17:34 -0600 Subject: [PATCH 701/916] fixes a bug in arg descr inference --- loopy/preprocess.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index d732d2696..f0bdd6264 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2138,6 +2138,18 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): map_call_with_kwargs = map_call + def __call__(self, expr, kernel, insn, assignees=None): + from loopy.kernel.data import InstructionBase + from loopy.symbolic import IdentityMapper, ExpansionState + assert insn is None or isinstance(insn, InstructionBase) + + return IdentityMapper.__call__(self, expr, + ExpansionState( + kernel=kernel, + instruction=insn, + stack=(), + arg_context={}), assignees=assignees) + def map_kernel(self, kernel): new_insns = [] -- GitLab From 64211e5d1715d5da49f96974558ac12014413e9a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Nov 2020 18:18:50 -0600 Subject: [PATCH 702/916] completes the implementation of Program.with_kernel --- loopy/program.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index aefec0366..eea875a26 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -257,13 +257,27 @@ class Program(ImmutableRecord): isinstance(callable_knl, CallableKernel)) def with_kernel(self, kernel): - # FIXME: Currently only replaces kernel. Should also work for adding. - # FIXME: Document - new_in_knl_callable = self.callables_table[kernel.name].copy( - subkernel=kernel) - new_callables = self.callables_table.copy() - new_callables[kernel.name] = new_in_knl_callable - return self.copy(callables_table=new_callables) + """ + If *self* contains a callable kernel with *kernel*'s name, replaces its + subkernel and returns a copy of *self*. Else records a new callable + kernel with *kernel* as its subkernel. + + :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`. + :returns: Copy of *self* with updated callable kernels. 
+ """ + if kernel.name in self.callables_table: + # update the callable kernel + new_in_knl_callable = self.callables_table[kernel.name].copy( + subkernel=kernel) + new_callables = self.callables_table.copy() + new_callables[kernel.name] = new_in_knl_callable + return self.copy(callables_table=new_callables) + else: + # add a new callable kernel + clbl = CallableKernel(kernel) + new_callables = self.callables_table.copy() + new_callables[kernel.name] = clbl + return self.copy(callables_table=new_callables) def with_resolved_callables(self): from loopy.library.function import get_loopy_callables -- GitLab From 1ce833d3f9b7f6d8a3db057e6cf16e69ef1f77fa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Nov 2020 18:23:08 -0600 Subject: [PATCH 703/916] RuleAwareIdentityMapper: no need for the base class to allow for *args, **kwargs --- loopy/symbolic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index a53a229de..e9226e487 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1195,7 +1195,11 @@ class RuleAwareIdentityMapper(IdentityMapper): else: return sym - def __call__(self, expr, kernel, insn, *args, **kwargs): + def __call__(self, expr, kernel, insn): + """ + :arg insn: A :class:`~loopy.kernel.InstructionBase` of which *expr* is + a part of, or *None* if *expr*'s source is not an instruction. + """ from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -1204,7 +1208,7 @@ class RuleAwareIdentityMapper(IdentityMapper): kernel=kernel, instruction=insn, stack=(), - arg_context={}), *args, **kwargs) + arg_context={})) def map_instruction(self, kernel, insn): return insn -- GitLab From 2625c85a554b918e40dc6f9bd12d2c4906735f94 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Nov 2020 18:27:26 -0600 Subject: [PATCH 704/916] better program printing; adds error msg --- loopy/program.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index eea875a26..4cd1158ae 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -340,6 +340,9 @@ class Program(ImmutableRecord): def __call__(self, *args, **kwargs): entrypoint = kwargs.get("entrypoint", None) + if self.entrypoints is None: + raise LoopyError("Cannot execute program with no entrypoints.") + if entrypoint is None: # did not receive an entrypoint for the program to execute if len(self.entrypoints) == 1: @@ -368,14 +371,12 @@ class Program(ImmutableRecord): # FIXME: do a topological sort by the call graph def strify_callable(clbl): - if isinstance(clbl, CallableKernel): - return str(clbl.subkernel) - else: - return str(clbl) + return str(clbl.subkernel) return "\n".join( strify_callable(clbl) - for name, clbl in self.callables_table.items()) + for name, clbl in self.callables_table.items() + if isinstance(clbl, CallableKernel)) def __setstate__(self, state_obj): super().__setstate__(state_obj) -- GitLab From aedc56408e52283514cb6b3843d0478be0fd25a8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Nov 2020 18:27:53 -0600 Subject: [PATCH 705/916] relax when we are forced to rename a given callable --- loopy/transform/callable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 207de3b9b..b8db48ebb 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -56,7 +56,8 @@ def register_callable(translation_unit, function_identifier, 
callable_, assert isinstance(callable_, InKernelCallable) if (function_identifier in translation_unit.callables_table) and ( - redefining_not_ok): + translation_unit.callables_table[function_identifier] != callable_ + and redefining_not_ok): raise LoopyError("Redifining function identifier not allowed. Set the" " option 'redefining_not_ok=False' to bypass this error.") -- GitLab From 4a3ca24e0d1dad4df848f8c08c329b0096e23ce0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Nov 2020 22:40:43 -0600 Subject: [PATCH 706/916] better callable resolver --- loopy/preprocess.py | 3 +- loopy/program.py | 186 ++++++++++++++++++------------------ loopy/target/execution.py | 4 +- loopy/transform/callable.py | 3 +- loopy/type_inference.py | 3 +- 5 files changed, 99 insertions(+), 100 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f0bdd6264..161a913ed 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2343,7 +2343,8 @@ def preprocess_program(program, device=None): if not program.entrypoints: raise LoopyError("Translation unit did not receive any entrypoints") - program = program.with_resolved_callables() + from loopy.program import resolve_callables + program = resolve_callables(program) if device is not None: # FIXME: Time to remove this? (Git blame shows 5 years ago) diff --git a/loopy/program.py b/loopy/program.py index 4cd1158ae..32e240ac5 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -72,71 +72,57 @@ def find_in_knl_callable_from_identifier( class CallableResolver(RuleAwareIdentityMapper): - #FIXME: Recheck this! """ - Mapper to convert the ``function`` attribute of a - :class:`pymbolic.primitives.Call` known in the kernel as instances of - :class:`loopy.symbolic.ResolvedFunction`. A function is known in the - *kernel*, :func:`loopy.kernel.LoopKernel.find_scoped_function_identifier` - returns an instance of - :class:`loopy.kernel.function_interface.InKernelCallable`. - - **Example:** If given an expression of the form ``sin(x) + unknown_function(y) + - log(z)``, then the mapper would return ``ResolvedFunction('sin')(x) + - unknown_function(y) + ResolvedFunction('log')(z)``. - - :arg rule_mapping_context: An instance of - :class:`loopy.symbolic.RuleMappingContext`. - :arg function_ids: A container with instances of :class:`str` indicating - the function identifiers to look for while scoping functions. + Resolves callables in expressions and records the names of the calls + resolved. + + .. attribute:: known_callables + + An instance of :class:`frozenset` of the call names to be resolved. + + .. attribute:: rule_mapping_context + + An instance of :class:`loopy.symbolic.RuleMappingContext`. 
""" def __init__(self, rule_mapping_context, known_callables): + assert isinstance(known_callables, frozenset) + super().__init__(rule_mapping_context) - self.resolved_functions = {} + self.known_callables = known_callables + # a record of the call names that were resolved + self.calls_resolved = set() + def map_call(self, expr, expn_state): from loopy.symbolic import parse_tagged_name - name, tag = parse_tagged_name(expr.function) - if name not in self.rule_mapping_context.old_subst_rules: - new_call_with_kwargs = self.rec(CallWithKwargs( - function=expr.function, parameters=expr.parameters, - kw_parameters={}), expn_state) - return Call(new_call_with_kwargs.function, - new_call_with_kwargs.parameters) - else: - return self.map_substitution(name, tag, expr.parameters, expn_state) + + if name in self.known_callables: + params = tuple(self.rec(par, expn_state) for par in expr.parameters) + + # record that we resolved a call + self.calls_resolved.add(name) + + return Call(ResolvedFunction(expr.function), params) + + return super().map_call(expr, expn_state) def map_call_with_kwargs(self, expr, expn_state): + from loopy.symbolic import parse_tagged_name + name, tag = parse_tagged_name(expr.function) - if not isinstance(expr.function, ResolvedFunction): - # FIXME: Do we need to care about ReductionOpFunctions over here? - in_knl_callable = self.known_callables.get(expr.function.name) - - if in_knl_callable: - if expr.function.name in self.resolved_functions: - assert self.resolved_functions[expr.function.name] == ( - in_knl_callable) - self.resolved_functions[expr.function.name] = in_knl_callable - return type(expr)( - ResolvedFunction(expr.function.name), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - { - key: self.rec(val, expn_state) - for key, val in expr.kw_parameters.items()} - ) - else: - # FIXME: Once function mangler is completely deprecated raise here. - # Oh function mangler I loathe you so much! - pass - else: - self.resolved_functions[expr.function.name] = ( - self.known_callables[expr.function.name]) + if name in self.known_callables: + params = tuple(self.rec(par, expn_state) for par in expr.parameters) + kw_params = {kw: self.rec(par, expn_state) + for kw, par in expr.kw_parameters.items()} + + # record that we resolved a call + self.calls_resolved.add(name) + + return CallWithKwargs(ResolvedFunction(expr.function), params, kw_params) - return super().map_call_with_kwargs(expr, - expn_state) + return super().map_call_with_kwargs(expr, expn_state) # {{{ program @@ -279,49 +265,6 @@ class Program(ImmutableRecord): new_callables[kernel.name] = clbl return self.copy(callables_table=new_callables) - def with_resolved_callables(self): - from loopy.library.function import get_loopy_callables - from loopy.kernel import KernelState - - if self.state >= KernelState.CALLS_RESOLVED: - return self - - known_callables = self.callables_table - known_callables.update(self.target.get_device_ast_builder().known_callables) - known_callables.update(get_loopy_callables()) - # update the known callables from the target. 
- callables_table = {e: self.callables_table[e] for e in - self.entrypoints} - - # start a traversal to collect all the callables - queue = list(self.entrypoints) - - while queue: - top = queue[0] - assert top in callables_table - queue = queue[1:] - - knl = callables_table[top].subkernel - rule_mapping_context = SubstitutionRuleMappingContext( - knl.substitutions, knl.get_var_name_generator()) - callables_collector = CallableResolver( - rule_mapping_context, - known_callables) - knl = rule_mapping_context.finish_kernel( - callables_collector.map_kernel(knl)) - knl = knl.copy(state=KernelState.CALLS_RESOLVED) - callables_table[top] = callables_table[top].copy(subkernel=knl) - - for func, clbl in callables_collector.resolved_functions.items(): - if func not in callables_table: - if isinstance(clbl, CallableKernel): - queue.append(func) - callables_table[func] = clbl - else: - assert callables_table[func] == clbl - - return self.copy(callables_table=callables_table) - def __getitem__(self, name): result = self.callables_table[name] if isinstance(result, CallableKernel): @@ -778,4 +721,57 @@ def update_table(callables_table, clbl_id, clbl): # }}} +def resolve_callables(program): + """ + Returns a :class:`Program` with known :class:`pymbolic.primitives.Call` + expression nodes converted to :class:`loopy.symbolic.ResolvedFunction`. + """ + from loopy.library.function import get_loopy_callables + from loopy.kernel import KernelState + + if program.state >= KernelState.CALLS_RESOLVED: + # program's callables have been resolved + return program + + # get registered callables + known_callables = program.callables_table.copy() + # get target specific callables + known_callables.update(program.target.get_device_ast_builder().known_callables) + # get loopy specific callables + known_callables.update(get_loopy_callables()) + + callables_table = {} + + # callables: name of the calls seen in the program + callables = set(program.entrypoints) + + while callables: + clbl_name = callables.pop() + clbl = known_callables[clbl_name] + + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator()) + clbl_resolver = CallableResolver(rule_mapping_context, + frozenset(known_callables)) + knl = rule_mapping_context.finish_kernel(clbl_resolver.map_kernel(knl)) + knl = knl.copy(state=KernelState.CALLS_RESOLVED) + + # add the updated callable kernel to the table + callables_table[clbl_name] = clbl.copy(subkernel=knl) + + # note the resolved callable for traversal + callables.update(clbl_resolver.calls_resolved - set(callables_table)) + elif isinstance(clbl, ScalarCallable): + # nothing to resolve within a scalar callable + callables_table[clbl_name] = clbl + pass + else: + raise NotImplementedError(f"{type(clbl)}") + + return program.copy(callables_table=callables_table) + + # vim: foldmethod=marker diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 36513ba10..1234d1e5a 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -740,9 +740,9 @@ class KernelExecutorBase: def get_typed_and_scheduled_program_uncached(self, entrypoint, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes from loopy.kernel import KernelState + from loopy.program import resolve_callables - program = self.program - program = program.with_resolved_callables() + program = resolve_callables(self.program) if arg_to_dtype_set: var_to_dtype = {} diff --git a/loopy/transform/callable.py 
b/loopy/transform/callable.py index b8db48ebb..94c416795 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -386,7 +386,8 @@ def inline_callable_kernel(program, function_name): (scoped) name *function_name* inlined. """ from loopy.preprocess import infer_arg_descr - program = program.with_resolved_callables() + from loopy.program import resolve_callables + program = resolve_callables(program) program = infer_arg_descr(program) callables_table = program.callables_table new_callables = {} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 4c3a3b224..1d0f0cc70 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1116,8 +1116,9 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx, def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" from loopy.kernel.data import auto + from loopy.program import resolve_callables - program = program.with_resolved_callables() + program = resolve_callables(program) clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, program.entrypoints) -- GitLab From 1d220c213e1b4326273ffa26b19851cc7bb7a3c1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Nov 2020 23:11:59 -0600 Subject: [PATCH 707/916] move check_functions_are_resolved to pre_schedule_checks --- loopy/check.py | 15 +++++++-------- loopy/type_inference.py | 9 --------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index fbec8c032..921d94ab5 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -127,12 +127,9 @@ class UnscopedCallCollector(CombineMapper): def check_functions_are_resolved(kernel): - """ Checks if all the calls in the instruction expression have been scoped, - otherwise indicates to what all calls we await signature. Refer - :class:`loopy.symbolic.ResolvedFunction` for a detailed explanation of a - scoped function. + """ Checks if all call nodes in the *kernel* expression have been + resolved. """ - from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) @@ -141,9 +138,9 @@ def check_functions_are_resolved(kernel): unscoped_calls = UnscopedCallCollector()(subst_expander( insn.expression)) if unscoped_calls: - raise LoopyError("Unknown function '%s' obtained -- register a " - "function or a kernel corresponding to it." % - set(unscoped_calls).pop()) + raise LoopyError("Unknown function '%s' -- register a " + "callable corresponding to it." % + set(unscoped_calls).pop()) elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): pass else: @@ -907,6 +904,8 @@ def pre_schedule_checks(kernel, callables_table): kernel.temporary_variables.values())): # only check if all types are known check_for_integer_subscript_indices(kernel, callables_table) + + check_functions_are_resolved(kernel) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel, callables_table) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 1d0f0cc70..ddfc5e746 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1101,15 +1101,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx, type_specialized_kernel = change_names_of_pymbolic_calls( pre_type_specialized_knl, old_calls_to_new_calls) - # the check is unnecessary as we would first get TypeInfereceFailure before - # encountering this. Move this at the start once ManglerCallable is - # deprecated. 
- if expect_completion: - # if completion is expected, then it is important that all the - # callables are scoped. - from loopy.check import check_functions_are_resolved - check_functions_are_resolved(type_specialized_kernel) - return type_specialized_kernel, clbl_inf_ctx -- GitLab From 2f0972a4aea9123de2fbbd343b88431366c725bd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 16 Nov 2020 23:12:26 -0600 Subject: [PATCH 708/916] prettier way of denoting ResolvedFunctions --- loopy/symbolic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index e9226e487..76e32ede2 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -287,7 +287,8 @@ class StringifyMapper(StringifyMapperBase): repr(expr.type), self.rec(expr.child, PREC_NONE)) def map_resolved_function(self, expr, prec): - return "Resolved(%s)" % expr.name + # underlining a resolved call + return "\u0332".join(expr.name) def map_sub_array_ref(self, expr, prec): return "[{inames}]: {subscr}".format( -- GitLab From e8df949c6353fd19e2887c5ba18b2f9f3e8d7557 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Nov 2020 00:17:25 -0600 Subject: [PATCH 709/916] adds test_incomplete_entrypoint_raises_type_inf_failure --- test/test_callables.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/test_callables.py b/test/test_callables.py index d7a808047..6a66d59b3 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -606,6 +606,31 @@ def test_non_zero_start_in_subarray_ref(ctx_factory): assert np.allclose(2*x, out) +def test_incomplete_entrypoint_raises_type_inf_failure(): + from loopy.diagnostic import LoopyError + + twice = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = 2*x[i] + """, name="dosify") + + quadr = lp.make_kernel( + "{:}", + """ + y[:] = dosify(x[:]) + y[:] = dosify(y[:]) + """, [lp.GlobalArg("x,y", shape=(10,))], name="cuatroify", + seq_dependencies=True) + + prog = lp.merge([quadr, twice]) + + with pytest.raises(LoopyError): + # 'twice' is also registered as an entrypoint but provided args aren't + # enough to infer the types + lp.generate_code_v2(prog) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From d9fee5386de5914cedfe0ca21ccffdd7e078ca9c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Nov 2020 01:29:09 -0600 Subject: [PATCH 710/916] merge translation units: better error msg --- loopy/transform/callable.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 94c416795..a0e7fc48f 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -68,10 +68,9 @@ def register_callable(translation_unit, function_identifier, callable_, callables_table=callables) -def merge(translation_units, collision_not_ok=True): +def merge(translation_units): """ :param translation_units: A list of :class:`loopy.Program`. - :param collision_not_ok: An instance of :class:`bool`. :returns: An instance of :class:`loopy.Program` which contains all the callables from each of the *translation_units. 
@@ -79,22 +78,29 @@ def merge(translation_units, collision_not_ok=True): for i in range(1, len(translation_units)): if translation_units[i].target != translation_units[i-1].target: - raise LoopyError("merge() should have" - " translation_units to be of the same target to be able to" - " fuse.") + raise LoopyError("translation units to be merged should have the" + " same target.") + + # {{{ check for callable collision + + for i, prg_i in enumerate(translation_units): + for prg_j in translation_units[i+1:]: + for clbl_name in (set(prg_i.callables_table) + & set(prg_j.callables_table)): + if (prg_i.callables_table[clbl_name] + != prg_j.callables_table[clbl_name]): + # FIXME: generate unique names + rename for the colliding + # callables + raise NotImplementedError("Translation units to be merged" + " must have different callable names" + " for now.") + + # }}} + callables_table = {} for trans_unit in translation_units: callables_table.update(trans_unit.callables_table.copy()) - # {{{ - - if len(callables_table) != sum(len(trans_unit.callables_table) for trans_unit in - translation_units) and collision_not_ok: - raise LoopyError("translation units in merge() cannot" - " not contain callables with same names.") - - # }}} - return Program( entrypoints=frozenset().union(*( t.entrypoints or frozenset() for t in translation_units)), -- GitLab From c57368ab075e02f596accfc3c5fdb85152a7df3d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Nov 2020 08:40:36 -0600 Subject: [PATCH 711/916] implement rename_callable --- loopy/__init__.py | 4 +-- loopy/transform/callable.py | 57 +++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index 0434f37ce..d621f0591 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -116,7 +116,7 @@ from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier from loopy.transform.callable import (register_callable, - merge, inline_callable_kernel) + merge, inline_callable_kernel, rename_callable) from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call # }}} @@ -238,7 +238,7 @@ __all__ = [ "register_callable", "merge", - "inline_callable_kernel", + "inline_callable_kernel", "rename_callable", "pack_and_unpack_args_for_call", diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index a0e7fc48f..917c0b08b 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -22,6 +22,8 @@ THE SOFTWARE. 
import islpy as isl +from pytools import UniqueNameGenerator + from loopy.kernel import LoopKernel from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, @@ -593,4 +595,59 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name): # }}} +def rename_callable(program, old_name, new_name=None, existing_ok=False): + """ + :arg program: An instance of :class:`loopy.Program` + :arg old_name: The callable to be renamed + :arg new_name: New name for the callable to be renamed + :arg existing_ok: An instance of :class:`bool` + """ + from loopy.symbolic import ( + RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext) + from pymbolic import var + + assert isinstance(program, Program) + assert isinstance(old_name, str) + + if (new_name in program.callables_table) and not existing_ok: + raise LoopyError(f"callables named '{new_name}' already exists") + + if new_name is None: + namegen = UniqueNameGenerator(program.callables_table.keys()) + new_name = namegen(old_name) + + assert isinstance(new_name, str) + + new_callables_table = {} + + for name, clbl in program.callables_table.items(): + if name == old_name: + name = new_name + + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator()) + smap = RuleAwareSubstitutionMapper(rule_mapping_context, + {var(old_name): var(new_name)}.get, + within=lambda *args: True) + knl = rule_mapping_context.finish_kernel(smap.map_kernel(knl)) + clbl = clbl.copy(subkernel=knl.copy(name=name)) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError(f"{type(clbl)}") + + new_callables_table[name] = clbl + + new_entrypoints = program.entrypoints.copy() + if old_name in new_entrypoints: + new_entrypoints = ((new_entrypoints | frozenset([new_name])) + - frozenset([old_name])) + + return program.copy(callables_table=new_callables_table, + entrypoints=new_entrypoints) + + # vim: foldmethod=marker -- GitLab From 24b88a4049feb8b71087fffa2a5597fe75423aca Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Nov 2020 08:41:00 -0600 Subject: [PATCH 712/916] entrypoints should always be a frozenset --- loopy/program.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 32e240ac5..c3caba8f7 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -160,13 +160,12 @@ class Program(ImmutableRecord): immutable, any modifications should be done through :method:`copy`. .. automethod:: __init__ - .. automethod:: with_root_kernel - .. method:: __getitem__(name) + .. method:: __getitem__ Look up the resolved callable with identifier *name*. 
""" def __init__(self, - entrypoints=None, + entrypoints=frozenset(), callables_table={}, target=None, func_id_to_in_knl_callable_mappers=[]): @@ -174,6 +173,7 @@ class Program(ImmutableRecord): # {{{ sanity checks assert isinstance(callables_table, dict) + assert isinstance(entrypoints, frozenset) # }}} @@ -283,9 +283,6 @@ class Program(ImmutableRecord): def __call__(self, *args, **kwargs): entrypoint = kwargs.get("entrypoint", None) - if self.entrypoints is None: - raise LoopyError("Cannot execute program with no entrypoints.") - if entrypoint is None: # did not receive an entrypoint for the program to execute if len(self.entrypoints) == 1: @@ -767,7 +764,6 @@ def resolve_callables(program): elif isinstance(clbl, ScalarCallable): # nothing to resolve within a scalar callable callables_table[clbl_name] = clbl - pass else: raise NotImplementedError(f"{type(clbl)}") -- GitLab From 7a099e4fa20cf3df8937869d118fe01f1807e20e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Nov 2020 09:41:56 -0600 Subject: [PATCH 713/916] update atomicity only for assignment type instructions --- loopy/transform/callable.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 917c0b08b..a5596efd4 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -321,11 +321,10 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): if insn.id in heads: depends_on = depends_on | {noop_start.id} - new_atomicity = tuple( - type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) - for atomicity in insn.atomicity) - if isinstance(insn, Assignment): + new_atomicity = tuple( + type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) + for atomicity in insn.atomicity) insn = insn.copy( id=insn_id[insn.id], within_inames=within_inames, -- GitLab From e3ec03c4df55ca22ad6dc3616f3e1cee79d204c9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Nov 2020 09:42:28 -0600 Subject: [PATCH 714/916] inline callees with gbarriers --- loopy/preprocess.py | 31 +++++++++++++++++++++++++++++++ test/test_callables.py | 27 +++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 161a913ed..39a551d2e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2244,6 +2244,32 @@ def infer_arg_descr(program): # }}} +# {{{ inline_kernels_with_gbarriers + + +def inline_kernels_with_gbarriers(program): + from loopy.kernel.instruction import BarrierInstruction + from loopy.transform.callable import inline_callable_kernel + + def has_gbarrier(knl): + return any((isinstance(insn, BarrierInstruction) + and insn.synchronization_kind == "global") + for insn in knl.instructions) + + callees_to_inline = [name for name, knl_clbl in program.callables_table.items() + if (isinstance(knl_clbl, CallableKernel) + and has_gbarrier(knl_clbl.subkernel))] + + for callee_to_inline in callees_to_inline: + print(f"inlining {callee_to_inline}") + program = inline_callable_kernel(program, callee_to_inline) + + return program + + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2403,6 +2429,11 @@ def preprocess_program(program, device=None): # infer arg descrs of the callables program = infer_arg_descr(program) + # Ordering restriction: + # callees with gbarrier in them must be inlined after inferrring arg_descr. 
+ # inline_kernels_with_gbarriers does not recursively inline the callees. + program = inline_kernels_with_gbarriers(program) + return program diff --git a/test/test_callables.py b/test/test_callables.py index 6a66d59b3..c073fdecc 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -631,6 +631,33 @@ def test_incomplete_entrypoint_raises_type_inf_failure(): lp.generate_code_v2(prog) +def test_callees_with_gbarriers_are_inlined(ctx_factory): + queue = cl.CommandQueue(ctx_factory()) + + ones_and_zeros = lp.make_function( + "{[i, j]: 0<=i<6 and 0<=j<3}", + """ + x[i] = 0.0f + ...gbarrier + x[j] = 1.0f + """, + seq_dependencies=True, + name="ones_and_zeros") + + prg = lp.make_kernel( + "{ : }", + """ + y[:] = ones_and_zeros() + """, [lp.GlobalArg("y", shape=6, dtype=lp.auto)]) + + prg = lp.merge([prg, ones_and_zeros]) + evt, (out,) = prg(queue) + + expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32) + + assert (expected_out == out.get()).all() + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From cf259579b0563b62e3fd418193e0f7b8dbf5f440 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Nov 2020 10:12:34 -0600 Subject: [PATCH 715/916] support for inlining with args accessed through indirection --- loopy/isl_helpers.py | 13 +++++++++---- test/test_callables.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 59748e01b..d6aaafa9f 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -462,11 +462,16 @@ def boxify(cache_manager, domain, box_inames, context): def simplify_via_aff(expr): - from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies + from loopy.symbolic import aff_to_expr, guarded_aff_from_expr, get_dependencies + from loopy.diagnostic import ExpressionToAffineConversionError + deps = get_dependencies(expr) - return aff_to_expr(aff_from_expr( - isl.Space.create_from_names(isl.DEFAULT_CONTEXT, list(deps)), - expr)) + try: + return aff_to_expr(guarded_aff_from_expr( + isl.Space.create_from_names(isl.DEFAULT_CONTEXT, list(deps)), + expr)) + except ExpressionToAffineConversionError: + return expr def project_out(set, inames): diff --git a/test/test_callables.py b/test/test_callables.py index c073fdecc..a73a8a6c3 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -658,6 +658,37 @@ def test_callees_with_gbarriers_are_inlined(ctx_factory): assert (expected_out == out.get()).all() +def test_inlining_with_indirections(ctx_factory): + queue = cl.CommandQueue(ctx_factory()) + + ones_and_zeros = lp.make_function( + "{[i, j]: 0<=i<6 and 0<=j<3}", + """ + x[i] = 0.0f + ...gbarrier + x[map[j]] = 1.0f + """, + seq_dependencies=True, + name="ones_and_zeros") + + prg = lp.make_kernel( + "{ : }", + """ + y[:] = ones_and_zeros(map[:]) + """, [lp.GlobalArg("y", shape=6, dtype=lp.auto), + lp.GlobalArg("map", dtype=np.int32, shape=3)]) + + prg = lp.merge([prg, ones_and_zeros]) + prg = lp.inline_callable_kernel(prg, "ones_and_zeros") + + map_in = np.arange(3).astype(np.int32) + + evt, (out, ) = prg(queue, map=map_in) + + expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32) + assert (expected_out == out).all() + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 53cb14880b2f32a5909d6ade4cd77852525bcc42 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Nov 2020 13:17:54 -0600 Subject: [PATCH 716/916] map insn no_sync_with's during inlining --- 
loopy/transform/callable.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index a5596efd4..006bf9b6d 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -318,6 +318,9 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): within_inames = within_inames | instruction.within_inames depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( instruction.depends_on) + no_sync_with = frozenset((insn_id[id], scope) + for id, scope in insn.no_sync_with) + if insn.id in heads: depends_on = depends_on | {noop_start.id} @@ -332,7 +335,8 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): priority=instruction.priority, depends_on=depends_on, tags=insn.tags | instruction.tags, - atomicity=new_atomicity + atomicity=new_atomicity, + no_sync_with=no_sync_with ) else: insn = insn.copy( @@ -342,6 +346,7 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): priority=instruction.priority, depends_on=depends_on, tags=insn.tags | instruction.tags, + no_sync_with=no_sync_with ) inner_insns.append(insn) -- GitLab From 479d8edc14f64fba1f35a284c3e300ebeb55a5cf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 17 Nov 2020 13:31:40 -0600 Subject: [PATCH 717/916] gets rid of spurious print statement --- loopy/preprocess.py | 1 - loopy/transform/callable.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 39a551d2e..2e1a56bc3 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2261,7 +2261,6 @@ def inline_kernels_with_gbarriers(program): and has_gbarrier(knl_clbl.subkernel))] for callee_to_inline in callees_to_inline: - print(f"inlining {callee_to_inline}") program = inline_callable_kernel(program, callee_to_inline) return program diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 006bf9b6d..da3b107e1 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -256,8 +256,6 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): for i, assignee in enumerate(assignees): arg_map[pos_to_kw[-i-1]] = assignee - print(arg_map) - # }}} # {{{ rewrite instructions -- GitLab From b2986dd1912054f0cc592376f22cdb6de9e29412 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 26 Nov 2020 17:15:13 -0600 Subject: [PATCH 718/916] make type inference functional again --- loopy/type_inference.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index ddfc5e746..0047b9d59 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1119,6 +1119,7 @@ def infer_unknown_types(program, expect_completion=False): for e in program.entrypoints: # FIXME: Need to add docs which say that we need not add the current # callable to the clbl_inf_ctx while writing the "with_types" + logger.debug(f"Entering entrypoint: {e}") arg_id_to_dtype = {arg.name: arg.dtype for arg in program[e].args if arg.dtype not in (None, auto)} new_callable, clbl_inf_ctx = program.callables_table[e].with_types( @@ -1126,6 +1127,23 @@ def infer_unknown_types(program, expect_completion=False): clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) renamed_entrypoints.add(new_name.name) + if expect_completion: + from loopy.types import LoopyType + args_not_inferred = {arg.name + for arg in program[e].args + if not isinstance(arg.dtype, LoopyType)} + + tvs_not_inferred = {tv.name + for 
tv in program[e].temporary_variables.values() + if not isinstance(tv.dtype, LoopyType)} + + vars_not_inferred = tvs_not_inferred | args_not_inferred + + if vars_not_inferred: + if expect_completion: + raise LoopyError("could not determine type of" + f" '{vars_not_inferred.pop()}' of kernel '{e}'.") + return clbl_inf_ctx.finish_program(program, renamed_entrypoints) # }}} -- GitLab From 00683467a623b49451971f98ba92c1d883997483 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 26 Nov 2020 17:35:47 -0600 Subject: [PATCH 719/916] type inference: LoopKernel level type inference is always invoked with *do not expect completion* --- loopy/type_inference.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 0047b9d59..422404411 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -863,8 +863,7 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx, - expect_completion=False): +def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx): """Infer types on temporaries and arguments.""" logger.debug("%s: infer types" % kernel.name) @@ -1000,14 +999,10 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx, " (need type of '%s'--check for missing arguments)" % ", ".join(symbols_with_unavailable_types)) - if expect_completion: - raise LoopyError( - "could not determine type of '%s'%s" - % (item.name, advice)) - - else: - # We're done here. - break + debug("could not determine type of '%s'%s" + % (item.name, advice)) + # We're done here + break # remember that this item failed failed_names.add(item.name) @@ -1015,7 +1010,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx, if set(queue) == failed_names: # We did what we could... 
print(queue, failed_names, item.name) - assert not expect_completion break # can't infer type yet, put back into queue @@ -1129,12 +1123,14 @@ def infer_unknown_types(program, expect_completion=False): if expect_completion: from loopy.types import LoopyType + new_knl = new_callable.subkernel + args_not_inferred = {arg.name - for arg in program[e].args + for arg in new_knl.args if not isinstance(arg.dtype, LoopyType)} tvs_not_inferred = {tv.name - for tv in program[e].temporary_variables.values() + for tv in new_knl.temporary_variables.values() if not isinstance(tv.dtype, LoopyType)} vars_not_inferred = tvs_not_inferred | args_not_inferred -- GitLab From 6442d675ae4eb05b0173de869e0d2c70f414ca5a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 5 Dec 2020 15:33:47 -0600 Subject: [PATCH 720/916] cache the codegen_result of a program --- loopy/codegen/__init__.py | 40 +++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index ce23db29b..11eb3cdc0 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -461,21 +461,6 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") - # {{{ cache retrieval - - from loopy import CACHING_ENABLED - - if CACHING_ENABLED: - input_kernel = kernel - try: - result = code_gen_cache[input_kernel] - logger.debug("%s: code generation cache hit" % kernel.name) - return result - except KeyError: - pass - - # }}} - from loopy.check import pre_codegen_checks pre_codegen_checks(kernel, callables_table) @@ -590,13 +575,13 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, codegen_plog.done() - if CACHING_ENABLED: - code_gen_cache.store_if_not_present(input_kernel, codegen_result) - return codegen_result def diverge_callee_entrypoints(program): + """ + If a kernel is both an entrypoint and a callee, then rename the callee. 
+ """ from loopy.program import _get_callable_ids from pytools import UniqueNameGenerator callable_ids = _get_callable_ids(program.callables_table, @@ -641,6 +626,22 @@ def generate_code_v2(program): from loopy.program import make_program from loopy.codegen.result import CodeGenerationResult + # {{{ cache retrieval + + from loopy import CACHING_ENABLED + + if CACHING_ENABLED: + input_program = program + try: + result = code_gen_cache[input_program] + logger.debug(f"Program with entrypoints {program.entrypoints}:" + " code generation cache hit") + return result + except KeyError: + pass + + # }}} + if isinstance(program, LoopKernel): program = make_program(program) @@ -722,6 +723,9 @@ def generate_code_v2(program): device_preambles=device_preambles, implemented_data_infos=implemented_data_infos) + if CACHING_ENABLED: + code_gen_cache.store_if_not_present(input_program, cgr) + return cgr -- GitLab From 9b650f05839f67cc06f32970c8d2b48032a0fd47 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 5 Dec 2020 15:34:22 -0600 Subject: [PATCH 721/916] hash_fields should be ordered data structures --- loopy/kernel/function_interface.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index a22f5bf34..c6609b167 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -468,7 +468,7 @@ class ScalarCallable(InKernelCallable): fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"} init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") - hash_fields = fields + hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): @@ -633,7 +633,7 @@ class CallableKernel(InKernelCallable): fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"} init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") - hash_fields = fields + hash_fields = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") def __init__(self, subkernel, arg_id_to_dtype=None, arg_id_to_descr=None): @@ -917,8 +917,8 @@ class ManglerCallable(ScalarCallable): "name_in_target"} init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") - hash_fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"} + hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") def __init__(self, name, function_mangler, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None): -- GitLab From 78487fb6b7e16ae7f9ce111fe509bc722e46d284 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 5 Dec 2020 15:34:42 -0600 Subject: [PATCH 722/916] dont clobber program's namespace --- loopy/program.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index c3caba8f7..bf3ce5d73 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -272,14 +272,6 @@ class Program(ImmutableRecord): else: return result - def __getattr__(self, attr): - if self.entrypoints: - if attr in self.entrypoints: - return lambda *args, **kwargs: self(*args, entrypoint=attr, - **kwargs) - - return super().__getattr__(attr) - def __call__(self, *args, **kwargs): entrypoint = kwargs.get("entrypoint", None) -- GitLab From 59e5eefee2c8e157bbb92d0c97ae5577f4a30fde Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 10 Jan 2021 16:06:06 -0600 Subject: [PATCH 723/916] fixes 
lang_version for make_kernel --- loopy/kernel/creation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 6139c9458..ec1d90487 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2204,7 +2204,11 @@ def make_function(domains, instructions, kernel_data=["..."], **kwargs): # This *is* gross. But it seems like the right thing interface-wise. import inspect - caller_globals = inspect.currentframe().f_back.f_globals + if inspect.currentframe().f_back.f_code.co_name == "make_kernel": + # if caller is "make_kernel", read globals from make_kernel's caller + caller_globals = inspect.currentframe().f_back.f_back.f_globals + else: + caller_globals = inspect.currentframe().f_back.f_globals for ver_sym in LANGUAGE_VERSION_SYMBOLS: try: -- GitLab From e4f193fc3eb788f64ce6188216c97ed63b8419e9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jan 2021 20:08:26 -0600 Subject: [PATCH 724/916] cleanup - kernel/__init__.py - kernel/data.py --- loopy/kernel/creation.py | 85 ++++++++++++------------------ loopy/kernel/data.py | 12 ++--- loopy/kernel/function_interface.py | 24 +++------ loopy/target/c/__init__.py | 30 ----------- loopy/transform/instruction.py | 3 ++ 5 files changed, 49 insertions(+), 105 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index ec1d90487..3e682b33a 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -34,8 +34,6 @@ from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, SubstitutionRule, AddressSpace, ValueArg) -from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction, - CallInstruction) from loopy.program import iterate_over_kernels_if_given_program from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl @@ -243,11 +241,13 @@ def parse_insn_options(opt_dict, options_str, assignee_names=None): if arrow_idx >= 0: result["inames_to_dup"] = ( result.get("inames_to_dup", []) - + [(value[:arrow_idx], value[arrow_idx+2:])]) + + + [(value[:arrow_idx], value[arrow_idx+2:])]) else: result["inames_to_dup"] = ( result.get("inames_to_dup", []) - + [(value, None)]) + + + [(value, None)]) elif opt_key == "dep" and opt_value is not None: if opt_value.startswith("*"): @@ -1657,7 +1657,7 @@ def _is_wildcard(s): def _resolve_dependencies(what, knl, insn, deps): - from loopy.transform.instruction import find_instructions_in_single_kernel + from loopy.transform.instruction import find_instructions from loopy.match import MatchExpressionBase new_deps = [] @@ -1666,7 +1666,7 @@ def _resolve_dependencies(what, knl, insn, deps): found_any = False if isinstance(dep, MatchExpressionBase): - for new_dep in find_instructions_in_single_kernel(knl, dep): + for new_dep in find_instructions(knl, dep): if new_dep.id != insn.id: new_deps.append(new_dep.id) found_any = True @@ -1822,13 +1822,12 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # {{{ slice to sub array ref -def get_slice_params(slice, dimension_length): +def normalize_slice_params(slice, dimension_length): """ - Returns the slice parameters across an axes spanning *domain_length* as a - tuple of ``(start, stop, step)``. + Returns the normalized slice parameters ``(start, stop, step)``. :arg slice: An instance of :class:`pymbolic.primitives.Slice`. - :arg dimension_length: The axes length swept by *slice*. + :arg dimension_length: Length of the axis being sliced. 
""" from pymbolic.primitives import Slice assert isinstance(slice, Slice) @@ -1881,17 +1880,10 @@ class SliceToInameReplacer(IdentityMapper): the ``iname`` by the corresponding slice notation its intended to replace. """ - def __init__(self, knl, var_name_gen): - self.var_name_gen = var_name_gen - self.knl = knl - - # caching to map equivalent slices to equivalent SubArrayRefs - self.cache = {} - + def __init__(self, knl): self.subarray_ref_bounds = [] - - def clear_cache(self): - self.cache = {} + self.knl = knl + self.var_name_gen = knl.get_var_name_generator() def map_subscript(self, expr): if expr in self.cache: @@ -1918,7 +1910,7 @@ class SliceToInameReplacer(IdentityMapper): expr.aggregate.name)) domain_length = shape[i] - start, stop, step = get_slice_params( + start, stop, step = normalize_slice_params( index, domain_length) subscript_iname_bounds[unique_var_name] = (start, stop, step) @@ -1940,7 +1932,12 @@ class SliceToInameReplacer(IdentityMapper): return result def map_call(self, expr): - def _convert_array_to_slices(arg): + from pymbolic.primitives import CallWithKwargs + new_expr = self.rec(CallWithKwargs(expr.function, expr.parameters, {})) + return Call(new_expr.function, new_expr.parameters) + + def map_call_with_kwargs(self, expr): + def _convert_array_to_slices(knl, arg): # FIXME: We do not support something like A[1] should point to the # second row if 'A' is 3 x 3 array. if isinstance(arg, Variable): @@ -1949,6 +1946,8 @@ class SliceToInameReplacer(IdentityMapper): if self.knl.temporary_variables[arg.name].shape in [ auto, None]: # do not convert arrays with unknown shapes to slices. + # (If an array of unknown shape was passed in error, with be + # caught and raised during preprocessing). array_arg_shape = () else: array_arg_shape = ( @@ -1963,15 +1962,15 @@ class SliceToInameReplacer(IdentityMapper): array_arg_shape = () if array_arg_shape != (): - return Subscript(arg, tuple(Slice(()) for _ in - array_arg_shape)) + return Subscript(arg, tuple(Slice(()) + for _ in array_arg_shape)) return arg return Call(expr.function, - tuple(self.rec(_convert_array_to_slices(par)) for par in - expr.parameters)) - - # FIXME: Missing map_call_with_kwargs + tuple(self.rec(_convert_array_to_slices(par)) + for par in expr.parameters), + {kw: self.rec(_convert_array_to_slices(par)) + for kw, par in expr.kw_parameters.items()}) def get_iname_domain_as_isl_set(self): """ @@ -1983,12 +1982,10 @@ class SliceToInameReplacer(IdentityMapper): ctx = self.knl.isl_context space = isl.Space.create_from_names(ctx, set=list(sar_bounds.keys())) - from loopy.symbolic import DependencyMapper + from loopy.symbolic import get_dependencies args_as_params_for_domains = set() - for _, (start, stop, step) in sar_bounds.items(): - args_as_params_for_domains |= DependencyMapper()(start) - args_as_params_for_domains |= DependencyMapper()(stop) - args_as_params_for_domains |= DependencyMapper()(step) + for slice_ in sar_bounds.values(): + args_as_params_for_domains |= get_dependencies(slice_) space = space.add_dims(dim_type.param, len(args_as_params_for_domains)) for i, arg in enumerate(args_as_params_for_domains): @@ -2010,25 +2007,9 @@ def realize_slices_array_inputs_as_sub_array_refs(kernel): Returns a kernel with the instances of :class:`pymbolic.primitives.Slice` encountered in expressions replaced as `loopy.symbolic.SubArrayRef`. 
""" - unique_var_name_generator = kernel.get_var_name_generator() - slice_replacer = SliceToInameReplacer(kernel, unique_var_name_generator) - new_insns = [] - - for insn in kernel.instructions: - if isinstance(insn, CallInstruction): - new_expr = slice_replacer(insn.expression) - new_assignees = tuple(slice_replacer(assignee) for assignee in - insn.assignees) - new_insns.append(insn.copy(assignees=new_assignees, - expression=new_expr)) - elif isinstance(insn, (CInstruction, MultiAssignmentBase, - _DataObliviousInstruction)): - new_insns.append(insn) - else: - raise NotImplementedError("Unknown type of instruction -- %s" % - type(insn)) - - slice_replacer.clear_cache() + slice_replacer = SliceToInameReplacer(kernel) + new_insns = [insn.with_transformed_expressions(slice_replacer) + for insn in kernel.instructions] return kernel.copy( domains=( diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 4fe22a480..ece606ddc 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -375,15 +375,15 @@ class ArrayArg(ArrayBase, KernelArgument): .. attribute:: is_output - An instance of :class:`bool`. If set to *True*, the argument is used - to return information to the caller. If set to *False*, then the - callee should not write the array during execution. + An instance of :class:`bool`. If set to *True*, the array is used to + return information to the caller. If set to *False*, the callee does not + write to the array during a call. .. attribute:: is_input - An instance of :class:`bool`. If set to *True*, expected to be - provided by the caller. If *False* then the callee should not depend - on the state of the array on entry to a function. + An instance of :class:`bool`. If set to *True*, expected to be provided + by the caller. If *False*, the callee does not depend on the array + at kernel entry. """) allowed_extra_kwargs = [ diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index c6609b167..1120dd2bb 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -60,10 +60,10 @@ class ValueArgDescriptor(ImmutableRecord): class ArrayArgDescriptor(ImmutableRecord): """ - Records information about an array argument to an in-kernel callable, to be + Records information about an array argument to an in-kernel callable. To be passed to and returned from - :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used - for matching shape and scope of caller and callee kernels. + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used for + matching shape and address space of caller and callee kernels. ..attribute:: shape @@ -101,15 +101,9 @@ class ArrayArgDescriptor(ImmutableRecord): address_space=address_space, dim_tags=dim_tags) - hash_fields = ( - "shape", - "address_space", - "dim_tags") - - def map_expr(self, subst_mapper): - new_shape = tuple(subst_mapper(axis_len) for axis_len in self.shape) - new_dim_tags = tuple(dim_tag.map_expr(subst_mapper) for dim_tag in - self.dim_tags) + def map_expr(self, f): + new_shape = tuple(f(axis_len) for axis_len in self.shape) + new_dim_tags = tuple(dim_tag.map_expr(f) for dim_tag in self.dim_tags) return self.copy(shape=new_shape, dim_tags=new_dim_tags) def depends_on(self): @@ -120,8 +114,6 @@ class ArrayArgDescriptor(ImmutableRecord): self.dim_tags))) return frozenset(var.name for var in result) - # FIXME ArrayArgDescriptor should never need to be persisted, remove - # this method when that is so. 
def update_persistent_hash(self, key_hash, key_builder): for shape_i in self.shape: if shape_i is None: @@ -162,7 +154,7 @@ def get_arg_descriptor_for_expression(kernel, expr): # will not work for non-stride dim tags (e.g. vec or sep). # (AK) FIXME: This will almost always be nonlinear--when does this - # actually help? Maybe the + # actually help? Maybe remove this? # (KK) Reply: This helps in identifying identities like # "2*(i//2) + i%2" := "i" # See the kernel in @@ -179,9 +171,7 @@ def get_arg_descriptor_for_expression(kernel, expr): )(linearized_index) sub_dim_tags = tuple( # Not all swept inames necessarily occur in the expression. - # Also, some may have been simplified away by simplify_using_aff. DimTag(strides_as_dict.get(iname, 0)) - for iname in expr.swept_inames) sub_shape = tuple( pw_aff_to_expr( diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 9e4131565..8babd6fec 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1019,42 +1019,12 @@ class CFamilyASTBuilder(ASTBuilderBase): in_knl_callable.name_in_target == "loopy_make_tuple"): return self.emit_tuple_assignment(codegen_state, insn) -<<<<<<< HEAD # takes "is_returned" to infer whether insn.assignees[0] is a part of # LHS. in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( insn=insn, target=self.target, expression_to_code_mapper=ecm) -======= - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes, - mangle_result.result_dtypes)) - - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) 
- var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) ->>>>>>> origin/master if is_returned: from cgen import Assign diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index c84c1b9c6..3ebcc3bc4 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -36,6 +36,9 @@ def find_instructions_in_single_kernel(kernel, insn_match): def find_instructions(program, insn_match): + if isinstance(program, LoopKernel): + return find_instructions_in_single_kernel(program, insn_match) + assert isinstance(program, Program) insns = [] for in_knl_callable in program.callables_table.values(): -- GitLab From 4f622e645fcd5248f8d49065d1d7d9c415482d42 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 12 Jan 2021 11:21:06 -0600 Subject: [PATCH 725/916] minor fixes in the opencl backend --- loopy/kernel/creation.py | 17 +++++------------ loopy/target/c/codegen/expression.py | 12 +++++++++++- loopy/target/opencl.py | 16 ++++++++++++++-- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 3e682b33a..3388306dc 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1886,9 +1886,6 @@ class SliceToInameReplacer(IdentityMapper): self.var_name_gen = knl.get_var_name_generator() def map_subscript(self, expr): - if expr in self.cache: - return self.cache[expr] - subscript_iname_bounds = {} self.subarray_ref_bounds.append(subscript_iname_bounds) @@ -1910,12 +1907,9 @@ class SliceToInameReplacer(IdentityMapper): expr.aggregate.name)) domain_length = shape[i] - start, stop, step = normalize_slice_params( - index, domain_length) + start, stop, step = normalize_slice_params(index, domain_length) subscript_iname_bounds[unique_var_name] = (start, stop, step) - new_index.append(start+step*Variable(unique_var_name)) - swept_inames.append(Variable(unique_var_name)) else: new_index.append(index) @@ -1925,9 +1919,7 @@ class SliceToInameReplacer(IdentityMapper): self.rec(expr.aggregate), self.rec(tuple(new_index)))) else: - result = IdentityMapper.map_subscript(self, expr) - - self.cache[expr] = result + result = super().map_subscript(expr) return result @@ -1937,7 +1929,7 @@ class SliceToInameReplacer(IdentityMapper): return Call(new_expr.function, new_expr.parameters) def map_call_with_kwargs(self, expr): - def _convert_array_to_slices(knl, arg): + def _convert_array_to_slices(arg): # FIXME: We do not support something like A[1] should point to the # second row if 'A' is 3 x 3 array. 
if isinstance(arg, Variable): @@ -1966,7 +1958,8 @@ class SliceToInameReplacer(IdentityMapper): for _ in array_arg_shape)) return arg - return Call(expr.function, + from pymbolic.primitives import CallWithKwargs + return CallWithKwargs(expr.function, tuple(self.rec(_convert_array_to_slices(par)) for par in expr.parameters), {kw: self.rec(_convert_array_to_slices(par)) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 06bd93a95..23f6e92f3 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -663,7 +663,17 @@ class ExpressionToCExpressionMapper(IdentityMapper): return var(func_name)(self.rec(expr.base, type_context), self.rec(expr.exponent, type_context)) else: - return self.rec(var("pow")(expr.base, expr.exponent), type_context) + from loopy.codegen import SeenFunction + clbl = self.codegen_state.ast_builder.known_callables["pow"] + clbl = clbl.with_types({0: tgt_dtype, 1: exponent_dtype}, + self.kernel, self.codegen_state.callables_table)[0] + self.codegen_state.seen_functions.add( + SeenFunction( + clbl.name, clbl.name_in_target, + (base_dtype, exponent_dtype), + (tgt_dtype,))) + return var(clbl.name_in_target)(self.rec(expr.base, type_context), + self.rec(expr.exponent, type_context)) if not self.allow_complex: return base_impl(expr, type_context) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 90d8eb25d..22fa78a55 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -236,12 +236,12 @@ class OpenCLCallable(ScalarCallable): else: raise LoopyTypeError(f"'pow' does not support type {dtype}.") - result_dtype = NumpyType(dtype) + result_dtype = NumpyType(common_dtype) return ( self.copy(name_in_target=name, arg_id_to_dtype={-1: result_dtype, - 0: dtype, 1: dtype}), + 0: common_dtype, 1: common_dtype}), callables_table) if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: @@ -307,6 +307,18 @@ class OpenCLCallable(ScalarCallable): self.copy(arg_id_to_dtype=arg_id_to_dtype), callables_table) + +def get_opencl_callables(): + """ + Returns an instance of :class:`InKernelCallable` if the function defined by + *identifier* is known in OpenCL. 
+ """ + opencl_function_ids = {"max", "min", "dot", "pow"} | set( + _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) + + return {id_: OpenCLCallable(name=id_) for id_ in + opencl_function_ids} + # }}} -- GitLab From ae1263325a959befb8b8de055f201ca25d2585fc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 14 Jan 2021 15:56:52 -0600 Subject: [PATCH 726/916] makes loopy.Program hashable --- loopy/program.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/loopy/program.py b/loopy/program.py index bf3ce5d73..1b45a3518 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -315,6 +315,13 @@ class Program(ImmutableRecord): self._program_executor_cache = {} + def __hash__(self): + from loopy.tools import LoopyKeyBuilder + from pytools.persistent_dict import new_hash + key_hash = new_hash() + self.update_persistent_hash(key_hash, LoopyKeyBuilder()) + return hash(key_hash.digest()) + # }}} -- GitLab From decfdbd8c79d3947f734d6d8c2c2723be52b7cb7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 15 Jan 2021 23:59:39 -0600 Subject: [PATCH 727/916] multiple minor fixes - prepare_for_caching before checking for codegen cache hit - corrects default value of ArrayArg.is_output - removes unnecessary infer_root_kernel --- loopy/__init__.py | 4 ++- loopy/codegen/__init__.py | 4 ++- loopy/kernel/data.py | 2 +- loopy/kernel/tools.py | 61 --------------------------------------- loopy/preprocess.py | 10 +++---- 5 files changed, 11 insertions(+), 70 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index d621f0591..ba013365c 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -123,7 +123,7 @@ from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call from loopy.type_inference import infer_unknown_types from loopy.preprocess import (preprocess_kernel, realize_reduction, - preprocess_program) + preprocess_program, infer_arg_descr) from loopy.schedule import ( generate_loop_schedules, get_one_scheduled_kernel, get_one_linearized_kernel) from loopy.statistics import (ToCountMap, ToCountPolynomialMap, CountGranularity, @@ -258,6 +258,8 @@ __all__ = [ "infer_unknown_types", "preprocess_kernel", "realize_reduction", "preprocess_program", + "infer_arg_descr", + "generate_loop_schedules", "get_one_scheduled_kernel", "get_one_linearized_kernel", "GeneratedProgram", "CodeGenerationResult", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 784e8412a..7d3df545d 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -627,6 +627,7 @@ def generate_code_v2(program): :param program: An instance of :class:`loopy.Program`. 
""" + from loopy.kernel import LoopKernel from loopy.program import make_program from loopy.codegen.result import CodeGenerationResult @@ -634,9 +635,10 @@ def generate_code_v2(program): # {{{ cache retrieval from loopy import CACHING_ENABLED + from loopy.preprocess import prepare_for_caching if CACHING_ENABLED: - input_program = program + input_program = prepare_for_caching(program) try: result = code_gen_cache[input_program] logger.debug(f"Program with entrypoints {program.entrypoints}:" diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index ece606ddc..d176488b6 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -396,7 +396,7 @@ class ArrayArg(ArrayBase, KernelArgument): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - is_output_only = kwargs.pop("is_output_only", False) + is_output_only = kwargs.pop("is_output_only", None) if is_output_only is not None: warn("'is_output_only' is deprecated. Use 'is_output', 'is_input'" " instead.", DeprecationWarning, stacklevel=2) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 4acadcfe0..7f5979a04 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -32,13 +32,9 @@ import islpy as isl from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg, natsorted -from loopy.symbolic import CombineMapper from loopy.kernel import LoopKernel from loopy.program import Program, iterate_over_kernels_if_given_program from loopy.kernel.function_interface import CallableKernel -from loopy.kernel.instruction import (MultiAssignmentBase, - _DataObliviousInstruction) -from functools import reduce import logging logger = logging.getLogger(__name__) @@ -1982,61 +1978,4 @@ def infer_args_are_input_output(kernel): # }}} - -# {{{ identify_root_kernel - -class CallCollector(CombineMapper): - def combine(self, values): - import operator - return reduce(operator.or_, values, frozenset()) - - def map_call(self, expr): - from pymbolic.primitives import CallWithKwargs - return self.rec(CallWithKwargs( - function=expr.function, parameters=expr.parameters, - kw_parameters={})) - - def map_call_with_kwargs(self, expr): - return (frozenset([expr.function.name]) | - self.combine(self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values()))) - - def map_constant(self, expr): - return frozenset() - - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - map_type_cast = map_constant - - -def identify_root_kernel(kernels): - assert isinstance(kernels, list) - assert all(isinstance(knl, LoopKernel) for knl in kernels) - call_collector = CallCollector() - - def _calls_in_a_kernel(knl): - calls = set() - for insn in knl.instructions: - if isinstance(insn, MultiAssignmentBase): - calls = calls | call_collector(insn.expression) - elif isinstance(insn, _DataObliviousInstruction): - pass - else: - raise NotImplementedError() - - return calls - - all_calls = frozenset().union(*[_calls_in_a_kernel(knl) for knl in - kernels]) - - kernel_names = frozenset([knl.name for knl in kernels]) - - assert len(kernel_names - all_calls) == 1 - - root_knl_name, = (kernel_names - all_calls) - return root_knl_name - -# }}} - # vim: foldmethod=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index dd14b0eb4..e377adc28 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2175,7 +2175,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): def 
traverse_to_infer_arg_descr(kernel, callables_table): """ Returns a copy of *kernel* with the argument shapes and strides matching for - scoped functions in the *kernel*. Refer + resolved functions in the *kernel*. Refer :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. .. note:: @@ -2202,22 +2202,20 @@ def infer_arg_descr(program): :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the callables. """ - - from loopy.program import make_clbl_inf_ctx + from loopy.program import make_clbl_inf_ctx, resolve_callables from loopy.kernel.array import ArrayBase from loopy.kernel.function_interface import (ArrayArgDescriptor, ValueArgDescriptor) from loopy import auto, ValueArg + program = resolve_callables(program) + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, program.entrypoints) renamed_entrypoints = set() for e in program.entrypoints: - # FIXME: Need to add docs which say that we need not add the current - # callable to the clbl_inf_ctx while writing the "with_types" - # This is treacherous, we should use traverse... instead. def _tuple_if_int(s): if isinstance(s, int): return s, -- GitLab From 17c2451c55fc8f58a811186506f05501e79c1ac5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 17 Jan 2021 18:03:14 -0600 Subject: [PATCH 728/916] bunch of callables related changes: - simplifies interface to with_types - simplifies interface to with_descr - simplifies the logic within CallableKernel.with_descrs - gets rid of ManglerCallable - introduces InKernelCallable.with_added_arg --- loopy/kernel/function_interface.py | 392 ++++++++++++--------------- loopy/library/function.py | 11 +- loopy/library/random123.py | 19 +- loopy/library/reduction.py | 35 +-- loopy/preprocess.py | 56 +++- loopy/target/c/__init__.py | 47 ++-- loopy/target/c/codegen/expression.py | 19 +- loopy/target/cuda.py | 5 +- loopy/target/opencl.py | 82 +++++- loopy/target/pyopencl.py | 4 +- loopy/target/python.py | 8 - loopy/type_inference.py | 174 ++++-------- test/testlib.py | 17 +- 13 files changed, 419 insertions(+), 450 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 1120dd2bb..9eb707e81 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -20,15 +20,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import islpy as isl from pytools import ImmutableRecord from loopy.diagnostic import LoopyError from loopy.tools import update_persistent_hash from loopy.kernel import LoopKernel -from loopy.kernel.data import ValueArg, ArrayArg, ConstantArg -from loopy.symbolic import (SubstitutionMapper, DependencyMapper) -from pymbolic.primitives import Variable +from loopy.kernel.data import ValueArg, ArrayArg +from loopy.symbolic import DependencyMapper, WalkMapper __doc__ = """ @@ -39,7 +37,6 @@ __doc__ = """ .. autoclass:: InKernelCallable .. autoclass:: CallableKernel .. autoclass:: ScalarCallable -.. autoclass:: ManglerCallable """ @@ -77,6 +74,9 @@ class ArrayArgDescriptor(ImmutableRecord): A tuple of instances of :class:`loopy.kernel.array.ArrayDimImplementationTag` + + .. automethod:: map_expr + .. automethod:: depends_on """ fields = {"shape", "address_space", "dim_tags"} @@ -102,11 +102,19 @@ class ArrayArgDescriptor(ImmutableRecord): dim_tags=dim_tags) def map_expr(self, f): + """ + Returns an instance of :class:`ArrayArgDescriptor` with its shapes, strides, + mapped by *f*. 
+        """
         new_shape = tuple(f(axis_len) for axis_len in self.shape)
         new_dim_tags = tuple(dim_tag.map_expr(f) for dim_tag in self.dim_tags)
         return self.copy(shape=new_shape, dim_tags=new_dim_tags)
 
     def depends_on(self):
+        """
+        Returns a :class:`frozenset` of all the variable names the
+        :class:`ArrayArgDescriptor` depends on.
+        """
         from loopy.kernel.data import auto
         result = DependencyMapper(composite_leaves=False)([lngth for lngth in
             self.shape if lngth not in [None, auto]]) | (
@@ -124,13 +132,50 @@ class ArrayArgDescriptor(ImmutableRecord):
             key_builder.rec(key_hash, self.dim_tags)
 
 
+class ExpressionIsScalarChecker(WalkMapper):
+    def __init__(self, kernel):
+        self.kernel = kernel
+
+    def map_sub_array_ref(self, expr):
+        raise LoopyError("Sub-array refs can only be used as a call's parameters"
+                f" or assignees. '{expr}' violates this.")
+
+    def map_call(self, expr):
+        for child in expr.parameters:
+            self.rec(child)
+
+    def map_call_with_kwargs(self, expr):
+        for child in expr.parameters + tuple(expr.kw_parameters.values()):
+            self.rec(child)
+
+    def map_subscript(self, expr):
+        for child in expr.index_tuple:
+            self.rec(child)
+
+    def map_variable(self, expr):
+        from loopy.kernel.data import TemporaryVariable, ArrayArg
+        if expr.name in self.kernel.all_inames():
+            # inames are scalar
+            return
+
+        var = self.kernel.arg_dict.get(expr.name, None) or (
+            self.kernel.temporary_variables.get(expr.name, None))
+
+        if var is not None:
+            if isinstance(var, (ArrayArg, TemporaryVariable)) and (
+                    var.shape != ()):
+                raise LoopyError("Array regions can only be passed as sub-array refs.")
+
+    def map_slice(self, expr):
+        raise LoopyError("Array regions can only be passed as sub-array refs.")
+
+
 def get_arg_descriptor_for_expression(kernel, expr):
     """
     :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor`
         describing the argument expression *expr* which occurs
         in a call in the code of *kernel*.
     """
-    from pymbolic.primitives import Variable
     from loopy.symbolic import (SubArrayRef, pw_aff_to_expr,
             SweptInameStrideCollector)
     from loopy.kernel.data import TemporaryVariable, ArrayArg
@@ -186,24 +231,8 @@ def get_arg_descriptor_for_expression(kernel, expr):
                     address_space=aspace,
                     dim_tags=sub_dim_tags,
                     shape=sub_shape)
-
-    elif isinstance(expr, Variable):
-        arg = kernel.get_var_descriptor(expr.name)
-        from loopy.kernel.array import ArrayBase
-
-        if isinstance(arg, ValueArg) or (isinstance(arg, ArrayBase)
-                and arg.shape == ()):
-            return ValueArgDescriptor()
-        elif isinstance(arg, (ArrayArg, TemporaryVariable)):
-            raise LoopyError("may not pass entire array "
-                    "'%s' in call statement in kernel '%s'"
-                    % (expr.name, kernel.name))
-        else:
-            raise LoopyError("unsupported argument type "
-                    "'%s' of '%s' in call statement"
-                    % (type(arg).__name__, expr.name))
-
     else:
+        ExpressionIsScalarChecker(kernel)(expr)
         return ValueArgDescriptor()
 
 # }}}
@@ -242,8 +271,8 @@ class GridOverrideForCalleeKernel(ImmutableRecord):
     Helper class to set the
     :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the
     callee kernels. Refer to
-    :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`,
-    :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`.
+    :meth:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`,
+    :meth:`loopy.kernel.function_interface.CallableKernel.with_hw_axes_sizes`.
 
     ..
attribute:: global_size @@ -325,7 +354,7 @@ class InKernelCallable(ImmutableRecord): update_persistent_hash = update_persistent_hash - def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + def with_types(self, arg_id_to_dtype, callables_table): """ :arg arg_id_to_type: a mapping from argument identifiers (integers for positional arguments, names for keyword @@ -345,12 +374,15 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() - def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + def with_descrs(self, arg_id_to_descr, callables_table): """ - :arg arg_id_to_descr: a mapping from argument identifiers - (integers for positional arguments, names for keyword - arguments) to :class:`loopy.ArrayArgDescriptor` instances. - Unspecified/unknown types are not represented in *arg_id_to_descr*. + :arg arg_id_to_descr: a mapping from argument identifiers (integers for + positional arguments, names for keyword arguments) to + :class:`loopy.ArrayArgDescriptor` instances. Unspecified/unknown + descriptors are not represented in *arg_id_to_descr*. + + All the expressions in arg_id_to_descr must have variables that belong + to the callable's namespace. Return values are denoted by negative integers, with the first returned value identified as *-1*. @@ -439,6 +471,13 @@ class InKernelCallable(ImmutableRecord): return hash(tuple(self.fields)) + def with_added_arg(self, arg_dtype, arg_descr): + """ + Registers a new argument to the callable and returns the name of the + argument in the callable's namespace. + """ + raise NotImplementedError() + # }}} @@ -451,8 +490,7 @@ class ScalarCallable(InKernelCallable): .. note:: The :meth:`ScalarCallable.with_types` is intended to assist with type - specialization of the function and is expected to be supplemented in the - derived subclasses. + specialization of the function and sub-classes must define it. """ fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"} @@ -474,16 +512,16 @@ class ScalarCallable(InKernelCallable): return (self.arg_id_to_dtype, self.arg_id_to_descr, self.name_in_target) - def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + def with_types(self, arg_id_to_dtype, callables_table): raise LoopyError("No type inference information present for " "the function %s." % (self.name)) - def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + def with_descrs(self, arg_id_to_descr, callables_table): arg_id_to_descr[-1] = ValueArgDescriptor() return ( self.copy(arg_id_to_descr=arg_id_to_descr), - callables_table, ()) + callables_table) def with_hw_axes_sizes(self, global_size, local_size): return self.copy() @@ -584,9 +622,6 @@ class ScalarCallable(InKernelCallable): # assignee is returned whenever the size of assignees is non zero. first_assignee_is_returned = len(insn.assignees) > 0 - # TODO: Maybe this interface a bit confusing. Should we allow this - # method to directly return a cgen.Assign or cgen.ExpressionStatement? 
- return var(self.name_in_target)(*c_parameters), first_assignee_is_returned def generate_preambles(self, target): @@ -595,6 +630,9 @@ class ScalarCallable(InKernelCallable): # }}} + def with_added_arg(self, arg_dtype, arg_descr): + raise LoopyError("Cannot add args to scalar callables.") + # }}} @@ -645,8 +683,7 @@ class CallableKernel(InKernelCallable): def name(self): return self.subkernel.name - def with_types(self, arg_id_to_dtype, caller_kernel, - callables_table): + def with_types(self, arg_id_to_dtype, callables_table): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) new_args = [] @@ -684,124 +721,116 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=specialized_kernel, arg_id_to_dtype=new_arg_id_to_dtype), callables_table - def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, - expr=None): - # tune the subkernel so that we have the matching shapes and - # dim_tags - - # {{{ map the arg_descrs so that all the variables are from the callees - # perspective - - domain_dependent_vars = frozenset().union( - *(frozenset(dom.get_var_names(isl.dim_type.param)) for dom in - self.subkernel.domains)) - - # FIXME: This is ill-formed, because par can be an expression, e.g. - # 2*i+2 or 2*(i+1). A key feature of expression is that structural - # equality and semantic equality are not the same, so even if the - # SubstitutionMapper allowed non-variables, it would have to solve the - # (considerable) problem of expression equivalence. - - import numbers - substs = {} - assumptions = {} - - if expr: - for arg, par in zip(self.subkernel.args, expr.parameters): - if isinstance(arg, ValueArg) and arg.name in domain_dependent_vars: - if isinstance(par, Variable): - if par in substs: - assumptions[arg.name] = substs[par].name - else: - substs[par] = Variable(arg.name) - elif isinstance(par, numbers.Number): - assumptions[arg.name] = par - - def subst_func(expr): - if expr in substs: - return substs[expr] - else: - return expr - - subst_mapper = SubstitutionMapper(subst_func) - - arg_id_to_descr = {arg_id: descr.map_expr(subst_mapper) - for arg_id, descr in arg_id_to_descr.items()} + def with_descrs(self, arg_id_to_descr, callables_table): - # }}} + # arg_id_to_descr expressions provided are from the caller's namespace, + # need to register - dependents = frozenset().union(*(descr.depends_on() for descr in - arg_id_to_descr.values())) - unknown_deps = dependents - self.subkernel.all_variable_names() + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - if expr is None: - assert unknown_deps == frozenset() - # FIXME: Need to make sure that we make the name of the variables - # unique, and then run a subst_mapper + kw_to_callee_idx = {arg.name: i + for i, arg in enumerate(self.subkernel.args)} new_args = self.subkernel.args[:] - kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) for arg_id, descr in arg_id_to_descr.items(): if isinstance(arg_id, int): arg_id = pos_to_kw[arg_id] - assert isinstance(arg_id, str) + + callee_arg = new_args[kw_to_callee_idx[arg_id]] + + # {{{ checks + + if isinstance(callee_arg, ValueArg) and ( + isinstance(descr, ArrayArgDescriptor)): + raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}' " + "expected to be a scalar, got an array region.") + + if isinstance(callee_arg, ArrayArg) and ( + isinstance(descr, ValueArgDescriptor)): + raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}' " + "expected to be an array, got a scalar.") + + if (isinstance(descr, ArrayArgDescriptor) + and 
isinstance(callee_arg.shape, tuple) + and len(callee_arg.shape) != len(descr.shape)): + raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}'" + " has a dimensionality mismatch, expected " + f"{len(callee_arg.shape)}, got {len(descr.shape)}") + + # }}} if isinstance(descr, ArrayArgDescriptor): - if not isinstance(self.subkernel.arg_dict[arg_id], (ArrayArg, - ConstantArg)): - raise LoopyError("Array passed to scalar argument " - "'%s' of the function '%s' (in '%s')." % ( - arg_id, self.subkernel.name, - caller_kernel.name)) - if self.subkernel.arg_dict[arg_id].shape and ( - len(self.subkernel.arg_dict[arg_id].shape) != - len(descr.shape)): - raise LoopyError("Dimension mismatch for argument " - " '%s' of the function '%s' (in '%s')." % ( - arg_id, self.subkernel.name, - caller_kernel.name)) - - new_arg = self.subkernel.arg_dict[arg_id].copy( - shape=descr.shape, - dim_tags=descr.dim_tags, - address_space=descr.address_space) - # replacing the new arg with the arg of the same name - new_args = [new_arg if arg.name == arg_id else arg for arg in - new_args] - elif isinstance(descr, ValueArgDescriptor): - if not isinstance(self.subkernel.arg_dict[arg_id], ValueArg): - raise LoopyError("Scalar passed to array argument " - "'%s' of the callable '%s' (in '%s')" % ( - arg_id, self.subkernel.name, - caller_kernel.name)) + callee_arg = callee_arg.copy(shape=descr.shape, + dim_tags=descr.dim_tags, + address_space=descr.address_space) else: - raise LoopyError("Descriptor must be either an instance of " - "ArrayArgDescriptor or ValueArgDescriptor -- got %s" % - type(descr)) - - descriptor_specialized_knl = self.subkernel.copy(args=new_args) - # add the variables on which the strides/shapes depend but not provided - # as arguments - args_added_knl = descriptor_specialized_knl.copy( - args=descriptor_specialized_knl.args - + [ValueArg(dep) for dep in unknown_deps]) + # do nothing for a scalar arg. 
+ assert isinstance(descr, ValueArgDescriptor) + + new_args[kw_to_callee_idx[arg_id]] = callee_arg + + subkernel = self.subkernel.copy(args=new_args) + from loopy.preprocess import traverse_to_infer_arg_descr - from loopy.transform.parameter import assume - args_added_knl, callables_table = ( - traverse_to_infer_arg_descr(args_added_knl, + subkernel, callables_table = ( + traverse_to_infer_arg_descr(subkernel, callables_table)) - if assumptions: - assumption_str = " and ".join([f"{key}={val}" - for key, val in assumptions.items()]) - args_added_knl = assume(args_added_knl, assumption_str) + # {{{ update the arg descriptors - return ( - self.copy( - subkernel=args_added_knl, - arg_id_to_descr=arg_id_to_descr), - callables_table, tuple(Variable(dep) for dep in unknown_deps)) + for arg in subkernel.args: + kw = arg.name + if isinstance(arg, ArrayArg): + arg_id_to_descr[kw] = ( + ArrayArgDescriptor(shape=arg.shape, + dim_tags=arg.dim_tags, + address_space=arg.address_space)) + else: + assert isinstance(arg, ValueArg) + arg_id_to_descr[kw] = ValueArgDescriptor() + + arg_id_to_descr[kw_to_pos[kw]] = arg_id_to_descr[kw] + + # }}} + + return (self.copy(subkernel=subkernel, + arg_id_to_descr=arg_id_to_descr), + callables_table) + + def with_added_arg(self, arg_dtype, arg_descr): + var_name = self.subkernel.get_var_name_generator()(based_on="_lpy_arg") + + if isinstance(arg_descr, ValueArgDescriptor): + subknl = self.subkernel.copy( + args=self.subkernel.args+[ + ValueArg(var_name, arg_dtype, self.subkernel.target)]) + + kw_to_pos, pos_to_kw = get_kw_pos_association(subknl) + + if self.arg_id_to_dtype is None: + arg_id_to_dtype = {} + else: + arg_id_to_dtype = self.arg_id_to_dtype.copy() + if self.arg_id_to_descr is None: + arg_id_to_descr = {} + else: + arg_id_to_descr = self.arg_id_to_descr.copy() + + arg_id_to_dtype[var_name] = arg_dtype + arg_id_to_descr[var_name] = arg_descr + arg_id_to_dtype[kw_to_pos[var_name]] = arg_dtype + arg_id_to_descr[kw_to_pos[var_name]] = arg_descr + + return (self.copy(subkernel=subknl, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr), + var_name) + + else: + # don't think this should ever be needed + raise NotImplementedError("with_added_arg not implemented for array" + " types arguments.") def with_packing_for_args(self): from loopy.kernel.data import AddressSpace @@ -892,81 +921,4 @@ class CallableKernel(InKernelCallable): # }}} -# {{{ mangler callable - -class ManglerCallable(ScalarCallable): - """ - A callable whose characteristic is defined by a function mangler. - - .. attribute:: function_mangler - - A function of signature ``(kernel, name , arg_dtypes)`` and returns an - instance of ``loopy.CallMangleInfo``. 
- """ - fields = {"name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target"} - init_arg_names = ("name", "function_mangler", "arg_id_to_dtype", - "arg_id_to_descr", "name_in_target") - hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target") - - def __init__(self, name, function_mangler, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): - - self.function_mangler = function_mangler - - super().__init__( - name=name, - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr, - name_in_target=name_in_target) - - def __getinitargs__(self): - return (self.name, self.function_mangler, self.arg_id_to_dtype, - self.arg_id_to_descr, self.name_in_target) - - def with_types(self, arg_id_to_dtype, kernel, callables_table): - if self.arg_id_to_dtype is not None: - # specializing an already specialized function. - for arg_id, dtype in arg_id_to_dtype.items(): - # only checking for the ones which have been provided - # if does not match, returns an error. - if self.arg_id_to_dtype[arg_id] != arg_id_to_dtype[arg_id]: - raise LoopyError("Overwriting a specialized" - " function is illegal--maybe start with new instance of" - " ManglerCallable?") - - sorted_keys = sorted(arg_id_to_dtype.keys()) - arg_dtypes = tuple(arg_id_to_dtype[key] for key in sorted_keys if - key >= 0) - - mangle_result = self.function_mangler(kernel, self.name, - arg_dtypes) - if mangle_result: - new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes)) - new_arg_id_to_dtype.update({-i-1: dtype for i, dtype in - enumerate(mangle_result.result_dtypes)}) - return ( - self.copy(name_in_target=mangle_result.target_name, - arg_id_to_dtype=new_arg_id_to_dtype), - callables_table) - else: - # The function mangler does not agree with the arg id to dtypes - # provided. Indicating that is illegal. - raise LoopyError("Function %s not coherent with the provided types." % ( - self.name)) - - def mangle_result(self, kernel): - """ - Returns an instance of :class:`loopy.kernel.data.CallMangleInfo` for - the given pair :attr:`function_mangler` and :attr:`arg_id_to_dtype`. - """ - sorted_keys = sorted(self.arg_id_to_dtype.keys()) - arg_dtypes = tuple(self.arg_id_to_dtype[key] for key in sorted_keys if - key >= 0) - - return self.function_mangler(kernel, self.name, arg_dtypes) - -# }}} - # vim: foldmethod=marker diff --git a/loopy/library/function.py b/loopy/library/function.py index bea9a4a70..73241152f 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -22,10 +22,11 @@ THE SOFTWARE. 
from loopy.kernel.function_interface import ScalarCallable from loopy.diagnostic import LoopyError +import numpy as np class MakeTupleCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, callables_table): + def with_types(self, arg_id_to_dtype, callables_table): new_arg_id_to_dtype = arg_id_to_dtype.copy() for i in range(len(arg_id_to_dtype)): if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: @@ -34,22 +35,22 @@ class MakeTupleCallable(ScalarCallable): return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target="loopy_make_tuple"), callables_table) - def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + def with_descrs(self, arg_id_to_descr, callables_table): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = {(id, ValueArgDescriptor()): (-id-1, ValueArgDescriptor()) for id in arg_id_to_descr.keys()} return ( self.copy(arg_id_to_descr=new_arg_id_to_descr), - callables_table, ()) + callables_table) class IndexOfCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, callables_table): + def with_types(self, arg_id_to_dtype, callables_table): new_arg_id_to_dtype = {i: dtype for i, dtype in arg_id_to_dtype.items() if dtype is not None} - new_arg_id_to_dtype[-1] = kernel.index_dtype + new_arg_id_to_dtype[-1] = np.int32 return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), callables_table) diff --git a/loopy/library/random123.py b/loopy/library/random123.py index c2e64fc55..14199b279 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -168,7 +168,18 @@ class Random123Callable(ScalarCallable): Records information about for the random123 functions. """ - def with_types(self, arg_id_to_dtype, kernel, callables_table): + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None, target=None): + + super().__init__( + name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) + + self.target = target + + def with_types(self, arg_id_to_dtype, callables_table): if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): @@ -178,7 +189,7 @@ class Random123Callable(ScalarCallable): callables_table) name = self.name - target = kernel.target + target = self.target rng_variant = FUNC_NAMES_TO_RNG[name] @@ -230,7 +241,7 @@ class Random123Callable(ScalarCallable): return -def get_random123_callables(): - return {id_: Random123Callable(id_) for id_ in FUNC_NAMES_TO_RNG} +def get_random123_callables(target): + return {id_: Random123Callable(id_, target=target) for id_ in FUNC_NAMES_TO_RNG} # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index fa6c0cd89..1d53d06b0 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -53,7 +53,7 @@ class ReductionOperation: equality-comparable. 
""" - def result_dtypes(self, target, *arg_dtypes): + def result_dtypes(self, *arg_dtypes): """ :arg arg_dtypes: may be None if not known :returns: None if not known, otherwise the returned type @@ -112,10 +112,11 @@ class ScalarReductionOperation(ReductionOperation): def arg_count(self): return 1 - def result_dtypes(self, kernel, arg_dtype): + def result_dtypes(self, arg_dtype): if self.forced_result_type is not None: - return (self.parse_result_type( - kernel.target, self.forced_result_type),) + raise NotImplementedError() + # return (self.parse_result_type( + # kernel.target, self.forced_result_type),) if arg_dtype is None: return None @@ -224,7 +225,7 @@ class MaxReductionOperation(ScalarReductionOperation): # type specialize the callable max_scalar_callable, callables_table = max_scalar_callable.with_types( - {0: dtype, 1: dtype}, None, callables_table) + {0: dtype, 1: dtype}, callables_table) # populate callables_table func_id, callables_table = update_table(callables_table, "max", @@ -246,7 +247,7 @@ class MinReductionOperation(ScalarReductionOperation): # type specialize the callable min_scalar_callable, callables_table = min_scalar_callable.with_types( - {0: dtype, 1: dtype}, None, callables_table) + {0: dtype, 1: dtype}, callables_table) # populate callables_table func_id, callables_table = update_table(callables_table, "min", @@ -325,7 +326,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): make_tuple_callable, callables_table = make_tuple_callable.with_types( dict(enumerate([scalar_dtype, segment_flag_dtype])), - None, callables_table) + callables_table) func_id, callables_table = update_table( callables_table, "make_tuple", make_tuple_callable) @@ -333,8 +334,8 @@ class _SegmentedScalarReductionOperation(ReductionOperation): return ResolvedFunction(func_id)(scalar_neutral_element, segment_flag_dtype.numpy_dtype.type(0)), callables_table - def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): - return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) + def result_dtypes(self, scalar_dtype, segment_flag_dtype): + return (self.inner_reduction.result_dtypes(scalar_dtype) + (segment_flag_dtype,)) def __str__(self): @@ -355,7 +356,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): segmented_scalar_callable, callables_table = ( segmented_scalar_callable.with_types( {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, - None, callables_table)) + callables_table)) # populate callables_table from loopy.program import update_table @@ -414,7 +415,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__) - def result_dtypes(self, kernel, scalar_dtype, index_dtype): + def result_dtypes(self, scalar_dtype, index_dtype): return (scalar_dtype, index_dtype) def neutral_element(self, scalar_dtype, index_dtype, callables_table, @@ -430,7 +431,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): make_tuple_callable, callables_table = make_tuple_callable.with_types( dict(enumerate([scalar_dtype, index_dtype])), - None, callables_table) + callables_table) # populate callables_table func_id, callables_table = update_table(callables_table, "make_tuple", @@ -459,7 +460,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): arg_ext_scalar_callable, callables_table = ( arg_ext_scalar_callable.with_types( {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, - None, callables_table)) + callables_table)) # populate callables_table from 
loopy.program import update_table @@ -549,10 +550,10 @@ def parse_reduction_op(name): # {{{ reduction specific callables class ReductionCallable(ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, callables_table): + def with_types(self, arg_id_to_dtype, callables_table): scalar_dtype = arg_id_to_dtype[0] index_dtype = arg_id_to_dtype[1] - result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype, + result_dtypes = self.name.reduction_op.result_dtypes(scalar_dtype, index_dtype) new_arg_id_to_dtype = arg_id_to_dtype.copy() new_arg_id_to_dtype[-1] = result_dtypes[0] @@ -563,13 +564,13 @@ class ReductionCallable(ScalarCallable): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype, name_in_target=name_in_target), callables_table - def with_descrs(self, arg_id_to_descr, caller_kernel, callables_table, expr): + def with_descrs(self, arg_id_to_descr, callables_table): from loopy.kernel.function_interface import ValueArgDescriptor new_arg_id_to_descr = arg_id_to_descr.copy() new_arg_id_to_descr[-1] = ValueArgDescriptor() return ( self.copy(arg_id_to_descr=arg_id_to_descr), - callables_table, ()) + callables_table) def generate_preambles(self, target): if isinstance(self.name, ArgExtOp): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e377adc28..20ed08402 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2084,8 +2084,13 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): self.callables_table = callables_table def map_call(self, expr, expn_state, assignees=None): - from pymbolic.primitives import Call, CallWithKwargs + from pymbolic.primitives import Call, CallWithKwargs, Variable + from loopy.kernel.function_interface import ValueArgDescriptor from loopy.symbolic import ResolvedFunction + from loopy.kernel.array import ArrayBase + from loopy.kernel.data import ValueArg + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper if not isinstance(expr.function, ResolvedFunction): # ignore if the call is not to a ResolvedFunction @@ -2105,13 +2110,45 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): arg_id: get_arg_descriptor_for_expression( self.caller_kernel, arg) for arg_id, arg in arg_id_to_val.items()} + in_knl_callable = self.callables_table[expr.function.name] + + # {{{ translating descriptor expressions to the callable's namespace + + deps_as_params = [] + subst_map = {} + + deps = frozenset().union(*(descr.depends_on() + for descr in arg_id_to_descr.values())) + + assert deps <= self.caller_kernel.all_variable_names() + + for dep in deps: + caller_arg = self.caller_kernel.arg_dict.get(dep, None) + caller_arg = self.caller_kernel.temporary_variables.get(dep, caller_arg) + + if not (isinstance(caller_arg, ValueArg) or (isinstance(caller_arg, + ArrayBase) and arg.shape == ())): + raise NotImplementedError(f"Obtained '{dep}' as a dependency for" + f" call '{expr.function.name}' which is not a scalar.") + + in_knl_callable, callee_name = in_knl_callable.with_added_arg( + caller_arg.dtype, ValueArgDescriptor()) + + subst_map[dep] = Variable(callee_name) + deps_as_params.append(Variable(dep)) + + mapper = SubstitutionMapper(make_subst_func(subst_map)) + arg_id_to_descr = {id_: descr.map_expr(mapper) + for id_, descr in arg_id_to_descr.items()} + + # }}} # specializing the function according to the parameter description - in_knl_callable = self.callables_table[expr.function.name] - new_in_knl_callable, self.callables_table, new_vars = ( + new_in_knl_callable, self.callables_table = ( 
in_knl_callable.with_descrs(
-                    arg_id_to_descr, self.caller_kernel,
-                    self.callables_table, expr))
+                    arg_id_to_descr, self.callables_table))
+
+        # find the deps of the new in-kernel callable and pass them as
+        # additional call arguments
 
         self.callables_table, new_func_id = (
                 self.callables_table.with_callable(
@@ -2122,9 +2159,10 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
             return Call(
                     ResolvedFunction(new_func_id),
                     tuple(self.rec(child, expn_state)
-                        for child in expr.parameters)+new_vars)
+                          for child in expr.parameters)
+                    + tuple(deps_as_params))
         else:
-            # FIXME: Order for vars when kwards are present?
+            # FIXME: Order for vars when kwargs are present?
             assert isinstance(expr, CallWithKwargs)
             return CallWithKwargs(
                     ResolvedFunction(new_func_id),
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 8babd6fec..5fe9e3842 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -466,7 +466,7 @@ class CMathCallable(ScalarCallable):
     C-Target.
     """
 
-    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
+    def with_types(self, arg_id_to_dtype, callables_table):
         name = self.name
 
         if name in ["abs", "min", "max"]:
@@ -497,18 +497,16 @@ class CMathCallable(ScalarCallable):
             elif dtype.kind == "c":
                 raise LoopyTypeError(f"{name} does not support type {dtype}")
 
-            from loopy.target.opencl import OpenCLTarget
-            if not isinstance(caller_kernel.target, OpenCLTarget):
-                # for CUDA, C Targets the name must be modified
-                if dtype == np.float64:
-                    pass  # fabs
-                elif dtype == np.float32:
-                    name = name + "f"  # fabsf
-                elif dtype == np.float128:  # pylint:disable=no-member
-                    name = name + "l"  # fabsl
-                else:
-                    raise LoopyTypeError("{} does not support type {}".format(name,
-                        dtype))
+            # for CUDA, C Targets the name must be modified
+            if dtype == np.float64:
+                pass  # fabs
+            elif dtype == np.float32:
+                name = name + "f"  # fabsf
+            elif dtype == np.float128:  # pylint:disable=no-member
+                name = name + "l"  # fabsl
+            else:
+                raise LoopyTypeError("{} does not support type {}".format(name,
+                    dtype))
 
             return (
                     self.copy(name_in_target=name,
@@ -521,9 +519,6 @@ class CMathCallable(ScalarCallable):
 
             for id in arg_id_to_dtype:
                 if not -1 <= id <= 1:
-                    #FIXME: Do we need to raise here?:
-                    # The pattern we generally follow is that if we don't find
-                    # a function, then we just return None
                     raise LoopyError("%s can take only two arguments."
% name) if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( @@ -542,17 +537,15 @@ class CMathCallable(ScalarCallable): raise LoopyTypeError("%s does not support complex numbers") elif dtype.kind == "f": - from loopy.target.opencl import OpenCLTarget - if not isinstance(caller_kernel.target, OpenCLTarget): - if dtype == np.float64: - pass # fmin - elif dtype == np.float32: - name = name + "f" # fminf - elif dtype == np.float128: # pylint:disable=no-member - name = name + "l" # fminl - else: - raise LoopyTypeError("%s does not support type %s" - % (name, dtype)) + if dtype == np.float64: + pass # fmin + elif dtype == np.float32: + name = name + "f" # fminf + elif dtype == np.float128: # pylint:disable=no-member + name = name + "l" # fminl + else: + raise LoopyTypeError("%s does not support type %s" + % (name, dtype)) dtype = NumpyType(dtype) return ( self.copy(name_in_target=name, diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 23f6e92f3..70f046c9d 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -451,23 +451,6 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - - identifier_name = ( - self.codegen_state.callables_table[expr.function.name].name) - - from loopy.kernel.function_interface import ManglerCallable - if isinstance(self.codegen_state.callables_table[expr.function.name], - ManglerCallable): - from loopy.codegen import SeenFunction - in_knl_callable = ( - self.codegen_state.callables_table[ - expr.function.name]) - mangle_result = in_knl_callable.mangle_result(self.kernel) - self.codegen_state.seen_functions.add( - SeenFunction(identifier_name, - mangle_result.target_name, - mangle_result.arg_dtypes)) - return ( self.codegen_state.callables_table[ expr.function.name].emit_call( @@ -666,7 +649,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): from loopy.codegen import SeenFunction clbl = self.codegen_state.ast_builder.known_callables["pow"] clbl = clbl.with_types({0: tgt_dtype, 1: exponent_dtype}, - self.kernel, self.codegen_state.callables_table)[0] + self.codegen_state.callables_table)[0] self.codegen_state.seen_functions.add( SeenFunction( clbl.name, clbl.name_in_target, diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 54b1006ad..ee99f27e7 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -121,13 +121,10 @@ _CUDA_SPECIFIC_FUNCTIONS = { class CudaCallable(ScalarCallable): - def cuda_with_types(self, arg_id_to_dtype, caller_kernel, - callables_table): + def cuda_with_types(self, arg_id_to_dtype, callables_table): name = self.name - # FIXME: dot is not implemented yet. - if name in _CUDA_SPECIFIC_FUNCTIONS: num_args = _CUDA_SPECIFIC_FUNCTIONS[name] diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 22fa78a55..affe9ff5b 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -171,10 +171,71 @@ class OpenCLCallable(ScalarCallable): :class:`loopy.target.c.CMathCallable`. 
""" - def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + def with_types(self, arg_id_to_dtype, callables_table): name = self.name - if name in ["max", "min"]: + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError(f"'{name}' can take only one argument.") + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype + + if dtype.kind in ("u", "i"): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == "c": + raise LoopyTypeError(f"{name} does not support type {dtype}") + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) + # binary functions + elif name in ["fmax", "fmin", "atan2", "copysign"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + #FIXME: Do we need to raise here?: + # The pattern we generally follow is that if we don't find + # a function, then we just return None + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + + elif name in ["max", "min"]: for id in arg_id_to_dtype: if not -1 <= id <= 1: raise LoopyError("%s can take only 2 arguments." % name) @@ -200,7 +261,7 @@ class OpenCLCallable(ScalarCallable): raise LoopyError("%s function not supported for the types %s" % (name, common_dtype)) - if name == "dot": + elif name == "dot": for id in arg_id_to_dtype: if not -1 <= id <= 1: raise LoopyError(f"'{name}' can take only 2 arguments.") @@ -220,7 +281,7 @@ class OpenCLCallable(ScalarCallable): NumpyType(scalar_dtype), 0: dtype, 1: dtype}), callables_table) - if name == "pow": + elif name == "pow": for id in arg_id_to_dtype: if not -1 <= id <= 1: raise LoopyError(f"'{name}' can take only 2 arguments.") @@ -244,7 +305,7 @@ class OpenCLCallable(ScalarCallable): 0: common_dtype, 1: common_dtype}), callables_table) - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: + elif name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] for id in arg_id_to_dtype: if not -1 <= id < num_args: @@ -275,7 +336,7 @@ class OpenCLCallable(ScalarCallable): arg_id_to_dtype=updated_arg_id_to_dtype), callables_table) - if name in VECTOR_LITERAL_FUNCS: + elif name in VECTOR_LITERAL_FUNCS: base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] for id in arg_id_to_dtype: @@ -313,8 +374,13 @@ def get_opencl_callables(): Returns an instance of :class:`InKernelCallable` if the function defined by *identifier* is known in OpenCL. 
""" - opencl_function_ids = {"max", "min", "dot", "pow"} | set( - _CL_SIMPLE_MULTI_ARG_FUNCTIONS) | set(VECTOR_LITERAL_FUNCS) + opencl_function_ids = ( + {"max", "min", "dot", "pow", "abs", "acos", "asin", + "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", + "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs", "tan", "erf", "erfc"} + | set(_CL_SIMPLE_MULTI_ARG_FUNCTIONS) + | set(VECTOR_LITERAL_FUNCS)) return {id_: OpenCLCallable(name=id_) for id_ in opencl_function_ids} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 59b90ef90..a192520c4 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -201,7 +201,7 @@ class PyOpenCLCallable(ScalarCallable): Records information about the callables which are not covered by :class:`loopy.target.opencl.OpenCLCallable` """ - def with_types(self, arg_id_to_dtype, caller_kernel, callables_table): + def with_types(self, arg_id_to_dtype, callables_table): name = self.name @@ -816,7 +816,7 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): from loopy.library.random123 import get_random123_callables callables = super().known_callables callables.update(get_pyopencl_callables()) - callables.update(get_random123_callables()) + callables.update(get_random123_callables(self.target)) return callables def preamble_generators(self): diff --git a/loopy/target/python.py b/loopy/target/python.py index 03910e120..c7f20ff55 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -90,16 +90,8 @@ class ExpressionToPythonMapper(StringifyMapper): raise LoopyError( "indexof, indexof_vec not yet supported in Python") - from loopy.kernel.function_interface import ManglerCallable clbl = self.codegen_state.callables_table[ expr.function.name] - if isinstance(clbl, ManglerCallable): - from loopy.codegen import SeenFunction - mangle_result = clbl.mangle_result(self.kernel) - self.codegen_state.seen_functions.add( - SeenFunction(identifier_name, - mangle_result.target_name, - mangle_result.arg_dtypes)) str_parameters = None number_of_assignees = len([key for key in diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 422404411..4410a2676 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -426,137 +426,75 @@ class TypeInferenceMapper(CombineMapper): tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())} # specializing the known function wrt type - if isinstance(expr.function, ResolvedFunction): - in_knl_callable = self.clbl_inf_ctx[expr.function.name] - - # {{{ checking that there is no overwriting of types of in_knl_callable - - if in_knl_callable.arg_id_to_dtype is not None: - - # specializing an already specialized function. 
- for id, dtype in arg_id_to_dtype.items(): - if id in in_knl_callable.arg_id_to_dtype and ( - in_knl_callable.arg_id_to_dtype[id] != - arg_id_to_dtype[id]): - - # {{{ ignoring the the cases when there is a discrepancy - # between np.uint and np.int + in_knl_callable = self.clbl_inf_ctx[expr.function.name] - import numpy as np - if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( - np.uint32) and ( - arg_id_to_dtype[id].dtype.type == np.int32): - continue - if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( - np.uint64) and ( - arg_id_to_dtype[id].dtype.type == - np.int64): - continue + # {{{ checking that there is no overwriting of types of in_knl_callable - if np.can_cast(arg_id_to_dtype[id].dtype.type, - in_knl_callable.arg_id_to_dtype[id].dtype.type): - continue + if in_knl_callable.arg_id_to_dtype is not None: - # }}} + # specializing an already specialized function. + for id, dtype in arg_id_to_dtype.items(): + if id in in_knl_callable.arg_id_to_dtype and ( + in_knl_callable.arg_id_to_dtype[id] != + arg_id_to_dtype[id]): - raise LoopyError("Overwriting a specialized function " - "is illegal--maybe start with new instance of " - "InKernelCallable?") + # {{{ ignoring the the cases when there is a discrepancy + # between np.uint and np.int - # }}} + import numpy as np + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint32) and ( + arg_id_to_dtype[id].dtype.type == np.int32): + continue + if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( + np.uint64) and ( + arg_id_to_dtype[id].dtype.type == + np.int64): + continue - in_knl_callable, self.clbl_inf_ctx = ( - in_knl_callable.with_types( - arg_id_to_dtype, self.kernel, - self.clbl_inf_ctx)) - - in_knl_callable = in_knl_callable.with_target(self.kernel.target) - - # storing the type specialized function so that it can be used for - # later use - self.clbl_inf_ctx, new_function_id = ( - self.clbl_inf_ctx.with_callable( - expr.function.function, - in_knl_callable)) - - if isinstance(expr, Call): - self.old_calls_to_new_calls[expr] = new_function_id - else: - assert isinstance(expr, CallWithKwargs) - self.old_calls_to_new_calls[expr] = new_function_id + if np.can_cast(arg_id_to_dtype[id].dtype.type, + in_knl_callable.arg_id_to_dtype[id].dtype.type): + continue - new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + # }}} - if new_arg_id_to_dtype is None: - return [] + raise LoopyError("Overwriting a specialized function " + "is illegal--maybe start with new instance of " + "InKernelCallable?") - # collecting result dtypes in order of the assignees - if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: - if return_tuple: - return [get_return_types_as_tuple(new_arg_id_to_dtype)] - else: - return [new_arg_id_to_dtype[-1]] + # }}} - elif isinstance(expr.function, Variable): - # Since, the function is not "scoped", attempt to infer using - # kernel.function_manglers + in_knl_callable, self.clbl_inf_ctx = ( + in_knl_callable.with_types( + arg_id_to_dtype, + self.clbl_inf_ctx)) - # {{{ trying to infer using function manglers + in_knl_callable = in_knl_callable.with_target(self.kernel.target) - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in - expr.parameters) + # storing the type specialized function so that it can be used for + # later use + self.clbl_inf_ctx, new_function_id = ( + self.clbl_inf_ctx.with_callable( + expr.function.function, + in_knl_callable)) - # finding the function_mangler which would be associated with the - # realized function. 
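# (An aside on the overwrite check above, since the carve-outs can look
# redundant at first glance: np.can_cast alone does not cover the
# unsigned/signed pairs, hence the explicit uint32/int32 and uint64/int64
# cases.  Illustrative snippet only:
#
#     import numpy as np
#     np.can_cast(np.float32, np.float64)   # True  -> tolerated by can_cast
#     np.can_cast(np.int32, np.uint32)      # False -> needs the explicit case
#
# Without those cases, a caller supplying an int32 where a uint32 was
# already recorded would trip the "overwriting a specialized function"
# error.)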
+ if isinstance(expr, Call): + self.old_calls_to_new_calls[expr] = new_function_id + else: + assert isinstance(expr, CallWithKwargs) + self.old_calls_to_new_calls[expr] = new_function_id - mangle_result = None - for function_mangler in self.kernel.function_manglers: - mangle_result = function_mangler(self.kernel, identifier, - arg_dtypes) - if mangle_result: - # found a match. - break + new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype - if mangle_result is not None: - from loopy.kernel.function_interface import ManglerCallable - - # creating arg_id_to_dtype from arg_dtypes - arg_id_to_dtype = {i: dt.with_target(self.kernel.target) - for i, dt in enumerate(mangle_result.arg_dtypes)} - arg_id_to_dtype.update({-i-1: - dtype.with_target(self.kernel.target) for i, dtype in enumerate( - mangle_result.result_dtypes)}) - - # creating the ManglerCallable object corresponding to the - # function. - in_knl_callable = ManglerCallable( - identifier, function_mangler, arg_id_to_dtype, - name_in_target=mangle_result.target_name) - # FIXME: we have not tested how it works with mangler callable - # yet. - self.clbl_inf_ctx, new_function_id = ( - self.clbl_inf_ctx.with_callable( - expr.function, in_knl_callable)) - - if isinstance(expr, Call): - self.old_calls_to_new_calls[expr] = new_function_id - else: - assert isinstance(expr, CallWithKwargs) - self.old_calls_to_new_calls = new_function_id + if new_arg_id_to_dtype is None: + return [] - # Returning the type. + # collecting result dtypes in order of the assignees + if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None: if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] + return [get_return_types_as_tuple(new_arg_id_to_dtype)] else: - if mangle_result is not None: - if len(mangle_result.result_dtypes) != 1 and not return_tuple: - raise LoopyError("functions with more or fewer than one " - "return value may only be used in direct " - "assignments") - - return [mangle_result.result_dtypes[0]] - # }}} + return [new_arg_id_to_dtype[-1]] return [] @@ -678,10 +616,10 @@ class TypeInferenceMapper(CombineMapper): rec_results = self.rec(expr.expr) if return_tuple: - return [expr.operation.result_dtypes(self.kernel, *rec_result) + return [expr.operation.result_dtypes(*rec_result) for rec_result in rec_results] else: - return [expr.operation.result_dtypes(self.kernel, rec_result)[0] + return [expr.operation.result_dtypes(rec_result)[0] for rec_result in rec_results] def map_sub_array_ref(self, expr): @@ -1111,13 +1049,11 @@ def infer_unknown_types(program, expect_completion=False): renamed_entrypoints = set() for e in program.entrypoints: - # FIXME: Need to add docs which say that we need not add the current - # callable to the clbl_inf_ctx while writing the "with_types" logger.debug(f"Entering entrypoint: {e}") arg_id_to_dtype = {arg.name: arg.dtype for arg in program[e].args if arg.dtype not in (None, auto)} new_callable, clbl_inf_ctx = program.callables_table[e].with_types( - arg_id_to_dtype, None, clbl_inf_ctx) + arg_id_to_dtype, clbl_inf_ctx) clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) renamed_entrypoints.add(new_name.name) @@ -1174,7 +1110,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression( raise LoopyError("failed to determine type of accumulator for " "reduction '%s'" % expr) - reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes) + reduction_dtypes = expr.operation.result_dtypes(*arg_dtypes) reduction_dtypes = tuple( dt.with_target(kernel.target) if 
dt is not lp.auto else dt diff --git a/test/testlib.py b/test/testlib.py index 034a0188e..7009e8f5a 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -138,7 +138,7 @@ class SeparateTemporariesPreambleTestPreambleGenerator( class Log2Callable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, callables_table): + def with_types(self, arg_id_to_dtype, callables_table): if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: # the types provided aren't mature enough to specialize the @@ -153,14 +153,13 @@ class Log2Callable(lp.ScalarCallable): # ints and unsigned casted to float32 dtype = np.float32 - from loopy.target.opencl import OpenCLTarget - name_in_target = "log2" - if not isinstance(kernel.target, OpenCLTarget): - # for CUDA, C Targets the name must be modified - if dtype == np.float32: - name_in_target = "log2f" - elif dtype == np.float128: - name_in_target = "log2l" + if dtype.type == np.float32: + name_in_target = "log2f" + elif dtype.type == np.float64: + name_in_target = "log2" + pass + else: + raise TypeError(f"log2: unexpected type {dtype}") from loopy.types import NumpyType return ( -- GitLab From 72251c12af7ece0b5ae146949062cde367ed64d2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 20 Jan 2021 18:00:53 -0600 Subject: [PATCH 729/916] fixes minor typo (bug) --- loopy/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 20ed08402..0fcde42a5 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2127,7 +2127,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): caller_arg = self.caller_kernel.temporary_variables.get(dep, caller_arg) if not (isinstance(caller_arg, ValueArg) or (isinstance(caller_arg, - ArrayBase) and arg.shape == ())): + ArrayBase) and caller_arg.shape == ())): raise NotImplementedError(f"Obtained '{dep}' as a dependency for" f" call '{expr.function.name}' which is not a scalar.") -- GitLab From d87794e2b2d1db400c7052e7ba77aefe1acd6ce5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 21 Jan 2021 03:07:30 -0600 Subject: [PATCH 730/916] do not empty domain for every array access --- loopy/kernel/creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 3388306dc..1e40fc1e1 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1887,7 +1887,6 @@ class SliceToInameReplacer(IdentityMapper): def map_subscript(self, expr): subscript_iname_bounds = {} - self.subarray_ref_bounds.append(subscript_iname_bounds) new_index = [] swept_inames = [] @@ -1915,6 +1914,7 @@ class SliceToInameReplacer(IdentityMapper): new_index.append(index) if swept_inames: + self.subarray_ref_bounds.append(subscript_iname_bounds) result = SubArrayRef(tuple(swept_inames), Subscript( self.rec(expr.aggregate), self.rec(tuple(new_index)))) -- GitLab From 6765834007747c39ad636a504c90e2fe95ebbfd6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 25 Jan 2021 12:23:13 -0600 Subject: [PATCH 731/916] adds a failing case for isl parameters in domain --- test/test_callables.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/test_callables.py b/test/test_callables.py index a73a8a6c3..6dd2fef17 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -689,6 +689,30 @@ def test_inlining_with_indirections(ctx_factory): assert (expected_out == out).all() +def test_inlining_with_callee_domain_param(ctx_factory): + queue = 
cl.CommandQueue(ctx_factory()) + + fill2 = lp.make_function( + "{[i]: 0<=i 1: exec(sys.argv[1]) -- GitLab From bb2857b74755ad86d9cac1690795d6bd94de3f64 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 25 Jan 2021 12:25:24 -0600 Subject: [PATCH 732/916] avoid passing lang_version during kernel instantiation --- test/test_callables.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_callables.py b/test/test_callables.py index 6dd2fef17..2ce571274 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -696,9 +696,8 @@ def test_inlining_with_callee_domain_param(ctx_factory): "{[i]: 0<=i Date: Mon, 1 Feb 2021 14:46:19 -0600 Subject: [PATCH 733/916] minor: order arithmetic so that GuarderPwQPolynomial.__(add|mul)__ is invoked --- loopy/statistics.py | 4 ++-- test/test_statistics.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 1fec25a62..9257cafc1 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -215,7 +215,7 @@ class ToCountMap: def __mul__(self, other): if isinstance(other, GuardedPwQPolynomial): return self.copy({ - index: value*other + index: other*value for index, value in self.count_map.items()}) else: raise ValueError("ToCountMap: Attempted to multiply " @@ -451,7 +451,7 @@ class ToCountMap: total = self._zero() for k, v in self.count_map.items(): - total += v + total = v + total return total diff --git a/test/test_statistics.py b/test/test_statistics.py index 24cb1bd4f..4136f8d06 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -1268,7 +1268,7 @@ def test_gather_access_footprint_2(): fp = gather_access_footprints(knl) params = {"n": 200} - for key, footprint in fp.item(): + for key, footprint in fp.items(): assert count(knl, footprint).eval_with_dict(params) == 200 print(key, count(knl, footprint)) -- GitLab From e2d8d5bf21587ee2ac2f246df449eca592e23218 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 1 Feb 2021 16:18:12 -0600 Subject: [PATCH 734/916] cache the preprocessing of entire program, rather than individual kernels --- loopy/preprocess.py | 58 +++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 0fcde42a5..45738dd1a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2312,21 +2312,6 @@ preprocess_cache = WriteOncePersistentDict( def preprocess_single_kernel(kernel, callables_table, device=None): from loopy.kernel import KernelState - # {{{ cache retrieval - - from loopy import CACHING_ENABLED - if CACHING_ENABLED: - input_kernel = kernel - - try: - result = preprocess_cache[kernel] - logger.debug("%s: preprocess cache hit" % kernel.name) - return result - except KeyError: - pass - - # }}} - prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name) from loopy.check import check_identifiers_in_subst_rules @@ -2369,27 +2354,27 @@ def preprocess_single_kernel(kernel, callables_table, device=None): prepro_logger.done() - # {{{ prepare for caching - - # PicklableDtype instances for example need to know the target they're working - # towards in order to pickle and unpickle them. This is the first pass that - # uses caching, so we need to be ready to pickle. This means propagating - # this target information. 
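# (Aside: the pickling concern described above is what prepare_for_caching
# addresses -- dtypes must carry their target before results are stored.
# The program-level cache round trip that replaces this block is, in
# outline (illustrative identifier, sketch only):
#
#     from pytools.persistent_dict import WriteOncePersistentDict
#     from loopy.tools import LoopyKeyBuilder
#
#     cache = WriteOncePersistentDict("illustrative-preprocess-cache",
#                                     key_builder=LoopyKeyBuilder())
#
#     def cached_preprocess(program):
#         try:
#             return cache[program]   # the whole translation unit is the key
#         except KeyError:
#             result = preprocess_program(program)
#             cache.store_if_not_present(program, result)
#             return result
# )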
+ return kernel - if CACHING_ENABLED: - input_kernel = prepare_for_caching(input_kernel) - kernel = prepare_for_caching(kernel) +def preprocess_program(program, device=None): - # }}} + # {{{ cache retrieval + from loopy import CACHING_ENABLED if CACHING_ENABLED: - preprocess_cache.store_if_not_present(input_kernel, kernel) + input_program = program - return kernel + try: + result = preprocess_cache[program] + logger.debug(f"program with entrypoints: {program.entrypoints}" + " preprocess cache hit") + return result + except KeyError: + pass + # }}} -def preprocess_program(program, device=None): from loopy.kernel import KernelState if program.state >= KernelState.PREPROCESSED: return program @@ -2468,6 +2453,23 @@ def preprocess_program(program, device=None): # inline_kernels_with_gbarriers does not recursively inline the callees. program = inline_kernels_with_gbarriers(program) + # {{{ prepare for caching + + # PicklableDtype instances for example need to know the target they're working + # towards in order to pickle and unpickle them. This is the first pass that + # uses caching, so we need to be ready to pickle. This means propagating + # this target information. + + if CACHING_ENABLED: + input_program = prepare_for_caching(input_program) + + program = prepare_for_caching(program) + + # }}} + + if CACHING_ENABLED: + preprocess_cache.store_if_not_present(input_program, program) + return program -- GitLab From dbb086e4fdbf687dd340b8ba4dcffa8ee574d631 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 1 Feb 2021 17:08:41 -0600 Subject: [PATCH 735/916] tests statistics for callable kernels --- loopy/statistics.py | 10 ++++---- test/test_statistics.py | 51 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 9257cafc1..34027a5a0 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -98,6 +98,7 @@ def _get_param_tuple(obj): class GuardedPwQPolynomial: def __init__(self, pwqpolynomial, valid_domain): + assert isinstance(pwqpolynomial, isl.PwQPolynomial) self.pwqpolynomial = pwqpolynomial self.valid_domain = valid_domain @@ -664,10 +665,10 @@ class Op(ImmutableRecord): def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness if self.kernel_name is not None: - return (f"Op({self.dtype}, {self.name}, {self.count_granularity}," - f" {self.kernel_name})") + return (f'Op("{self.dtype}", "{self.name}", "{self.count_granularity}",' + f' "{self.kernel_name}")') else: - return f"Op({self.dtype}, {self.name}, {self.count_granularity})" + return f'Op("{self.dtype}", "{self.name}", "{self.count_granularity}")' # }}} @@ -1548,7 +1549,8 @@ def get_unused_hw_axes_factor(knl, callables_table, insn, def count_inames_domain(knl, inames): space = get_kernel_parameter_space(knl) if not inames: - return get_kernel_zero_pwqpolynomial(knl) + 1 + return add_assumptions_guard(knl, + get_kernel_zero_pwqpolynomial(knl) + 1) inames_domain = knl.get_inames_domain(inames) domain = inames_domain.project_out_except(inames, [dim_type.set]) diff --git a/test/test_statistics.py b/test/test_statistics.py index 4136f8d06..ca38b9af6 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -1400,6 +1400,57 @@ def test_strided_footprint(): assert 2*num < denom +def test_stats_on_callable_kernel(): + callee = lp.make_function( + "{[i, j]: 0<=i, j< 20}", + """ + y[i] = sum(j, A[i,j]*x[j]) + """, name="matvec20x20") + + caller = lp.make_kernel( + "{:}", + """ + y[:] = 
matvec20x20(A[:,:], x[:]) + """, + [ + lp.GlobalArg("x,y", shape=(20,), dtype=np.float), + lp.GlobalArg("A", shape=(20, 20), dtype=np.float), + ], + name="matvec") + caller = lp.merge([caller, callee]) + + op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + f64_add = op_map.filter_by(name="add").eval_and_sum({}) + assert f64_add == 400 + + +def test_stats_on_callable_kernel_within_loop(): + callee = lp.make_function( + "{[i, j]: 0<=i, j< 20}", + """ + y[i] = sum(j, A[i,j]*x[j]) + """, name="matvec20x20") + + caller = lp.make_kernel( + "{[i]: 0<=i< 20}", + """ + y[i, :] = matvec20x20(A[:,:], x[i, :]) + """, + [ + lp.GlobalArg("x,y", shape=(20, 20), dtype=np.float), + lp.GlobalArg("A", shape=(20, 20), dtype=np.float), + ], + name="matmat") + caller = lp.merge([caller, callee]) + + op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + + f64_add = op_map.filter_by(name="add").eval_and_sum({}) + assert f64_add == 8000 + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From e658837714adc9bd738e1670d325b7aaedfff223 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 1 Feb 2021 20:16:18 -0600 Subject: [PATCH 736/916] minor fix to correct the substitution of caller args in callee's stats exprs --- loopy/statistics.py | 17 +++++++++-------- test/test_statistics.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 34027a5a0..c86896054 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -841,21 +841,22 @@ class CounterBase(CombineMapper): assert isinstance(expr.function, ResolvedFunction) clbl = self.callables_table[expr.function.name] - from loopy.kernel.function_interface import CallableKernel + from loopy.kernel.function_interface import (CallableKernel, + get_kw_pos_association) from loopy.kernel.data import ValueArg if isinstance(clbl, CallableKernel): sub_result = self.kernel_rec(clbl.subkernel) + _, pos_to_kw = get_kw_pos_association(clbl.subkernel) - arg_dict = { - arg.name: value - for arg, value in zip( - clbl.subkernel.args, - expr.parameters) - if isinstance(arg, ValueArg)} + subst_dict = { + pos_to_kw[i]: param + for i, param in enumerate(expr.parameters) + if isinstance(clbl.subkernel.arg_dict[pos_to_kw[i]], + ValueArg)} return subst_into_to_count_map( self.param_space, - sub_result, arg_dict) \ + sub_result, subst_dict) \ + self.rec(expr.parameters) else: diff --git a/test/test_statistics.py b/test/test_statistics.py index ca38b9af6..499179351 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -1451,6 +1451,34 @@ def test_stats_on_callable_kernel_within_loop(): assert f64_add == 8000 +def test_callable_kernel_with_substitution(): + callee = lp.make_function( + "{[i, j]: 0<=i, j< n}", + """ + y[i] = sum(j, A[i,j]*x[j]) + """, + [lp.ValueArg("n"), ...], + name="matvec") + + caller = lp.make_kernel( + "{[i]: 0<=i< 20}", + """ + y[i, :] = matvec(20, A[:,:], x[i, :]) + """, + [ + lp.GlobalArg("x,y", shape=(20, 20), dtype=np.float), + lp.GlobalArg("A", shape=(20, 20), dtype=np.float), + ], + name="matmat") + caller = lp.merge([caller, callee]) + + op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + + f64_add = op_map.filter_by(name="add").eval_and_sum({}) + assert f64_add == 8000 + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab 
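The three tests above encode the counting rule for fused callables: the
callee's operation-count polynomial is substituted with the call-site
arguments, so matmat's 20 outer iterations times matvec's 20x20
multiply-accumulates yield the asserted 8000 double-precision additions
(20*20*20). A standalone sketch of the same query, staying close to the
merge/get_op_map flow used in the tests (sizes and names here are
illustrative, and the multiply count is expected to match the add count):

    import numpy as np
    import loopy as lp

    callee = lp.make_function(
        "{[i, j]: 0<=i, j< n}",
        "y[i] = sum(j, A[i, j]*x[j])",
        [lp.ValueArg("n"), ...],
        name="matvec")

    caller = lp.make_kernel(
        "{[i]: 0<=i< 20}",
        "y[i, :] = matvec(20, A[:, :], x[i, :])",
        [lp.GlobalArg("x,y", shape=(20, 20), dtype=np.float64),
         lp.GlobalArg("A", shape=(20, 20), dtype=np.float64)],
        name="matmat")

    prog = lp.merge([caller, callee])
    op_map = lp.get_op_map(prog, subgroup_size=32, count_redundant_work=True,
                           count_within_subscripts=True)

    adds = op_map.filter_by(name="add", dtype=[np.float64]).eval_and_sum({})
    muls = op_map.filter_by(name="mul", dtype=[np.float64]).eval_and_sum({})
    print(adds, muls)   # 8000 additions; multiplications should match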
From 0b67ac1e7d77a928e22c40705119c6b0a0bd59f8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 1 Feb 2021 20:58:53 -0600 Subject: [PATCH 737/916] fixes minor typo --- loopy/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index c86896054..7bd9fd113 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1851,8 +1851,8 @@ def _get_mem_access_map_for_single_kernel(knl, callables_table, ).with_set_attributes(direction="load") for assignee in insn.assignees: insn_access_map = insn_access_map + ( - access_counter_g(insn.assignee) - + access_counter_l(insn.assignee) + access_counter_g(assignee) + + access_counter_l(assignee) ).with_set_attributes(direction="store") for key, val in insn_access_map.count_map.items(): -- GitLab From 74379a4ce407b7b164cfef5396ce7a4b6390e658 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 5 Feb 2021 20:58:38 -0600 Subject: [PATCH 738/916] completes inliner implementation for parametric callee domains --- loopy/kernel/__init__.py | 2 +- loopy/transform/callable.py | 351 +++++++++++++++++++++--------------- 2 files changed, 211 insertions(+), 142 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index dc26c2d9a..390969bf8 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -332,7 +332,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): assumptions = isl.BasicSet.read_from_str(domains[0].get_ctx(), assumptions_set_str) - assert assumptions.is_params() + # assert assumptions.is_params() # }}} diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index da3b107e1..76f17c02a 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -28,12 +28,13 @@ from loopy.kernel import LoopKernel from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, Assignment, CInstruction, _DataObliviousInstruction) -from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper +from loopy.symbolic import ( + RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext, CombineMapper, IdentityMapper) from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.program import Program -from loopy.symbolic import SubArrayRef __doc__ = """ .. currentmodule:: loopy @@ -112,54 +113,41 @@ def merge(translation_units): # {{{ kernel inliner mapper -class KernelInliner(SubstitutionMapper): - """Mapper to replace variables (indices, temporaries, arguments) in the - callee kernel with variables in the caller kernel. 
- - :arg caller: the caller kernel - :arg arg_map: dict of argument name to variables in caller - :arg arg_dict: dict of argument name to arguments in callee - """ +class KernelInliner(RuleAwareSubstitutionMapper): + def __init__(self, rule_mapping_context, subst_func, caller_knl, + callee_knl, callee_arg_to_call_param): + super().__init__(rule_mapping_context, subst_func, lambda *args: True) + self.caller_knl = caller_knl + self.callee_knl = callee_knl + self.callee_arg_to_call_param = callee_arg_to_call_param - def __init__(self, subst_func, caller, arg_map, arg_dict): - super().__init__(subst_func) - self.caller = caller - self.arg_map = arg_map - self.arg_dict = arg_dict + def map_subscript(self, expr, expn_state): + if expr.aggregate.name in self.callee_knl.arg_dict: + from loopy.symbolic import get_start_subscript_from_sar + from loopy.isl_helpers import simplify_via_aff + from pymbolic.primitives import Subscript, Variable - def map_subscript(self, expr): - if expr.aggregate.name in self.arg_map: + sar = self.callee_arg_to_call_param[expr.aggregate.name] # SubArrayRef - aggregate = self.subst_func(expr.aggregate) - sar = self.arg_map[expr.aggregate.name] # SubArrayRef in caller - callee_arg = self.arg_dict[expr.aggregate.name] # Arg in callee - if aggregate.name in self.caller.arg_dict: - caller_arg = self.caller.arg_dict[aggregate.name] # Arg in caller + callee_arg = self.callee_knl.arg_dict[expr.aggregate.name] + if sar.subscript.aggregate.name in self.caller_knl.arg_dict: + caller_arg = self.caller_knl.arg_dict[sar.subscript.aggregate.name] else: - caller_arg = self.caller.temporary_variables[aggregate.name] + caller_arg = self.caller_knl.temporary_variables[ + sar.subscript.aggregate.name] - # Firstly, map inner inames to outer inames. - outer_indices = self.map_tuple(expr.index_tuple) - - # Next, reshape to match dimension of outer arrays. - # We can have e.g. A[3, 2] from outside and B[6] from inside - from numbers import Integral - if not all(isinstance(d, Integral) for d in callee_arg.shape): - raise LoopyError( - "Argument: {} in callee kernel does not have " - "constant shape.".format(callee_arg)) + # map inner inames to outer inames. + outer_indices = self.map_tuple(expr.index_tuple, expn_state) flatten_index = 0 - from loopy.symbolic import get_start_subscript_from_sar for i, idx in enumerate(get_start_subscript_from_sar(sar, - self.caller).index_tuple): + self.caller_knl).index_tuple): flatten_index += idx*caller_arg.dim_tags[i].stride flatten_index += sum( idx * tag.stride for idx, tag in zip(outer_indices, callee_arg.dim_tags)) - from loopy.isl_helpers import simplify_via_aff flatten_index = simplify_via_aff(flatten_index) new_indices = [] @@ -170,80 +158,143 @@ class KernelInliner(SubstitutionMapper): new_indices = tuple(simplify_via_aff(i) for i in new_indices) - return aggregate.index(tuple(new_indices)) + return Subscript(Variable(sar.subscript.aggregate.name), new_indices) else: - return super().map_subscript(expr) + assert expr.aggregate.name in self.callee_knl.temporary_variables + return super().map_subscript(expr, expn_state) # }}} # {{{ inlining of a single call instruction -def _inline_call_instruction(caller_kernel, callee_knl, instruction): +def substitute_into_domain(domain, param_name, expr, allowed_param_dims): """ - Returns a copy of *kernel* with the *instruction* in the *kernel* - replaced by inlining :attr:`subkernel` within it. 
+ :arg allowed_deps: A :class:`list` of :class:`str` that are """ - callee_label = callee_knl.name[:4] + "_" + import pymbolic.primitives as prim + from loopy.symbolic import get_dependencies, isl_set_from_expr + if param_name not in domain.get_var_dict(): + # param_name not in domain => domain will be unchanged + return domain - # {{{ duplicate and rename inames + # {{{ rename 'param_name' to avoid namespace pollution with allowed_param_dims - vng = caller_kernel.get_var_name_generator() - ing = caller_kernel.get_instruction_id_generator() - dim_type = isl.dim_type.set + dt, pos = domain.get_var_dict()[param_name] + domain = domain.set_dim_name(dt, pos, UniqueNameGenerator( + set(allowed_param_dims))(param_name)) - iname_map = {} - for iname in callee_knl.all_inames(): - iname_map[iname] = vng(callee_label+iname) + # }}} - new_domains = [] - new_iname_to_tags = caller_kernel.iname_to_tags.copy() + for dep in get_dependencies(expr): + if dep in allowed_param_dims: + domain = domain.add_dims(isl.dim_type.param, 1) + domain = domain.set_dim_name( + isl.dim_type.param, + domain.dim(isl.dim_type.param)-1, + dep) + else: + raise ValueError("Augmenting caller's domain " + f"with '{dep}' is not allowed.") - # transferring iname tags info from the callee to the caller kernel - for domain in callee_knl.domains: - new_domain = domain.copy() - for i in range(new_domain.n_dim()): - iname = new_domain.get_dim_name(dim_type, i) + set_ = isl_set_from_expr(domain.space, + prim.Comparison(prim.Variable(param_name), + "==", + expr)) - if iname in callee_knl.iname_to_tags: - new_iname_to_tags[iname_map[iname]] = ( - callee_knl.iname_to_tags[iname]) - new_domain = new_domain.set_dim_name( - dim_type, i, iname_map[iname]) - new_domains.append(new_domain) + bset, = set_.get_basic_sets() + domain = domain & bset + + return domain.project_out(dt, pos, 1) - kernel = caller_kernel.copy(domains=caller_kernel.domains + new_domains, - iname_to_tags=new_iname_to_tags) + +def rename_iname(domain, old_iname, new_iname): + if old_iname not in domain.get_var_dict(): + return domain + + dt, pos = domain.get_var_dict()[old_iname] + return domain.set_dim_name(dt, pos, new_iname) + + +def get_valid_domain_param_names(knl): + from loopy.kernel.data import ValueArg + return ([arg.name for arg in knl.args if isinstance(arg, ValueArg)] + + [tv.name + for tv in knl.temporary_variables.values() + if tv.shape == ()] + + list(knl.all_inames()) + ) + + +def _inline_call_instruction(caller_knl, callee_knl, call_insn): + """ + Returns a copy of *caller_knl* with the *call_insn* in the *kernel* + replaced by inlining *callee_knl* into it within it. + """ + import pymbolic.primitives as prim + from pymbolic.mapper.substitutor import make_subst_func + from loopy.kernel.data import ValueArg + + # {{{ sanity checks + + assert call_insn.expression.function.name == callee_knl.name # }}} - # {{{ rename temporaries + callee_label = callee_knl.name[:4] + "_" + vng = caller_knl.get_var_name_generator() + ing = caller_knl.get_instruction_id_generator() + + # {{{ construct callee->caller name mappings - temp_map = {} - new_temps = kernel.temporary_variables.copy() - for name, temp in callee_knl.temporary_variables.items(): + # name_map: Mapping[str, str] + # A mapping from variable names in the callee kernel's namespace to + # the ones they would be referred by in the caller's namespace post inlining. + name_map = {} + + # only consider temporary variables and inames, arguments would be mapping + # according to the invocation in call_insn. 
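# (Aside: the net effect substitute_into_domain() above has on a callee
# domain, sketched with bare islpy for the simple constant-argument case --
# illustrative values only:
#
#     import islpy as isl
#     callee_dom = isl.BasicSet("[n] -> { [i] : 0 <= i < n }")
#     call_site  = isl.BasicSet("[n] -> { [i] : n = 10 }")
#     inlined = (callee_dom & call_site).project_out(isl.dim_type.param, 0, 1)
#     # inlined is { [i] : 0 <= i <= 9 }
#
# A ValueArg supplied at the call site becomes a constraint on the callee's
# domain parameter, which is then eliminated so the caller never sees it.)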
+ for name in (callee_knl.all_inames() + | set(callee_knl.temporary_variables.keys())): new_name = vng(callee_label+name) - temp_map[name] = new_name - new_temps[new_name] = temp.copy(name=new_name) + name_map[name] = new_name + + # }}} + + # {{{ iname_to_tags - kernel = kernel.copy(temporary_variables=new_temps) + # new_iname_to_tags: caller's iname_to_tags post inlining + new_iname_to_tags = caller_knl.iname_to_tags + + for old_name, tags in callee_knl.iname_to_tags.items(): + new_iname_to_tags[name_map[old_name]] = tags # }}} - # {{{ match kernel arguments + # {{{ register callee's temps as caller's + + # new_temps: caller's temps post inlining + new_temps = caller_knl.temporary_variables.copy() + + for name, tv in callee_knl.temporary_variables.items(): + new_temps[name_map[name]] = tv.copy(name=name_map[name]) + + # }}} + + # {{{ get callee args -> parameters passed to the call arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) - assignees = instruction.assignees # writes - parameters = instruction.expression.parameters # reads + assignees = call_insn.assignees # writes + parameters = call_insn.expression.parameters # reads # add keyword parameters from pymbolic.primitives import CallWithKwargs from loopy.kernel.function_interface import get_kw_pos_association kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) - if isinstance(instruction.expression, CallWithKwargs): - kw_parameters = instruction.expression.kw_parameters + if isinstance(call_insn.expression, CallWithKwargs): + kw_parameters = call_insn.expression.kw_parameters else: kw_parameters = {} @@ -258,37 +309,51 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): # }}} - # {{{ rewrite instructions + # {{{ domains/assumptions - import pymbolic.primitives as p - from pymbolic.mapper.substitutor import make_subst_func + new_domains = callee_knl.domains.copy() + for old_iname in callee_knl.all_inames(): + new_domains = [rename_iname(dom, old_iname, name_map[old_iname]) + for dom in new_domains] - var_map = {p.Variable(k): p.Variable(v) - for k, v in iname_map.items()} - var_map.update({p.Variable(k): p.Variable(v) - for k, v in temp_map.items()}) - for k, v in arg_map.items(): - if isinstance(v, SubArrayRef): - var_map[p.Variable(k)] = v.subscript.aggregate - else: - var_map[p.Variable(k)] = v + new_assumptions = callee_knl.assumptions - subst_mapper = KernelInliner( - make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict) + for callee_arg_name, param_expr in arg_map.items(): + if isinstance(callee_knl.arg_dict[callee_arg_name], + ValueArg): + new_domains = [ + substitute_into_domain( + dom, + callee_arg_name, + param_expr, get_valid_domain_param_names(caller_knl)) + for dom in new_domains] - insn_id = {} - for insn in callee_knl.instructions: - insn_id[insn.id] = ing(callee_label+insn.id) + new_assumptions = substitute_into_domain( + new_assumptions, + callee_arg_name, + param_expr, get_valid_domain_param_names(caller_knl)) + + # }}} - # {{{ root and leave instructions in callee kernel + # {{{ map callee's expressions to get expressions after inlining - dep_map = callee_knl.recursive_insn_dep_map() - # roots depend on nothing - heads = {insn for insn, deps in dep_map.items() if not deps} - # leaves have nothing that depends on them - tails = set(dep_map.keys()) - for insn, deps in dep_map.items(): - tails = tails - deps + rule_mapping_context = SubstitutionRuleMappingContext( + callee_knl.substitutions, vng) + smap = KernelInliner(rule_mapping_context, + 
make_subst_func({old_name: prim.Variable(new_name) + for old_name, new_name in name_map.items()}), + caller_knl, callee_knl, arg_map) + + callee_knl = rule_mapping_context.finish_kernel(smap.map_kernel( + callee_knl)) + + # }}} + + # {{{ generate new ids for instructions + + insn_id_map = {} + for insn in callee_knl.instructions: + insn_id_map[insn.id] = ing(callee_label+insn.id) # }}} @@ -298,70 +363,74 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction): noop_start = NoOpInstruction( id=ing(callee_label+"_start"), - within_inames=instruction.within_inames, - depends_on=instruction.depends_on + within_inames=call_insn.within_inames, + depends_on=call_insn.depends_on ) noop_end = NoOpInstruction( - id=instruction.id, - within_inames=instruction.within_inames, - depends_on=frozenset(insn_id[insn] for insn in tails) + id=call_insn.id, + within_inames=call_insn.within_inames, + depends_on=frozenset(insn_id_map.values()) ) + # }}} - inner_insns = [noop_start] + # {{{ map callee's instruction ids + + inlined_insns = [noop_start] for insn in callee_knl.instructions: - insn = insn.with_transformed_expressions(subst_mapper) - within_inames = frozenset(map(iname_map.get, insn.within_inames)) - within_inames = within_inames | instruction.within_inames - depends_on = frozenset(map(insn_id.get, insn.depends_on)) | ( - instruction.depends_on) - no_sync_with = frozenset((insn_id[id], scope) + new_within_inames = (frozenset(name_map[iname] + for iname in insn.within_inames) + | call_insn.within_inames) + new_depends_on = (frozenset(insn_id_map[dep] for dep in insn.depends_on) + | {noop_start.id}) + new_no_sync_with = frozenset((insn_id_map[id], scope) for id, scope in insn.no_sync_with) - - if insn.id in heads: - depends_on = depends_on | {noop_start.id} + new_id = insn_id_map[insn.id] if isinstance(insn, Assignment): new_atomicity = tuple( - type(atomicity)(var_map[p.Variable(atomicity.var_name)].name) + type(atomicity)(name_map[atomicity.var_name]) for atomicity in insn.atomicity) insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on, - tags=insn.tags | instruction.tags, + id=insn_id_map[insn.id], + within_inames=new_within_inames, + depends_on=new_depends_on, + tags=insn.tags | call_insn.tags, atomicity=new_atomicity, - no_sync_with=no_sync_with + no_sync_with=new_no_sync_with ) else: insn = insn.copy( - id=insn_id[insn.id], - within_inames=within_inames, - # TODO: probaby need to keep priority in callee kernel - priority=instruction.priority, - depends_on=depends_on, - tags=insn.tags | instruction.tags, - no_sync_with=no_sync_with + id=new_id, + within_inames=new_within_inames, + depends_on=new_depends_on, + tags=insn.tags | call_insn.tags, + no_sync_with=new_no_sync_with ) - inner_insns.append(insn) + inlined_insns.append(insn) - inner_insns.append(noop_end) + inlined_insns.append(noop_end) - new_insns = [] - for insn in kernel.instructions: - if insn == instruction: - new_insns.extend(inner_insns) - else: - new_insns.append(insn) + # }}} - kernel = kernel.copy(instructions=new_insns) + # {{{ swap out call_insn with inlined_instructions + + idx = caller_knl.instructions.index(call_insn) + new_insns = (caller_knl.instructions[:idx] + + inlined_insns + + caller_knl.instructions[idx+1:]) # }}} - return kernel + old_assumptions, new_assumptions = isl.align_two( + caller_knl.assumptions, new_assumptions) + + return caller_knl.copy(instructions=new_insns, 
+ temporary_variables=new_temps, + domains=caller_knl.domains+new_domains, + assumptions=old_assumptions.params() & new_assumptions.params(), + iname_to_tags=new_iname_to_tags) # }}} -- GitLab From 8a36a8f34e36c36c7486df566ca73fc2131c37d4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 23 Feb 2021 09:28:58 -0600 Subject: [PATCH 739/916] store callables as a pyrsistent.PMap --- loopy/preprocess.py | 8 +++---- loopy/program.py | 42 +++++++++++++++++++++---------------- loopy/tools.py | 2 ++ loopy/transform/callable.py | 8 +++---- setup.py | 1 + 5 files changed, 35 insertions(+), 26 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 006d55ae6..2f497a98f 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1990,10 +1990,10 @@ def realize_reduction_for_single_kernel(kernel, callables_table, def realize_reduction(program, *args, **kwargs): assert isinstance(program, Program) - callables_table = program.callables_table.copy() - kernels_to_scan = [in_knl_callable.subkernel for in_knl_callable in - program.callables_table.values() if isinstance(in_knl_callable, - CallableKernel)] + callables_table = dict(program.callables_table) + kernels_to_scan = [in_knl_callable.subkernel + for in_knl_callable in program.callables_table.values() + if isinstance(in_knl_callable, CallableKernel)] for knl in kernels_to_scan: new_knl, callables_table = realize_reduction_for_single_kernel( diff --git a/loopy/program.py b/loopy/program.py index 1b45a3518..8e8a8382f 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -21,6 +21,7 @@ THE SOFTWARE. """ import re +import collections from pytools import ImmutableRecord from pymbolic.primitives import Variable @@ -39,6 +40,7 @@ from loopy.kernel import LoopKernel from loopy.tools import update_persistent_hash from pymbolic.primitives import Call, CallWithKwargs from functools import reduce +from pyrsistent import pmap, PMap __doc__ = """ @@ -138,8 +140,8 @@ class Program(ImmutableRecord): .. attribute:: callables_table - An instance of :class:`dict` mapping the function identifiers in a - kernel to their associated instances of + An instance of :class:`pyrsistent.PMap` mapping the function + identifiers in a kernel to their associated instances of :class:`loopy.kernel.function_interface.InKernelCallable`. .. 
attribute:: target @@ -166,20 +168,23 @@ class Program(ImmutableRecord): """ def __init__(self, entrypoints=frozenset(), - callables_table={}, + callables_table=pmap(), target=None, func_id_to_in_knl_callable_mappers=[]): # {{{ sanity checks - assert isinstance(callables_table, dict) + assert isinstance(callables_table, collections.abc.Mapping) assert isinstance(entrypoints, frozenset) + if not isinstance(callables_table, PMap): + callables_table = pmap(callables_table) + # }}} super().__init__( entrypoints=entrypoints, - callables_table=callables_table, + callables_table=pmap(callables_table), target=target, func_id_to_in_knl_callable_mappers=( func_id_to_in_knl_callable_mappers)) @@ -198,14 +203,15 @@ class Program(ImmutableRecord): program = super().copy(**kwargs) if target: from loopy.kernel import KernelState - if max(callable_knl.subkernel.state for callable_knl in - self.callables_table.values() if - isinstance(callable_knl, CallableKernel)) > ( + if max(callable_knl.subkernel.state + for callable_knl in self.callables_table.values() + if isinstance(callable_knl, CallableKernel)) > ( KernelState.INITIAL): if not isinstance(target, type(self.target)): - raise LoopyError("One of the kenels in the program has been " + raise LoopyError("One of the kernels in the program has been " "preprocessed, cannot modify target now.") - callables = {} + + new_callables = {} for func_id, clbl in program.callables_table.items(): if isinstance(clbl, CallableKernel): knl = clbl.subkernel @@ -215,10 +221,10 @@ class Program(ImmutableRecord): pass else: raise NotImplementedError() - callables[func_id] = clbl + new_callables[func_id] = clbl program = super().copy( - callables_table=callables, target=target) + callables_table=new_callables, target=target) return program @@ -255,14 +261,13 @@ class Program(ImmutableRecord): # update the callable kernel new_in_knl_callable = self.callables_table[kernel.name].copy( subkernel=kernel) - new_callables = self.callables_table.copy() - new_callables[kernel.name] = new_in_knl_callable + new_callables = self.callables_table.remove(kernel.name).set( + kernel.name, new_in_knl_callable) return self.copy(callables_table=new_callables) else: # add a new callable kernel clbl = CallableKernel(kernel) - new_callables = self.callables_table.copy() - new_callables[kernel.name] = clbl + new_callables = self.callables_table.set(kernel.name, clbl) return self.copy(callables_table=new_callables) def __getitem__(self, name): @@ -452,7 +457,8 @@ def make_clbl_inf_ctx(callables, entrypoints): class CallablesInferenceContext(ImmutableRecord): def __init__(self, callables, old_callable_ids, history={}): - assert isinstance(callables, dict) + assert isinstance(callables, collections.abc.Mapping) + callables = dict(callables) super().__init__( callables=callables, @@ -730,7 +736,7 @@ def resolve_callables(program): return program # get registered callables - known_callables = program.callables_table.copy() + known_callables = dict(program.callables_table) # get target specific callables known_callables.update(program.target.get_device_ast_builder().known_callables) # get loopy specific callables diff --git a/loopy/tools.py b/loopy/tools.py index e8d529d2d..6572b69e8 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -114,6 +114,8 @@ class LoopyKeyBuilder(KeyBuilderBase): else: PersistentHashWalkMapper(key_hash)(key) + update_for_PMap = update_for_dict # noqa: N815 + class PymbolicExpressionHashWrapper: def __init__(self, expression): diff --git a/loopy/transform/callable.py 
b/loopy/transform/callable.py index 76f17c02a..10f9f0b26 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -61,14 +61,14 @@ def register_callable(translation_unit, function_identifier, callable_, if (function_identifier in translation_unit.callables_table) and ( translation_unit.callables_table[function_identifier] != callable_ and redefining_not_ok): - raise LoopyError("Redifining function identifier not allowed. Set the" + raise LoopyError("Redefining function identifier not allowed. Set the" " option 'redefining_not_ok=False' to bypass this error.") - callables = translation_unit.callables_table.copy() - callables[function_identifier] = callable_ + new_callables = translation_unit.callables_table.set(function_identifier, + callable_) return translation_unit.copy( - callables_table=callables) + callables_table=new_callables) def merge(translation_units): diff --git a/setup.py b/setup.py index fcf284bc8..08a7ac62a 100644 --- a/setup.py +++ b/setup.py @@ -92,6 +92,7 @@ setup(name="loopy", "codepy>=2017.1", "colorama", "Mako", + "pyrsistent", ], extras_require={ -- GitLab From cc101058e0bbfd9e0e3c4e0523d099fed253bf8b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 23 Feb 2021 09:35:48 -0600 Subject: [PATCH 740/916] get_grid sizes are memoized again --- loopy/kernel/__init__.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 2fb25f770..f25a20e23 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1098,21 +1098,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): self.get_iname_bounds(iname, constants_only=True).size, constants_only=True))) + @memoize_method def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, callables_table, ignore_auto=False): - # FIXME: re-add the memoization? # FIXME: docs - """Return a tuple (global_size, local_size) containing a grid that - could accommodate execution of all instructions whose IDs are given - in *insn_ids*. - - :arg insn_ids: a :class:`frozenset` of instruction IDs - :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` - - *global_size* and *local_size* are instances of :class:`dict` with - mapping of the form from ``axis`` to :class:`islpy.PwAff` objects. - """ - # {{{ collecting the callee kernels in insn_ids from loopy.kernel.tools import get_direct_callee_kernels @@ -1186,9 +1175,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return global_sizes, local_sizes + @memoize_method def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, ignore_auto=False, return_dict=False): - #Fixme: Re-add the memoize wrap here? 
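# (Aside on the @memoize_method decorators this hunk restores: the grid-size
# queries are pure functions of an immutable kernel plus hashable arguments,
# so a per-instance cache is safe and answers the removed FIXME above.
# Illustrative snippet only -- the class below is made up:
#
#     from pytools import memoize_method
#
#     class Sizer:
#         @memoize_method
#         def grid_for(self, n):
#             print("recomputing")
#             return (n // 16 + 1, 16)
#
#     s = Sizer()
#     s.grid_for(100)    # prints "recomputing"
#     s.grid_for(100)    # cached; nothing recomputed
# )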
# Fixme: docs """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given @@ -1241,6 +1230,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + @memoize_method def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, callables_table, ignore_auto=False, return_dict=False): # FIXME docs -- GitLab From e9c1c1132eaa604ea549e12e10d441ea0501351c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 24 Feb 2021 14:49:23 -0600 Subject: [PATCH 741/916] ArrayArgDescriptor: acknowledge [None, auto] as valid shapes --- loopy/kernel/function_interface.py | 46 +++++++++++++++++++----------- loopy/preprocess.py | 12 +++++--- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 9eb707e81..272383211 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -86,13 +86,15 @@ class ArrayArgDescriptor(ImmutableRecord): # {{{ sanity checks from loopy.kernel.array import ArrayDimImplementationTag + from loopy.kernel.data import auto - assert isinstance(shape, tuple) - assert isinstance(dim_tags, tuple) + assert isinstance(shape, tuple) or shape in [None, auto] + assert isinstance(dim_tags, tuple) or dim_tags is None - # FIXME at least vector dim tags should be supported - assert all(isinstance(dim_tag, ArrayDimImplementationTag) for dim_tag in - dim_tags) + if dim_tags: + # FIXME at least vector dim tags should be supported + assert all(isinstance(dim_tag, ArrayDimImplementationTag) for dim_tag in + dim_tags) # }}} @@ -106,8 +108,16 @@ class ArrayArgDescriptor(ImmutableRecord): Returns an instance of :class:`ArrayArgDescriptor` with its shapes, strides, mapped by *f*. """ - new_shape = tuple(f(axis_len) for axis_len in self.shape) - new_dim_tags = tuple(dim_tag.map_expr(f) for dim_tag in self.dim_tags) + if self.shape is not None: + new_shape = tuple(f(axis_len) for axis_len in self.shape) + else: + new_shape = None + + if self.dim_tags is not None: + new_dim_tags = tuple(dim_tag.map_expr(f) for dim_tag in self.dim_tags) + else: + new_dim_tags = None + return self.copy(shape=new_shape, dim_tags=new_dim_tags) def depends_on(self): @@ -116,18 +126,22 @@ class ArrayArgDescriptor(ImmutableRecord): :class:`ArrayArgDescriptor` depends on. 
""" from loopy.kernel.data import auto - result = DependencyMapper(composite_leaves=False)([lngth for lngth in - self.shape if lngth not in [None, auto]]) | ( - frozenset().union(*(dim_tag.depends_on() for dim_tag in - self.dim_tags))) + result = set() + + if self.shape: + dep_mapper = DependencyMapper(composite_leaves=False) + for axis_len in self.shape: + if axis_len not in [None, auto]: + result |= dep_mapper(axis_len) + + if self.dim_tags: + for dim_tag in self.dim_tags: + result |= dim_tag.depends_on() + return frozenset(var.name for var in result) def update_persistent_hash(self, key_hash, key_builder): - for shape_i in self.shape: - if shape_i is None: - key_builder.rec(key_hash, shape_i) - else: - key_builder.update_for_pymbolic_expression(key_hash, shape_i) + key_builder.update_for_pymbolic_expression(key_hash, self.shape) key_builder.rec(key_hash, self.address_space) key_builder.rec(key_hash, self.dim_tags) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 2f497a98f..1586c9d20 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2254,16 +2254,20 @@ def infer_arg_descr(program): renamed_entrypoints = set() for e in program.entrypoints: - def _tuple_if_int(s): - if isinstance(s, int): + def _tuple_or_None(s): + if isinstance(s, tuple): + return s + elif s in [None, auto]: + return s + else: return s, - return s + arg_id_to_descr = {} for arg in program[e].args: if isinstance(arg, ArrayBase): if arg.shape not in (None, auto): arg_id_to_descr[arg.name] = ArrayArgDescriptor( - _tuple_if_int(arg.shape), arg.address_space, + _tuple_or_None(arg.shape), arg.address_space, arg.dim_tags) elif isinstance(arg, ValueArg): arg_id_to_descr[arg.name] = ValueArgDescriptor() -- GitLab From 68bd8860eff3dc50f45a42e8de6b1ad3826f93ac Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 27 Feb 2021 12:22:40 -0600 Subject: [PATCH 742/916] do not attempt to resolve already resolved callables --- loopy/program.py | 7 ++++++- test/test_callables.py | 25 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 8e8a8382f..c8615d4d7 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -106,7 +106,12 @@ class CallableResolver(RuleAwareIdentityMapper): # record that we resolved a call self.calls_resolved.add(name) - return Call(ResolvedFunction(expr.function), params) + function = expr.function + + if not isinstance(expr.function, ResolvedFunction): + function = ResolvedFunction(expr.function) + + return Call(function, params) return super().map_call(expr, expn_state) diff --git a/test/test_callables.py b/test/test_callables.py index 2ce571274..dd5dcb4c8 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -712,6 +712,31 @@ def test_inlining_with_callee_domain_param(ctx_factory): assert (out == 2).all() +def test_double_resolving(): + from loopy.program import resolve_callables + from loopy.kernel import KernelState + from loopy.symbolic import ResolvedFunction + + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = sin(x[i]) + """, + [ + lp.GlobalArg("x", dtype=float, shape=lp.auto), + ...], + name="foo" + ) + + knl = resolve_callables(knl) + knl = knl.with_kernel(knl["foo"].copy(state=KernelState.INITIAL)) + knl = resolve_callables(knl) + + assert "sin" in knl.callables_table + assert isinstance(knl["foo"].instructions[0].expression.function, + ResolvedFunction) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From e4693736221e8dce719b795c19d55a866f8c7811 
Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 6 Mar 2021 23:41:59 -0600 Subject: [PATCH 743/916] prepare_for_caching: handle dtypes in the callables --- loopy/preprocess.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 1586c9d20..eae8a474d 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -40,7 +40,8 @@ from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) -from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.program import Program from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pytools import ProcessLogger @@ -49,8 +50,7 @@ from functools import partial # {{{ prepare for caching -@iterate_over_kernels_if_given_program -def prepare_for_caching(kernel): +def prepare_for_caching_inner(kernel): import loopy as lp from loopy.types import OpaqueType new_args = [] @@ -81,6 +81,32 @@ def prepare_for_caching(kernel): return kernel + +def prepare_for_caching(program): + if isinstance(program, LoopKernel): + return prepare_for_caching_inner(program) + + assert isinstance(program, Program) + tgt = program.target + + new_clbls = {} + for name, clbl in program.callables_table.items(): + if clbl.arg_id_to_dtype is not None: + arg_id_to_dtype = {id: dtype.with_target(tgt) + for id, dtype in clbl.arg_id_to_dtype.items()} + clbl = clbl.copy(arg_id_to_dtype=arg_id_to_dtype) + if isinstance(clbl, ScalarCallable): + pass + elif isinstance(clbl, CallableKernel): + subknl = prepare_for_caching_inner(clbl.subkernel) + clbl = clbl.copy(subkernel=subknl) + else: + raise NotImplementedError(type(clbl)) + + new_clbls[name] = clbl + + return program.copy(callables_table=new_clbls) + # }}} -- GitLab From 606ade49e9066716c3bae7127d1ebfe94e9d49dc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 6 Mar 2021 23:43:42 -0600 Subject: [PATCH 744/916] only add non-NoneType dtypes to arg_id_to_dtype --- loopy/kernel/function_interface.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 272383211..103ac45b3 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -727,8 +727,10 @@ class CallableKernel(InKernelCallable): new_arg_id_to_dtype = {} for pos, kw in pos_to_kw.items(): - new_arg_id_to_dtype[kw] = specialized_kernel.arg_dict[kw].dtype - new_arg_id_to_dtype[pos] = specialized_kernel.arg_dict[kw].dtype + arg = specialized_kernel.arg_dict[kw] + if arg.dtype: + new_arg_id_to_dtype[kw] = arg.dtype + new_arg_id_to_dtype[pos] = arg.dtype # Return the kernel call with specialized subkernel and the corresponding # new arg_id_to_dtype -- GitLab From 472081394db92705b58911389a7f397791e19836 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 17 Mar 2021 17:59:59 -0500 Subject: [PATCH 745/916] Fix formatting and bad merge --- loopy/statistics.py | 18 +++++++----------- loopy/target/c/__init__.py | 5 +---- loopy/target/c/codegen/expression.py | 1 + loopy/target/cuda.py | 3 ++- loopy/target/pyopencl.py | 1 - 5 files changed, 11 insertions(+), 17 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index cc1c09d4a..f5ecf5b75 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -768,20 +768,16 @@ class 
MemAccess(ImmutableRecord): # }}} - if dtype is None: - Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, + if dtype is not None: + from loopy.types import to_loopy_type + dtype = to_loopy_type(dtype) + + ImmutableRecord.__init__(self, mtype=mtype, dtype=dtype, + lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, variable_tags=variable_tags, count_granularity=count_granularity, kernel_name=kernel_name) - else: - from loopy.types import to_loopy_type - Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), - lid_strides=lid_strides, gid_strides=gid_strides, - direction=direction, variable=variable, - variable_tags=variable_tags, - count_granularity=count_granularity, - kernel_name=kernel_name) @property def variable_tag(self): @@ -813,7 +809,7 @@ class MemAccess(ImmutableRecord): self.direction, self.variable, self.variable_tags, - self.count_granularity + self.count_granularity, self.kernel_name) # }}} diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 7c08bc2ec..c6d59084d 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -472,10 +472,7 @@ class CMathCallable(ScalarCallable): # {{{ (abs|max|min) -> (fabs|fmax|fmin) if name in ["abs", "min", "max"]: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - if dtype.kind == "f": - name = "f" + name + name = "f" + name # }}} diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index d7621e25b..9902e5f44 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -464,6 +464,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_power(self, expr, type_context): tgt_dtype = self.infer_type(expr) + base_dtype = self.infer_type(expr.base) exponent_dtype = self.infer_type(expr.exponent) from pymbolic.primitives import is_constant, is_zero diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index ee99f27e7..63018189e 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -136,7 +136,8 @@ class CudaCallable(ScalarCallable): num_args)) if dtype is not None and dtype.kind == "c": - raise LoopyTypeError(f"'{name}' does not support complex arguments.") + raise LoopyTypeError( + f"'{name}' does not support complex arguments.") # }}} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index bb165f8c0..da2d221d7 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -25,7 +25,6 @@ THE SOFTWARE. 
import numpy as np import pymbolic.primitives as p -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import (OpenCLTarget, OpenCLCASTBuilder, ExpressionToOpenCLCExpressionMapper) from loopy.target.python import PythonASTBuilderBase -- GitLab From 0dfec8d157cb8a45bab9eb70dcead72a8ee19c13 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 17 Mar 2021 19:04:41 -0500 Subject: [PATCH 746/916] Fix preprocess bad merge --- loopy/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index dfd5a9da4..673d4c0a9 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1083,7 +1083,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes) + expression=expression, # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way @@ -1484,7 +1484,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way # of figuring out what the dependencies of the accumulator -- GitLab From 3c77618c466fb0097bda7890966a53c60e60a88c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 17 Mar 2021 20:09:17 -0500 Subject: [PATCH 747/916] prefer derived class' callables over super classes' --- loopy/target/pyopencl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index da2d221d7..2a26130e6 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -1022,11 +1022,12 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): @property def known_callables(self): from loopy.library.random123 import get_random123_callables - callables = get_pyopencl_callables() - callables.update(get_random123_callables(self.target)) + # order matters: e.g. 
prefer our abs() over that of the # superclass - callables.update(super().known_callables) + callables = super().known_callables + callables.update(get_pyopencl_callables()) + callables.update(get_random123_callables(self.target)) return callables def preamble_generators(self): -- GitLab From fd179f4a2bf6589072a503191a3182fe65f97ed9 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Wed, 17 Mar 2021 23:39:12 -0500 Subject: [PATCH 748/916] Fix bad merge in cmathcallable to fix complex support --- loopy/target/c/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index c6d59084d..bc6f91088 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -498,8 +498,6 @@ class CMathCallable(ScalarCallable): if dtype.kind in ("u", "i"): # ints and unsigned casted to float32 dtype = np.float32 - elif dtype.kind == "c": - raise LoopyTypeError(f"{name} does not support type {dtype}") # for CUDA, C Targets the name must be modified if real_dtype == np.float64: -- GitLab From 55e3c84bce6a3b63d8d8e1fd59c2dee34aeb605a Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 18 Mar 2021 00:00:10 -0500 Subject: [PATCH 749/916] Fix pylint errors --- loopy/auto_test.py | 6 ------ loopy/compiled.py | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 91ef62d78..4f7dfbed5 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -677,12 +677,6 @@ def auto_test_vs_ref( rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) if not quiet: - def format_float_or_none(v): - if v is None: - return "" - else: - return "%g" % v - print("elapsed: %s s event, %s s marker-event %s s wall " "(%d rounds)%s" % ( format_float_or_none(elapsed_event), diff --git a/loopy/compiled.py b/loopy/compiled.py index f9313c6c9..0fa18eacb 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -31,11 +31,11 @@ class CompiledKernel(PyOpenCLKernelExecutor): """ .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, kernel, entrypoint): from warnings import warn warn("CompiledKernel is deprecated. 
Use LoopKernel.__call__ directly.", DeprecationWarning, stacklevel=2) - super().__init__(context, kernel) + super().__init__(context, kernel, entrypoint) # }}} -- GitLab From 084416dc88a9c3e1ae7425c144ba27b6e2dabee6 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 18 Mar 2021 01:49:05 -0500 Subject: [PATCH 750/916] Fix typo in parse_fortran --- loopy/frontend/fortran/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 71fa5b972..4ad7cd21c 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -338,7 +338,7 @@ def parse_fortran(source, filename="", free_form=None, strict=None, from loopy.transform.callable import merge prog = merge(kernels) all_kernels = [clbl.subkernel - for clbl in prog.callables_table.items()] + for clbl in prog.callables_table.values()] for knl in all_kernels: prog.with_kernel(_add_assignees_to_calls(knl, all_kernels)) -- GitLab From 9d4f50f2317136af012d537862d9365bafa385cf Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 18 Mar 2021 13:46:07 -0500 Subject: [PATCH 751/916] Fix caching of ArrayArgDescriptor --- loopy/tools.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/loopy/tools.py b/loopy/tools.py index 234a8d6f4..6356b9765 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -64,6 +64,14 @@ class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): self.rec(expr.expr, *args) + def map_foreign(self, expr, *args, **kwargs): + """Mapper method dispatch for non-:mod:`pymbolic` objects.""" + if expr is None: + self.key_hash.update(b"") + else: + PersistentHashWalkMapperBase.map_foreign(self, expr, *args, **kwargs) + + class LoopyKeyBuilder(KeyBuilderBase): """A custom :class:`pytools.persistent_dict.KeyBuilder` subclass for objects within :mod:`loopy`. 
-- GitLab From a1189e2ed7b7e62e4456029fe1c7db7242230205 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 19 Mar 2021 12:39:47 -0500 Subject: [PATCH 752/916] Fix exit early in split_iname --- loopy/transform/iname.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 9acc8e952..1bebd15ae 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -234,15 +234,9 @@ def _split_iname_backend(kernel, iname_to_split, # {{{ return the same kernel if no kernel matches - def _do_not_transform_if_no_within_matches(): - for insn in kernel.instructions: - if within(kernel, insn): - return - + if not any(within(kernel, insn) for insn in kernel.instructions): return kernel - _do_not_transform_if_no_within_matches() - # }}} existing_tags = kernel.iname_tags(iname_to_split) -- GitLab From 1df5e06cfd82b537367f51f786f6e92e5aaf02a1 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 19 Mar 2021 13:16:44 -0500 Subject: [PATCH 753/916] Fix exit early in join_Inames too --- loopy/tools.py | 1 - loopy/transform/iname.py | 8 +------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/loopy/tools.py b/loopy/tools.py index 6356b9765..644082ed6 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -63,7 +63,6 @@ class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): self.key_hash.update(type(expr.operation).__name__.encode("utf-8")) self.rec(expr.expr, *args) - def map_foreign(self, expr, *args, **kwargs): """Mapper method dispatch for non-:mod:`pymbolic` objects.""" if expr is None: diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 1bebd15ae..984268ca1 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -526,15 +526,9 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): # {{{ return the same kernel if no kernel matches - def _do_not_transform_if_no_within_matches(): - for insn in kernel.instructions: - if within(kernel, insn): - return - + if not any(within(kernel, insn) for insn in kernel.instructions): return kernel - _do_not_transform_if_no_within_matches() - # }}} # now fastest varying first -- GitLab From cc523c5dcca623812cf52f641d29ec297b6cffc6 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Mon, 29 Mar 2021 13:50:47 -0500 Subject: [PATCH 754/916] support programs for add_inames_for_unused_hw_axes --- loopy/transform/iname.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 984268ca1..292186eab 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1799,6 +1799,7 @@ def add_inames_to_insn(kernel, inames, insn_match): # }}} +@iterate_over_kernels_if_given_program def add_inames_for_unused_hw_axes(kernel, within=None): """ Returns a kernel with inames added to each instruction -- GitLab From 7c270a3fb58d827404d906349a106deae3e6ce39 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 30 Mar 2021 14:01:14 -0500 Subject: [PATCH 755/916] removes unnecessary subkernel copies --- loopy/kernel/function_interface.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 103ac45b3..dd713cf83 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -685,9 +685,7 @@ class CallableKernel(InKernelCallable): arg_id_to_dtype=arg_id_to_dtype, arg_id_to_descr=arg_id_to_descr) - self.subkernel = subkernel.copy( - 
args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target)) - if arg.dtype is not None else arg for arg in subkernel.args]) + self.subkernel = subkernel def __getinitargs__(self): return (self.subkernel, self.arg_id_to_dtype, -- GitLab From 21a999d4c14cba0f4e49e266404a09bf5ccac0c2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 1 Apr 2021 19:58:05 -0500 Subject: [PATCH 756/916] handle arg.shape in [lp.auto, None] for slicing mapper --- loopy/kernel/creation.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index dd42c2222..3f761e552 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1974,7 +1974,7 @@ class SliceToInameReplacer(IdentityMapper): if self.knl.temporary_variables[arg.name].shape in [ auto, None]: # do not convert arrays with unknown shapes to slices. - # (If an array of unknown shape was passed in error, with be + # (If an array of unknown shape was passed in error, will be # caught and raised during preprocessing). array_arg_shape = () else: @@ -1984,7 +1984,16 @@ class SliceToInameReplacer(IdentityMapper): if isinstance(self.knl.arg_dict[arg.name], ValueArg): array_arg_shape = () else: - array_arg_shape = self.knl.arg_dict[arg.name].shape + + if self.knl.arg_dict[arg.name].shape in [ + auto, None]: + # do not convert arrays with unknown shapes to slices. + # (If an array of unknown shape was passed in error, will + # be caught and raised during preprocessing). + array_arg_shape = () + else: + array_arg_shape = ( + self.knl.arg_dict[arg.name].shape) else: assert arg.name in self.knl.all_inames() array_arg_shape = () -- GitLab From c994ffb3943298e6f0684f45b3800a1fe7d1531f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 1 Apr 2021 20:02:56 -0500 Subject: [PATCH 757/916] corrects the mistake of allowing user to pass ndim=1 sub-arrays for ndim=0 callee array args --- loopy/kernel/function_interface.py | 3 --- loopy/target/c/codegen/expression.py | 1 - test/test_callables.py | 17 +++++++++-------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index dd713cf83..b74df73e7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -237,9 +237,6 @@ def get_arg_descriptor_for_expression(kernel, expr): kernel.get_iname_bounds(iname.name).upper_bound_pw_aff - kernel.get_iname_bounds(iname.name).lower_bound_pw_aff)+1 for iname in expr.swept_inames) - if expr.swept_inames == (): - sub_shape = (1, ) - sub_dim_tags = (DimTag(1),) return ArrayArgDescriptor( address_space=aspace, diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index a46f123bb..f54c46b84 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -414,7 +414,6 @@ class ExpressionToCExpressionMapper(IdentityMapper): if iinfo.max > (2**31-1): suffix += "l" return Literal(repr(expr)+suffix) - else: raise LoopyError("do not know how to generate code for " "constant of numpy type '%s'" % type(expr).__name__) diff --git a/test/test_callables.py b/test/test_callables.py index dd5dcb4c8..81ccb145e 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -302,9 +302,9 @@ def test_multi_arg_array_call(ctx_factory): queue = cl.CommandQueue(ctx) import pymbolic.primitives as p n = 10 - acc_i = p.Variable("acc_i")[0] + acc_i = p.Variable("acc_i") i = p.Variable("i") - index = p.Variable("index")[0] + index 
= p.Variable("index") a_i = p.Subscript(p.Variable("a"), p.Variable("i")) argmin_kernel = lp.make_function( "{[i]: 0 <= i < n}", @@ -321,7 +321,8 @@ def test_multi_arg_array_call(ctx_factory): depends_on="init1,init2")], [ lp.GlobalArg("a"), - lp.GlobalArg("acc_i, index", is_input=False, is_output=True), + lp.GlobalArg("acc_i, index", is_input=False, is_output=True, + shape=lp.auto), ...], name="custom_argmin") @@ -330,7 +331,7 @@ def test_multi_arg_array_call(ctx_factory): knl = lp.make_kernel( "{[i]:0<=i Date: Mon, 5 Apr 2021 12:26:42 -0500 Subject: [PATCH 758/916] adds a failing test --- test/test_callables.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/test_callables.py b/test/test_callables.py index 81ccb145e..b7e2365a6 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -738,6 +738,31 @@ def test_double_resolving(): ResolvedFunction) +@pytest.mark.parametrize("inline", [False, True]) +def test_passing_and_getting_scalar_in_clbl_knl(ctx_factory, inline): + ctx = cl.create_some_context() + cq = cl.CommandQueue(ctx) + + call_sin = lp.make_function( + "{:}", + """ + y = sin(x) + """, name="call_sin") + + knl = lp.make_kernel( + "{:}", + """ + []: real_y[()] = call_sin(real_x) + """) + + knl = lp.merge([knl, call_sin]) + knl = lp.set_options(knl, "write_cl") + if inline: + knl = lp.inline_callable_kernel(knl, "call_sin") + + evt, (out,) = knl(cq, real_x=np.asarray(3.0, dtype=float)) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 9c69d06ad3fd54c499e54b4e53131d0e42e22d89 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 5 Apr 2021 14:50:03 -0500 Subject: [PATCH 759/916] [inlining] handle value args correctly --- loopy/transform/callable.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 10f9f0b26..a5c4c5284 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -163,6 +163,23 @@ class KernelInliner(RuleAwareSubstitutionMapper): assert expr.aggregate.name in self.callee_knl.temporary_variables return super().map_subscript(expr, expn_state) + def map_variable(self, expr, expn_state): + from loopy.kernel.data import ArrayArg, ValueArg + from loopy.symbolic import SubArrayRef + if expr.name in self.callee_knl.arg_dict: + arg = self.callee_knl.arg_dict[expr.name] + par = self.callee_arg_to_call_param[expr.name] + if isinstance(arg, ArrayArg): + assert arg.shape == () + assert isinstance(par, SubArrayRef) and par.swept_inames == () + return par.subscript.aggregate + else: + assert isinstance(arg, ValueArg) + return par + + else: + return super().map_variable(expr, expn_state) + # }}} -- GitLab From cdaab7c770ea5f71a4f440edd3b55c82d25b9267 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 17 Apr 2021 19:17:15 -0500 Subject: [PATCH 760/916] obj_get_var_dict: handle BasicSets --- loopy/isl_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index a0ce79cc0..8ed4d3d43 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -618,7 +618,7 @@ def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): def set_dim_name(obj, dt, pos, name): assert isinstance(name, str) - if isinstance(obj, isl.PwQPolynomial): + if isinstance(obj, (isl.PwQPolynomial, isl.BasicSet)): return obj.set_dim_name(dt, pos, name) elif isinstance(obj, isl.PwAff): # work around missing isl_pw_aff_set_dim_name for now. 
-- GitLab From e573c9f9f89f12a3c7198e5e514e81f7c6e29aba Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Apr 2021 11:01:19 -0500 Subject: [PATCH 761/916] run nb-clean on ipython-integration-demo.ipynb --- .../fortran/ipython-integration-demo.ipynb | 90 ++++--------------- 1 file changed, 16 insertions(+), 74 deletions(-) diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index 8fe25780b..d9ac1f1b2 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -25,18 +25,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/kaushikggg/pack/loopy_kc_env/src/loopy/loopy/frontend/fortran/translator.py:807: LoopyWarning: 'lang_version' was not passed to make_function(). To avoid this warning, pass lang_version=(2018, 2) in this invocation. (Or say 'from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2' in the global scope of the calling frame.)\n", - " seq_dependencies=seq_dependencies,\n" - ] - } - ], + "outputs": [], "source": [ "%%fortran_kernel\n", "\n", @@ -54,35 +45,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---------------------------------------------------------------------------\n", - "KERNEL: fill\n", - "---------------------------------------------------------------------------\n", - "ARGUMENTS:\n", - "a: ValueArg, type: np:dtype('float64')\n", - "n: ValueArg, type: np:dtype('int32')\n", - "out: type: np:dtype('float64'), shape: (n), dim_tags: (N0:stride:1) aspace: global\n", - "---------------------------------------------------------------------------\n", - "DOMAINS:\n", - "[n] -> { [i] : 0 <= i < n }\n", - "---------------------------------------------------------------------------\n", - "INAME IMPLEMENTATION TAGS:\n", - "i: None\n", - "---------------------------------------------------------------------------\n", - "INSTRUCTIONS:\n", - "for i\n", - " \u001b[36mout[i]\u001b[0m = \u001b[35ma\u001b[0m {id=\u001b[32minsn0\u001b[0m}\n", - "end i\n", - "---------------------------------------------------------------------------\n" - ] - } - ], + "outputs": [], "source": [ "print(prog)" ] @@ -96,8 +61,10 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "execution_count": null, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "split_amount = 128" @@ -105,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -134,36 +101,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---------------------------------------------------------------------------\n", - "KERNEL: tr_fill\n", - "---------------------------------------------------------------------------\n", - "ARGUMENTS:\n", - "a: ValueArg, type: np:dtype('float64')\n", - "n: ValueArg, type: np:dtype('int32')\n", - "out: type: np:dtype('float64'), shape: (n), dim_tags: (N0:stride:1) aspace: global\n", - 
"---------------------------------------------------------------------------\n", - "DOMAINS:\n", - "[n] -> { [i_outer, i_inner] : i_inner >= 0 and -128i_outer <= i_inner <= 127 and i_inner < n - 128i_outer }\n", - "---------------------------------------------------------------------------\n", - "INAME IMPLEMENTATION TAGS:\n", - "i_inner: l.0\n", - "i_outer: g.0\n", - "---------------------------------------------------------------------------\n", - "INSTRUCTIONS:\n", - "for i_inner, i_outer\n", - " \u001b[36mout[i_inner + i_outer*128]\u001b[0m = \u001b[35ma\u001b[0m {id=\u001b[32minsn0\u001b[0m}\n", - "end i_inner, i_outer\n", - "---------------------------------------------------------------------------\n" - ] - } - ], + "outputs": [], "source": [ "print(prog)" ] @@ -171,7 +111,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [] } @@ -192,7 +134,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.4" } }, "nbformat": 4, -- GitLab From c629041cc16d82f226a4134d2ceffa737ebd96bc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Apr 2021 15:05:27 -0500 Subject: [PATCH 762/916] call-blas: code cleanup --- examples/python/call-external.py | 125 +++++++++++++++---------------- 1 file changed, 60 insertions(+), 65 deletions(-) diff --git a/examples/python/call-external.py b/examples/python/call-external.py index 01eccb352..49b25d6e0 100644 --- a/examples/python/call-external.py +++ b/examples/python/call-external.py @@ -2,81 +2,78 @@ import loopy as lp import numpy as np from loopy.diagnostic import LoopyError from loopy.target.c import CTarget +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 # {{{ blas callable -class BLASCallable(lp.ScalarCallable): - def with_types(self, arg_id_to_dtype, kernel, callables_table): - for i in range(0, 2): - if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: - # the types provided aren't mature enough to specialize the - # callable - return ( - self.copy(arg_id_to_dtype=arg_id_to_dtype), - callables_table) +class CBLASGEMV(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables_table): + mat_dtype = arg_id_to_dtype.get(0) + vec_dtype = arg_id_to_dtype.get(1) - mat_dtype = arg_id_to_dtype[0].numpy_dtype - vec_dtype = arg_id_to_dtype[1].numpy_dtype + if mat_dtype is None or vec_dtype is None: + # types aren't specialized enough to be resolved + return self, callables_table if mat_dtype != vec_dtype: - raise LoopyError("DGEMV should have same dtype for matrix and " - "vector") + raise LoopyError("GEMV requires same dtypes for matrix and " + "vector") - if vec_dtype == np.float32: + if vec_dtype.numpy_dtype == np.float32: name_in_target = "cblas_sgemv" - elif vec_dtype == np.float64: + elif vec_dtype. 
numpy_dtype == np.float64: name_in_target = "cblas_dgemv" else: - raise LoopyError("GEMV only supported for float32 and float64 " - "types") - - from loopy.types import NumpyType - return self.copy(name_in_target=name_in_target, - arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype), - -1: NumpyType(vec_dtype)}), callables_table + raise LoopyError("GEMV is only supported for float32 and float64 " + "types") + + return (self.copy(name_in_target=name_in_target, + arg_id_to_dtype={0: vec_dtype, + 1: vec_dtype, + -1: vec_dtype}), + callables_table) + + def with_descrs(self, arg_id_to_descr, callables_table): + mat_descr = arg_id_to_descr.get(0) + vec_descr = arg_id_to_descr.get(1) + res_descr = arg_id_to_descr.get(-1) + + if mat_descr is None or vec_descr is None or res_descr is None: + # shapes aren't specialized enough to be resolved + return self, callables_table + + assert mat_descr.shape[1] == vec_descr.shape[0] + assert mat_descr.shape[0] == res_descr.shape[0] + assert len(vec_descr.shape) == len(res_descr.shape) == 1 + # handling only the easy case when stride == 1 + assert vec_descr.dim_tags[0].stride == 1 + assert mat_descr.dim_tags[1].stride == 1 + assert res_descr.dim_tags[0].stride == 1 + + return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table def emit_call_insn(self, insn, target, expression_to_code_mapper): - assert self.is_ready_for_codegen() - - from loopy.kernel.instruction import CallInstruction - - assert isinstance(insn, CallInstruction) - - parameters = insn.expression.parameters - - parameters = list(parameters) - par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] - - parameters.append(insn.assignees[0]) - par_dtypes.append(self.arg_id_to_dtype[-1]) - - # no type casting in array calls. 
- from loopy.expression import dtype_to_type_context - from pymbolic.mapper.stringifier import PREC_NONE - from loopy.symbolic import SubArrayRef from pymbolic import var - mat_descr = self.arg_id_to_descr[0] - - c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr if isinstance(par, SubArrayRef) else - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr - for par, par_dtype in zip( - parameters, par_dtypes)] - c_parameters.insert(0, var("CblasRowMajor")) - c_parameters.insert(1, var("CblasNoTrans")) - c_parameters.insert(2, mat_descr.shape[0]) - c_parameters.insert(3, mat_descr.shape[1]) - c_parameters.insert(4, 1) - c_parameters.insert(6, 1) - c_parameters.insert(8, 1) - c_parameters.insert(10, 1) - return var(self.name_in_target)(*c_parameters), False + m, n = mat_descr.shape + ecm = expression_to_code_mapper + mat, vec = insn.expression.parameters + result, = insn.assignees + + c_parameters = [var("CblasRowMajor"), + var("CblasNoTrans"), + m, n, + 1, + ecm(mat).expr, + 1, + ecm(vec).expr, + 1, + ecm(result).expr, + 1] + return (var(self.name_in_target)(*c_parameters), + False # cblas_gemv does not return anything + ) def generate_preambles(self, target): assert isinstance(target, CTarget) @@ -89,16 +86,14 @@ class BLASCallable(lp.ScalarCallable): n = 10 knl = lp.make_kernel( - "{[i]: 0<=i<10}", + "{:}", """ y[:] = gemv(A[:, :], x[:]) """, [ lp.GlobalArg("A", dtype=np.float64, shape=(n, n)), lp.GlobalArg("x", dtype=np.float64, shape=(n, )), lp.GlobalArg("y", shape=(n, )), ...], - target=CTarget(), - lang_version=(2018, 2)) - -knl = lp.register_callable(knl, "gemv", BLASCallable(name="gemv")) + target=CTarget()) +knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv")) print(lp.generate_code_v2(knl).device_code()) -- GitLab From b40a7066d39f575acdd7dec34a4d17d768c87b02 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Apr 2021 16:10:14 -0500 Subject: [PATCH 763/916] [cleanup] edits to reduce diff vs main --- examples/python/global_barrier_removal.py | 2 +- examples/python/hello-loopy.py | 3 +- loopy/codegen/control.py | 2 +- loopy/codegen/result.py | 7 ++- loopy/kernel/__init__.py | 63 ++--------------------- 5 files changed, 10 insertions(+), 67 deletions(-) diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py index e09c0d2cb..d97fc3fa6 100644 --- a/examples/python/global_barrier_removal.py +++ b/examples/python/global_barrier_removal.py @@ -23,7 +23,7 @@ knl = preprocess_kernel(knl) from loopy.schedule import get_one_scheduled_kernel knl = knl.with_kernel(get_one_scheduled_kernel(knl["loopy_kernel"], - knl.callables_table)) + knl.callables_table)) # map schedule onto host or device diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index ad0028d19..3458a6e0e 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -16,8 +16,7 @@ a = cl.array.arange(queue, n, dtype=np.float32) # ------ knl = lp.make_kernel( "{ [i]: 0<=i { : 1 = 1}") - elif isinstance(assumptions, str): assumptions_set_str = "[%s] -> { : %s}" \ % (",".join(s for s in self.outer_params(domains)), @@ -355,7 +347,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): assumptions = isl.BasicSet.read_from_str(domains[0].get_ctx(), assumptions_set_str) - # assert assumptions.is_params() + assert assumptions.is_params() # }}} @@ -412,7 +404,6 @@ class 
LoopKernel(ImmutableRecordWithoutPickling): substitutions=substitutions, cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, - function_manglers=function_manglers, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, @@ -426,51 +417,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling/scoping - - def mangle_function(self, identifier, arg_dtypes, ast_builder=None): - if ast_builder is None: - ast_builder = self.target.get_device_ast_builder() - - manglers = ast_builder.function_manglers() + self.function_manglers - - for mangler in manglers: - mangle_result = mangler(self, identifier, arg_dtypes) - if mangle_result is not None: - from loopy.kernel.data import CallMangleInfo - if isinstance(mangle_result, CallMangleInfo): - assert len(mangle_result.arg_dtypes) == len(arg_dtypes) - return mangle_result - - assert isinstance(mangle_result, tuple) - - from warnings import warn - warn("'%s' returned a tuple instead of a CallMangleInfo instance. " - "This is deprecated." % mangler.__name__, - DeprecationWarning) - - if len(mangle_result) == 2: - result_dtype, target_name = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=None) - - elif len(mangle_result) == 3: - result_dtype, target_name, actual_arg_dtypes = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=actual_arg_dtypes) - - else: - raise ValueError("unexpected size of tuple returned by '%s'" - % mangler.__name__) - - return None - - # }}} - # {{{ symbol mangling def mangle_symbol(self, ast_builder, identifier): @@ -1617,7 +1563,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): # resolve hash conflicts. 
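The mangler hooks being removed here are superseded by the callables interface used throughout this series: a ``ScalarCallable`` subclass specializes itself in ``with_types`` and is attached with ``register_callable``. A minimal sketch in the spirit of ``library_for_test.py`` — ``my_square`` and its registration are hypothetical, and actually generating code would additionally require teaching the target about ``my_square`` (e.g. via ``generate_preambles``):

    import loopy as lp

    class MySquare(lp.ScalarCallable):
        def with_types(self, arg_id_to_dtype, callables):
            dtype = arg_id_to_dtype.get(0)
            if dtype is None:
                # argument types not yet known: nothing to specialize
                return self, callables
            # specialize: same dtype for the argument (0) and the result (-1)
            return (self.copy(name_in_target="my_square",
                              arg_id_to_dtype={0: dtype, -1: dtype}),
                    callables)

    knl = lp.make_kernel("{[i]: 0<=i<10}", "y[i] = my_square(x[i])")
    knl = lp.register_callable(knl, "my_square", MySquare("my_square"))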
"preamble_generators", - "function_manglers", "symbol_manglers", ) -- GitLab From 87541fa4a1c9a7a711826e46df546db116a5f154 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Apr 2021 18:17:13 -0500 Subject: [PATCH 764/916] codegen/result: cleanup --- loopy/codegen/result.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 8ddeb1d8a..358088922 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -106,7 +106,7 @@ class CodeGenerationResult(ImmutableRecord): """ @staticmethod - def new(codegen_state, insn_id, ast, implemented_domain, entrypoint=None): + def new(codegen_state, insn_id, ast, implemented_domain): prg = GeneratedProgram( name=codegen_state.gen_program_name, is_device_program=codegen_state.is_generating_device_code, @@ -134,8 +134,8 @@ class CodeGenerationResult(ImmutableRecord): return ( "".join(preamble_codes) + "\n" - + "\n\n".join(str(hp.ast) for hp in - self.host_programs.values())) + + "\n\n".join(str(hp.ast) + for hp in self.host_programs.values())) def device_code(self): preamble_codes = process_preambles(getattr(self, "device_preambles", [])) @@ -149,7 +149,7 @@ class CodeGenerationResult(ImmutableRecord): preamble_codes = process_preambles( getattr(self, "host_preambles", []) + - list(getattr(self, "device_preambles", [])) + getattr(self, "device_preambles", []) ) return ( @@ -202,7 +202,6 @@ class CodeGenerationResult(ImmutableRecord): host_programs[e] = program else: host_programs[codegen_state.kernel.name] = program - pass return self.copy( host_programs=host_programs) -- GitLab From 5420b843c5da5a0f5154c7e52e3408eeab9eb6d1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Apr 2021 18:40:52 -0500 Subject: [PATCH 765/916] corrects the grid sizes calculation --- loopy/codegen/__init__.py | 2 -- loopy/kernel/__init__.py | 67 +++++++++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index bd4d74c51..3c02a724b 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -467,8 +467,6 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, :returns: a :class:`CodeGenerationResult` :param kernel: An instance of :class:`loopy.LoopKernel`. - :param callables_table: An instance of - :class:`loopy.CallablesTable`. """ from loopy.kernel import KernelState diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 9a9740f06..47c86e023 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -618,7 +618,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): for dom in self.domains: return dom.get_ctx() - return isl.DEFAULT_CONTEXT + assert False @memoize_method def combine_domains(self, domains): @@ -1047,15 +1047,12 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, callables_table, ignore_auto=False): - # FIXME: docs - # {{{ collecting the callee kernels in insn_ids - - from loopy.kernel.tools import get_direct_callee_kernels - callee_kernels = get_direct_callee_kernels(self, - callables_table, insn_ids) - - # }}} - + """ + Returns a tuple of ``(global_sizes, local_sizes)``, where + ``global_sizes``, ``local_sizes`` are the grid sizes that could + accommodate all of *insn_ids*. The grid sizes as a dict from the axis + index to the corresponding grid size. 
+ """ all_inames_by_insns = set() for insn_id in insn_ids: all_inames_by_insns |= self.insn_inames(insn_id) @@ -1066,18 +1063,46 @@ class LoopKernel(ImmutableRecordWithoutPickling): % (", ".join(sorted(all_inames_by_insns)), ", ".join(sorted(self.all_inames())))) + # {{{ include grid constraints due to callees + global_sizes = {} local_sizes = {} - # updating the grid sizes from the callee_kernels. - for callee_kernel in callee_kernels: - gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts( - frozenset(insn.id for insn in callee_kernel.instructions), - callables_table, ignore_auto) + from loopy.kernel.data import ValueArg + from loopy.kernel.instruction import CallInstruction + from loopy.kernel.function_interface import (CallableKernel, + get_kw_pos_association) + from loopy.isl_helpers import subst_into_pwaff + + for insn in self.instructions: + if isinstance(insn, CallInstruction): + clbl = callables_table[insn.expression.function.name] + if isinstance(clbl, CallableKernel): + _, pos_to_kw = get_kw_pos_association(clbl.subkernel) + subst_dict = { + pos_to_kw[i]: param + for i, param in enumerate(insn.expression.parameters) + if isinstance(clbl.subkernel.arg_dict[pos_to_kw[i]], + ValueArg)} + + gsize, lsize = ( + clbl.subkernel.get_grid_sizes_for_insn_ids_as_dicts( + frozenset(insn.id + for insn in clbl.subkernel.instructions), + callables_table, ignore_auto)) + + for tgt_dict, tgt_size in [(global_sizes, gsize), + (local_sizes, lsize)]: + + for iaxis, size in tgt_size.items(): + size = subst_into_pwaff(self.assumptions.space, + size, subst_dict) + if iaxis in tgt_dict: + tgt_dict[iaxis] = tgt_dict[iaxis].max(size) + else: + tgt_dict[iaxis] = size - # FIXME: Should assert that nothing is being overwritten - global_sizes.update(gsize) - local_sizes.update(lsize) + # }}} from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, @@ -1125,13 +1150,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, ignore_auto=False, return_dict=False): - # Fixme: docs """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs - :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` *global_size* and *local_size* are :class:`islpy.PwAff` objects. """ @@ -1180,13 +1203,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): @memoize_method def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, callables_table, ignore_auto=False, return_dict=False): - # FIXME docs """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. :arg insn_ids: a :class:`frozenset` of instruction IDs - :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` *global_size* and *local_size* are :mod:`pymbolic` expressions """ @@ -1224,8 +1245,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. 
- :arg callables_table: an instance of :class:`loopy.Program.CallablesTable` - *global_size* and *local_size* are :mod:`pymbolic` expressions """ return self.get_grid_sizes_for_insn_ids_as_exprs( -- GitLab From 9c107dfb5281208739f6c451d3ea32885d5ad71a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Apr 2021 22:04:12 -0500 Subject: [PATCH 766/916] port function mangling to new callable interface --- test/library_for_test.py | 41 ++++++++++++++++++++++------------------ test/test_loopy.py | 5 ++--- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/test/library_for_test.py b/test/library_for_test.py index 2cb4067e0..a279e34cd 100644 --- a/test/library_for_test.py +++ b/test/library_for_test.py @@ -1,23 +1,28 @@ -# This exists because function handles can't be pickled. +import loopy as lp -def no_ret_f_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class NoRetFunction(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables): + if len(arg_id_to_dtype) != 0: + raise RuntimeError("'f' cannot take any inputs.") - if (name == "f" and len(arg_dtypes) == 0): - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="f", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + return (self.copy(arg_id_to_dtype=arg_id_to_dtype, + name_in_target="f"), + callables) + def with_descrs(self, arg_id_to_descr, callables): + if len(arg_id_to_descr) != 0: + raise RuntimeError("'f' cannot take any inputs.") -def no_ret_f_preamble_gen(preamble_info): - yield ("10_define_f", - r""" - void f() - { - printf("Hi!\n"); - } - """) + return (self.copy(arg_id_to_descr=arg_id_to_descr), + callables) + + def generate_preambles(self, target): + assert isinstance(target, lp.CFamilyTarget) + yield ("10_define_f", + r""" + void f() + { + printf("Hi!\n"); + } + """) diff --git a/test/test_loopy.py b/test/test_loopy.py index 8859c754e..3108ec5d2 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1547,9 +1547,8 @@ def test_call_with_no_returned_value(ctx_factory): [lp.CallInstruction((), p.Call(p.Variable("f"), ()))] ) - from library_for_test import no_ret_f_mangler, no_ret_f_preamble_gen - knl = lp.register_function_manglers(knl, [no_ret_f_mangler]) - knl = lp.register_preamble_generators(knl, [no_ret_f_preamble_gen]) + from library_for_test import NoRetFunction + knl = lp.register_callable(knl, "f", NoRetFunction("f")) evt, _ = knl(queue) -- GitLab From 25dada29fccccea1c0fc5aa9bedfcae2ea6b341a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Apr 2021 22:04:52 -0500 Subject: [PATCH 767/916] allow rollback for parsing lists if input is not a SAR --- loopy/symbolic.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 1e78e6e56..170165d36 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1569,6 +1569,7 @@ class LoopyParser(ParserBase): self.parse_expression(pstate, _PREC_UNARY)) elif pstate.is_next(_openbracket): + rollback_pstate = pstate.copy() pstate.advance() pstate.expect_not_end() if pstate.is_next(_closebracket): @@ -1578,11 +1579,14 @@ class LoopyParser(ParserBase): pstate.expect(_closebracket) pstate.advance() - pstate.expect(_colon) - pstate.advance() - subscript = self.parse_expression(pstate, _PREC_UNARY) - return SubArrayRef(swept_inames, subscript) - + if pstate.is_next(_colon): + # pstate.expect(_colon): + pstate.advance() + subscript = self.parse_expression(pstate, _PREC_UNARY) + return 
SubArrayRef(swept_inames, subscript) + else: + pstate = rollback_pstate + return super().parse_prefix(rollback_pstate) else: return super().parse_prefix(pstate) -- GitLab From f6f1c38897347998ef30fea70fa67dc87581f0da Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 18 Apr 2021 22:06:01 -0500 Subject: [PATCH 768/916] [cleanup]: docs, remove function mangling bits --- loopy/kernel/__init__.py | 9 +++------ loopy/transform/fusion.py | 3 --- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 47c86e023..97be5d982 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -303,8 +303,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): if cache_manager is None: from loopy.kernel.tools import SetOperationCacheManager cache_manager = SetOperationCacheManager() - if index_dtype is None: - index_dtype = np.int32 if iname_to_tags is not None: warn("Providing iname_to_tags is deprecated, pass inames instead. " @@ -1048,10 +1046,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, callables_table, ignore_auto=False): """ - Returns a tuple of ``(global_sizes, local_sizes)``, where - ``global_sizes``, ``local_sizes`` are the grid sizes that could - accommodate all of *insn_ids*. The grid sizes as a dict from the axis - index to the corresponding grid size. + Returns a tuple of (global_sizes, local_sizes), where global_sizes, + local_sizes are the grid sizes accommodating all of *insn_ids*. The grid + sizes are a dict from the axis index to the corresponding grid size. """ all_inames_by_insns = set() for insn_id in insn_ids: diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index dbbb8022f..0880c22ae 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -256,9 +256,6 @@ def _fuse_two_kernels(kernela, kernelb): "substitution", kernela.substitutions, kernelb.substitutions), - function_manglers=_ordered_merge_lists( - kernela.function_manglers, - kernelb.function_manglers), symbol_manglers=_ordered_merge_lists( kernela.symbol_manglers, kernelb.symbol_manglers), -- GitLab From f9814c94da95084847b79b5dac1d956ce5d14fcf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 03:33:36 -0500 Subject: [PATCH 769/916] iterate remove instructions over all the callees --- loopy/transform/instruction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 213548c59..a48e8eda7 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -145,6 +145,7 @@ def add_dependency(kernel, insn_match, depends_on): # {{{ remove_instructions +@iterate_over_kernels_if_given_program def remove_instructions(kernel, insn_ids): """Return a new kernel with instructions in *insn_ids* removed. 
-- GitLab From dacfc187d30ae6dbca70e031fa01d14939829967 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 03:37:29 -0500 Subject: [PATCH 770/916] guard accessing callables table for only ResolvedFunction --- loopy/kernel/__init__.py | 9 +++++++-- loopy/program.py | 22 ++++++++++++++++------ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 97be5d982..e6c05c878 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -264,7 +264,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): applied_iname_rewrites=None, cache_manager=None, - index_dtype=np.int32, + index_dtype=None, options=None, state=KernelState.INITIAL, @@ -324,6 +324,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): name: inames.get(name, Iname(name, frozenset())) for name in _get_inames_from_domains(domains)} + if index_dtype is None: + index_dtype = np.int32 + # }}} # {{{ process assumptions @@ -1070,9 +1073,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): from loopy.kernel.function_interface import (CallableKernel, get_kw_pos_association) from loopy.isl_helpers import subst_into_pwaff + from loopy.symbolic import ResolvedFunction for insn in self.instructions: - if isinstance(insn, CallInstruction): + if isinstance(insn, CallInstruction) and isinstance( + insn.expression.function, ResolvedFunction): clbl = callables_table[insn.expression.function.name] if isinstance(clbl, CallableKernel): _, pos_to_kw = get_kw_pos_association(clbl.subkernel) diff --git a/loopy/program.py b/loopy/program.py index c8615d4d7..792abe59a 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -73,6 +73,14 @@ def find_in_knl_callable_from_identifier( return None +def _is_a_reduction_op(expr): + if isinstance(expr, ResolvedFunction): + return _is_a_reduction_op(expr.function) + + from loopy.library.reduction import ReductionOpFunction + return isinstance(expr, ReductionOpFunction) + + class CallableResolver(RuleAwareIdentityMapper): """ Resolves callables in expressions and records the names of the calls @@ -98,7 +106,14 @@ class CallableResolver(RuleAwareIdentityMapper): def map_call(self, expr, expn_state): from loopy.symbolic import parse_tagged_name - name, tag = parse_tagged_name(expr.function) + + if not _is_a_reduction_op(expr.function): + name, tag = parse_tagged_name(expr.function) + else: + if isinstance(expr.function, ResolvedFunction): + name = expr.function.function + else: + name = expr.function if name in self.known_callables: params = tuple(self.rec(par, expn_state) for par in expr.parameters) @@ -655,11 +670,6 @@ def make_program(kernel): callable kernel. """ - # get the program from program callables info - #FIXME:(For KK): do we need to register the current kernel in - # func_id_to_in_knl_callable_mappers - #FIXME(For inducer): Deriving the target of this program from the kernel's - # target. 
program = Program( callables_table={ kernel.name: CallableKernel(kernel)}, -- GitLab From d58a5d8cbb9446c885e6651c52a326a09abbad99 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 03:37:54 -0500 Subject: [PATCH 771/916] modernize tests - loop kernel attributes must be queried on loop kernels and not programs - function manglers -> ScalarCallable --- test/library_for_test.py | 33 ++++++++++++++++++++++++++++++ test/test_callables.py | 5 ----- test/test_loopy.py | 14 ++++++------- test/test_transform.py | 44 +++++++++++++++++++++------------------- 4 files changed, 63 insertions(+), 33 deletions(-) diff --git a/test/library_for_test.py b/test/library_for_test.py index a279e34cd..cfaacdc0e 100644 --- a/test/library_for_test.py +++ b/test/library_for_test.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np class NoRetFunction(lp.ScalarCallable): @@ -26,3 +27,35 @@ class NoRetFunction(lp.ScalarCallable): printf("Hi!\n"); } """) + + +class SingleArgNoRetFunction(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables): + input_dtype = arg_id_to_dtype.get(0) + if input_dtype is None: + return self, callables + + if input_dtype.numpy_dtype != np.float32: + raise RuntimeError("'f' only supports f32.") + + return (self.copy(arg_id_to_dtype=arg_id_to_dtype, + name_in_target="f"), + callables) + + def with_descrs(self, arg_id_to_descr, callables): + if len(arg_id_to_descr) != 0: + raise RuntimeError("'f' cannot take any inputs.") + + return (self.copy(arg_id_to_descr=arg_id_to_descr), + callables) + + def generate_preambles(self, target): + assert isinstance(target, lp.CFamilyTarget) + + yield ("10_define_f", + r""" + void f(float x) + { + printf("Hi!\n"); + } + """) diff --git a/test/test_callables.py b/test/test_callables.py index b7e2365a6..ef22b1632 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -223,9 +223,6 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): knl = lp.set_options(knl, "return_dict") - gsize, lsize = knl["caller"].get_grid_size_upper_bounds_as_exprs( - knl.callables_table) - if inline: knl = lp.inline_callable_kernel(knl, "linear_combo") @@ -234,8 +231,6 @@ def test_register_knl_with_hw_axes(ctx_factory, inline): x_host = x_dev.get() y_host = y_dev.get() - assert gsize == (4, 1) - assert lsize == (1, 4) assert np.linalg.norm(2*x_host+3*y_host-out["z"].get())/np.linalg.norm( 2*x_host+3*y_host) < 1e-15 diff --git a/test/test_loopy.py b/test/test_loopy.py index 3108ec5d2..1e728eefb 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1563,8 +1563,8 @@ def test_call_with_options(): "f() {id=init}" ) - from library_for_test import no_ret_f_mangler - knl = lp.register_function_manglers(knl, [no_ret_f_mangler]) + from library_for_test import NoRetFunction + knl = lp.register_callable(knl, "f", NoRetFunction("f")) print(lp.generate_code_v2(knl).device_code()) @@ -2826,7 +2826,7 @@ def test_shape_mismatch_check(ctx_factory): a = np.random.rand(10, 10).astype(np.float32) b = np.random.rand(10).astype(np.float32) - if prg.options.skip_arg_checks: + if prg["loopy_kernel"].options.skip_arg_checks: pytest.skip("args checks disabled, cannot check") with pytest.raises(TypeError, match="strides mismatch"): @@ -3101,16 +3101,16 @@ def test_deps_from_conditionals(): result = result + simul_reduce(sum, i, i*i) result = result + simul_reduce(sum, i, 2*i*i) end - """) + """, name="lpy_knl") ppknl = lp.preprocess_kernel(knl) # accumulator initializers must be dependency-less assert all(not insn.depends_on - for insn in 
ppknl.instructions + for insn in ppknl["lpy_knl"].instructions if "init" in insn.id) # accumulator initializers must not have inherited the predicates assert all(not insn.predicates - for insn in ppknl.instructions + for insn in ppknl["lpy_knl"].instructions if "init" in insn.id) # Ensure valid linearization exists: No valid linearization unless the @@ -3149,7 +3149,7 @@ def test_cached_written_variables_doesnt_carry_over_invalidly(): knl2 = loads(dumps(knl)) knl2 = lp.remove_instructions(knl2, {"write_b"}) - assert "b" not in knl2.get_written_variables() + assert "b" not in knl2["loopy_kernel"].get_written_variables() if __name__ == "__main__": diff --git a/test/test_transform.py b/test/test_transform.py index d3c04ef9d..9ac29766b 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -125,38 +125,37 @@ def test_to_batched(ctx_factory): def test_to_batched_temp(ctx_factory): ctx = ctx_factory() - prog = lp.make_kernel( - """ { [i,j]: 0<=i,j{: n_new=10}")) - == knl.assumptions) + (assumptions & isl.BasicSet("[n_new]->{: n_new=10}")) + == assumptions) def test_tag_iname_with_match_pattern(): @@ -753,6 +753,7 @@ def test_tag_iname_with_match_pattern(): """) knl = lp.tag_inames(knl, "i*:unr") + knl = knl["loopy_kernel"] i0_tag, = knl.inames["i0"].tags i1_tag, = knl.inames["i1"].tags @@ -778,6 +779,7 @@ def test_custom_iname_tag(): """) knl = lp.tag_inames(knl, {"ifuzz0": ElementLoopTag(), "ifuzz1": DOFLoopTag()}) + knl = knl["loopy_kernel"] ifuzz0_tag, = knl.inames["ifuzz0"].tags ifuzz1_tag, = knl.inames["ifuzz1"].tags -- GitLab From 9635ff1e5f20e6e74d81385346f7df3945557261 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 03:40:00 -0500 Subject: [PATCH 772/916] only do type inference for ResolvedFunctions --- loopy/type_inference.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index a2e8725f5..ee1ddf33d 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -416,6 +416,11 @@ class TypeInferenceMapper(CombineMapper): kw_parameters = {} identifier = expr.function + + if not isinstance(identifier, ResolvedFunction): + # function not resolved => exit + return [] + if isinstance(identifier, (Variable, ResolvedFunction)): identifier = identifier.name -- GitLab From 3615711065c332749894047a8979d4a1880533f7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 03:41:12 -0500 Subject: [PATCH 773/916] fix complex specific stuff --- loopy/target/c/__init__.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 133c81766..a45965c80 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -472,14 +472,17 @@ class CMathCallable(ScalarCallable): # {{{ (abs|max|min) -> (fabs|fmax|fmin) if name in ["abs", "min", "max"]: - name = "f" + name + dtype = np.find_common_type( + [], [dtype.numpy_dtype for dtype in arg_id_to_dtype.values()]) + if dtype.kind == "f": + name = "f" + name # }}} # unary functions if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", - "erf", "erfc"]: + "erf", "erfc", "abs", "real", "imag"]: for id in arg_id_to_dtype: if not -1 <= id <= 0: @@ -511,6 +514,12 @@ class CMathCallable(ScalarCallable): raise LoopyTypeError("{} does not support type {}".format(name, dtype)) + if dtype.kind == "c": + name = "c" + name + + if name in ["abs", "real", "imag"]: + dtype = real_dtype + 
return ( self.copy(name_in_target=name, arg_id_to_dtype={0: NumpyType(dtype), -1: @@ -589,7 +598,7 @@ def get_c_callables(): cmath_ids = ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", - "fabs", "tan", "erf", "erfc", "isnan"] + "fabs", "tan", "erf", "erfc", "isnan", "real", "imag"] return {id_: CMathCallable(id_) for id_ in cmath_ids} -- GitLab From 63fb2ae845f28b01e18ffa36127cf7152cc0883b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 03:42:16 -0500 Subject: [PATCH 774/916] minor fixes/typos --- loopy/kernel/creation.py | 5 ++++- loopy/kernel/function_interface.py | 7 ++++--- loopy/library/function.py | 3 ++- loopy/library/random123.py | 1 + loopy/preprocess.py | 6 ++---- loopy/symbolic.py | 2 +- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 38ec35e02..8a2e9cde1 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1084,6 +1084,9 @@ def parse_domains(domains, defines): result.append(dom) + if result == []: + result = [isl.BasicSet("{:}")] + return result # }}} @@ -2057,7 +2060,7 @@ class SliceToInameReplacer(IdentityMapper): space = space.add_dims(dim_type.param, len(args_as_params_for_domains)) for i, arg in enumerate(args_as_params_for_domains): - space = space.set_dim_name(dim_type.param, i, arg.name) + space = space.set_dim_name(dim_type.param, i, arg) iname_set = isl.BasicSet.universe(space) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index b74df73e7..6779a1bc7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -25,6 +25,7 @@ from loopy.diagnostic import LoopyError from loopy.tools import update_persistent_hash from loopy.kernel import LoopKernel +from loopy.kernel.array import ArrayBase from loopy.kernel.data import ValueArg, ArrayArg from loopy.symbolic import DependencyMapper, WalkMapper @@ -167,7 +168,7 @@ class ExpressionIsScalarChecker(WalkMapper): self.rec(child) def map_variable(self, expr): - from loopy.kernel.data import TemporaryVariable, ArrayArg + from loopy.kernel.data import TemporaryVariable, ArrayArg, auto if expr.name in self.kernel.all_inames(): # inames are scalar return @@ -177,7 +178,7 @@ class ExpressionIsScalarChecker(WalkMapper): if var is not None: if isinstance(var, (ArrayArg, TemporaryVariable)) and ( - var.shape != ()): + var.shape != () and var.shape is not auto): raise LoopyError("Array regions can only passed as sub-array refs.") def map_slice(self, expr): @@ -792,7 +793,7 @@ class CallableKernel(InKernelCallable): for arg in subkernel.args: kw = arg.name - if isinstance(arg, ArrayArg): + if isinstance(arg, ArrayBase): arg_id_to_descr[kw] = ( ArrayArgDescriptor(shape=arg.shape, dim_tags=arg.dim_tags, diff --git a/loopy/library/function.py b/loopy/library/function.py index 73241152f..d7558960a 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -22,6 +22,7 @@ THE SOFTWARE. 
from loopy.kernel.function_interface import ScalarCallable from loopy.diagnostic import LoopyError +from loopy.types import NumpyType import numpy as np @@ -50,7 +51,7 @@ class IndexOfCallable(ScalarCallable): new_arg_id_to_dtype = {i: dtype for i, dtype in arg_id_to_dtype.items() if dtype is not None} - new_arg_id_to_dtype[-1] = np.int32 + new_arg_id_to_dtype[-1] = NumpyType(np.int32) return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), callables_table) diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 14199b279..2d4f82205 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -167,6 +167,7 @@ class Random123Callable(ScalarCallable): """ Records information about for the random123 functions. """ + fields = ScalarCallable.fields | {"target"} def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None, target=None): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 4cbd91506..90e527ae4 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -1964,14 +1964,12 @@ def realize_reduction_for_single_kernel(kernel, callables_table, # Run reduction expansion. from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: - # FIXME[KK]: With the new mapper emitting callables_table - # something should be done. new_expressions = cb_mapper(insn.expression, - callables_table=callables_table, + callables_table=cb_mapper.callables_table, nresults=nresults) else: new_expressions = cb_mapper(insn.expression, - callables_table=callables_table), + callables_table=cb_mapper.callables_table), if generated_insns: # An expansion happened, so insert the generated stuff plus diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 170165d36..82f7525dc 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -293,7 +293,7 @@ class StringifyMapper(StringifyMapperBase): def map_resolved_function(self, expr, prec): # underlining a resolved call - return "\u0332".join(expr.name) + return "\u0332".join(str(expr.function)) def map_sub_array_ref(self, expr, prec): return "[{inames}]: {subscr}".format( -- GitLab From 42cc9a3d8d5a4756ef476d219426ddc36103fce8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 07:51:59 -0500 Subject: [PATCH 775/916] corrects docs for the new Program interface --- doc/ref_call.rst | 209 +++++++---------------------------------------- 1 file changed, 29 insertions(+), 180 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 5a59e8428..23d358c6e 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -1,193 +1,42 @@ -Calling Loopy Kernels and External Functions -============================================ +Function Interface +================== -Goals of a function interface ------------------------------ -- *FIXME: * Needs to change after the new design of program. +Resolving and specialization +---------------------------- -- Must be able to have complete information of the function just through the - epxression node. -- Must adhere to :mod:`loopy` semantics of immutability. -- Must have a class instance linked with the expression node which would record - the properties of the function. -- Must indicate in the expression if the function is known to the kernel. 
(This - is intended to be done by making the function expression node an instance of - ``ResolvedFunction`` as soon as the function definition is resolved by the - kernel) -- Function overloading is not encouraged in :mod:`loopy` as it gives rise to - contention while debugging with the help of the kernel intermediate - representation and hence if the expression nodes point to different function - instances they must differ in their representation. For example: ``float - sin(float )`` and ``double sin(double )`` should diverge by having different - identifiers as soon as data type of the argument is inferred. -- Must have an interface to register external functions. +In :mod:`loopy`, a :class:`loopy.program.Program` is a collection of callables +and entrypoints. Callable are of type +:class`:loopy.kernel.function_interface.InKernelCallable`. Any expression node +which has a callable corresponding to it appears as +:class:`~loopy.symbolic.ResolvedFunction`. The process of realizing a function as +a :class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as +resolving. -Scoped Function and resolving ------------------------------ +During code-generation process for a :class:`~loopy.program.Program`, a callable +is *specialized* depending on the types and shapes of the arguments passed at a +call site. For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to +begin with, but it later specialized to either ``sinf``, ``sin`` or ``sinl`` +depending on the type of its argument ``x``. A callable's behavior during type +or shape specialization is encoded via +:meth:`loopy.kernel.function_interface.InKernelCallable.with_dtypes` and +:meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. -``ResolvedFunctions`` are pymbolic nodes within expressions in a ``Loo.py`` -kernel, whose name has been resolved by the kernel. The process of matching a -function idenitifier with the function definition is called "resolving". -A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it -is "resolved" by one of the ``function_id_to_in_knl_callable_mapper`` in a -:attr:`LoopKernel.scoped_functions` - -- Functions already registered by the target. Some examples include -- - ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C-Targets.) -- Functions that are defined in ``Loo.py`` and are realized into - different set of instructions during code generation. Some examples - include ``make_tuple``, ``ArgExtOp``, ``index_of``, ... -- Functions registered as ``CallableKernels`` using - ``lp.register_callable_kernel(...)``. -- Functions that have been provided through - ``lp.register_function_id_to_in_knl_callable_mapper(...)`` -- Functions that can be made known from the user through - ``lp.register_function_mangler``. This is planned to be deprecated, - as its functionality is superseded by - ``lp.register_function_id_to_in_knl_callable_mapper(...)``. - -Expressions after a function is scoped --------------------------------------- - -Consider the following expression. - -:: - - sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i]) - -During the kernel creation phase, the kernel would know that ``sin`` is -a function known to the target and hence it should be scoped. 
And as -expected, after ``make_kernel`` has been called the above expression -would get converted to: - -:: - - ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + - callable_knl_func(c[i])*mangler_call(d[i]) - -This would also make an entry in the kernel's ``scoped_functions`` -dictionary as: - -:: - - {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None)} - -It might be noteworthy that at this step, it only scopes functions -through their names without any information about the types of the -function. - -Once, the user calls the transformation: -``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``, -the expression gets converted to: - -:: - - ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) + - ResolvedFunction('callable_knl_func')(c[i])*mangler_call(d[i]) - -This also makes an entry in the ``scoped_functions`` dictionary as -- - -:: - - {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None), - Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), - arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)} - -Now, if the user calls -``register_function_mangler(knl, 'mangler_call')``, one might expect -that the mangler call function should get scoped, but that does **not** -happen, because the "old" ``function_manglers``, would return a match -only if all the parameters of the function match viz. name, argument -arity and argument types. Hence, the ``scoped_functions`` dictionary -would remain unchanged. - -``ResolvedFunctions`` and specializations ---------------------------------------- - -Consider the same ``ResolvedFunction('sin')`` as above. This function -although scoped does not the know the types i.e. it does yet know that -for a ``C-Target``, whether it should emit ``sin`` or ``sinf`` or -``sinl``. Hence, right now the function can be called as a -"type-generic" function as further in the pipeline it can take any one -of the above definitions. The functions go through a "specialization" -processes at various points in the pipeline, where the attributes of the -callables are resolved. - -- During type inference, the functions go though type specialization - where in the ``arg_id_to_dtype`` of the functions is realized. -- During descriptor inference, the functions goes through a description - specialization where the ``arg_id_to_descr`` is populated. The - ``arg_id_to_descr`` contains important information regarding shape, - strides and scope of the arguments which form an important part of - ``CallableKernel`` as this information would be helpful to to - generate the function signature and make changes to the data access - pattern of the variables in the callee kernel. -- Whenever a ``ResolvedFunction`` goes through a specialization, this is - indicated by changing the name in the ``pymbolic`` node. - -If during type inference, it is inferred that the type of ``a[i]`` is -``np.float32``. The new ``pymbolic`` node would be: - -:: - - ResolvedFunction('sin_0')(a[i]) + ... - -This name change is done so that it indicates that the node points to a -different ``ScalarCallable`` in the dictionary. 
And hence a new entry is -added to the ``scoped_functions`` dictionary as: - -:: - - {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None), - Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...), - arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None), - 'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0:np.float32, - -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')} - -Description Inference +Registering callables --------------------- -Although this step has no significance for a ``ScalarCallable``, it -forms a very important part of ``CallableKernel``. In which the -``dim_tags``, ``shape`` and ``address_space`` of the arguments of the -callable kernel is altered. - -- The ``dim_tags`` attribute helps to ensure that the memory layout - between the caller and the callee kernel is coherent. -- The ``address_space`` attribute ensures that, while writing the device - code we emit the appropriate scope qualifiers for the function - declaration arguments. -- The ``shape`` attribute helps in: - - - Storage allocation. - - Memory layout. - - Out of bounds accesses to be caught in ``Loo.py``. - -Hence, in the ``Loo.py`` pipeline, one might expect the following -developments of the ``sin`` pymbolic call expression node. - -:: +A user can *register* callables within a :class:`~loopy.program.Program` to +allow loopy to resolve calls not pre-defined in :mod:`loopy`. In :mod:`loopy`, +we typically aim to expose all the standard math functions defined for +a :class:`~loopy.target.Target`. Other foreign functions could be invoked by +*registering* them. - sin -> (Kernel creation) -> ResolvedFunction(Variable('sin')) -> - (Type Inference) -> ResolvedFunction(Variable('sin_0')) -> - (Descriptor Inference) -> ResolvedFunction(Variable('sin_1')) - -Changes on the target side to accommodate the new function interface --------------------------------------------------------------------- - -The earlier "function\_mangler" as a member method of the class -``lp.ASTBuilderBase`` will be replaced by ``function_id_in_knl_callable_mapper``. The -function scopers would return a list of functions with the signature -``(target, identifier)->lp.InKernelCallable``. - -An example: Calling BLAS ------------------------- +An example demonstrating registering a CBlasGemv as a loopy callable: .. literalinclude:: ../examples/python/external-call.py + +A :class:`loopy.kernel.LoopKernel` can be registered via +:func:`loopy.transform.callable.register_callable_kernel`. 
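A minimal sketch of the same registration pattern, mirroring ``test/library_for_test.py`` above (the callable name ``my_sq`` and the kernel are illustrative only; a matching C-level definition, e.g. via ``generate_preambles``, would still be needed before code generation)::

    import loopy as lp


    class MySqCallable(lp.ScalarCallable):
        def with_types(self, arg_id_to_dtype, callables):
            input_dtype = arg_id_to_dtype.get(0)
            if input_dtype is None:
                # argument dtype not yet known: defer specialization
                return self, callables

            # record argument/return dtypes and the name emitted in target code
            return (self.copy(arg_id_to_dtype={0: input_dtype, -1: input_dtype},
                              name_in_target="my_sq"),
                    callables)


    prog = lp.make_kernel("{[i]: 0<=i<10}", "out[i] = my_sq(a[i])")
    prog = lp.register_callable(prog, "my_sq", MySqCallable("my_sq"))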
-- GitLab From 59c7f81033de5dde81b0220b18cbb4bdc562d48d Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Apr 2021 08:34:32 -0500 Subject: [PATCH 776/916] Remove ignore_boostable_into=True from gnuma test --- test/test_numa_diff.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index bb0a28126..40309f70f 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -229,7 +229,6 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa hsv = tap_hsv hsv = lp.set_options(hsv, - ignore_boostable_into=True, cl_build_options=[ "-cl-denorms-are-zero", "-cl-fast-relaxed-math", -- GitLab From f652ad1d2ed71ca88c4528a6d390cf6f098f2190 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Apr 2021 09:07:41 -0500 Subject: [PATCH 777/916] Add a comment clarifying that SeenFunction and callables don't talk --- loopy/codegen/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 3c02a724b..17c5d3480 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -152,7 +152,10 @@ class VectorizationInfo: class SeenFunction(ImmutableRecord): - """ + """This is used to track functions that emerge late during code generation, + e.g. C functions to realize arithmetic. No connection with + :class:`~loopy.kernel.function_interface.InKernelCallable`. + .. attribute:: name .. attribute:: c_name .. attribute:: arg_dtypes -- GitLab From 327c1216e86a48403c4966daa2d09a7c4057da5c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Apr 2021 09:08:02 -0500 Subject: [PATCH 778/916] Drop SeenFunction handling from py target --- loopy/target/python.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/loopy/target/python.py b/loopy/target/python.py index c7f20ff55..d30dd41a7 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -103,13 +103,6 @@ class ExpressionToPythonMapper(StringifyMapper): str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(clbl.name, - clbl.name_in_target, - clbl.input_dtypes, - clbl.result_dtypes)) - return "{}({})".format(clbl.name_in_target, ", ".join(str_parameters)) -- GitLab From 262fc34afc55a40b4d0cebb107f4cec26f045898 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Apr 2021 09:20:18 -0500 Subject: [PATCH 779/916] Drop forced_result_type in reductions --- loopy/library/reduction.py | 28 ++++------------------------ test/test_nbody.py | 4 ++-- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 1d53d06b0..1417568f2 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -100,42 +100,25 @@ class ReductionOperation: class ScalarReductionOperation(ReductionOperation): - def __init__(self, forced_result_type=None): - """ - :arg forced_result_type: Force the reduction result to be of this type. - May be a string identifying the type for the backend under - consideration. 
- """ - self.forced_result_type = forced_result_type - @property def arg_count(self): return 1 def result_dtypes(self, arg_dtype): - if self.forced_result_type is not None: - raise NotImplementedError() - # return (self.parse_result_type( - # kernel.target, self.forced_result_type),) - if arg_dtype is None: return None return (arg_dtype,) def __hash__(self): - return hash((type(self), self.forced_result_type)) + return hash((type(self),)) def __eq__(self, other): - return (type(self) == type(other) - and self.forced_result_type == other.forced_result_type) + return type(self) == type(other) def __str__(self): result = type(self).__name__.replace("ReductionOperation", "").lower() - if self.forced_result_type is not None: - result = "{}<{}>".format(result, str(self.forced_result_type)) - return result @@ -528,11 +511,8 @@ def parse_reduction_op(name): red_op_match = re.match(r"^([a-z]+)_([a-z0-9_]+)$", name) if red_op_match: - op_name = red_op_match.group(1) - op_type = red_op_match.group(2) - - if op_name in _REDUCTION_OPS: - return _REDUCTION_OPS[op_name](op_type) + raise NotImplementedError("reductions with specified types are no longer " + "supported") if name in _REDUCTION_OPS: return _REDUCTION_OPS[name]() diff --git a/test/test_nbody.py b/test/test_nbody.py index 1254be7d3..3a8d509d6 100644 --- a/test/test_nbody.py +++ b/test/test_nbody.py @@ -45,8 +45,8 @@ def test_nbody(ctx_factory): "[N] -> {[i,j,k]: 0<=i,j Date: Mon, 19 Apr 2021 09:31:37 -0500 Subject: [PATCH 780/916] Be less aggressive about reporting reduction-with-forced-result no longer existing --- loopy/library/reduction.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 1417568f2..468aaff26 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -511,8 +511,14 @@ def parse_reduction_op(name): red_op_match = re.match(r"^([a-z]+)_([a-z0-9_]+)$", name) if red_op_match: - raise NotImplementedError("reductions with specified types are no longer " - "supported") + op_name = red_op_match.group(1) + + if op_name in _REDUCTION_OPS: + from warnings import warn + warn("Reductions with forced result types are no longer supported. 
" + f"Encountered '{name}', which might be one.", + DeprecationWarning) + return None if name in _REDUCTION_OPS: return _REDUCTION_OPS[name]() -- GitLab From 02a901a0f068a31dbaa3f4d2966197313d444e58 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Apr 2021 10:58:03 -0500 Subject: [PATCH 781/916] Drop test_preamble_with_separate_temporaries (we no longer support function manglers) --- test/test_loopy.py | 60 ----------------------- test/testlib.py | 117 --------------------------------------------- 2 files changed, 177 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1e728eefb..1be7ba732 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2520,66 +2520,6 @@ def test_wildcard_dep_matching(): {"insn1", "insn5"}) -def test_preamble_with_separate_temporaries(ctx_factory): - # create a function mangler - - # and finally create a test - n = 5 - # for each entry come up with a random number of data points - num_data = np.asarray(np.random.randint(2, 10, size=n), dtype=np.int32) - # turn into offsets - offsets = np.asarray(np.hstack(([0], np.cumsum(num_data))), dtype=np.int32) - # create lookup data - lookup = np.empty(0) - for i in num_data: - lookup = np.hstack((lookup, np.arange(i))) - lookup = np.asarray(lookup, dtype=np.int32) - # and create data array - data = np.random.rand(np.product(num_data)) - - # make kernel - kernel = lp.make_kernel("{[i]: 0 <= i < n}", - """ - for i - <>ind = indirect(offsets[i], offsets[i + 1], 1) - out[i] = data[ind] - end - """, - [lp.GlobalArg("out", shape=("n",)), - lp.TemporaryVariable( - "offsets", shape=(offsets.size,), initializer=offsets, - address_space=lp.AddressSpace.GLOBAL, - read_only=True), - lp.GlobalArg("data", shape=(data.size,), dtype=np.float64)], - ) - - # fixt params, and add manglers / preamble - from testlib import ( - SeparateTemporariesPreambleTestMangler, - SeparateTemporariesPreambleTestPreambleGenerator, - ) - func_info = dict( - func_name="indirect", - func_arg_dtypes=(np.int32, np.int32, np.int32), - func_result_dtypes=(np.int32,), - arr=lookup - ) - - kernel = lp.fix_parameters(kernel, **{"n": n}) - kernel = lp.register_preamble_generators( - kernel, [SeparateTemporariesPreambleTestPreambleGenerator(**func_info)]) - kernel = lp.register_function_manglers( - kernel, [SeparateTemporariesPreambleTestMangler(**func_info)]) - - print(lp.generate_code(kernel)[0]) - # and call (functionality unimportant, more that it compiles) - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) - # check that it actually performs the lookup correctly - assert np.allclose(kernel( - queue, data=data.flatten("C"))[1][0], data[offsets[:-1] + 1]) - - def test_arg_inference_for_predicates(): prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ diff --git a/test/testlib.py b/test/testlib.py index 7009e8f5a..847c7423a 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -17,123 +17,6 @@ class GridOverride: # }}} -# {{{ test_preamble_with_separate_temporaries - -class SeparateTemporariesPreambleTestDataHolder: - def __init__(self, func_name, func_arg_dtypes, func_result_dtypes, arr): - self.func_name = func_name - self.func_arg_dtypes = func_arg_dtypes - self.func_result_dtypes = func_result_dtypes - self.arr = arr - - def __eq__(self, other): - import numpy as np - return ( - isinstance(other, type(self)) - and self.func_name == other.func_name - and self.func_arg_dtypes == other.func_arg_dtypes - and self.func_result_dtypes == other.func_result_dtypes - and np.array_equal(self.arr, other.arr)) - - def 
__ne__(self, other): - return not self.__eq__(other) - - -class SeparateTemporariesPreambleTestMangler( - SeparateTemporariesPreambleTestDataHolder): - def __call__(self, kernel, name, arg_dtypes): - """ - A function that will return a :class:`loopy.kernel.data.CallMangleInfo` - to interface with the calling :class:`loopy.LoopKernel` - """ - if name != self.func_name: - return None - - from loopy.types import to_loopy_type - from loopy.kernel.data import CallMangleInfo - - def __compare(d1, d2): - # compare dtypes ignoring atomic - return to_loopy_type(d1, for_atomic=True) == \ - to_loopy_type(d2, for_atomic=True) - - # check types - if len(arg_dtypes) != len(arg_dtypes): - raise Exception("Unexpected number of arguments provided to mangler " - "{}, expected {}, got {}".format( - self.func_name, len(self.func_arg_dtypes), - len(arg_dtypes))) - - for i, (d1, d2) in enumerate(zip(self.func_arg_dtypes, arg_dtypes)): - if not __compare(d1, d2): - raise Exception("Argument at index {} for mangler {} does not " - "match expected dtype. Expected {}, got {}". - format(i, self.func_name, str(d1), str(d2))) - - # get target for creation - target = arg_dtypes[0].target - return CallMangleInfo( - target_name=self.func_name, - result_dtypes=tuple(to_loopy_type(x, target=target) for x in - self.func_result_dtypes), - arg_dtypes=arg_dtypes) - - -class SeparateTemporariesPreambleTestPreambleGenerator( - SeparateTemporariesPreambleTestDataHolder): - def __call__(self, preamble_info): - - # find a function matching our name - func_match = next( - (x for x in preamble_info.seen_functions - if x.name == self.func_name), None) - desc = "custom_funcs_indirect" - if func_match is not None: - from loopy.types import to_loopy_type - # check types - if tuple(to_loopy_type(x) for x in self.func_arg_dtypes) == \ - func_match.arg_dtypes: - # if match, create our temporary - var = lp.TemporaryVariable( - "lookup", initializer=self.arr, dtype=self.arr.dtype, - shape=self.arr.shape, - address_space=lp.AddressSpace.GLOBAL, read_only=True) - # and code - code = """ - int {name}(int start, int end, int match) - {{ - int result = start; - for (int i = start + 1; i < end; ++i) - {{ - if (lookup[i] == match) - result = i; - }} - return result; - }} - """.format(name=self.func_name) - - # generate temporary variable code - from cgen import Initializer - from loopy.target.c import generate_array_literal - codegen_state = preamble_info.codegen_state.copy( - is_generating_device_code=True) - kernel = preamble_info.kernel - ast_builder = codegen_state.ast_builder - target = kernel.target - decl_info, = var.decl_info(target, index_dtype=kernel.index_dtype) - decl = ast_builder.wrap_global_constant( - ast_builder.get_temporary_decl( - codegen_state, None, var, - decl_info)) - if var.initializer is not None: - decl = Initializer(decl, generate_array_literal( - codegen_state, var, var.initializer)) - # return generated code - yield (desc, "\n".join([str(decl), code])) - -# }}} - - # {{{ test_register_function_lookup class Log2Callable(lp.ScalarCallable): -- GitLab From ede1c09b2ebcf1bca18b288f569750a1668c3969 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Apr 2021 12:53:53 -0500 Subject: [PATCH 782/916] Fix deprecation warnings relating to get_one_scheduled_kernel --- loopy/codegen/__init__.py | 4 ++-- loopy/kernel/tools.py | 4 ++-- loopy/schedule/__init__.py | 2 +- loopy/statistics.py | 2 +- loopy/target/execution.py | 4 ++-- loopy/transform/save.py | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git 
a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 3c02a724b..f04c276e1 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -684,10 +684,10 @@ def generate_code_v2(program): for name, clbl in program.callables_table.items(): if isinstance(clbl, CallableKernel): - from loopy.schedule import get_one_scheduled_kernel + from loopy.schedule import get_one_linearized_kernel knl = clbl.subkernel if knl.schedule is None: - knl = get_one_scheduled_kernel( + knl = get_one_linearized_kernel( knl, program.callables_table) new_callables[name] = clbl.copy(subkernel=knl) elif isinstance(clbl, ScalarCallable): diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 5cae76192..fa924467b 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -506,8 +506,8 @@ def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, if iname_cluster and not kernel.schedule: try: - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel, callables_table) + from loopy.schedule import get_one_linearized_kernel + kernel = get_one_linearized_kernel(kernel, callables_table) except RuntimeError as e: iname_cluster = False from warnings import warn diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 91f7cf70f..0951db869 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -2141,7 +2141,7 @@ def get_one_scheduled_kernel(kernel, callables_table): kernel, "get_one_scheduled_kernel_deprecated", "get_one_scheduled_kernel is deprecated. " "Use get_one_linearized_kernel instead.", - DeprecationWarning) + DeprecationWarning, stacklevel=2) return get_one_linearized_kernel(kernel, callables_table) diff --git a/loopy/statistics.py b/loopy/statistics.py index f5ecf5b75..a20f6fe48 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -2028,7 +2028,7 @@ def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, def _get_synchronization_map_for_single_kernel(knl, callables_table, subgroup_size=None): - knl = lp.get_one_scheduled_kernel(knl, callables_table) + knl = lp.get_one_linearized_kernel(knl, callables_table) from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel, ReturnFromKernel, RunInstruction) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index a8666f02b..3d4d71147 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -763,10 +763,10 @@ class KernelExecutorBase: from loopy.preprocess import preprocess_program program = preprocess_program(program) - from loopy.schedule import get_one_scheduled_kernel + from loopy.schedule import get_one_linearized_kernel for e in program.entrypoints: program = program.with_kernel( - get_one_scheduled_kernel(program[e], program.callables_table)) + get_one_linearized_kernel(program[e], program.callables_table)) return program diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 884e17f77..7c7f00932 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -751,8 +751,8 @@ def save_and_reload_temporaries(program, entrypoint=None): if not knl.schedule: program = lp.preprocess_program(program) - from loopy.schedule import get_one_scheduled_kernel - knl = get_one_scheduled_kernel(program[entrypoint], + from loopy.schedule import get_one_linearized_kernel + knl = get_one_linearized_kernel(program[entrypoint], program.callables_table) assert knl.schedule is not None -- GitLab From 3f72beace86cc4791b1c090fdfd3c44cf0678c7f Mon Sep 17 00:00:00 2001 From: 
Andreas Kloeckner Date: Mon, 19 Apr 2021 13:03:12 -0500 Subject: [PATCH 783/916] warn_with_kernel: support stacklevel --- loopy/diagnostic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 0ae2e530a..c471facc8 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -48,7 +48,7 @@ class WriteRaceConditionWarning(LoopyWarning): # }}} -def warn_with_kernel(kernel, id, text, type=LoopyWarning): +def warn_with_kernel(kernel, id, text, type=LoopyWarning, stacklevel=None): from fnmatch import fnmatchcase for sw in kernel.silenced_warnings: if fnmatchcase(id, sw): @@ -57,8 +57,12 @@ def warn_with_kernel(kernel, id, text, type=LoopyWarning): text += (" (add '%s' to silenced_warnings kernel attribute to disable)" % id) + if stacklevel is None: + stacklevel = 2 + else: + stacklevel = stacklevel + 1 from warnings import warn - warn(f"in kernel {kernel.name}: {text}", type, stacklevel=2) + warn(f"in kernel {kernel.name}: {text}", type, stacklevel=stacklevel) warn = MovedFunctionDeprecationWrapper(warn_with_kernel) -- GitLab From 028103bbb1e6a78b7e359cff5643a93b59c5ab76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 19 Apr 2021 14:41:53 -0500 Subject: [PATCH 784/916] Delete LoopKernel.default_entrypoint on kernel_callables --- loopy/kernel/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 26f05ef81..e6c05c878 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1643,11 +1643,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): return super().copy(**kwargs) - # forward compatibility with kernel callables - @property - def default_entrypoint(self): - return self - # }}} # vim: foldmethod=marker -- GitLab From ae2e78a5b91fd1b092eaff97b86c2ae4709a917a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 14:45:22 -0500 Subject: [PATCH 785/916] [CI, intelCL]: keep cl_ctx alive --- test/test_callables.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/test_callables.py b/test/test_callables.py index ef22b1632..9545cdf53 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -628,7 +628,8 @@ def test_incomplete_entrypoint_raises_type_inf_failure(): def test_callees_with_gbarriers_are_inlined(ctx_factory): - queue = cl.CommandQueue(ctx_factory()) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) ones_and_zeros = lp.make_function( "{[i, j]: 0<=i<6 and 0<=j<3}", @@ -655,7 +656,8 @@ def test_callees_with_gbarriers_are_inlined(ctx_factory): def test_inlining_with_indirections(ctx_factory): - queue = cl.CommandQueue(ctx_factory()) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) ones_and_zeros = lp.make_function( "{[i, j]: 0<=i<6 and 0<=j<3}", @@ -686,7 +688,8 @@ def test_inlining_with_indirections(ctx_factory): def test_inlining_with_callee_domain_param(ctx_factory): - queue = cl.CommandQueue(ctx_factory()) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) fill2 = lp.make_function( "{[i]: 0<=i Date: Mon, 19 Apr 2021 12:53:21 -0500 Subject: [PATCH 786/916] [docs]: handle all sphinx warnings --- doc/conf.py | 1 + doc/index.rst | 1 + doc/ref_call.rst | 27 ++++++++------ doc/ref_program.rst | 6 +++ doc/tutorial.rst | 2 +- loopy/kernel/function_interface.py | 60 ++++++++++++++++++------------ loopy/library/reduction.py | 2 + loopy/program.py | 17 +++++---- loopy/statistics.py | 5 ++- loopy/symbolic.py | 14 +++---- 10 files changed, 81 
insertions(+), 54 deletions(-) create mode 100644 doc/ref_program.rst diff --git a/doc/conf.py b/doc/conf.py index 9b8cf81e1..4c6d28c9d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -116,6 +116,7 @@ intersphinx_mapping = { "https://documen.tician.de/cgen": None, "https://documen.tician.de/pymbolic": None, "https://documen.tician.de/pytools": None, + "https://pyrsistent.readthedocs.io/en/latest/": None, } autoclass_content = "class" diff --git a/doc/index.rst b/doc/index.rst index 8ab62928d..d3cb6f38c 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -42,6 +42,7 @@ Please check :ref:`installation` to get started. tutorial ref_creation ref_kernel + ref_program ref_transform ref_call ref_other diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 23d358c6e..4ba3246a2 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -1,3 +1,8 @@ +.. currentmodule:: loopy + + +.. _func-interface: + Function Interface ================== @@ -5,38 +10,38 @@ Function Interface Resolving and specialization ---------------------------- -In :mod:`loopy`, a :class:`loopy.program.Program` is a collection of callables +In :mod:`loopy`, a :class:`loopy.Program` is a collection of callables and entrypoints. Callable are of type :class`:loopy.kernel.function_interface.InKernelCallable`. Any expression node which has a callable corresponding to it appears as :class:`~loopy.symbolic.ResolvedFunction`. The process of realizing a function as -a :class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as -resolving. +a :class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as resolving. -During code-generation process for a :class:`~loopy.program.Program`, a callable +During code-generation process for a :class:`~loopy.Program`, a callable is *specialized* depending on the types and shapes of the arguments passed at a call site. For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to begin with, but it later specialized to either ``sinf``, ``sin`` or ``sinl`` depending on the type of its argument ``x``. A callable's behavior during type or shape specialization is encoded via -:meth:`loopy.kernel.function_interface.InKernelCallable.with_dtypes` and -:meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. +:meth:`~loopy.kernel.function_interface.InKernelCallable.with_types` and +:meth:`~loopy.kernel.function_interface.InKernelCallable.with_descrs`. Registering callables --------------------- -A user can *register* callables within a :class:`~loopy.program.Program` to +A user can *register* callables within a :class:`~loopy.Program` to allow loopy to resolve calls not pre-defined in :mod:`loopy`. In :mod:`loopy`, we typically aim to expose all the standard math functions defined for -a :class:`~loopy.target.Target`. Other foreign functions could be invoked by +a :class:`~loopy.target.TargetBase`. Other foreign functions could be invoked by *registering* them. An example demonstrating registering a CBlasGemv as a loopy callable: -.. literalinclude:: ../examples/python/external-call.py +.. literalinclude:: ../examples/python/call-external.py +Reference +--------- -A :class:`loopy.kernel.LoopKernel` can be registered via -:func:`loopy.transform.callable.register_callable_kernel`. +.. automodule:: loopy.kernel.function_interface diff --git a/doc/ref_program.rst b/doc/ref_program.rst new file mode 100644 index 000000000..2e4d5b9bc --- /dev/null +++ b/doc/ref_program.rst @@ -0,0 +1,6 @@ +.. currentmodule:: loopy + +Program +======= + +.. 
autoclass:: Program diff --git a/doc/tutorial.rst b/doc/tutorial.rst index d93be3e58..7732aacbe 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1607,7 +1607,7 @@ One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_wi grouping, and evaluating subsets of the counts. Suppose we want to know the total number of 32-bit operations of any kind. We can easily count these using functions :func:`loopy.ToCountMap.filter_by` and -:func:`loopy.ToCountMap.eval_and_sum`: +:func:`loopy.ToCountPolynomialMap.eval_and_sum`: .. doctest:: diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 6779a1bc7..81ae58343 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -30,15 +30,17 @@ from loopy.kernel.data import ValueArg, ArrayArg from loopy.symbolic import DependencyMapper, WalkMapper __doc__ = """ - -.. currentmodule:: loopy +.. currentmodule:: loopy.kernel.function_interface .. autoclass:: ValueArgDescriptor + .. autoclass:: ArrayArgDescriptor + .. autoclass:: InKernelCallable + .. autoclass:: CallableKernel -.. autoclass:: ScalarCallable +.. autoclass:: ScalarCallable """ @@ -60,7 +62,7 @@ class ArrayArgDescriptor(ImmutableRecord): """ Records information about an array argument to an in-kernel callable. To be passed to and returned from - :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used for + :meth:`InKernelCallable.with_descrs`, used for matching shape and address space of caller and callee kernels. ..attribute:: shape @@ -69,7 +71,7 @@ class ArrayArgDescriptor(ImmutableRecord): .. attribute:: address_space - An attribute of :class:`loopy.kernel.data.AddressSpace`. + An attribute of :class:`loopy.AddressSpace`. .. attribute:: dim_tags @@ -332,14 +334,6 @@ class InKernelCallable(ImmutableRecord): A mapping which gives indicates the argument shape and ``dim_tags`` it would be responsible for generating code. - .. note:: - - "``arg_id`` can either be an instance of :class:`int` integer - corresponding to the position of the argument or an instance of - :class:`str` corresponding to the name of keyword argument accepted - by the function. - - - Negative "arg_id" values ``-i`` in the mapping attributes indicate - return value with (0-based) index *i*. .. automethod:: __init__ .. automethod:: with_types @@ -350,6 +344,17 @@ class InKernelCallable(ImmutableRecord): .. automethod:: emit_call .. automethod:: emit_call_insn .. automethod:: is_ready_for_codegen + + .. note:: + + * "``arg_id`` can either be an instance of :class:`int` integer + corresponding to the position of the argument or an instance of + :class:`str` corresponding to the name of keyword argument accepted + by the function. + + * Negative "arg_id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. + """ fields = {"arg_id_to_dtype", "arg_id_to_descr"} @@ -390,7 +395,7 @@ class InKernelCallable(ImmutableRecord): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for positional arguments, names for keyword arguments) to - :class:`loopy.ArrayArgDescriptor` instances. Unspecified/unknown + :class:`ArrayArgDescriptor` instances. Unspecified/unknown descriptors are not represented in *arg_id_to_descr*. 
All the expressions in arg_id_to_descr must have variables that belong @@ -411,8 +416,7 @@ class InKernelCallable(ImmutableRecord): def with_target(self, target): """ Returns a copy of *self* with all the ``dtypes`` in - ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. Refer - :meth:`loopy.types.LoopyType.with_target`. + ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. :arg target: An instance of :class:`loopy.target.TargetBase`. """ @@ -499,6 +503,10 @@ class ScalarCallable(InKernelCallable): """ An abstract interface the to a scalar callable encountered in a kernel. + .. automethod:: with_types + + .. automethod:: with_descrs + .. note:: The :meth:`ScalarCallable.with_types` is intended to assist with type @@ -654,21 +662,25 @@ class CallableKernel(InKernelCallable): """ Records informations about a callee kernel. Also provides interface through member methods to make the callee kernel compatible to be called from a - caller kernel. The :meth:`loopy.register_callable_kernel` should be called - in order to initiate association between a function in caller kernel and - the callee kernel. + caller kernel. :meth:`CallableKernel.with_types` should be called in order to match the ``dtypes`` of the arguments that are shared between the caller and the callee kernel. :meth:`CallableKernel.with_descrs` should be called in order to match - :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape`, - :attr:`ArrayArgDescriptor.address_space`` of the arguments shared between the - caller and the callee kernel. + the arguments' shapes/strides across the caller and the callee kernel. + + :meth:`CallableKernel.with_hw_axes_sizes` should be called to set the grid + sizes for the :attr:`CallableKernel.subkernel` of the callable. + + .. attribute:: subkernel + + :class:`~loopy.LoopKernel` which is being called. - :meth:`CallableKernel.with_hw_axes` should be called to set the grid - sizes for the :attr:`subkernel` of the callable. + .. automethod:: with_descrs + .. automethod:: with_types + .. automethod:: with_hw_axes_sizes """ fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"} diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 468aaff26..6f97e1667 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -45,6 +45,8 @@ __doc__ = """ .. autoclass:: MaxReductionOperation .. autoclass:: MinReductionOperation + +.. autoclass:: ReductionOpFunction """ diff --git a/loopy/program.py b/loopy/program.py index 792abe59a..2b7c61e2c 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -49,6 +49,7 @@ __doc__ = """ .. autoclass:: Program .. autofunction:: make_program + .. autofunction:: iterate_over_kernels_if_given_program """ @@ -162,7 +163,7 @@ class Program(ImmutableRecord): An instance of :class:`pyrsistent.PMap` mapping the function identifiers in a kernel to their associated instances of - :class:`loopy.kernel.function_interface.InKernelCallable`. + :class:`~loopy.kernel.function_interface.InKernelCallable`. .. attribute:: target @@ -174,17 +175,16 @@ class Program(ImmutableRecord): TargetBase, function_indentifier: str)`` that would return an instance of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + .. automethod:: copy + .. automethod:: __getitem__ + .. note:: - To create an instance of :class:`loopy.Program`, it is recommended to - go through :method:`loopy.make_kernel`. + go through :func:`loopy.make_kernel`. 
- This data structure and its attributes should be considered - immutable, any modifications should be done through :method:`copy`. + immutable, any modifications should be done through :meth:`~Program.copy`. - .. automethod:: __init__ - .. method:: __getitem__ - - Look up the resolved callable with identifier *name*. """ def __init__(self, entrypoints=frozenset(), @@ -291,6 +291,9 @@ class Program(ImmutableRecord): return self.copy(callables_table=new_callables) def __getitem__(self, name): + """ + Look up the resolved callable with identifier *name*. + """ result = self.callables_table[name] if isinstance(result, CallableKernel): return result.subkernel diff --git a/loopy/statistics.py b/loopy/statistics.py index a20f6fe48..96d96e3a4 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -185,7 +185,6 @@ class ToCountMap: .. automethod:: group_by .. automethod:: to_bytes .. automethod:: sum - .. automethod:: eval_and_sum """ @@ -463,7 +462,9 @@ class ToCountMap: class ToCountPolynomialMap(ToCountMap): """Maps any type of key to a :class:`islpy.PwQPolynomial` or a - :class:`GuardedPwQPolynomial`. + :class:`~loopy.statistics.GuardedPwQPolynomial`. + + .. automethod:: eval_and_sum """ def __init__(self, space, count_map=None): diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 82f7525dc..c6f1fdd8c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -89,6 +89,8 @@ __doc__ = """ .. autoclass:: ExpansionState .. autoclass:: RuleAwareIdentityMapper + +.. autoclass:: ResolvedFunction """ @@ -800,19 +802,13 @@ class RuleArgument(LoopyExpressionBase): class ResolvedFunction(LoopyExpressionBase): """ - A function invocation whose definition is known in a :mod:`loopy` kernel. - Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression - points to an instance of - :class:`loopy.kernel.function_interface.InKernelCallable` through the - mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer - :ref:`ref_scoped_function` for a slightly detailed explanation on scoped - functions. + A function invocation whose definition is known in a :mod:`loopy` program. + Refer ref:`func-interface`. .. attribute:: function An instance of :class:`pymbolic.primitives.Variable`, - :class:`loopy.library.reduction.ArgExtOp` or - :class:`loopy.library.reduction.SegmentedOp`. + an instance of :class:`loopy.library.reduction.ReductionOpFunction` """ init_arg_names = ("function", ) -- GitLab From 493937997ee210b0022e5a7ac78956bea4ab9599 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 14:10:29 -0500 Subject: [PATCH 787/916] [docs]: grammar, typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Andreas Klöckner --- loopy/program.py | 4 +++- loopy/symbolic.py | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index 2b7c61e2c..a975ec000 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -292,7 +292,9 @@ class Program(ImmutableRecord): def __getitem__(self, name): """ - Look up the resolved callable with identifier *name*. + For the callable named *name*, return a :class:`loopy.LoopKernel` if + it's a :class:`~loopy.kernel.function_interface.CallableKernel` + otherwise return the callable itself. 
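        An illustrative sketch of this lookup, as relied upon by the
        modernized tests in this series (``loopy_kernel`` is the default
        kernel name assigned by :func:`loopy.make_kernel` when none is
        given)::

            prog = lp.make_kernel("{[i]: 0<=i<10}", "out[i] = 2*a[i]")

            # a LoopKernel, since the resolved callable is a CallableKernel
            knl = prog["loopy_kernel"]
            print(knl.get_written_variables())

            # with exactly one entrypoint, default_entrypoint is the same kernel
            assert prog.default_entrypoint.name == "loopy_kernel"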
""" result = self.callables_table[name] if isinstance(result, CallableKernel): diff --git a/loopy/symbolic.py b/loopy/symbolic.py index c6f1fdd8c..19c75c08c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -803,12 +803,14 @@ class RuleArgument(LoopyExpressionBase): class ResolvedFunction(LoopyExpressionBase): """ A function invocation whose definition is known in a :mod:`loopy` program. - Refer ref:`func-interface`. + A function is said to be *known* in a :class:`~loopy.Program` if it's + identifier maps to :class:`~loopy.kernel.function_interface.InKernelCallable` + in :attr:`loopy.Program.callables_table`. Refer to :ref:`func-interface`. .. attribute:: function - An instance of :class:`pymbolic.primitives.Variable`, - an instance of :class:`loopy.library.reduction.ReductionOpFunction` + An instance of :class:`pymbolic.primitives.Variable` or + :class:`loopy.library.reduction.ReductionOpFunction`. """ init_arg_names = ("function", ) -- GitLab From c4726582ccf67a81766580fbc7e49636ca25ecf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 19 Apr 2021 14:30:30 -0500 Subject: [PATCH 788/916] [grammar] Fix ResolvedFunction docstring --- loopy/symbolic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 19c75c08c..63aa6a4d7 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -803,8 +803,8 @@ class RuleArgument(LoopyExpressionBase): class ResolvedFunction(LoopyExpressionBase): """ A function invocation whose definition is known in a :mod:`loopy` program. - A function is said to be *known* in a :class:`~loopy.Program` if it's - identifier maps to :class:`~loopy.kernel.function_interface.InKernelCallable` + A function is said to be *known* in a :class:`~loopy.Program` if its + identifier maps to an :class:`~loopy.kernel.function_interface.InKernelCallable` in :attr:`loopy.Program.callables_table`. Refer to :ref:`func-interface`. .. attribute:: function -- GitLab From 37d34a128183d7cba4fb8cfee0a1f62b118212f2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 14:27:07 -0500 Subject: [PATCH 789/916] modernize fortran examples - parse_fortran returns a Program instead of a list of kernels --- examples/fortran/matmul-driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fortran/matmul-driver.py b/examples/fortran/matmul-driver.py index 111ac2411..499bc9b71 100644 --- a/examples/fortran/matmul-driver.py +++ b/examples/fortran/matmul-driver.py @@ -11,7 +11,7 @@ def main(): with open(fn) as inf: source = inf.read() - dgemm, = lp.parse_transformed_fortran(source, filename=fn) + dgemm = lp.parse_transformed_fortran(source, filename=fn) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) -- GitLab From 67e4241ca7df26e0dcd7be380363cc091268cd96 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Apr 2021 12:50:57 -0500 Subject: [PATCH 790/916] Add Program.default_entrypoint --- loopy/program.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/loopy/program.py b/loopy/program.py index a975ec000..09b13ffd5 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -159,6 +159,12 @@ class Program(ImmutableRecord): A :class:`frozenset` of the names of the kernels which could be called from the host. + .. attribute:: default_entrypoint + + The :class:`~loopy.LoopKernel` representing the main entrypoint + of the program, if defined. 
Currently, this attribute may only be + accessed if there is exactly one entrypoint in the program. + .. attribute:: callables_table An instance of :class:`pyrsistent.PMap` mapping the function @@ -302,21 +308,33 @@ class Program(ImmutableRecord): else: return result + @property + def default_entrypoint(self): + if len(self.entrypoints) == 1: + entrypoint, = self.entrypoints + return self[entrypoint] + else: + raise ValueError("Program has multiple possible entrypoints. The " + "default entry point kernel is not uniquely determined.") + def __call__(self, *args, **kwargs): entrypoint = kwargs.get("entrypoint", None) if entrypoint is None: # did not receive an entrypoint for the program to execute if len(self.entrypoints) == 1: - entrypoint, = list(self.entrypoints) + entrypoint, = self.entrypoints else: raise TypeError("Program.__call__() missing 1 required" - " keyword argument: 'entrypoint'") + " keyword argument: 'entrypoint'. " + "(Multiple possible entrypoints are present in the " + "program.)") if entrypoint not in self.entrypoints: - raise LoopyError("'{}' not in list possible entrypoints supplied to" - " the program. Maybe you want to invoke 'with_entrypoints'" - " before calling the program.".format(entrypoint)) + raise LoopyError(f"'{entrypoint}' not in list of possible entrypoints " + "for the program. " + "Maybe you want to invoke 'with_entrypoints' before " + "calling the program?") kwargs["entrypoint"] = entrypoint -- GitLab From 154212401a7c9f2f221e6ce7c78b76bc4d7115ea Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 19 Apr 2021 12:56:11 -0500 Subject: [PATCH 791/916] Clarify C target kernel executor interface --- loopy/target/c/__init__.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index a45965c80..c3c874110 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -423,10 +423,10 @@ class CFamilyTarget(TargetBase): return self.get_dtype_registry().dtype_to_ctype(dtype) def get_kernel_executor_cache_key(self, *args, **kwargs): - return None # TODO: ??? + raise NotImplementedError def get_kernel_executor(self, knl, *args, **kwargs): - raise NotImplementedError() + raise NotImplementedError # }}} @@ -1205,9 +1205,14 @@ class ExecutableCTarget(CTarget): from loopy.target.c.c_execution import CCompiler self.compiler = compiler or CCompiler() - def get_kernel_executor(self, knl, *args, **kwargs): + def get_kernel_executor_cache_key(self, *args, **kwargs): + # This is for things like the context in OpenCL. There is no such + # thing that CPU JIT is specific to. 
+ return None + + def get_kernel_executor(self, prg, *args, **kwargs): from loopy.target.c.c_execution import CKernelExecutor - return CKernelExecutor(knl, entrypoint=kwargs.pop("entrypoint"), + return CKernelExecutor(prg, entrypoint=kwargs.pop("entrypoint"), compiler=self.compiler) def get_host_ast_builder(self): -- GitLab From f46729dcc961becb752b2996a4898220cce870b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 19 Apr 2021 17:38:48 -0500 Subject: [PATCH 792/916] Add autopush for kernel_callables_v3-edit2 --- .github/workflows/autopush.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/autopush.yml b/.github/workflows/autopush.yml index f89b08ac5..f7e229731 100644 --- a/.github/workflows/autopush.yml +++ b/.github/workflows/autopush.yml @@ -3,6 +3,7 @@ on: push: branches: - main + - kernel_callables_v3-edit2 jobs: autopush: @@ -14,7 +15,9 @@ jobs: mkdir ~/.ssh && echo -e "Host gitlab.tiker.net\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config eval $(ssh-agent) && echo "$GITLAB_AUTOPUSH_KEY" | ssh-add - git fetch --unshallow - git push "git@gitlab.tiker.net:inducer/$(basename $GITHUB_REPOSITORY).git" main + TGT_BRANCH="${GITHUB_REF#refs/heads/}" + echo "pushing to $TGT_BRANCH..." + git push "git@gitlab.tiker.net:inducer/$(basename $GITHUB_REPOSITORY).git" "$TGT_BRANCH" env: GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }} -- GitLab From 0bd17a473d19b6e63620e2c2c79e2f2d1e9cb6c8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 22:15:43 -0500 Subject: [PATCH 793/916] do not perform inplace updates to implemented_data_infos --- loopy/codegen/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index d458e9654..0884bad32 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -770,7 +770,8 @@ def generate_code(kernel, device=None): raise LoopyError("kernel passed to generate_code yielded multiple " "host programs. Use generate_code_v2.") - _, implemented_data_info = codegen_result.implemented_data_infos.popitem() + assert len(codegen_result.implemented_data_infos) == 1 + implemented_data_info, = codegen_result.implemented_data_infos.values() return codegen_result.device_code(), implemented_data_info -- GitLab From b25fb8146c33d65b7e2fd3c96b73534a571bf657 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 22:16:26 -0500 Subject: [PATCH 794/916] separate TranslationUnitCodeGenResult and LoopKernelCodeGenResult --- loopy/codegen/__init__.py | 74 +++++++++++++++++++++++++++++++++++---- loopy/codegen/control.py | 3 +- loopy/codegen/result.py | 48 +++++++------------------ 3 files changed, 80 insertions(+), 45 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 0884bad32..68ee6c808 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -25,7 +25,6 @@ logger = logging.getLogger(__name__) import islpy as isl -from collections import OrderedDict from loopy.diagnostic import LoopyError, warn from pytools import ImmutableRecord @@ -54,6 +53,8 @@ __doc__ = """ .. autoclass:: CodeGenerationState +.. autoclass:: TranslationUnitCodeGenerationResult + .. automodule:: loopy.codegen.result .. automodule:: loopy.codegen.tools @@ -639,6 +640,67 @@ def diverge_callee_entrypoints(program): return program.copy(callables_table=new_callables) +class TranslationUnitCodeGenerationResult(ImmutableRecord): + """ + .. 
attribute:: host_program + + A mapping from names of entrypoints to their host + :class:`~loopy.codegen.result.GeneratedProgram`. + + .. attribute:: device_programs + + A list of :class:`~loopy.codegen.result.GeneratedProgram` instances + intended to run on the compute device. + + .. attribute:: host_preambles + .. attribute:: device_preambles + + .. attribute:: implemented_data_infos + + A mapping from names of entrypoints to their + list of :class:`ImplementedDataInfo` objects. + + .. automethod:: host_code + .. automethod:: device_code + .. automethod:: all_code + + """ + def host_code(self): + from loopy.codegen.result import process_preambles + preamble_codes = process_preambles(getattr(self, "host_preambles", [])) + + return ( + "".join(preamble_codes) + + "\n" + + "\n\n".join(str(hp.ast) + for hp in self.host_programs.values())) + + def device_code(self): + from loopy.codegen.result import process_preambles + preamble_codes = process_preambles(getattr(self, "device_preambles", [])) + + return ( + "".join(preamble_codes) + + "\n" + + "\n\n".join(str(dp.ast) for dp in self.device_programs)) + + def all_code(self): + from loopy.codegen.result import process_preambles + preamble_codes = process_preambles( + getattr(self, "host_preambles", []) + + + getattr(self, "device_preambles", []) + ) + + return ( + "".join(preamble_codes) + + "\n" + + "\n\n".join(str(dp.ast) for dp in self.device_programs) + + "\n\n" + + "\n\n".join(str(hp.ast) for hp in + self.host_programs.values())) + + @memoize_method def generate_code_v2(program): """ @@ -649,7 +711,6 @@ def generate_code_v2(program): from loopy.kernel import LoopKernel from loopy.program import make_program - from loopy.codegen.result import CodeGenerationResult # {{{ cache retrieval @@ -702,11 +763,11 @@ def generate_code_v2(program): program = diverge_callee_entrypoints(program) - host_programs = OrderedDict() + host_programs = {} device_programs = [] device_preambles = [] callee_fdecls = [] - implemented_data_infos = OrderedDict() + implemented_data_infos = {} for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): @@ -718,8 +779,7 @@ def generate_code_v2(program): program.callables_table, program.target, func_id in program.entrypoints) if func_id in program.entrypoints: - assert len(cgr.host_programs) == 1 - host_programs[func_id] = cgr.host_programs[func_id] + host_programs[func_id] = cgr.host_program implemented_data_infos[func_id] = cgr.implemented_data_info else: # FIXME: This assertion should be valid @@ -743,7 +803,7 @@ def generate_code_v2(program): ast=program.target.get_device_ast_builder().ast_module.Collection( callee_fdecls+[device_programs[0].ast]))] + device_programs[1:]) - cgr = CodeGenerationResult( + cgr = TranslationUnitCodeGenerationResult( host_programs=host_programs, device_programs=device_programs, device_preambles=device_preambles, diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index f65c39742..bf74f4789 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -23,7 +23,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" -from collections import OrderedDict from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl from loopy.schedule import ( @@ -154,7 +153,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): if sched_item.synchronization_kind in ["global", "local"]: # host code is assumed globally and locally synchronous return CodeGenerationResult( - host_programs=OrderedDict(), + host_program=None, device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 358088922..620430f93 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -21,7 +21,6 @@ THE SOFTWARE. """ from pytools import ImmutableRecord -from collections import OrderedDict def process_preambles(preambles): @@ -77,11 +76,7 @@ class GeneratedProgram(ImmutableRecord): class CodeGenerationResult(ImmutableRecord): """ - .. attribute:: host_programs - - A mapping from entrypoints of a translation unit to instances of - :class:`GeneratedProgram` intended to be run on host. - + .. attribute:: host_program .. attribute:: device_programs A list of :class:`GeneratedProgram` instances @@ -102,7 +97,7 @@ class CodeGenerationResult(ImmutableRecord): .. attribute:: implemented_data_info a list of :class:`loopy.codegen.ImplementedDataInfo` objects. - Only added at the very end of code generation + Only added at the very end of code generation. """ @staticmethod @@ -114,12 +109,12 @@ class CodeGenerationResult(ImmutableRecord): if codegen_state.is_generating_device_code: kwargs = { + "host_program": None, "device_programs": [prg], - "host_programs": OrderedDict() } else: kwargs = { - "host_programs": OrderedDict({codegen_state.kernel.name: prg}), + "host_program": prg, "device_programs": [], } @@ -133,9 +128,8 @@ class CodeGenerationResult(ImmutableRecord): return ( "".join(preamble_codes) - + "\n" - + "\n\n".join(str(hp.ast) - for hp in self.host_programs.values())) + + + str(self.host_program.ast)) def device_code(self): preamble_codes = process_preambles(getattr(self, "device_preambles", [])) @@ -157,8 +151,7 @@ class CodeGenerationResult(ImmutableRecord): + "\n" + "\n\n".join(str(dp.ast) for dp in self.device_programs) + "\n\n" - + "\n\n".join(str(hp.ast) for hp in - self.host_programs.values())) + + str(self.host_program.ast)) def current_program(self, codegen_state): if codegen_state.is_generating_device_code: @@ -167,11 +160,7 @@ class CodeGenerationResult(ImmutableRecord): else: result = None else: - if self.host_programs: - host_programs = self.host_programs.copy() - _, result = host_programs.popitem() - else: - result = None + result = self.host_program if result is None: ast = codegen_state.ast_builder.ast_block_class([]) @@ -195,15 +184,7 @@ class CodeGenerationResult(ImmutableRecord): else: assert program.name == codegen_state.gen_program_name assert not program.is_device_program - host_programs = self.host_programs.copy() - if host_programs: - e, _ = host_programs.popitem() - assert codegen_state.kernel.name == e - host_programs[e] = program - else: - host_programs[codegen_state.kernel.name] = program - return self.copy( - host_programs=host_programs) + return self.copy(host_program=program) def current_ast(self, codegen_state): return self.current_program(codegen_state).ast @@ -224,7 +205,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): if not elements: return CodeGenerationResult( - host_programs=OrderedDict(), + host_program=None, 
device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) @@ -321,8 +302,8 @@ def generate_host_or_device_program(codegen_state, schedule_index): else: codegen_result = build_loop_nest(codegen_state, schedule_index) - if (codegen_state.is_generating_device_code) or ( - codegen_state.is_entrypoint): + if (codegen_state.is_generating_device_code + or codegen_state.is_entrypoint): codegen_result = merge_codegen_results( codegen_state, ast_builder.generate_top_of_body(codegen_state) @@ -344,12 +325,7 @@ def generate_host_or_device_program(codegen_state, schedule_index): cur_prog.copy( ast=ast_builder.process_ast(fdef_ast), body_ast=ast_builder.process_ast(body_ast))) - else: - codegen_result = codegen_result.copy( - host_programs=OrderedDict()) return codegen_result # }}} - -# vim: foldmethod=marker -- GitLab From 879cda874c5210ec18c3780aa5a981194b9a6509 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 22:38:15 -0500 Subject: [PATCH 795/916] test_fortran: keep cl_context alive --- test/test_fortran.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/test/test_fortran.py b/test/test_fortran.py index f596acbf5..65126cdf3 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -73,6 +73,9 @@ def test_fp_prec_comparison(): def test_assign_double_precision_scalar(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + fortran_src = """ subroutine assign_scalar(a) real*8 a(1) @@ -84,7 +87,6 @@ def test_assign_double_precision_scalar(ctx_factory): prg = lp.parse_fortran(fortran_src) print(lp.generate_code_v2(prg).device_code()) assert "1.1;" in lp.generate_code_v2(prg).device_code() - queue = cl.CommandQueue(ctx_factory()) a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F") prg(queue, a=a_dev) @@ -94,6 +96,9 @@ def test_assign_double_precision_scalar(ctx_factory): def test_assign_double_precision_scalar_as_rational(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + fortran_src = """ subroutine assign_scalar(a) real*8 a(1) @@ -104,7 +109,6 @@ def test_assign_double_precision_scalar_as_rational(ctx_factory): """ prg = lp.parse_fortran(fortran_src) - queue = cl.CommandQueue(ctx_factory()) a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F") prg(queue, a=a_dev) @@ -114,6 +118,9 @@ def test_assign_double_precision_scalar_as_rational(ctx_factory): def test_assign_single_precision_scalar(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + fortran_src = """ subroutine assign_scalar(a) real*8 a(1) @@ -124,7 +131,6 @@ def test_assign_single_precision_scalar(ctx_factory): prg = lp.parse_fortran(fortran_src) assert "1.1f" in lp.generate_code_v2(prg).device_code() - queue = cl.CommandQueue(ctx_factory()) a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F") prg(queue, a=a_dev) -- GitLab From 7c9fea04c091ac0afcd9f478bb4c1257bbb1d702 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 22:43:31 -0500 Subject: [PATCH 796/916] change_arg_to_image: iterate over kernels if given program --- loopy/transform/data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 185af24c4..10344da0e 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -395,6 +395,7 @@ def add_prefetch(program, *args, **kwargs): # {{{ change variable kinds +@iterate_over_kernels_if_given_program def change_arg_to_image(kernel, name): new_args = [] for arg in kernel.args: -- GitLab From 
3aa4d5f51acd6c347f7a75e1ffdf9d8858c35991 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 19 Apr 2021 23:18:07 -0500 Subject: [PATCH 797/916] set ImageArg's address space, direction --- loopy/kernel/data.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 377d13e61..5c26a3f19 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -475,9 +475,20 @@ class ConstantArg(ArrayBase, KernelArgument): class ImageArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ + + def __init__(self, *args, **kwargs): + if kwargs.pop("address_space", AddressSpace.GLOBAL) != AddressSpace.GLOBAL: + raise LoopyError("'address_space' for ImageArg must be GLOBAL.") + super().__init__(*args, **kwargs) + min_target_axes = 1 max_target_axes = 3 + # Image Arg cannot be an output + is_output = False + is_input = True + address_space = AddressSpace.GLOBAL + @property def dimensions(self): return len(self.dim_tags) -- GitLab From 1d06a7422eb06d872fb9f3d3e02dbba98d26c520 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Apr 2021 00:53:38 -0500 Subject: [PATCH 798/916] test mangled symbols in call exprs --- loopy/kernel/creation.py | 2 +- test/library_for_test.py | 10 ++++++++++ test/test_callables.py | 21 +++++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 8a2e9cde1..3fec7f758 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2028,7 +2028,7 @@ class SliceToInameReplacer(IdentityMapper): array_arg_shape = ( self.knl.arg_dict[arg.name].shape) else: - assert arg.name in self.knl.all_inames() + # arg could be either an iname or a "mangled symbol" array_arg_shape = () if array_arg_shape != (): diff --git a/test/library_for_test.py b/test/library_for_test.py index cfaacdc0e..5f83a22aa 100644 --- a/test/library_for_test.py +++ b/test/library_for_test.py @@ -59,3 +59,13 @@ class SingleArgNoRetFunction(lp.ScalarCallable): printf("Hi!\n"); } """) + + +def symbol_x(knl, name): + if name == "X": + from loopy.types import to_loopy_type + return to_loopy_type(np.float32), "X" + + +def preamble_for_x(preamble_info): + yield("preamble_ten", r"#define X 10.0") diff --git a/test/test_callables.py b/test/test_callables.py index 9545cdf53..52f7b1d04 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -761,6 +761,27 @@ def test_passing_and_getting_scalar_in_clbl_knl(ctx_factory, inline): evt, (out,) = knl(cq, real_x=np.asarray(3.0, dtype=float)) +def test_symbol_mangler_in_call(ctx_factory): + from library_for_test import (symbol_x, + preamble_for_x) + ctx = cl.create_some_context() + cq = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{:}", + """ + y = sin(X) + """, + [lp.GlobalArg("y", shape=lp.auto)]) + + knl = lp.register_symbol_manglers(knl, [symbol_x]) + + knl = lp.register_preamble_generators(knl, [preamble_for_x]) + + evt, (out,) = knl(cq) + np.testing.assert_allclose(out.get(), np.sin(10)) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 00f8845897b282259fa0623a9a37f9bbd9f8be7d Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 20 Apr 2021 09:25:04 +0200 Subject: [PATCH 799/916] Fix C99 preamble generation when Kernel Callables have args with OpaqueTypes. 
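The generator walks preamble_info.seen_dtypes and previously evaluated
dtype.numpy_dtype == np.dtype("bool") for every entry. When a kernel
callable has an argument whose type is an OpaqueType, that entry does not
provide a numpy_dtype, so preamble generation failed; the isinstance()
guard added below restricts the comparison to NumpyType entries.

A minimal, runnable sketch of the failure mode and of the guard, using
stand-in classes (SeenNumpyType and SeenOpaqueType are illustrations only,
not loopy's actual NumpyType/OpaqueType):

    import numpy as np

    class SeenNumpyType:
        # stand-in: carries a numpy_dtype, like loopy's NumpyType
        def __init__(self, dt):
            self.numpy_dtype = np.dtype(dt)

    class SeenOpaqueType:
        # stand-in: an opaque type with no numpy_dtype attribute
        pass

    seen_dtypes = [SeenOpaqueType(), SeenNumpyType(np.bool_)]

    # Without the isinstance() filter, the first (opaque) entry raises
    # AttributeError; with it, only NumpyType entries are compared.
    needs_stdbool = any(dt.numpy_dtype == np.dtype("bool")
                        for dt in seen_dtypes
                        if isinstance(dt, SeenNumpyType))
    print(needs_stdbool)  # True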
--- loopy/target/c/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index c3c874110..54c9e6f0a 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -93,7 +93,8 @@ def c99_preamble_generator(preamble_info): if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes): yield("10_stdint", "#include <stdint.h>") if any(dtype.numpy_dtype == np.dtype("bool") - for dtype in preamble_info.seen_dtypes): + for dtype in preamble_info.seen_dtypes + if isinstance(dtype, NumpyType)): yield("10_stdbool", "#include <stdbool.h>") if any(dtype.is_complex() for dtype in preamble_info.seen_dtypes): yield("10_complex", "#include <complex.h>") -- GitLab From 9287802f59c336d48c3c25c57ed1de7d11ac52cf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Apr 2021 09:02:24 -0500 Subject: [PATCH 800/916] test _match_caller_callee_argument_dimension_ and fix the argument shapes --- loopy/transform/callable.py | 9 ++++++++- test/test_callables.py | 38 ++++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index a5c4c5284..c25535e02 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -559,6 +559,8 @@ def _match_caller_callee_argument_dimension_for_single_kernel( *callee_function_name* in the *caller_knl* aligned with the argument dimensions required by *caller_knl*. """ + from loopy.kernel.array import ArrayBase + for insn in caller_knl.instructions: if not isinstance(insn, CallInstruction) or ( insn.expression.function.name != @@ -606,8 +608,13 @@ def _match_caller_callee_argument_dimension_for_single_kernel( raise NotImplementedError("Unknown instruction %s."
% type(insn)) + new_args = [arg if not isinstance(arg, ArrayBase) + else arg.copy(shape=arg_id_to_shape[arg.name], dim_tags=None) + for arg in callee_knl.args] + # subkernel with instructions adjusted according to the new dimensions - new_callee_knl = callee_knl.copy(instructions=new_callee_insns) + new_callee_knl = callee_knl.copy(instructions=new_callee_insns, + args=new_args) return new_callee_knl diff --git a/test/test_callables.py b/test/test_callables.py index 9545cdf53..295989308 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -391,46 +391,52 @@ def test_packing_unpacking(ctx_factory, inline): 3*x2.get()) < 1e-15 -def test_non_sub_array_refs_arguments(ctx_factory): +@pytest.mark.parametrize("inline", [False, True]) +def test_non_sub_array_refs_arguments(ctx_factory, inline): from loopy.transform.callable import _match_caller_callee_argument_dimension_ + ctx = ctx_factory() callee = lp.make_function("{[i] : 0 <= i < 6}", "a[i] = a[i] + j", [lp.GlobalArg("a", dtype="double", shape=(6,), is_output=True, is_input=True), - lp.ValueArg("j", dtype="int")], name="callee", - target=lp.CTarget()) + lp.ValueArg("j", dtype="int")], name="callee") caller1 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:] = callee(a[:], b[0])", [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output=False), lp.GlobalArg("b", dtype="double", shape=(1, ), is_output=False)], - name="caller", target=lp.CTarget()) + name="caller") caller2 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], 3.1415926)", [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output=False)], - name="caller", target=lp.CTarget()) + name="caller") caller3 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], kappa)", [lp.GlobalArg("a", dtype="double", shape=(6, ), - is_output=False), ...], - name="caller", target=lp.CTarget()) + is_output=False), + lp.ValueArg("kappa", dtype=np.float64), ...], + name="caller") registered = lp.merge([caller1, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, "callee") - inlined = lp.inline_callable_kernel(inlined, "callee") + knl = _match_caller_callee_argument_dimension_(registered, "callee") + + if inline: + knl = lp.inline_callable_kernel(knl, "callee") - print(inlined) + lp.auto_test_vs_ref(knl, ctx) registered = lp.merge([caller2, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, "callee") - inlined = lp.inline_callable_kernel(inlined, "callee") + knl = _match_caller_callee_argument_dimension_(registered, "callee") + if inline: + knl = lp.inline_callable_kernel(knl, "callee") - print(inlined) + lp.auto_test_vs_ref(knl, ctx) registered = lp.merge([caller3, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, "callee") - inlined = lp.inline_callable_kernel(inlined, "callee") + knl = _match_caller_callee_argument_dimension_(registered, "callee") + if inline: + knl = lp.inline_callable_kernel(knl, "callee") - print(inlined) + lp.auto_test_vs_ref(knl, ctx, parameters={"kappa": 42.0}) @pytest.mark.parametrize("inline", [False, True]) -- GitLab From 89eb35fbd10c341e256683860c3a33c3193ce753 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Tue, 20 Apr 2021 09:33:10 -0500 Subject: [PATCH 801/916] Tweak ImageArg comment --- loopy/kernel/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 5c26a3f19..fe165b0f2 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -484,7 +484,7 @@ class ImageArg(ArrayBase, 
KernelArgument): min_target_axes = 1 max_target_axes = 3 - # Image Arg cannot be an output + # ImageArg cannot be an output (for now) is_output = False is_input = True address_space = AddressSpace.GLOBAL -- GitLab From e1f669ed196d5d2e0f76f68401a8772b2c8a43a9 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Mon, 11 May 2020 13:19:24 +0100 Subject: [PATCH 802/916] Error: CallInstruction and Assignment both derive from MultiAssignmentBase, but one of them has assignee as an attribute and the other assignees. --- loopy/transform/callable.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index a5c4c5284..bf1997e96 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -595,9 +595,14 @@ def _match_caller_callee_argument_dimension_for_single_kernel( new_callee_insns = [] for callee_insn in callee_knl.instructions: if isinstance(callee_insn, MultiAssignmentBase): - new_callee_insns.append(callee_insn.copy(expression=dim_changer( - callee_insn.expression), - assignee=dim_changer(callee_insn.assignee))) + if isinstance(callee_insn, CallInstruction): + new_callee_insns.append(callee_insn.copy(expression=dim_changer( + callee_insn.expression), + assignees=dim_changer(callee_insn.assignees))) + else: + new_callee_insns.append(callee_insn.copy(expression=dim_changer( + callee_insn.expression), + assignee=dim_changer(callee_insn.assignee))) elif isinstance(callee_insn, (CInstruction, _DataObliviousInstruction)): -- GitLab From 8b42858a1e400f369758350162d43dfbf03f5e01 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Apr 2021 09:02:24 -0500 Subject: [PATCH 803/916] test _match_caller_callee_argument_dimension_ and fix the argument shapes --- loopy/transform/callable.py | 9 ++++++++- test/test_callables.py | 38 ++++++++++++++++++++++---------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index bf1997e96..a140a1c54 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -559,6 +559,8 @@ def _match_caller_callee_argument_dimension_for_single_kernel( *callee_function_name* in the *caller_knl* aligned with the argument dimensions required by *caller_knl*. """ + from loopy.kernel.array import ArrayBase + for insn in caller_knl.instructions: if not isinstance(insn, CallInstruction) or ( insn.expression.function.name != @@ -611,8 +613,13 @@ def _match_caller_callee_argument_dimension_for_single_kernel( raise NotImplementedError("Unknown instruction %s."
% type(insn)) + new_args = [arg if not isinstance(arg, ArrayBase) + else arg.copy(shape=arg_id_to_shape[arg.name], dim_tags=None) + for arg in callee_knl.args] + # subkernel with instructions adjusted according to the new dimensions - new_callee_knl = callee_knl.copy(instructions=new_callee_insns) + new_callee_knl = callee_knl.copy(instructions=new_callee_insns, + args=new_args) return new_callee_knl diff --git a/test/test_callables.py b/test/test_callables.py index 52f7b1d04..146bf3172 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -391,46 +391,52 @@ def test_packing_unpacking(ctx_factory, inline): 3*x2.get()) < 1e-15 -def test_non_sub_array_refs_arguments(ctx_factory): +@pytest.mark.parametrize("inline", [False, True]) +def test_non_sub_array_refs_arguments(ctx_factory, inline): from loopy.transform.callable import _match_caller_callee_argument_dimension_ + ctx = ctx_factory() callee = lp.make_function("{[i] : 0 <= i < 6}", "a[i] = a[i] + j", [lp.GlobalArg("a", dtype="double", shape=(6,), is_output=True, is_input=True), - lp.ValueArg("j", dtype="int")], name="callee", - target=lp.CTarget()) + lp.ValueArg("j", dtype="int")], name="callee") caller1 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:] = callee(a[:], b[0])", [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output=False), lp.GlobalArg("b", dtype="double", shape=(1, ), is_output=False)], - name="caller", target=lp.CTarget()) + name="caller") caller2 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], 3.1415926)", [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output=False)], - name="caller", target=lp.CTarget()) + name="caller") caller3 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], kappa)", [lp.GlobalArg("a", dtype="double", shape=(6, ), - is_output=False), ...], - name="caller", target=lp.CTarget()) + is_output=False), + lp.ValueArg("kappa", dtype=np.float64), ...], + name="caller") registered = lp.merge([caller1, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, "callee") - inlined = lp.inline_callable_kernel(inlined, "callee") + knl = _match_caller_callee_argument_dimension_(registered, "callee") + + if inline: + knl = lp.inline_callable_kernel(knl, "callee") - print(inlined) + lp.auto_test_vs_ref(knl, ctx) registered = lp.merge([caller2, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, "callee") - inlined = lp.inline_callable_kernel(inlined, "callee") + knl = _match_caller_callee_argument_dimension_(registered, "callee") + if inline: + knl = lp.inline_callable_kernel(knl, "callee") - print(inlined) + lp.auto_test_vs_ref(knl, ctx) registered = lp.merge([caller3, callee]) - inlined = _match_caller_callee_argument_dimension_(registered, "callee") - inlined = lp.inline_callable_kernel(inlined, "callee") + knl = _match_caller_callee_argument_dimension_(registered, "callee") + if inline: + knl = lp.inline_callable_kernel(knl, "callee") - print(inlined) + lp.auto_test_vs_ref(knl, ctx, parameters={"kappa": 42.0}) @pytest.mark.parametrize("inline", [False, True]) -- GitLab From 1fb869a7b81613d9e0d5a141598e554da25f09df Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 00:55:19 -0500 Subject: [PATCH 804/916] ctarget: support int (max|min) --- loopy/target/c/__init__.py | 43 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 54c9e6f0a..ab6320a2c 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -568,7 +568,32 
@@ class CMathCallable(ScalarCallable): self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), callables_table) + elif name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + if dtype.kind not in "iu": + raise LoopyError(f"{name} does not support '{dtype}' arguments.") + + return ( + self.copy(name_in_target=f"lpy_{name}_{dtype.name}", + arg_id_to_dtype={-1: NumpyType(dtype), + 0: NumpyType(dtype), + 1: NumpyType(dtype)}), + callables_table) elif name == "isnan": for id in arg_id_to_dtype: if not -1 <= id <= 0: @@ -590,6 +615,24 @@ class CMathCallable(ScalarCallable): -1: NumpyType(np.int32)}), callables_table) + def generate_preambles(self, target): + if self.name_in_target.startswith("lpy_max"): + dtype = self.arg_id_to_dtype[-1] + ctype = target.dtype_to_typename(dtype) + + yield ("40_lpy_max", f""" + static inline {ctype} {self.name_in_target}({ctype} a, {ctype} b) {{ + return (a > b ? a : b); + }}""") + + if self.name_in_target.startswith("lpy_min"): + dtype = self.arg_id_to_dtype[-1] + ctype = target.dtype_to_typename(dtype) + yield ("40_lpy_min", f""" + static inline {ctype} {self.name_in_target}({ctype} a, {ctype} b) {{ + return (a < b ? a : b); + }}""") + def get_c_callables(): """ -- GitLab From beb6d2210e7eedf4d4f520b3a573c8491cf92c02 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 20 Apr 2021 09:02:24 -0500 Subject: [PATCH 805/916] test _match_caller_callee_argument_dimension_ and fix the argument shapes -- GitLab From d5c494035210c47ec9acb9590400bb8d29fcd5c6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 08:39:59 -0500 Subject: [PATCH 806/916] removes a restrictive type inference criterion * If need be: should be handled by callables --- loopy/target/pyopencl.py | 2 +- loopy/type_inference.py | 43 +++------------------------------------- 2 files changed, 4 insertions(+), 41 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 70663b2da..29a59b273 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -257,7 +257,7 @@ class PyOpenCLCallable(ScalarCallable): # function calls for floating parameters. numpy_dtype = dtype.numpy_dtype if numpy_dtype.kind in ("u", "i"): - dtype = dtype.copy(numpy_dtype=np.float32) + dtype = NumpyType(np.float32) if name == "abs": name = "fabs" return ( diff --git a/loopy/type_inference.py b/loopy/type_inference.py index ee1ddf33d..bee6db0bc 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -437,46 +437,9 @@ class TypeInferenceMapper(CombineMapper): # specializing the known function wrt type in_knl_callable = self.clbl_inf_ctx[expr.function.name] - # {{{ checking that there is no overwriting of types of in_knl_callable - - if in_knl_callable.arg_id_to_dtype is not None: - - # specializing an already specialized function. 
- for id, dtype in arg_id_to_dtype.items(): - if id in in_knl_callable.arg_id_to_dtype and ( - in_knl_callable.arg_id_to_dtype[id] != - arg_id_to_dtype[id]): - - # {{{ ignoring the the cases when there is a discrepancy - # between np.uint and np.int - - import numpy as np - if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( - np.uint32) and ( - arg_id_to_dtype[id].dtype.type == np.int32): - continue - if in_knl_callable.arg_id_to_dtype[id].dtype.type == ( - np.uint64) and ( - arg_id_to_dtype[id].dtype.type == - np.int64): - continue - - if np.can_cast(arg_id_to_dtype[id].dtype.type, - in_knl_callable.arg_id_to_dtype[id].dtype.type): - continue - - # }}} - - raise LoopyError("Overwriting a specialized function " - "is illegal--maybe start with new instance of " - "InKernelCallable?") - - # }}} - - in_knl_callable, self.clbl_inf_ctx = ( - in_knl_callable.with_types( - arg_id_to_dtype, - self.clbl_inf_ctx)) + in_knl_callable, self.clbl_inf_ctx = (in_knl_callable + .with_types(arg_id_to_dtype, + self.clbl_inf_ctx)) in_knl_callable = in_knl_callable.with_target(self.kernel.target) -- GitLab From fa034c80b701b3df54bde4e4f4d77a368fb132f4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 09:13:33 -0500 Subject: [PATCH 807/916] ctarget: add support for conj --- loopy/target/c/__init__.py | 8 +++++--- test/test_expression.py | 10 +++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 54c9e6f0a..0c0c15ddb 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -483,7 +483,7 @@ class CMathCallable(ScalarCallable): # unary functions if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", - "erf", "erfc", "abs", "real", "imag"]: + "erf", "erfc", "abs", "real", "imag", "conj"]: for id in arg_id_to_dtype: if not -1 <= id <= 0: @@ -516,7 +516,8 @@ class CMathCallable(ScalarCallable): dtype)) if dtype.kind == "c": - name = "c" + name + if name != "conj": + name = "c" + name if name in ["abs", "real", "imag"]: dtype = real_dtype @@ -599,7 +600,8 @@ def get_c_callables(): cmath_ids = ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", - "fabs", "tan", "erf", "erfc", "isnan", "real", "imag"] + "fabs", "tan", "erf", "erfc", "isnan", "real", "imag", + "conj"] return {id_: CMathCallable(id_) for id_ in cmath_ids} diff --git a/test/test_expression.py b/test/test_expression.py index 89b25e530..9e997422e 100644 --- a/test/test_expression.py +++ b/test/test_expression.py @@ -504,9 +504,11 @@ def test_complex_support(ctx_factory, target): complex_div_complex[i] = (2jf + 7*in1[i])/(32jf + 37*in1[i]) complex_div_real[i] = (2jf + 7*in1[i])/in1[i] real_div_complex[i] = in1[i]/(2jf + 7*in1[i]) - tmp_sum[0] = sum(i1, 1.0*i1 + i1*1jf)*sum(i2, 1.0*i2 + i2*1jf) + out_sum = sum(i1, 1.0*i1 + i1*1jf)*sum(i2, 1.0*i2 + i2*1jf) + conj_out_sum = conj(out_sum) """, - target=target()) + target=target(), + seq_dependencies=True) knl = lp.set_options(knl, "return_dict") n = 10 @@ -534,7 +536,9 @@ def test_complex_support(ctx_factory, target): np.testing.assert_allclose(out["complex_div_complex"], (2j+7*in1)/(32j+37*in1)) np.testing.assert_allclose(out["complex_div_real"], (2j + 7*in1)/in1) np.testing.assert_allclose(out["real_div_complex"], in1/(2j + 7*in1)) - np.testing.assert_allclose(out["tmp_sum"], (0.5*n*(n-1) + 
0.5*n*(n-1)*1j) ** 2) + np.testing.assert_allclose(out["out_sum"], (0.5*n*(n-1) + 0.5*n*(n-1)*1j) ** 2) + np.testing.assert_allclose(out["conj_out_sum"], + (0.5*n*(n-1) - 0.5*n*(n-1)*1j) ** 2) def test_bool_type_context(ctx_factory): -- GitLab From 455c30c880424f01cb25f0fcdd3575fe255a3459 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 09:47:04 -0500 Subject: [PATCH 808/916] append strides=auto with dim_tags=None --- loopy/transform/callable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index c25535e02..5f0ad6e00 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -560,6 +560,7 @@ def _match_caller_callee_argument_dimension_for_single_kernel( dimensions required by *caller_knl*. """ from loopy.kernel.array import ArrayBase + from loopy.kernel.data import auto for insn in caller_knl.instructions: if not isinstance(insn, CallInstruction) or ( @@ -609,7 +610,8 @@ def _match_caller_callee_argument_dimension_for_single_kernel( type(insn)) new_args = [arg if not isinstance(arg, ArrayBase) - else arg.copy(shape=arg_id_to_shape[arg.name], dim_tags=None) + else arg.copy(shape=arg_id_to_shape[arg.name], + dim_tags=None, strides=auto) for arg in callee_knl.args] # subkernel with instructions adjusted according to the new dimensions -- GitLab From 5e08f4ca5c0617db907f6858a1b4daf4a66e9bbd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 13:25:07 -0500 Subject: [PATCH 809/916] must also pass the order with dim_tags=None and strides=auto to actually make ArrayBase compute the dim tags --- loopy/transform/callable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 5f0ad6e00..da9891879 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -611,7 +611,7 @@ def _match_caller_callee_argument_dimension_for_single_kernel( new_args = [arg if not isinstance(arg, ArrayBase) else arg.copy(shape=arg_id_to_shape[arg.name], - dim_tags=None, strides=auto) + dim_tags=None, strides=auto, order="C") for arg in callee_knl.args] # subkernel with instructions adjusted according to the new dimensions -- GitLab From 67bdae31a7c445198dbade6baf54c6d1ce4a80e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 21 Apr 2021 17:39:28 -0500 Subject: [PATCH 810/916] floating -> floating-point in pyopencl target --- loopy/target/pyopencl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 29a59b273..3123c2714 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -254,7 +254,7 @@ class PyOpenCLCallable(ScalarCallable): arg_id_to_dtype={0: dtype, -1: dtype}), callables_table) else: - # function calls for floating parameters. + # function calls for floating-point parameters. 
numpy_dtype = dtype.numpy_dtype if numpy_dtype.kind in ("u", "i"): dtype = NumpyType(np.float32) -- GitLab From ec51602f68ceea4124ae8ce3085fdd48dbdc77f7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <15399010+kaushikcfd@users.noreply.github.com> Date: Wed, 21 Apr 2021 20:37:58 -0500 Subject: [PATCH 811/916] CTarget: support max, min (#317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ctarget: support int (max|min) * test CTarget (max|min) * Tweak comments in integer max/min support Co-authored-by: Andreas Klöckner --- loopy/target/c/__init__.py | 44 ++++++++++++++++++++++++++++++++++++++ test/test_callables.py | 21 ++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 0c0c15ddb..3722bc1c8 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -569,7 +569,33 @@ class CMathCallable(ScalarCallable): self.copy(name_in_target=name, arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), callables_table) + elif name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't resolved enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + if dtype.kind not in "iu": + # only support integers for now to avoid having to deal with NaNs + raise LoopyError(f"{name} does not support '{dtype}' arguments.") + + return ( + self.copy(name_in_target=f"lpy_{name}_{dtype.name}", + arg_id_to_dtype={-1: NumpyType(dtype), + 0: NumpyType(dtype), + 1: NumpyType(dtype)}), + callables_table) elif name == "isnan": for id in arg_id_to_dtype: if not -1 <= id <= 0: @@ -591,6 +617,24 @@ class CMathCallable(ScalarCallable): -1: NumpyType(np.int32)}), callables_table) + def generate_preambles(self, target): + if self.name_in_target.startswith("lpy_max"): + dtype = self.arg_id_to_dtype[-1] + ctype = target.dtype_to_typename(dtype) + + yield ("40_lpy_max", f""" + static inline {ctype} {self.name_in_target}({ctype} a, {ctype} b) {{ + return (a > b ? a : b); + }}""") + + if self.name_in_target.startswith("lpy_min"): + dtype = self.arg_id_to_dtype[-1] + ctype = target.dtype_to_typename(dtype) + yield ("40_lpy_min", f""" + static inline {ctype} {self.name_in_target}({ctype} a, {ctype} b) {{ + return (a < b ? 
a : b); + }}""") + def get_c_callables(): """ diff --git a/test/test_callables.py b/test/test_callables.py index 146bf3172..3d3e3f427 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -788,6 +788,27 @@ def test_symbol_mangler_in_call(ctx_factory): np.testing.assert_allclose(out.get(), np.sin(10)) +@pytest.mark.parametrize("which", ["max", "min"]) +def test_int_max_min_c_target(ctx_factory, which): + from numpy.random import default_rng + from pymbolic import parse + rng = default_rng() + + n = 100 + arr1 = rng.integers(-100, 100, n) + arr2 = rng.integers(-100, 100, n) + np_func = getattr(np, f"{which}imum") + + knl = lp.make_kernel( + "{[i]: 0<=i<100}", + [lp.Assignment(parse("out[i]"), + parse(f"{which}(arr1[i], arr2[i])"))], + target=lp.ExecutableCTarget()) + + _, (out,) = knl(arr1=arr1, arr2=arr2) + np.testing.assert_allclose(np_func(arr1, arr2), out) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 57756410297a5f5808fda215848870916a4563ba Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 22 Apr 2021 14:42:49 +0200 Subject: [PATCH 812/916] Use with_transformed_expressions instead of copy --- loopy/transform/callable.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index a140a1c54..6cb10e14a 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -597,14 +597,8 @@ def _match_caller_callee_argument_dimension_for_single_kernel( new_callee_insns = [] for callee_insn in callee_knl.instructions: if isinstance(callee_insn, MultiAssignmentBase): - if isinstance(callee_insn, CallInstruction): - new_callee_insns.append(callee_insn.copy(expression=dim_changer( - callee_insn.expression), - assignees=dim_changer(callee_insn.assignees))) - else: - new_callee_insns.append(callee_insn.copy(expression=dim_changer( - callee_insn.expression), - assignee=dim_changer(callee_insn.assignee))) + new_callee_insns.append(callee_insn + .with_transformed_expressions(dim_changer)) elif isinstance(callee_insn, (CInstruction, _DataObliviousInstruction)): -- GitLab From 307ff25da70fb4116389751ad338340d20137557 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 01:32:22 -0500 Subject: [PATCH 813/916] account sub-array-refs' swept inames are used inames --- loopy/kernel/instruction.py | 14 ++++++++++++++ loopy/symbolic.py | 22 ++++++++++++++++++++++ loopy/transform/iname.py | 3 ++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 6b428ae93..7d4b9a50c 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -324,6 +324,9 @@ class InstructionBase(ImmutableRecord, Taggable): def reduction_inames(self): raise NotImplementedError + def sub_array_ref_inames(self): + raise NotImplementedError + def assignee_var_names(self): """Return a tuple of assignee variable names, one for each quantity being assigned to. 
@@ -808,6 +811,11 @@ class MultiAssignmentBase(InstructionBase): from loopy.symbolic import get_reduction_inames return get_reduction_inames(self.expression) + @memoize_method + def sub_array_ref_inames(self): + from loopy.symbolic import get_sub_array_ref_swept_inames + return get_sub_array_ref_swept_inames((self.assignees, self.expression)) + # }}} @@ -1401,6 +1409,9 @@ class CInstruction(InstructionBase): def reduction_inames(self): return set() + def sub_array_ref_inames(self): + return frozenset() + def assignee_var_names(self): return tuple(_get_assignee_var_name(expr) for expr in self.assignees) @@ -1448,6 +1459,9 @@ class _DataObliviousInstruction(InstructionBase): def reduction_inames(self): return frozenset() + def sub_array_ref_inames(self): + return frozenset() + def assignee_var_names(self): return frozenset() diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 63aa6a4d7..e810024e9 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -988,6 +988,28 @@ def get_reduction_inames(expr): return _get_dependencies_and_reduction_inames(expr)[1] +class SubArrayRefSweptInamesCollector(CombineMapper): + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_sub_array_ref(self, expr): + return frozenset({iname.name for iname in expr.swept_inames}) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + map_resolved_function = map_constant + + +def get_sub_array_ref_swept_inames(expr): + return SubArrayRefSweptInamesCollector()(expr) + + # {{{ rule-aware mappers def parse_tagged_name(expr): diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 324cec8e9..1686a87d9 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1204,7 +1204,8 @@ def get_used_inames(kernel): for insn in exp_kernel.instructions: used_inames.update( insn.within_inames - | insn.reduction_inames()) + | insn.reduction_inames() + | insn.sub_array_ref_inames()) return used_inames -- GitLab From d73d30ba5985d5434cdb153e39fc661b01dd7328 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 01:40:17 -0500 Subject: [PATCH 814/916] remove unused inames post inlining --- loopy/transform/callable.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index c60d26385..4afae7587 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -471,7 +471,10 @@ def _inline_single_callable_kernel(caller_kernel, callee_kernel, "Unknown instruction type %s" % type(insn).__name__) - return caller_kernel + from loopy.transform.iname import remove_unused_inames + # sub-array refs might have been removed during inlining + # => remove their swept inames from domains + return remove_unused_inames(caller_kernel) # FIXME This should take a 'within' parameter to be able to only inline -- GitLab From f2d41768b45209e50e3c75ec4cae62e3d27ae250 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 08:19:32 -0500 Subject: [PATCH 815/916] remove only sub-array-refs' inames --- loopy/transform/callable.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 4afae7587..54a694775 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -456,13 +456,19 @@ def 
_inline_call_instruction(caller_knl, callee_knl, call_insn): def _inline_single_callable_kernel(caller_kernel, callee_kernel, callables_table): + from loopy.symbolic import ResolvedFunction + + # sub-array refs might be removed during inlining + # => remove their swept inames from domains + inames_to_remove = frozenset() + for insn in caller_kernel.instructions: - if isinstance(insn, CallInstruction): - # FIXME This seems to use identifiers across namespaces. Why not - # check whether the function is a scoped function first? ~AK + if (isinstance(insn, CallInstruction) + and isinstance(insn.expression.function, ResolvedFunction)): if insn.expression.function.name == callee_kernel.name: caller_kernel = _inline_call_instruction( caller_kernel, callee_kernel, insn) + inames_to_remove |= insn.sub_array_ref_inames() elif isinstance(insn, (MultiAssignmentBase, CInstruction, _DataObliviousInstruction)): pass @@ -472,9 +478,7 @@ def _inline_single_callable_kernel(caller_kernel, callee_kernel, % type(insn).__name__) from loopy.transform.iname import remove_unused_inames - # sub-array refs might have been removed during inlining - # => remove their swept inames from domains - return remove_unused_inames(caller_kernel) + return remove_unused_inames(caller_kernel, inames_to_remove) # FIXME This should take a 'within' parameter to be able to only inline -- GitLab From 2de9ef3a9218d33a30ab33cdb361e818e25a6f5e Mon Sep 17 00:00:00 2001 From: Connor Ward Date: Thu, 22 Apr 2021 12:06:23 +0100 Subject: [PATCH 816/916] Add c prefix to real and imag --- loopy/target/c/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 3722bc1c8..128e085e3 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -515,13 +515,13 @@ class CMathCallable(ScalarCallable): raise LoopyTypeError("{} does not support type {}".format(name, dtype)) - if dtype.kind == "c": - if name != "conj": - name = "c" + name - if name in ["abs", "real", "imag"]: dtype = real_dtype + if dtype.kind == "c" or name in ["real", "imag"]: + if name != "conj": + name = "c" + name + return ( self.copy(name_in_target=name, arg_id_to_dtype={0: NumpyType(dtype), -1: -- GitLab From 945e89931a7c427799b0044dab30fcab3574850d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 15:03:41 -0500 Subject: [PATCH 817/916] Program -> TranslationUnit --- doc/ref_call.rst | 6 +-- doc/ref_program.rst | 4 +- loopy/__init__.py | 6 +-- loopy/cli.py | 4 +- loopy/codegen/__init__.py | 4 +- loopy/frontend/fortran/__init__.py | 2 +- loopy/kernel/__init__.py | 2 +- loopy/kernel/tools.py | 10 ++--- loopy/preprocess.py | 6 +-- loopy/program.py | 49 +++++++++++++++++-------- loopy/statistics.py | 4 +- loopy/symbolic.py | 4 +- loopy/transform/buffer.py | 6 +-- loopy/transform/callable.py | 16 ++++---- loopy/transform/data.py | 4 +- loopy/transform/fusion.py | 4 +- loopy/transform/iname.py | 6 +-- loopy/transform/instruction.py | 4 +- loopy/transform/pack_and_unpack_args.py | 4 +- loopy/transform/padding.py | 4 +- loopy/transform/precompute.py | 6 +-- loopy/transform/subst.py | 4 +- 22 files changed, 88 insertions(+), 71 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 4ba3246a2..60170a5f4 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -10,7 +10,7 @@ Function Interface Resolving and specialization ---------------------------- -In :mod:`loopy`, a :class:`loopy.Program` is a collection of callables +In :mod:`loopy`, a 
:class:`loopy.TranslationUnit` is a collection of callables and entrypoints. Callable are of type :class`:loopy.kernel.function_interface.InKernelCallable`. Any expression node which has a callable corresponding to it appears as @@ -18,7 +18,7 @@ which has a callable corresponding to it appears as a :class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as resolving. -During code-generation process for a :class:`~loopy.Program`, a callable +During code-generation process for a :class:`~loopy.TranslationUnit`, a callable is *specialized* depending on the types and shapes of the arguments passed at a call site. For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to begin with, but it later specialized to either ``sinf``, ``sin`` or ``sinl`` @@ -31,7 +31,7 @@ or shape specialization is encoded via Registering callables --------------------- -A user can *register* callables within a :class:`~loopy.Program` to +A user can *register* callables within a :class:`~loopy.TranslationUnit` to allow loopy to resolve calls not pre-defined in :mod:`loopy`. In :mod:`loopy`, we typically aim to expose all the standard math functions defined for a :class:`~loopy.target.TargetBase`. Other foreign functions could be invoked by diff --git a/doc/ref_program.rst b/doc/ref_program.rst index 2e4d5b9bc..e83530b62 100644 --- a/doc/ref_program.rst +++ b/doc/ref_program.rst @@ -1,6 +1,6 @@ .. currentmodule:: loopy -Program +TranslationUnit ======= -.. autoclass:: Program +.. autoclass:: TranslationUnit diff --git a/loopy/__init__.py b/loopy/__init__.py index 6cabbf614..602074305 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -47,7 +47,7 @@ from loopy.kernel.data import ( from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.program import ( - Program, make_program) + TranslationUnit, Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -175,7 +175,7 @@ __all__ = [ "ScalarCallable", "CallableKernel", - "Program", "make_program", + "TranslationUnit", "make_program", "Program", "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", @@ -461,7 +461,7 @@ class CacheMode: # {{{ make copy kernel def make_copy_kernel(new_dim_tags, old_dim_tags=None): - """Returns a :class:`loopy.Program` that changes the data layout + """Returns a :class:`loopy.TranslationUnit` that changes the data layout of a variable (called "input") to the new layout specified by *new_dim_tags* from the one specified by *old_dim_tags*. *old_dim_tags* defaults to an all-C layout of the same rank diff --git a/loopy/cli.py b/loopy/cli.py index a7d209ae8..787c3ee32 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -184,10 +184,10 @@ def main(): raise RuntimeError("unknown language: '%s'" % args.lang) - if not isinstance(prg, lp.Program): + if not isinstance(prg, lp.TranslationUnit): # FIXME assert isinstance(prg, list) # of kernels - raise NotImplementedError("convert list of kernels to Program") + raise NotImplementedError("convert list of kernels to TranslationUnit") if args.print_ir: print(prg, file=sys.stderr) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 68ee6c808..ef244e23f 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -706,7 +706,7 @@ def generate_code_v2(program): """ Returns an instance of :class:`CodeGenerationResult`. - :param program: An instance of :class:`loopy.Program`. 
+ :param program: An instance of :class:`loopy.TranslationUnit`. """ from loopy.kernel import LoopKernel @@ -721,7 +721,7 @@ def generate_code_v2(program): input_program = prepare_for_caching(program) try: result = code_gen_cache[input_program] - logger.debug(f"Program with entrypoints {program.entrypoints}:" + logger.debug(f"TranslationUnit with entrypoints {program.entrypoints}:" " code generation cache hit") return result except KeyError: diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 4ad7cd21c..d6311131f 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -157,7 +157,7 @@ def parse_transformed_fortran(source, free_form=True, strict=True, * ``FILENAME``: the file name of the code being processed The transform code must define ``RESULT``, conventionally a list of kernels - or a :class:`loopy.Program`, which is returned from this function + or a :class:`loopy.TranslationUnit`, which is returned from this function unmodified. An example of *source* may look as follows:: diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index e6c05c878..7fb779b63 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1483,7 +1483,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ Execute the :class:`LoopKernel`. """ - warn("Calling a LoopKernel is deprecated, call a Program " + warn("Calling a LoopKernel is deprecated, call a TranslationUnit " "instead.", DeprecationWarning, stacklevel=2) from loopy.program import make_program program = make_program(self) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index fa924467b..6465965f9 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -33,7 +33,7 @@ from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg, natsorted from loopy.kernel import LoopKernel -from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.program import TranslationUnit, iterate_over_kernels_if_given_program from loopy.kernel.function_interface import CallableKernel import logging logger = logging.getLogger(__name__) @@ -47,13 +47,13 @@ def add_dtypes(prog_or_kernel, dtype_dict): :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype` instances """ - if isinstance(prog_or_kernel, Program): + if isinstance(prog_or_kernel, TranslationUnit): kernel_names = [clbl.subkernel.name for clbl in prog_or_kernel.callables_table.values() if isinstance(clbl, CallableKernel)] if len(kernel_names) != 1: - raise LoopyError("add_dtypes may not take a Program with more than" - " one callable kernels. Please provide individual kernels" + raise LoopyError("add_dtypes may not take a TranslationUnit with more" + " than one callable kernels. 
Please provide individual kernels" " instead.") kernel_name, = kernel_names @@ -124,7 +124,7 @@ def get_arguments_with_incomplete_dtype(kernel): def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False, kernel_name=None): - assert isinstance(prog, Program) + assert isinstance(prog, TranslationUnit) if kernel_name is None: kernel_names = [clbl.subkernel.name for clbl in prog.callables_table.values() if isinstance(clbl, diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 90e527ae4..cd6de2ab5 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -42,7 +42,7 @@ from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) from loopy.kernel import LoopKernel -from loopy.program import Program +from loopy.program import TranslationUnit from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pytools import ProcessLogger @@ -87,7 +87,7 @@ def prepare_for_caching(program): if isinstance(program, LoopKernel): return prepare_for_caching_inner(program) - assert isinstance(program, Program) + assert isinstance(program, TranslationUnit) tgt = program.target new_clbls = {} @@ -2060,7 +2060,7 @@ def realize_reduction_for_single_kernel(kernel, callables_table, def realize_reduction(program, *args, **kwargs): - assert isinstance(program, Program) + assert isinstance(program, TranslationUnit) callables_table = dict(program.callables_table) kernels_to_scan = [in_knl_callable.subkernel diff --git a/loopy/program.py b/loopy/program.py index 09b13ffd5..9143f3c88 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -46,7 +46,7 @@ __doc__ = """ .. currentmodule:: loopy -.. autoclass:: Program +.. autoclass:: TranslationUnit .. autofunction:: make_program @@ -150,10 +150,14 @@ class CallableResolver(RuleAwareIdentityMapper): # {{{ program -class Program(ImmutableRecord): +class TranslationUnit(ImmutableRecord): """ Records the information about all the callables in a :mod:`loopy` program. + An instance of :class:`TranslationUnit` is the object that gets lowered + for a :class:`loopy.target.TargetBase`. + + .. attribute:: entrypoints A :class:`frozenset` of the names of the kernels which @@ -181,15 +185,18 @@ class Program(ImmutableRecord): TargetBase, function_indentifier: str)`` that would return an instance of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + .. automethod:: __call__ .. automethod:: copy .. automethod:: __getitem__ + .. automethod:: with_kernel .. note:: - - To create an instance of :class:`loopy.Program`, it is recommended to - go through :func:`loopy.make_kernel`. + - To create an instance of :class:`loopy.TranslationUnit`, it is + recommended to go through :func:`loopy.make_kernel`. - This data structure and its attributes should be considered - immutable, any modifications should be done through :meth:`~Program.copy`. + immutable, any modifications should be done through + :meth:`~TranslationUnit.copy`. """ def __init__(self, @@ -270,9 +277,9 @@ class Program(ImmutableRecord): @property def state(self): """ Returns an instance of :class:`loopy.kernel.KernelState`. 
""" - return min(callable_knl.subkernel.state for callable_knl in - self.callables_table.values() if - isinstance(callable_knl, CallableKernel)) + return min(callable_knl.subkernel.state + for callable_knl in self.callables_table.values() + if isinstance(callable_knl, CallableKernel)) def with_kernel(self, kernel): """ @@ -314,10 +321,17 @@ class Program(ImmutableRecord): entrypoint, = self.entrypoints return self[entrypoint] else: - raise ValueError("Program has multiple possible entrypoints. The " - "default entry point kernel is not uniquely determined.") + raise ValueError("TranslationUnit has multiple possible entrypoints." + " The default entry point kernel is not uniquely" + " determined.") def __call__(self, *args, **kwargs): + """ + Builds and calls the *entrypoint* kernel, if + :attr:`TranslationUnit.target` is an executable target. + + :arg entrypoint: The entrypoint which is to be called + """ entrypoint = kwargs.get("entrypoint", None) if entrypoint is None: @@ -325,7 +339,7 @@ class Program(ImmutableRecord): if len(self.entrypoints) == 1: entrypoint, = self.entrypoints else: - raise TypeError("Program.__call__() missing 1 required" + raise TypeError("TranslationUnit.__call__() missing 1 required" " keyword argument: 'entrypoint'. " "(Multiple possible entrypoints are present in the " "program.)") @@ -370,6 +384,9 @@ class Program(ImmutableRecord): self.update_persistent_hash(key_hash, LoopyKeyBuilder()) return hash(key_hash.digest()) + +Program = TranslationUnit + # }}} @@ -689,11 +706,11 @@ class CallablesInferenceContext(ImmutableRecord): def make_program(kernel): """ - Returns an instance of :class:`loopy.Program` with *kernel* as the only + Returns an instance of :class:`loopy.TranslationUnit` with *kernel* as the only callable kernel. """ - program = Program( + program = TranslationUnit( callables_table={ kernel.name: CallableKernel(kernel)}, target=kernel.target) @@ -706,7 +723,7 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): Function wrapper for transformations of the type ``transform(kernel: LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the ``transform`` being implemented on all of the callable kernels in a - :class:`loopy.Program`. + :class:`loopy.TranslationUnit`. """ def _collective_transform(*args, **kwargs): if "program" in kwargs: @@ -717,7 +734,7 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel): program_or_kernel = args[0] args = args[1:] - if isinstance(program_or_kernel, Program): + if isinstance(program_or_kernel, TranslationUnit): program = program_or_kernel new_callables = {} for func_id, in_knl_callable in program.callables_table.items(): @@ -763,7 +780,7 @@ def update_table(callables_table, clbl_id, clbl): def resolve_callables(program): """ - Returns a :class:`Program` with known :class:`pymbolic.primitives.Call` + Returns a :class:`TranslationUnit` with known :class:`pymbolic.primitives.Call` expression nodes converted to :class:`loopy.symbolic.ResolvedFunction`. 
""" from loopy.library.function import get_loopy_callables diff --git a/loopy/statistics.py b/loopy/statistics.py index 96d96e3a4..3ae783608 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -35,7 +35,7 @@ from loopy.diagnostic import warn_with_kernel, LoopyError from loopy.symbolic import CoefficientCollector from pytools import ImmutableRecord, memoize_method from loopy.kernel.function_interface import CallableKernel -from loopy.program import Program +from loopy.program import TranslationUnit from functools import partial @@ -1435,7 +1435,7 @@ def add_assumptions_guard(kernel, pwqpolynomial): def count(kernel, set, space=None): - if isinstance(kernel, Program): + if isinstance(kernel, TranslationUnit): kernel_names = [i for i, clbl in kernel.callables_table.items() if isinstance(clbl, CallableKernel)] if len(kernel_names) > 1: diff --git a/loopy/symbolic.py b/loopy/symbolic.py index e810024e9..0c6a8d50f 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -803,9 +803,9 @@ class RuleArgument(LoopyExpressionBase): class ResolvedFunction(LoopyExpressionBase): """ A function invocation whose definition is known in a :mod:`loopy` program. - A function is said to be *known* in a :class:`~loopy.Program` if its + A function is said to be *known* in a :class:`~loopy.TranslationUnit` if its identifier maps to an :class:`~loopy.kernel.function_interface.InKernelCallable` - in :attr:`loopy.Program.callables_table`. Refer to :ref:`func-interface`. + in :attr:`loopy.TranslationUnit.callables_table`. Refer to :ref:`func-interface`. .. attribute:: function diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index e8c4bc2e9..92b3d63fb 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -31,7 +31,7 @@ from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel -from loopy.program import Program +from loopy.program import TranslationUnit from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pymbolic import var @@ -169,7 +169,7 @@ def buffer_array_for_single_kernel(kernel, callables_table, var_name, fetched. """ - if isinstance(kernel, Program): + if isinstance(kernel, TranslationUnit): kernel_names = [i for i, clbl in kernel.callables_table.items() if isinstance(clbl, CallableKernel)] @@ -554,7 +554,7 @@ def buffer_array_for_single_kernel(kernel, callables_table, var_name, def buffer_array(program, *args, **kwargs): - assert isinstance(program, Program) + assert isinstance(program, TranslationUnit) new_callables = {} diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 54a694775..5e5125c49 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -34,7 +34,7 @@ from loopy.symbolic import ( from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) -from loopy.program import Program +from loopy.program import TranslationUnit __doc__ = """ .. currentmodule:: loopy @@ -48,7 +48,7 @@ __doc__ = """ def register_callable(translation_unit, function_identifier, callable_, redefining_not_ok=True): """ - :param translation_unit: A :class:`loopy.Program`. + :param translation_unit: A :class:`loopy.TranslationUnit`. :param callable_: A :class:`loopy.InKernelCallable`. 
""" @@ -73,9 +73,9 @@ def register_callable(translation_unit, function_identifier, callable_, def merge(translation_units): """ - :param translation_units: A list of :class:`loopy.Program`. + :param translation_units: A list of :class:`loopy.TranslationUnit`. - :returns: An instance of :class:`loopy.Program` which contains all the + :returns: An instance of :class:`loopy.TranslationUnit` which contains all the callables from each of the *translation_units. """ @@ -104,7 +104,7 @@ def merge(translation_units): for trans_unit in translation_units: callables_table.update(trans_unit.callables_table.copy()) - return Program( + return TranslationUnit( entrypoints=frozenset().union(*( t.entrypoints or frozenset() for t in translation_units)), callables_table=callables_table, @@ -672,7 +672,7 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name): # {{{ sanity checks - assert isinstance(program, Program) + assert isinstance(program, TranslationUnit) assert isinstance(callee_function_name, str) assert callee_function_name not in program.entrypoints assert callee_function_name in program.callables_table @@ -700,7 +700,7 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name): def rename_callable(program, old_name, new_name=None, existing_ok=False): """ - :arg program: An instance of :class:`loopy.Program` + :arg program: An instance of :class:`loopy.TranslationUnit` :arg old_name: The callable to be renamed :arg new_name: New name for the callable to be renamed :arg existing_ok: An instance of :class:`bool` @@ -710,7 +710,7 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False): SubstitutionRuleMappingContext) from pymbolic import var - assert isinstance(program, Program) + assert isinstance(program, TranslationUnit) assert isinstance(old_name, str) if (new_name in program.callables_table) and not existing_ok: diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 10344da0e..467f8bd5f 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -26,7 +26,7 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper -from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.program import TranslationUnit, iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel, ScalarCallable @@ -369,7 +369,7 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name, def add_prefetch(program, *args, **kwargs): - assert isinstance(program, Program) + assert isinstance(program, TranslationUnit) new_callables = {} for func_id, in_knl_callable in program.callables_table.items(): diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 0880c22ae..269853024 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -28,7 +28,7 @@ from loopy.diagnostic import LoopyError from pymbolic import var from loopy.kernel import LoopKernel -from loopy.program import Program +from loopy.program import TranslationUnit from loopy.kernel.function_interface import CallableKernel @@ -333,7 +333,7 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): # namespace, otherwise the kernel names should be uniquified. # We should also somehow be able to know that callables like "sin"/"cos" # belong to the global namespace and need not be uniquified. 
- if all(isinstance(kernel, Program) for kernel in kernels): + if all(isinstance(kernel, TranslationUnit) for kernel in kernels): new_kernels = [] for knl in kernels: kernel_names = [i for i, clbl in diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 1686a87d9..e99f10e7a 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -29,7 +29,7 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError -from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.program import TranslationUnit, iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel @@ -1038,7 +1038,7 @@ def get_iname_duplication_options(kernel, use_boostable_into=False): Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. """ - if isinstance(kernel, Program): + if isinstance(kernel, TranslationUnit): if len([clbl for clbl in kernel.callables_table.values() if isinstance(clbl, CallableKernel)]) == 1: kernel = kernel[list(kernel.entrypoints)[0]] @@ -1092,7 +1092,7 @@ def has_schedulable_iname_nesting(kernel): :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. """ - if isinstance(kernel, Program): + if isinstance(kernel, TranslationUnit): if len([clbl for clbl in kernel.callables_table.values() if isinstance(clbl, CallableKernel)]) == 1: kernel = kernel[list(kernel.entrypoints)[0]] diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index a48e8eda7..3ec4b8d42 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -23,7 +23,7 @@ THE SOFTWARE. from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) -from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.program import TranslationUnit, iterate_over_kernels_if_given_program # {{{ find_instructions @@ -39,7 +39,7 @@ def find_instructions(program, insn_match): if isinstance(program, LoopKernel): return find_instructions_in_single_kernel(program, insn_match) - assert isinstance(program, Program) + assert isinstance(program, TranslationUnit) insns = [] for in_knl_callable in program.callables_table.values(): if isinstance(in_knl_callable, CallableKernel): diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index cf0730760..d6f2858cf 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -22,7 +22,7 @@ THE SOFTWARE. 
from loopy.diagnostic import LoopyError from loopy.kernel.instruction import CallInstruction -from loopy.program import Program +from loopy.program import TranslationUnit from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel, ScalarCallable from loopy.symbolic import SubArrayRef @@ -317,7 +317,7 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel, def pack_and_unpack_args_for_call(program, *args, **kwargs): - assert isinstance(program, Program) + assert isinstance(program, TranslationUnit) new_callables = {} for func_id, in_knl_callable in program.callables_table.items(): diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 455ce31d0..f56588932 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -24,7 +24,7 @@ THE SOFTWARE. from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext -from loopy.program import iterate_over_kernels_if_given_program, Program +from loopy.program import iterate_over_kernels_if_given_program, TranslationUnit from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel from loopy.diagnostic import LoopyError @@ -409,7 +409,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, # {{{ find_padding_multiple def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1): - if isinstance(kernel, Program): + if isinstance(kernel, TranslationUnit): kernel_names = [i for i, clbl in kernel.callables_table.items() if isinstance(clbl, CallableKernel)] if len(kernel_names) > 1: diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 438c07339..ac133ed59 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -27,7 +27,7 @@ from loopy.symbolic import (get_dependencies, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func -from loopy.program import Program +from loopy.program import TranslationUnit from loopy.kernel.function_interface import CallableKernel, ScalarCallable import numpy as np @@ -354,7 +354,7 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are eliminated. """ - if isinstance(kernel, Program): + if isinstance(kernel, TranslationUnit): kernel_names = [i for i, clbl in kernel.callables_table.items() if isinstance(clbl, CallableKernel)] @@ -1057,7 +1057,7 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, def precompute(program, *args, **kwargs): - assert isinstance(program, Program) + assert isinstance(program, TranslationUnit) new_callables = {} for func_id, clbl in program.callables_table.items(): diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 066cf326c..41ca67759 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -28,7 +28,7 @@ from loopy.transform.iname import remove_any_newly_unused_inames from pytools import ImmutableRecord from pymbolic import var -from loopy.program import iterate_over_kernels_if_given_program, Program +from loopy.program import iterate_over_kernels_if_given_program, TranslationUnit from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging @@ -53,7 +53,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): unifications. 
""" - if isinstance(kernel, Program): + if isinstance(kernel, TranslationUnit): kernel_names = [i for i, clbl in kernel.callables_table.items() if isinstance(clbl, CallableKernel)] -- GitLab From 01e070a1396e233c45a26d20f22058bf7c2074e7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 15:15:27 -0500 Subject: [PATCH 818/916] prg -> t_unit --- loopy/cli.py | 14 +++++++------- loopy/target/c/__init__.py | 4 ++-- test/test_callables.py | 14 +++++++------- test/test_fortran.py | 32 ++++++++++++++++---------------- test/test_loopy.py | 16 ++++++++-------- test/test_transform.py | 6 +++--- 6 files changed, 43 insertions(+), 43 deletions(-) diff --git a/loopy/cli.py b/loopy/cli.py index 787c3ee32..4544df166 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -159,7 +159,7 @@ def main(): raise RuntimeError("loopy-lang requires 'lp_knl' " "to be defined on exit") - prg = [kernel] + t_unit = [kernel] elif lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None @@ -176,7 +176,7 @@ def main(): defines_to_python_code(defines_fd.read()) + pre_transform_code) - prg = lp.parse_transformed_fortran( + t_unit = lp.parse_transformed_fortran( infile_content, pre_transform_code=pre_transform_code, filename=args.infile) @@ -184,16 +184,16 @@ def main(): raise RuntimeError("unknown language: '%s'" % args.lang) - if not isinstance(prg, lp.TranslationUnit): + if not isinstance(t_unit, lp.TranslationUnit): # FIXME - assert isinstance(prg, list) # of kernels + assert isinstance(t_unit, list) # of kernels raise NotImplementedError("convert list of kernels to TranslationUnit") if args.print_ir: - print(prg, file=sys.stderr) + print(t_unit, file=sys.stderr) - prg = lp.preprocess_kernel(prg) - cgr = lp.generate_code_v2(prg) + t_unit = lp.preprocess_kernel(t_unit) + cgr = lp.generate_code_v2(t_unit) if args.outfile is not None: outfile = args.outfile diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 128e085e3..4548d8487 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1257,9 +1257,9 @@ class ExecutableCTarget(CTarget): # thing that CPU JIT is specific to. 
return None - def get_kernel_executor(self, prg, *args, **kwargs): + def get_kernel_executor(self, t_unit, *args, **kwargs): from loopy.target.c.c_execution import CKernelExecutor - return CKernelExecutor(prg, entrypoint=kwargs.pop("entrypoint"), + return CKernelExecutor(t_unit, entrypoint=kwargs.pop("entrypoint"), compiler=self.compiler) def get_host_ast_builder(self): diff --git a/test/test_callables.py b/test/test_callables.py index 3d3e3f427..3cf39c2bf 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -647,14 +647,14 @@ def test_callees_with_gbarriers_are_inlined(ctx_factory): seq_dependencies=True, name="ones_and_zeros") - prg = lp.make_kernel( + t_unit = lp.make_kernel( "{ : }", """ y[:] = ones_and_zeros() """, [lp.GlobalArg("y", shape=6, dtype=lp.auto)]) - prg = lp.merge([prg, ones_and_zeros]) - evt, (out,) = prg(queue) + t_unit = lp.merge([t_unit, ones_and_zeros]) + evt, (out,) = t_unit(queue) expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32) @@ -675,19 +675,19 @@ def test_inlining_with_indirections(ctx_factory): seq_dependencies=True, name="ones_and_zeros") - prg = lp.make_kernel( + t_unit = lp.make_kernel( "{ : }", """ y[:] = ones_and_zeros(map[:]) """, [lp.GlobalArg("y", shape=6, dtype=lp.auto), lp.GlobalArg("map", dtype=np.int32, shape=3)]) - prg = lp.merge([prg, ones_and_zeros]) - prg = lp.inline_callable_kernel(prg, "ones_and_zeros") + t_unit = lp.merge([t_unit, ones_and_zeros]) + t_unit = lp.inline_callable_kernel(t_unit, "ones_and_zeros") map_in = np.arange(3).astype(np.int32) - evt, (out, ) = prg(queue, map=map_in) + evt, (out, ) = t_unit(queue, map=map_in) expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32) assert (expected_out == out).all() diff --git a/test/test_fortran.py b/test/test_fortran.py index 65126cdf3..72f7b7e01 100644 --- a/test/test_fortran.py +++ b/test/test_fortran.py @@ -84,12 +84,12 @@ def test_assign_double_precision_scalar(ctx_factory): end """ - prg = lp.parse_fortran(fortran_src) - print(lp.generate_code_v2(prg).device_code()) - assert "1.1;" in lp.generate_code_v2(prg).device_code() + t_unit = lp.parse_fortran(fortran_src) + print(lp.generate_code_v2(t_unit).device_code()) + assert "1.1;" in lp.generate_code_v2(t_unit).device_code() a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F") - prg(queue, a=a_dev) + t_unit(queue, a=a_dev) abs_err = abs(a_dev.get()[0] - 1.1) assert abs_err < 1e-15 @@ -108,10 +108,10 @@ def test_assign_double_precision_scalar_as_rational(ctx_factory): end """ - prg = lp.parse_fortran(fortran_src) + t_unit = lp.parse_fortran(fortran_src) a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F") - prg(queue, a=a_dev) + t_unit(queue, a=a_dev) abs_err = abs(a_dev.get()[0] - 1.1) assert abs_err < 1e-15 @@ -129,11 +129,11 @@ def test_assign_single_precision_scalar(ctx_factory): end """ - prg = lp.parse_fortran(fortran_src) - assert "1.1f" in lp.generate_code_v2(prg).device_code() + t_unit = lp.parse_fortran(fortran_src) + assert "1.1f" in lp.generate_code_v2(t_unit).device_code() a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F") - prg(queue, a=a_dev) + t_unit(queue, a=a_dev) abs_err = abs(a_dev.get()[0] - 1.1) assert abs_err > 1e-15 @@ -547,9 +547,9 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! prg = lp.parse_fortran(SOURCE) - ! fill = prg["fill"] - ! twice = prg["twice"] + ! t_unit = lp.parse_fortran(SOURCE) + ! fill = t_unit["fill"] + ! twice = t_unit["twice"] ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) ! 
RESULT = knl @@ -628,8 +628,8 @@ def test_fortran_subroutines(): call twice(n, a(i, 1:n)) end subroutine """ - prg = lp.parse_fortran(fortran_src).with_entrypoints("twice_cross") - print(lp.generate_code_v2(prg).device_code()) + t_unit = lp.parse_fortran(fortran_src).with_entrypoints("twice_cross") + print(lp.generate_code_v2(t_unit).device_code()) def test_domain_fusion_imperfectly_nested(): @@ -648,10 +648,10 @@ def test_domain_fusion_imperfectly_nested(): end subroutine """ - prg = lp.parse_fortran(fortran_src) + t_unit = lp.parse_fortran(fortran_src) # If n > 0 and m == 0, a single domain would be empty, # leading (incorrectly) to no assignments to 'a'. - assert len(prg["imperfect"].domains) > 1 + assert len(t_unit["imperfect"].domains) > 1 if __name__ == "__main__": diff --git a/test/test_loopy.py b/test/test_loopy.py index 1be7ba732..e1e803a77 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2758,7 +2758,7 @@ def test_shape_mismatch_check(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - prg = lp.make_kernel( + t_unit = lp.make_kernel( "{[i,j]: 0 <= i < n and 0 <= j < m}", "c[i] = sum(j, a[i,j]*b[j])", default_order="F") @@ -2766,11 +2766,11 @@ def test_shape_mismatch_check(ctx_factory): a = np.random.rand(10, 10).astype(np.float32) b = np.random.rand(10).astype(np.float32) - if prg["loopy_kernel"].options.skip_arg_checks: + if t_unit["loopy_kernel"].options.skip_arg_checks: pytest.skip("args checks disabled, cannot check") with pytest.raises(TypeError, match="strides mismatch"): - prg(queue, a=a, b=b) + t_unit(queue, a=a, b=b) def test_array_arg_extra_kwargs_persis_hash(): @@ -2846,7 +2846,7 @@ def test_empty_domain(ctx_factory, tag): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - prg = lp.make_kernel( + t_unit = lp.make_kernel( "{[i,j]: 0 <= i < n}", """ for i @@ -2855,15 +2855,15 @@ def test_empty_domain(ctx_factory, tag): """) if tag == "fixed": - prg = lp.fix_parameters(prg, n=0) + t_unit = lp.fix_parameters(t_unit, n=0) kwargs = {} else: - prg = lp.tag_inames(prg, {"i": tag}) + t_unit = lp.tag_inames(t_unit, {"i": tag}) kwargs = {"n": 0} - prg = lp.set_options(prg, write_code=True) + t_unit = lp.set_options(t_unit, write_code=True) c = cl.array.zeros(queue, (), dtype=np.int32) - prg(queue, c=c, **kwargs) + t_unit(queue, c=c, **kwargs) assert (c.get() == 0).all() diff --git a/test/test_transform.py b/test/test_transform.py index 9ac29766b..1e75aa0bc 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -572,12 +572,12 @@ def test_nested_substs_in_insns(ctx_factory): """ ) - prg = lp.expand_subst(ref_prg) + t_unit = lp.expand_subst(ref_prg) assert not any( cknl.subkernel.substitutions - for cknl in prg.callables_table.values()) + for cknl in t_unit.callables_table.values()) - lp.auto_test_vs_ref(ref_prg, ctx, prg) + lp.auto_test_vs_ref(ref_prg, ctx, t_unit) def test_extract_subst_with_iname_deps_in_templ(ctx_factory): -- GitLab From ef61238456a7c7de0466abf0fdc9f94dfbfe718d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 15:26:14 -0500 Subject: [PATCH 819/916] ref_program -> ref_translation_unit --- doc/index.rst | 2 +- doc/{ref_program.rst => ref_translation_unit.rst} | 2 +- loopy/program.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename doc/{ref_program.rst => ref_translation_unit.rst} (82%) diff --git a/doc/index.rst b/doc/index.rst index d3cb6f38c..3bc0361c5 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -42,7 +42,7 @@ Please check :ref:`installation` to get started. 
tutorial ref_creation ref_kernel - ref_program + ref_translation_unit ref_transform ref_call ref_other diff --git a/doc/ref_program.rst b/doc/ref_translation_unit.rst similarity index 82% rename from doc/ref_program.rst rename to doc/ref_translation_unit.rst index e83530b62..9d7c49158 100644 --- a/doc/ref_program.rst +++ b/doc/ref_translation_unit.rst @@ -1,6 +1,6 @@ .. currentmodule:: loopy TranslationUnit -======= +=============== .. autoclass:: TranslationUnit diff --git a/loopy/program.py b/loopy/program.py index 9143f3c88..14cae4819 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -287,7 +287,7 @@ class TranslationUnit(ImmutableRecord): subkernel and returns a copy of *self*. Else records a new callable kernel with *kernel* as its subkernel. - :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`. + :arg kernel: An instance of :class:`loopy.LoopKernel`. :returns: Copy of *self* with updated callable kernels. """ if kernel.name in self.callables_table: -- GitLab From 8b60deff03867722e06b1e9998609a3ea39cef25 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Apr 2021 02:18:00 -0500 Subject: [PATCH 820/916] add a DeprecationWarning to Program instantiation --- loopy/program.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/loopy/program.py b/loopy/program.py index 14cae4819..be4f96b55 100644 --- a/loopy/program.py +++ b/loopy/program.py @@ -385,7 +385,12 @@ class TranslationUnit(ImmutableRecord): return hash(key_hash.digest()) -Program = TranslationUnit +class Program(TranslationUnit): + def __init__(self, *args, **kwargs): + from warnings import warn + warn("Program is deprecated, use TranslationUnit instead, " + "will be removed in 2022", DeprecationWarning, stacklevel=2) + super().__init__(*args, **kwargs) # }}} -- GitLab From db6e2b13f96157f70087ef0abf45548479850f46 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Apr 2021 02:23:15 -0500 Subject: [PATCH 821/916] program.py -> translation_unit.py --- loopy/__init__.py | 4 ++-- loopy/codegen/__init__.py | 6 +++--- loopy/kernel/__init__.py | 2 +- loopy/kernel/creation.py | 4 ++-- loopy/kernel/tools.py | 3 ++- loopy/library/reduction.py | 12 ++++++------ loopy/loop.py | 2 +- loopy/preprocess.py | 6 +++--- loopy/statistics.py | 2 +- loopy/target/execution.py | 2 +- loopy/transform/add_barrier.py | 2 +- loopy/transform/arithmetic.py | 2 +- loopy/transform/batch.py | 2 +- loopy/transform/buffer.py | 2 +- loopy/transform/callable.py | 4 ++-- loopy/transform/data.py | 3 ++- loopy/transform/fusion.py | 4 ++-- loopy/transform/iname.py | 3 ++- loopy/transform/instruction.py | 3 ++- loopy/transform/pack_and_unpack_args.py | 2 +- loopy/transform/padding.py | 3 ++- loopy/transform/parameter.py | 2 +- loopy/transform/precompute.py | 2 +- loopy/transform/subst.py | 3 ++- loopy/{program.py => translation_unit.py} | 0 loopy/type_inference.py | 4 ++-- test/test_callables.py | 2 +- 27 files changed, 46 insertions(+), 40 deletions(-) rename loopy/{program.py => translation_unit.py} (100%) diff --git a/loopy/__init__.py b/loopy/__init__.py index 602074305..f2a780977 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -24,7 +24,7 @@ THE SOFTWARE. 
from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning -from loopy.program import iterate_over_kernels_if_given_program +from loopy.translation_unit import iterate_over_kernels_if_given_program # {{{ imported user interface @@ -46,7 +46,7 @@ from loopy.kernel.data import ( CallMangleInfo) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) -from loopy.program import ( +from loopy.translation_unit import ( TranslationUnit, Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index ef244e23f..f467a5dc4 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -607,7 +607,7 @@ def diverge_callee_entrypoints(program): """ If a kernel is both an entrypoint and a callee, then rename the callee. """ - from loopy.program import _get_callable_ids + from loopy.translation_unit import _get_callable_ids from pytools import UniqueNameGenerator callable_ids = _get_callable_ids(program.callables_table, program.entrypoints) @@ -622,7 +622,7 @@ def diverge_callee_entrypoints(program): for name, clbl in program.callables_table.items(): if isinstance(clbl, CallableKernel): - from loopy.program import ( + from loopy.translation_unit import ( rename_resolved_functions_in_a_single_kernel) knl = rename_resolved_functions_in_a_single_kernel( clbl.subkernel, renames) @@ -710,7 +710,7 @@ def generate_code_v2(program): """ from loopy.kernel import LoopKernel - from loopy.program import make_program + from loopy.translation_unit import make_program # {{{ cache retrieval diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 7fb779b63..244a733e2 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1485,7 +1485,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ warn("Calling a LoopKernel is deprecated, call a TranslationUnit " "instead.", DeprecationWarning, stacklevel=2) - from loopy.program import make_program + from loopy.translation_unit import make_program program = make_program(self) return program(*args, **kwargs) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 3fec7f758..7c09fcece 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -34,7 +34,7 @@ from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, SubstitutionRule, AddressSpace, ValueArg) -from loopy.program import iterate_over_kernels_if_given_program +from loopy.translation_unit import iterate_over_kernels_if_given_program from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -2442,7 +2442,7 @@ def make_function(domains, instructions, kernel_data=["..."], **kwargs): creation_plog.done() - from loopy.program import make_program + from loopy.translation_unit import make_program return make_program(knl) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 6465965f9..800909afc 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -33,7 +33,8 @@ from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg, natsorted from loopy.kernel import LoopKernel -from loopy.program import TranslationUnit, iterate_over_kernels_if_given_program +from loopy.translation_unit import (TranslationUnit, + iterate_over_kernels_if_given_program) from loopy.kernel.function_interface import CallableKernel import 
logging logger = logging.getLogger(__name__) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6f97e1667..9f23bcb37 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -203,7 +203,7 @@ class MaxReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, operand2, callables_table, target): dtype, = dtype - from loopy.program import update_table + from loopy.translation_unit import update_table # getting the callable 'max' from target max_scalar_callable = target.get_device_ast_builder().known_callables["max"] @@ -225,7 +225,7 @@ class MinReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, operand2, callables_table, target): dtype, = dtype - from loopy.program import update_table + from loopy.translation_unit import update_table # getting the callable 'min' from target min_scalar_callable = target.get_device_ast_builder().known_callables["min"] @@ -300,7 +300,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype, callables_table, target): from loopy.library.function import MakeTupleCallable - from loopy.program import update_table + from loopy.translation_unit import update_table scalar_neutral_element, calables_table = ( self.inner_reduction.neutral_element( @@ -344,7 +344,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): callables_table)) # populate callables_table - from loopy.program import update_table + from loopy.translation_unit import update_table func_id, callables_table = update_table( callables_table, SegmentedOp(self), segmented_scalar_callable) @@ -410,7 +410,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_element = scalar_neutral_func(scalar_dtype) from loopy.library.function import MakeTupleCallable - from loopy.program import update_table + from loopy.translation_unit import update_table make_tuple_callable = MakeTupleCallable( name="make_tuple") @@ -448,7 +448,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): callables_table)) # populate callables_table - from loopy.program import update_table + from loopy.translation_unit import update_table func_id, callables_table = update_table( callables_table, ArgExtOp(self), arg_ext_scalar_callable) diff --git a/loopy/loop.py b/loopy/loop.py index 73ca8d728..0127c1262 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -22,7 +22,7 @@ THE SOFTWARE. import islpy as isl -from loopy.program import iterate_over_kernels_if_given_program +from loopy.translation_unit import iterate_over_kernels_if_given_program def potential_loop_nest_map(kernel): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index cd6de2ab5..7efeffa78 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -42,7 +42,7 @@ from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, CallInstruction, _DataObliviousInstruction) from loopy.kernel import LoopKernel -from loopy.program import TranslationUnit +from loopy.translation_unit import TranslationUnit from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pytools import ProcessLogger @@ -2315,7 +2315,7 @@ def infer_arg_descr(program): :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the callables. 
""" - from loopy.program import make_clbl_inf_ctx, resolve_callables + from loopy.translation_unit import make_clbl_inf_ctx, resolve_callables from loopy.kernel.array import ArrayBase from loopy.kernel.function_interface import (ArrayArgDescriptor, ValueArgDescriptor) @@ -2467,7 +2467,7 @@ def preprocess_program(program, device=None): if not program.entrypoints: raise LoopyError("Translation unit did not receive any entrypoints") - from loopy.program import resolve_callables + from loopy.translation_unit import resolve_callables program = resolve_callables(program) if device is not None: diff --git a/loopy/statistics.py b/loopy/statistics.py index 3ae783608..43bce10ca 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -35,7 +35,7 @@ from loopy.diagnostic import warn_with_kernel, LoopyError from loopy.symbolic import CoefficientCollector from pytools import ImmutableRecord, memoize_method from loopy.kernel.function_interface import CallableKernel -from loopy.program import TranslationUnit +from loopy.translation_unit import TranslationUnit from functools import partial diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 3d4d71147..2f8335848 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -739,7 +739,7 @@ class KernelExecutorBase: def get_typed_and_scheduled_program_uncached(self, entrypoint, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes from loopy.kernel import KernelState - from loopy.program import resolve_callables + from loopy.translation_unit import resolve_callables program = resolve_callables(self.program) diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index 1e03ade94..e54695d95 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -24,7 +24,7 @@ THE SOFTWARE. from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency -from loopy.program import iterate_over_kernels_if_given_program +from loopy.translation_unit import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel __doc__ = """ diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index 837668819..2896af68d 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -23,7 +23,7 @@ THE SOFTWARE. 
from loopy.diagnostic import LoopyError -from loopy.program import iterate_over_kernels_if_given_program +from loopy.translation_unit import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index 5da142e3d..d1c1672da 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -25,7 +25,7 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl -from loopy.program import iterate_over_kernels_if_given_program +from loopy.translation_unit import iterate_over_kernels_if_given_program __doc__ = """ diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 92b3d63fb..400be5554 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -31,7 +31,7 @@ from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel -from loopy.program import TranslationUnit +from loopy.translation_unit import TranslationUnit from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pymbolic import var diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 5e5125c49..5d88a78be 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -34,7 +34,7 @@ from loopy.symbolic import ( from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) -from loopy.program import TranslationUnit +from loopy.translation_unit import TranslationUnit __doc__ = """ .. currentmodule:: loopy @@ -489,7 +489,7 @@ def inline_callable_kernel(program, function_name): (scoped) name *function_name* inlined. 
""" from loopy.preprocess import infer_arg_descr - from loopy.program import resolve_callables + from loopy.translation_unit import resolve_callables program = resolve_callables(program) program = infer_arg_descr(program) callables_table = program.callables_table diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 467f8bd5f..baee68203 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -26,7 +26,8 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper -from loopy.program import TranslationUnit, iterate_over_kernels_if_given_program +from loopy.translation_unit import (TranslationUnit, + iterate_over_kernels_if_given_program) from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel, ScalarCallable diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 269853024..994cbf444 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -28,7 +28,7 @@ from loopy.diagnostic import LoopyError from pymbolic import var from loopy.kernel import LoopKernel -from loopy.program import TranslationUnit +from loopy.translation_unit import TranslationUnit from loopy.kernel.function_interface import CallableKernel @@ -425,7 +425,7 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): # }}} - from loopy.program import make_program + from loopy.translation_unit import make_program return make_program(result).with_entrypoints(result.name) # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index e99f10e7a..27f337021 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -29,7 +29,8 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError -from loopy.program import TranslationUnit, iterate_over_kernels_if_given_program +from loopy.translation_unit import (TranslationUnit, + iterate_over_kernels_if_given_program) from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 3ec4b8d42..870348d71 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -23,7 +23,8 @@ THE SOFTWARE. from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) -from loopy.program import TranslationUnit, iterate_over_kernels_if_given_program +from loopy.translation_unit import (TranslationUnit, + iterate_over_kernels_if_given_program) # {{{ find_instructions diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index d6f2858cf..9335bb0bb 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -22,7 +22,7 @@ THE SOFTWARE. from loopy.diagnostic import LoopyError from loopy.kernel.instruction import CallInstruction -from loopy.program import TranslationUnit +from loopy.translation_unit import TranslationUnit from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel, ScalarCallable from loopy.symbolic import SubArrayRef diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index f56588932..0cd953ea8 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -24,7 +24,8 @@ THE SOFTWARE. 
from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext -from loopy.program import iterate_over_kernels_if_given_program, TranslationUnit +from loopy.translation_unit import (iterate_over_kernels_if_given_program, + TranslationUnit) from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel from loopy.diagnostic import LoopyError diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 52feb577a..0e9dbe09e 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -25,7 +25,7 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl -from loopy.program import iterate_over_kernels_if_given_program +from loopy.translation_unit import iterate_over_kernels_if_given_program from loopy.kernel import LoopKernel __doc__ = """ diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index ac133ed59..9ba572efe 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -27,7 +27,7 @@ from loopy.symbolic import (get_dependencies, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func -from loopy.program import TranslationUnit +from loopy.translation_unit import TranslationUnit from loopy.kernel.function_interface import CallableKernel, ScalarCallable import numpy as np diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 41ca67759..2681d69ea 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -28,7 +28,8 @@ from loopy.transform.iname import remove_any_newly_unused_inames from pytools import ImmutableRecord from pymbolic import var -from loopy.program import iterate_over_kernels_if_given_program, TranslationUnit +from loopy.translation_unit import (iterate_over_kernels_if_given_program, + TranslationUnit) from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging diff --git a/loopy/program.py b/loopy/translation_unit.py similarity index 100% rename from loopy/program.py rename to loopy/translation_unit.py diff --git a/loopy/type_inference.py b/loopy/type_inference.py index bee6db0bc..24df0ea15 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -36,7 +36,7 @@ from loopy.symbolic import ( SubstitutionRuleExpander, ResolvedFunction, SubstitutionRuleMappingContext, SubArrayRef) from pymbolic.primitives import Variable, Subscript, Lookup -from loopy.program import CallablesInferenceContext, make_clbl_inf_ctx +from loopy.translation_unit import CallablesInferenceContext, make_clbl_inf_ctx import logging logger = logging.getLogger(__name__) @@ -1011,7 +1011,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx): def infer_unknown_types(program, expect_completion=False): """Infer types on temporaries and arguments.""" from loopy.kernel.data import auto - from loopy.program import resolve_callables + from loopy.translation_unit import resolve_callables program = resolve_callables(program) diff --git a/test/test_callables.py b/test/test_callables.py index 3cf39c2bf..8acdef425 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -718,7 +718,7 @@ def test_inlining_with_callee_domain_param(ctx_factory): def test_double_resolving(): - from loopy.program import resolve_callables + from loopy.translation_unit import resolve_callables from loopy.kernel import KernelState from 
loopy.symbolic import ResolvedFunction -- GitLab From 9e74c5eff126eabfdb17989fa118911d742508bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Thu, 22 Apr 2021 17:22:50 -0500 Subject: [PATCH 822/916] Extend entrypoint docs for TranslationUnit.__call__ --- loopy/translation_unit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index be4f96b55..14ed2d400 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -330,7 +330,8 @@ class TranslationUnit(ImmutableRecord): Builds and calls the *entrypoint* kernel, if :attr:`TranslationUnit.target` is an executable target. - :arg entrypoint: The entrypoint which is to be called + :arg entrypoint: The name of the entrypoint callable to be called. + Defaults to *the* entrypoint if there is only one. """ entrypoint = kwargs.get("entrypoint", None) -- GitLab From be08af0cabd4e1c6545fa06b578d222cb944622a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Apr 2021 23:08:09 -0500 Subject: [PATCH 823/916] simplifies inlining logic --- loopy/transform/callable.py | 141 +++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 68 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 5d88a78be..e964a24f9 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -29,18 +29,21 @@ from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, Assignment, CInstruction, _DataObliviousInstruction) from loopy.symbolic import ( - RuleAwareSubstitutionMapper, + RuleAwareSubstitutionMapper, RuleAwareIdentityMapper, SubstitutionRuleMappingContext, CombineMapper, IdentityMapper) from loopy.isl_helpers import simplify_via_aff from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) -from loopy.translation_unit import TranslationUnit +from loopy.translation_unit import (TranslationUnit, + iterate_over_kernels_if_given_program) __doc__ = """ .. currentmodule:: loopy .. autofunction:: register_callable +.. autofunction:: inline_callable_kernel + .. autofunction:: merge """ @@ -92,8 +95,9 @@ def merge(translation_units): & set(prg_j.callables_table)): if (prg_i.callables_table[clbl_name] != prg_j.callables_table[clbl_name]): - # FIXME: generate unique names + rename for the colliding - # callables + # TODO: generate unique names + rename for the colliding + # callables (if entrypoints are colliding that shuold still + # be an error) raise NotImplementedError("Translation units to be merged" " must have different callable names" " for now.") @@ -113,10 +117,10 @@ def merge(translation_units): # {{{ kernel inliner mapper -class KernelInliner(RuleAwareSubstitutionMapper): - def __init__(self, rule_mapping_context, subst_func, caller_knl, - callee_knl, callee_arg_to_call_param): - super().__init__(rule_mapping_context, subst_func, lambda *args: True) +class KernelArgumentSubstitutor(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, caller_knl, + callee_knl, callee_arg_to_call_param): + super().__init__(rule_mapping_context) self.caller_knl = caller_knl self.callee_knl = callee_knl self.callee_arg_to_call_param = callee_arg_to_call_param @@ -136,9 +140,6 @@ class KernelInliner(RuleAwareSubstitutionMapper): caller_arg = self.caller_knl.temporary_variables[ sar.subscript.aggregate.name] - # map inner inames to outer inames. 
- outer_indices = self.map_tuple(expr.index_tuple, expn_state) - flatten_index = 0 for i, idx in enumerate(get_start_subscript_from_sar(sar, self.caller_knl).index_tuple): @@ -146,7 +147,7 @@ class KernelInliner(RuleAwareSubstitutionMapper): flatten_index += sum( idx * tag.stride - for idx, tag in zip(outer_indices, callee_arg.dim_tags)) + for idx, tag in zip(expr.index_tuple, callee_arg.dim_tags)) flatten_index = simplify_via_aff(flatten_index) @@ -160,7 +161,6 @@ class KernelInliner(RuleAwareSubstitutionMapper): return Subscript(Variable(sar.subscript.aggregate.name), new_indices) else: - assert expr.aggregate.name in self.callee_knl.temporary_variables return super().map_subscript(expr, expn_state) def map_variable(self, expr, expn_state): @@ -176,7 +176,6 @@ class KernelInliner(RuleAwareSubstitutionMapper): else: assert isinstance(arg, ValueArg) return par - else: return super().map_variable(expr, expn_state) @@ -247,6 +246,9 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): """ Returns a copy of *caller_knl* with the *call_insn* in the *kernel* replaced by inlining *callee_knl* into it within it. + + + :arg call_insn: An instance of `loopy.CallInstruction` of the call-site. """ import pymbolic.primitives as prim from pymbolic.mapper.substitutor import make_subst_func @@ -280,11 +282,12 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): # {{{ iname_to_tags - # new_iname_to_tags: caller's iname_to_tags post inlining - new_iname_to_tags = caller_knl.iname_to_tags + # new_inames: caller's inames post inlining + new_inames = caller_knl.inames - for old_name, tags in callee_knl.iname_to_tags.items(): - new_iname_to_tags[name_map[old_name]] = tags + for old_name, callee_iname in callee_knl.inames.items(): + new_name = name_map[old_name] + new_inames[new_name] = callee_iname.copy(name=new_name) # }}} @@ -326,15 +329,16 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): # }}} - # {{{ domains/assumptions + # {{{ process domains/assumptions + # rename inames new_domains = callee_knl.domains.copy() for old_iname in callee_knl.all_inames(): new_domains = [rename_iname(dom, old_iname, name_map[old_iname]) for dom in new_domains] + # realize domains' dim params in terms of caller's variables new_assumptions = callee_knl.assumptions - for callee_arg_name, param_expr in arg_map.items(): if isinstance(callee_knl.arg_dict[callee_arg_name], ValueArg): @@ -352,17 +356,29 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): # }}} + # {{{ rename inames/temporaries in the program + + rule_mapping_context = SubstitutionRuleMappingContext(callee_knl.substitutions, + vng) + subst_func = make_subst_func({old_name: prim.Variable(new_name) + for old_name, new_name in name_map.items()}) + inames_temps_renamer = RuleAwareSubstitutionMapper(rule_mapping_context, + subst_func, + within=lambda *args: True) + + callee_knl = rule_mapping_context.finish_kernel(inames_temps_renamer + .map_kernel(callee_knl)) + + # }}} + # {{{ map callee's expressions to get expressions after inlining - rule_mapping_context = SubstitutionRuleMappingContext( - callee_knl.substitutions, vng) - smap = KernelInliner(rule_mapping_context, - make_subst_func({old_name: prim.Variable(new_name) - for old_name, new_name in name_map.items()}), - caller_knl, callee_knl, arg_map) + rule_mapping_context = SubstitutionRuleMappingContext(callee_knl.substitutions, + vng) + smap = KernelArgumentSubstitutor(rule_mapping_context, caller_knl, + callee_knl, arg_map) - callee_knl = 
rule_mapping_context.finish_kernel(smap.map_kernel( - callee_knl)) + callee_knl = rule_mapping_context.finish_kernel(smap.map_kernel(callee_knl)) # }}} @@ -397,18 +413,17 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): for insn in callee_knl.instructions: new_within_inames = (frozenset(name_map[iname] - for iname in insn.within_inames) - | call_insn.within_inames) + for iname in insn.within_inames) + | call_insn.within_inames) new_depends_on = (frozenset(insn_id_map[dep] for dep in insn.depends_on) - | {noop_start.id}) + | {noop_start.id}) new_no_sync_with = frozenset((insn_id_map[id], scope) - for id, scope in insn.no_sync_with) + for id, scope in insn.no_sync_with) new_id = insn_id_map[insn.id] if isinstance(insn, Assignment): - new_atomicity = tuple( - type(atomicity)(name_map[atomicity.var_name]) - for atomicity in insn.atomicity) + new_atomicity = tuple(type(atomicity)(name_map[atomicity.var_name]) + for atomicity in insn.atomicity) insn = insn.copy( id=insn_id_map[insn.id], within_inames=new_within_inames, @@ -444,18 +459,19 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): caller_knl.assumptions, new_assumptions) return caller_knl.copy(instructions=new_insns, - temporary_variables=new_temps, - domains=caller_knl.domains+new_domains, - assumptions=old_assumptions.params() & new_assumptions.params(), - iname_to_tags=new_iname_to_tags) + temporary_variables=new_temps, + domains=caller_knl.domains+new_domains, + assumptions=(old_assumptions.params() + & new_assumptions.params()), + inames=new_inames) # }}} # {{{ inline callable kernel -def _inline_single_callable_kernel(caller_kernel, callee_kernel, - callables_table): +@iterate_over_kernels_if_given_program +def _inline_single_callable_kernel(caller_kernel, callee_kernel): from loopy.symbolic import ResolvedFunction # sub-array refs might be removed during inlining @@ -466,16 +482,14 @@ def _inline_single_callable_kernel(caller_kernel, callee_kernel, if (isinstance(insn, CallInstruction) and isinstance(insn.expression.function, ResolvedFunction)): if insn.expression.function.name == callee_kernel.name: - caller_kernel = _inline_call_instruction( - caller_kernel, callee_kernel, insn) + caller_kernel = _inline_call_instruction(caller_kernel, + callee_kernel, insn) inames_to_remove |= insn.sub_array_ref_inames() elif isinstance(insn, (MultiAssignmentBase, CInstruction, - _DataObliviousInstruction)): + _DataObliviousInstruction)): pass else: - raise NotImplementedError( - "Unknown instruction type %s" - % type(insn).__name__) + raise NotImplementedError(type(insn)) from loopy.transform.iname import remove_unused_inames return remove_unused_inames(caller_kernel, inames_to_remove) @@ -483,33 +497,24 @@ def _inline_single_callable_kernel(caller_kernel, callee_kernel, # FIXME This should take a 'within' parameter to be able to only inline # *some* calls to a kernel, but not others. -def inline_callable_kernel(program, function_name): +def inline_callable_kernel(translation_unit, function_name): """ - Returns a copy of *kernel* with the callable kernel addressed by - (scoped) name *function_name* inlined. + Returns an copy of *translation_unit* with the callable kernel + named *function_name* inlined at all call-sites. 
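    A rough usage sketch (the callable name ``dble`` and the shapes/dtypes
    below are illustrative assumptions, not taken from this change)::

        import loopy as lp

        dble = lp.make_function(
                "{[i]: 0<=i<16}",
                "y[i] = 2*x[i]",
                name="dble")

        caller = lp.make_kernel(
                "{:}",
                "out[:] = dble(inp[:])",
                [lp.GlobalArg("out", shape=(16,), dtype=float),
                 lp.GlobalArg("inp", shape=(16,), dtype=float)])

        t_unit = lp.merge([caller, dble])

        # every call site of "dble" is now expanded inside its caller
        t_unit = lp.inline_callable_kernel(t_unit, "dble")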
""" from loopy.preprocess import infer_arg_descr from loopy.translation_unit import resolve_callables - program = resolve_callables(program) - program = infer_arg_descr(program) - callables_table = program.callables_table - new_callables = {} - callee = program[function_name] - - for func_id, in_knl_callable in callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - caller = in_knl_callable.subkernel - in_knl_callable = in_knl_callable.copy( - subkernel=_inline_single_callable_kernel(caller, - callee, program.callables_table)) - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError() - new_callables[func_id] = in_knl_callable + # {{{ must have argument shape information at call sites to inline + + translation_unit = resolve_callables(translation_unit) + translation_unit = infer_arg_descr(translation_unit) + + # }}} + + callee = translation_unit[function_name] - return program.copy(callables_table=new_callables) + return _inline_single_callable_kernel(translation_unit, callee) # }}} -- GitLab From 1f682a234b70ea3921155017ce56d946c4d73c62 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Apr 2021 23:47:51 -0500 Subject: [PATCH 824/916] adds an inlining test to check value arg mapping --- test/test_callables.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/test_callables.py b/test/test_callables.py index 8acdef425..afc1112de 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -809,6 +809,31 @@ def test_int_max_min_c_target(ctx_factory, which): np.testing.assert_allclose(np_func(arr1, arr2), out) +def test_valueargs_being_mapped_in_inling(ctx_factory): + doublify = lp.make_function( + "{[i]: 0<=i 1: exec(sys.argv[1]) -- GitLab From a2768cb96b36d6a9297596761a045c6977669eb5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <15399010+kaushikcfd@users.noreply.github.com> Date: Thu, 22 Apr 2021 23:52:57 -0500 Subject: [PATCH 825/916] fix minor typo --- loopy/transform/callable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index e964a24f9..218779537 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -499,7 +499,7 @@ def _inline_single_callable_kernel(caller_kernel, callee_kernel): # *some* calls to a kernel, but not others. def inline_callable_kernel(translation_unit, function_name): """ - Returns an copy of *translation_unit* with the callable kernel + Returns a copy of *translation_unit* with the callable kernel named *function_name* inlined at all call-sites. 
""" from loopy.preprocess import infer_arg_descr -- GitLab From a4d9bd84304826d45cb92a3ad986aed90bd8f9cb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 18:19:15 -0500 Subject: [PATCH 826/916] sharpens docstrings --- loopy/kernel/function_interface.py | 155 +++++++++++++++-------------- 1 file changed, 81 insertions(+), 74 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 81ae58343..dc3961b84 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -371,44 +371,57 @@ class InKernelCallable(ImmutableRecord): update_persistent_hash = update_persistent_hash - def with_types(self, arg_id_to_dtype, callables_table): + def with_types(self, arg_id_to_dtype, clbl_inf_ctx): """ - :arg arg_id_to_type: a mapping from argument identifiers - (integers for positional arguments, names for keyword - arguments) to :class:`loopy.types.LoopyType` instances. + :arg arg_id_to_type: a mapping from argument identifiers (integers for + positional arguments) to :class:`loopy.types.LoopyType` instances. Unspecified/unknown types are not represented in *arg_id_to_type*. - Return values are denoted by negative integers, with the - first returned value identified as *-1*. + Return values are denoted by negative integers, with the first + returned value identified as *-1*. + + :arg clbl_inf_ctx: An instance of + :class:`loopy.translation_unit.CallablesInferenceContext`. *clbl_inf_ctx* + provides the namespace of other callables contained within *self*. - :returns: a tuple ``(new_self, arg_id_to_type)``, where *new_self* is a - new :class:`InKernelCallable` specialized for the given types, - and *arg_id_to_type* is a mapping of the same form as the - argument above, however it may have more information present. - Any argument information exists both by its positional and - its keyword identifier. + :returns: a tuple ``(new_self, new_clbl_inf_ctx)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given types. + *new_clbl_inf_ctx* is *clbl_inf_ctx*'s updated state if the + type-specialization of *self* updated other calls contained within + it. + + .. note:: + + If then :class:`InKernelCallable` does not contain any + other callables within it, then *clbl_inf_ctx* is returned as is. """ raise NotImplementedError() - def with_descrs(self, arg_id_to_descr, callables_table): + def with_descrs(self, arg_id_to_descr, clbl_inf_ctx): """ :arg arg_id_to_descr: a mapping from argument identifiers (integers for - positional arguments, names for keyword arguments) to - :class:`ArrayArgDescriptor` instances. Unspecified/unknown - descriptors are not represented in *arg_id_to_descr*. + positional arguments) to instances of :class:`ArrayArgDescriptor` + or :class:`ValueArgDescriptor`. Unspecified/unknown descriptors are + not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the first + returned value identified as *-1*. - All the expressions in arg_id_to_descr must have variables that belong - to the callable's namespace. + :arg clbl_inf_ctx: An instance of + :class:`loopy.translation_unit.CallablesInferenceContext`. *clbl_inf_ctx* + provides the namespace of other callables contained within *self*. - Return values are denoted by negative integers, with the - first returned value identified as *-1*. + :returns: a tuple ``(new_self, new_clbl_inf_ctx)``, where *new_self* is a + new :class:`InKernelCallable` specialized for the given argument + descriptors. 
*new_clbl_inf_ctx* is the *clbl_inf_ctx*'s updated state + if descriptor-specialization of *self* updated other calls contained + within it. - :returns: a copy of *self* which is a new instance of - :class:`InKernelCallable` specialized for the given types, and - *arg_id_to_descr* is a mapping of the same form as the argument above, - however it may have more information present. Any argument information - exists both by its positional and its keyword identifier. + .. note:: + + If then :class:`InKernelCallable` does not contain any + other callables within it, then *clbl_inf_ctx* is returned as is. """ raise NotImplementedError() @@ -536,12 +549,11 @@ class ScalarCallable(InKernelCallable): raise LoopyError("No type inference information present for " "the function %s." % (self.name)) - def with_descrs(self, arg_id_to_descr, callables_table): + def with_descrs(self, arg_id_to_descr, clbl_inf_ctx): arg_id_to_descr[-1] = ValueArgDescriptor() - return ( - self.copy(arg_id_to_descr=arg_id_to_descr), - callables_table) + return (self.copy(arg_id_to_descr=arg_id_to_descr), + clbl_inf_ctx) def with_hw_axes_sizes(self, global_size, local_size): return self.copy() @@ -597,52 +609,51 @@ class ScalarCallable(InKernelCallable): *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` """ - - # Currently this is formulated such that the first argument is returned - # and rest all are passed by reference as arguments to the function. - assert self.is_ready_for_codegen() + from loopy.target.c import CFamilyTarget + if not isinstance(target, CFamilyTarget): + raise NotImplementedError() from loopy.kernel.instruction import CallInstruction + from loopy.expression import dtype_to_type_context + from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic import var assert isinstance(insn, CallInstruction) + assert self.is_ready_for_codegen() + ecm = expression_to_code_mapper parameters = insn.expression.parameters assignees = insn.assignees[1:] - par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in - parameters) - arg_dtypes = tuple(self.arg_id_to_dtype[i] for i, _ in - enumerate(parameters)) + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) + for par in parameters) + arg_dtypes = tuple(self.arg_id_to_dtype[i] + for i, _ in enumerate(parameters)) - assignee_dtypes = tuple(self.arg_id_to_dtype[-i-2] for i, _ in - enumerate(assignees)) + assignee_dtypes = tuple(self.arg_id_to_dtype[-i-2] + for i, _ in enumerate(assignees)) - from loopy.expression import dtype_to_type_context - from pymbolic.mapper.stringifier import PREC_NONE - from pymbolic import var - - c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, arg_dtypes)] + tgt_parameters = [ecm(par, PREC_NONE, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype).expr + for par, par_dtype, tgt_dtype in zip(parameters, + par_dtypes, + arg_dtypes)] for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)): if tgt_dtype != expression_to_code_mapper.infer_type(a): raise LoopyError("Type Mismatch in function %s. 
Expected: %s" "Got: %s" % (self.name, tgt_dtype, expression_to_code_mapper.infer_type(a))) - c_parameters.append( - var("&")( - expression_to_code_mapper(a, PREC_NONE, - dtype_to_type_context(target, tgt_dtype), - tgt_dtype).expr)) + tgt_parameters.append(var("&")(ecm(a, PREC_NONE, + dtype_to_type_context(target, + tgt_dtype), + tgt_dtype).expr)) # assignee is returned whenever the size of assignees is non zero. first_assignee_is_returned = len(insn.assignees) > 0 - return var(self.name_in_target)(*c_parameters), first_assignee_is_returned + return var(self.name_in_target)(*tgt_parameters), first_assignee_is_returned def generate_preambles(self, target): return @@ -745,7 +756,7 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=specialized_kernel, arg_id_to_dtype=new_arg_id_to_dtype), callables_table - def with_descrs(self, arg_id_to_descr, callables_table): + def with_descrs(self, arg_id_to_descr, clbl_inf_ctx): # arg_id_to_descr expressions provided are from the caller's namespace, # need to register @@ -797,9 +808,8 @@ class CallableKernel(InKernelCallable): subkernel = self.subkernel.copy(args=new_args) from loopy.preprocess import traverse_to_infer_arg_descr - subkernel, callables_table = ( - traverse_to_infer_arg_descr(subkernel, - callables_table)) + subkernel, clbl_inf_ctx = traverse_to_infer_arg_descr(subkernel, + clbl_inf_ctx) # {{{ update the arg descriptors @@ -820,7 +830,7 @@ class CallableKernel(InKernelCallable): return (self.copy(subkernel=subkernel, arg_id_to_descr=arg_id_to_descr), - callables_table) + clbl_inf_ctx) def with_added_arg(self, arg_dtype, arg_descr): var_name = self.subkernel.get_var_name_generator()(based_on="_lpy_arg") @@ -891,14 +901,17 @@ class CallableKernel(InKernelCallable): yield def emit_call_insn(self, insn, target, expression_to_code_mapper): - - assert self.is_ready_for_codegen() + from loopy.target.c import CFamilyTarget + if not isinstance(target, CFamilyTarget): + raise NotImplementedError() from loopy.kernel.instruction import CallInstruction from pymbolic.primitives import CallWithKwargs + assert self.is_ready_for_codegen() assert isinstance(insn, CallInstruction) + ecm = expression_to_code_mapper parameters = insn.expression.parameters kw_parameters = {} if isinstance(insn.expression, CallWithKwargs): @@ -927,20 +940,14 @@ class CallableKernel(InKernelCallable): # no type casting in array calls from loopy.expression import dtype_to_type_context from pymbolic.mapper.stringifier import PREC_NONE - from loopy.symbolic import SubArrayRef from pymbolic import var - c_parameters = [ - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr if isinstance(par, SubArrayRef) else - expression_to_code_mapper(par, PREC_NONE, - dtype_to_type_context(target, par_dtype), - par_dtype).expr - for par, par_dtype in zip( - parameters, par_dtypes)] - - return var(self.subkernel.name)(*c_parameters), False + tgt_parameters = [ecm(par, PREC_NONE, dtype_to_type_context(target, + par_dtype), + par_dtype).expr + for par, par_dtype in zip(parameters, par_dtypes)] + + return var(self.subkernel.name)(*tgt_parameters), False # }}} -- GitLab From 3fc01688eedbc52dfe3a1f5439eb1ae7ba3a6c0d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Apr 2021 02:05:24 -0500 Subject: [PATCH 827/916] CallablesInferenceContext: better implementation; docs --- doc/ref_translation_unit.rst | 5 + loopy/preprocess.py | 30 ++-- loopy/translation_unit.py | 312 ++++++++++++++++++----------------- 
loopy/type_inference.py | 9 +- 4 files changed, 185 insertions(+), 171 deletions(-) diff --git a/doc/ref_translation_unit.rst b/doc/ref_translation_unit.rst index 9d7c49158..631c57561 100644 --- a/doc/ref_translation_unit.rst +++ b/doc/ref_translation_unit.rst @@ -4,3 +4,8 @@ TranslationUnit =============== .. autoclass:: TranslationUnit + +Reference +--------- + +.. automodule:: loopy.translation_unit diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 7efeffa78..53ddcefe1 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2151,12 +2151,10 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): Infers the :attr:`loopy` """ - def __init__(self, rule_mapping_context, caller_kernel, - callables_table): - super().__init__( - rule_mapping_context) + def __init__(self, rule_mapping_context, caller_kernel, clbl_inf_ctx): + super().__init__(rule_mapping_context) self.caller_kernel = caller_kernel - self.callables_table = callables_table + self.clbl_inf_ctx = clbl_inf_ctx def map_call(self, expr, expn_state, assignees=None): from pymbolic.primitives import Call, CallWithKwargs, Variable @@ -2185,7 +2183,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): arg_id: get_arg_descriptor_for_expression( self.caller_kernel, arg) for arg_id, arg in arg_id_to_val.items()} - in_knl_callable = self.callables_table[expr.function.name] + in_knl_callable = self.clbl_inf_ctx[expr.function.name] # {{{ translating descriptor expressions to the callable's namespace @@ -2219,14 +2217,14 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): # }}} # specializing the function according to the parameter description - new_in_knl_callable, self.callables_table = ( + new_in_knl_callable, self.clbl_inf_ctx = ( in_knl_callable.with_descrs( - arg_id_to_descr, self.callables_table)) + arg_id_to_descr, self.clbl_inf_ctx)) # find the deps of the new in kernel callablen and add those arguments to - self.callables_table, new_func_id = ( - self.callables_table.with_callable( + self.clbl_inf_ctx, new_func_id = ( + self.clbl_inf_ctx.with_callable( expr.function.function, new_in_knl_callable)) @@ -2306,7 +2304,7 @@ def traverse_to_infer_arg_descr(kernel, callables_table): descr_inferred_kernel = rule_mapping_context.finish_kernel( arg_descr_inf_mapper.map_kernel(kernel)) - return descr_inferred_kernel, arg_descr_inf_mapper.callables_table + return descr_inferred_kernel, arg_descr_inf_mapper.clbl_inf_ctx def infer_arg_descr(program): @@ -2324,9 +2322,7 @@ def infer_arg_descr(program): program = resolve_callables(program) clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, - program.entrypoints) - - renamed_entrypoints = set() + program.entrypoints) for e in program.entrypoints: def _tuple_or_None(s): @@ -2350,10 +2346,10 @@ def infer_arg_descr(program): raise NotImplementedError() new_callable, clbl_inf_ctx = program.callables_table[e].with_descrs( arg_id_to_descr, clbl_inf_ctx) - clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) - renamed_entrypoints.add(new_name.name) + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable, + is_entrypoint=True) - return clbl_inf_ctx.finish_program(program, renamed_entrypoints) + return clbl_inf_ctx.finish_program(program) # }}} diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 14ed2d400..9ce69b4ae 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -44,10 +44,12 @@ from pyrsistent import pmap, PMap __doc__ = """ -.. currentmodule:: loopy +.. 
currentmodule:: loopy.translation_unit .. autoclass:: TranslationUnit +.. autoclass:: CallablesInferenceContext + .. autofunction:: make_program .. autofunction:: iterate_over_kernels_if_given_program @@ -396,10 +398,10 @@ class Program(TranslationUnit): # }}} -def next_indexed_function_identifier(function_id): +def next_indexed_function_id(function_id): """ Returns an instance of :class:`str` with the next indexed-name in the - sequence for the name of *function*. + sequence for the name of *function_id*. *Example:* ``'sin_0'`` will return ``'sin_1'``. @@ -462,9 +464,8 @@ def rename_resolved_functions_in_a_single_kernel(kernel, class CallablesIDCollector(CombineMapper): """ - Returns an instance of :class:`frozenset` containing instances of - :class:`loopy.kernel.function_interface.InKernelCallable` in the - :attr:``kernel`. + Mapper to collect function identifiers of all resolved callables in an + expression. """ def combine(self, values): import operator @@ -512,192 +513,207 @@ def _get_callable_ids_for_knl(knl, callables): def _get_callable_ids(callables, entrypoints): return frozenset().union(*( - _get_callable_ids_for_knl(callables[e].subkernel, callables) for e in - entrypoints)) + _get_callable_ids_for_knl(callables[e].subkernel, callables) + for e in entrypoints)) def make_clbl_inf_ctx(callables, entrypoints): - return CallablesInferenceContext(callables, _get_callable_ids(callables, - entrypoints)) + return CallablesInferenceContext(callables) class CallablesInferenceContext(ImmutableRecord): - def __init__(self, callables, old_callable_ids, history={}): + """ + Helper class for housekeeping a :attr:`loopy.TranslationUnit.callables_table` + while traversing through callables of :class:`loopy.TranslationUnit`. + + .. attribute:: callables + + A mapping from the callable names to instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + .. attribute:: renames + + A mapping from old function identifiers to a :class:`frozenset` of new + function identifiers. + + .. attribute:: new_entrypoints + + A :class:`frozenset` of renamed entrypoint names. + + .. automethod:: with_callable + + .. automethod:: finish_program + + .. automethod:: __getitem__ + """ + def __init__(self, callables, + renames=collections.defaultdict(frozenset), + new_entrypoints=frozenset()): assert isinstance(callables, collections.abc.Mapping) callables = dict(callables) - super().__init__( - callables=callables, - old_callable_ids=old_callable_ids, - history=history) + super().__init__(callables=callables, + renames=renames, + new_entrypoints=new_entrypoints) # {{{ interface to perform edits on callables - def with_callable(self, function, in_kernel_callable): + def with_callable(self, old_function_id, new_clbl, + is_entrypoint=False): """ - Returns an instance of :class:`tuple` ``(new_self, new_function)``. + Updates the callable referred by *function_id*'s in *self*'s namespace + to *new_clbl*. - :arg function: An instance of :class:`pymbolic.primitives.Variable` or + :arg old_function_id: An instance of :class:`pymbolic.primitives.Variable` or :class:`loopy.library.reduction.ReductionOpFunction`. - :arg in_kernel_callable: An instance of - :class:`loopy.InKernelCallable`. + :arg new_clbl: An instance of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + :returns: ``(new_self, new_function_id)`` is a copy of *self* with + *new_clbl* in its namespace. *new_clbl* would be referred by + *new_function_id* in *new_self*'s namespace. 
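        *Example (sketch):* if the namespace already contains a callable
        registered under ``"sin"`` and ``new_sin_clbl`` is a (hypothetical)
        specialized version of it, then::

            clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(
                    "sin", new_sin_clbl)

        Here *new_name* is ``Variable("sin")`` if *new_sin_clbl* is identical
        to the callable already stored under that name, and a freshly indexed
        name such as ``Variable("sin_0")`` otherwise.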
""" - # {{{ sanity checks + assert isinstance(old_function_id, (str, Variable, ReductionOpFunction)) + + if isinstance(old_function_id, Variable): + old_function_id = old_function_id.name + + renames = self.renames.copy() + + # if the callable already exists => return the function + # identifier corresponding to that callable. + for func_id, clbl in self.callables.items(): + if clbl == new_clbl: + renames[old_function_id] |= frozenset([func_id]) + if isinstance(func_id, str): + new_entrypoints = self.new_entrypoints + if is_entrypoint: + new_entrypoints |= frozenset([func_id]) + return (self.copy(renames=renames, + new_entrypoints=new_entrypoints), + Variable(func_id),) + else: + assert not is_entrypoint + assert isinstance(func_id, ReductionOpFunction) + return (self.copy(renames=renames), + func_id) + + # {{{ handle ReductionOpFunction + + if isinstance(old_function_id, ReductionOpFunction): + # FIXME: Check if we have 2 ArgMax functions + # with different types in the same kernel the generated code + # does not mess up the types. + assert not is_entrypoint + unique_function_id = old_function_id.copy() + updated_callables = self.callables.copy() + updated_callables[unique_function_id] = new_clbl + renames[old_function_id] |= frozenset([unique_function_id]) + + return (self.copy(callables=updated_callables, + renames=renames), + unique_function_id) - if isinstance(function, str): - function = Variable(function) + # }}} - assert isinstance(function, (Variable, ReductionOpFunction)) + # {{{ must allocate a new clbl in the namespace => find a unique id for it - # }}} + unique_function_id = old_function_id - history = self.history.copy() - - if in_kernel_callable in self.callables.values(): - # the callable already exists, hence return the function - # identifier corresponding to that callable. - for func_id, in_knl_callable in self.callables.items(): - if in_knl_callable == in_kernel_callable: - history[func_id] = function.name - if isinstance(func_id, str): - return ( - self.copy( - history=history), - Variable(func_id)) - else: - assert isinstance(func_id, ReductionOpFunction) - return ( - self.copy( - history=history), - func_id) - - assert False - else: - # {{{ handle ReductionOpFunction + while unique_function_id in self.callables: + unique_function_id = next_indexed_function_id(unique_function_id) - if isinstance(function, ReductionOpFunction): - # FIXME: Check if we have 2 ArgMax functions - # with different types in the same kernel the generated code - # does not mess up the types. - unique_function_identifier = function.copy() - updated_callables = self.callables.copy() - updated_callables[unique_function_identifier] = ( - in_kernel_callable) + # }}} - return ( - self.copy( - callables=updated_callables), - unique_function_identifier) + updated_callables = self.callables.copy() + updated_callables[unique_function_id] = new_clbl + renames[old_function_id] |= frozenset([unique_function_id]) - # }}} + new_entrypoints = self.new_entrypoints + if is_entrypoint: + new_entrypoints |= frozenset([unique_function_id]) - unique_function_identifier = function.name + return (self.copy(renames=renames, + callables=updated_callables, + new_entrypoints=new_entrypoints), + Variable(unique_function_id)) - while unique_function_identifier in self.callables: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + def finish_program(self, program): + """ + Returns a copy of *program* with rollback renaming of the callables + done whenever possible. 
- updated_callables = self.callables.copy() - updated_callables[unique_function_identifier] = ( - in_kernel_callable) + For example: If all the ``sin`` function ids got diverged as + ``sin_0``, ``sin_1``, then all the renaming is done such that one of + flavors of the callable is renamed back to ``sin``. + """ + # FIXME: Generalize this if an inference happens over a proper subgraph + # of the callgraph (the following assert should be removed) + assert len(self.new_entrypoints) == len(program.entrypoints) - history[unique_function_identifier] = function.name + # {{{ get all the callables reachable from the new entrypoints. - return ( - self.copy( - history=history, - callables=updated_callables), - Variable(unique_function_identifier)) + # get the names of all callables reachable from the new entrypoints + new_callable_ids = _get_callable_ids(self.callables, self.new_entrypoints) - def finish_program(self, program, renamed_entrypoints): - """ - Returns a copy of *program* with renaming of the callables done whenever - needed. + # get the history of function ids from the performed renames: + history = {} + for old_func_id, new_func_ids in self.renames.items(): + for new_func_id in new_func_ids: + if new_func_id in (new_callable_ids | self.new_entrypoints): + history[new_func_id] = old_func_id - *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``, - then all the renaming is done such that one of flavors of the callable - is renamed back to ``sin``. + # }}} - :param renamed_entrypoints: A :class:`frozenset` of the names of the - renamed callable kernels which correspond to the entrypoints in - *self.callables_table*. - """ - assert len(renamed_entrypoints) == len(program.entrypoints) - new_callable_ids = _get_callable_ids(self.callables, renamed_entrypoints) + # AIM: Preserve the entrypoints of *program* - callees_with_entrypoint_names = (program.entrypoints & - new_callable_ids) - renamed_entrypoints + # If there are any callees having old entrypoint names => mark them for + # renaming + callees_with_old_entrypoint_names = ((program.entrypoints & new_callable_ids) + - self.new_entrypoints) - renames = {} + todo_renames = {} new_callables = {} - for c in callees_with_entrypoint_names: - unique_function_identifier = c + for c in callees_with_old_entrypoint_names: + unique_func_id = c - while unique_function_identifier in self.callables: - unique_function_identifier = ( - next_indexed_function_identifier( - unique_function_identifier)) + while unique_func_id in self.callables: + unique_func_id = next_indexed_function_id(unique_func_id) - renames[c] = unique_function_identifier + todo_renames[c] = unique_func_id - # we should perform a rewrite here. 
+ for e in self.new_entrypoints: + # note renames to "rollback" the renaming of entrypoints + todo_renames[e] = history[e] + assert todo_renames[e] in program.entrypoints - for e in renamed_entrypoints: - renames[e] = self.history[e] - assert renames[e] in program.entrypoints + # try to rollback the names as much as possible + for new_id in new_callable_ids: + old_func_id = history[new_id] + if (isinstance(old_func_id, str) + and old_func_id not in set(todo_renames.values())): + todo_renames[new_id] = old_func_id - # {{{ calculate the renames needed + # {{{ perform the renames form todo_renames - for old_func_id in ((self.old_callable_ids-new_callable_ids) - - program.entrypoints): - # at this point we should not rename anything to the names of - # entrypoints - for new_func_id in (new_callable_ids-renames.keys()) & set( - self.history.keys()): - if old_func_id == self.history[new_func_id]: - renames[new_func_id] = old_func_id - break - # }}} + for func_id in (new_callable_ids | self.new_entrypoints): + clbl = self.callables[func_id] + if func_id in todo_renames: + assert history[func_id] == todo_renames[func_id] + func_id = todo_renames[func_id] + if isinstance(clbl, CallableKernel): + subknl = clbl.subkernel.copy(name=func_id) + subknl = rename_resolved_functions_in_a_single_kernel(subknl, + todo_renames) - for e in renamed_entrypoints: - new_subkernel = self.callables[e].subkernel.copy(name=self.history[e]) - new_subkernel = rename_resolved_functions_in_a_single_kernel( - new_subkernel, renames) - new_callables[self.history[e]] = self.callables[e].copy( - subkernel=new_subkernel) - - for func_id in new_callable_ids-renamed_entrypoints: - in_knl_callable = self.callables[func_id] - if isinstance(in_knl_callable, CallableKernel): - # if callable kernel, perform renames inside its expressions. - old_subkernel = in_knl_callable.subkernel - new_subkernel = rename_resolved_functions_in_a_single_kernel( - old_subkernel, renames) - in_knl_callable = ( - in_knl_callable.copy(subkernel=new_subkernel)) - elif isinstance(in_knl_callable, ScalarCallable): - pass - else: - raise NotImplementedError("Unknown callable type %s." 
% - type(in_knl_callable).__name__) + clbl = clbl.copy(subkernel=subknl) - if func_id in renames: - new_func_id = renames[func_id] - if isinstance(in_knl_callable, CallableKernel): - in_knl_callable = (in_knl_callable.copy( - subkernel=in_knl_callable.subkernel.copy( - name=new_func_id))) - new_callables[new_func_id] = in_knl_callable - else: - if isinstance(in_knl_callable, CallableKernel): - in_knl_callable = in_knl_callable.copy( - subkernel=in_knl_callable.subkernel.copy( - name=func_id)) - new_callables[func_id] = in_knl_callable + new_callables[func_id] = clbl + + # }}} return program.copy(callables_table=new_callables) @@ -775,7 +791,7 @@ def update_table(callables_table, clbl_id, clbl): return i, callables_table while clbl_id in callables_table: - clbl_id = next_indexed_function_identifier(clbl_id) + clbl_id = next_indexed_function_id(clbl_id) callables_table[clbl_id] = clbl diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 24df0ea15..36d5408a0 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1018,17 +1018,14 @@ def infer_unknown_types(program, expect_completion=False): clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, program.entrypoints) - renamed_entrypoints = set() - for e in program.entrypoints: logger.debug(f"Entering entrypoint: {e}") arg_id_to_dtype = {arg.name: arg.dtype for arg in program[e].args if arg.dtype not in (None, auto)} new_callable, clbl_inf_ctx = program.callables_table[e].with_types( arg_id_to_dtype, clbl_inf_ctx) - clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) - renamed_entrypoints.add(new_name.name) - + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable, + is_entrypoint=True) if expect_completion: from loopy.types import LoopyType new_knl = new_callable.subkernel @@ -1048,7 +1045,7 @@ def infer_unknown_types(program, expect_completion=False): raise LoopyError("could not determine type of" f" '{vars_not_inferred.pop()}' of kernel '{e}'.") - return clbl_inf_ctx.finish_program(program, renamed_entrypoints) + return clbl_inf_ctx.finish_program(program) # }}} -- GitLab From 8b78e2c956ee3482547a04cacd18a94e02b7ccdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Fri, 23 Apr 2021 00:51:15 -0500 Subject: [PATCH 828/916] Grammar fixes for InKernelCallable/CallablesInferenceContext docs --- loopy/kernel/function_interface.py | 4 ++-- loopy/translation_unit.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index dc3961b84..ce8dcf6eb 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -392,7 +392,7 @@ class InKernelCallable(ImmutableRecord): .. note:: - If then :class:`InKernelCallable` does not contain any + If the :class:`InKernelCallable` does not contain any other callables within it, then *clbl_inf_ctx* is returned as is. """ @@ -420,7 +420,7 @@ class InKernelCallable(ImmutableRecord): .. note:: - If then :class:`InKernelCallable` does not contain any + If the :class:`InKernelCallable` does not contain any other callables within it, then *clbl_inf_ctx* is returned as is. """ diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 9ce69b4ae..ea803afc7 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -645,9 +645,9 @@ class CallablesInferenceContext(ImmutableRecord): Returns a copy of *program* with rollback renaming of the callables done whenever possible. 
- For example: If all the ``sin`` function ids got diverged as + For example: If all the ``sin`` function ids diverged as ``sin_0``, ``sin_1``, then all the renaming is done such that one of - flavors of the callable is renamed back to ``sin``. + the flavors of the callable is renamed back to ``sin``. """ # FIXME: Generalize this if an inference happens over a proper subgraph # of the callgraph (the following assert should be removed) -- GitLab From ad9d296f1d17cd65bba837bc733453f91a102480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Fri, 23 Apr 2021 00:52:05 -0500 Subject: [PATCH 829/916] Doc fix: Expose TranslationUnit in loopy namespace --- loopy/translation_unit.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index ea803afc7..549b3c78c 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -44,10 +44,12 @@ from pyrsistent import pmap, PMap __doc__ = """ -.. currentmodule:: loopy.translation_unit +.. currentmodule:: loopy .. autoclass:: TranslationUnit +.. currentmodule:: loopy.translation_unit + .. autoclass:: CallablesInferenceContext .. autofunction:: make_program -- GitLab From 95ea7d49f53b5ad2a33defda96bd0aea47710a24 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 23 Apr 2021 01:09:29 -0500 Subject: [PATCH 830/916] Drop duplicate doc mention of TranslationUnit --- loopy/translation_unit.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 549b3c78c..3b158e1cf 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -43,11 +43,6 @@ from functools import reduce from pyrsistent import pmap, PMap __doc__ = """ - -.. currentmodule:: loopy - -.. autoclass:: TranslationUnit - .. currentmodule:: loopy.translation_unit .. 
autoclass:: CallablesInferenceContext -- GitLab From 132c640f5ffff9dc80aef487e0f3ebfb53768dfe Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 23 Apr 2021 01:10:14 -0500 Subject: [PATCH 831/916] Import pytools intersphinx inventory --- doc/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/conf.py b/doc/conf.py index 4c6d28c9d..9e45e0764 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -111,6 +111,7 @@ man_pages = [ intersphinx_mapping = { "https://docs.python.org/3": None, "https://numpy.org/doc/stable/": None, + "https://documen.tician.de/pytools": None, "https://documen.tician.de/islpy": None, "https://documen.tician.de/pyopencl": None, "https://documen.tician.de/cgen": None, -- GitLab From 43754daee7ebd2f2fb3d029138cdd188508d67c9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <15399010+kaushikcfd@users.noreply.github.com> Date: Fri, 23 Apr 2021 02:59:03 -0500 Subject: [PATCH 832/916] [callables] Adds :attr:`InKernelCallable.name` (#331) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * adds :attr:`InKernelCallable.name` * Fix grammar in ScalarCallable docstring Co-authored-by: Andreas Klöckner --- loopy/kernel/function_interface.py | 74 +++++++++++++----------------- loopy/library/random123.py | 13 +++--- 2 files changed, 39 insertions(+), 48 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index ce8dcf6eb..95a2d3d55 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -357,17 +357,13 @@ class InKernelCallable(ImmutableRecord): """ - fields = {"arg_id_to_dtype", "arg_id_to_descr"} - init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") + hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr") - def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): + def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None): - super().__init__( - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr) - - def __getinitargs__(self): - return (self.arg_id_to_dtype, self.arg_id_to_descr) + super().__init__(name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) update_persistent_hash = update_persistent_hash @@ -497,8 +493,7 @@ class InKernelCallable(ImmutableRecord): raise NotImplementedError() def __hash__(self): - - return hash(tuple(self.fields)) + return hash(self.hash_fields) def with_added_arg(self, arg_dtype, arg_descr): """ @@ -516,6 +511,13 @@ class ScalarCallable(InKernelCallable): """ An abstract interface the to a scalar callable encountered in a kernel. + .. attribute:: name_in_target + + A :class:`str` to denote the name of the function in a + :class:`loopy.target.TargetBase` for which the callable is specialized. + *None* if the callable is not specialized enough to know its name + in target. + .. automethod:: with_types .. automethod:: with_descrs @@ -525,26 +527,16 @@ class ScalarCallable(InKernelCallable): The :meth:`ScalarCallable.with_types` is intended to assist with type specialization of the function and sub-classes must define it. 
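    A bare-bones sketch of such a subclass (the function ``my_exp`` and its
    target-level name are invented for illustration)::

        class MyExpCallable(ScalarCallable):
            def with_types(self, arg_id_to_dtype, clbl_inf_ctx):
                if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
                    # not enough type information yet to specialize
                    return (self.copy(arg_id_to_dtype=arg_id_to_dtype),
                            clbl_inf_ctx)

                dtype = arg_id_to_dtype[0]
                return (self.copy(name_in_target="my_exp",
                                  arg_id_to_dtype={0: dtype, -1: dtype}),
                        clbl_inf_ctx)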
""" - fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"} - init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", - "name_in_target") - hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") + hash_fields = InKernelCallable.hash_fields + ("name_in_target",) def __init__(self, name, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None): - - super().__init__( - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr) - - self.name = name + arg_id_to_descr=None, name_in_target=None): + super().__init__(name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) self.name_in_target = name_in_target - def __getinitargs__(self): - return (self.arg_id_to_dtype, self.arg_id_to_descr, - self.name_in_target) - def with_types(self, arg_id_to_dtype, callables_table): raise LoopyError("No type inference information present for " "the function %s." % (self.name)) @@ -695,26 +687,26 @@ class CallableKernel(InKernelCallable): """ fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"} - init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") hash_fields = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") def __init__(self, subkernel, arg_id_to_dtype=None, - arg_id_to_descr=None): + arg_id_to_descr=None): assert isinstance(subkernel, LoopKernel) - - super().__init__( - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr) - + super().__init__(name=subkernel.name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) self.subkernel = subkernel - def __getinitargs__(self): - return (self.subkernel, self.arg_id_to_dtype, - self.arg_id_to_descr) + def copy(self, subkernel=None, arg_id_to_dtype=None, + arg_id_to_descr=None): + if subkernel is None: + subkernel = self.subkernel + if arg_id_to_descr is None: + arg_id_to_descr = self.arg_id_to_descr + if arg_id_to_dtype is None: + arg_id_to_dtype = self.arg_id_to_dtype - @property - def name(self): - return self.subkernel.name + return CallableKernel(subkernel, arg_id_to_dtype, arg_id_to_descr) def with_types(self, arg_id_to_dtype, callables_table): kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) @@ -889,8 +881,8 @@ class CallableKernel(InKernelCallable): GridOverrideForCalleeKernel(gsize, lsize)))) def is_ready_for_codegen(self): - return (self.arg_id_to_dtype is not None and - self.arg_id_to_descr is not None) + return (self.arg_id_to_dtype is not None + and self.arg_id_to_descr is not None) def generate_preambles(self, target): """ Yields the *target* specific preambles. diff --git a/loopy/library/random123.py b/loopy/library/random123.py index 2d4f82205..8978f4419 100644 --- a/loopy/library/random123.py +++ b/loopy/library/random123.py @@ -168,15 +168,14 @@ class Random123Callable(ScalarCallable): Records information about for the random123 functions. 
""" fields = ScalarCallable.fields | {"target"} + hash_fields = ScalarCallable.hash_fields + ("target",) def __init__(self, name, arg_id_to_dtype=None, - arg_id_to_descr=None, name_in_target=None, target=None): - - super().__init__( - name=name, - arg_id_to_dtype=arg_id_to_dtype, - arg_id_to_descr=arg_id_to_descr, - name_in_target=name_in_target) + arg_id_to_descr=None, name_in_target=None, target=None): + super().__init__(name=name, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr, + name_in_target=name_in_target) self.target = target -- GitLab From f70214252addc5f85cf60b3758ffd7402bb9b942 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 23 Apr 2021 09:57:47 -0500 Subject: [PATCH 833/916] Drop Firedrake-specific automatic reshaping business --- loopy/transform/callable.py | 188 +----------------------------------- test/test_callables.py | 48 --------- 2 files changed, 1 insertion(+), 235 deletions(-) diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 5d88a78be..dafbe076e 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -29,9 +29,7 @@ from loopy.diagnostic import LoopyError from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, Assignment, CInstruction, _DataObliviousInstruction) from loopy.symbolic import ( - RuleAwareSubstitutionMapper, - SubstitutionRuleMappingContext, CombineMapper, IdentityMapper) -from loopy.isl_helpers import simplify_via_aff + RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.translation_unit import TranslationUnit @@ -514,190 +512,6 @@ def inline_callable_kernel(program, function_name): # }}} -# {{{ tools to match caller to callee args by (guessed) automatic reshaping - -# (This is undocumented and not recommended, but it is currently needed -# to support Firedrake.) - -class DimChanger(IdentityMapper): - """ - Mapper to change the dimensions of an argument. - - .. attribute:: callee_arg_dict - - A mapping from the argument name (:class:`str`) to instances of - :class:`loopy.kernel.array.ArrayBase`. - - .. attribute:: desried_shape - - A mapping from argument name (:class:`str`) to an instance of - :class:`tuple`. - """ - def __init__(self, callee_arg_dict, desired_shape): - self.callee_arg_dict = callee_arg_dict - self.desired_shape = desired_shape - - def map_subscript(self, expr): - if expr.aggregate.name not in self.callee_arg_dict: - return super().map_subscript(expr) - callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags - flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in - zip(callee_arg_dim_tags, expr.index_tuple)) - new_indices = [] - - from operator import mul - from functools import reduce - stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) - - for length in self.desired_shape[expr.aggregate.name]: - stride /= length - ind = flattened_index // int(stride) - flattened_index -= (int(stride) * ind) - new_indices.append(simplify_via_aff(ind)) - - return expr.aggregate.index(tuple(new_indices)) - - -def _match_caller_callee_argument_dimension_for_single_kernel( - caller_knl, callee_knl): - """ - :returns: a copy of *caller_knl* with the instance of - :class:`loopy.kernel.function_interface.CallableKernel` addressed by - *callee_function_name* in the *caller_knl* aligned with the argument - dimensions required by *caller_knl*. 
- """ - from loopy.kernel.array import ArrayBase - from loopy.kernel.data import auto - - for insn in caller_knl.instructions: - if not isinstance(insn, CallInstruction) or ( - insn.expression.function.name != - callee_knl.name): - # Call to a callable kernel can only occur through a - # CallInstruction. - continue - - def _shape_1_if_empty(shape): - assert isinstance(shape, tuple) - if shape == (): - return (1, ) - else: - return shape - - from loopy.kernel.function_interface import ( - ArrayArgDescriptor, get_arg_descriptor_for_expression, - get_kw_pos_association) - _, pos_to_kw = get_kw_pos_association(callee_knl) - arg_id_to_shape = {} - for arg_id, arg in insn.arg_id_to_val().items(): - arg_id = pos_to_kw[arg_id] - - arg_descr = get_arg_descriptor_for_expression(caller_knl, arg) - if isinstance(arg_descr, ArrayArgDescriptor): - arg_id_to_shape[arg_id] = _shape_1_if_empty(arg_descr.shape) - else: - arg_id_to_shape[arg_id] = (1, ) - - dim_changer = DimChanger( - callee_knl.arg_dict, - arg_id_to_shape) - - new_callee_insns = [] - for callee_insn in callee_knl.instructions: - if isinstance(callee_insn, MultiAssignmentBase): - new_callee_insns.append(callee_insn - .with_transformed_expressions(dim_changer)) - - elif isinstance(callee_insn, (CInstruction, - _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknown instruction %s." % - type(insn)) - - new_args = [arg if not isinstance(arg, ArrayBase) - else arg.copy(shape=arg_id_to_shape[arg.name], - dim_tags=None, strides=auto, order="C") - for arg in callee_knl.args] - - # subkernel with instructions adjusted according to the new dimensions - new_callee_knl = callee_knl.copy(instructions=new_callee_insns, - args=new_args) - - return new_callee_knl - - -class _FunctionCalledChecker(CombineMapper): - def __init__(self, func_name): - self.func_name = func_name - - def combine(self, values): - return any(values) - - def map_call(self, expr): - if expr.function.name == self.func_name: - return True - return self.combine( - tuple( - self.rec(child) for child in expr.parameters) - ) - - map_call_with_kwargs = map_call - - def map_constant(self, expr): - return False - - def map_algebraic_leaf(self, expr): - return False - - def map_kernel(self, kernel): - return any(self.rec(insn.expression) for insn in kernel.instructions if - isinstance(insn, MultiAssignmentBase)) - - -def _match_caller_callee_argument_dimension_(program, callee_function_name): - """ - Returns a copy of *program* with the instance of - :class:`loopy.kernel.function_interface.CallableKernel` addressed by - *callee_function_name* in the *program* aligned with the argument - dimensions required by *caller_knl*. - - .. note:: - - The callee kernel addressed by *callee_function_name*, should be - called at only one location throughout the program, as multiple - invocations would demand complex renaming logic which is not - implemented yet. 
- """ - - # {{{ sanity checks - - assert isinstance(program, TranslationUnit) - assert isinstance(callee_function_name, str) - assert callee_function_name not in program.entrypoints - assert callee_function_name in program.callables_table - - # }}} - - is_invoking_callee = _FunctionCalledChecker( - callee_function_name).map_kernel - - caller_knl, = [in_knl_callable.subkernel for in_knl_callable in - program.callables_table.values() if isinstance(in_knl_callable, - CallableKernel) and - is_invoking_callee(in_knl_callable.subkernel)] - - from pymbolic.primitives import Call - assert len([insn for insn in caller_knl.instructions if (isinstance(insn, - CallInstruction) and isinstance(insn.expression, Call) and - insn.expression.function.name == callee_function_name)]) == 1 - new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel( - caller_knl, program[callee_function_name]) - return program.with_kernel(new_callee_kernel) - -# }}} - - def rename_callable(program, old_name, new_name=None, existing_ok=False): """ :arg program: An instance of :class:`loopy.TranslationUnit` diff --git a/test/test_callables.py b/test/test_callables.py index 8acdef425..5476960ce 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -391,54 +391,6 @@ def test_packing_unpacking(ctx_factory, inline): 3*x2.get()) < 1e-15 -@pytest.mark.parametrize("inline", [False, True]) -def test_non_sub_array_refs_arguments(ctx_factory, inline): - from loopy.transform.callable import _match_caller_callee_argument_dimension_ - ctx = ctx_factory() - - callee = lp.make_function("{[i] : 0 <= i < 6}", "a[i] = a[i] + j", - [lp.GlobalArg("a", dtype="double", shape=(6,), is_output=True, - is_input=True), - lp.ValueArg("j", dtype="int")], name="callee") - caller1 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:] = callee(a[:], b[0])", - [lp.GlobalArg("a", dtype="double", shape=(6, ), is_output=False), - lp.GlobalArg("b", dtype="double", shape=(1, ), is_output=False)], - name="caller") - - caller2 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], 3.1415926)", - [lp.GlobalArg("a", dtype="double", shape=(6, ), - is_output=False)], - name="caller") - - caller3 = lp.make_kernel("{[j] : 0 <= j < 2}", "a[:]=callee(a[:], kappa)", - [lp.GlobalArg("a", dtype="double", shape=(6, ), - is_output=False), - lp.ValueArg("kappa", dtype=np.float64), ...], - name="caller") - - registered = lp.merge([caller1, callee]) - knl = _match_caller_callee_argument_dimension_(registered, "callee") - - if inline: - knl = lp.inline_callable_kernel(knl, "callee") - - lp.auto_test_vs_ref(knl, ctx) - - registered = lp.merge([caller2, callee]) - knl = _match_caller_callee_argument_dimension_(registered, "callee") - if inline: - knl = lp.inline_callable_kernel(knl, "callee") - - lp.auto_test_vs_ref(knl, ctx) - - registered = lp.merge([caller3, callee]) - knl = _match_caller_callee_argument_dimension_(registered, "callee") - if inline: - knl = lp.inline_callable_kernel(knl, "callee") - - lp.auto_test_vs_ref(knl, ctx, parameters={"kappa": 42.0}) - - @pytest.mark.parametrize("inline", [False, True]) def test_empty_sub_array_refs(ctx_factory, inline): # See: https://github.com/OP2/PyOP2/pull/559#discussion_r272208618 -- GitLab From c6baed2ea915614122fb7984b7de14870710f2b7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 22 Apr 2021 17:40:07 -0500 Subject: [PATCH 834/916] [codegen]: generate a reproducible order of function definitions --- loopy/codegen/__init__.py | 87 ++++++++++++++++++--------------------- 1 file changed, 41 
insertions(+), 46 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index f467a5dc4..008b95bab 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -605,37 +605,34 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, def diverge_callee_entrypoints(program): """ - If a kernel is both an entrypoint and a callee, then rename the callee. + If a :class:`loopy.kernel.function_interface.CallableKernel` is both an + entrypoint and a callee, then rename the callee. """ - from loopy.translation_unit import _get_callable_ids + from loopy.translation_unit import (_get_callable_ids, + rename_resolved_functions_in_a_single_kernel) from pytools import UniqueNameGenerator callable_ids = _get_callable_ids(program.callables_table, - program.entrypoints) + program.entrypoints) new_callables = {} - renames = {} + todo_renames = {} vng = UniqueNameGenerator(set(program.callables_table.keys())) for clbl_id in callable_ids & program.entrypoints: - renames[clbl_id] = vng(based_on=clbl_id) + todo_renames[clbl_id] = vng(based_on=clbl_id) for name, clbl in program.callables_table.items(): + if name in todo_renames: + name = todo_renames[name] + if isinstance(clbl, CallableKernel): - from loopy.translation_unit import ( - rename_resolved_functions_in_a_single_kernel) - knl = rename_resolved_functions_in_a_single_kernel( - clbl.subkernel, renames) - new_callables[name] = clbl.copy(subkernel=knl) - elif isinstance(clbl, ScalarCallable): - new_callables[name] = clbl - else: - raise NotImplementedError(type(clbl)) + knl = rename_resolved_functions_in_a_single_kernel(clbl.subkernel, + todo_renames) + knl = knl.copy(name=name) + clbl = clbl.copy(subkernel=knl) - for clbl_id in callable_ids & program.entrypoints: - knl = new_callables[clbl_id].subkernel.copy(name=renames[clbl_id]) - new_callables[renames[clbl_id]] = new_callables[clbl_id].copy( - subkernel=knl) + new_callables[name] = clbl return program.copy(callables_table=new_callables) @@ -769,34 +766,32 @@ def generate_code_v2(program): callee_fdecls = [] implemented_data_infos = {} - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - #FIXME: - # 1. Diverge the kernels which are both entrypoint and callees at this - # point. By diverge we should rename the callees in kernels. - # 2. 
Then pass the callee versions by saying is_entrypoint=False - cgr = generate_code_for_a_single_kernel(in_knl_callable.subkernel, - program.callables_table, program.target, func_id in - program.entrypoints) - if func_id in program.entrypoints: - host_programs[func_id] = cgr.host_program - implemented_data_infos[func_id] = cgr.implemented_data_info - else: - # FIXME: This assertion should be valid - # assert cgr.host_programs == [] - assert len(cgr.device_programs) == 1 - #FIXME: - # if isinstance(callee_prog_ast, Collection): - # for entry in callee_prog_ast.contents: - # if isinstance(entry, FunctionBody): - # callee_fdecls.append(entry.fdecl) - callee_fdecls.append(cgr.device_programs[0].ast.fdecl) - - device_programs.extend(cgr.device_programs) - device_preambles.extend(cgr.device_preambles) - - device_preambles.extend(list(in_knl_callable.generate_preambles( - program.target))) + # {{{ collect host/device programs + + for func_id in sorted(key for key, val in program.callables_table.items() + if isinstance(val, CallableKernel)): + cgr = generate_code_for_a_single_kernel(program[func_id], + program.callables_table, + program.target, + func_id in program.entrypoints) + if func_id in program.entrypoints: + host_programs[func_id] = cgr.host_program + implemented_data_infos[func_id] = cgr.implemented_data_info + else: + assert len(cgr.device_programs) == 1 + callee_fdecls.append(cgr.device_programs[0].ast.fdecl) + + device_programs.extend(cgr.device_programs) + device_preambles.extend(cgr.device_preambles) + + # }}} + + # {{{ collect preambles + + for func_id, clbl in program.callables_table.items(): + device_preambles.extend(list(clbl.generate_preambles(program.target))) + + # }}} # adding the callee fdecls to the device_programs device_programs = ([device_programs[0].copy( -- GitLab From 73e07e506ae3a35b5f62b2791932bba41e2b3ef4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 23 Apr 2021 17:56:05 -0500 Subject: [PATCH 835/916] moderning test_kernel_tagging --- test/test_loopy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_loopy.py b/test/test_loopy.py index 41e2af213..97612a9dc 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -3110,6 +3110,7 @@ def test_kernel_tagging(): "{:}", "y = 0", tags=frozenset((t1, t2))) + knl1 = knl1.default_entrypoint assert knl1.tags == frozenset((t1, t2)) -- GitLab From 4206a8ad8c7f8c3c6544e259bc42dadd04e49142 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 21 Apr 2021 17:14:14 -0500 Subject: [PATCH 836/916] remove all references to CallWithKwargs * needs more thinking --- loopy/check.py | 31 +++++----------- loopy/kernel/creation.py | 13 ++----- loopy/kernel/function_interface.py | 17 +-------- loopy/kernel/instruction.py | 19 ++++------ loopy/preprocess.py | 59 ++++++++++-------------------- loopy/statistics.py | 4 -- loopy/symbolic.py | 18 +-------- loopy/transform/callable.py | 10 ----- loopy/translation_unit.py | 23 +++--------- loopy/type_inference.py | 37 ++----------------- test/test_callables.py | 51 -------------------------- 11 files changed, 51 insertions(+), 231 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 4656abbd0..4aea9fae3 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -91,13 +91,12 @@ def check_identifiers_in_subst_rules(knl): ", ".join(deps-rule_allowed_identifiers))) -class UnscopedCallCollector(CombineMapper): +class UnresolvedCallCollector(CombineMapper): """ - Collects all the unscoped calls within a kernel. + Collects all the unresolved calls within a kernel. 
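    *Example (sketch):* for an instruction expression ``f(x[i]) + sin(y[i])``
    in which ``sin`` has already been turned into a
    :class:`loopy.symbolic.ResolvedFunction` (e.g. by ``resolve_callables``)
    while ``f`` has not, this mapper returns ``frozenset({"f"})``.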
:returns: - An :class:`frozenset` of function names that are not scoped in - the kernel. + An :class:`frozenset` of function names that are not resolved. """ def combine(self, values): @@ -105,19 +104,10 @@ class UnscopedCallCollector(CombineMapper): return reduce(operator.or_, values, frozenset()) def map_call(self, expr): - from pymbolic.primitives import CallWithKwargs - return self.rec(CallWithKwargs( - function=expr.function, parameters=expr.parameters, - kw_parameters={})) - - def map_call_with_kwargs(self, expr): if not isinstance(expr.function, ResolvedFunction): - return (frozenset([expr.function.name]) | - self.combine(self.rec(child) for child in expr.parameters - + tuple(expr.kw_parameters.values()))) + return frozenset([expr.function.name]) | self.rec(expr.parameters) else: - return self.combine(self.rec(child) for child in - expr.parameters+tuple(expr.kw_parameters.values())) + return self.rec(expr.parameters) def map_constant(self, expr): return frozenset() @@ -137,17 +127,16 @@ def check_functions_are_resolved(kernel): for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): - unscoped_calls = UnscopedCallCollector()(subst_expander( - insn.expression)) - if unscoped_calls: + unresolved_calls = UnresolvedCallCollector()(subst_expander(insn + .expression)) + if unresolved_calls: raise LoopyError("Unknown function '%s' -- register a " "callable corresponding to it." % - set(unscoped_calls).pop()) + set(unresolved_calls).pop()) elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): pass else: - raise NotImplementedError( - "Unsupported instruction type %s." % type(insn).__name__) + raise NotImplementedError(type(insn)) # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 453559036..e60da6e2d 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1993,11 +1993,7 @@ class SliceToInameReplacer(IdentityMapper): return result def map_call(self, expr): - from pymbolic.primitives import CallWithKwargs - new_expr = self.rec(CallWithKwargs(expr.function, expr.parameters, {})) - return Call(new_expr.function, new_expr.parameters) - def map_call_with_kwargs(self, expr): def _convert_array_to_slices(arg): # FIXME: We do not support something like A[1] should point to the # second row if 'A' is 3 x 3 array. @@ -2036,12 +2032,9 @@ class SliceToInameReplacer(IdentityMapper): for _ in array_arg_shape)) return arg - from pymbolic.primitives import CallWithKwargs - return CallWithKwargs(expr.function, - tuple(self.rec(_convert_array_to_slices(par)) - for par in expr.parameters), - {kw: self.rec(_convert_array_to_slices(par)) - for kw, par in expr.kw_parameters.items()}) + return Call(expr.function, + tuple(self.rec(_convert_array_to_slices(par)) + for par in expr.parameters)) def get_iname_domain_as_isl_set(self): """ diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 95a2d3d55..cdd1679a2 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -158,12 +158,7 @@ class ExpressionIsScalarChecker(WalkMapper): f" or assignees. 
'{expr}'violates this.") def map_call(self, expr): - for child in expr.parameters: - self.rec(child) - - def map_call_with_kwargs(self, expr): - for child in expr.parameters + tuple(expr.kw_parameters.values()): - self.rec(child) + self.rec(expr.parameters) def map_subscript(self, expr): for child in expr.index_tuple: @@ -887,8 +882,6 @@ class CallableKernel(InKernelCallable): def generate_preambles(self, target): """ Yields the *target* specific preambles. """ - # FIXME Check that this is correct. - return yield @@ -898,25 +891,17 @@ class CallableKernel(InKernelCallable): raise NotImplementedError() from loopy.kernel.instruction import CallInstruction - from pymbolic.primitives import CallWithKwargs assert self.is_ready_for_codegen() assert isinstance(insn, CallInstruction) ecm = expression_to_code_mapper parameters = insn.expression.parameters - kw_parameters = {} - if isinstance(insn.expression, CallWithKwargs): - kw_parameters = insn.expression.kw_parameters - assignees = insn.assignees parameters = list(parameters) par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) - for i in range(len(parameters), len(parameters)+len(kw_parameters)): - parameters.append(kw_parameters[pos_to_kw[i]]) - par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]]) # insert the assignees at the required positions assignee_write_count = -1 diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 7d4b9a50c..42cf4e017 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1052,9 +1052,9 @@ class CallInstruction(MultiAssignmentBase): predicates=predicates, tags=tags) - from pymbolic.primitives import Call, CallWithKwargs + from pymbolic.primitives import Call from loopy.symbolic import Reduction - if not isinstance(expression, (Call, CallWithKwargs, Reduction)) and ( + if not isinstance(expression, (Call, Reduction)) and ( expression is not None): raise LoopyError("'expression' argument to CallInstruction " "must be a function call") @@ -1145,15 +1145,10 @@ class CallInstruction(MultiAssignmentBase): def arg_id_to_val(self): """:returns: a :class:`dict` mapping argument identifiers (non-negative numbers - for positional arguments, strings for keyword args, and negative numbers + for positional arguments and negative numbers for assignees) to their respective values """ - - from pymbolic.primitives import CallWithKwargs arg_id_to_val = dict(enumerate(self.expression.parameters)) - if isinstance(self.expression, CallWithKwargs): - for kw, val in self.expression.kw_parameters.items(): - arg_id_to_val[kw] = val for i, arg in enumerate(self.assignees): arg_id_to_val[-i-1] = arg @@ -1186,10 +1181,10 @@ def is_array_call(assignees, expression): the arguemnts or assignees to the function is an array, :meth:`is_array_call` will return *True*. 
""" - from pymbolic.primitives import Call, CallWithKwargs, Subscript + from pymbolic.primitives import Call, Subscript from loopy.symbolic import SubArrayRef - if not isinstance(expression, (Call, CallWithKwargs)): + if not isinstance(expression, Call): return False for par in expression.parameters+assignees: @@ -1236,9 +1231,9 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs): raise LoopyError("atomic operations with more than one " "left-hand side not supported") - from pymbolic.primitives import Call, CallWithKwargs + from pymbolic.primitives import Call from loopy.symbolic import Reduction - if not isinstance(expression, (Call, CallWithKwargs, Reduction)): + if not isinstance(expression, (Call, Reduction)): raise LoopyError("right-hand side in multiple assignment must be " "function call or reduction, got: '%s'" % expression) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 53ddcefe1..c0428dab8 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2157,33 +2157,30 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): self.clbl_inf_ctx = clbl_inf_ctx def map_call(self, expr, expn_state, assignees=None): - from pymbolic.primitives import Call, CallWithKwargs, Variable + from pymbolic.primitives import Call, Variable from loopy.kernel.function_interface import ValueArgDescriptor from loopy.symbolic import ResolvedFunction from loopy.kernel.array import ArrayBase from loopy.kernel.data import ValueArg from pymbolic.mapper.substitutor import make_subst_func from loopy.symbolic import SubstitutionMapper + from loopy.kernel.function_interface import get_arg_descriptor_for_expression if not isinstance(expr.function, ResolvedFunction): # ignore if the call is not to a ResolvedFunction return super().map_call(expr, expn_state) arg_id_to_val = dict(enumerate(expr.parameters)) - if isinstance(expr, CallWithKwargs): - arg_id_to_val.update(expr.kw_parameters) if assignees is not None: # If supplied with assignees then this is a CallInstruction for i, arg in enumerate(assignees): arg_id_to_val[-i-1] = arg - from loopy.kernel.function_interface import get_arg_descriptor_for_expression arg_id_to_descr = { - arg_id: get_arg_descriptor_for_expression( - self.caller_kernel, arg) - for arg_id, arg in arg_id_to_val.items()} - in_knl_callable = self.clbl_inf_ctx[expr.function.name] + arg_id: get_arg_descriptor_for_expression(self.caller_kernel, arg) + for arg_id, arg in arg_id_to_val.items()} + clbl = self.clbl_inf_ctx[expr.function.name] # {{{ translating descriptor expressions to the callable's namespace @@ -2196,16 +2193,17 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): assert deps <= self.caller_kernel.all_variable_names() for dep in deps: - caller_arg = self.caller_kernel.arg_dict.get(dep, None) - caller_arg = self.caller_kernel.temporary_variables.get(dep, caller_arg) - - if not (isinstance(caller_arg, ValueArg) or (isinstance(caller_arg, - ArrayBase) and caller_arg.shape == ())): + caller_arg = self.caller_kernel.arg_dict.get(dep, (self.caller_kernel + .temporary_variables + .get(dep))) + if not (isinstance(caller_arg, ValueArg) + or (isinstance(caller_arg, ArrayBase) + and caller_arg.shape == ())): raise NotImplementedError(f"Obtained '{dep}' as a dependency for" f" call '{expr.function.name}' which is not a scalar.") - in_knl_callable, callee_name = in_knl_callable.with_added_arg( - caller_arg.dtype, ValueArgDescriptor()) + clbl, callee_name = clbl.with_added_arg(caller_arg.dtype, + ValueArgDescriptor()) subst_map[dep] = 
Variable(callee_name) deps_as_params.append(Variable(dep)) @@ -2217,36 +2215,17 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): # }}} # specializing the function according to the parameter description - new_in_knl_callable, self.clbl_inf_ctx = ( - in_knl_callable.with_descrs( - arg_id_to_descr, self.clbl_inf_ctx)) - - # find the deps of the new in kernel callablen and add those arguments to + new_clbl, self.clbl_inf_ctx = clbl.with_descrs(arg_id_to_descr, + self.clbl_inf_ctx) - self.clbl_inf_ctx, new_func_id = ( - self.clbl_inf_ctx.with_callable( - expr.function.function, - new_in_knl_callable)) + self.clbl_inf_ctx, new_func_id = (self.clbl_inf_ctx + .with_callable(expr.function.function, + new_clbl)) - if isinstance(expr, Call): - return Call( - ResolvedFunction(new_func_id), + return Call(ResolvedFunction(new_func_id), tuple(self.rec(child, expn_state) for child in expr.parameters) + tuple(deps_as_params)) - else: - # FIXME: Order for vars when kwargs are present? - assert isinstance(expr, CallWithKwargs) - return CallWithKwargs( - ResolvedFunction(new_func_id), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - { - key: self.rec(val, expn_state) - for key, val in expr.kw_parameters.items()} - ) - - map_call_with_kwargs = map_call def __call__(self, expr, kernel, insn, assignees=None): from loopy.kernel.data import InstructionBase diff --git a/loopy/statistics.py b/loopy/statistics.py index 43bce10ca..5ddc84949 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -896,10 +896,6 @@ class CounterBase(CombineMapper): else: raise NotImplementedError() - def map_call_with_kwargs(self, expr): - # FIXME - raise NotImplementedError() - def map_sum(self, expr): if expr.children: return sum(self.rec(child) for child in expr.children) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 0c6a8d50f..fddd4479d 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -374,15 +374,7 @@ class DependencyMapper(DependencyMapperBase): def map_call(self, expr, *args, **kwargs): # Loopy does not have first-class functions. Do not descend # into 'function' attribute of Call. - return self.combine( - self.rec(child, *args, **kwargs) for child in expr.parameters) - - def map_call_with_kwargs(self, expr, *args): - # Loopy does not have first-class functions. Do not descend - # into 'function' attribute of Call. 
- return self.combine( - self.rec(child, *args) for child in expr.parameters+tuple( - expr.kw_parameters.values())) + return self.rec(expr.parameters, *args, **kwargs) def map_reduction(self, expr, *args, **kwargs): deps = self.rec(expr.expr, *args, **kwargs) @@ -1530,14 +1522,6 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: return IdentityMapper.map_call(self, expr) - def map_call_with_kwargs(self, expr): - for par in expr.kw_parameters.values(): - if not isinstance(par, SubArrayRef): - raise LoopyError("Keyword Arguments is only supported for" - " array arguments--use positional order to specify" - " the order of the arguments in the call.") - return IdentityMapper.map_call_with_kwargs(self, expr) - # {{{ customization to pymbolic parser diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index a216530ea..45063458e 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -307,18 +307,8 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): assignees = call_insn.assignees # writes parameters = call_insn.expression.parameters # reads - # add keyword parameters - from pymbolic.primitives import CallWithKwargs - from loopy.kernel.function_interface import get_kw_pos_association kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) - if isinstance(call_insn.expression, CallWithKwargs): - kw_parameters = call_insn.expression.kw_parameters - else: - kw_parameters = {} - - for kw, par in kw_parameters.items(): - arg_map[kw] = par for i, par in enumerate(parameters): arg_map[pos_to_kw[i]] = par diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 3b158e1cf..c860de537 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -38,7 +38,7 @@ from loopy.library.reduction import ReductionOpFunction from loopy.kernel import LoopKernel from loopy.tools import update_persistent_hash -from pymbolic.primitives import Call, CallWithKwargs +from pymbolic.primitives import Call from functools import reduce from pyrsistent import pmap, PMap @@ -93,6 +93,11 @@ class CallableResolver(RuleAwareIdentityMapper): .. attribute:: rule_mapping_context An instance of :class:`loopy.symbolic.RuleMappingContext`. + + .. attribute:: calls_resolved + + An :class:`set` of calls that were resolved. Updated during an + expression traversal. 
""" def __init__(self, rule_mapping_context, known_callables): assert isinstance(known_callables, frozenset) @@ -130,22 +135,6 @@ class CallableResolver(RuleAwareIdentityMapper): return super().map_call(expr, expn_state) - def map_call_with_kwargs(self, expr, expn_state): - from loopy.symbolic import parse_tagged_name - name, tag = parse_tagged_name(expr.function) - - if name in self.known_callables: - params = tuple(self.rec(par, expn_state) for par in expr.parameters) - kw_params = {kw: self.rec(par, expn_state) - for kw, par in expr.kw_parameters.items()} - - # record that we resolved a call - self.calls_resolved.add(name) - - return CallWithKwargs(ResolvedFunction(expr.function), params, kw_params) - - return super().map_call_with_kwargs(expr, expn_state) - # {{{ program diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 36d5408a0..bc45144d0 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -99,21 +99,6 @@ class FunctionNameChanger(RuleAwareIdentityMapper): else: return self.map_substitution(name, tag, expr.parameters, expn_state) - def map_call_with_kwargs(self, expr, expn_state): - - if expr in self.calls_to_new_names: - return type(expr)( - ResolvedFunction(self.calls_to_new_names[expr]), - tuple(self.rec(child, expn_state) - for child in expr.parameters), - { - key: self.rec(val, expn_state) - for key, val in expr.kw_parameters.items()} - ) - else: - return super().map_call_with_kwargs( - expr, expn_state) - def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): """ @@ -407,13 +392,7 @@ class TypeInferenceMapper(CombineMapper): def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable, CallWithKwargs, Call - - if isinstance(expr, CallWithKwargs): - kw_parameters = expr.kw_parameters - else: - assert isinstance(expr, Call) - kw_parameters = {} + from pymbolic.primitives import Variable identifier = expr.function @@ -431,8 +410,8 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_id_to_dtype = {i: none_if_empty(self.rec(par)) for (i, par) in - tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())} + arg_id_to_dtype = {i: none_if_empty(self.rec(par)) + for (i, par) in enumerate(expr.parameters)} # specializing the known function wrt type in_knl_callable = self.clbl_inf_ctx[expr.function.name] @@ -450,11 +429,7 @@ class TypeInferenceMapper(CombineMapper): expr.function.function, in_knl_callable)) - if isinstance(expr, Call): - self.old_calls_to_new_calls[expr] = new_function_id - else: - assert isinstance(expr, CallWithKwargs) - self.old_calls_to_new_calls[expr] = new_function_id + self.old_calls_to_new_calls[expr] = new_function_id new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype @@ -470,8 +445,6 @@ class TypeInferenceMapper(CombineMapper): return [] - map_call_with_kwargs = map_call - def map_variable(self, expr): if expr.name in self.kernel.all_inames(): return [self.kernel.index_dtype] @@ -682,8 +655,6 @@ class TypeReader(TypeInferenceMapper): raise RuntimeError("unexpected type inference " "object type for '%s'" % expr.name) - map_call_with_kwargs = map_call - # }}} diff --git a/test/test_callables.py b/test/test_callables.py index 95d039024..82efb25ee 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -142,57 +142,6 @@ def test_slices_with_negative_step(ctx_factory, inline): np.linalg.norm(2*x+3*y))) < 1e-15 -@pytest.mark.parametrize("inline", [False, True]) -def test_register_knl_with_call_with_kwargs(ctx_factory, inline): - ctx = 
ctx_factory() - queue = cl.CommandQueue(ctx) - - n = 4 - - a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) - c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) - - callee_knl = lp.make_function( - "{[i, j]:0<=i, j < %d}" % n, - """ - h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] - <>f1[i, j] = 2*f[i, j] - p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] - """, - [ - lp.GlobalArg("f, e, h, g"), ...], - name="linear_combo") - - caller_knl = lp.make_kernel( - "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, - """ - <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] - [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( - f=[j, l]: a[i, j, k, l, m], - g=[j, l]: d[i, j, k, l, m], - e=[j, l]: c[i, j, k, l, m]) - """) - - knl = lp.merge([caller_knl, callee_knl]) - if inline: - knl = lp.inline_callable_kernel(knl, "linear_combo") - - evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) - - a = a_dev.get() - b = b_dev.get() - c = c_dev.get() - - h = out1.get() # h = 2c + 3a + 8b - p = out2.get() # p = 7c + 8a + 4b - h_exact = 3*a + 8*b + 2*c - p_exact = 8*a + 4*b + 7*c - - assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 - assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 - - @pytest.mark.parametrize("inline", [False, True]) def test_register_knl_with_hw_axes(ctx_factory, inline): ctx = ctx_factory() -- GitLab From b4dfd53d6b8d7ab742fd13d364532804492ed3b6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <15399010+kaushikcfd@users.noreply.github.com> Date: Fri, 23 Apr 2021 17:36:32 -0500 Subject: [PATCH 837/916] [docs] fix grammar MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Andreas Klöckner --- loopy/check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index 4aea9fae3..42aed29bd 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -96,7 +96,7 @@ class UnresolvedCallCollector(CombineMapper): Collects all the unresolved calls within a kernel. :returns: - An :class:`frozenset` of function names that are not resolved. + A :class:`frozenset` of function names that are not resolved. """ def combine(self, values): -- GitLab From 882a3a11cb71178d814c137ff0d05a61c3b2a4c2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <15399010+kaushikcfd@users.noreply.github.com> Date: Fri, 23 Apr 2021 17:43:28 -0500 Subject: [PATCH 838/916] [docs] fix grammar MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Andreas Klöckner --- loopy/translation_unit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index c860de537..d00dbbf31 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -96,7 +96,7 @@ class CallableResolver(RuleAwareIdentityMapper): .. attribute:: calls_resolved - An :class:`set` of calls that were resolved. Updated during an + A :class:`set` of calls that were resolved. Updated during an expression traversal. 
""" def __init__(self, rule_mapping_context, known_callables): -- GitLab From ea432d9cebbe6cbe355674539313ff51a4d13c6d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 23 Apr 2021 17:48:39 -0500 Subject: [PATCH 839/916] replace dropping with raising NotImplementedErrors --- loopy/check.py | 4 ++++ loopy/kernel/creation.py | 4 ++++ loopy/kernel/function_interface.py | 4 ++++ loopy/preprocess.py | 4 ++++ loopy/statistics.py | 4 ++++ loopy/symbolic.py | 4 ++++ loopy/translation_unit.py | 4 ++++ loopy/type_inference.py | 12 ++++++++++++ 8 files changed, 40 insertions(+) diff --git a/loopy/check.py b/loopy/check.py index 42aed29bd..36ffc8df0 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -109,6 +109,10 @@ class UnresolvedCallCollector(CombineMapper): else: return self.rec(expr.parameters) + def map_call_with_kwargs(self, expr): + # See: https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + def map_constant(self, expr): return frozenset() diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index e60da6e2d..58e4ffb0e 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2036,6 +2036,10 @@ class SliceToInameReplacer(IdentityMapper): tuple(self.rec(_convert_array_to_slices(par)) for par in expr.parameters)) + def map_call_with_kwargs(self, expr): + # See: https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + def get_iname_domain_as_isl_set(self): """ Returns the extra domain constraints imposed by the slice inames, diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index cdd1679a2..6eed98b88 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -181,6 +181,10 @@ class ExpressionIsScalarChecker(WalkMapper): def map_slice(self, expr): raise LoopyError("Array regions can only passed as sub-array refs.") + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + def get_arg_descriptor_for_expression(kernel, expr): """ diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c0428dab8..f8fe8eef8 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2227,6 +2227,10 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): for child in expr.parameters) + tuple(deps_as_params)) + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + def __call__(self, expr, kernel, insn, assignees=None): from loopy.kernel.data import InstructionBase from loopy.symbolic import IdentityMapper, ExpansionState diff --git a/loopy/statistics.py b/loopy/statistics.py index 5ddc84949..f1448931e 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -896,6 +896,10 @@ class CounterBase(CombineMapper): else: raise NotImplementedError() + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + def map_sum(self, expr): if expr.children: return sum(self.rec(child) for child in expr.children) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index fddd4479d..d4437c1ec 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -401,6 +401,10 @@ class DependencyMapper(DependencyMapperBase): def map_literal(self, expr): return set() + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + class SubstitutionRuleExpander(IdentityMapper): def __init__(self, rules): diff --git a/loopy/translation_unit.py 
b/loopy/translation_unit.py index d00dbbf31..7fab41850 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -135,6 +135,10 @@ class CallableResolver(RuleAwareIdentityMapper): return super().map_call(expr, expn_state) + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + # {{{ program diff --git a/loopy/type_inference.py b/loopy/type_inference.py index bc45144d0..cfc04d096 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -99,6 +99,10 @@ class FunctionNameChanger(RuleAwareIdentityMapper): else: return self.map_substitution(name, tag, expr.parameters, expn_state) + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): """ @@ -445,6 +449,10 @@ class TypeInferenceMapper(CombineMapper): return [] + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + def map_variable(self, expr): if expr.name in self.kernel.all_inames(): return [self.kernel.index_dtype] @@ -655,6 +663,10 @@ class TypeReader(TypeInferenceMapper): raise RuntimeError("unexpected type inference " "object type for '%s'" % expr.name) + def map_call_with_kwargs(self, expr): + # See https://github.com/inducer/loopy/pull/323 + raise NotImplementedError + # }}} -- GitLab From caf1778ddf89043de452fd4816db113f0b1b095f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 23 Apr 2021 17:59:33 -0500 Subject: [PATCH 840/916] Delete dead find_in_knl_callable_from_identifier, fix sectioning in translation_unit --- loopy/translation_unit.py | 47 ++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 7fab41850..6153a4fcf 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -54,24 +54,7 @@ __doc__ = """ """ -def find_in_knl_callable_from_identifier( - function_id_to_in_knl_callable_mappers, target, identifier): - """ - Returns an instance of - :class:`loopy.kernel.function_interface.InKernelCallable` if the - :arg:`identifier` is known to any kernel function scoper, otherwise returns - *None*. 
- """ - for func_id_to_in_knl_callable_mapper in ( - function_id_to_in_knl_callable_mappers): - # fixme: do we really need to given target for the function - in_knl_callable = func_id_to_in_knl_callable_mapper( - target, identifier) - if in_knl_callable is not None: - return in_knl_callable - - return None - +# {{{ CallableResolver def _is_a_reduction_op(expr): if isinstance(expr, ResolvedFunction): @@ -139,8 +122,10 @@ class CallableResolver(RuleAwareIdentityMapper): # See https://github.com/inducer/loopy/pull/323 raise NotImplementedError +# }}} + -# {{{ program +# {{{ translation unit class TranslationUnit(ImmutableRecord): """ @@ -388,6 +373,8 @@ class Program(TranslationUnit): # }}} +# {{{ next_indexed_function_id + def next_indexed_function_id(function_id): """ Returns an instance of :class:`str` with the next indexed-name in the @@ -417,6 +404,10 @@ def next_indexed_function_id(function_id): return "{alpha}_{num}".format(alpha=match.group("alpha"), num=int(match.group("num"))+1) +# }}} + + +# {{{ rename_resolved_functions_in_a_single_kernel class ResolvedFunctionRenamer(RuleAwareIdentityMapper): """ @@ -451,6 +442,10 @@ def rename_resolved_functions_in_a_single_kernel(kernel, rule_mapping_context.finish_kernel( resolved_function_renamer.map_kernel(kernel))) +# }}} + + +# {{{ CallablesIDCollector class CallablesIDCollector(CombineMapper): """ @@ -506,6 +501,10 @@ def _get_callable_ids(callables, entrypoints): _get_callable_ids_for_knl(callables[e].subkernel, callables) for e in entrypoints)) +# }}} + + +# {{{ CallablesInferenceContext def make_clbl_inf_ctx(callables, entrypoints): return CallablesInferenceContext(callables) @@ -546,8 +545,6 @@ class CallablesInferenceContext(ImmutableRecord): renames=renames, new_entrypoints=new_entrypoints) - # {{{ interface to perform edits on callables - def with_callable(self, old_function_id, new_clbl, is_entrypoint=False): """ @@ -707,12 +704,12 @@ class CallablesInferenceContext(ImmutableRecord): return program.copy(callables_table=new_callables) - # }}} - def __getitem__(self, name): result = self.callables[name] return result +# }}} + # {{{ helper functions @@ -790,6 +787,8 @@ def update_table(callables_table, clbl_id, clbl): # }}} +# {{{ resolve_callables + def resolve_callables(program): """ Returns a :class:`TranslationUnit` with known :class:`pymbolic.primitives.Call` @@ -841,5 +840,7 @@ def resolve_callables(program): return program.copy(callables_table=callables_table) +# }}} + # vim: foldmethod=marker -- GitLab From 47cf5636f3a927fdbec4efd9a488b61df168672a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 23 Apr 2021 17:07:03 -0500 Subject: [PATCH 841/916] Fix phrasing in ResolvedFunction docstring --- loopy/symbolic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index d4437c1ec..ef8960014 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -798,9 +798,9 @@ class RuleArgument(LoopyExpressionBase): class ResolvedFunction(LoopyExpressionBase): """ - A function invocation whose definition is known in a :mod:`loopy` program. + A function identifier whose definition is known in a :mod:`loopy` program. A function is said to be *known* in a :class:`~loopy.TranslationUnit` if its - identifier maps to an :class:`~loopy.kernel.function_interface.InKernelCallable` + name maps to an :class:`~loopy.kernel.function_interface.InKernelCallable` in :attr:`loopy.TranslationUnit.callables_table`. Refer to :ref:`func-interface`. .. 
attribute:: function -- GitLab From e652a7393fb3d31b07ef3601477f996ebe61d44c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 23 Apr 2021 18:35:20 -0500 Subject: [PATCH 842/916] motivate diverging entrypoint and non-entrypoint callable kernels --- loopy/codegen/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 008b95bab..db9247ae9 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -758,6 +758,11 @@ def generate_code_v2(program): program = program.copy(callables_table=new_callables) + # Why diverge? The code generated for a non-entrypoint kernel and an + # entrypoint kernel isn't the same for a general loopy target. For example, + # in OpenCL, a kernel callable from the host and one callable from the + # device have different function signatures. To generate correct code, each + # callable should be exclusively an entrypoint or a non-entrypoint kernel. program = diverge_callee_entrypoints(program) host_programs = {} -- GitLab From cad0ac98889dca5c86047ae4c5fbccbd3859fa12 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 25 Apr 2021 15:20:41 -0500 Subject: [PATCH 843/916] [callables] Fix benchmarks --- benchmarks/run_sumpy_kernels.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmarks/run_sumpy_kernels.py b/benchmarks/run_sumpy_kernels.py index d37a6bb8a..49b6bd8f9 100644 --- a/benchmarks/run_sumpy_kernels.py +++ b/benchmarks/run_sumpy_kernels.py @@ -31,7 +31,7 @@ def _sumpy_kernel_init(param): m_expn = mpole_expn_class(knl, order=order) l_expn = local_expn_class(knl, order=order) - m2l = E2EFromCSR(ctx, m_expn, l_expn) + m2l = E2EFromCSR(ctx, m_expn, l_expn, name="loopy_kernel") m2l.get_translation_loopy_insns() m2l.ctx = None m2l.device = None @@ -77,7 +77,8 @@ def cached_data(params): knl = _sumpy_kernel_make(expn, param) knl = lp.preprocess_kernel(knl) data[param]["instantiated"] = knl - scheduled = lp.get_one_scheduled_kernel(knl) + scheduled = knl.with_kernel(lp.get_one_scheduled_kernel(knl["loopy_kernel"], + knl.callables_table)) data[param]["scheduled"] = scheduled return data @@ -101,7 +102,9 @@ class SumpyBenchmarkSuite: lp.preprocess_kernel(knl) def time_schedule(self, data, param): - lp.get_one_scheduled_kernel(data[param]["instantiated"]) + knl = data["param"]["instantiated"] + knl.with_kernel(lp.get_one_scheduled_kernel(knl["loopy_kernel"], + knl.callables_table)) def time_generate_code(self, data, param): lp.generate_code_v2(data[param]["scheduled"]) -- GitLab From 3b2dfad975e99596f5e4ac6e18ae361e17a1205c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 25 Apr 2021 15:26:34 -0500 Subject: [PATCH 844/916] update show_commit_url --- asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv.conf.json b/asv.conf.json index 99c2ea2b5..3988c0faf 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -40,7 +40,7 @@ //"install_timeout": 600, // the base URL to show a commit for the project. - "show_commit_url": "http://gitlab.tiker.net/inducer/loopy/commits/", + "show_commit_url": "http://github.com/inducer/loopy/commit/", // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`.
-- GitLab From deba12b28e932b3fd3bc548fa248703f36ac72e3 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 25 Apr 2021 15:27:35 -0500 Subject: [PATCH 845/916] add benchmarks url to README --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 61cdfb02c..3f1e416b9 100644 --- a/README.rst +++ b/README.rst @@ -55,4 +55,5 @@ Places on the web related to Loopy: * `Documentation `_ (read how things work) * `Github `_ (get latest source code, file bugs) * `Homepage `_ +* `Benchmarks `_ -- GitLab From 0731f70aa9b43ecae5d8ad5a2180ce0e05148639 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 25 Apr 2021 16:06:57 -0500 Subject: [PATCH 846/916] [callables] Fix typo --- benchmarks/run_sumpy_kernels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/run_sumpy_kernels.py b/benchmarks/run_sumpy_kernels.py index 49b6bd8f9..c3580e7d7 100644 --- a/benchmarks/run_sumpy_kernels.py +++ b/benchmarks/run_sumpy_kernels.py @@ -102,7 +102,7 @@ class SumpyBenchmarkSuite: lp.preprocess_kernel(knl) def time_schedule(self, data, param): - knl = data["param"]["instantiated"] + knl = data[param]["instantiated"] knl.with_kernel(lp.get_one_scheduled_kernel(knl["loopy_kernel"], knl.callables_table)) -- GitLab From 4e6e112b67f74ab8387d9b5f16fa5152b4c84a93 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 25 Apr 2021 16:40:29 -0500 Subject: [PATCH 847/916] iterate_over_kernels_if_given_program -> for_each_kernel --- loopy/__init__.py | 10 +++++----- loopy/kernel/creation.py | 4 ++-- loopy/kernel/tools.py | 4 ++-- loopy/loop.py | 4 ++-- loopy/transform/add_barrier.py | 4 ++-- loopy/transform/arithmetic.py | 6 +++--- loopy/transform/batch.py | 4 ++-- loopy/transform/callable.py | 4 ++-- loopy/transform/data.py | 20 ++++++++++---------- loopy/transform/iname.py | 30 +++++++++++++++--------------- loopy/transform/instruction.py | 14 +++++++------- loopy/transform/padding.py | 8 ++++---- loopy/transform/parameter.py | 6 +++--- loopy/transform/subst.py | 6 +++--- loopy/translation_unit.py | 4 ++-- 15 files changed, 64 insertions(+), 64 deletions(-) diff --git a/loopy/__init__.py b/loopy/__init__.py index cb1ef2179..8820cbd99 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -24,7 +24,7 @@ THE SOFTWARE. 
from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning -from loopy.translation_unit import iterate_over_kernels_if_given_program +from loopy.translation_unit import for_each_kernel # {{{ imported user interface @@ -313,7 +313,7 @@ __all__ = [ # {{{ set_options -@iterate_over_kernels_if_given_program +@for_each_kernel def set_options(kernel, *args, **kwargs): """Return a new kernel with the options given as keyword arguments, or from a string representation passed in as the first (and only) positional @@ -353,7 +353,7 @@ def set_options(kernel, *args, **kwargs): # {{{ library registration -@iterate_over_kernels_if_given_program +@for_each_kernel def register_preamble_generators(kernel, preamble_generators): """ :arg manglers: list of functions of signature ``(preamble_info)`` @@ -378,7 +378,7 @@ def register_preamble_generators(kernel, preamble_generators): return kernel.copy(preamble_generators=new_pgens) -@iterate_over_kernels_if_given_program +@for_each_kernel def register_symbol_manglers(kernel, manglers): from loopy.tools import unpickles_equally @@ -396,7 +396,7 @@ def register_symbol_manglers(kernel, manglers): return kernel.copy(symbol_manglers=new_manglers) -@iterate_over_kernels_if_given_program +@for_each_kernel def register_function_manglers(kernel, manglers): """ :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)`` diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 58e4ffb0e..6718168bd 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -34,7 +34,7 @@ from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, SubstitutionRule, AddressSpace, ValueArg) -from loopy.translation_unit import iterate_over_kernels_if_given_program +from loopy.translation_unit import for_each_kernel from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -1820,7 +1820,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic -@iterate_over_kernels_if_given_program +@for_each_kernel def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 800909afc..19cb8acbd 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -34,7 +34,7 @@ from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg, natsorted from loopy.kernel import LoopKernel from loopy.translation_unit import (TranslationUnit, - iterate_over_kernels_if_given_program) + for_each_kernel) from loopy.kernel.function_interface import CallableKernel import logging logger = logging.getLogger(__name__) @@ -494,7 +494,7 @@ class DomainChanger: # {{{ graphviz / dot export -@iterate_over_kernels_if_given_program +@for_each_kernel def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True, use_insn_id=False): """Return a string in the `dot `_ language depicting diff --git a/loopy/loop.py b/loopy/loop.py index 0127c1262..af61b7db5 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -22,7 +22,7 @@ THE SOFTWARE. 
import islpy as isl -from loopy.translation_unit import iterate_over_kernels_if_given_program +from loopy.translation_unit import for_each_kernel def potential_loop_nest_map(kernel): @@ -54,7 +54,7 @@ def potential_loop_nest_map(kernel): return result -@iterate_over_kernels_if_given_program +@for_each_kernel def merge_loop_domains(kernel): # FIXME: This should be moved to loopy.transforms.iname from loopy.kernel.tools import is_domain_dependent_on_inames diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index e54695d95..d49cf574e 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -24,7 +24,7 @@ THE SOFTWARE. from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency -from loopy.translation_unit import iterate_over_kernels_if_given_program +from loopy.translation_unit import for_each_kernel from loopy.kernel import LoopKernel __doc__ = """ @@ -36,7 +36,7 @@ __doc__ = """ # {{{ add_barrier -@iterate_over_kernels_if_given_program +@for_each_kernel def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, tags=None, synchronization_kind="global", mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index 2896af68d..8203f0d52 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -23,13 +23,13 @@ THE SOFTWARE. from loopy.diagnostic import LoopyError -from loopy.translation_unit import iterate_over_kernels_if_given_program +from loopy.translation_unit import for_each_kernel from loopy.kernel import LoopKernel # {{{ fold constants -@iterate_over_kernels_if_given_program +@for_each_kernel def fold_constants(kernel): from loopy.symbolic import ConstantFoldingMapper cfm = ConstantFoldingMapper() @@ -53,7 +53,7 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented -@iterate_over_kernels_if_given_program +@for_each_kernel def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): assert isinstance(kernel, LoopKernel) # FIXME: Does not understand subst rules for now diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index d1c1672da..536a7a826 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -25,7 +25,7 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl -from loopy.translation_unit import iterate_over_kernels_if_given_program +from loopy.translation_unit import for_each_kernel __doc__ = """ @@ -101,7 +101,7 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) -@iterate_over_kernels_if_given_program +@for_each_kernel def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 45063458e..651f4457c 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -34,7 +34,7 @@ from loopy.symbolic import ( from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) from loopy.translation_unit import (TranslationUnit, - iterate_over_kernels_if_given_program) + for_each_kernel) __doc__ = """ .. 
currentmodule:: loopy @@ -459,7 +459,7 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): # {{{ inline callable kernel -@iterate_over_kernels_if_given_program +@for_each_kernel def _inline_single_callable_kernel(caller_kernel, callee_kernel): from loopy.symbolic import ResolvedFunction diff --git a/loopy/transform/data.py b/loopy/transform/data.py index baee68203..d866f8a5e 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -27,7 +27,7 @@ from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper from loopy.translation_unit import (TranslationUnit, - iterate_over_kernels_if_given_program) + for_each_kernel) from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel, ScalarCallable @@ -396,7 +396,7 @@ def add_prefetch(program, *args, **kwargs): # {{{ change variable kinds -@iterate_over_kernels_if_given_program +@for_each_kernel def change_arg_to_image(kernel, name): new_args = [] for arg in kernel.args: @@ -414,7 +414,7 @@ def change_arg_to_image(kernel, name): # {{{ tag array axes -@iterate_over_kernels_if_given_program +@for_each_kernel def tag_array_axes(kernel, ary_names, dim_tags): """ :arg dim_tags: a tuple of @@ -461,7 +461,7 @@ tag_data_axes = ( # {{{ set_array_axis_names -@iterate_over_kernels_if_given_program +@for_each_kernel def set_array_axis_names(kernel, ary_names, dim_names): """ .. versionchanged:: 2016.2 @@ -494,7 +494,7 @@ set_array_dim_names = (MovedFunctionDeprecationWrapper( # {{{ remove_unused_arguments -@iterate_over_kernels_if_given_program +@for_each_kernel def remove_unused_arguments(kernel): new_args = [] @@ -536,7 +536,7 @@ def remove_unused_arguments(kernel): # {{{ alias_temporaries -@iterate_over_kernels_if_given_program +@for_each_kernel def alias_temporaries(kernel, names, base_name_prefix=None, synchronize_for_exclusive_use=True): """Sets all temporaries given by *names* to be backed by a single piece of @@ -621,7 +621,7 @@ def alias_temporaries(kernel, names, base_name_prefix=None, # {{{ set argument order -@iterate_over_kernels_if_given_program +@for_each_kernel def set_argument_order(kernel, arg_names): """ :arg arg_names: A list (or comma-separated string) or argument @@ -657,7 +657,7 @@ def set_argument_order(kernel, arg_names): # {{{ rename argument -@iterate_over_kernels_if_given_program +@for_each_kernel def rename_argument(kernel, old_name, new_name, existing_ok=False): """ .. 
versionadded:: 2016.2 @@ -731,7 +731,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): # {{{ set temporary scope -@iterate_over_kernels_if_given_program +@for_each_kernel def set_temporary_scope(kernel, temp_var_names, scope): """ :arg temp_var_names: a container with membership checking, @@ -773,7 +773,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule -@iterate_over_kernels_if_given_program +@for_each_kernel def reduction_arg_to_subst_rule( kernel, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 27f337021..7f1273d3c 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -30,7 +30,7 @@ from loopy.symbolic import ( from loopy.diagnostic import LoopyError from loopy.translation_unit import (TranslationUnit, - iterate_over_kernels_if_given_program) + for_each_kernel) from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel @@ -79,7 +79,7 @@ __doc__ = """ # {{{ set loop priority -@iterate_over_kernels_if_given_program +@for_each_kernel def set_loop_priority(kernel, loop_priority): from warnings import warn warn("set_loop_priority is deprecated. Use prioritize_loops instead. " @@ -94,7 +94,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) -@iterate_over_kernels_if_given_program +@for_each_kernel def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. If the @@ -339,7 +339,7 @@ def _split_iname_backend(kernel, iname_to_split, # {{{ split iname -@iterate_over_kernels_if_given_program +@for_each_kernel def split_iname(kernel, split_iname, inner_length, *, outer_iname=None, inner_iname=None, @@ -384,7 +384,7 @@ def split_iname(kernel, split_iname, inner_length, # {{{ chunk iname -@iterate_over_kernels_if_given_program +@for_each_kernel def chunk_iname(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -519,7 +519,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): return super().map_reduction(expr, expn_state) -@iterate_over_kernels_if_given_program +@for_each_kernel def join_inames(kernel, inames, new_iname=None, tag=None, within=None): """In a sense, the inverse of :func:`split_iname`. 
Takes in inames, finds their bounds (all but the first have to be bounded), and combines @@ -676,7 +676,7 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -@iterate_over_kernels_if_given_program +@for_each_kernel def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): """Tag an iname @@ -848,7 +848,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) -@iterate_over_kernels_if_given_program +@for_each_kernel def duplicate_inames(kernel, inames, within, new_inames=None, suffix=None, tags={}): """ @@ -1104,7 +1104,7 @@ def has_schedulable_iname_nesting(kernel): # {{{ rename_inames -@iterate_over_kernels_if_given_program +@for_each_kernel def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by @@ -1355,7 +1355,7 @@ def _split_reduction(kernel, inames, direction, within=None): rsplit.map_kernel(kernel)) -@iterate_over_kernels_if_given_program +@for_each_kernel def split_reduction_inward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1375,7 +1375,7 @@ def split_reduction_inward(kernel, inames, within=None): return _split_reduction(kernel, inames, "in", within) -@iterate_over_kernels_if_given_program +@for_each_kernel def split_reduction_outward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1399,7 +1399,7 @@ def split_reduction_outward(kernel, inames, within=None): # {{{ affine map inames -@iterate_over_kernels_if_given_program +@for_each_kernel def affine_map_inames(kernel, old_inames, new_inames, equations): """Return a new *kernel* where the affine transform specified by *equations* has been applied to the inames. @@ -1731,7 +1731,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expr, expn_state) -@iterate_over_kernels_if_given_program +@for_each_kernel def make_reduction_inames_unique(kernel, inames=None, within=None): """ :arg inames: if not *None*, only apply to these inames @@ -1778,7 +1778,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn -@iterate_over_kernels_if_given_program +@for_each_kernel def add_inames_to_insn(kernel, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the @@ -1817,7 +1817,7 @@ def add_inames_to_insn(kernel, inames, insn_match): # }}} -@iterate_over_kernels_if_given_program +@for_each_kernel def add_inames_for_unused_hw_axes(kernel, within=None): """ Returns a kernel with inames added to each instruction diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 870348d71..287321e3e 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -24,7 +24,7 @@ from loopy.diagnostic import LoopyError from loopy.kernel import LoopKernel from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) from loopy.translation_unit import (TranslationUnit, - iterate_over_kernels_if_given_program) + for_each_kernel) # {{{ find_instructions @@ -78,7 +78,7 @@ def map_instructions(kernel, insn_match, f): # {{{ set_instruction_priority -@iterate_over_kernels_if_given_program +@for_each_kernel def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. 
@@ -96,7 +96,7 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency -@iterate_over_kernels_if_given_program +@for_each_kernel def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. @@ -146,7 +146,7 @@ def add_dependency(kernel, insn_match, depends_on): # {{{ remove_instructions -@iterate_over_kernels_if_given_program +@for_each_kernel def remove_instructions(kernel, insn_ids): """Return a new kernel with instructions in *insn_ids* removed. @@ -237,7 +237,7 @@ def replace_instruction_ids(kernel, replacements): # {{{ tag_instructions -@iterate_over_kernels_if_given_program +@for_each_kernel def tag_instructions(kernel, new_tag, within=None): from loopy.match import parse_match within = parse_match(within) @@ -260,7 +260,7 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync -@iterate_over_kernels_if_given_program +@for_each_kernel def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. @@ -360,7 +360,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, # {{{ uniquify_instruction_ids -@iterate_over_kernels_if_given_program +@for_each_kernel def uniquify_instruction_ids(kernel): """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique strings. diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 0cd953ea8..44b2bbf33 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -24,7 +24,7 @@ THE SOFTWARE. from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext -from loopy.translation_unit import (iterate_over_kernels_if_given_program, +from loopy.translation_unit import (for_each_kernel, TranslationUnit) from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel @@ -46,7 +46,7 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): # {{{ split_array_dim (deprecated since June 2016) -@iterate_over_kernels_if_given_program +@for_each_kernel def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, split_kwargs=None): @@ -374,7 +374,7 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -@iterate_over_kernels_if_given_program +@for_each_kernel def split_array_axis(kernel, array_names, axis_nr, count, order="C"): """ @@ -456,7 +456,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 # {{{ add_padding -@iterate_over_kernels_if_given_program +@for_each_kernel def add_padding(kernel, variable, axis, align_bytes): arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)} arg_idx = arg_to_idx[variable] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 0e9dbe09e..5ceaeb121 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -25,7 +25,7 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl -from loopy.translation_unit import iterate_over_kernels_if_given_program +from loopy.translation_unit import for_each_kernel from loopy.kernel import LoopKernel __doc__ = """ @@ -40,7 +40,7 @@ __doc__ = """ # {{{ assume -@iterate_over_kernels_if_given_program +@for_each_kernel def assume(kernel, assumptions): """Include an assumption about :ref:`domain-parameters` in the 
kernel, e.g. `n mod 4 = 0`. @@ -134,7 +134,7 @@ def _fix_parameter(kernel, name, value, remove_argument, within=None): )) -@iterate_over_kernels_if_given_program +@for_each_kernel def fix_parameters(kernel, **value_dict): """Fix the values of the arguments to specific constants. diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 2681d69ea..a58ede339 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -28,7 +28,7 @@ from loopy.transform.iname import remove_any_newly_unused_inames from pytools import ImmutableRecord from pymbolic import var -from loopy.translation_unit import (iterate_over_kernels_if_given_program, +from loopy.translation_unit import (for_each_kernel, TranslationUnit) from loopy.kernel.function_interface import CallableKernel, ScalarCallable @@ -289,7 +289,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) -@iterate_over_kernels_if_given_program +@for_each_kernel @remove_any_newly_unused_inames def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): @@ -473,7 +473,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst -@iterate_over_kernels_if_given_program +@for_each_kernel def expand_subst(kernel, within=None): """ Returns an instance of :class:`loopy.LoopKernel` with the substitutions diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 6153a4fcf..26409ec5e 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -49,7 +49,7 @@ __doc__ = """ .. autofunction:: make_program -.. autofunction:: iterate_over_kernels_if_given_program +.. autofunction:: for_each_kernel """ @@ -727,7 +727,7 @@ def make_program(kernel): return program -def iterate_over_kernels_if_given_program(transform_for_single_kernel): +def for_each_kernel(transform_for_single_kernel): """ Function wrapper for transformations of the type ``transform(kernel: LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the -- GitLab From e539c6bd7f6177f18af586caafe2893981690f2e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 25 Apr 2021 16:52:45 -0500 Subject: [PATCH 848/916] [docs] for_each_kernel: improve docs, minor cleanup --- loopy/translation_unit.py | 46 ++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 26409ec5e..e129df3c0 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -727,46 +727,42 @@ def make_program(kernel): return program -def for_each_kernel(transform_for_single_kernel): +def for_each_kernel(transform): """ Function wrapper for transformations of the type ``transform(kernel: - LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the - ``transform`` being implemented on all of the callable kernels in a - :class:`loopy.TranslationUnit`. + LoopKernel, *args, **kwargs) -> LoopKernel``. Returns a function that would + apply *transform* to all callable kernels in a :class:`loopy.TranslationUnit`. 
""" def _collective_transform(*args, **kwargs): - if "program" in kwargs: - program_or_kernel = kwargs.pop("program") + if "translation_unit" in kwargs: + t_unit_or_kernel = kwargs.pop("translation_unit") elif "kernel" in kwargs: - program_or_kernel = kwargs.pop("kernel") + t_unit_or_kernel = kwargs.pop("kernel") else: - program_or_kernel = args[0] + t_unit_or_kernel = args[0] args = args[1:] - if isinstance(program_or_kernel, TranslationUnit): - program = program_or_kernel + if isinstance(t_unit_or_kernel, TranslationUnit): + t_unit = t_unit_or_kernel new_callables = {} - for func_id, in_knl_callable in program.callables_table.items(): - if isinstance(in_knl_callable, CallableKernel): - new_subkernel = transform_for_single_kernel( - in_knl_callable.subkernel, *args, **kwargs) - in_knl_callable = in_knl_callable.copy( - subkernel=new_subkernel) - elif isinstance(in_knl_callable, ScalarCallable): + for func_id, clbl in t_unit.callables_table.items(): + if isinstance(clbl, CallableKernel): + new_subkernel = transform(clbl.subkernel, *args, **kwargs) + clbl = clbl.copy(subkernel=new_subkernel) + elif isinstance(clbl, ScalarCallable): pass else: - raise NotImplementedError("Unknown type of callable %s." % ( - type(in_knl_callable).__name__)) + raise NotImplementedError(f"{type(clbl)}") - new_callables[func_id] = in_knl_callable + new_callables[func_id] = clbl - return program.copy(callables_table=new_callables) + return t_unit.copy(callables_table=new_callables) else: - assert isinstance(program_or_kernel, LoopKernel) - kernel = program_or_kernel - return transform_for_single_kernel(kernel, *args, **kwargs) + assert isinstance(t_unit_or_kernel, LoopKernel) + kernel = t_unit_or_kernel + return transform(kernel, *args, **kwargs) - return wraps(transform_for_single_kernel)(_collective_transform) + return wraps(transform)(_collective_transform) def update_table(callables_table, clbl_id, clbl): -- GitLab From b26a40c6c3d5e12d2b529994c487b3b5964e3e62 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Apr 2021 20:56:10 -0500 Subject: [PATCH 849/916] implement validate_kernel_call_sites --- loopy/check.py | 118 ++++++++++++++++++++++++++++++++++++++ loopy/translation_unit.py | 7 ++- 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index 36ffc8df0..83af35c4e 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1207,6 +1207,124 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} +# {{{ validate_kernel_call_sites + +def _are_sub_array_refs_equal(sar1, sar2, caller): + if len(sar1.swept_inames) != len(sar2.swept_inames): + return False + + if sar1.subscript.aggregate.name != sar2.subscript.aggregate.name: + return False + + if len(sar1.subscript.index_tuple) != len(sar2.subscript.index_tuple): + return False + + from loopy.symbolic import SubstitutionMapper + from pymbolic.mapper.substitutor import make_subst_func + from pymbolic.mapper.distributor import distribute + subst_func = make_subst_func({iname1.name: iname2 + for iname1, iname2 in zip(sar1.swept_inames, + sar2.swept_inames) + }) + + for sweep1, sweep2 in zip(sar1.swept_inames, sar2.swept_inames): + sweep1_bounds = caller.get_iname_bounds(sweep1.name) + sweep2_bounds = caller.get_iname_bounds(sweep2.name) + if (sweep1_bounds.lower_bound_pw_aff != sweep2_bounds.lower_bound_pw_aff): + return False + + if (sweep1_bounds.upper_bound_pw_aff != sweep2_bounds.upper_bound_pw_aff): + return False + + # subst_mapper: maps swept inames from sar1 to sar2 + subst_mapper = 
SubstitutionMapper(subst_func) + + for idx1, idx2 in zip(sar1.subscript.index_tuple, + sar2.subscript.index_tuple): + if distribute(subst_mapper(idx1) - idx2) != 0: + return False + return True + + +def _validate_kernel_call_insn(caller, call_insn, callee): + assert call_insn.expression.function.name == callee.name + from loopy.symbolic import SubArrayRef + from loopy.kernel.array import ArrayBase + + arg_id_to_val = call_insn.arg_id_to_val() + + ipar = 0 + iassignee = -1 + + for arg in callee.args: + if arg.is_input: + if ipar not in arg_id_to_val: + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" + f" a {ipar+1}-th positional argument corresponding" + f" to '{arg.name}'in the callee.") + in_val = arg_id_to_val[ipar] + ipar += 1 + if isinstance(arg, ArrayBase): + if not isinstance(in_val, SubArrayRef): + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}'" + f" expects a sub-array-ref for '{arg.name}'" + f" (got {in_val}).") + else: + if isinstance(in_val, SubArrayRef): + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}'" + f" expects a value argument for '{arg.name}'" + f" (got {in_val}).") + if arg.is_output: + if iassignee not in arg_id_to_val: + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" + f" a {-iassignee}-th positional assignee" + f" corresponding to '{arg.name}'in the callee.") + + out_val = arg_id_to_val[iassignee] + iassignee -= 1 + assert isinstance(arg, ArrayBase) + if not isinstance(out_val, SubArrayRef): + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}'" + f" expects a sub-array-ref for '{arg.name}'" + f" (got {out_val}).") + + if arg.is_input and arg.is_output: + if not _are_sub_array_refs_equal(in_val, out_val, caller): + raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" + f" equivalent sub-array-refs for '{arg.name}'" + f" (got {in_val}, {out_val}).") + + +def _validate_kernel_call_sites_inner(kernel, callables): + from pymbolic.primitives import Call + from loopy.kernel.function_interface import CallableKernel + + for insn in kernel.instructions: + if (isinstance(insn, CallInstruction) + and isinstance(insn.expression, Call) + and isinstance(insn.expression.function, ResolvedFunction)): + clbl = callables[insn.expression.function.name] + if isinstance(clbl, CallableKernel): + _validate_kernel_call_insn(kernel, insn, clbl.subkernel) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError(type(insn)) + + +def validate_kernel_call_sites(translation_unit): + from loopy import LoopKernel + + for name in translation_unit.callables_table: + clbl = translation_unit[name] + if isinstance(clbl, LoopKernel): + _validate_kernel_call_sites_inner(clbl, translation_unit.callables_table) + + +# }}} + + def pre_codegen_checks(kernel, callables_table): try: logger.debug("pre-codegen check %s: start" % kernel.name) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index e129df3c0..269fd53f9 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -791,6 +791,7 @@ def resolve_callables(program): expression nodes converted to :class:`loopy.symbolic.ResolvedFunction`. 
""" from loopy.library.function import get_loopy_callables + from loopy.check import validate_kernel_call_sites from loopy.kernel import KernelState if program.state >= KernelState.CALLS_RESOLVED: @@ -834,7 +835,11 @@ def resolve_callables(program): else: raise NotImplementedError(f"{type(clbl)}") - return program.copy(callables_table=callables_table) + program = program.copy(callables_table=callables_table) + + validate_kernel_call_sites(program) + + return program # }}} -- GitLab From 63f7e9dd77932a43d499d0d518c1d3aa6d20449b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Apr 2021 22:49:11 -0500 Subject: [PATCH 850/916] call: comply with loopy's kernel call requirement --- loopy/frontend/fortran/translator.py | 36 ++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 22e532c6e..d54ded4c4 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -231,6 +231,14 @@ class Scope: return expr + def written_vars(self): + return frozenset().union(*(insn.write_dependency_names() + for insn in self.instructions)) + + def read_vars(self): + return frozenset().union(*(insn.read_dependency_names() + for insn in self.instructions)) + # }}} @@ -493,16 +501,40 @@ class F2LoopyTranslator(FTreeWalkerBase): raise NotImplementedError("goto") def map_Call(self, node): + from loopy.kernel.instruction import _get_assignee_var_name scope = self.scope_stack[-1] new_id = self.get_insn_id() + # {{{ comply with loopy's kernel call requirements + + callee, = (knl for knl in self.kernels + if knl.subprogram_name == node.designator) + call_params = [scope.process_expression_for_loopy(self.parse_expr(node, + item)) + for item in node.items] + callee_read_vars = callee.read_vars() + callee_written_vars = callee.written_vars() + + lpy_params = [] + lpy_assignees = [] + for param in call_params: + name = _get_assignee_var_name(param) + if name in callee_read_vars: + lpy_params.append(param) + if name in callee_written_vars: + lpy_assignees.append(param) + if name not in (callee_read_vars | callee_written_vars): + lpy_params.append(param) + + # }}} + from pymbolic import var from loopy.kernel.data import CallInstruction insn = CallInstruction( - (), var(node.designator)(*(scope.process_expression_for_loopy( - self.parse_expr(node, item)) for item in node.items)), + tuple(lpy_assignees), + var(node.designator)(*lpy_params), within_inames=frozenset( scope.active_loopy_inames), id=new_id, -- GitLab From 46eb3e31e8c83285d9d7edf1b05008eaedfb08fd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Apr 2021 22:56:40 -0500 Subject: [PATCH 851/916] make read_vars accurate --- loopy/frontend/fortran/translator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index d54ded4c4..7109f1ed2 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -236,8 +236,10 @@ class Scope: for insn in self.instructions)) def read_vars(self): - return frozenset().union(*(insn.read_dependency_names() + return (frozenset().union(*(insn.read_dependency_names() for insn in self.instructions)) + | frozenset().union(*(frozenset(bset.get_var_names(dim_type.param)) + for bset in self.index_sets))) # }}} -- GitLab From c119c744e534b89ec69f0296217b3db6104346af Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 25 Apr 2021 19:06:17 -0500 
Subject: [PATCH 852/916] motivate the existence of _are_sub_array_refs_equivalent --- loopy/check.py | 15 +++++++++++++-- loopy/symbolic.py | 9 ++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 83af35c4e..4730e6995 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1209,7 +1209,18 @@ def check_that_shapes_and_strides_are_arguments(kernel): # {{{ validate_kernel_call_sites -def _are_sub_array_refs_equal(sar1, sar2, caller): +def _are_sub_array_refs_equivalent(sar1, sar2, caller): + """ + Returns *True* iff *sar1* and *sar2* are equivalent + :class:`loopy.SubArrayRef`s. + + Two sub-array-refs are said to be equivalent iff they point to the same + array sub-regions. This equivalence check is less stricter than + :meth:`~loopy.SubArrayRef.is_equal`. + + :arg caller: An instance of :class:`loopy.LoopKernel` in which they are + referenced. + """ if len(sar1.swept_inames) != len(sar2.swept_inames): return False @@ -1289,7 +1300,7 @@ def _validate_kernel_call_insn(caller, call_insn, callee): f" (got {out_val}).") if arg.is_input and arg.is_output: - if not _are_sub_array_refs_equal(in_val, out_val, caller): + if not _are_sub_array_refs_equivalent(in_val, out_val, caller): raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" f" equivalent sub-array-refs for '{arg.name}'" f" (got {in_val}, {out_val}).") diff --git a/loopy/symbolic.py b/loopy/symbolic.py index ef8960014..8883045eb 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -91,6 +91,8 @@ __doc__ = """ .. autoclass:: RuleAwareIdentityMapper .. autoclass:: ResolvedFunction + +.. autoclass:: SubArrayRef """ @@ -910,12 +912,14 @@ class SubArrayRef(LoopyExpressionBase): .. attribute:: swept_inames An instance of :class:`tuple` denoting the axes to which the sub array - is supposed to be mapper to. + is supposed to be mapped to. .. attribute:: subscript An instance of :class:`pymbolic.primitives.Subscript` denoting the array in the kernel. + + .. automethod:: is_equal """ init_arg_names = ("swept_inames", "subscript") @@ -946,6 +950,9 @@ class SubArrayRef(LoopyExpressionBase): return hash((self.__class__, self.swept_inames, self.subscript)) def is_equal(self, other): + """ + Returns *True* iff the sub-array refs have identical expressions. + """ return (other.__class__ == self.__class__ and other.subscript == self.subscript and other.swept_inames == self.swept_inames) -- GitLab From 749e5683aa327187ffd0146a9547b9f9e82fd0f9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 25 Apr 2021 18:46:10 -0500 Subject: [PATCH 853/916] [callables] remove memoize_method from generate_code_v2 --- loopy/codegen/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index db9247ae9..7bda7f57a 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -38,7 +38,7 @@ from functools import reduce from loopy.kernel.function_interface import CallableKernel, ScalarCallable -from pytools import ProcessLogger, memoize_method +from pytools import ProcessLogger __doc__ = """ .. currentmodule:: loopy.codegen @@ -698,7 +698,6 @@ class TranslationUnitCodeGenerationResult(ImmutableRecord): self.host_programs.values())) -@memoize_method def generate_code_v2(program): """ Returns an instance of :class:`CodeGenerationResult`. 
-- GitLab From 1b80ed1200390c9dff1ff64050f66c1ffd71b17d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Apr 2021 11:16:56 -0500 Subject: [PATCH 854/916] corrects sub_array_refs access_maps --- loopy/symbolic.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8883045eb..29e22ab31 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -2434,8 +2434,31 @@ class BatchedAccessMapMapper(WalkMapper): return self.rec(expr.child, inames) def map_sub_array_ref(self, expr, inames): + arg_name = expr.subscript.aggregate.name + if arg_name not in self._var_names: + return + + if arg_name in self.bad_subscripts: + return + total_inames = inames | {iname.name for iname in expr.swept_inames} - return self.rec(expr.subscript, total_inames) + assert total_inames not in self.access_maps[arg_name] + + self.rec(expr.subscript, total_inames) + + # {{{ project out swept_inames as within inames they are swept locally + + amap = self.access_maps[arg_name].pop(total_inames) + for iname in expr.swept_inames: + dt, pos = amap.get_var_dict()[iname.name] + amap = amap.project_out(dt, pos, 1) + + # }}} + + if self.access_maps[arg_name][inames] is None: + self.access_maps[arg_name][inames] = amap + else: + self.access_maps[arg_name][inames] |= amap class AccessRangeMapper: -- GitLab From 521a2c0ee61ba9c4f92ef80029f6e284d08cb490 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Apr 2021 11:17:15 -0500 Subject: [PATCH 855/916] use access_range instead of iname bounds --- loopy/check.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 4730e6995..1a1cd8636 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1209,13 +1209,20 @@ def check_that_shapes_and_strides_are_arguments(kernel): # {{{ validate_kernel_call_sites +def _get_sub_array_ref_swept_range(kernel, sar): + from loopy.symbolic import get_access_map + domain = kernel.get_inames_domain({iname_var.name + for iname_var in sar.swept_inames}) + return get_access_map(domain, sar.swept_inames, kernel.assumptions).range() + + def _are_sub_array_refs_equivalent(sar1, sar2, caller): """ Returns *True* iff *sar1* and *sar2* are equivalent :class:`loopy.SubArrayRef`s. Two sub-array-refs are said to be equivalent iff they point to the same - array sub-regions. This equivalence check is less stricter than + array sub-regions. This equivalence check is less strict than :meth:`~loopy.SubArrayRef.is_equal`. 
:arg caller: An instance of :class:`loopy.LoopKernel` in which they are @@ -1230,6 +1237,10 @@ def _are_sub_array_refs_equivalent(sar1, sar2, caller): if len(sar1.subscript.index_tuple) != len(sar2.subscript.index_tuple): return False + if (_get_sub_array_ref_swept_range(caller, sar1) + != _get_sub_array_ref_swept_range(caller, sar2)): + return False + from loopy.symbolic import SubstitutionMapper from pymbolic.mapper.substitutor import make_subst_func from pymbolic.mapper.distributor import distribute @@ -1238,15 +1249,6 @@ def _are_sub_array_refs_equivalent(sar1, sar2, caller): sar2.swept_inames) }) - for sweep1, sweep2 in zip(sar1.swept_inames, sar2.swept_inames): - sweep1_bounds = caller.get_iname_bounds(sweep1.name) - sweep2_bounds = caller.get_iname_bounds(sweep2.name) - if (sweep1_bounds.lower_bound_pw_aff != sweep2_bounds.lower_bound_pw_aff): - return False - - if (sweep1_bounds.upper_bound_pw_aff != sweep2_bounds.upper_bound_pw_aff): - return False - # subst_mapper: maps swept inames from sar1 to sar2 subst_mapper = SubstitutionMapper(subst_func) -- GitLab From 993ccadac2853e50487801de7f12bd5f119a1a5d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Apr 2021 11:23:55 -0500 Subject: [PATCH 856/916] use frozenset for get_inames_domain --- loopy/check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 1a1cd8636..dea59fc0a 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1211,8 +1211,8 @@ def check_that_shapes_and_strides_are_arguments(kernel): def _get_sub_array_ref_swept_range(kernel, sar): from loopy.symbolic import get_access_map - domain = kernel.get_inames_domain({iname_var.name - for iname_var in sar.swept_inames}) + domain = kernel.get_inames_domain(frozenset({iname_var.name + for iname_var in sar.swept_inames})) return get_access_map(domain, sar.swept_inames, kernel.assumptions).range() -- GitLab From de6f54166f49b9bda8b516e3faf403ed6b03f7f6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Apr 2021 14:04:57 -0500 Subject: [PATCH 857/916] clarify the call instructions for a callable kernel call site --- doc/ref_call.rst | 12 ++++++++++++ loopy/kernel/data.py | 2 ++ 2 files changed, 14 insertions(+) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 60170a5f4..4208624ce 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -41,6 +41,18 @@ An example demonstrating registering a CBlasGemv as a loopy callable: .. literalinclude:: ../examples/python/call-external.py + +Call Instruction for a kernel call +---------------------------------- + +At a call-site involving a call to a :class:`loopy.LoopKernel`, the arguments to +the call must be ordered by the order of input arguments of the callee kernel. +Similarly, the assignees must be ordered by the order of callee kernel's output +arguments. Since a :class:`~loopy.kernel.data.KernelArgument` can be both an +input and an output, such arguments would be a part of the call instruction's +assignees as well as the call expression node's parameters. + + Reference --------- diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index fe165b0f2..8d0f05daa 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -68,6 +68,8 @@ __doc__ = """ .. autoclass:: UnrollTag .. autoclass:: Iname + +.. autoclass:: KernelArgument """ -- GitLab From d7e2c3677445104fc2b0cec203fc3c09702bb2f9 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 28 Apr 2021 22:20:27 +0200 Subject: [PATCH 858/916] Fix complex abs. 
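Previously, abs() applied to a complex-typed operand did not get the "c"
prefix: the dispatch condition only checked the result dtype, which had
already been set to the real dtype for "abs"/"real"/"imag", so only "real"
and "imag" were being remapped. Adding "abs" to the name list makes abs of
a complex operand lower to C99's cabs() (see the test added in the next
patch).

A minimal sketch of the behavior this enables once the change is applied,
here using the plain C target; argument names and sizes are illustrative
and not taken from the test suite:

    import numpy as np
    import loopy as lp

    # abs() of a complex argument should now resolve to cabs() in the
    # generated C99 code.
    knl = lp.make_kernel(
        "{[i]: 0<=i<16}",
        "out[i] = abs(z[i])",
        [lp.GlobalArg("z", np.complex128, shape=(16,)),
         lp.GlobalArg("out", np.float64, shape=(16,))],
        target=lp.CTarget(),
        lang_version=(2018, 2))

    # The generated code should contain a call to cabs().
    print(lp.generate_code_v2(knl).device_code())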
--- loopy/target/c/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 4548d8487..31d53ebf4 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -518,7 +518,7 @@ class CMathCallable(ScalarCallable): if name in ["abs", "real", "imag"]: dtype = real_dtype - if dtype.kind == "c" or name in ["real", "imag"]: + if dtype.kind == "c" or name in ["real", "imag", "abs"]: if name != "conj": name = "c" + name -- GitLab From 3d2aa260a5e6f835dd0a460e9ea33ab5871f799c Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 30 Apr 2021 11:51:43 +0200 Subject: [PATCH 859/916] Add complex abs test. --- test/test_expression.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_expression.py b/test/test_expression.py index 9e997422e..2175c4750 100644 --- a/test/test_expression.py +++ b/test/test_expression.py @@ -501,6 +501,7 @@ def test_complex_support(ctx_factory, target): euler1_imag[i] = imag(euler1[i]) real_times_complex[i] = in1[i]*(in2[i]*1j) real_plus_complex[i] = in1[i] + (in2[i]*1j) + abs_complex[i] = cabs(real_plus_complex[i]) complex_div_complex[i] = (2jf + 7*in1[i])/(32jf + 37*in1[i]) complex_div_real[i] = (2jf + 7*in1[i])/in1[i] real_div_complex[i] = in1[i]/(2jf + 7*in1[i]) @@ -533,6 +534,7 @@ def test_complex_support(ctx_factory, target): np.testing.assert_allclose(out["euler1_imag"], 0, atol=1e-10) np.testing.assert_allclose(out["real_times_complex"], in1*(in2*1j)) np.testing.assert_allclose(out["real_plus_complex"], in1+(in2*1j)) + np.testing.assert_allclose(out["abs_complex"], np.sqrt(in1**2+in2**2)) np.testing.assert_allclose(out["complex_div_complex"], (2j+7*in1)/(32j+37*in1)) np.testing.assert_allclose(out["complex_div_real"], (2j + 7*in1)/in1) np.testing.assert_allclose(out["real_div_complex"], in1/(2j + 7*in1)) -- GitLab From 4ba8df8d3f4452766eef885e761d9eae46e7cdf4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <15399010+kaushikcfd@users.noreply.github.com> Date: Fri, 30 Apr 2021 08:23:45 -0500 Subject: [PATCH 860/916] cabs -> abs cabs is automatically emitted by loopy for a c99 target --- test/test_expression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_expression.py b/test/test_expression.py index 2175c4750..444aab749 100644 --- a/test/test_expression.py +++ b/test/test_expression.py @@ -501,7 +501,7 @@ def test_complex_support(ctx_factory, target): euler1_imag[i] = imag(euler1[i]) real_times_complex[i] = in1[i]*(in2[i]*1j) real_plus_complex[i] = in1[i] + (in2[i]*1j) - abs_complex[i] = cabs(real_plus_complex[i]) + abs_complex[i] = abs(real_plus_complex[i]) complex_div_complex[i] = (2jf + 7*in1[i])/(32jf + 37*in1[i]) complex_div_real[i] = (2jf + 7*in1[i])/in1[i] real_div_complex[i] = in1[i]/(2jf + 7*in1[i]) @@ -534,7 +534,7 @@ def test_complex_support(ctx_factory, target): np.testing.assert_allclose(out["euler1_imag"], 0, atol=1e-10) np.testing.assert_allclose(out["real_times_complex"], in1*(in2*1j)) np.testing.assert_allclose(out["real_plus_complex"], in1+(in2*1j)) - np.testing.assert_allclose(out["abs_complex"], np.sqrt(in1**2+in2**2)) + np.testing.assert_allclose(out["abs_complex"], np.abs(out["real_plus_complex"])) np.testing.assert_allclose(out["complex_div_complex"], (2j+7*in1)/(32j+37*in1)) np.testing.assert_allclose(out["complex_div_real"], (2j + 7*in1)/in1) np.testing.assert_allclose(out["real_div_complex"], in1/(2j + 7*in1)) -- GitLab From 0d6ed6f5850eff483d4e009724b325541d7f62b6 Mon Sep 17 00:00:00 2001 
From: Kaushik Kulkarni Date: Thu, 29 Apr 2021 13:13:41 -0500 Subject: [PATCH 861/916] add check_sub_array_ref_inames_not_within_or_redn_inames --- loopy/check.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/loopy/check.py b/loopy/check.py index 36ffc8df0..69c2c12fe 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -187,6 +187,27 @@ def check_for_integer_subscript_indices(kernel, callables_table): type(insn).__name__)) +def check_sub_array_ref_inames_not_within_or_redn_inames(kernel): + all_within_inames = frozenset().union(*(insn.within_inames + for insn in kernel.instructions)) + all_redn_inames = frozenset().union(*(insn.reduction_inames() + for insn in kernel.instructions)) + all_sar_inames = frozenset().union(*(insn.sub_array_ref_inames() + for insn in kernel.instructions)) + + if all_sar_inames & all_within_inames: + sample = (all_sar_inames & all_within_inames).pop() + raise LoopyError(f"Iname '{sample}' used as a sub-array ref's sweep" + " iname and an instruction's within inames. Such usage" + " is illegal.") + + if all_sar_inames & all_redn_inames: + sample = (all_sar_inames & all_redn_inames).pop() + raise LoopyError(f"Iname '{sample}' used as a sub-array ref's sweep" + " iname and a reduction iname. Such usage is" + " illegal.") + + def check_insn_attributes(kernel): """ Check for legality of attributes of every instruction in *kernel*. @@ -933,6 +954,11 @@ def pre_schedule_checks(kernel, callables_table): check_for_integer_subscript_indices(kernel, callables_table) check_functions_are_resolved(kernel) + # Ordering restriction: + # check_sub_array_ref_inames_not_within_or_redn_inames should be done + # before check_bounds. check_bounds involves certain assertions + # triggering this restriction. + check_sub_array_ref_inames_not_within_or_redn_inames(kernel) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) check_for_double_use_of_hw_axes(kernel, callables_table) -- GitLab From d1ad1d1af94858f59102fb26a2ccef6502c466e6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Apr 2021 17:54:56 -0500 Subject: [PATCH 862/916] less stateful in accessing a set's entry Co-authored-by: Andreas Kloeckner --- loopy/check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 69c2c12fe..7d2c3ae2d 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -196,13 +196,13 @@ def check_sub_array_ref_inames_not_within_or_redn_inames(kernel): for insn in kernel.instructions)) if all_sar_inames & all_within_inames: - sample = (all_sar_inames & all_within_inames).pop() + sample = next(iter(all_sar_inames & all_within_inames)) raise LoopyError(f"Iname '{sample}' used as a sub-array ref's sweep" " iname and an instruction's within inames. Such usage" " is illegal.") if all_sar_inames & all_redn_inames: - sample = (all_sar_inames & all_redn_inames).pop() + sample = next(iter(all_sar_inames & all_within_inames)) raise LoopyError(f"Iname '{sample}' used as a sub-array ref's sweep" " iname and a reduction iname. 
Such usage is" " illegal.") -- GitLab From 1d8e33f1ff4bf21c6798d768b4ea24ff9566a3c5 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Apr 2021 18:03:42 -0500 Subject: [PATCH 863/916] adds more context to the preschedule_check ordering restriction --- loopy/check.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 7d2c3ae2d..19e467cd3 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -956,8 +956,7 @@ def pre_schedule_checks(kernel, callables_table): check_functions_are_resolved(kernel) # Ordering restriction: # check_sub_array_ref_inames_not_within_or_redn_inames should be done - # before check_bounds. check_bounds involves certain assertions - # triggering this restriction. + # before check_bounds. See: BatchedAccessMapMapper.map_sub_array_ref. check_sub_array_ref_inames_not_within_or_redn_inames(kernel) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) -- GitLab From 2f97fb2275815508c5ba395a50649dda728ea8f6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 1 May 2021 20:59:38 -0500 Subject: [PATCH 864/916] purge function_manglers --- doc/misc.rst | 2 +- doc/ref_transform.rst | 3 --- loopy/__init__.py | 24 ------------------------ loopy/kernel/creation.py | 3 --- loopy/target/python.py | 17 ----------------- 5 files changed, 1 insertion(+), 48 deletions(-) diff --git a/doc/misc.rst b/doc/misc.rst index da61e3051..cd3d20501 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -332,7 +332,7 @@ This list is always growing, but here are a few pointers: * Interface with your own library functions - Use :func:`loopy.register_function_manglers`. + See :ref:`func-interface` for details. * Loop collapse diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index 57d33b539..1ba295777 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -87,8 +87,6 @@ Registering Library Routines .. autofunction:: register_symbol_manglers -.. autofunction:: register_function_manglers - Modifying Arguments ------------------- @@ -144,4 +142,3 @@ TODO: Matching instruction tags .. automodule:: loopy.match .. vim: tw=75:spell - diff --git a/loopy/__init__.py b/loopy/__init__.py index 8820cbd99..a50c622f7 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -298,7 +298,6 @@ __all__ = [ "register_preamble_generators", "register_symbol_manglers", - "register_function_manglers", "set_caching_enabled", "CacheMode", @@ -395,29 +394,6 @@ def register_symbol_manglers(kernel, manglers): return kernel.copy(symbol_manglers=new_manglers) - -@for_each_kernel -def register_function_manglers(kernel, manglers): - """ - :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)`` - returning a :class:`loopy.CallMangleInfo`. - :returns: *kernel* with *manglers* registered - """ - from loopy.tools import unpickles_equally - - new_manglers = kernel.function_manglers[:] - for m in manglers: - if m not in new_manglers: - if not unpickles_equally(m): - raise LoopyError("mangler '%s' does not " - "compare equally after being upickled " - "and would disrupt loopy's caches" - % m) - - new_manglers.insert(0, m) - - return kernel.copy(function_manglers=new_manglers) - # }}} diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 6718168bd..4d1e86ca7 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -2144,9 +2144,6 @@ def make_function(domains, instructions, kernel_data=["..."], **kwargs): :arg default_offset: 0 or :class:`loopy.auto`. 
The default value of *offset* in :attr:`ArrayArg` for guessed arguments. Defaults to 0. - :arg function_manglers: list of functions of signature - ``(target, name, arg_dtypes)`` - returning a :class:`loopy.CallMangleInfo`. :arg symbol_manglers: list of functions of signature (name) returning a tuple (result_dtype, c_name), where c_name is the C-level symbol to be evaluated. diff --git a/loopy/target/python.py b/loopy/target/python.py index d30dd41a7..08142e66f 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -23,8 +23,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import numpy as np - from pymbolic.mapper import Mapper from pymbolic.mapper.stringifier import StringifyMapper from loopy.type_inference import TypeReader @@ -132,21 +130,6 @@ class ExpressionToPythonMapper(StringifyMapper): # {{{ ast builder -def _numpy_single_arg_function_mangler(kernel, name, arg_dtypes): - if (not isinstance(name, str) - or not hasattr(np, name) - or len(arg_dtypes) != 1): - return None - - arg_dtype, = arg_dtypes - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="_lpy_np."+name, - result_dtypes=(arg_dtype,), - arg_dtypes=arg_dtypes) - - def _base_python_preamble_generator(preamble_info): yield ("00_future", "from __future__ import division, print_function\n") yield ("05_numpy_import", """ -- GitLab From 4b8d0cfbfe9cfdee25e9576467aaf3f0171ec7fa Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 3 May 2021 16:04:22 -0500 Subject: [PATCH 865/916] Fix test_write_block_matrix_fusion for kernel_callables --- test/test_fusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_fusion.py b/test/test_fusion.py index 8e28fb349..a811b3b3b 100644 --- a/test/test_fusion.py +++ b/test/test_fusion.py @@ -134,7 +134,7 @@ def test_write_block_matrix_fusion(ctx_factory): knl = lp.rename_argument(write_into_mat_prg(), "mat", f"mat_{idx}") kwargs[f"mat_{idx}"] = mat - for iname in knl.all_inames(): + for iname in knl.default_entrypoint.all_inames(): knl = lp.rename_iname(knl, iname, f"{iname}_{idx}") knl = lp.rename_argument(knl, "ndofs", f"ndofs_{idx}") -- GitLab From 107324139165e13274e088a5f8e586cc2471711a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 30 Apr 2021 23:17:35 -0500 Subject: [PATCH 866/916] re-enable check_unused_hw_axes --- loopy/check.py | 49 +++++++++++++++++++++++++------------- loopy/codegen/__init__.py | 6 ++--- loopy/codegen/control.py | 6 ++--- loopy/codegen/loop.py | 2 +- loopy/kernel/__init__.py | 21 ++++++++-------- loopy/schedule/__init__.py | 4 ++-- loopy/target/__init__.py | 5 +++- loopy/target/ispc.py | 2 +- loopy/target/pyopencl.py | 2 +- test/test_callables.py | 28 ++++++++++++++++++++++ 10 files changed, 86 insertions(+), 39 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 19e467cd3..10a191a1f 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1232,27 +1232,42 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel, callables_table): +def pre_codegen_entrypoint_checks(kernel, callables_table): + logger.debug("pre-codegen (entrypoint) check %s: start" % kernel.name) + + check_for_unused_hw_axes_in_insns(kernel, callables_table) + kernel.target.pre_codegen_entrypoint_check(kernel, callables_table) + + logger.debug("pre-codegen (entrypoint) check %s: done" % kernel.name) + + +def pre_codegen_callee_checks(kernel, callables_table): + logger.debug("pre-codegen (callee) check %s: start" % 
kernel.name) + + check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) + check_that_temporaries_are_defined_in_subkernels_where_used(kernel) + check_that_all_insns_are_scheduled(kernel) + kernel.target.pre_codegen_callee_check(kernel, callables_table) + check_that_shapes_and_strides_are_arguments(kernel) + + logger.debug("pre-codegen (callee) check %s: done" % kernel.name) + + +def pre_codegen_checks(t_unit): + from loopy.kernel.function_interface import CallableKernel + try: - logger.debug("pre-codegen check %s: start" % kernel.name) - - # FIXME `check_for_unused_hw_axes_in_insns` currently flags a problem - # in the callee if a caller kernel, at a call site, uses hardware axes - # (say `g.0` and `g.1`). It does not seem that that knowledge is - # propagated to the callee. - # check_for_unused_hw_axes_in_insns(kernel, callables_table) - check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) - check_that_temporaries_are_defined_in_subkernels_where_used(kernel) - check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel, callables_table) - check_that_shapes_and_strides_are_arguments(kernel) - - logger.debug("pre-codegen check %s: done" % kernel.name) + for e in t_unit.entrypoints: + pre_codegen_entrypoint_checks(t_unit[e], t_unit.callables_table) + + for name, clbl in t_unit.callables_table.items(): + if isinstance(clbl, CallableKernel): + pre_codegen_callee_checks(clbl.subkernel, t_unit.callables_table) except Exception: print(75*"=") - print("failing kernel during pre-schedule check:") + print("failing kernel during pre-codegen check:") print(75*"=") - print(kernel) + print(t_unit) print(75*"=") raise diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 7bda7f57a..93dd745ee 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -478,9 +478,6 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel, callables_table) - codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code") # {{{ examine arg list @@ -764,6 +761,9 @@ def generate_code_v2(program): # callable should be exclusively an entrypoint or a non-entrypoint kernel. 
program = diverge_callee_entrypoints(program) + from loopy.check import pre_codegen_checks + pre_codegen_checks(program) + host_programs = {} device_programs = [] device_preambles = [] diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index bf74f4789..ec2a2e283 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -88,10 +88,10 @@ def generate_code_for_sched_index(codegen_state, sched_index): codegen_result = generate_host_or_device_program( new_codegen_state, sched_index) - glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index), - codegen_state.callables_table) if codegen_state.is_entrypoint: + glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( + get_insn_ids_for_block_at(kernel.schedule, sched_index), + codegen_state.callables_table) return merge_codegen_results(codegen_state, [ codegen_result, diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index c343483f0..724989d28 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -251,7 +251,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block, codegen_state.callables_table) + insn_ids_for_block, codegen_state.callables_table, return_dict=True) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 273328073..5598714cf 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1157,6 +1157,12 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): tgt_dict[tag.axis] = size + # {{{ override local_sizes with self.local_sizes + + local_sizes.update(self.local_sizes) + + # }}} + return global_sizes, local_sizes @memoize_method @@ -1183,22 +1189,16 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): if return_dict: return global_sizes, local_sizes - def to_dim_tuple(size_dict, which, forced_sizes={}): - forced_sizes = forced_sizes.copy() - + def to_dim_tuple(size_dict, which): size_list = [] sorted_axes = sorted(size_dict.keys()) - while sorted_axes or forced_sizes: + while sorted_axes: if sorted_axes: cur_axis = sorted_axes.pop(0) else: cur_axis = None - if len(size_list) in forced_sizes: - size_list.append(forced_sizes.pop(len(size_list))) - continue - assert cur_axis is not None if cur_axis > len(size_list): @@ -1210,7 +1210,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): return tuple(size_list) return (to_dim_tuple(global_sizes, "global"), - to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) + to_dim_tuple(local_sizes, "local")) @memoize_method def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, @@ -1250,7 +1250,8 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), - callables_table, ignore_auto=ignore_auto) + callables_table, ignore_auto=ignore_auto, + return_dict=return_dict) def get_grid_size_upper_bounds_as_exprs(self, callables_table, ignore_auto=False, return_dict=False): diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 0951db869..f5b298d5e 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -2065,8 +2065,8 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, 
lsize = ( - kernel.get_grid_size_upper_bounds(callables_table)) + gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, + return_dict=True) if (gsize or lsize): if not kernel.options.disable_global_barriers: diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 8706c4a37..073abc870 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -87,7 +87,10 @@ class TargetBase: def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel, callables_table): + def pre_codegen_entrypoint_check(self, kernel, callables_table): + pass + + def pre_codegen_callee_check(self, kernel, callables_table): pass # }}} diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 526d3855e..67af90a24 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -171,7 +171,7 @@ class ISPCTarget(CFamilyTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel, callables_table): + def pre_codegen_entrypoint_check(self, kernel, callables_table): gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( callables_table) if len(lsize) > 1: diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 3123c2714..bcced16df 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -620,7 +620,7 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel, callables_table): + def pre_codegen_entrypoint_check(self, kernel, callables_table): check_sizes(kernel, callables_table, self.device) def get_host_ast_builder(self): diff --git a/test/test_callables.py b/test/test_callables.py index 82efb25ee..6f36ce818 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -735,6 +735,34 @@ def test_valueargs_being_mapped_in_inling(ctx_factory): lp.auto_test_vs_ref(knl, ctx_factory(), knl) +@pytest.mark.parametrize("inline", [True, False]) +def test_unused_hw_axes_in_callee(ctx_factory, inline): + ctx = ctx_factory() + + twice = lp.make_function( + "{[i]: 0<=i<10}", + """ + y[i] = 2*x[i] + """, name="twice") + + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[:, i] = twice(x[:, i]) + """, [lp.GlobalArg("x", shape=(10, 10), dtype=float), + lp.GlobalArg("y", shape=(10, 10))], + name="outer") + + twice = lp.tag_inames(twice, {"i": "l.1"}) + knl = lp.tag_inames(knl, {"i": "l.0"}) + knl = lp.merge([knl, twice]) + + if inline: + knl = lp.inline_callable_kernel(knl, "twice") + + lp.auto_test_vs_ref(knl, ctx, knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 2bf82a4349a80c1168035bf56474d45ab4070175 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 1 May 2021 17:05:47 -0500 Subject: [PATCH 867/916] removes dead with_hw_axes_sizes in favor of get_used_hw_axes, get_hw_axes_sizes --- loopy/kernel/function_interface.py | 105 ++++++++++++++--------------- 1 file changed, 51 insertions(+), 54 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 6eed98b88..8c9a0f2ac 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -278,37 +278,6 @@ def get_kw_pos_association(kernel): return kw_to_pos, pos_to_kw - -class GridOverrideForCalleeKernel(ImmutableRecord): - """ - Helper class to set the - :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the - callee kernels. 
Refer to - :meth:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, - :meth:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. - - .. attribute:: global_size - - The global work group size that to be set in the callee kernel. - - .. attribute:: local_size - - The local work group size that has to be set in the callee kernel. - - .. note:: - - This class acts as a pseudo-callable and its significance lies in - solving picklability issues. - """ - fields = {"local_size", "global_size"} - - def __init__(self, global_size, local_size): - self.global_size = global_size - self.local_size = local_size - - def __call__(self, insn_ids, callables_table, ignore_auto=True): - return self.global_size, self.local_size - # }}} @@ -338,11 +307,12 @@ class InKernelCallable(ImmutableRecord): .. automethod:: with_types .. automethod:: with_descrs .. automethod:: with_target - .. automethod:: with_hw_axes_sizes .. automethod:: generate_preambles .. automethod:: emit_call .. automethod:: emit_call_insn .. automethod:: is_ready_for_codegen + .. automethod:: get_hw_axes_sizes + .. automethod:: get_used_hw_axes .. note:: @@ -449,22 +419,32 @@ class InKernelCallable(ImmutableRecord): return self.copy(arg_id_to_dtype=new_arg_id_to_dtype) - def with_hw_axes_sizes(self, global_size, local_size): - """ - Returns a copy of *self* with modifications to comply with the grid - sizes ``(local_size, global_size)`` of the program in which it is - supposed to be called. - - :arg local_size: An instance of :class:`islpy.PwAff`. - :arg global_size: An instance of :class:`islpy.PwAff`. - """ - raise NotImplementedError() - def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None) + def get_hw_axes_sizes(self, arg_id_to_val, space, callables_table): + """ + Returns ``gsizes, lsizes``, where *gsizes* and *lsizes* are mappings + from axis indices to corresponding group or local hw axis sizes. The hw + axes sizes are represented as instances of :class:`islpy.PwAff` on the + given *space*. + + :arg arg_id_to_val: A mapping from the passed argument *id* to the + arguments at a call-site. + :arg space: An instance of :class:`islpy.Space`. + """ + raise NotImplementedError + + def get_used_hw_axes(self, callables_table): + """ + Returns a tuple ``group_axes_used, local_axes_used``, where + ``(group|local)_axes_used`` are :class:`frozenset` of hardware axes + indices used by the callable. + """ + raise NotImplementedError + def generate_preambles(self, target): """ Yields the target specific preamble. @@ -546,8 +526,11 @@ class ScalarCallable(InKernelCallable): return (self.copy(arg_id_to_descr=arg_id_to_descr), clbl_inf_ctx) - def with_hw_axes_sizes(self, global_size, local_size): - return self.copy() + def get_hw_axes_sizes(self, arg_id_to_val, space, callables_table): + return {}, {} + + def get_used_hw_axes(self, callables_table): + return frozenset(), frozenset() def is_ready_for_codegen(self): @@ -673,16 +656,12 @@ class CallableKernel(InKernelCallable): :meth:`CallableKernel.with_descrs` should be called in order to match the arguments' shapes/strides across the caller and the callee kernel. - :meth:`CallableKernel.with_hw_axes_sizes` should be called to set the grid - sizes for the :attr:`CallableKernel.subkernel` of the callable. - .. attribute:: subkernel :class:`~loopy.LoopKernel` which is being called. .. automethod:: with_descrs .. automethod:: with_types - .. 
automethod:: with_hw_axes_sizes """ fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"} @@ -873,11 +852,29 @@ class CallableKernel(InKernelCallable): return self.copy(subkernel=self.subkernel, arg_id_to_descr=arg_id_to_descr) - def with_hw_axes_sizes(self, gsize, lsize): - return self.copy( - subkernel=self.subkernel.copy( - overridden_get_grid_sizes_for_insn_ids=( - GridOverrideForCalleeKernel(gsize, lsize)))) + def get_used_hw_axes(self, callables_table): + gsize, lsize = self.subkernel.get_grid_size_upper_bounds(callables_table, + return_dict=True) + + return frozenset(gsize.keys()), frozenset(lsize.keys()) + + def get_hw_axes_sizes(self, arg_id_to_val, space, callables_table): + from loopy.isl_helpers import subst_into_pwaff + _, pos_to_kw = get_kw_pos_association(self.subkernel) + gsize, lsize = self.subkernel.get_grid_size_upper_bounds(callables_table, + return_dict=True) + + subst_dict = {i: val + for i, val in arg_id_to_val.items() + if isinstance(self.subkernel.arg_dict[pos_to_kw[i]], + ValueArg)} + + gsize = {iaxis: subst_into_pwaff(space, size, subst_dict) + for iaxis, size in gsize.items()} + lsize = {iaxis: subst_into_pwaff(space, size, subst_dict) + for iaxis, size in lsize.items()} + + return gsize, lsize def is_ready_for_codegen(self): return (self.arg_id_to_dtype is not None -- GitLab From ba08b53670823fae9cd3f4b31ab868ac2fff8bf7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 1 May 2021 17:07:13 -0500 Subject: [PATCH 868/916] cleanup LoopKernel.get_grid_sizes_for_insn_ids to use clbl.get_hw_axes_sizes --- loopy/check.py | 27 +++++++++++++--- loopy/kernel/__init__.py | 66 +++++++++++++++++++--------------------- 2 files changed, 53 insertions(+), 40 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 10a191a1f..66e6ccc98 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -28,7 +28,8 @@ from loopy.diagnostic import (LoopyError, WriteRaceConditionWarning, warn_with_kernel) from loopy.type_inference import TypeReader from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, - CInstruction, _DataObliviousInstruction) + CInstruction, _DataObliviousInstruction, + NoOpInstruction) from pytools import memoize_method from collections import defaultdict @@ -1006,10 +1007,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.schedule, sched_index), - callables_table) + callables_table, return_dict=True) - group_axes = {ax for ax, length in enumerate(group_size)} - local_axes = {ax for ax, length in enumerate(local_size)} + group_axes = set(group_size.keys()) + local_axes = set(local_size.keys()) i = sched_index + 1 assert isinstance(kernel.schedule[past_end_i - 1], ReturnFromKernel) @@ -1030,6 +1031,9 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, insn = kernel.id_to_insn[sched_item.insn_id] i += 1 + if isinstance(insn, NoOpInstruction): + continue + group_axes_used = set() local_axes_used = set() @@ -1048,6 +1052,19 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, elif altags: raise LoopyError("auto local tag encountered") + # {{{ account for any hw axes due to a callable + + if isinstance(insn, CallInstruction): + assert isinstance(insn.expression.function, ResolvedFunction) + clbl = callables_table[insn.expression.function.name] + clbl_g_axes, clbl_l_axes = 
clbl.get_used_hw_axes(callables_table) + assert len(group_axes_used & clbl_g_axes) == 0 + assert len(local_axes_used & clbl_l_axes) == 0 + group_axes_used |= clbl_g_axes + local_axes_used |= clbl_l_axes + + # }}} + if group_axes != group_axes_used: raise LoopyError( f"instruction '{insn.id}' does not use all group hw axes " @@ -1235,7 +1252,6 @@ def check_that_shapes_and_strides_are_arguments(kernel): def pre_codegen_entrypoint_checks(kernel, callables_table): logger.debug("pre-codegen (entrypoint) check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel, callables_table) kernel.target.pre_codegen_entrypoint_check(kernel, callables_table) logger.debug("pre-codegen (entrypoint) check %s: done" % kernel.name) @@ -1244,6 +1260,7 @@ def pre_codegen_entrypoint_checks(kernel, callables_table): def pre_codegen_callee_checks(kernel, callables_table): logger.debug("pre-codegen (callee) check %s: start" % kernel.name) + check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 5598714cf..e69a9fa41 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1078,41 +1078,26 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): global_sizes = {} local_sizes = {} - from loopy.kernel.data import ValueArg from loopy.kernel.instruction import CallInstruction - from loopy.kernel.function_interface import (CallableKernel, - get_kw_pos_association) - from loopy.isl_helpers import subst_into_pwaff from loopy.symbolic import ResolvedFunction for insn in self.instructions: - if isinstance(insn, CallInstruction) and isinstance( - insn.expression.function, ResolvedFunction): + if isinstance(insn, CallInstruction): + assert isinstance(insn.expression.function, ResolvedFunction) + clbl = callables_table[insn.expression.function.name] - if isinstance(clbl, CallableKernel): - _, pos_to_kw = get_kw_pos_association(clbl.subkernel) - subst_dict = { - pos_to_kw[i]: param - for i, param in enumerate(insn.expression.parameters) - if isinstance(clbl.subkernel.arg_dict[pos_to_kw[i]], - ValueArg)} - - gsize, lsize = ( - clbl.subkernel.get_grid_sizes_for_insn_ids_as_dicts( - frozenset(insn.id - for insn in clbl.subkernel.instructions), - callables_table, ignore_auto)) - - for tgt_dict, tgt_size in [(global_sizes, gsize), - (local_sizes, lsize)]: - - for iaxis, size in tgt_size.items(): - size = subst_into_pwaff(self.assumptions.space, - size, subst_dict) - if iaxis in tgt_dict: - tgt_dict[iaxis] = tgt_dict[iaxis].max(size) - else: - tgt_dict[iaxis] = size + gsize, lsize = clbl.get_hw_axes_sizes(insn.arg_id_to_val(), + self.assumptions.space, + callables_table) + + for tgt_dict, tgt_size in [(global_sizes, gsize), + (local_sizes, lsize)]: + + for iaxis, size in tgt_size.items(): + if iaxis in tgt_dict: + tgt_dict[iaxis] = tgt_dict[iaxis].max(size) + else: + tgt_dict[iaxis] = size # }}} @@ -1159,7 +1144,14 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): # {{{ override local_sizes with self.local_sizes - local_sizes.update(self.local_sizes) + for i_lsize, lsize in self.local_sizes.items(): + if i_lsize <= max(local_sizes.keys()): + local_sizes[i_lsize] = lsize + else: + from warnings import warn + warn(f"Forced local sizes '{i_lsize}: {lsize}' is unused" + f" because kernel '{self.name}' uses {max(local_sizes.keys())}" + " local 
hardware axes.") # }}} @@ -1178,10 +1170,14 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): """ if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - callables_table=callables_table, - ignore_auto=ignore_auto) + gsize, lsize = self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + callables_table=callables_table, + ignore_auto=ignore_auto) + if return_dict: + return dict(enumerate(gsize)), dict(enumerate(lsize)) + else: + return gsize, lsize global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( insn_ids, callables_table, ignore_auto=ignore_auto) -- GitLab From c381d8df0dc8e627b2fa6c0ebd291001781ff5a9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 1 May 2021 17:08:14 -0500 Subject: [PATCH 869/916] adds a test for double hw_axes usage at a call site --- test/test_callables.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/test/test_callables.py b/test/test_callables.py index 6f36ce818..ce53a2ee1 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -763,6 +763,35 @@ def test_unused_hw_axes_in_callee(ctx_factory, inline): lp.auto_test_vs_ref(knl, ctx, knl) +@pytest.mark.parametrize("inline", [True, False]) +def test_double_hw_axes_used_in_knl_call(inline): + from loopy.diagnostic import LoopyError + + twice = lp.make_function( + "{[i]: 0<=i<10}", + """ + y[i] = 2*x[i] + """, name="twice") + + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[:, i] = twice(x[:, i]) + """, [lp.GlobalArg("x", shape=(10, 10), dtype=float), + lp.GlobalArg("y", shape=(10, 10))], + name="outer") + + twice = lp.tag_inames(twice, {"i": "l.0"}) + knl = lp.tag_inames(knl, {"i": "l.0"}) + knl = lp.merge([knl, twice]) + + if inline: + knl = lp.inline_callable_kernel(knl, "twice") + + with pytest.raises(LoopyError): + lp.generate_code_v2(knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 3351a9863908d44b11bfef2c3068aaf618f1f8fd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 1 May 2021 17:22:53 -0500 Subject: [PATCH 870/916] allow calls to be unresolved during LoopKernel.get_grid_sizes --- loopy/kernel/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index e69a9fa41..dbcaad5b5 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1082,8 +1082,12 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): from loopy.symbolic import ResolvedFunction for insn in self.instructions: - if isinstance(insn, CallInstruction): - assert isinstance(insn.expression.function, ResolvedFunction) + # TODO: This might be unsafe as call-sites must be resolved to get + # any hardware axes size constraints they might impose. However, + # transforms like 'precompute' use this method and callables might + # not be resolved by then. 
+ if (isinstance(insn, CallInstruction) + and isinstance(insn.expression.function, ResolvedFunction)): clbl = callables_table[insn.expression.function.name] gsize, lsize = clbl.get_hw_axes_sizes(insn.arg_id_to_val(), -- GitLab From 76eea553a6a4a797f5c31ec87c031846fc299a8a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 1 May 2021 20:21:00 -0500 Subject: [PATCH 871/916] [cleanup]: better kernel callable implementation for check_for_double_use_of_hw_axes --- loopy/check.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 66e6ccc98..1ad14d5c7 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -303,12 +303,19 @@ def check_for_double_use_of_hw_axes(kernel, callables_table): Check if any instruction of *kernel* is within multiple inames tagged with the same hw axis tag. """ - from loopy.kernel.data import UniqueTag + from loopy.kernel.data import UniqueTag, GroupIndexTag, LocalIndexTag from loopy.kernel.instruction import CallInstruction - from loopy.kernel.function_interface import CallableKernel + from loopy.symbolic import ResolvedFunction for insn in kernel.instructions: insn_tag_keys = set() + if isinstance(insn, CallInstruction): + assert isinstance(insn.expression.function, ResolvedFunction) + clbl = callables_table[insn.expression.function.name] + gsize, lsize = clbl.get_used_hw_axes(callables_table) + insn_tag_keys |= {GroupIndexTag(i).key for i in gsize} + insn_tag_keys |= {LocalIndexTag(i).key for i in lsize} + for iname in insn.within_inames: for tag in kernel.iname_tags_of_type(iname, UniqueTag): key = tag.key @@ -318,21 +325,6 @@ def check_for_double_use_of_hw_axes(kernel, callables_table): insn_tag_keys.add(key) - # check usage of iname tags in the callee kernel - if isinstance(insn, CallInstruction): - in_knl_callable = callables_table[ - insn.expression.function.name] - if isinstance(in_knl_callable, CallableKernel): - # check for collision in iname_tag keys in the instruction - # due to the callee kernel - common_iname_tags = [tag for tag in - _get_all_unique_iname_tags(in_knl_callable.subkernel) - if tag.key in insn_tag_keys] - if common_iname_tags: - raise LoopyError("instruction '%s' has multiple " - "inames tagged '%s'" % (insn.id, - common_iname_tags.pop())) - def check_for_inactive_iname_access(kernel): """ -- GitLab From 2ee74bd22fb8307df4b7491bf8378dbafd10d44c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 1 May 2021 20:23:46 -0500 Subject: [PATCH 872/916] twice -> thrice avoids cache collision --- test/test_callables.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_callables.py b/test/test_callables.py index ce53a2ee1..d7b3ab447 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -767,26 +767,26 @@ def test_unused_hw_axes_in_callee(ctx_factory, inline): def test_double_hw_axes_used_in_knl_call(inline): from loopy.diagnostic import LoopyError - twice = lp.make_function( + thrice = lp.make_function( "{[i]: 0<=i<10}", """ y[i] = 2*x[i] - """, name="twice") + """, name="thrice") knl = lp.make_kernel( "{[i]: 0<=i<10}", """ - y[:, i] = twice(x[:, i]) + y[:, i] = thrice(x[:, i]) """, [lp.GlobalArg("x", shape=(10, 10), dtype=float), lp.GlobalArg("y", shape=(10, 10))], name="outer") - twice = lp.tag_inames(twice, {"i": "l.0"}) + thrice = lp.tag_inames(thrice, {"i": "l.0"}) knl = lp.tag_inames(knl, {"i": "l.0"}) - knl = lp.merge([knl, twice]) + knl = lp.merge([knl, thrice]) if inline: - knl = 
lp.inline_callable_kernel(knl, "twice") + knl = lp.inline_callable_kernel(knl, "thrice") with pytest.raises(LoopyError): lp.generate_code_v2(knl) -- GitLab From 13bc0f85d430d0d3b8826c1c7aa0efa7fd5c1ce9 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 2 May 2021 10:06:31 -0500 Subject: [PATCH 873/916] get rid of dead '_get_all_unique_iname_tags' --- loopy/check.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 1ad14d5c7..d2664db44 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -268,19 +268,6 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) -def _get_all_unique_iname_tags(kernel): - """Returns an instance of :class:`set` of all the iname tags used in - *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`. - """ - from loopy.kernel.data import UniqueTag - from itertools import chain - iname_tags = list(chain(*(kernel.iname_to_tags.get(iname, []) for iname in - kernel.all_inames()))) - return { - tag for tag in iname_tags if - isinstance(tag, UniqueTag)} - - def check_multiple_tags_allowed(kernel): """ Checks if a multiple tags of an iname are compatible. -- GitLab From d9f623fbb44f96bdcd2874488537de9a366d29e0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 3 May 2021 17:12:19 -0500 Subject: [PATCH 874/916] Rename pre_codegen_callee_checks -> pre_codegen_callable_checks --- loopy/check.py | 14 +++++++------- loopy/target/__init__.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index d2664db44..a675634ee 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1229,24 +1229,24 @@ def check_that_shapes_and_strides_are_arguments(kernel): def pre_codegen_entrypoint_checks(kernel, callables_table): - logger.debug("pre-codegen (entrypoint) check %s: start" % kernel.name) + logger.debug("pre-codegen entrypoint check %s: start" % kernel.name) kernel.target.pre_codegen_entrypoint_check(kernel, callables_table) - logger.debug("pre-codegen (entrypoint) check %s: done" % kernel.name) + logger.debug("pre-codegen entrypoint check %s: done" % kernel.name) -def pre_codegen_callee_checks(kernel, callables_table): - logger.debug("pre-codegen (callee) check %s: start" % kernel.name) +def pre_codegen_callable_checks(kernel, callables_table): + logger.debug("pre-codegen callable check %s: start" % kernel.name) check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_callee_check(kernel, callables_table) + kernel.target.pre_codegen_callable_check(kernel, callables_table) check_that_shapes_and_strides_are_arguments(kernel) - logger.debug("pre-codegen (callee) check %s: done" % kernel.name) + logger.debug("pre-codegen callable check %s: done" % kernel.name) def pre_codegen_checks(t_unit): @@ -1258,7 +1258,7 @@ def pre_codegen_checks(t_unit): for name, clbl in t_unit.callables_table.items(): if isinstance(clbl, CallableKernel): - pre_codegen_callee_checks(clbl.subkernel, t_unit.callables_table) + pre_codegen_callable_checks(clbl.subkernel, t_unit.callables_table) except Exception: print(75*"=") print("failing kernel during pre-codegen check:") diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 073abc870..a6357a12b 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -90,7 
+90,7 @@ class TargetBase: def pre_codegen_entrypoint_check(self, kernel, callables_table): pass - def pre_codegen_callee_check(self, kernel, callables_table): + def pre_codegen_callable_check(self, kernel, callables_table): pass # }}} -- GitLab From 3222ef28e2b1e5712155d2a48f893d3a36fae9d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 3 May 2021 19:10:57 -0500 Subject: [PATCH 875/916] kcv3 downstream grudge CI: Use appropriate branch (#359) * kcv3 downstream grudge CI: Use appropriate branch * Debug grudge downstream CI kcv3 clone * Grudge downstream CI for kcv3: Also for kcv3-grudge-downstream-ci * kcv3 downstream grudge CI: I hate nested shell conditionals * kcv3 grudge downstream CI: Use "special grudge" for kcv3 and branches targeting it --- .github/workflows/ci.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0e932ef2f..88372caee 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -146,7 +146,15 @@ jobs: env: DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }} run: | - git clone "https://github.com/inducer/$DOWNSTREAM_PROJECT.git" + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh + . ./ci-support.sh + + # Use "special grudge" for kcv3 and branches targeting it. + if [[ "$DOWNSTREAM_PROJECT" = "grudge" ]] && [[ "$GITHUB_HEAD_REF" = "kernel_callables_v3-edit2" || "$GITHUB_BASE_REF" = "kernel_callables_v3-edit2" ]]; then + with_echo git clone "https://github.com/kaushikcfd/$DOWNSTREAM_PROJECT.git" -b "kcv3-e2-compat" + else + with_echo git clone "https://github.com/inducer/$DOWNSTREAM_PROJECT.git" + fi cd "$DOWNSTREAM_PROJECT" echo "*** $DOWNSTREAM_PROJECT version: $(git rev-parse --short HEAD)" @@ -159,8 +167,6 @@ jobs: sed -i "/mpi4py/ d" requirements.txt - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh - . 
./ci-support.sh build_py_project_in_conda_env test_py_project -- GitLab From 053fdbf09e690fd9cc83cb84e1c94176a0ab9921 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 3 May 2021 20:21:18 -0500 Subject: [PATCH 876/916] Rename arg counters in _validate_kernel_call_insn --- loopy/check.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index fa3ddd2d0..cf197a2d8 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1287,17 +1287,18 @@ def _validate_kernel_call_insn(caller, call_insn, callee): arg_id_to_val = call_insn.arg_id_to_val() - ipar = 0 - iassignee = -1 + next_iarg_input = 0 + next_iarg_output = -1 for arg in callee.args: if arg.is_input: - if ipar not in arg_id_to_val: + if next_iarg_input not in arg_id_to_val: raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" - f" a {ipar+1}-th positional argument corresponding" + f" a {next_iarg_input+1}-th positional " + "argument corresponding" f" to '{arg.name}'in the callee.") - in_val = arg_id_to_val[ipar] - ipar += 1 + in_val = arg_id_to_val[next_iarg_input] + next_iarg_input += 1 if isinstance(arg, ArrayBase): if not isinstance(in_val, SubArrayRef): raise LoopyError(f"Call to '{callee.name}' in '{call_insn}'" @@ -1309,13 +1310,13 @@ def _validate_kernel_call_insn(caller, call_insn, callee): f" expects a value argument for '{arg.name}'" f" (got {in_val}).") if arg.is_output: - if iassignee not in arg_id_to_val: + if next_iarg_output not in arg_id_to_val: raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" - f" a {-iassignee}-th positional assignee" + f" a {-next_iarg_output}-th positional assignee" f" corresponding to '{arg.name}'in the callee.") - out_val = arg_id_to_val[iassignee] - iassignee -= 1 + out_val = arg_id_to_val[next_iarg_output] + next_iarg_output -= 1 assert isinstance(arg, ArrayBase) if not isinstance(out_val, SubArrayRef): raise LoopyError(f"Call to '{callee.name}' in '{call_insn}'" -- GitLab From 0bac81c78e81043601fe3a8749e259c1cde1dae0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 17 May 2021 07:56:45 -0500 Subject: [PATCH 877/916] use simplify_via_aff instead of distribute to turn on the simplification on affine expressions --- loopy/check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index cf197a2d8..552dbaa36 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1264,7 +1264,7 @@ def _are_sub_array_refs_equivalent(sar1, sar2, caller): from loopy.symbolic import SubstitutionMapper from pymbolic.mapper.substitutor import make_subst_func - from pymbolic.mapper.distributor import distribute + from loopy.isl_helpers import simplify_via_aff subst_func = make_subst_func({iname1.name: iname2 for iname1, iname2 in zip(sar1.swept_inames, sar2.swept_inames) @@ -1275,7 +1275,7 @@ def _are_sub_array_refs_equivalent(sar1, sar2, caller): for idx1, idx2 in zip(sar1.subscript.index_tuple, sar2.subscript.index_tuple): - if distribute(subst_mapper(idx1) - idx2) != 0: + if simplify_via_aff(subst_mapper(idx1) - idx2) != 0: return False return True -- GitLab From abcebc984e15d4103bfa033c173f9746bd0a3c57 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 17 May 2021 08:05:26 -0500 Subject: [PATCH 878/916] test floor div in sub-array-ref exprs --- test/test_callables.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test/test_callables.py b/test/test_callables.py index d7b3ab447..f79f6e8f1 100644 --- 
a/test/test_callables.py +++ b/test/test_callables.py @@ -792,6 +792,32 @@ def test_double_hw_axes_used_in_knl_call(inline): lp.generate_code_v2(knl) +@pytest.mark.parametrize("inline", [True, False]) +def test_kc_with_floor_div_in_expr(ctx_factory, inline): + # See https://github.com/inducer/loopy/issues/366 + import loopy as lp + + ctx = ctx_factory() + callee = lp.make_function( + "{[i]: 0<=i<10}", + """ + x[i] = 2*x[i] + """, name="callee_with_update") + + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + [i]: x[2*(i//2) + (i%2)] = callee_with_update([i]: x[i]) + """) + + knl = lp.merge([knl, callee]) + + if inline: + knl = lp.inline_callable_kernel(knl, "callee_with_update") + + lp.auto_test_vs_ref(knl, ctx, knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 40e33240b531e65cc912d22f4d39fd26860deb5f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 22 May 2021 15:40:29 -0500 Subject: [PATCH 879/916] Fix up Fortran division specialization for kernel callables --- loopy/frontend/fortran/translator.py | 4 +++- loopy/type_inference.py | 12 +++++++++++- test/test_fortran.py | 8 ++++---- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 8dcc32e00..af701cf8e 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -31,6 +31,7 @@ from loopy.frontend.fortran.tree import FTreeWalkerBase from loopy.diagnostic import warn_with_kernel from loopy.frontend.fortran.diagnostic import ( TranslationError, TranslatorWarning) +from loopy.translation_unit import for_each_kernel import islpy as isl from islpy import dim_type from loopy.symbolic import (IdentityMapper, RuleAwareIdentityMapper, @@ -268,7 +269,7 @@ class FortranDivisionSpecializer(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, kernel): super().__init__(rule_mapping_context) from loopy.type_inference import TypeInferenceMapper - self.infer_type = TypeInferenceMapper(kernel) + self.infer_type = TypeInferenceMapper(kernel, None) self.kernel = kernel def map_fortran_division(self, expr, *args): @@ -292,6 +293,7 @@ class FortranDivisionSpecializer(RuleAwareIdentityMapper): self.rec(expr.denominator, *args)) +@for_each_kernel def specialize_fortran_division(knl): rmc = SubstitutionRuleMappingContext( knl.substitutions, knl.get_var_name_generator()) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index cfc04d096..92df0323a 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -191,7 +191,13 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel - assert isinstance(clbl_inf_ctx, CallablesInferenceContext) + assert ( + # FIXME: HACK + # only used in kernel-local type inference for division + # specialization in Fortran + clbl_inf_ctx is None + + or isinstance(clbl_inf_ctx, CallablesInferenceContext)) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments @@ -417,6 +423,10 @@ class TypeInferenceMapper(CombineMapper): arg_id_to_dtype = {i: none_if_empty(self.rec(par)) for (i, par) in enumerate(expr.parameters)} + if self.clbl_inf_ctx is None: + raise LoopyError("TypeInferenceMapper was created without a " + "CallablesInferenceContext, but encountered a function call") + # specializing the known function wrt type in_knl_callable = self.clbl_inf_ctx[expr.function.name] diff --git a/test/test_fortran.py b/test/test_fortran.py index d6bb57162..45e83b384 100644 --- 
a/test/test_fortran.py +++ b/test/test_fortran.py @@ -667,13 +667,13 @@ def test_division_in_shapes(ctx_factory): end do end subroutine """ - knl, = lp.parse_fortran(fortran_src) - ref_knl = knl + t_unit = lp.parse_fortran(fortran_src) + ref_t_unit = t_unit - print(knl) + print(t_unit) ctx = ctx_factory() - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(m=128)) + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit, parameters=dict(m=128)) if __name__ == "__main__": -- GitLab From dbde6bb6eb428d021b8123571a477c65972f876c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 22 May 2021 18:38:26 -0500 Subject: [PATCH 880/916] kernel fusion: removes the restriction that only unresolved translation units could be fused --- loopy/transform/fusion.py | 53 ++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index a62ba7147..6e28d9e7b 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -130,9 +130,6 @@ def _merge_values(item_name, val_a, val_b): # {{{ two-kernel fusion def _fuse_two_kernels(kernela, kernelb): - from loopy.kernel import KernelState - if kernela.state != KernelState.INITIAL or kernelb.state != KernelState.INITIAL: - raise LoopyError("can only fuse kernels in INITIAL state") # {{{ fuse domains @@ -333,20 +330,42 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): # namespace, otherwise the kernel names should be uniquified. # We should also somehow be able to know that callables like "sin"/"cos" # belong to the global namespace and need not be uniquified. + if all(isinstance(kernel, TranslationUnit) for kernel in kernels): - new_kernels = [] + # {{{ sanity checks + for knl in kernels: - kernel_names = [i for i, clbl in - knl.callables_table.items() if isinstance(clbl, - CallableKernel)] - if len(kernel_names) != 1: - raise NotImplementedError("Kernel containing more than one" - " callable kernel, not allowed for now.") - new_kernels.append(knl[kernel_names[0]]) + nkernels = len([i for i, clbl in knl.callables_table.items() + if isinstance(clbl, CallableKernel)]) + if nkernels != 1: + raise NotImplementedError("Translation unit with more than one" + " callable kernel not allowed for now.") + + # }}} + + # {{{ "merge" the callable namespace + + from loopy.transform.callable import rename_callable + loop_kernels_to_be_fused = [] + new_callables = {} - kernels = new_kernels[:] + for t_unit in kernels: + for name in set(t_unit.callables_table) & set(new_callables): + t_unit = rename_callable(t_unit, name) + + for name, clbl in t_unit.callables_table.items(): + if isinstance(clbl, CallableKernel): + loop_kernels_to_be_fused.append(clbl.subkernel) + else: + new_callables[name] = clbl + + # }}} + + kernels = loop_kernels_to_be_fused[:] + else: + assert all(isinstance(knl, LoopKernel) for knl in kernels) + new_callables = {} - assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) if data_flow is None: @@ -425,7 +444,11 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): # }}} - from loopy.translation_unit import make_program - return make_program(result).with_entrypoints(result.name) + new_callables[result.name] = CallableKernel(result) + + return TranslationUnit(callables_table=new_callables, + target=result.target, + entrypoints=frozenset([result.name])) + # vim: foldmethod=marker -- GitLab From 64de6799e816d547e743ee9814dae9f3d7d2e7aa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 22 May 2021 18:41:05 
-0500 Subject: [PATCH 881/916] re-adding the restriction that TypeInferenceMapper can only take CallablesInferenceContext --- loopy/type_inference.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 92df0323a..d168848db 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -191,13 +191,7 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel - assert ( - # FIXME: HACK - # only used in kernel-local type inference for division - # specialization in Fortran - clbl_inf_ctx is None - - or isinstance(clbl_inf_ctx, CallablesInferenceContext)) + assert isinstance(clbl_inf_ctx, CallablesInferenceContext) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments @@ -423,10 +417,6 @@ class TypeInferenceMapper(CombineMapper): arg_id_to_dtype = {i: none_if_empty(self.rec(par)) for (i, par) in enumerate(expr.parameters)} - if self.clbl_inf_ctx is None: - raise LoopyError("TypeInferenceMapper was created without a " - "CallablesInferenceContext, but encountered a function call") - # specializing the known function wrt type in_knl_callable = self.clbl_inf_ctx[expr.function.name] @@ -588,6 +578,9 @@ class TypeInferenceMapper(CombineMapper): def map_sub_array_ref(self, expr): return self.rec(expr.subscript) + def map_fortran_division(self, expr): + return [] + # }}} -- GitLab From b7640ecf75669ef62f0a89a623bb60e9533e626a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 22 May 2021 18:44:24 -0500 Subject: [PATCH 882/916] translation unit minor fixes: - fix TranslationUnit.state when it does not contain any callable kernels - fix callables being "lost" during inference when they weren't reachable from the entrypoints --- loopy/translation_unit.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 269fd53f9..ac3b7076b 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -254,9 +254,11 @@ class TranslationUnit(ImmutableRecord): @property def state(self): """ Returns an instance of :class:`loopy.kernel.KernelState`. 
""" - return min(callable_knl.subkernel.state - for callable_knl in self.callables_table.values() - if isinstance(callable_knl, CallableKernel)) + from loopy.kernel import KernelState + return min((callable_knl.subkernel.state + for callable_knl in self.callables_table.values() + if isinstance(callable_knl, CallableKernel)), + default=KernelState.INITIAL) def with_kernel(self, kernel): """ @@ -662,7 +664,7 @@ class CallablesInferenceContext(ImmutableRecord): - self.new_entrypoints) todo_renames = {} - new_callables = {} + new_callables = dict(program.callables_table) for c in callees_with_old_entrypoint_names: unique_func_id = c @@ -808,7 +810,8 @@ def resolve_callables(program): callables_table = {} # callables: name of the calls seen in the program - callables = set(program.entrypoints) + callables = {name for name, clbl in program.callables_table.items() + if isinstance(clbl, CallableKernel)} while callables: clbl_name = callables.pop() -- GitLab From e6cd2511949ef9bb6d692eb65de64fc30895c7b4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 22 May 2021 18:46:42 -0500 Subject: [PATCH 883/916] remove unreachable callables before going further into the codegen pipeline --- loopy/preprocess.py | 11 +++++++++++ loopy/transform/callable.py | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index f8fe8eef8..87b42160d 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2362,6 +2362,15 @@ def inline_kernels_with_gbarriers(program): # }}} +def filter_reachable_callables(t_unit): + from loopy.translation_unit import _get_callable_ids + reachable_function_ids = _get_callable_ids(t_unit.callables_table, + t_unit.entrypoints) + new_callables = {name: clbl for name, clbl in t_unit.callables_table.items() + if name in (reachable_function_ids | t_unit.entrypoints)} + return t_unit.copy(callables_table=new_callables) + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2449,6 +2458,8 @@ def preprocess_program(program, device=None): from loopy.translation_unit import resolve_callables program = resolve_callables(program) + program = filter_reachable_callables(program) + if device is not None: # FIXME: Time to remove this? (Git blame shows 5 years ago) from warnings import warn diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 651f4457c..e88c88239 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -491,7 +491,7 @@ def inline_callable_kernel(translation_unit, function_name): Returns a copy of *translation_unit* with the callable kernel named *function_name* inlined at all call-sites. 
""" - from loopy.preprocess import infer_arg_descr + from loopy.preprocess import infer_arg_descr, filter_reachable_callables from loopy.translation_unit import resolve_callables # {{{ must have argument shape information at call sites to inline @@ -503,7 +503,9 @@ def inline_callable_kernel(translation_unit, function_name): callee = translation_unit[function_name] - return _inline_single_callable_kernel(translation_unit, callee) + return filter_reachable_callables( + _inline_single_callable_kernel(translation_unit, + callee)) # }}} -- GitLab From 81b7ee4d8804a193076fcb4437f755040c42e3fa Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 22 May 2021 18:47:38 -0500 Subject: [PATCH 884/916] call specialize_fortran_division on the entire translation unit rather than the kernel --- loopy/frontend/fortran/__init__.py | 3 +++ loopy/frontend/fortran/translator.py | 36 ++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 15c6a7dc3..c5c1943ca 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -356,6 +356,9 @@ def parse_fortran(source, filename="", free_form=None, strict=None, # guesssing in the case of only one function prog = prog.with_entrypoints(all_kernels[0].name) + from loopy.frontend.fortran.translator import specialize_fortran_division + prog = specialize_fortran_division(prog) + parse_plog.done() return prog diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index af701cf8e..f13f63098 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -31,7 +31,6 @@ from loopy.frontend.fortran.tree import FTreeWalkerBase from loopy.diagnostic import warn_with_kernel from loopy.frontend.fortran.diagnostic import ( TranslationError, TranslatorWarning) -from loopy.translation_unit import for_each_kernel import islpy as isl from islpy import dim_type from loopy.symbolic import (IdentityMapper, RuleAwareIdentityMapper, @@ -266,10 +265,10 @@ class FortranDivisionToFloorDiv(IdentityMapper): class FortranDivisionSpecializer(RuleAwareIdentityMapper): - def __init__(self, rule_mapping_context, kernel): + def __init__(self, rule_mapping_context, kernel, callables): super().__init__(rule_mapping_context) - from loopy.type_inference import TypeInferenceMapper - self.infer_type = TypeInferenceMapper(kernel, None) + from loopy.type_inference import TypeReader + self.infer_type = TypeReader(kernel, callables) self.kernel = kernel def map_fortran_division(self, expr, *args): @@ -293,11 +292,31 @@ class FortranDivisionSpecializer(RuleAwareIdentityMapper): self.rec(expr.denominator, *args)) -@for_each_kernel -def specialize_fortran_division(knl): +def _specialize_fortran_division_for_kernel(knl, callables): rmc = SubstitutionRuleMappingContext( knl.substitutions, knl.get_var_name_generator()) - return FortranDivisionSpecializer(rmc, knl).map_kernel(knl) + return FortranDivisionSpecializer(rmc, knl, callables).map_kernel(knl) + + +def specialize_fortran_division(t_unit): + from loopy.translation_unit import TranslationUnit, resolve_callables + from loopy.kernel.function_interface import CallableKernel + from loopy.type_inference import infer_unknown_types + assert isinstance(t_unit, TranslationUnit) + + t_unit = resolve_callables(t_unit) + t_unit = infer_unknown_types(t_unit) + new_callables = {} + + for name, clbl in t_unit.callables_table.items(): + if isinstance(clbl, 
CallableKernel): + knl = clbl.subkernel + clbl = clbl.copy(subkernel=_specialize_fortran_division_for_kernel( + knl, t_unit.callables_table)) + + new_callables[name] = clbl + + return t_unit.copy(callables_table=new_callables) # }}} @@ -904,9 +923,6 @@ class F2LoopyTranslator(FTreeWalkerBase): seq_dependencies=seq_dependencies, ) - if self.all_names_known: - knl = specialize_fortran_division(knl) - from loopy.loop import merge_loop_domains knl = merge_loop_domains(knl) -- GitLab From a54b2b0ec354e6265f6583a747ba907ee166af0d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 22 May 2021 19:49:15 -0500 Subject: [PATCH 885/916] specialize_fortran_division: account that the type of num/denom may not be inferred --- loopy/frontend/fortran/translator.py | 9 +++++++-- loopy/type_inference.py | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index f13f63098..e04e2cb78 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -273,8 +273,13 @@ class FortranDivisionSpecializer(RuleAwareIdentityMapper): def map_fortran_division(self, expr, *args): # We remove all these before type inference ever sees them. - num_dtype = self.infer_type(expr.numerator).numpy_dtype - den_dtype = self.infer_type(expr.denominator).numpy_dtype + from loopy.type_inference import TypeInferenceFailure + + try: + num_dtype = self.infer_type(expr.numerator).numpy_dtype + den_dtype = self.infer_type(expr.denominator).numpy_dtype + except TypeInferenceFailure: + return super().map_fortran_division(expr, *args) from pymbolic.primitives import Quotient, FloorDiv if num_dtype.kind in "iub" and den_dtype.kind in "iub": diff --git a/loopy/type_inference.py b/loopy/type_inference.py index d168848db..8b9b47f00 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -578,8 +578,7 @@ class TypeInferenceMapper(CombineMapper): def map_sub_array_ref(self, expr): return self.rec(expr.subscript) - def map_fortran_division(self, expr): - return [] + map_fortran_division = map_quotient # }}} -- GitLab From 0bdf4f0405959ac46207a1a27f96801df8f9b917 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 24 May 2021 12:04:03 -0500 Subject: [PATCH 886/916] Rename _get_callable_ids_for_knl -> _get_reachable_callable_ids_for_knl --- loopy/codegen/__init__.py | 4 ++-- loopy/preprocess.py | 4 ++-- loopy/translation_unit.py | 11 ++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 93dd745ee..14f0a75eb 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -605,10 +605,10 @@ def diverge_callee_entrypoints(program): If a :class:`loopy.kernel.function_interface.CallableKernel` is both an entrypoint and a callee, then rename the callee. 
""" - from loopy.translation_unit import (_get_callable_ids, + from loopy.translation_unit import (_get_reachable_callable_ids, rename_resolved_functions_in_a_single_kernel) from pytools import UniqueNameGenerator - callable_ids = _get_callable_ids(program.callables_table, + callable_ids = _get_reachable_callable_ids(program.callables_table, program.entrypoints) new_callables = {} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 87b42160d..59e70827e 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2363,8 +2363,8 @@ def inline_kernels_with_gbarriers(program): def filter_reachable_callables(t_unit): - from loopy.translation_unit import _get_callable_ids - reachable_function_ids = _get_callable_ids(t_unit.callables_table, + from loopy.translation_unit import _get_reachable_callable_ids + reachable_function_ids = _get_reachable_callable_ids(t_unit.callables_table, t_unit.entrypoints) new_callables = {name: clbl for name, clbl in t_unit.callables_table.items() if name in (reachable_function_ids | t_unit.entrypoints)} diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index ac3b7076b..d76e6f9fe 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -488,19 +488,19 @@ class CallablesIDCollector(CombineMapper): map_type_cast = map_constant -def _get_callable_ids_for_knl(knl, callables): +def _get_reachable_callable_ids_for_knl(knl, callables): clbl_id_collector = CallablesIDCollector() return frozenset().union(*( - _get_callable_ids_for_knl(callables[clbl].subkernel, callables) | + _get_reachable_callable_ids_for_knl(callables[clbl].subkernel, callables) | frozenset([clbl]) if isinstance(callables[clbl], CallableKernel) else frozenset([clbl]) for clbl in clbl_id_collector.map_kernel(knl))) -def _get_callable_ids(callables, entrypoints): +def _get_reachable_callable_ids(callables, entrypoints): return frozenset().union(*( - _get_callable_ids_for_knl(callables[e].subkernel, callables) + _get_reachable_callable_ids_for_knl(callables[e].subkernel, callables) for e in entrypoints)) # }}} @@ -645,7 +645,8 @@ class CallablesInferenceContext(ImmutableRecord): # {{{ get all the callables reachable from the new entrypoints. # get the names of all callables reachable from the new entrypoints - new_callable_ids = _get_callable_ids(self.callables, self.new_entrypoints) + new_callable_ids = _get_reachable_callable_ids( + self.callables, self.new_entrypoints) # get the history of function ids from the performed renames: history = {} -- GitLab From 5e038a0b890c92834d12916576ae1a494e5c6543 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 24 May 2021 12:14:38 -0500 Subject: [PATCH 887/916] Drop all_names_known flag from fortran translator --- loopy/frontend/fortran/__init__.py | 10 ++-------- loopy/frontend/fortran/translator.py | 3 +-- test/test_numa_diff.py | 3 +-- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index c5c1943ca..fa5c5050f 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -294,13 +294,8 @@ def _add_assignees_to_calls(knl, all_kernels): def parse_fortran(source, filename="", free_form=None, strict=None, - seq_dependencies=None, auto_dependencies=None, target=None, - all_names_known=True): + seq_dependencies=None, auto_dependencies=None, target=None): """ - :arg all_names_known: if set to *False*, enter an undocumented mode - in which Fortran parsing will try to tolerate unknown names. 
- If used, ``loopy.frontend.fortran.translator.specialize_fortran_division`` - must be called as soon as all names are known. :returns: a :class:`loopy.TranslationUnit` """ @@ -338,8 +333,7 @@ def parse_fortran(source, filename="", free_form=None, strict=None, "and returned invalid data (Sorry!)") from loopy.frontend.fortran.translator import F2LoopyTranslator - f2loopy = F2LoopyTranslator( - filename, target=target, all_names_known=all_names_known) + f2loopy = F2LoopyTranslator(filename, target=target) f2loopy(tree) kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index e04e2cb78..6b6c75622 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -329,11 +329,10 @@ def specialize_fortran_division(t_unit): # {{{ translator class F2LoopyTranslator(FTreeWalkerBase): - def __init__(self, filename, target=None, all_names_known=True): + def __init__(self, filename, target=None): FTreeWalkerBase.__init__(self, filename) self.target = target - self.all_names_known = all_names_known self.scope_stack = [] diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index bfe7c8756..8ba556298 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -57,8 +57,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa source = source.replace("datafloat", "real*4") - program = lp.parse_fortran(source, filename, seq_dependencies=False, - all_names_known=False) + program = lp.parse_fortran(source, filename, seq_dependencies=False) hsv_r, hsv_s = program["strongVolumeKernelR"], program["strongVolumeKernelS"] -- GitLab From 5aeb64efd9853016f9c373f0abfa76d4eff51083 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Apr 2021 16:23:35 -0500 Subject: [PATCH 888/916] improves unique callable name generation --- loopy/translation_unit.py | 103 +++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 58 deletions(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index d76e6f9fe..2c663aa35 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -20,7 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import re import collections from pytools import ImmutableRecord @@ -375,41 +374,7 @@ class Program(TranslationUnit): # }}} -# {{{ next_indexed_function_id - -def next_indexed_function_id(function_id): - """ - Returns an instance of :class:`str` with the next indexed-name in the - sequence for the name of *function_id*. - - *Example:* ``'sin_0'`` will return ``'sin_1'``. - - :arg function_id: Either an instance of :class:`str`. 
- """ - - # {{{ sanity checks - - assert isinstance(function_id, str) - - # }}} - - func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") - - match = func_name.match(function_id) - - if match is None: - if function_id[-1] == "_": - return f"{function_id}0" - else: - return f"{function_id}_0" - - return "{alpha}_{num}".format(alpha=match.group("alpha"), - num=int(match.group("num"))+1) - -# }}} - - -# {{{ rename_resolved_functions_in_a_single_kernel +# {{{ rename resolved functions class ResolvedFunctionRenamer(RuleAwareIdentityMapper): """ @@ -482,10 +447,12 @@ class CallablesIDCollector(CombineMapper): return callables_in_insn + def map_type_cast(self, expr): + return self.rec(expr) + map_variable = map_constant map_function_symbol = map_constant map_tagged_variable = map_constant - map_type_cast = map_constant def _get_reachable_callable_ids_for_knl(knl, callables): @@ -508,8 +475,25 @@ def _get_reachable_callable_ids(callables, entrypoints): # {{{ CallablesInferenceContext +def get_all_subst_names(callables): + """ + Returns a :class:`set` of all substitution rule names in the callable + kernels of *callables*. + + :arg callables: A mapping from function identifiers to + :class:`~loopy.kernel.function_interface.InKernelCallable`. + """ + return set().union(*(set(clbl.subkernel.substitutions.keys()) + for clbl in callables.values() + if isinstance(clbl, CallableKernel))) + + def make_clbl_inf_ctx(callables, entrypoints): - return CallablesInferenceContext(callables) + from pytools import UniqueNameGenerator + all_substs = get_all_subst_names(callables) + ung = UniqueNameGenerator(set(callables.keys()) | all_substs) + + return CallablesInferenceContext(callables, ung) class CallablesInferenceContext(ImmutableRecord): @@ -538,12 +522,13 @@ class CallablesInferenceContext(ImmutableRecord): .. automethod:: __getitem__ """ def __init__(self, callables, + clbl_name_gen, renames=collections.defaultdict(frozenset), new_entrypoints=frozenset()): assert isinstance(callables, collections.abc.Mapping) - callables = dict(callables) - super().__init__(callables=callables, + super().__init__(callables=dict(callables), + clbl_name_gen=clbl_name_gen, renames=renames, new_entrypoints=new_entrypoints) @@ -607,14 +592,8 @@ class CallablesInferenceContext(ImmutableRecord): # }}} - # {{{ must allocate a new clbl in the namespace => find a unique id for it - - unique_function_id = old_function_id - - while unique_function_id in self.callables: - unique_function_id = next_indexed_function_id(unique_function_id) - - # }}} + # must allocate a new clbl in the namespace => find a unique id for it + unique_function_id = self.clbl_name_gen(old_function_id) updated_callables = self.callables.copy() updated_callables[unique_function_id] = new_clbl @@ -668,12 +647,7 @@ class CallablesInferenceContext(ImmutableRecord): new_callables = dict(program.callables_table) for c in callees_with_old_entrypoint_names: - unique_func_id = c - - while unique_func_id in self.callables: - unique_func_id = next_indexed_function_id(unique_func_id) - - todo_renames[c] = unique_func_id + todo_renames[c] = self.clbl_name_gen(c) for e in self.new_entrypoints: # note renames to "rollback" the renaming of entrypoints @@ -769,6 +743,12 @@ def for_each_kernel(transform): def update_table(callables_table, clbl_id, clbl): + """ + Returns a tuple ``new_clbl_id, new_callables_table`` where + *new_callables_table* is a copy of *callables_table* with *clbl* in its + namespace. *clbl* is referred in *new_callables_table*'s namespace by + *new_clbl_id*. 
+ """ from loopy.kernel.function_interface import InKernelCallable assert isinstance(clbl, InKernelCallable) @@ -776,12 +756,19 @@ def update_table(callables_table, clbl_id, clbl): if c == clbl: return i, callables_table - while clbl_id in callables_table: - clbl_id = next_indexed_function_id(clbl_id) + if isinstance(clbl_id, ReductionOpFunction): + new_clbl_id = clbl_id.copy() + else: + assert isinstance(clbl_id, str) + from pytools import UniqueNameGenerator + all_substs = get_all_subst_names(callables_table) + ung = UniqueNameGenerator(set(callables_table.keys()) | all_substs) + new_clbl_id = ung(clbl_id) - callables_table[clbl_id] = clbl + new_callables_table = callables_table.copy() + new_callables_table[new_clbl_id] = clbl - return clbl_id, callables_table + return new_clbl_id, new_callables_table # }}} -- GitLab From cdcfd5dc142886049cd60a56cc3225294ba124f2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Apr 2021 16:41:35 -0500 Subject: [PATCH 889/916] cleanup: removes unnecessary conditional --- loopy/translation_unit.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 2c663aa35..f690d9956 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -59,7 +59,6 @@ def _is_a_reduction_op(expr): if isinstance(expr, ResolvedFunction): return _is_a_reduction_op(expr.function) - from loopy.library.reduction import ReductionOpFunction return isinstance(expr, ReductionOpFunction) @@ -458,11 +457,16 @@ class CallablesIDCollector(CombineMapper): def _get_reachable_callable_ids_for_knl(knl, callables): clbl_id_collector = CallablesIDCollector() - return frozenset().union(*( - _get_reachable_callable_ids_for_knl(callables[clbl].subkernel, callables) | - frozenset([clbl]) if isinstance(callables[clbl], CallableKernel) else - frozenset([clbl]) - for clbl in clbl_id_collector.map_kernel(knl))) + def rec(clbl_id): + clbl = callables[clbl_id] + if isinstance(clbl, CallableKernel): + return (_get_reachable_callable_ids_for_knl(clbl.subkernel, callables) + | frozenset([clbl_id])) + else: + return frozenset([clbl_id]) + + return frozenset().union(*(rec(clbl_id) + for clbl_id in clbl_id_collector.map_kernel(knl))) def _get_reachable_callable_ids(callables, entrypoints): @@ -475,6 +479,7 @@ def _get_reachable_callable_ids(callables, entrypoints): # {{{ CallablesInferenceContext + def get_all_subst_names(callables): """ Returns a :class:`set` of all substitution rule names in the callable -- GitLab From ae122294425f969c9355e04089886a60b11c7433 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 24 Apr 2021 17:38:46 -0500 Subject: [PATCH 890/916] corrects map_type_cast --- loopy/translation_unit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index f690d9956..40b6b4765 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -447,7 +447,7 @@ class CallablesIDCollector(CombineMapper): return callables_in_insn def map_type_cast(self, expr): - return self.rec(expr) + return self.rec(expr.child) map_variable = map_constant map_function_symbol = map_constant -- GitLab From 429360a45f9674c392542db423eaa7d2669001e1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 25 Apr 2021 16:29:34 -0500 Subject: [PATCH 891/916] cleanup: captures repeated code into helper interfaces Co-authored-by: Andreas Kloeckner --- loopy/translation_unit.py | 20 ++++++++++++-------- 1 file changed, 12 
insertions(+), 8 deletions(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 40b6b4765..250bf3294 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -479,7 +479,6 @@ def _get_reachable_callable_ids(callables, entrypoints): # {{{ CallablesInferenceContext - def get_all_subst_names(callables): """ Returns a :class:`set` of all substitution rule names in the callable @@ -493,12 +492,15 @@ def get_all_subst_names(callables): if isinstance(clbl, CallableKernel))) -def make_clbl_inf_ctx(callables, entrypoints): +def make_callable_name_generator(callables): from pytools import UniqueNameGenerator all_substs = get_all_subst_names(callables) - ung = UniqueNameGenerator(set(callables.keys()) | all_substs) + return UniqueNameGenerator(set(callables.keys()) | all_substs) - return CallablesInferenceContext(callables, ung) + +def make_clbl_inf_ctx(callables, entrypoints): + name_gen = make_callable_name_generator(callables) + return CallablesInferenceContext(callables, name_gen) class CallablesInferenceContext(ImmutableRecord): @@ -751,8 +753,12 @@ def update_table(callables_table, clbl_id, clbl): """ Returns a tuple ``new_clbl_id, new_callables_table`` where *new_callables_table* is a copy of *callables_table* with *clbl* in its - namespace. *clbl* is referred in *new_callables_table*'s namespace by + namespace. *clbl* is referred to in *new_callables_table*'s namespace by *new_clbl_id*. + + :arg clbl_id: An instance of :class:`str` or + :class:`~loopy.library.reduction.ReductionOpFunction` based on which + the unique identifier, *new_clbl_id* , is to be chosen. """ from loopy.kernel.function_interface import InKernelCallable assert isinstance(clbl, InKernelCallable) @@ -765,9 +771,7 @@ def update_table(callables_table, clbl_id, clbl): new_clbl_id = clbl_id.copy() else: assert isinstance(clbl_id, str) - from pytools import UniqueNameGenerator - all_substs = get_all_subst_names(callables_table) - ung = UniqueNameGenerator(set(callables_table.keys()) | all_substs) + ung = make_callable_name_generator(callables_table) new_clbl_id = ung(clbl_id) new_callables_table = callables_table.copy() -- GitLab From cd13b0531a2f4a05c40ece5be271d6b07eb12502 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 25 Apr 2021 19:45:15 -0500 Subject: [PATCH 892/916] s/update_table/add_callable_to_table/g --- loopy/library/reduction.py | 27 ++++++++++++++------------- loopy/translation_unit.py | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 9f23bcb37..13dfadedd 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -203,7 +203,7 @@ class MaxReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, operand2, callables_table, target): dtype, = dtype - from loopy.translation_unit import update_table + from loopy.translation_unit import add_callable_to_table # getting the callable 'max' from target max_scalar_callable = target.get_device_ast_builder().known_callables["max"] @@ -213,7 +213,7 @@ class MaxReductionOperation(ScalarReductionOperation): {0: dtype, 1: dtype}, callables_table) # populate callables_table - func_id, callables_table = update_table(callables_table, "max", + func_id, callables_table = add_callable_to_table(callables_table, "max", max_scalar_callable) return ResolvedFunction(func_id)(operand1, operand2), callables_table @@ -225,7 +225,7 @@ class MinReductionOperation(ScalarReductionOperation): def __call__(self, dtype, operand1, 
operand2, callables_table, target): dtype, = dtype - from loopy.translation_unit import update_table + from loopy.translation_unit import add_callable_to_table # getting the callable 'min' from target min_scalar_callable = target.get_device_ast_builder().known_callables["min"] @@ -235,7 +235,7 @@ class MinReductionOperation(ScalarReductionOperation): {0: dtype, 1: dtype}, callables_table) # populate callables_table - func_id, callables_table = update_table(callables_table, "min", + func_id, callables_table = add_callable_to_table(callables_table, "min", min_scalar_callable) return ResolvedFunction(func_id)(operand1, operand2), callables_table @@ -300,7 +300,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): def neutral_element(self, scalar_dtype, segment_flag_dtype, callables_table, target): from loopy.library.function import MakeTupleCallable - from loopy.translation_unit import update_table + from loopy.translation_unit import add_callable_to_table scalar_neutral_element, calables_table = ( self.inner_reduction.neutral_element( @@ -313,7 +313,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): dict(enumerate([scalar_dtype, segment_flag_dtype])), callables_table) - func_id, callables_table = update_table( + func_id, callables_table = add_callable_to_table( callables_table, "make_tuple", make_tuple_callable) return ResolvedFunction(func_id)(scalar_neutral_element, @@ -344,8 +344,8 @@ class _SegmentedScalarReductionOperation(ReductionOperation): callables_table)) # populate callables_table - from loopy.translation_unit import update_table - func_id, callables_table = update_table( + from loopy.translation_unit import add_callable_to_table + func_id, callables_table = add_callable_to_table( callables_table, SegmentedOp(self), segmented_scalar_callable) return (ResolvedFunction(func_id)(*(operand1 + operand2)), @@ -410,7 +410,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_neutral_element = scalar_neutral_func(scalar_dtype) from loopy.library.function import MakeTupleCallable - from loopy.translation_unit import update_table + from loopy.translation_unit import add_callable_to_table make_tuple_callable = MakeTupleCallable( name="make_tuple") @@ -419,8 +419,9 @@ class _ArgExtremumReductionOperation(ReductionOperation): callables_table) # populate callables_table - func_id, callables_table = update_table(callables_table, "make_tuple", - make_tuple_callable) + func_id, callables_table = add_callable_to_table(callables_table, + "make_tuple", + make_tuple_callable) return ResolvedFunction(func_id)(scalar_neutral_element, index_dtype.numpy_dtype.type(-1)), callables_table @@ -448,8 +449,8 @@ class _ArgExtremumReductionOperation(ReductionOperation): callables_table)) # populate callables_table - from loopy.translation_unit import update_table - func_id, callables_table = update_table( + from loopy.translation_unit import add_callable_to_table + func_id, callables_table = add_callable_to_table( callables_table, ArgExtOp(self), arg_ext_scalar_callable) return (ResolvedFunction(func_id)(*(operand1 + operand2)), diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 250bf3294..127e6341a 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -749,7 +749,7 @@ def for_each_kernel(transform): return wraps(transform)(_collective_transform) -def update_table(callables_table, clbl_id, clbl): +def add_callable_to_table(callables_table, clbl_id, clbl): """ Returns a tuple ``new_clbl_id, new_callables_table`` where 
*new_callables_table* is a copy of *callables_table* with *clbl* in its -- GitLab From e7c0908811ebe708d1eabf826b6d6845d23b1736 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 29 Apr 2021 09:24:14 -0500 Subject: [PATCH 893/916] define :meth:`InKernelCallable.get_called_callables` --- loopy/codegen/__init__.py | 6 +-- loopy/kernel/function_interface.py | 22 +++++++++ loopy/kernel/tools.py | 58 +++++++++++++++++++++++ loopy/preprocess.py | 7 +-- loopy/translation_unit.py | 76 ++++-------------------------- 5 files changed, 95 insertions(+), 74 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 14f0a75eb..86e18de34 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -605,11 +605,11 @@ def diverge_callee_entrypoints(program): If a :class:`loopy.kernel.function_interface.CallableKernel` is both an entrypoint and a callee, then rename the callee. """ - from loopy.translation_unit import (_get_reachable_callable_ids, + from loopy.translation_unit import (get_reachable_resolved_callable_ids, rename_resolved_functions_in_a_single_kernel) from pytools import UniqueNameGenerator - callable_ids = _get_reachable_callable_ids(program.callables_table, - program.entrypoints) + callable_ids = get_reachable_resolved_callable_ids(program.callables_table, + program.entrypoints) new_callables = {} todo_renames = {} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 8c9a0f2ac..e4a91f1e7 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -313,6 +313,7 @@ class InKernelCallable(ImmutableRecord): .. automethod:: is_ready_for_codegen .. automethod:: get_hw_axes_sizes .. automethod:: get_used_hw_axes + .. automethod:: get_called_callables .. note:: @@ -481,6 +482,16 @@ class InKernelCallable(ImmutableRecord): """ raise NotImplementedError() + def get_called_callables(self, callables_table): + """ + Returns a :class:`frozenset` of callable ids called by *self* that are + resolved via *callables_table*. + + :arg callables_table: Similar to + :attr:`loopy.TranslationUnit.callables_table`. + """ + raise NotImplementedError + # }}} @@ -638,6 +649,12 @@ class ScalarCallable(InKernelCallable): def with_added_arg(self, arg_dtype, arg_descr): raise LoopyError("Cannot add args to scalar callables.") + def get_called_callables(self, callables_table): + """ + Returns a :class:`frozenset` of callable ids called by *self*. 
+ """ + return frozenset() + # }}} @@ -927,6 +944,11 @@ class CallableKernel(InKernelCallable): return var(self.subkernel.name)(*tgt_parameters), False + def get_called_callables(self, callables_table): + from loopy.kernel.tools import get_resolved_callable_ids_called_by_knl + return get_resolved_callable_ids_called_by_knl(self.subkernel, + callables_table) + # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 19cb8acbd..8c12f1e35 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -36,6 +36,10 @@ from loopy.kernel import LoopKernel from loopy.translation_unit import (TranslationUnit, for_each_kernel) from loopy.kernel.function_interface import CallableKernel +from loopy.kernel.instruction import ( + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) +from loopy.symbolic import CombineMapper +from functools import reduce import logging logger = logging.getLogger(__name__) @@ -1995,4 +1999,58 @@ def infer_args_are_input_output(kernel): # }}} + +# {{{ CallablesIDCollector + +class CallablesIDCollector(CombineMapper): + """ + Mapper to collect function identifiers of all resolved callables in an + expression. + """ + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_resolved_function(self, expr): + return frozenset([expr.name]) + + def map_constant(self, expr): + return frozenset() + + def map_kernel(self, kernel): + callables_in_insn = frozenset() + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_in_insn = callables_in_insn | ( + self(insn.expression)) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError(type(insn).__name__) + + for rule in kernel.substitutions.values(): + callables_in_insn = callables_in_insn | ( + self(rule.expression)) + + return callables_in_insn + + def map_type_cast(self, expr): + return self.rec(expr.child) + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + + +def get_resolved_callable_ids_called_by_knl(knl, callables): + clbl_id_collector = CallablesIDCollector() + callables_called_by_kernel = clbl_id_collector.map_kernel(knl) + callables_called_by_called_callables = frozenset().union(*( + callables[clbl_id].get_called_callables(callables) + for clbl_id in callables_called_by_kernel)) + return callables_called_by_kernel | callables_called_by_called_callables + +# }}} + # vim: foldmethod=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 59e70827e..c28f14e80 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2363,9 +2363,10 @@ def inline_kernels_with_gbarriers(program): def filter_reachable_callables(t_unit): - from loopy.translation_unit import _get_reachable_callable_ids - reachable_function_ids = _get_reachable_callable_ids(t_unit.callables_table, - t_unit.entrypoints) + from loopy.translation_unit import get_reachable_resolved_callable_ids + reachable_function_ids = get_reachable_resolved_callable_ids(t_unit + .callables_table, + t_unit.entrypoints) new_callables = {name: clbl for name, clbl in t_unit.callables_table.items() if name in (reachable_function_ids | t_unit.entrypoints)} return t_unit.copy(callables_table=new_callables) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 127e6341a..27c6392a5 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -27,18 +27,15 @@ from pymbolic.primitives import Variable from functools import wraps 
from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, - CombineMapper, SubstitutionRuleMappingContext) + SubstitutionRuleMappingContext) from loopy.kernel.function_interface import ( CallableKernel, ScalarCallable) -from loopy.kernel.instruction import ( - MultiAssignmentBase, CInstruction, _DataObliviousInstruction) from loopy.diagnostic import LoopyError from loopy.library.reduction import ReductionOpFunction from loopy.kernel import LoopKernel from loopy.tools import update_persistent_hash from pymbolic.primitives import Call -from functools import reduce from pyrsistent import pmap, PMap __doc__ = """ @@ -411,70 +408,13 @@ def rename_resolved_functions_in_a_single_kernel(kernel, # }}} -# {{{ CallablesIDCollector - -class CallablesIDCollector(CombineMapper): +def get_reachable_resolved_callable_ids(callables, entrypoints): """ - Mapper to collect function identifiers of all resolved callables in an - expression. + Returns a :class:`frozenset` of callables ids that are resolved and + reachable from *entrypoints*. """ - def combine(self, values): - import operator - return reduce(operator.or_, values, frozenset()) - - def map_resolved_function(self, expr): - return frozenset([expr.name]) - - def map_constant(self, expr): - return frozenset() - - def map_kernel(self, kernel): - callables_in_insn = frozenset() - - for insn in kernel.instructions: - if isinstance(insn, MultiAssignmentBase): - callables_in_insn = callables_in_insn | ( - self(insn.expression)) - elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): - pass - else: - raise NotImplementedError(type(insn).__name__) - - for rule in kernel.substitutions.values(): - callables_in_insn = callables_in_insn | ( - self(rule.expression)) - - return callables_in_insn - - def map_type_cast(self, expr): - return self.rec(expr.child) - - map_variable = map_constant - map_function_symbol = map_constant - map_tagged_variable = map_constant - - -def _get_reachable_callable_ids_for_knl(knl, callables): - clbl_id_collector = CallablesIDCollector() - - def rec(clbl_id): - clbl = callables[clbl_id] - if isinstance(clbl, CallableKernel): - return (_get_reachable_callable_ids_for_knl(clbl.subkernel, callables) - | frozenset([clbl_id])) - else: - return frozenset([clbl_id]) - - return frozenset().union(*(rec(clbl_id) - for clbl_id in clbl_id_collector.map_kernel(knl))) - - -def _get_reachable_callable_ids(callables, entrypoints): - return frozenset().union(*( - _get_reachable_callable_ids_for_knl(callables[e].subkernel, callables) - for e in entrypoints)) - -# }}} + return frozenset().union(*(callables[e].get_called_callables(callables) + for e in entrypoints)) # {{{ CallablesInferenceContext @@ -631,8 +571,8 @@ class CallablesInferenceContext(ImmutableRecord): # {{{ get all the callables reachable from the new entrypoints. 
# get the names of all callables reachable from the new entrypoints - new_callable_ids = _get_reachable_callable_ids( - self.callables, self.new_entrypoints) + new_callable_ids = get_reachable_resolved_callable_ids(self.callables, + self.new_entrypoints) # get the history of function ids from the performed renames: history = {} -- GitLab From 6ca0d8aad1582cb997f7b33d1fedcd736e362c00 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 09:32:39 -0500 Subject: [PATCH 894/916] define :meth:`InKernelCallable.with_name` --- loopy/kernel/function_interface.py | 15 +++++++++++++++ loopy/translation_unit.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index e4a91f1e7..b99c8b2f8 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -314,6 +314,7 @@ class InKernelCallable(ImmutableRecord): .. automethod:: get_hw_axes_sizes .. automethod:: get_used_hw_axes .. automethod:: get_called_callables + .. automethod:: with_name .. note:: @@ -492,6 +493,13 @@ class InKernelCallable(ImmutableRecord): """ raise NotImplementedError + def with_name(self, name): + """ + Returns a copy of *self* so that it could be referred by *name* + in a :attr:`loopy.TranslationUnit.callables_table`'s namespace. + """ + raise NotImplementedError + # }}} @@ -655,6 +663,9 @@ class ScalarCallable(InKernelCallable): """ return frozenset() + def with_name(self, name): + return self + # }}} @@ -949,6 +960,10 @@ class CallableKernel(InKernelCallable): return get_resolved_callable_ids_called_by_knl(self.subkernel, callables_table) + def with_name(self, name): + new_knl = self.subkernel.copy(name=name) + return self.copy(subkernel=new_knl) + # }}} diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 27c6392a5..2236b8134 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -715,7 +715,7 @@ def add_callable_to_table(callables_table, clbl_id, clbl): new_clbl_id = ung(clbl_id) new_callables_table = callables_table.copy() - new_callables_table[new_clbl_id] = clbl + new_callables_table[new_clbl_id] = clbl.with_name(new_clbl_id) return new_clbl_id, new_callables_table -- GitLab From 066488bbe19f6dda02ae29e7212db062dcef992f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 09:40:31 -0500 Subject: [PATCH 895/916] removes unnecessary copy --- loopy/translation_unit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 2236b8134..5804707cc 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -708,7 +708,7 @@ def add_callable_to_table(callables_table, clbl_id, clbl): return i, callables_table if isinstance(clbl_id, ReductionOpFunction): - new_clbl_id = clbl_id.copy() + new_clbl_id = clbl_id else: assert isinstance(clbl_id, str) ung = make_callable_name_generator(callables_table) -- GitLab From df700d134f44de63a61347a08438d9c58e3df3e8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 09:43:24 -0500 Subject: [PATCH 896/916] use make_callable_name_generator instead of construction UniqueNameGenerator by hand make_callable_name_generator is better as it also accounts for possible collissions with substitution rule names --- loopy/codegen/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 86e18de34..0a849b44e 100644 --- 
a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -606,15 +606,15 @@ def diverge_callee_entrypoints(program): entrypoint and a callee, then rename the callee. """ from loopy.translation_unit import (get_reachable_resolved_callable_ids, - rename_resolved_functions_in_a_single_kernel) - from pytools import UniqueNameGenerator + rename_resolved_functions_in_a_single_kernel, + make_callable_name_generator) callable_ids = get_reachable_resolved_callable_ids(program.callables_table, program.entrypoints) new_callables = {} todo_renames = {} - vng = UniqueNameGenerator(set(program.callables_table.keys())) + vng = make_callable_name_generator(program.callables_table) for clbl_id in callable_ids & program.entrypoints: todo_renames[clbl_id] = vng(based_on=clbl_id) -- GitLab From 95489b1d1287bd39aa30ba23738d54ede2487c0e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 2 May 2021 10:04:44 -0500 Subject: [PATCH 897/916] cleanup: gets rid of dead code/comments --- loopy/kernel/tools.py | 48 -------------------------------- loopy/preprocess.py | 11 ++++---- loopy/target/execution.py | 2 -- loopy/transform/make_scalar.py | 51 ---------------------------------- 4 files changed, 5 insertions(+), 107 deletions(-) delete mode 100644 loopy/transform/make_scalar.py diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 8c12f1e35..e2e5747ef 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1896,54 +1896,6 @@ def find_aliasing_equivalence_classes(kernel): # }}} -# {{{ callee kernel tools - -def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,): - """ - Returns an instance of :class:`frozenset` of all the callee kernels - called in instructions in the *kernel* whose IDs are given in *insn_ids*. - - :arg kernel: An instance of :class:`LoopKernel`. - :arg insn_ids: An instance of :class:`frozenset`. - - If *insn_ids* is *None* returns all the callee kernels called by *kernel*. - """ - #FIXME: explain what "direct" means - - if insn_ids is None: - insn_ids = frozenset(insn.id for insn in kernel.instructions) - - def _get_callee_kernel_if_insn_has_callable_kernel(insn_id): - """Returns callee kernel if the instruction has a call to a - :class:`loopy.kernel.function_interface.CallableKernel`. Otherwise - returns *None*. - """ - insn = kernel.id_to_insn[insn_id] - from loopy.kernel.instruction import (CallInstruction, - MultiAssignmentBase, CInstruction, _DataObliviousInstruction) - from pymbolic.primitives import Call - if isinstance(insn, CallInstruction): - if isinstance(insn.expression, Call) and ( - insn.expression.function.name in callables_table): - in_knl_callable = callables_table[ - insn.expression.function.name] - if isinstance(in_knl_callable, CallableKernel): - return in_knl_callable.subkernel - elif isinstance(insn, (MultiAssignmentBase, - CInstruction, _DataObliviousInstruction)): - pass - else: - raise NotImplementedError("Unknown type of instruction %s." 
% - type(insn)) - - return None - - return frozenset([_get_callee_kernel_if_insn_has_callable_kernel(insn_id) - for insn_id in insn_ids]) - frozenset([None]) - -# }}} - - # {{{ direction helper tools def infer_args_are_input_output(kernel): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c28f14e80..4f3dc0773 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2148,7 +2148,8 @@ def check_atomic_loads(kernel): class ArgDescrInferenceMapper(RuleAwareIdentityMapper): """ - Infers the :attr:`loopy` + Infers :attr:`~loopy.kernel.function_interface.arg_id_to_descr` of + callables visited in an expression. """ def __init__(self, rule_mapping_context, caller_kernel, clbl_inf_ctx): @@ -2339,7 +2340,6 @@ def infer_arg_descr(program): # {{{ inline_kernels_with_gbarriers - def inline_kernels_with_gbarriers(program): from loopy.kernel.instruction import BarrierInstruction from loopy.transform.callable import inline_callable_kernel @@ -2349,6 +2349,7 @@ def inline_kernels_with_gbarriers(program): and insn.synchronization_kind == "global") for insn in knl.instructions) + # FIXME: should traverse in call-graph's topological sort order callees_to_inline = [name for name, knl_clbl in program.callables_table.items() if (isinstance(knl_clbl, CallableKernel) and has_gbarrier(knl_clbl.subkernel))] @@ -2358,7 +2359,6 @@ def inline_kernels_with_gbarriers(program): return program - # }}} @@ -2377,7 +2377,7 @@ preprocess_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def preprocess_single_kernel(kernel, callables_table, device=None): +def _preprocess_single_kernel(kernel, callables_table, device=None): from loopy.kernel import KernelState prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name) @@ -2498,7 +2498,7 @@ def preprocess_program(program, device=None): new_callables = {} for func_id, in_knl_callable in program.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): - new_subkernel = preprocess_single_kernel( + new_subkernel = _preprocess_single_kernel( in_knl_callable.subkernel, program.callables_table, device) in_knl_callable = in_knl_callable.copy( @@ -2520,7 +2520,6 @@ def preprocess_program(program, device=None): # Ordering restriction: # callees with gbarrier in them must be inlined after inferrring arg_descr. - # inline_kernels_with_gbarriers does not recursively inline the callees. program = inline_kernels_with_gbarriers(program) # {{{ prepare for caching diff --git a/loopy/target/execution.py b/loopy/target/execution.py index fdd38278e..09ab1178a 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -632,8 +632,6 @@ class ExecutionWrapperGeneratorBase: """ options = program[entrypoint].options - #FIXME: endswith is ugly maybe make - # codegen_result.implemented_data_infos a dict? 
implemented_data_info = codegen_result.implemented_data_infos[entrypoint] from loopy.kernel.data import KernelArgument diff --git a/loopy/transform/make_scalar.py b/loopy/transform/make_scalar.py deleted file mode 100644 index b8db7f43f..000000000 --- a/loopy/transform/make_scalar.py +++ /dev/null @@ -1,51 +0,0 @@ -from pymbolic.primitives import Variable -from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) -from loopy.kernel.data import ValueArg -from loopy.transform.iname import remove_unused_inames - - -class ScalarChanger(RuleAwareIdentityMapper): - def __init__(self, rule_mapping_context, var_name): - self.var_name = var_name - super().__init__(rule_mapping_context) - - def map_subscript(self, expr, expn_state): - if expr.aggregate.name == self.var_name: - return Variable(self.var_name) - - return super().map_subscript(expr, expn_state) - - -def make_scalar(kernel, var_name): - rule_mapping_context = SubstitutionRuleMappingContext(kernel.substitutions, - kernel.get_var_name_generator()) - - kernel = ScalarChanger(rule_mapping_context, var_name).map_kernel(kernel) - - new_args = [ValueArg(arg.name, arg.dtype, target=arg.target, - is_output=arg.is_output) if arg.name == var_name else arg for - arg in kernel.args] - new_temps = dict((tv.name, tv.copy(shape=(), dim_tags=None)) - if tv.name == var_name else (tv.name, tv) for tv in - kernel.temporary_variables.values()) - - return kernel.copy(args=new_args, temporary_variables=new_temps) - - -def remove_invariant_inames(kernel): - inames_used = set() - untagged_inames = ( - kernel.all_inames() - frozenset(kernel.iname_to_tags.keys())) - for insn in kernel.instructions: - for iname in ((insn.read_dependency_names() - | insn.write_dependency_names()) - & untagged_inames): - inames_used.add(iname) - - removable_inames = untagged_inames - inames_used - - new_insns = [insn.copy(within_inames=insn.within_inames-removable_inames) - for insn in kernel.instructions] - - return remove_unused_inames(kernel.copy(instructions=new_insns), - removable_inames) -- GitLab From f02c24149f418113846c493c9711e1d9c251d862 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 11:11:55 -0500 Subject: [PATCH 898/916] InKernelCallable.get_called_callables now takes in recursive: bool --- loopy/kernel/function_interface.py | 12 ++++++++---- loopy/kernel/tools.py | 7 ++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index b99c8b2f8..39fde2ac6 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -483,13 +483,16 @@ class InKernelCallable(ImmutableRecord): """ raise NotImplementedError() - def get_called_callables(self, callables_table): + def get_called_callables(self, callables_table, recursive=True): """ Returns a :class:`frozenset` of callable ids called by *self* that are resolved via *callables_table*. :arg callables_table: Similar to :attr:`loopy.TranslationUnit.callables_table`. + :arg recursive: If *True* recursively searches for all the called + callables, else only returns the callables directly called by + *self*. 
""" raise NotImplementedError @@ -657,7 +660,7 @@ class ScalarCallable(InKernelCallable): def with_added_arg(self, arg_dtype, arg_descr): raise LoopyError("Cannot add args to scalar callables.") - def get_called_callables(self, callables_table): + def get_called_callables(self, callables_table, recursive=True): """ Returns a :class:`frozenset` of callable ids called by *self*. """ @@ -955,10 +958,11 @@ class CallableKernel(InKernelCallable): return var(self.subkernel.name)(*tgt_parameters), False - def get_called_callables(self, callables_table): + def get_called_callables(self, callables_table, recursive=True): from loopy.kernel.tools import get_resolved_callable_ids_called_by_knl return get_resolved_callable_ids_called_by_knl(self.subkernel, - callables_table) + callables_table, + recursive=recursive) def with_name(self, name): new_knl = self.subkernel.copy(name=name) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index e2e5747ef..6fba7c560 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1995,9 +1995,14 @@ class CallablesIDCollector(CombineMapper): map_tagged_variable = map_constant -def get_resolved_callable_ids_called_by_knl(knl, callables): +def get_resolved_callable_ids_called_by_knl(knl, callables, recursive=True): clbl_id_collector = CallablesIDCollector() callables_called_by_kernel = clbl_id_collector.map_kernel(knl) + + if not recursive: + # => do not recurse into the callees + return callables_called_by_kernel + callables_called_by_called_callables = frozenset().union(*( callables[clbl_id].get_called_callables(callables) for clbl_id in callables_called_by_kernel)) -- GitLab From 808028f57bf53dcce94baf22cd1fe245e80cbac6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 11:13:00 -0500 Subject: [PATCH 899/916] introduce a helper get_call_graph --- loopy/kernel/tools.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 6fba7c560..a0122576a 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -2010,4 +2010,43 @@ def get_resolved_callable_ids_called_by_knl(knl, callables, recursive=True): # }}} + +# {{{ get_call_graph + +def get_call_graph(t_unit, only_kernel_callables=False): + """ + Returns a mapping from a callable name to the calls seen in it. + + :arg t_unit: An instance of :class:`TranslationUnit`. 
+ """ + from pyrsistent import pmap + from loopy.kernel import KernelState + + if t_unit.state < KernelState.CALLS_RESOLVED: + raise LoopyError("TranslationUnit must have calls resolved in order to" + " compute its call graph.") + + knl_callables = frozenset(name for name, clbl in t_unit.callables_table.items() + if isinstance(clbl, CallableKernel)) + + # stores a mapping from caller -> "direct"" callees + call_graph = {} + + for name, clbl in t_unit.callables_table.items(): + if (not isinstance(clbl, CallableKernel) + and only_kernel_callables): + pass + else: + if only_kernel_callables: + call_graph[name] = (clbl.get_called_callables(t_unit.callables_table, + recursive=False) + & knl_callables) + else: + call_graph[name] = clbl.get_called_callables(t_unit.callables_table, + recursive=False) + + return pmap(call_graph) + +# }}} + # vim: foldmethod=marker -- GitLab From be501886d6fca71e120a646507831ee14057aac0 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 11:13:15 -0500 Subject: [PATCH 900/916] perform bottom up traversal of the call graph while inlining callees with gbarriers in them --- loopy/preprocess.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 4f3dc0773..49e177f74 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2343,19 +2343,26 @@ def infer_arg_descr(program): def inline_kernels_with_gbarriers(program): from loopy.kernel.instruction import BarrierInstruction from loopy.transform.callable import inline_callable_kernel + from loopy.kernel.tools import get_call_graph + from pytools.graph import compute_topological_order def has_gbarrier(knl): return any((isinstance(insn, BarrierInstruction) and insn.synchronization_kind == "global") for insn in knl.instructions) - # FIXME: should traverse in call-graph's topological sort order - callees_to_inline = [name for name, knl_clbl in program.callables_table.items() - if (isinstance(knl_clbl, CallableKernel) - and has_gbarrier(knl_clbl.subkernel))] + call_graph = get_call_graph(program, only_kernel_callables=True) - for callee_to_inline in callees_to_inline: - program = inline_callable_kernel(program, callee_to_inline) + # traverse the kernel calls in a reverse topological sort so that barriers + # are rightly passed to the entrypoints. 
+ toposort = compute_topological_order(call_graph, + # pass key to have deterministic codegen + key=lambda x: x + ) + + for name in toposort[::-1]: + if has_gbarrier(program[name]): + program = inline_callable_kernel(program, name) return program -- GitLab From 06d9db43a22711751c19f9cdb8ec30b2ea9ca52c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 11:14:26 -0500 Subject: [PATCH 901/916] test gbarrier is translated from leaf callees to entrypoints --- test/test_callables.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/test/test_callables.py b/test/test_callables.py index f79f6e8f1..2118a0fd2 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -562,6 +562,41 @@ def test_callees_with_gbarriers_are_inlined(ctx_factory): assert (expected_out == out.get()).all() +def test_callees_with_gbarriers_are_inlined_with_nested_calls(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + ones_and_zeros = lp.make_function( + "{[i, j]: 0<=i<6 and 0<=j<3}", + """ + x[i] = 0.0f + ...gbarrier + x[j] = 1.0f + """, + seq_dependencies=True, + name="ones_and_zeros") + + dummy_ones_and_zeros = lp.make_function( + "{[i]: 0<=i<6}", + """ + [i]: y[i] = ones_and_zeros() + """, + name="dummy_ones_and_zeros") + + t_unit = lp.make_kernel( + "{ : }", + """ + y[:] = dummy_ones_and_zeros() + """, [lp.GlobalArg("y", shape=6, dtype=lp.auto)]) + + t_unit = lp.merge([t_unit, dummy_ones_and_zeros, ones_and_zeros]) + evt, (out,) = t_unit(queue) + + expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32) + + assert (expected_out == out.get()).all() + + def test_inlining_with_indirections(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) -- GitLab From d1617b1106c02adf0d45c35bfa5db5e5e495b740 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 13:14:43 -0500 Subject: [PATCH 902/916] avoid 'if' by appropriately using sub-classes --- loopy/library/reduction.py | 107 ++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 50 deletions(-) diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 13dfadedd..67043e1af 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -334,8 +334,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation): other.inner_reduction) def __call__(self, dtypes, operand1, operand2, callables_table, target): - segmented_scalar_callable = ReductionCallable( - SegmentedOp(self)) + segmented_scalar_callable = SegmentOpCallable(SegmentedOp(self)) # type specialize the callable segmented_scalar_callable, callables_table = ( @@ -440,7 +439,7 @@ class _ArgExtremumReductionOperation(ReductionOperation): return 2 def __call__(self, dtypes, operand1, operand2, callables_table, target): - arg_ext_scalar_callable = ReductionCallable(ArgExtOp(self)) + arg_ext_scalar_callable = ArgExtOpCallable(ArgExtOp(self)) # type specialize the callable arg_ext_scalar_callable, callables_table = ( @@ -561,58 +560,66 @@ class ReductionCallable(ScalarCallable): self.copy(arg_id_to_descr=arg_id_to_descr), callables_table) + +class ArgExtOpCallable(ReductionCallable): + def generate_preambles(self, target): - if isinstance(self.name, ArgExtOp): - op = self.name.reduction_op - scalar_dtype = self.arg_id_to_dtype[-1] - index_dtype = self.arg_id_to_dtype[-2] - - prefix = op.prefix(scalar_dtype, index_dtype) - - yield (prefix, """ - inline {scalar_t} {prefix}_op( - {scalar_t} op1, {index_t} index1, - {scalar_t} op2, {index_t} index2, - {index_t} 
*index_out) + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + index_dtype = self.arg_id_to_dtype[-2] + + prefix = op.prefix(scalar_dtype, index_dtype) + + yield (prefix, """ + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {index_t} index1, + {scalar_t} op2, {index_t} index2, + {index_t} *index_out) + {{ + if (op2 {comp} op1) {{ - if (op2 {comp} op1) - {{ - *index_out = index2; - return op2; - }} - else - {{ - *index_out = index1; - return op1; - }} + *index_out = index2; + return op2; }} - """.format( - scalar_t=target.dtype_to_typename(scalar_dtype), - prefix=prefix, - index_t=target.dtype_to_typename(index_dtype), - comp=op.update_comparison, - )) - elif isinstance(self.name, SegmentedOp): - op = self.name.reduction_op - scalar_dtype = self.arg_id_to_dtype[-1] - segment_flag_dtype = self.arg_id_to_dtype[-2] - prefix = op.prefix(scalar_dtype, segment_flag_dtype) - - yield (prefix, """ - inline {scalar_t} {prefix}_op( - {scalar_t} op1, {segment_flag_t} segment_flag1, - {scalar_t} op2, {segment_flag_t} segment_flag2, - {segment_flag_t} *segment_flag_out) + else {{ - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? op2 : {combined}; + *index_out = index1; + return op1; }} - """.format( - scalar_t=target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + index_t=target.dtype_to_typename(index_dtype), + comp=op.update_comparison, + )) + + return + + +class SegmentOpCallable(ReductionCallable): + + def generate_preambles(self, target): + op = self.name.reduction_op + scalar_dtype = self.arg_id_to_dtype[-1] + segment_flag_dtype = self.arg_id_to_dtype[-2] + prefix = op.prefix(scalar_dtype, segment_flag_dtype) + + yield (prefix, """ + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {segment_flag_t} segment_flag1, + {scalar_t} op2, {segment_flag_t} segment_flag2, + {segment_flag_t} *segment_flag_out) + {{ + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? 
op2 : {combined}; + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) return -- GitLab From 7eb3a3a226af70dbdea13c01e356e51b6ebc5910 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 13:19:28 -0500 Subject: [PATCH 903/916] run: `git grep -l arg_id_to_val | xargs sed -i 's/arg_id_to_val/arg_id_to_arg/g'` --- loopy/check.py | 10 +++++----- loopy/kernel/__init__.py | 2 +- loopy/kernel/function_interface.py | 10 +++++----- loopy/kernel/instruction.py | 8 ++++---- loopy/preprocess.py | 6 +++--- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index c176b89b5..71d51f333 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1310,19 +1310,19 @@ def _validate_kernel_call_insn(caller, call_insn, callee): from loopy.symbolic import SubArrayRef from loopy.kernel.array import ArrayBase - arg_id_to_val = call_insn.arg_id_to_val() + arg_id_to_arg = call_insn.arg_id_to_arg() next_iarg_input = 0 next_iarg_output = -1 for arg in callee.args: if arg.is_input: - if next_iarg_input not in arg_id_to_val: + if next_iarg_input not in arg_id_to_arg: raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" f" a {next_iarg_input+1}-th positional " "argument corresponding" f" to '{arg.name}'in the callee.") - in_val = arg_id_to_val[next_iarg_input] + in_val = arg_id_to_arg[next_iarg_input] next_iarg_input += 1 if isinstance(arg, ArrayBase): if not isinstance(in_val, SubArrayRef): @@ -1335,12 +1335,12 @@ def _validate_kernel_call_insn(caller, call_insn, callee): f" expects a value argument for '{arg.name}'" f" (got {in_val}).") if arg.is_output: - if next_iarg_output not in arg_id_to_val: + if next_iarg_output not in arg_id_to_arg: raise LoopyError(f"Call to '{callee.name}' in '{call_insn}' expects" f" a {-next_iarg_output}-th positional assignee" f" corresponding to '{arg.name}'in the callee.") - out_val = arg_id_to_val[next_iarg_output] + out_val = arg_id_to_arg[next_iarg_output] next_iarg_output -= 1 assert isinstance(arg, ArrayBase) if not isinstance(out_val, SubArrayRef): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index dbcaad5b5..362021d43 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -1090,7 +1090,7 @@ class LoopKernel(ImmutableRecordWithoutPickling, Taggable): and isinstance(insn.expression.function, ResolvedFunction)): clbl = callables_table[insn.expression.function.name] - gsize, lsize = clbl.get_hw_axes_sizes(insn.arg_id_to_val(), + gsize, lsize = clbl.get_hw_axes_sizes(insn.arg_id_to_arg(), self.assumptions.space, callables_table) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 39fde2ac6..7367f0306 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -426,14 +426,14 @@ class InKernelCallable(ImmutableRecord): return (self.arg_id_to_dtype is not None and self.arg_id_to_descr is not None) - def get_hw_axes_sizes(self, arg_id_to_val, space, callables_table): + def get_hw_axes_sizes(self, arg_id_to_arg, space, callables_table): """ Returns ``gsizes, lsizes``, where *gsizes* and *lsizes* are mappings from axis indices to corresponding group or local hw axis sizes. The hw axes sizes are represented as instances of :class:`islpy.PwAff` on the given *space*. 
- :arg arg_id_to_val: A mapping from the passed argument *id* to the + :arg arg_id_to_arg: A mapping from the passed argument *id* to the arguments at a call-site. :arg space: An instance of :class:`islpy.Space`. """ @@ -548,7 +548,7 @@ class ScalarCallable(InKernelCallable): return (self.copy(arg_id_to_descr=arg_id_to_descr), clbl_inf_ctx) - def get_hw_axes_sizes(self, arg_id_to_val, space, callables_table): + def get_hw_axes_sizes(self, arg_id_to_arg, space, callables_table): return {}, {} def get_used_hw_axes(self, callables_table): @@ -889,14 +889,14 @@ class CallableKernel(InKernelCallable): return frozenset(gsize.keys()), frozenset(lsize.keys()) - def get_hw_axes_sizes(self, arg_id_to_val, space, callables_table): + def get_hw_axes_sizes(self, arg_id_to_arg, space, callables_table): from loopy.isl_helpers import subst_into_pwaff _, pos_to_kw = get_kw_pos_association(self.subkernel) gsize, lsize = self.subkernel.get_grid_size_upper_bounds(callables_table, return_dict=True) subst_dict = {i: val - for i, val in arg_id_to_val.items() + for i, val in arg_id_to_arg.items() if isinstance(self.subkernel.arg_dict[pos_to_kw[i]], ValueArg)} diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index aed6ae168..e561dd030 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1144,16 +1144,16 @@ class CallInstruction(MultiAssignmentBase): result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) return result - def arg_id_to_val(self): + def arg_id_to_arg(self): """:returns: a :class:`dict` mapping argument identifiers (non-negative numbers for positional arguments and negative numbers for assignees) to their respective values """ - arg_id_to_val = dict(enumerate(self.expression.parameters)) + arg_id_to_arg = dict(enumerate(self.expression.parameters)) for i, arg in enumerate(self.assignees): - arg_id_to_val[-i-1] = arg + arg_id_to_arg[-i-1] = arg - return arg_id_to_val + return arg_id_to_arg @property def atomicity(self): diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 49e177f74..c01e7f27a 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2171,16 +2171,16 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper): # ignore if the call is not to a ResolvedFunction return super().map_call(expr, expn_state) - arg_id_to_val = dict(enumerate(expr.parameters)) + arg_id_to_arg = dict(enumerate(expr.parameters)) if assignees is not None: # If supplied with assignees then this is a CallInstruction for i, arg in enumerate(assignees): - arg_id_to_val[-i-1] = arg + arg_id_to_arg[-i-1] = arg arg_id_to_descr = { arg_id: get_arg_descriptor_for_expression(self.caller_kernel, arg) - for arg_id, arg in arg_id_to_val.items()} + for arg_id, arg in arg_id_to_arg.items()} clbl = self.clbl_inf_ctx[expr.function.name] # {{{ translating descriptor expressions to the callable's namespace -- GitLab From 5ec2a8d20c525066063e149614a37fc070320ea2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 26 May 2021 14:27:42 -0500 Subject: [PATCH 904/916] scheduling: account for callables_table within cache_key --- loopy/schedule/__init__.py | 4 +++- test/test_callables.py | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index f5b298d5e..ba69501ba 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -2148,7 +2148,9 @@ def get_one_scheduled_kernel(kernel, callables_table): def get_one_linearized_kernel(kernel, callables_table): from loopy 
import CACHING_ENABLED - sched_cache_key = kernel + # must include *callables_table* within the cache key as the preschedule + # checks depend on it. + sched_cache_key = (kernel, callables_table) from_cache = False if CACHING_ENABLED: diff --git a/test/test_callables.py b/test/test_callables.py index 2118a0fd2..7e00e545d 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -802,26 +802,26 @@ def test_unused_hw_axes_in_callee(ctx_factory, inline): def test_double_hw_axes_used_in_knl_call(inline): from loopy.diagnostic import LoopyError - thrice = lp.make_function( + twice = lp.make_function( "{[i]: 0<=i<10}", """ y[i] = 2*x[i] - """, name="thrice") + """, name="twice") knl = lp.make_kernel( "{[i]: 0<=i<10}", """ - y[:, i] = thrice(x[:, i]) + y[:, i] = twice(x[:, i]) """, [lp.GlobalArg("x", shape=(10, 10), dtype=float), lp.GlobalArg("y", shape=(10, 10))], name="outer") - thrice = lp.tag_inames(thrice, {"i": "l.0"}) + twice = lp.tag_inames(twice, {"i": "l.0"}) knl = lp.tag_inames(knl, {"i": "l.0"}) - knl = lp.merge([knl, thrice]) + knl = lp.merge([knl, twice]) if inline: - knl = lp.inline_callable_kernel(knl, "thrice") + knl = lp.inline_callable_kernel(knl, "twice") with pytest.raises(LoopyError): lp.generate_code_v2(knl) -- GitLab From c37cf4e6834fadd1f6ffd12cc055e1cdb615da30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 31 May 2021 19:23:29 -0500 Subject: [PATCH 905/916] Downstream pytential CI: Use appropriate sumpy branch --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 88372caee..deb090557 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -158,6 +158,9 @@ jobs: cd "$DOWNSTREAM_PROJECT" echo "*** $DOWNSTREAM_PROJECT version: $(git rev-parse --short HEAD)" + if [[ "$DOWNSTREAM_PROJECT" = "pytential" ]] && [[ "$GITHUB_HEAD_REF" = "kernel_callables_v3-edit2" || "$GITHUB_BASE_REF" = "kernel_callables_v3-edit2" ]]; then + sed -i "/egg=sumpy/ c git+https://github.com/inducer/sumpy.git@loopy-callables#egg=sumpy" requirements.txt + fi sed -i "/egg=loopy/ c git+file://$(readlink -f ..)#egg=loopy" requirements.txt export CONDA_ENVIRONMENT=.test-conda-env-py3.yml -- GitLab From c1c25acecd39044bf7272a98655616e29c70657e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Jun 2021 14:21:22 -0500 Subject: [PATCH 906/916] add an early exit criterion for a translation unit's type-inference --- loopy/kernel/function_interface.py | 21 +++++++++++++++++++++ loopy/type_inference.py | 10 ++++++++++ 2 files changed, 31 insertions(+) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 7367f0306..c762fc621 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -315,6 +315,7 @@ class InKernelCallable(ImmutableRecord): .. automethod:: get_used_hw_axes .. automethod:: get_called_callables .. automethod:: with_name + .. automethod:: is_type_specialized .. note:: @@ -503,6 +504,13 @@ class InKernelCallable(ImmutableRecord): """ raise NotImplementedError + def is_type_specialized(self): + """ + Returns *True* iff *self*'s type signature is known, else returns + *False*. 
+ """ + raise NotImplementedError + # }}} @@ -669,6 +677,11 @@ class ScalarCallable(InKernelCallable): def with_name(self, name): return self + def is_type_specialized(self): + return (self.arg_id_to_dtype is not None + and all(dtype is not None + for dtype in self.arg_id_to_dtype.values())) + # }}} @@ -968,6 +981,14 @@ class CallableKernel(InKernelCallable): new_knl = self.subkernel.copy(name=name) return self.copy(subkernel=new_knl) + def is_type_specialized(self): + from loopy.kernel.data import auto + return (self.arg_id_to_dtype is not None + and all(arg.dtype not in [None, auto] + for arg in self.subkernel.args) + and all(tv.dtype not in [None, auto] + for tv in self.subkernel.temporary_variables.values())) + # }}} diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 8b9b47f00..ead1448da 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -1000,6 +1000,16 @@ def infer_unknown_types(program, expect_completion=False): program = resolve_callables(program) + # {{{ early-exit criterion + + if all(clbl.is_type_specialized() + for clbl in program.callables_table.values()): + # all the callables including the kernels have inferred their types + # => no need for type inference + return program + + # }}} + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, program.entrypoints) -- GitLab From 38e53496e9e3bec39e7258d8ee728c94b841622e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Jun 2021 16:07:05 -0500 Subject: [PATCH 907/916] removes dead fixme --- loopy/type_inference.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index ead1448da..dd9135483 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -915,9 +915,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx): # }}} - # FIXME: copy the explanation from make_function_ready_for_codegen - # here. 
- # {{{ check if insn missed during type inference def _instruction_missed_during_inference(insn): -- GitLab From 5f2337a06693e2515e0f2991e9acf134b04e2411 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Jun 2021 16:17:46 -0500 Subject: [PATCH 908/916] runs the translation unit and compares the result --- test/test_callables.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/test_callables.py b/test/test_callables.py index 7e00e545d..4bc37aeba 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -408,7 +408,9 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline): np.linalg.norm(2*x+3*y))) < 1e-15 -def test_stride_depending_on_args(): +def test_stride_depending_on_args(ctx_factory): + ctx = ctx_factory() + twice = lp.make_function( "{[i, j]: 0<=i, j < n}", """ @@ -436,11 +438,12 @@ def test_stride_depending_on_args(): prog = lp.merge([prog, twice]) prog = lp.merge([prog, thrice]) - # FIXME: actually test something - print(lp.generate_code_v2(prog).device_code()) + lp.auto_test_vs_ref(prog, ctx, prog, parameters={"N": 4}) -def test_unknown_stride_to_callee(): +def test_unknown_stride_to_callee(ctx_factory): + ctx = ctx_factory() + twice = lp.make_function( "{[i, j]: 0<=i, j < n}", """ @@ -459,8 +462,7 @@ def test_unknown_stride_to_callee(): prog = lp.merge([prog, twice]) - # FIXME: actually test something - print(lp.generate_code_v2(prog).device_code()) + lp.auto_test_vs_ref(prog, ctx, prog, parameters={"N": 4, "Nvar": 5}) def test_argument_matching_for_inplace_update(ctx_factory): -- GitLab From 66286cf627393b409bbef5980ddc5e5aa4bda583 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 1 Jun 2021 16:38:16 -0500 Subject: [PATCH 909/916] do not allow '_remove' as a kwarg in fix_paramters - was used earlier to allow kernels to fix strides/shape arguments in the callee. Not necessary anymore as that has been rightly formalized. --- loopy/transform/parameter.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 5ceaeb121..4916dd4e7 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -68,7 +68,7 @@ def assume(kernel, assumptions): # {{{ fix_parameter -def _fix_parameter(kernel, name, value, remove_argument, within=None): +def _fix_parameter(kernel, name, value, within=None): def process_set(s): var_dict = s.get_var_dict() @@ -104,7 +104,8 @@ def _fix_parameter(kernel, name, value, remove_argument, within=None): from loopy.kernel.array import ArrayBase new_args = [] for arg in kernel.args: - if arg.name == name and remove_argument: + if arg.name == name: + # remove from argument list continue if not isinstance(arg, ArrayBase): @@ -144,16 +145,10 @@ def fix_parameters(kernel, **value_dict): """ assert isinstance(kernel, LoopKernel) - # FIXME: Parameter / argument terminology? - - # FIXME: Is _remove the right approach? (I'm not sure it is.) Because of - # the potential namespace conflict. If yes, document. If no, fix. 
- - remove_arg = value_dict.pop("_remove", True) within = value_dict.pop("within", None) for name, value in value_dict.items(): - kernel = _fix_parameter(kernel, name, value, remove_arg, within) + kernel = _fix_parameter(kernel, name, value, within) return kernel -- GitLab From 96d77e52957995e5deedeb57a35e440dad72aa0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Tue, 1 Jun 2021 19:10:46 -0500 Subject: [PATCH 910/916] Add transfer_requirements_git_urls to downstream CI projects --- .github/workflows/ci.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index deb090557..68f9a62c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -146,8 +146,8 @@ jobs: env: DOWNSTREAM_PROJECT: ${{ matrix.downstream_project }} run: | - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh - . ./ci-support.sh + curl -L -O https://tiker.net/ci-support-v0 + . ./ci-support-v0 # Use "special grudge" for kcv3 and branches targeting it. if [[ "$DOWNSTREAM_PROJECT" = "grudge" ]] && [[ "$GITHUB_HEAD_REF" = "kernel_callables_v3-edit2" || "$GITHUB_BASE_REF" = "kernel_callables_v3-edit2" ]]; then @@ -161,15 +161,16 @@ jobs: if [[ "$DOWNSTREAM_PROJECT" = "pytential" ]] && [[ "$GITHUB_HEAD_REF" = "kernel_callables_v3-edit2" || "$GITHUB_BASE_REF" = "kernel_callables_v3-edit2" ]]; then sed -i "/egg=sumpy/ c git+https://github.com/inducer/sumpy.git@loopy-callables#egg=sumpy" requirements.txt fi + + transfer_requirements_git_urls ../requirements.txt ./requirements.txt sed -i "/egg=loopy/ c git+file://$(readlink -f ..)#egg=loopy" requirements.txt + sed -i "/mpi4py/ d" requirements.txt export CONDA_ENVIRONMENT=.test-conda-env-py3.yml # Avoid slow or complicated tests in downstream projects export PYTEST_ADDOPTS="-k 'not (slowtest or octave or mpi)'" - sed -i "/mpi4py/ d" requirements.txt - build_py_project_in_conda_env test_py_project -- GitLab From 2d7a474c6213497cdefd10fd7c99a4ca3f830e25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Tue, 1 Jun 2021 21:27:37 -0500 Subject: [PATCH 911/916] Bump version to 2021.2 --- loopy/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/version.py b/loopy/version.py index 78eb9beb7..aa94283d0 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -42,7 +42,7 @@ else: # }}} -VERSION = (2021, 1) +VERSION = (2021, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS -- GitLab From 834a3244d1d1fff8c1113801c4c8f23e5c9b074c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Jun 2021 11:34:40 -0500 Subject: [PATCH 912/916] SCHEDULED -> LINEARIZED --- loopy/codegen/__init__.py | 2 +- loopy/target/execution.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 0a849b44e..59cc89414 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -474,7 +474,7 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target, """ from loopy.kernel import KernelState - if kernel.state != KernelState.SCHEDULED: + if kernel.state != KernelState.LINEARIZED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") diff --git a/loopy/target/execution.py b/loopy/target/execution.py index be1d21714..68a60f28d 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -775,7 +775,7 @@ class KernelExecutorBase: from 
loopy.type_inference import infer_unknown_types program = infer_unknown_types(program, expect_completion=True) - if program.state < KernelState.SCHEDULED: + if program.state < KernelState.LINEARIZED: from loopy.preprocess import preprocess_program program = preprocess_program(program) -- GitLab From c599ddf494dd9edf7d20d69d2a50fcae5d7cf48a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 2 Jun 2021 11:59:01 -0500 Subject: [PATCH 913/916] get_iname_duplication_options: corrects default kwarg value --- loopy/transform/iname.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/iname.py index 85588da3e..8cb649b91 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -1009,7 +1009,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(kernel, use_boostable_into=False): +def get_iname_duplication_options(kernel, use_boostable_into=None): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication -- GitLab From 750da9b63cd5e4ad5aef579a3dc8ffc29b71941a Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 2 Jun 2021 19:03:08 -0500 Subject: [PATCH 914/916] Tweak ref_call documentation --- doc/ref_call.rst | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/doc/ref_call.rst b/doc/ref_call.rst index 4208624ce..2a48ecca0 100644 --- a/doc/ref_call.rst +++ b/doc/ref_call.rst @@ -11,14 +11,16 @@ Resolving and specialization ---------------------------- In :mod:`loopy`, a :class:`loopy.TranslationUnit` is a collection of callables -and entrypoints. Callable are of type -:class`:loopy.kernel.function_interface.InKernelCallable`. Any expression node -which has a callable corresponding to it appears as -:class:`~loopy.symbolic.ResolvedFunction`. The process of realizing a function as -a :class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as resolving. - - -During code-generation process for a :class:`~loopy.TranslationUnit`, a callable +and entrypoints. Callables are of type +:class:`loopy.kernel.function_interface.InKernelCallable`. Functions start life +as simple :class:`pymbolic.primitives.Call` nodes. Call resolution turns the function +identifiers in those calls into :class:`~loopy.symbolic.ResolvedFunction` objects. +Each resolved function has an entry in :attr:`TranslationUnit.callables_table`. +The process of realizing a function as a +:class:`~loopy.kernel.function_interface.InKernelCallable` is referred to as +resolving. + +During code generation for a :class:`~loopy.TranslationUnit`, a (resolved) callable is *specialized* depending on the types and shapes of the arguments passed at a call site. For example, a call to ``sin(x)`` in :mod:`loopy` is type-generic to begin with, but it later specialized to either ``sinf``, ``sin`` or ``sinl`` @@ -37,11 +39,10 @@ we typically aim to expose all the standard math functions defined for a :class:`~loopy.target.TargetBase`. Other foreign functions could be invoked by *registering* them. -An example demonstrating registering a CBlasGemv as a loopy callable: +An example demonstrating registering a ``CBlasGemv`` as a loopy callable: ..
literalinclude:: ../examples/python/call-external.py - Call Instruction for a kernel call ---------------------------------- @@ -52,6 +53,12 @@ arguments. Since a :class:`~loopy.kernel.data.KernelArgument` can be both an input and an output, such arguments would be a part of the call instruction's assignees as well as the call expression node's parameters. +Entry points +------------ + +Only callables in :attr:`loopy.TranslationUnit.entrypoints` can be called from +the outside. All other callables are only visible from within the translation +unit, similar to C's ``static`` functions. Reference --------- -- GitLab From be9636d28691ce0b737226d969b7d9796ac38b6f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni <15399010+kaushikcfd@users.noreply.github.com> Date: Sat, 5 Jun 2021 18:27:55 -0500 Subject: [PATCH 915/916] [callables] Test kernel call with non 1-step slice (#396) * test a non1-step slice * highlight the difference between make_slab and slice's access map * corrects domain for slices with 'start' * sharpens error condition - non-integral step sizes cannot be supported in loopy's domain representation * tests slices with more initial start expressions * make_slab: step -> iname_multiplier, disallow -ve multipliers - make_slab and slices are different, using the same names is confusion. Besides that, 'step' wasn't a representative name for the input --- loopy/isl_helpers.py | 38 ++++++++++++++---------------- loopy/kernel/creation.py | 13 +++++++++- test/test_callables.py | 51 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 22 deletions(-) diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 3e841ba87..d67df1154 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -60,29 +60,34 @@ def dump_space(ls): # {{{ make_slab -def make_slab(space, iname, start, stop, step=1): +def make_slab(space, iname, start, stop, iname_multiplier=1): """ Returns an instance of :class:`islpy._isl.BasicSet`, which satisfies the - constraint ``start <= step*iname < stop``. + constraint ``start <= iname_multiplier*iname < stop``. :arg space: An instance of :class:`islpy._isl.Space`. :arg iname: + Either an instance of :class:`str` as a name of the ``iname`` or a tuple of ``(iname_dt, iname_dx)`` indicating the *iname* in the space. :arg start: + An instance of :class:`int` or an instance of :class:`islpy._isl.Aff` indicating the lower bound of - ``step*iname``(inclusive). + ``iname_multiplier*iname``(inclusive). :arg stop: + An instance of :class:`int` or an instance of :class:`islpy._isl.Aff` indicating the upper bound of - ``step*iname``. + ``iname_multiplier*iname``. + + :arg iname_multiplier: - :arg step: - An instance of :class:`int`. + A strictly positive :class:`int` denoting *iname*'s coefficient in the + above inequality expression. 
""" zero = isl.Aff.zero_on_domain(space) @@ -112,25 +117,16 @@ def make_slab(space, iname, start, stop, step=1): iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1) - if step > 0: - result = (isl.BasicSet.universe(space) - # start <= step*iname - .add_constraint(isl.Constraint.inequality_from_aff( - step*iname_aff - start)) - # step*iname < stop - .add_constraint(isl.Constraint.inequality_from_aff( - stop-1 - step*iname_aff))) - elif step < 0: + if iname_multiplier > 0: result = (isl.BasicSet.universe(space) - # start >= (-step)*iname + # start <= iname_multiplier*iname .add_constraint(isl.Constraint.inequality_from_aff( - step*iname_aff + start)) - # (-step)*iname > stop + iname_multiplier*iname_aff - start)) + # iname_multiplier*iname < stop .add_constraint(isl.Constraint.inequality_from_aff( - -stop-1 - step*iname_aff))) + stop-1 - iname_multiplier*iname_aff))) else: - # step = 0 - raise LoopyError("0 step not allowed in make_slab.") + raise LoopyError("iname_multiplier must be strictly positive") return result diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 4d1e86ca7..b9cf234c6 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -1899,6 +1899,8 @@ def normalize_slice_params(slice, dimension_length): :arg dimension_length: Length of the axis being sliced. """ from pymbolic.primitives import Slice + from numbers import Integral + assert isinstance(slice, Slice) start, stop, step = slice.start, slice.stop, slice.step @@ -1924,6 +1926,10 @@ def normalize_slice_params(slice, dimension_length): # }}} + if not isinstance(step, Integral): + raise LoopyError("Non-integral step sizes lead to non-affine domains =>" + " not supported") + return start, stop, step @@ -2063,7 +2069,12 @@ class SliceToInameReplacer(IdentityMapper): from loopy.isl_helpers import make_slab for iname, (start, stop, step) in sar_bounds.items(): - iname_set = iname_set & make_slab(space, iname, start, stop, step) + if step > 0: + iname_set = iname_set & make_slab(space, iname, 0, + stop-start, step) + else: + iname_set = iname_set & make_slab(space, iname, 0, + start-stop, -step) subarray_ref_domains.append(iname_set) diff --git a/test/test_callables.py b/test/test_callables.py index 4bc37aeba..c19c7f1d0 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -855,6 +855,57 @@ def test_kc_with_floor_div_in_expr(ctx_factory, inline): lp.auto_test_vs_ref(knl, ctx, knl) +@pytest.mark.parametrize("start", [5, 6, 7]) +@pytest.mark.parametrize("inline", [True, False]) +def test_non1_step_slices(ctx_factory, start, inline): + # See https://github.com/inducer/loopy/pull/222#discussion_r645905188 + + ctx = ctx_factory() + cq = cl.CommandQueue(ctx) + + callee = lp.make_function( + "{[i]: 0<=i 1: exec(sys.argv[1]) -- GitLab From b415c506e0fe31b96d918d7bf7de43dc700fe917 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 7 Jun 2021 12:48:55 -0500 Subject: [PATCH 916/916] Some trivial (doc and other fixes) from callables review --- loopy/kernel/function_interface.py | 9 ++------- loopy/transform/callable.py | 6 ++++-- loopy/translation_unit.py | 16 ++++------------ 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index c762fc621..55a38f3e8 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -155,7 +155,7 @@ class ExpressionIsScalarChecker(WalkMapper): def map_sub_array_ref(self, expr): raise LoopyError("Sub-array refs can only be used as 
call's parameters" - f" or assignees. '{expr}'violates this.") + f" or assignees. '{expr}' violates this.") def map_call(self, expr): self.rec(expr.parameters) @@ -211,12 +211,7 @@ def get_arg_descriptor_for_expression(kernel, expr): sub_dim_tags = [] sub_shape = [] - # FIXME This blindly assumes that dim_tag has a stride and - # will not work for non-stride dim tags (e.g. vec or sep). - - # (AK) FIXME: This will almost always be nonlinear--when does this - # actually help? Maybe remove this? - # (KK) Reply: This helps in identifying identities like + # This helps in identifying identities like # "2*(i//2) + i%2" := "i" # See the kernel in # test_callables.py::test_shape_translation_through_sub_array_refs diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index e88c88239..0180fe208 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -75,7 +75,7 @@ def register_callable(translation_unit, function_identifier, callable_, def merge(translation_units): """ - :param translation_units: A list of :class:`loopy.TranslationUnit`. + :param translation_units: A sequence of :class:`loopy.TranslationUnit`. :returns: An instance of :class:`loopy.TranslationUnit` which contains all the callables from each of the *translation_units. @@ -246,7 +246,6 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn): Returns a copy of *caller_knl* with the *call_insn* in the *kernel* replaced by inlining *callee_knl* into it within it. - :arg call_insn: An instance of `loopy.CallInstruction` of the call-site. """ import pymbolic.primitives as prim @@ -510,6 +509,8 @@ def inline_callable_kernel(translation_unit, function_name): # }}} +# {{{ rename_callable + def rename_callable(program, old_name, new_name=None, existing_ok=False): """ :arg program: An instance of :class:`loopy.TranslationUnit` @@ -564,5 +565,6 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False): return program.copy(callables_table=new_callables_table, entrypoints=new_entrypoints) +# }}} # vim: foldmethod=marker diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index 5804707cc..83ceeef68 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -139,7 +139,7 @@ class TranslationUnit(ImmutableRecord): The :class:`~loopy.LoopKernel` representing the main entrypoint of the program, if defined. Currently, this attribute may only be - accessed if there is exactly one entrypoint in the program. + accessed if there is exactly one entrypoint in the translation unit. .. attribute:: callables_table @@ -154,7 +154,7 @@ class TranslationUnit(ImmutableRecord): .. attribute:: func_id_to_in_knl_callables_mappers A :class:`frozenset` of functions of the signature ``(target: - TargetBase, function_indentifier: str)`` that would return an instance + TargetBase, function_indentifier: str)`` that returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. .. automethod:: __call__ @@ -305,19 +305,11 @@ class TranslationUnit(ImmutableRecord): :attr:`TranslationUnit.target` is an executable target. :arg entrypoint: The name of the entrypoint callable to be called. - Defaults to *the* entrypoint if there is only one. + Defaults to :attr:`default_entrypoint`. 
""" entrypoint = kwargs.get("entrypoint", None) - if entrypoint is None: - # did not receive an entrypoint for the program to execute - if len(self.entrypoints) == 1: - entrypoint, = self.entrypoints - else: - raise TypeError("TranslationUnit.__call__() missing 1 required" - " keyword argument: 'entrypoint'. " - "(Multiple possible entrypoints are present in the " - "program.)") + entrypoint = self.default_entrypoint.name if entrypoint not in self.entrypoints: raise LoopyError(f"'{entrypoint}' not in list of possible entrypoints " -- GitLab