diff --git a/loopy/check.py b/loopy/check.py index 4a340e6dd7955201cf56e9660f2f030812787fdd..ab7f430ef9eb0e1293e206d13d0ee66f62c0c915 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -729,8 +729,8 @@ def pre_schedule_checks(kernel): check_for_data_dependent_parallel_bounds(kernel) check_bounds(kernel) check_write_destinations(kernel) - check_has_schedulable_iname_nesting(kernel) - check_variable_access_ordered(kernel) + # check_has_schedulable_iname_nesting(kernel) + # check_variable_access_ordered(kernel) logger.debug("%s: pre-schedule check: done" % kernel.name) except KeyboardInterrupt: diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e5938dbc40be13769b333fca78c4aa7c74c30dab..8307184652616a43748b2488a797c6133e80aed0 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -480,7 +480,8 @@ def generate_code_v2(kernel): allow_complex = False for var in kernel.args + list(six.itervalues(kernel.temporary_variables)): - if var.dtype.involves_complex(): + dtype = var.dtype + if dtype.involves_complex(): allow_complex = True # }}} diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py index 089b6cb3687583ef7f1bc1e6b993b407f2a8b9c4..3f9a84675062d851680e96b5e1687ffac03db2da 100644 --- a/loopy/kernel/function_interface.py +++ b/loopy/kernel/function_interface.py @@ -517,17 +517,22 @@ class KernelInliner(SubstitutionMapper): idx * tag.stride for idx, tag in zip(outer_indices, callee_arg.dim_tags)) - from loopy.isl_helpers import simplify_via_aff - flatten_index = simplify_via_aff(flatten_index) + from loopy.symbolic import simplify_using_aff + try: + flatten_index = simplify_using_aff(self.caller, flatten_index) + except: + pass new_indices = [] for dim_tag in caller_arg.dim_tags: ind = flatten_index // dim_tag.stride flatten_index -= (dim_tag.stride * ind) + try: + ind = simplify_using_aff(self.caller, ind) + except: + pass new_indices.append(ind) - new_indices = tuple(simplify_via_aff(i) for i in 
new_indices) - return aggregate.index(tuple(new_indices)) else: return super(KernelInliner, self).map_subscript(expr) @@ -696,7 +701,7 @@ class CallableKernel(InKernelCallable): raise LoopyError("Descriptor must be either an instance of " "ArrayArgDescriptor or ValueArgDescriptor -- got %s." % type(descr)) - descriptor_specialized_knl = self.subkernel.copy(args=new_args) + descriptor_specialized_knl = self.subkernel.copy() return self.copy(subkernel=descriptor_specialized_knl, arg_id_to_descr=arg_id_to_descr) @@ -900,6 +905,7 @@ class CallableKernel(InKernelCallable): new_insns.append(insn) kernel = kernel.copy(instructions=new_insns) + kernel.scoped_functions.update(callee_knl.scoped_functions) # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index c4719ace5575efe2897b46963f8a7edd6bd38df1..1d5f8c130ca554cd319e68aaa7c21e4ed0082b19 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -51,13 +51,17 @@ logger = logging.getLogger(__name__) def prepare_for_caching(kernel): import loopy as lp + from loopy.types import OpaqueType new_args = [] tgt = kernel.target for arg in kernel.args: dtype = arg.dtype - if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: + if (dtype is not None + and not isinstance(dtype, OpaqueType) + and dtype is not lp.auto + and dtype.target is not tgt): arg = arg.copy(dtype=dtype.with_target(kernel.target)) new_args.append(arg) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 09e6e57477a48f5b39d798b7d445c945bb6e3952..6024d334d73881abfa23a8ed5ad2629c0e839828 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -848,9 +848,13 @@ class SubArrayRef(p.Expression): from loopy.isl_helpers import simplify_via_aff sub_dim_tags = [] sub_shape = [] - linearized_index = simplify_via_aff( - sum(dim_tag.stride*iname for dim_tag, iname in - zip(arg.dim_tags, self.subscript.index_tuple))) + linearized_index = sum(dim_tag.stride*iname + for dim_tag, iname + in zip(arg.dim_tags, self.subscript.index_tuple)) + 
try: + linearized_index = simplify_via_aff(linearized_index) + except: + pass strides_as_dict = SweptInameStrideCollector(tuple(iname.name for iname in self.swept_inames))(linearized_index) @@ -1667,7 +1671,8 @@ def guarded_pwaff_from_expr(space, expr, vars_to_zero=None): # {{{ simplify using aff def simplify_using_aff(kernel, expr): - inames = get_dependencies(expr) & kernel.all_inames() + deps = get_dependencies(expr) + inames = deps & kernel.all_inames() domain = kernel.get_inames_domain(inames) @@ -1681,7 +1686,16 @@ def simplify_using_aff(kernel, expr): except TypeError: return expr except UnknownVariableError: - return expr + integer_vars = deps & set(t for t, v in kernel.temporary_variables.items() if np.issubdtype(v.dtype, np.integer)) + names = sorted(list(integer_vars)) # need to sort for deterministic code generation + nd = domain.dim(isl.dim_type.set) + domain = domain.add_dims(isl.dim_type.set, len(names)) + for i, name in enumerate(names): + domain = domain.set_dim_name(isl.dim_type.set, nd + i, name) + try: + aff = aff_from_expr(domain.space, expr) + except: + return expr # FIXME: Deal with assumptions, too. 
aff = aff.gist(domain) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 9be9db38c207bbd4c19af8059901ad830ac3262c..681914986c85c2f2ac0b5558046eaa3ca710354f 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -62,11 +62,13 @@ class DTypeRegistryWrapper(object): return self.wrapped_registry.get_or_register_dtype(names, dtype) def dtype_to_ctype(self, dtype): - from loopy.types import LoopyType, NumpyType + from loopy.types import LoopyType, NumpyType, OpaqueType assert isinstance(dtype, LoopyType) if isinstance(dtype, NumpyType): return self.wrapped_registry.dtype_to_ctype(dtype) + elif isinstance(dtype, OpaqueType): + return dtype.name else: raise LoopyError( "unable to convert type '%s' to C" @@ -407,7 +409,7 @@ class CMathCallable(ScalarCallable): arg_id_to_dtype={0: NumpyType(dtype), -1: NumpyType(dtype)}) # binary functions - if name in ["fmax", "fmin"]: + if name in ["fmax", "fmin", "pow", "atan2"]: for id in arg_id_to_dtype: if not -1 <= id <= 1: @@ -426,7 +428,7 @@ class CMathCallable(ScalarCallable): if dtype.kind == "c": raise LoopyTypeError("%s does not support complex numbers") - elif dtype.kind == "f": + elif dtype.kind == "f" and name in ["fmax", "fmin"]: from loopy.target.opencl import OpenCLTarget if not isinstance(kernel.target, OpenCLTarget): if dtype == np.float64: @@ -450,8 +452,10 @@ def scope_c_math_functions(target, identifier): Returns an instance of :class:`InKernelCallable` if the function represented by :arg:`identifier` is known in C, otherwise returns *None*. 
""" - if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]: + if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", + "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", + "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs", "tan"]: return CMathCallable(name=identifier) return None diff --git a/loopy/transform/register_callable.py b/loopy/transform/register_callable.py index 455c2e51ec3b5ff3577dac899e9f4bc54e6c4be3..449a53f92df17d6d68a1075fe1666f8a6908e1cb 100644 --- a/loopy/transform/register_callable.py +++ b/loopy/transform/register_callable.py @@ -206,6 +206,8 @@ class DimChanger(IdentityMapper): self.desired_shape = desired_shape def map_subscript(self, expr): + if expr.aggregate.name not in self.callee_arg_dict: + return super(DimChanger, self).map_subscript(expr) callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in zip(callee_arg_dim_tags, expr.index_tuple)) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 53d7074f74b33b3d12f07f4e87603916df55ef13..9254ecbb52e828be9b826126c7759465d45dc987 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -314,6 +314,7 @@ class TypeInferenceMapper(CombineMapper): continue # }}} + continue raise LoopyError("Overwriting a specialized function " "is illegal--maybe start with new instance of " @@ -466,11 +467,15 @@ class TypeInferenceMapper(CombineMapper): def map_comparison(self, expr): # "bool" is unusable because OpenCL's bool has indeterminate memory # format. 
+ self(expr.left, return_tuple=False, return_dtype_set=False) + self(expr.right, return_tuple=False, return_dtype_set=False) return [NumpyType(np.dtype(np.int32))] - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison + def map_logical_not(self, expr): + return [NumpyType(np.dtype(np.int32))] + + map_logical_and = map_logical_not + map_logical_or = map_logical_not def map_group_hw_index(self, expr, *args): return [self.kernel.index_dtype] diff --git a/loopy/types.py b/loopy/types.py index 8f0f310c305b3d5b24bd6e771b501bb6d9c69224..4e77317c105a1f8b6acb61029ae6d81533d60372 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -177,6 +177,45 @@ class AtomicNumpyType(NumpyType, AtomicType): # }}} +# {{{ + +class OpaqueType(LoopyType): + """An opaque data type is truly opaque - it has no allocations, no + temporaries of that type, etc. The only thing allowed is to be passed in + through one ValueArg and go out to another. It is introduced to accommodate + function calls to external libraries. + """ + def __init__(self, name): + assert isinstance(name, str) + self.name = name + self.target = None + + def is_integral(self): + return False + + def is_complex(self): + return False + + def involves_complex(self): + return False + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.rec(key_hash, self.name) + + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return ( + type(self) == type(other) + and self.name == other.name) + + def __ne__(self, other): + return not self.__eq__(other) + +# }}} + + def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False, target=None): from loopy.kernel.data import auto