diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index 16e859e0be38808e5af9e26af5cf540547b704f3..4868f70af81ae54972e7d81282b62798da233407 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -108,6 +108,10 @@ class ReductionIsNotTriangularError(LoopyError):
     pass
 
 
+class LoopyTypeError(LoopyError):
+    pass
+
+
 class ExpressionNotAffineError(LoopyError):
     pass
 
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 4f4ac4f31d5ee264ea9bccc6466f98ddbb1dfaab..e870f46e60ebf9c817cc29db529562031b693bb5 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -29,7 +29,7 @@ import six
 import numpy as np  # noqa
 from loopy.kernel.data import CallMangleInfo
 from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
-from loopy.diagnostic import LoopyError
+from loopy.diagnostic import LoopyError, LoopyTypeError
 from cgen import Pointer, NestedDeclarator, Block
 from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
 from pymbolic.mapper.stringifier import PREC_NONE
@@ -328,35 +328,68 @@ def c_symbol_mangler(kernel, name):
 
 # {{{ function mangler
 
-def c_function_mangler(target, name, arg_dtypes):
-    # convert abs(), min(), max() to fabs(), fmin(), fmax() to comply with
-    # C99 standard
+def c_math_mangler(target, name, arg_dtypes, modify_name=True):
+    # Function mangler for math functions defined in the C standard.
+    # Converts abs, min, max to fabs, fmin, fmax.
+    # If modify_name is True, function names are suffixed according to the
+    # floating point type of the arguments (e.g. cos(double), cosf(float)).
+    # This should be set to True for C and CUDA, False for OpenCL.
     if not isinstance(name, str):
         return None
 
-    if (name == "abs"
+    if name in ["abs", "min", "max"]:
+        name = "f" + name
+
+    # unary functions
+    if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh",
+                 "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]
             and len(arg_dtypes) == 1
             and arg_dtypes[0].numpy_dtype.kind == "f"):
+
+        dtype = arg_dtypes[0].numpy_dtype
+
+        if modify_name:
+            if dtype == np.float64:
+                pass  # fabs
+            elif dtype == np.float32:
+                name = name + "f"  # fabsf
+            elif dtype == np.longdouble:
+                name = name + "l"  # fabsl (C long double)
+            else:
+                raise LoopyTypeError("%s does not support type %s" % (name, dtype))
+
         return CallMangleInfo(
-                target_name="fabs",
+                target_name=name,
                 result_dtypes=arg_dtypes,
                 arg_dtypes=arg_dtypes)
 
-    if name in ["max", "min"] and len(arg_dtypes) == 2:
+    # binary functions
+    if (name in ["fmax", "fmin"]
+            and len(arg_dtypes) == 2):
+
         dtype = np.find_common_type(
-                [], [dtype.numpy_dtype for dtype in arg_dtypes])
+            [], [dtype.numpy_dtype for dtype in arg_dtypes])
 
         if dtype.kind == "c":
-            raise RuntimeError("min/max do not support complex numbers")
-
-        if dtype.kind == "f":
-            name = "f" + name
-
-        result_dtype = NumpyType(dtype)
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(result_dtype,),
-                arg_dtypes=2*(result_dtype,))
+            raise LoopyTypeError("%s does not support complex numbers" % name)
+
+        elif dtype.kind == "f":
+            if modify_name:
+                if dtype == np.float64:
+                    pass  # fmin
+                elif dtype == np.float32:
+                    name = name + "f"  # fminf
+                elif dtype == np.longdouble:
+                    name = name + "l"  # fminl (C long double)
+                else:
+                    raise LoopyTypeError("%s does not support type %s"
+                                         % (name, dtype))
+
+            result_dtype = NumpyType(dtype)
+            return CallMangleInfo(
+                    target_name=name,
+                    result_dtypes=(result_dtype,),
+                    arg_dtypes=2*(result_dtype,))
 
     return None
 
@@ -369,7 +402,7 @@ class CASTBuilder(ASTBuilderBase):
     def function_manglers(self):
         return (
                 super(CASTBuilder, self).function_manglers() + [
-                    c_function_mangler
+                    c_math_mangler
                     ])
 
     def symbol_manglers(self):
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 2763caace891570a1b7f8b13f225001a03d3aa65..d2fe4157fc1ff6f9eb7817bea7da8da7e31bbdc1 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -31,13 +31,15 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
 from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
-from loopy.target.c import DTypeRegistryWrapper
+from loopy.target.c import DTypeRegistryWrapper, c_math_mangler
 from loopy.kernel.data import temp_var_scope, CallMangleInfo
 from pymbolic import var
 
+from functools import partial
 
 # {{{ dtype registry wrappers
 
+
 class DTypeRegistryWrapperWithAtomics(DTypeRegistryWrapper):
     def get_or_register_dtype(self, names, dtype=None):
         if dtype is not None:
@@ -167,6 +169,18 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
     if not isinstance(name, str):
         return None
 
+    # OpenCL has min(), max() for integer types
+    if name in ["max", "min"] and len(arg_dtypes) == 2:
+        dtype = np.find_common_type(
+                [], [dtype.numpy_dtype for dtype in arg_dtypes])
+
+        if dtype.kind == "i":
+            result_dtype = NumpyType(dtype)
+            return CallMangleInfo(
+                    target_name=name,
+                    result_dtypes=(result_dtype,),
+                    arg_dtypes=2*(result_dtype,))
+
     if name == "dot":
         scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
         return CallMangleInfo(
@@ -354,9 +368,11 @@ class OpenCLCASTBuilder(CASTBuilder):
 
     def function_manglers(self):
         return (
-                super(OpenCLCASTBuilder, self).function_manglers() + [
-                    opencl_function_mangler
-                    ])
+                [
+                    opencl_function_mangler,
+                    partial(c_math_mangler, modify_name=False)
+                ] +
+                super(OpenCLCASTBuilder, self).function_manglers())
 
     def symbol_manglers(self):
         return (
diff --git a/loopy/version.py b/loopy/version.py
index e142162729d5a374082fa853dcc763665f7dfe33..744eb90a46265d0085ac6ff56455398729aad33f 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v69-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v70-islpy%s" % _islpy_version
diff --git a/test/test_target.py b/test/test_target.py
index aa6f004634f207a7b9733da4a3d7e06d13d7db7c..d3cf2670cb0db0eb5d0046ce1d816b679d4a1ed8 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -30,6 +30,9 @@ import pyopencl.clmath  # noqa
 import pyopencl.clrandom  # noqa
 import pytest
 
+from loopy.target.c import CTarget
+from loopy.target.opencl import OpenCLTarget
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -96,8 +99,6 @@ def test_cuda_target():
 
 
 def test_generate_c_snippet():
-    from loopy.target.c import CTarget
-
     from pymbolic import var
     I = var("I")  # noqa
     f = var("f")
@@ -140,10 +141,17 @@ def test_generate_c_snippet():
     print(lp.generate_body(knl))
 
 
-def test_c_min_max():
-    # Test fmin() fmax() is generated for C backend instead of max() and min()
-    from loopy.target.c import CTarget
+@pytest.mark.parametrize("target", [CTarget, OpenCLTarget])
+@pytest.mark.parametrize("tp", ["f32", "f64"])
+def test_math_function(target, tp):
+    # Test that the correct math function names are generated by the C and
+    # OpenCL backends for the different floating point data types
+
+    data_type = {"f32": np.float32,
+                 "f64": np.float64}[tp]
+
     import pymbolic.primitives as p
+
     i = p.Variable("i")
     xi = p.Subscript(p.Variable("x"), i)
     yi = p.Subscript(p.Variable("y"), i)
@@ -151,20 +159,32 @@ def test_c_min_max():
 
     n = 100
     domain = "{[i]: 0<=i<%d}" % n
-    data = [lp.GlobalArg("x", np.float64, shape=(n,)),
-            lp.GlobalArg("y", np.float64, shape=(n,)),
-            lp.GlobalArg("z", np.float64, shape=(n,))]
+    data = [lp.GlobalArg("x", data_type, shape=(n,)),
+            lp.GlobalArg("y", data_type, shape=(n,)),
+            lp.GlobalArg("z", data_type, shape=(n,))]
 
     inst = [lp.Assignment(xi, p.Variable("min")(yi, zi))]
-    knl = lp.make_kernel(domain, inst, data, target=CTarget())
+    knl = lp.make_kernel(domain, inst, data, target=target())
     code = lp.generate_code_v2(knl).device_code()
+
     assert "fmin" in code
 
+    if tp == "f32" and target == CTarget:
+        assert "fminf" in code
+    else:
+        assert "fminf" not in code
+
     inst = [lp.Assignment(xi, p.Variable("max")(yi, zi))]
-    knl = lp.make_kernel(domain, inst, data, target=CTarget())
+    knl = lp.make_kernel(domain, inst, data, target=target())
     code = lp.generate_code_v2(knl).device_code()
+
     assert "fmax" in code
 
+    if tp == "f32" and target == CTarget:
+        assert "fmaxf" in code
+    else:
+        assert "fmaxf" not in code
+
 
 @pytest.mark.parametrize("tp", ["f32", "f64"])
 def test_random123(ctx_factory, tp):