From c4c1b6d397a58a35450fc40056b6c57d58ff1fd9 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Mon, 7 Dec 2015 18:16:31 -0600 Subject: [PATCH] Flatten target submodules, tweak CUDA backend --- loopy/target/cuda.py | 271 ++++++++++++++++++ loopy/target/{ispc/__init__.py => ispc.py} | 0 .../target/{opencl/__init__.py => opencl.py} | 0 .../{pyopencl/__init__.py => pyopencl.py} | 0 test/test_loopy.py | 24 +- 5 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 loopy/target/cuda.py rename loopy/target/{ispc/__init__.py => ispc.py} (100%) rename loopy/target/{opencl/__init__.py => opencl.py} (100%) rename loopy/target/{pyopencl/__init__.py => pyopencl.py} (100%) diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py new file mode 100644 index 000000000..6f66e7b99 --- /dev/null +++ b/loopy/target/cuda.py @@ -0,0 +1,271 @@ +"""CUDA target independent of PyCUDA.""" + +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+
+from loopy.target.c import CTarget
+from pytools import memoize_method
+from loopy.diagnostic import LoopyError
+
+from pymbolic import var
+
+
+# {{{ vector types
+
+class vec:  # noqa
+    """Namespace for the numpy dtypes that mirror CUDA's vector types.
+
+    Filled in by :func:`_create_vector_types`, which sets one attribute per
+    vector type (e.g. ``vec.float4``) as well as the lookup tables
+    ``types``, ``names_and_dtypes`` and ``type_to_scalar_and_count``.
+    """
+
+
+def _create_vector_types():
+    """Build the vector dtypes and record them on :class:`vec`.
+
+    For every base type and component count in the table below, a structured
+    numpy dtype with fields ``s0``, ``s1``, ... is created. Where numpy
+    accepts it, the fields are additionally titled ``x``, ``y``, ``z``, ``w``.
+
+    The following attributes of :class:`vec` are (re)initialized:
+
+    * ``vec.<name>``: the dtype itself, e.g. ``vec.int2``
+    * ``vec.names_and_dtypes``: list of ``(name, dtype)`` in creation order
+    * ``vec.types``: maps ``(scalar dtype, count)`` to the vector dtype
+    * ``vec.type_to_scalar_and_count``: maps a vector dtype back to
+      ``(scalar dtype, count)``
+    """
+    import struct
+
+    field_names = ["x", "y", "z", "w"]
+
+    # CUDA's 'long' follows the host compiler's C 'long', which is not
+    # necessarily pointer-sized (it is 32 bits wide on 64-bit Windows), so
+    # key the choice off sizeof(long) rather than off the pointer size.
+    if struct.calcsize("l") * 8 == 32:
+        long_dtype = np.int32
+        ulong_dtype = np.uint32
+    else:
+        long_dtype = np.int64
+        ulong_dtype = np.uint64
+
+    vec.types = {}
+    vec.names_and_dtypes = []
+    vec.type_to_scalar_and_count = {}
+
+    for base_name, base_type, counts in [
+        ('char', np.int8, [1, 2, 3, 4]),
+        ('uchar', np.uint8, [1, 2, 3, 4]),
+        ('short', np.int16, [1, 2, 3, 4]),
+        ('ushort', np.uint16, [1, 2, 3, 4]),
+        ('int', np.int32, [1, 2, 3, 4]),
+        ('uint', np.uint32, [1, 2, 3, 4]),
+        ('long', long_dtype, [1, 2, 3, 4]),
+        ('ulong', ulong_dtype, [1, 2, 3, 4]),
+        ('longlong', np.int64, [1, 2]),
+        ('ulonglong', np.uint64, [1, 2]),
+        ('float', np.float32, [1, 2, 3, 4]),
+        ('double', np.float64, [1, 2]),
+        ]:
+        for count in counts:
+            name = "%s%d" % (base_name, count)
+
+            titles = field_names[:count]
+
+            names = ["s%d" % i for i in range(count)]
+            # Pad with untitled entries should there ever be more components
+            # than there are field names.
+            if len(titles) < len(names):
+                titles.extend((len(names)-len(titles))*[None])
+
+            # Preferred: named fields that are also reachable by title.
+            # The fallbacks are presumably for numpy versions/implementations
+            # that reject this spec -- TODO confirm which ones.
+            try:
+                dtype = np.dtype(dict(
+                    names=names,
+                    formats=[base_type]*count,
+                    titles=titles))
+            except NotImplementedError:
+                try:
+                    dtype = np.dtype([((n, title), base_type)
+                                      for (n, title) in zip(names, titles)])
+                except TypeError:
+                    # Last resort: give up on titles altogether.
+                    dtype = np.dtype([(n, base_type) for n in names])
+
+            setattr(vec, name, dtype)
+
+            vec.names_and_dtypes.append((name, dtype))
+
+            vec.types[np.dtype(base_type), count] = dtype
+            vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count
+
+_create_vector_types()
+
+
+def _register_vector_types(dtype_registry):
+    """Register every dtype in ``vec.names_and_dtypes`` with *dtype_registry*
+    under its vector type name."""
+    for name, dtype in vec.names_and_dtypes:
+        dtype_registry.get_or_register_dtype(name, dtype)
+
+# }}}
+
+
+# {{{ function mangler
+
+def cuda_function_mangler(kernel, name, arg_dtypes):
+    """Resolve *name* to a function name and result type for CUDA.
+
+    :arg kernel: not used by this mangler
+    :arg name: the function identifier; anything but a :class:`str` is not
+        handled
+    :arg arg_dtypes: the dtypes of the arguments of the call
+    :return: a tuple ``(result_dtype, c_name)``, or *None* if this mangler
+        does not handle the function
+    :raises RuntimeError: for two-argument ``min``/``max`` on complex numbers
+    """
+    if not isinstance(name, str):
+        return None
+
+    if name in ["max", "min"] and len(arg_dtypes) == 2:
+        dtype = np.find_common_type([], arg_dtypes)
+
+        if dtype.kind == "c":
+            raise RuntimeError("min/max do not support complex numbers")
+
+        # floating point arguments go to fmax/fmin
+        if dtype.kind == "f":
+            name = "f" + name
+
+        return dtype, name
+
+    # Compare for equality: 'name in "atan2"' would be a substring test and
+    # would also match "", "a", "tan", ...
+    if name == "atan2" and len(arg_dtypes) == 2:
+        return arg_dtypes[0], name
+
+    if name == "dot":
+        # The result has the scalar type of the first argument's "x" field.
+        scalar_dtype = arg_dtypes[0].fields["x"][0]
+        return scalar_dtype, name
+
+    return None
+
+# }}}
+
+
+# {{{ target
+
+class CudaTarget(CTarget):
+    """A target generating CUDA C code without depending on PyCUDA."""
+
+    # {{{ library
+
+    def function_manglers(self):
+        """Return the inherited function manglers followed by
+        :func:`cuda_function_mangler`."""
+        return (
+                super(CudaTarget, self).function_manglers() + [
+                    cuda_function_mangler
+                    ])
+
+    # }}}
+
+    # {{{ types
+
+    @memoize_method
+    def get_dtype_registry(self):
+        """Return a ``DTypeRegistry`` populated by
+        ``fill_registry_with_opencl_c_types`` and extended with the vector
+        types of this module. The result is memoized."""
+        from loopy.target.c.compyte.dtypes import (DTypeRegistry,
+                fill_registry_with_opencl_c_types)
+
+        result = DTypeRegistry()
+        fill_registry_with_opencl_c_types(result)
+
+        # no complex number support--needs PyOpenCLTarget
+
+        _register_vector_types(result)
+
+        return result
+
+    def is_vector_dtype(self, dtype):
+        """Return whether *dtype* is one of the vector dtypes in
+        ``vec.types``."""
+        # Test for membership: returning the list itself would be truthy for
+        # every dtype.
+        return dtype in list(vec.types.values())
+
+    def vector_dtype(self, base, count):
+        """Return the vector dtype with *count* components of scalar type
+        *base*.
+
+        :raises KeyError: if ``vec.types`` has no such entry
+        """
+        return vec.types[base, count]
+
+    # }}}
+
+    # {{{ top-level codegen
+
+    def wrap_function_declaration(self, kernel, fdecl):
+        """Return *fdecl* wrapped in :class:`cgen.cuda.CudaGlobal`."""
+        from cgen.cuda import CudaGlobal
+        return CudaGlobal(fdecl)
+
+    def generate_code(self, kernel, codegen_state, impl_arg_info):
+        """Return ``(code, implemented_domains)`` as generated by the base
+        class; nothing CUDA-specific is added here."""
+        code, implemented_domains = (
+                super(CudaTarget, self).generate_code(
+                    kernel, codegen_state, impl_arg_info))
+
+        return code, implemented_domains
+
+    def generate_body(self, kernel, codegen_state):
+        """Return ``(body, implemented_domains)`` as generated by the base
+        class.
+
+        :raises NotImplementedError: if *kernel* has an image argument
+        """
+        body, implemented_domains = (
+                super(CudaTarget, self).generate_body(kernel, codegen_state))
+
+        from loopy.kernel.data import ImageArg
+
+        if any(isinstance(arg, ImageArg) for arg in kernel.args):
+            raise NotImplementedError("not yet: texture arguments in CUDA")
+
+        return body, implemented_domains
+
+    # }}}
+
+    # {{{ code generation guts
+
+    _GRID_AXES = "xyz"
+
+    def get_global_axis_expr(self, axis):
+        """Return the expression ``blockIdx.<x|y|z>`` for grid axis *axis*."""
+        return var("blockIdx").attr(self._GRID_AXES[axis])
+
+    def get_local_axis_expr(self, axis):
+        """Return the expression ``threadIdx.<x|y|z>`` for grid axis *axis*."""
+        return var("threadIdx").attr(self._GRID_AXES[axis])
+
+    _VEC_AXES = "xyzw"
+
+    def add_vector_access(self, access_str, index):
+        """Return *access_str*, parenthesized, followed by a member access to
+        component number *index* (``x``, ``y``, ``z`` or ``w``)."""
+        return "(%s).%s" % (access_str, self._VEC_AXES[int(index)])
+
+    def emit_barrier(self, kind, comment):
+        """
+        :arg kind: ``"local"`` or ``"global"``
+        :arg comment: text to attach to the barrier as a C comment; may be
+            empty or *None* for no comment
+        :return: a :class:`loopy.codegen.GeneratedInstruction`.
+        :raises LoopyError: for ``"global"`` barriers, which CUDA does not
+            have, and for unknown values of *kind*
+        """
+        if kind == "local":
+            # A falsy comment (e.g. None) must not end up in the statement
+            # text as the string "None".
+            if comment:
+                comment = " /* %s */" % comment
+            else:
+                comment = ""
+
+            from loopy.codegen import GeneratedInstruction
+            from cgen import Statement
+            return GeneratedInstruction(
+                    ast=Statement("__syncthreads()%s" % comment),
+                    implemented_domain=None)
+        elif kind == "global":
+            raise LoopyError("CUDA does not have global barriers")
+        else:
+            raise LoopyError("unknown barrier kind")
+
+    def wrap_temporary_decl(self, decl, is_local):
+        """Return *decl* wrapped in :class:`cgen.cuda.CudaShared` if
+        *is_local* is true, and *decl* unchanged otherwise."""
+        if is_local:
+            from cgen.cuda import CudaShared
+            return CudaShared(decl)
+        else:
+            return decl
+
+    def get_global_arg_decl(self, name, shape, dtype, is_written):
+        """Return the declaration of a restrict pointer argument *name* with
+        entries of type *dtype*, wrapped in ``Const`` unless *is_written*.
+        *shape* is not used.
+        """
+        from loopy.codegen import POD  # uses the correct complex type
+        from cgen import Const
+        from cgen.cuda import CudaRestrictPointer
+
+        arg_decl = CudaRestrictPointer(POD(self, dtype, name))
+
+        if not is_written:
+            arg_decl = Const(arg_decl)
+
+        return arg_decl
+
+    def get_image_arg_decl(self, name, shape, dtype, is_written):
+        """Image arguments are not supported; always raises
+        :exc:`NotImplementedError`."""
+        raise NotImplementedError("not yet: texture arguments in CUDA")
+
+    def get_constant_arg_decl(self, name, shape, dtype, is_written):
+        """Return the declaration of a restrict pointer argument *name* with
+        entries of type *dtype*, wrapped in ``Const`` unless *is_written*
+        and then in :class:`cgen.cuda.CudaConstant`. *shape* is not used.
+        """
+        from loopy.codegen import POD  # uses the correct complex type
+        from cgen import RestrictPointer, Const
+        from cgen.cuda import CudaConstant
+
+        # POD takes the target first, as in get_global_arg_decl.
+        arg_decl = RestrictPointer(POD(self, dtype, name))
+
+        if not is_written:
+            arg_decl = Const(arg_decl)
+
+        return CudaConstant(arg_decl)
+
+    # }}}
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/target/ispc/__init__.py b/loopy/target/ispc.py
similarity index 100%
rename from loopy/target/ispc/__init__.py
rename to loopy/target/ispc.py
diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl.py
similarity index 100%
rename from loopy/target/opencl/__init__.py
rename to loopy/target/opencl.py
diff --git a/loopy/target/pyopencl/__init__.py b/loopy/target/pyopencl.py
similarity index 100%
rename from loopy/target/pyopencl/__init__.py
rename to loopy/target/pyopencl.py
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 3e14ed2d8..b98e9a27a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2313,7 +2313,7 @@ def test_collect_common_factors(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=13))
 
 
-def test_ispc_backend():
+def test_ispc_target():
     from loopy.target.ispc import ISPCTarget
 
     knl = lp.make_kernel(
@@ -2335,6 +2335,28 @@ def test_ispc_backend():
                 lp.preprocess_kernel(knl)))[0])
 
 
+def test_cuda_target():
+    from loopy.target.cuda import CudaTarget
+
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "out[i] = 2*a[i]",
+        [
+            lp.GlobalArg("out,a", np.float32, shape=lp.auto),
+            "..."
+            ],
+        target=CudaTarget())
+
+    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
+    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
+    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
+
+    print(
+        lp.generate_code(
+            lp.get_one_scheduled_kernel(
+                lp.preprocess_kernel(knl)))[0])
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab