diff --git a/doc/source/driver.rst b/doc/source/driver.rst
index 4eaa839db1c42376076bc43d93e5ae0ab5d0fcbe..09f389fa8754580ad15bddbb7a9f5c603761e4d7 100644
--- a/doc/source/driver.rst
+++ b/doc/source/driver.rst
@@ -513,6 +513,36 @@ Constants
     .. attribute:: PREFER_PTX
     .. attribute:: PREFER_BINARY
 
+.. class:: jit_input_type
+
+    Supported input data types to CUDA's JIT linker.
+    The attribute names and values of this class correspond to CUDA's
+    `enum CUjitInputType`.
+
+    CUDA 3.5 and above.
+
+    .. versionadded:: 2017.?
+
+    .. attribute:: CUBIN
+
+        Data contains compiled device-class-specific device code.
+
+    .. attribute:: PTX
+
+        Data contains PTX (Parallel Thread eXecution) source code.
+
+    .. attribute:: FATBINARY
+
+        Data contains a bundle of multiple cubins and/or PTX of some device code.
+
+    .. attribute:: OBJECT
+
+        Data contains a host object with embedded device code.
+
+    .. attribute:: LIBRARY
+
+        Data contains an archive of host objects with embedded device code.
+
 .. class:: host_alloc_flags
 
     Flags to be used to allocate :ref:`pagelocked_memory`.
@@ -558,6 +588,14 @@ Constants
 
     .. attribute:: LAZY_ENABLE_PEER_ACCESS
 
+    .. attribute:: DEV_RUNTIME_SYNC_DEPTH
+
+        CUDA 3.5 and above.
+
+    .. attribute:: DEV_RUNTIME_PENDING_LAUNCH_COUNT
+
+        CUDA 3.5 and above.
+
 Graphics-related constants
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -1993,6 +2031,290 @@ CUDA 4.0 and newer.
 Just-in-time Compilation
 ========================
 
+PyCUDA provides several tools to build a :class:`Module` from CUDA C++
+sources at run-time. A Module encapsulates binary GPU code and, once loaded
+on a device, acts as an interface to the functions and data pointers it
+holds in device memory.
+
+For the typical case of a single CUDA C++ source, see:
+
+- :class:`~pycuda.compiler.SourceModule` - A Module created using NVIDIA's
+  command line compiler NVCC, compiled and linked against CUDA's static
+  runtime. Unless you need any of the other features described below, this
+  should be your choice.
+- :class:`~pycuda.compiler.DynamicSourceModule` - A Module created using NVCC
+  or NVRTC and CUDA's Just-In-Time (JIT) linker. Facilitates linking against
+  CUDA's dynamic runtime, which is required for dynamic parallelism /
+  recursive kernels.
+
+When you need to compile from multiple input sources, see:
+
+- :class:`~pycuda.compiler.DynamicModule` - A Module created from an arbitrary
+  mix of CUDA C++ sources (compiled with either NVCC or NVRTC), PTX sources
+  and CUDA libraries with CUDA's Just-In-Time (JIT) linker.
+
+All classes exhibit the same public interface as :class:`Module`, but do not
+inherit from it. To compile a CUDA C++ source into a plain string without
+building a Module, see:
+
+- :func:`compiler.compile() <pycuda.compiler.compile>` for NVCC, returns
+  CUBIN binary code
+- :meth:`NvrtcCompiler.compile` for NVRTC, returns PTX source code
+
+NVRTC
+-----
+
+NVIDIA's Run-Time-Compiler
+`NVRTC <https://docs.nvidia.com/cuda/nvrtc/index.html>`_
+is a light-weight alternative to their Command-Line-Compiler
+`NVCC <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_.
+Some of the differences:
+
+- NVRTC is a pure CUDA C++ source to PTX (Parallel Thread eXecution) code
+  compiler and thus covers a much smaller scope than NVCC.
+- NVRTC is an API, whereas NVCC is an executable.
+- NVRTC supports only a `small subset
+  <https://docs.nvidia.com/cuda/nvrtc/index.html>`_
+  of NVCC's `compiler options
+  <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_.
+- NVRTC does not support certain preprocessor directives and, for example,
+  fails to parse some STL header files; just like in NVCC, however, the
+  ``printf()`` function is directly available in CUDA sources.
+- NVRTC requires extra steps to access functions in compiled CUDA
+  sources from the host. The names of these functions must be declared
+  to NVRTC before compiling, and after compilation they can only be
+  looked up in the Module by using their C++ mangled names.
+
+PyCUDA already reduces the computational overhead involved with NVCC by
+maintaining a file-system based cache, which means that it invokes NVCC
+only once for each unique combination of CUDA source and compiler setup.
+For applications that generate volatile CUDA C++ sources dynamically at
+run-time and thereby cause overly many cache misses in PyCUDA's NVCC
+cache, using NVRTC instead of NVCC should be considered if CUDA compile
+time is a problem.
+
+See also:
+
+    :class:`~pycuda.compiler.DynamicSourceModule`,
+    :meth:`DynamicModule.add_source_nvrtc() <pycuda.compiler.DynamicModule.add_source_nvrtc>` and
+    :class:`NvrtcCompiler`
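+
+Example: compiling a CUDA C++ source to PTX with NVRTC and resolving an
+exported function's mangled name (a minimal sketch based on the interfaces
+documented below; the kernel itself is made up for illustration):
+
+.. code-block:: python
+
+    import pycuda.autoinit  # NvrtcCompiler needs an active context
+    from pycuda.driver import NvrtcCompiler
+
+    source = """
+    __global__ void scale(float *v, float a)
+    {
+        v[threadIdx.x] *= a;
+    }
+    """
+
+    mangled_exports = {}
+    compiler = NvrtcCompiler()
+    # Exported names must be declared before compilation (see above);
+    # their mangled counterparts are collected in mangled_exports.
+    ptx = compiler.compile(source, exports=["scale"],
+                           mangled_exports=mangled_exports)
+    print(mangled_exports["scale"])  # the C++ mangled name of "scale"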
+
+Dynamic Runtime
+---------------
+
+Be aware that merely linking against CUDA's dynamic device runtime library
+*cudadevrt* has an overall negative performance impact.
+
+.. warning::
+
+    From CUDA's `Dynamic Parallelism Programming Guide
+    <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-dynamic-parallelism>`_
+    (CUDA 8.0):
+
+        *C.4.2.2. Dynamic-parallelism-enabled Kernel Overhead*
+
+        System software which is active when controlling dynamic launches
+        may impose an overhead on any kernel which is running at the time,
+        whether or not it invokes kernel launches of its own. This
+        overhead arises from the device runtime's execution tracking and
+        management software and may result in decreased performance for
+        e.g., library calls when made from the device compared to from the
+        host side. This overhead is, in general, incurred for applications
+        that link against the device runtime library.
+
+It is therefore recommended to use *cudadevrt* only when needed.
+
+.. class:: JitLinker([link_options [, message_handler [, log_verbose]]])
+
+    .. versionadded:: 2017.?
+
+    Create an object that provides access to CUDA's Just-In-Time (JIT) linker.
+
+    :param link_options:
+        JIT linker options
+    :type link_options:
+        list(jit_option) or None
+    :param message_handler:
+        Optional callback on success or failure (see below)
+    :type message_handler:
+        callable or None
+    :param bool log_verbose:
+        If *True*, instruct linker to create detailed log output.
+        Default: *False*
+
+    The *message_handler* callback's Python function prototype is
+
+    .. code-block:: python
+
+        def message_handler(bool succeeded, str info_str, str error_str)
+
+    where *succeeded* indicates success (*True*) or failure (*False*) of the
+    linker invocation, *info_str* contains informational linker log output and
+    *error_str* any error output. Note that in case of a linker error, an
+    exception containing *error_str* is raised after *message_handler* has
+    been called.
+
+    CUDA 3.5 and above.
+
+    .. method:: add_data(data, input_type, name=None)
+
+        Add *data* with type *input_type* to this linker session.
+
+        :param data:
+            A string (Python 2) or bytes (Python 3) object containing the
+            data to add
+        :type data:
+            str or bytes
+        :param input_type:
+            JIT linker input type of *data*
+        :type input_type:
+            jit_input_type
+        :param name:
+            Specifies to the linker under which name to refer to *data* in
+            its warning and error messages. If *None*, defaults to ``"unknown"``
+        :type name:
+            str or None
+
+    .. method:: add_file(filename, input_type)
+
+        Add the content of file *filename* with type *input_type* to this
+        linker session.
+
+        :param str filename:
+            The name of the file to add
+        :param input_type:
+            JIT linker input type of the file's content
+        :type input_type:
+            jit_input_type
+
+    .. method:: add_source_nvrtc(source, compiler, name=None,\
+        options=None, exports=None, mangled_exports=None)
+
+        Compile CUDA source code *source* using NVRTC *compiler* with options
+        *options*, then add the resulting PTX code to this linker
+        session.
+
+        :param source:
+            CUDA C++ source code
+        :type source:
+            str
+        :param compiler:
+            The NVRTC compiler to use to compile *source*
+        :type compiler:
+            NvrtcCompiler
+        :param name:
+            Specifies to the compiler and linker under which name to refer to
+            *source* in their warning and error messages. If *None*, defaults
+            to ``"kernel.ptx"``
+        :type name:
+            str or None
+        :param options:
+            List of NVRTC compile options
+        :type options:
+            list(str) or None
+        :param exports:
+            Function names in *source* that the caller wants to be exported
+        :type exports:
+            list(str) or None
+        :param mangled_exports:
+            Output variable, receives pairs of function names from *exports*
+            and their mangled counterparts
+        :type mangled_exports:
+            dict(str, str) or None
+        :see:
+            :meth:`NvrtcCompiler.compile`
+
+        CUDA 8.0 and above.
+
+    .. method:: link_module()
+
+        Close this linker session and return the new :class:`Module` object.
+
+        :return:
+            the newly created :class:`Module` instance
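+
+    Example: linking a module from a prebuilt PTX file (a minimal sketch;
+    the file name and the callback are made up for illustration):
+
+    .. code-block:: python
+
+        import pycuda.autoinit
+        from pycuda.driver import JitLinker, jit_input_type
+
+        def on_link(succeeded, info_str, error_str):
+            # called before any exception is raised on failure
+            print(info_str if succeeded else error_str)
+
+        # positional arguments: link_options, message_handler, log_verbose
+        linker = JitLinker(None, on_link, True)
+        linker.add_file("kernels.ptx", jit_input_type.PTX)
+        mod = linker.link_module()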
+
+.. class:: NvrtcCompiler([options[, include_dirs [, arch [, headers]]]])
+
+    .. versionadded:: 2017.?
+
+    Create an object that provides access to NVIDIA's Run-Time-Compiler (NVRTC).
+
+    Configure the compiler to add the CUDA SDK include path, the PyCUDA
+    include path and all directory names in *include_dirs* to its include
+    search paths. The global compiler options *options* provided in
+    this constructor can later be extended per CUDA source, see
+    :meth:`compile`.
+
+    Note that NVRTC `supports
+    <https://docs.nvidia.com/cuda/nvrtc/index.html>`_
+    only a small subset of `NVCC's
+    <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_
+    compiler options.
+
+    :param options:
+        List of NVRTC compile options to use with this compiler instance.
+    :type options:
+        list(str) or None
+    :param include_dirs:
+        List of extra directory names to use when processing ``#include``
+        preprocessor directives in CUDA sources.
+    :type include_dirs:
+        list(str) or None
+    :param arch:
+        String of the form ``compute_<major><minor>`` where ``<major>`` and
+        ``<minor>`` are the two digits of a compute capability
+        pair ``(<major>, <minor>)``. If *None*, defaults to the current
+        context's device's compute capability.
+    :type arch:
+        str or None
+    :param headers:
+        A `dict` that maps virtual header file names to their respective
+        content. These extra in-memory headers are made visible to and can
+        be included in CUDA sources processed by this compiler just like
+        regular include files by using the ``#include`` preprocessor
+        directive.
+    :type headers:
+        dict(str, str) or None
+
+    .. method:: compile(source, name=None, options=None, exports=None,\
+        mangled_exports=None)
+
+        Compile CUDA source code *source* with NVRTC using the compile options
+        specified earlier in the constructor combined with *options*
+        and return the resulting PTX code to the caller. Do not upload the
+        code to the GPU.
+
+        Declare all function names in *exports* to the compiler, and after
+        successful compilation, insert each pair of exported function name
+        in *exports* and its C++ mangled function name into *mangled_exports*.
+
+        :param str source:
+            CUDA C++ source code
+        :param name:
+            Specifies to the compiler under which name to refer to *source*
+            in its warning and error messages. If *None*, defaults to
+            ``"kernel.ptx"``
+        :type name:
+            str or None
+        :param options:
+            List of additional NVRTC compile options, combined with those
+            supplied in the constructor
+        :type options:
+            list(str) or None
+        :param exports:
+            Function names in *source* that the caller wants to be exported
+        :type exports:
+            list(str) or None
+        :param mangled_exports:
+            Output variable, receives pairs of function names from *exports*
+            and their mangled counterparts
+        :type mangled_exports:
+            dict(str, str) or None
+        :return:
+            the compiled PTX code string
+
+        CUDA 8.0 and above.
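+
+    Example: supplying an in-memory header to the compiler (a minimal
+    sketch; the header name and contents are made up for illustration):
+
+    .. code-block:: python
+
+        import pycuda.autoinit
+        from pycuda.driver import NvrtcCompiler
+
+        headers = {"block_size.h": "#define BLOCK_SIZE 128\n"}
+        # positional arguments: options, include_dirs, arch, headers
+        compiler = NvrtcCompiler(None, None, None, headers)
+
+        source = """
+        #include <block_size.h>
+        __global__ void fill(int *v)
+        {
+            v[threadIdx.x] = BLOCK_SIZE;
+        }
+        """
+        ptx = compiler.compile(source, exports=["fill"], mangled_exports={})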
+
 .. module:: pycuda.compiler
 
 .. data:: DEFAULT_NVCC_FLAGS
@@ -2042,11 +2364,351 @@ Just-in-time Compilation
     *Change note:* :class:`SourceModule` was moved from :mod:`pycuda.driver` to
     :mod:`pycuda.compiler` in version 0.93.
 
-.. function:: compile(source, nvcc="nvcc", options=None, keep=False,
-    no_extern_c=False, arch=None, code=None, cache_dir=None,
+.. function:: compile(source, nvcc="nvcc", options=None, keep=False,\
+    no_extern_c=False, arch=None, code=None, cache_dir=None,\
     include_dirs=[])
 
     Perform the same compilation as the corresponding
     :class:`SourceModule` constructor, but only return resulting *cubin*
     file as a string. In particular, do not upload the code to the GPU.
+
+.. class:: DynamicSourceModule(source, nvcc=None, options=None, keep=False,\
+    no_extern_c=False, arch=None, code=None, cache_dir=None,\
+    include_dirs=None, use_cudadevrt=False, cuda_libdir=None,\
+    exports=None)
+
+    .. versionadded:: 2017.?
+
+    Create a :class:`~pycuda.driver.Module` from CUDA source code *source*
+    using either NVCC or NVRTC with optional support for the dynamic CUDA
+    runtime `cudadevrt` (needed for recursive launches/dynamic parallelism).
+
+    :param source:
+        CUDA C++ source code
+    :type source:
+        str
+    :param nvcc:
+        File system path of the NVCC executable. If *None*, defaults to ``nvcc``
+    :type nvcc:
+        str or None
+    :param options:
+        List of NVCC or NVRTC compile options
+    :type options:
+        list(str) or None
+    :param keep:
+        If *True*, keep compiler output and print a line indicating its
+        location in the file system for debugging purposes.
+        Default: *False* (NVCC only)
+    :type keep:
+        bool
+    :param no_extern_c:
+        If *False*, wrap the given source code in ``extern "C" { ... }`` to
+        prevent C++ name mangling. Default: *False* (NVCC only)
+    :type no_extern_c:
+        bool
+    :param arch:
+        The name of the class of NVIDIA virtual GPU architecture for which to
+        compile the CUDA C++ source. If *None*, defaults to the current
+        context's device's compute capability
+    :type arch:
+        str or None
+    :param code:
+        The name of the NVIDIA GPU to assemble and optimize PTX code for. If
+        *None*, leave it unspecified (NVCC only)
+    :type code:
+        str or None
+    :param cache_dir:
+        The directory used by NVCC for compiler caching. If *None* (default),
+        then `cache_dir` is taken to be :envvar:`PYCUDA_CACHE_DIR` if set or a
+        sensible per-user default. If *False*, caching is disabled (NVCC only)
+    :type cache_dir:
+        str or None
+    :param include_dirs:
+        List of extra directory names to use when processing ``#include``
+        preprocessor directives in the CUDA source.
+    :type include_dirs:
+        list(str) or None
+    :param use_cudadevrt:
+        If *True*, link to the dynamic CUDA runtime, meaning: instruct the
+        compiler to generate relocatable PTX device code from *source* that
+        can be linked with other relocatable device code, and statically link
+        the standard library `cudadevrt` to this module. Default: *False*
+        (link static runtime)
+    :type use_cudadevrt:
+        bool
+    :param cuda_libdir:
+        The absolute path in the local file system of the CUDA SDK 64-bit
+        library directory
+    :type cuda_libdir:
+        str or None
+    :param exports:
+        If not *None*, use NVRTC instead of NVCC to compile *source*.
+        This argument contains the list of function names in *source* that
+        the caller later wants to access using :meth:`get_function`
+    :type exports:
+        list(str) or None
+
+    This class exhibits the same public interface as
+    :class:`~pycuda.driver.Module`, but does not inherit from it.
+
+    CUDA 3.5 and above, 8.0 and above when using NVRTC.
+
+    .. method:: get_function(name)
+
+        :see:
+            :meth:`Module.get_function() <pycuda.driver.Module.get_function>`
+
+    .. method:: get_global(name)
+
+        :see:
+            :meth:`Module.get_global() <pycuda.driver.Module.get_global>`
+
+    .. method:: get_texref(name)
+
+        :see:
+            :meth:`Module.get_texref() <pycuda.driver.Module.get_texref>`
+
+    .. method:: get_surfref(name)
+
+        :see:
+            :meth:`Module.get_surfref() <pycuda.driver.Module.get_surfref>`
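+
+    Example: a kernel that launches itself from the device, which requires
+    the dynamic runtime (a minimal sketch; the kernel is made up for
+    illustration):
+
+    .. code-block:: python
+
+        import numpy
+        import pycuda.autoinit
+        from pycuda.compiler import DynamicSourceModule
+
+        source = """
+        #include <cstdio>
+        __global__ void hello(int depth)
+        {
+            printf("hello at depth %d\\n", depth);
+            if (depth < 2)
+                hello<<<1, 1>>>(depth + 1);  // device-side launch
+        }
+        """
+
+        # use_cudadevrt=True enables relocatable device code and links cudadevrt
+        mod = DynamicSourceModule(source, use_cudadevrt=True)
+        kernel = mod.get_function("hello")
+        kernel(numpy.int32(0), grid=(1, 1), block=(1, 1, 1))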
+
+.. class:: DynamicModule(nvcc=None, keep=False, no_extern_c=False,\
+    arch=None, code=None, cache_dir=None, include_dirs=None,\
+    link_options=None, message_handler=None, log_verbose=False,\
+    cuda_libdir=None)
+
+    .. versionadded:: 2017.?
+
+    Build a :class:`~pycuda.driver.Module` from an arbitrary mix of CUDA C++
+    sources (compiled with either NVCC or NVRTC), PTX sources and CUDA
+    libraries with CUDA's Just-In-Time (JIT) linker.
+
+    General usage pattern for this class (a complete example follows the
+    parameter list below):
+
+    1. Create a module builder using constructor :meth:`DynamicModule`
+    2. Use the ``add_...()`` methods as often as needed and in any order:
+
+       - :meth:`add_source_nvcc(source) <DynamicModule.add_source_nvcc>` - to compile *source* with NVCC and add the result
+       - :meth:`add_source_nvrtc(source) <DynamicModule.add_source_nvrtc>` - to compile *source* with NVRTC and add the result
+       - :meth:`add_data(data) <DynamicModule.add_data>` - to add in-memory data to the module
+       - :meth:`add_file(filename) <DynamicModule.add_file>` - to add files to the module
+       - :meth:`add_stdlib(libname) <DynamicModule.add_stdlib>` - to add SDK libraries to the module
+
+    3. Link and upload the module code to the GPU by calling :meth:`link` once
+    4. Access the module's kernel functions on the GPU using :meth:`get_function`
+
+    :param nvcc:
+        File system path of the NVCC executable. If *None*, defaults to ``nvcc``
+    :type nvcc:
+        str or None
+    :param keep:
+        If *True*, keep compiler outputs and print a line indicating their
+        location in the file system for debugging purposes.
+        Default: *False* (NVCC only)
+    :type keep:
+        bool
+    :param no_extern_c:
+        If *False*, wrap the given source codes in ``extern "C" { ... }`` to
+        prevent C++ name mangling. Default: *False* (NVCC only)
+    :type no_extern_c:
+        bool
+    :param arch:
+        The name of the class of NVIDIA virtual GPU architecture for which to
+        compile the CUDA C++ sources. If *None*, defaults to the current
+        context's device's compute capability
+    :type arch:
+        str or None
+    :param code:
+        The name of the NVIDIA GPU to assemble and optimize PTX code for. If
+        *None*, leave it unspecified (NVCC only)
+    :type code:
+        str or None
+    :param cache_dir:
+        The directory used by NVCC for compiler caching. If *None* (default),
+        then `cache_dir` is taken to be :envvar:`PYCUDA_CACHE_DIR` if set or a
+        sensible per-user default. If *False*, caching is disabled (NVCC only)
+    :type cache_dir:
+        str or None
+    :param include_dirs:
+        List of extra directory names to use when processing ``#include``
+        preprocessor directives in CUDA sources.
+    :type include_dirs:
+        list(str) or None
+    :param link_options:
+        Options for the JIT linker :class:`~pycuda.driver.JitLinker`
+    :type link_options:
+        list(jit_option) or None
+    :param message_handler:
+        Optional callback on success or failure
+        (see :class:`~pycuda.driver.JitLinker`)
+    :type message_handler:
+        callable or None
+    :param bool log_verbose:
+        If *True*, instruct linker to create detailed log output.
+        Default: *False*
+    :param cuda_libdir:
+        The absolute path in the local file system of the CUDA SDK 64-bit
+        library directory
+    :type cuda_libdir:
+        str or None
+
+    After :meth:`link` has been called, objects of this class exhibit
+    the same public interface as :class:`~pycuda.driver.Module`, but do not
+    inherit from it.
+
+    CUDA 3.5 and above.
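+
+    Example: building one module from two separately compiled sources
+    (a minimal sketch; the kernels are made up for illustration):
+
+    .. code-block:: python
+
+        import pycuda.autoinit
+        from pycuda.compiler import DynamicModule, DEFAULT_NVCC_FLAGS
+
+        outer_cu = """
+        __global__ void outer()
+        {
+            extern __global__ void inner();
+            inner<<<1, 1>>>();
+        }
+        """
+        inner_cu = """
+        __global__ void inner() { }
+        """
+
+        opts = ['-rdc=true', '-lcudadevrt'] + DEFAULT_NVCC_FLAGS
+        mod = DynamicModule() \
+            .add_source_nvcc(outer_cu, options=opts) \
+            .add_source_nvcc(inner_cu, options=opts) \
+            .add_stdlib('cudadevrt') \
+            .link()
+        outer_kernel = mod.get_function('outer')
+        outer_kernel(grid=(1, 1), block=(1, 1, 1))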
+
+    .. method:: add_source_nvcc(source, options=None, name=None)
+
+        Compile CUDA source code *source* with NVIDIA's Command-Line-Compiler
+        NVCC using compiler options *options* and add the resulting
+        PTX code to the linker session of this module.
+
+        :param str source:
+            CUDA C++ source code
+        :param options:
+            List of NVCC compiler options
+        :type options:
+            list(str) or None
+        :param name:
+            Specifies to the compiler and linker under which name to refer to
+            *source* in their warning and error messages. If *None*, defaults
+            to ``"kernel.ptx"``
+        :type name:
+            str or None
+        :return:
+            *self*
+
+    .. method:: add_source_nvrtc(source, options=None,\
+        name=None, exports=None, compiler=None)
+
+        Compile CUDA source code *source* with NVIDIA's Run-Time-Compiler
+        NVRTC using compiler options *options* and add the resulting
+        PTX code to the linker session of this module.
+
+        :param str source:
+            CUDA C++ source code
+        :param options:
+            List of NVRTC compile options
+        :type options:
+            list(str) or None
+        :param name:
+            Specifies to the compiler and linker under which name to refer to
+            *source* in their warning and error messages. If *None*, defaults
+            to ``"kernel.ptx"``
+        :type name:
+            str or None
+        :param exports:
+            Function names in *source* that the caller wants to be exported
+        :type exports:
+            list(str) or None
+        :param compiler:
+            The NVRTC compiler to use to compile *source*. If *None*, a
+            default compiler instance is created and used
+        :type compiler:
+            NvrtcCompiler or None
+        :return:
+            *self*
+        :see:
+            :meth:`JitLinker.add_source_nvrtc() <pycuda.driver.JitLinker.add_source_nvrtc>`
+
+        CUDA 8.0 and above.
+
+    .. method:: add_data(data, input_type, name=None)
+
+        Add the binary data in *data* to the linker session of this module.
+
+        :param data:
+            A string (Python 2) or bytes (Python 3) object containing the data to add
+        :type data:
+            str or bytes
+        :param input_type:
+            JIT linker input type of *data*
+        :type input_type:
+            jit_input_type
+        :param name:
+            Specifies to the linker under which name to refer to *data* in
+            its warning and error messages. If *None*, defaults
+            to ``"unknown"``
+        :type name:
+            str or None
+        :return:
+            *self*
+        :see:
+            :meth:`JitLinker.add_data() <pycuda.driver.JitLinker.add_data>`
+
+    .. method:: add_file(filename, input_type)
+
+        Add the content of file *filename* to the linker session of this
+        module.
+
+        :param str filename:
+            The name of the file to add
+        :param input_type:
+            JIT linker input type of the file's content
+        :type input_type:
+            jit_input_type
+        :return:
+            *self*
+        :see:
+            :meth:`JitLinker.add_file() <pycuda.driver.JitLinker.add_file>`
+
+    .. method:: add_stdlib(libname)
+
+        Add the standard library *libname* to the linker session of this
+        module.
+
+        A standard library is part of the CUDA SDK. If this method fails
+        to auto-locate the correct CUDA SDK installation path on your system,
+        use argument *cuda_libdir* in the constructor to set this path
+        manually to the 64-bit library directory of your local CUDA SDK
+        installation.
+
+        :param str libname:
+            The shortened library file name, which is expanded internally
+            to the correct library file name for the local system; for
+            example, `cudadevrt` expands to `cudadevrt.lib` on Windows and
+            to `libcudadevrt.a` on Linux.
+        :return:
+            *self*
+
+    .. method:: link()
+
+        Complete the linker session and create the :class:`~pycuda.driver.Module`.
+
+        After calling this method, calling the *add_...* methods of this
+        instance is no longer allowed; accessing module functions
+        through this instance using :meth:`get_function` is only
+        allowed after calling this method.
+
+        :return:
+            *self*
+
+    .. method:: get_function(name)
+
+        This method is not defined before calling :meth:`link`.
+
+        :see:
+            :meth:`Module.get_function() <pycuda.driver.Module.get_function>`
+
+    .. method:: get_global(name)
+
+        This method is not defined before calling :meth:`link`.
+
+        :see:
+            :meth:`Module.get_global() <pycuda.driver.Module.get_global>`
+
+    .. method:: get_texref(name)
+
+        This method is not defined before calling :meth:`link`.
+
+        :see:
+            :meth:`Module.get_texref() <pycuda.driver.Module.get_texref>`
+
+    .. method:: get_surfref(name)
+
+        This method is not defined before calling :meth:`link`.
+
+        :see:
+            :meth:`Module.get_surfref() <pycuda.driver.Module.get_surfref>`
diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py
index d5435987420dec7fa4542b5c11667dfda9d3f9dd..b2c0b4e6e65621022561ef69938a5b7387f74123 100644
--- a/examples/demo_cdpSimplePrint.py
+++ b/examples/demo_cdpSimplePrint.py
@@ -95,7 +95,7 @@ def main(argv):
 
     print("starting Simple Print (CUDA Dynamic Parallelism)")
 
-    mod = DynamicSourceModule(cdpSimplePrint_cu)
+    mod = DynamicSourceModule(cdpSimplePrint_cu, use_cudadevrt=True)
     cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call
 
     print("***************************************************************************")
diff --git a/pycuda/compiler.py b/pycuda/compiler.py
index b73793b6dee779f268cab3f80929f30012c3c3f5..7ad26ae0e36b2afbc61bad642c6ae42dcc4c8964 100644
--- a/pycuda/compiler.py
+++ b/pycuda/compiler.py
@@ -278,10 +278,6 @@ class CudaModule(object):
         return self.module.get_function(name)
 
 class SourceModule(CudaModule):
-    '''
-    Creates a Module from a single .cu source object linked against the
-    static CUDA runtime.
-    '''
     def __init__(self, source, nvcc="nvcc", options=None, keep=False,
             no_extern_c=False, arch=None, code=None, cache_dir=None,
             include_dirs=[]):
@@ -318,22 +314,20 @@ def _find_nvcc_on_path():
 
 
 class DynamicModule(CudaModule):
-    '''
-    Creates a Module from multiple .cu source, library file and/or data
-    objects linked against the static or dynamic CUDA runtime.
-    '''
-    def __init__(self, nvcc='nvcc', link_options=None, keep=False,
-            no_extern_c=False, arch=None, code=None, cache_dir=None,
-            include_dirs=[], message_handler=None, log_verbose=False,
-            cuda_libdir=None):
+    def __init__(self, nvcc=None, keep=False, no_extern_c=False, arch=None,
+            code=None, cache_dir=None, include_dirs=None, link_options=None,
+            message_handler=None, log_verbose=False, cuda_libdir=None):
         from pycuda.driver import Context
         compute_capability = Context.get_device().compute_capability()
         if compute_capability < (3,5):
-            raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' %
+            raise RuntimeError('Minimum compute capability for JIT linker is 3.5 (found: %u.%u)!' %
                 (compute_capability[0], compute_capability[1]))
         else:
-            from pycuda.driver import Linker
-            self.linker = Linker(message_handler, link_options, log_verbose)
+            from pycuda.driver import JitLinker
+            self.linker = JitLinker(link_options, message_handler, log_verbose)
         self._check_arch(arch)
         self.nvcc = nvcc
         self.keep = keep
@@ -344,6 +338,7 @@ class DynamicModule(CudaModule):
         self.include_dirs = include_dirs
         self.cuda_libdir = cuda_libdir
         self.libdir, self.libptn = None, None
+        self.mangled_exports = None
         self.module = None
 
     def _locate_cuda_libdir(self):
@@ -407,16 +402,41 @@ class DynamicModule(CudaModule):
                 'directory, set CUDA library path manually')
         return libdir, libptn
 
-    def add_source(self, source, nvcc_options=None, name='kernel.ptx'):
-        ptx = compile(source, nvcc=self.nvcc, options=nvcc_options,
+    def add_source_nvcc(self, source, options=None, name=None):
+        if self.nvcc is None:
+            nvcc = 'nvcc'
+        else:
+            nvcc = self.nvcc
+        if self.include_dirs is None:
+            include_dirs = []
+        else:
+            include_dirs = self.include_dirs
+        ptx = compile(source, nvcc=nvcc, options=options,
             keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch,
             code=self.code, cache_dir=self.cache_dir,
-            include_dirs=self.include_dirs, target="ptx")
+            include_dirs=include_dirs, target="ptx")
         from pycuda.driver import jit_input_type
+        if name is None:
+            name = 'kernel.ptx'
         self.linker.add_data(ptx, jit_input_type.PTX, name)
         return self
 
-    def add_data(self, data, input_type, name='unknown'):
+    def add_source_nvrtc(self, source, options=None,
+            name=None, exports=None, compiler=None):
+        if not hasattr(self.linker, "add_source_nvrtc"):
+            raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!')
+        if self.mangled_exports is None and exports:
+            self.mangled_exports = {}
+        if compiler is None:
+            from pycuda.driver import NvrtcCompiler
+            compiler = NvrtcCompiler(options, self.include_dirs, self.arch)
+            options = None  # options already applied via the compiler instance
+        self.linker.add_source_nvrtc(source, compiler, name=name,
+            options=options, exports=exports,
+            mangled_exports=self.mangled_exports)
+        return self
+
+    def add_data(self, data, input_type, name=None):
         self.linker.add_data(data, input_type, name)
         return self
 
@@ -441,6 +461,12 @@ class DynamicModule(CudaModule):
         self._bind_module()
         return self
 
+    def get_function(self, name):
+        if self.mangled_exports is None or name not in self.mangled_exports:
+            return self.module.get_function(name)
+        else:
+            return self.module.get_function(self.mangled_exports[name])
+
 
 class DynamicSourceModule(DynamicModule):
     '''
     Creates a Module from a single .cu source object linked against the
     dynamic CUDA runtime.
     - compiler generates PTX relocatable device code
     - source is linked against the CUDA device runtime library cudadevrt
     - library cudadevrt is statically linked into the generated Module
     '''
-    def __init__(self, source, nvcc="nvcc", options=None, keep=False,
+    def __init__(self, source, nvcc=None, options=None, keep=False,
             no_extern_c=False, arch=None, code=None, cache_dir=None,
-            include_dirs=[], cuda_libdir=None):
+            include_dirs=None, use_cudadevrt=False, cuda_libdir=None,
+            exports=None):
         super(DynamicSourceModule, self).__init__(nvcc=nvcc,
-            link_options=None, keep=keep, no_extern_c=no_extern_c,
-            arch=arch, code=code, cache_dir=cache_dir,
-            include_dirs=include_dirs, cuda_libdir=cuda_libdir)
+            keep=keep, no_extern_c=no_extern_c, arch=arch, code=code,
+            cache_dir=cache_dir, include_dirs=include_dirs, link_options=None,
+            cuda_libdir=cuda_libdir)
         if options is None:
-            options = DEFAULT_NVCC_FLAGS
-        options = options[:]
-        if '-rdc=true' not in options:
+            options = DEFAULT_NVCC_FLAGS[:]
+        else:
+            options = options[:]
+        if use_cudadevrt and '-rdc=true' not in options:
             options.append('-rdc=true')
-        if '-lcudadevrt' not in options:
-            options.append('-lcudadevrt')
-        self.add_source(source, nvcc_options=options)
-        self.add_stdlib('cudadevrt')
+        if exports is None:
+            if use_cudadevrt and '-lcudadevrt' not in options:
+                options.append('-lcudadevrt')
+            self.add_source_nvcc(source, options=options)
+        else:
+            self.add_source_nvrtc(source, options=options, exports=exports)
+        if use_cudadevrt:
+            self.add_stdlib('cudadevrt')
+        self.link()
diff --git a/setup.py b/setup.py
index 775141b0d91db62fafd716cb66e9124fa10e628f..a398920a03047441d13d9a2c111864f997b7bcec 100644
--- a/setup.py
+++ b/setup.py
@@ -91,6 +91,9 @@ def get_config_schema():
         LibraryDir("CURAND", default_lib_dirs),
         Libraries("CURAND", ["curand"]),
 
+        LibraryDir("NVRTC", default_lib_dirs),
+        Libraries("NVRTC", ["nvrtc"]),
+
         StringListOption("CXXFLAGS", cxxflags_default,
             help="Any extra C++ compiler options to include"),
         StringListOption("LDFLAGS", ldflags_default,
@@ -117,7 +120,7 @@ def main():
     LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"]
     LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"]
-            + conf["CUDADRV_LIBNAME"])
+            + conf["CUDADRV_LIBNAME"]) + conf["NVRTC_LIBNAME"]
 
     if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]:
         conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")]
diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index 8a2488deccf6fecfa6350faaa013246de2a2c360..eead66450bdb5eff3e3a79ce81fa0567e1bb5c62 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -9,6 +9,8 @@
 #include
 
+#define USE_JIT
+
 #if CUDAPP_CUDA_VERSION < 1010
@@ -410,10 +412,273 @@ namespace
 
   // }}}
 
-  // {{{ linker
+  // {{{ nvrtc compiler
+
+#ifdef USE_JIT
+
+#if CUDAPP_CUDA_VERSION >= 8000
+  #include <nvrtc.h> // NOTE: this requires us to link static SDK library "nvrtc"
+
+  class NvrtcProgram
+  {
+    private:
+      nvrtcProgram m_prog;
+
+      const std::string compilation_log_str() const {
+        size_t result_length;
+        nvrtcResult nvrtc_result = nvrtcGetProgramLogSize(m_prog, &result_length);
+        check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result);
+        if (result_length == 0) {
+          return std::string();
+        }
+        std::vector<char> result_vec(result_length);
+        nvrtc_result = nvrtcGetProgramLog(m_prog, &result_vec[0]);
+        check_nvrtc_result("nvrtcGetProgramLog", nvrtc_result);
+        return std::string(&result_vec[0], result_length);
+      }
+
+    public:
+      NvrtcProgram(const char* source, const char* name,
+          const std::vector<const char*>& header_names_p,
+          const std::vector<const char*>& header_contents_p)
+        : m_prog()
+      {
+        const size_t header_count = header_names_p.size();
+        const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog,
+            source, name, (int)header_count,
+            (header_count > 0)? &header_contents_p[0] : NULL,
+            (header_count > 0)? &header_names_p[0] : NULL);
+        check_nvrtc_result("nvrtcCreateProgram", nvrtc_result);
+      }
+
+      ~NvrtcProgram() {
+        if (m_prog != NULL) {
+          nvrtcDestroyProgram(&m_prog);
+          m_prog = NULL;
+        }
+      }
+
+      nvrtcProgram prog() const {
+        return m_prog;
+      }
+
+      void check_nvrtc_result(const char* nvrtc_function_name, nvrtcResult nvrtc_result) const {
+        if (nvrtc_result != NVRTC_SUCCESS) {
+          std::string error_msg = nvrtc_function_name;
+          error_msg += " failed: ";
+          error_msg += nvrtcGetErrorString(nvrtc_result);
+          const std::string compilation_log = compilation_log_str();
+          if (compilation_log.length() > 0) {
+            error_msg += " - ";
+            error_msg += compilation_log;
+          }
+          throw std::runtime_error(error_msg);
+        }
+      }
+  };
+
+  class nvrtc_compiler : public boost::noncopyable
+  {
+    private:
+      std::vector<std::string> m_compile_options;
+      std::vector<std::string> m_header_names;
+      std::vector<std::string> m_header_contents;
+      std::vector<const char*> m_compile_options_p;
+      std::vector<const char*> m_header_names_p;
+      std::vector<const char*> m_header_contents_p;
+#ifdef _WIN32
+      std::vector<char> m_win32_getenv_buf;
+#endif
+
+      char* get_env(const char* varname) {
+#ifdef _WIN32
+        // Visual Studio 2015: std::getenv() causes compiler warning C4996
+        size_t required_size = 0;
+        getenv_s(&required_size, NULL, 0, varname);
+        if (required_size > 0) {
+          m_win32_getenv_buf.resize(required_size);
+          getenv_s(&required_size, &m_win32_getenv_buf[0], required_size, varname);
+          return &m_win32_getenv_buf[0];
+        }
+        else {
+          return NULL;
+        }
+#else
+        return std::getenv(varname);
+#endif
+      }
+
+      std::string find_pycuda_include_path() const {
+        // pycuda.compiler._find_pycuda_include_path():
+        //   from pkg_resources import Requirement, resource_filename
+        //   return resource_filename(Requirement.parse("pycuda"), "pycuda/cuda")
+        const py::object pkg_resources = py::import("pkg_resources");
+        const py::object resource_filename = pkg_resources.attr("resource_filename");
+        const py::object Requirement_parse = pkg_resources.attr("Requirement").attr("parse");
+        const py::object py_result = resource_filename(Requirement_parse("pycuda"), "pycuda/cuda");
+        return py::extract<std::string>(py_result);
+      }
+
+      void options_changed() {
+        const size_t n = m_compile_options.size();
+        m_compile_options_p.resize(n);
+        for (size_t i=0; i<n; i++) {
+          m_compile_options_p[i] = m_compile_options[i].c_str();
+        }
+      }
+
+      void header_changed() {
+        const size_t n = m_header_names.size();
+        m_header_names_p.resize(n);
+        m_header_contents_p.resize(n);
+        for (size_t i=0; i<n; i++) {
+          m_header_names_p[i] = m_header_names[i].c_str();
+          m_header_contents_p[i] = m_header_contents[i].c_str();
+        }
+      }
+
+    public:
+      nvrtc_compiler(py::object py_nvrtc_options = py::object(),
+          py::object py_include_dirs = py::object(),
+          py::object py_arch = py::object(),
+          py::object py_headers = py::object())
+      {
+        // Architecture (-arch) compiler option
+        std::string arch_compute_capability;
+        if (py_arch.ptr() != Py_None) {
+          const char* arch = py::extract<const char*>(py_arch);
+          if (strlen(arch) == 5 && std::memcmp(arch, "sm_", 3) == 0) {
+            arch_compute_capability = std::string("compute_") + &arch[3];
+          }
+          else {
+            arch_compute_capability = arch;
+          }
+        }
+        else {
+          const py::tuple py_compute_capability = context::get_device().compute_capability();
+          const char arch_major = '0' + py::extract<int>(py_compute_capability[0]);
+          const char arch_minor = '0' + py::extract<int>(py_compute_capability[1]);
+          arch_compute_capability = std::string("compute_") + arch_major + arch_minor;
+        }
+        m_compile_options.push_back("-arch");
+        m_compile_options.push_back(arch_compute_capability);
+
+        // Include directory (-I) compiler options
+        m_compile_options.push_back(std::string("-I") + find_pycuda_include_path());
+        const char* CUDA_SDK;
+        if ((CUDA_SDK = get_env("CUDA_PATH")) != NULL) {
+          m_compile_options.push_back(std::string("-I") + CUDA_SDK + "\\include");
+        }
+        else if ((CUDA_SDK = get_env("CUDA_ROOT")) != NULL) {
+          m_compile_options.push_back(std::string("-I") + CUDA_SDK + "/include");
+        }
+        if (py_include_dirs.ptr() != Py_None) {
+          PYTHON_FOREACH(py_include_dir, py_include_dirs) {
+            m_compile_options.push_back(std::string("-I") + py::extract<std::string>(py_include_dir)());
+          }
+        }
+
+        // Other compiler options specified by the caller
+        if (py_nvrtc_options.ptr() != Py_None) {
+          PYTHON_FOREACH(py_compiler_option, py_nvrtc_options) {
+            m_compile_options.push_back(py::extract<std::string>(py_compiler_option));
+          }
+        }
+
+        options_changed();
+
+        // In-memory NVRTC header files
+        if (py_headers.ptr() != Py_None) {
+          const py::dict py_headers_dict = py::extract<py::dict>(py_headers);
+          if (py::len(py_headers_dict) > 0) {
+            const py::list py_keys = py_headers_dict.keys();
+            PYTHON_FOREACH(py_key, py_keys) {
+              m_header_names.push_back(py::extract<std::string>(py_key));
+              m_header_contents.push_back(py::extract<std::string>(py_headers_dict[py_key]));
+            }
+            header_changed();
+          }
+        }
+      }
+
+      std::vector<char> compile(
+          const py::str& py_source,
+          const py::object& py_name,
+          const py::object& py_nvrtc_options,
+          const py::object& py_exports,
+          py::object& py_mangled_exports)
+      {
+        // Combine compile option lists into compile_options_p
+        std::vector<const char*> compile_options_p(m_compile_options_p);
+        if (py_nvrtc_options.ptr() != Py_None) {
+          PYTHON_FOREACH(py_compile_option, py_nvrtc_options) {
+            compile_options_p.push_back(py::extract<const char*>(py_compile_option));
+          }
+        }
+
+        // Create NVRTC program
+        const char* name = (py_name.ptr() != Py_None)?
+            py::extract<const char*>(py_name) : "kernel.ptx";
+        NvrtcProgram prog(py::extract<const char*>(py_source), name,
+            m_header_names_p, m_header_contents_p);
+
+        // Declare exported function names
+        nvrtcResult nvrtc_result;
+        if (py_exports.ptr() != Py_None) {
+          PYTHON_FOREACH(py_export, py_exports) {
+            nvrtc_result = nvrtcAddNameExpression(prog.prog(), py::extract<const char*>(py_export));
+            prog.check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result);
+          }
+        }
+
+        // Compile .cu => .ptx
+        nvrtc_result = nvrtcCompileProgram(prog.prog(),
+            (int)compile_options_p.size(), &compile_options_p[0]);
+        prog.check_nvrtc_result("nvrtcCompileProgram", nvrtc_result);
+
+        // Map exported function names to their mangled representation
+        if (py_exports.ptr() != Py_None && py_mangled_exports.ptr() != Py_None) {
+          py::dict py_mangled_exports_dict = py::extract<py::dict>(py_mangled_exports);
+          PYTHON_FOREACH(py_export, py_exports) {
+            const char* plain_name = py::extract<const char*>(py_export);
+            const char* mangled_name = NULL;
+            nvrtc_result = nvrtcGetLoweredName(prog.prog(), plain_name, &mangled_name);
+            prog.check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result);
+            py_mangled_exports_dict[py_export] = mangled_name;
+          }
+        }
+
+        // Copy compiled ptx result into ptx_buffer
+        size_t ptx_size;
+        nvrtc_result = nvrtcGetPTXSize(prog.prog(), &ptx_size);
+        prog.check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result);
+        std::vector<char> ptx_buffer(ptx_size);
+        nvrtc_result = nvrtcGetPTX(prog.prog(), &ptx_buffer[0]);
+        prog.check_nvrtc_result("nvrtcGetPTX", nvrtc_result);
+        return ptx_buffer;
+      }
+
+      std::string py_compile(
+          py::str py_source,
+          py::object py_name,
+          py::object py_nvrtc_options,
+          py::object py_exports,
+          py::object py_mangled_exports)
+      {
+        const std::vector<char> ptx = compile(py_source, py_name,
+            py_nvrtc_options, py_exports, py_mangled_exports);
+        return std::string(ptx.begin(), ptx.end());
+      }
+  };
+#endif
+
+  // }}}
+
+  // {{{ jit linker
 
 #if CUDAPP_CUDA_VERSION >= 5050
-  class Linker : public boost::noncopyable
+  class jit_linker : public boost::noncopyable
   {
     private:
       py::object m_message_handler;
@@ -459,8 +724,8 @@ namespace
       }
 
     public:
-      Linker(py::object message_handler = py::object(),
-          py::object py_options = py::object(),
+      jit_linker(py::object py_link_options = py::object(),
+          py::object message_handler = py::object(),
           py::object py_log_verbose = py::object(false))
         : m_message_handler(message_handler),
           m_link_state(),
@@ -472,8 +737,8 @@ namespace
         add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, sizeof(m_error_buf));
         add_option(CU_JIT_LOG_VERBOSE, m_log_verbose? 1ull : 0ull);
 
-        if (py_options.ptr() != Py_None) {
-          PYTHON_FOREACH(key_value, py_options) {
+        if (py_link_options.ptr() != Py_None) {
+          PYTHON_FOREACH(key_value, py_link_options) {
             add_option(
                 py::extract<CUjit_option>(key_value[0]),
                 py::extract<unsigned int>(key_value[1])());
@@ -488,12 +753,12 @@ namespace
         check_cu_result("cuLinkCreate", cu_result);
       }
 
-      ~Linker()
+      ~jit_linker()
       {
         close();
       }
 
-      void add_data(py::object py_data, CUjitInputType input_type, py::str py_name)
+      void add_data(py::object py_data, CUjitInputType input_type, py::object py_name)
       {
         const char *data_buf;
         PYCUDA_BUFFER_SIZE_T data_buf_len;
         if (PyObject_AsCharBuffer(py_data.ptr(), &data_buf, &data_buf_len) != 0) {
           throw py::error_already_set();
         }
         const char* name = (py_name.ptr() != Py_None)?
-            py::extract<const char*>(py_name) : NULL;
+            py::extract<const char*>(py_name) : "unknown";
         const CUresult cu_result = cuLinkAddData(m_link_state, input_type,
             static_cast<void*>(const_cast<char*>(data_buf)),
             data_buf_len, name, 0, NULL, NULL);
@@ -510,12 +775,29 @@ namespace
 
       void add_file(py::str py_filename, CUjitInputType input_type)
       {
         const char* filename = py::extract<const char*>(py_filename);
         const CUresult cu_result = cuLinkAddFile(m_link_state, input_type,
             filename, 0, NULL, NULL);
         check_cu_result("cuLinkAddFile", cu_result);
       }
 
+#if CUDAPP_CUDA_VERSION >= 8000
+      void add_source_nvrtc(py::str py_source, nvrtc_compiler* compiler,
+          py::object py_name, py::object py_nvrtc_options,
+          py::object py_exports, py::object py_mangled_exports)
+      {
+        // compile source into buffer ptx
+        std::vector<char> ptx = compiler->compile(py_source, py_name,
+            py_nvrtc_options, py_exports, py_mangled_exports);
+        // link buffer ptx
+        const char* name = (py_name.ptr() != Py_None)?
+            py::extract<const char*>(py_name) : "kernel.ptx";
+        const CUresult cu_result = cuLinkAddData(m_link_state,
+            CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL);
+        check_cu_result("cuLinkAddData", cu_result);
+      }
+#endif
+
       module* link_module()
       {
         char* cubin_data = NULL;
@@ -536,6 +818,8 @@ namespace
   };
 #endif
 
+#endif /* #ifdef USE_JIT */
+
   // }}}
 
   template <class T>
@@ -1192,7 +1476,28 @@ BOOST_PYTHON_MODULE(_driver)
 
   // }}}
 
-  // {{{ linker
+  // {{{ nvrtc compiler
+
+#ifdef USE_JIT
+
+#if CUDAPP_CUDA_VERSION >= 8000
+  py::class_<nvrtc_compiler, boost::noncopyable>("NvrtcCompiler")
+    .def(py::init<py::object>())
+    .def(py::init<py::object, py::object>())
+    .def(py::init<py::object, py::object, py::object>())
+    .def(py::init<py::object, py::object, py::object, py::object>())
+    .def("compile", &nvrtc_compiler::py_compile, (
+      py::arg("source"),
+      py::arg("name")=py::object(),
+      py::arg("options")=py::object(),
+      py::arg("exports")=py::object(),
+      py::arg("mangled_exports")=py::object()));
+#endif
+
+  // }}}
+
+  // {{{ jit linker
 
 #if CUDAPP_CUDA_VERSION >= 5050
   py::enum_<CUjitInputType>("jit_input_type")
     .value("CUBIN", CU_JIT_INPUT_CUBIN)
     .value("PTX", CU_JIT_INPUT_PTX)
     .value("FATBINARY", CU_JIT_INPUT_FATBINARY)
     .value("OBJECT", CU_JIT_INPUT_OBJECT)
     .value("LIBRARY", CU_JIT_INPUT_LIBRARY);
 
-  py::class_<Linker, shared_ptr<Linker> >("Linker")
+  py::class_<jit_linker, boost::noncopyable>("JitLinker")
     .def(py::init<py::object>())
     .def(py::init<py::object, py::object>())
     .def(py::init<py::object, py::object, py::object>())
-    .def("add_data", &Linker::add_data, (py::arg("data"), py::arg("input_type"), py::arg("name")=py::str("unknown")))
-    .def("add_file", &Linker::add_file, (py::arg("filename"), py::arg("input_type")))
-    .def("link_module", &Linker::link_module, py::return_value_policy<py::manage_new_object>());
-#endif
+    .def("add_data", &jit_linker::add_data, (
+      py::arg("data"),
+      py::arg("input_type"),
+      py::arg("name")=py::object()))
+    .def("add_file", &jit_linker::add_file, (
+      py::arg("filename"),
+      py::arg("input_type")))
+#if CUDAPP_CUDA_VERSION >= 8000
+    .def("add_source_nvrtc", &jit_linker::add_source_nvrtc, (
+      py::arg("source"),
+      py::arg("compiler"),
+      py::arg("name")=py::object(),
+      py::arg("options")=py::object(),
+      py::arg("exports")=py::object(),
+      py::arg("mangled_exports")=py::object()))
+#endif
+    .def("link_module", &jit_linker::link_module,
+      py::return_value_policy<py::manage_new_object>());
+#endif
+
+#endif /* #ifdef USE_JIT */
 
   // }}}
diff --git a/test/test_driver.py b/test/test_driver.py
index b440eff09e9791ea7977afd1cba550f0cebe173f..b3deed408c21d2afb52d68f6e957feb21e2a9d76 100644
--- a/test/test_driver.py
+++ b/test/test_driver.py
@@ -891,12 +891,14 @@ class TestDriver:
         drv.memcpy_htod(a_gpu, a)
         drv.memcpy_htod(b_gpu, b)
 
         from pycuda.compiler import DynamicSourceModule
-        mod = DynamicSourceModule(cuda_string, keep=True)
+        mod = DynamicSourceModule(cuda_string, keep=True, use_cudadevrt=True)
         func = mod.get_function("math")
         func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu,
              block=(100, 1, 1), grid=(1, 1, 1))
 
         drv.memcpy_dtoh(c, c_gpu)
         drv.memcpy_dtoh(d, d_gpu)
@@ -921,31 +923,36 @@ class TestDriver:
             from pytest import skip
             skip("need compute capability 3.5 or higher for dynamic parallelism")
 
-        test_outer_cu = '''#include <cstdio>
-        __global__ void test_kernel() {
-            extern __global__ void test_kernel_inner();
-            printf("Hello outer world!\\n");
-            test_kernel_inner<<<2, 1>>>();
-        }'''
+        test_outer_cu = '''
+        #include <cstdio>
+        __global__ void test_kernel() {
+            extern __global__ void test_kernel_inner();
+            printf("Hello outer world!\\n");
+            test_kernel_inner<<<2, 1>>>();
+        }
+        '''
 
-        test_inner_cu = '''#include <cstdio>
-        __global__ void test_kernel_inner() {
-            printf("  Hello inner world!\\n");
-        }'''
+        test_inner_cu = '''
+        #include <cstdio>
+        __global__ void test_kernel_inner() {
+            printf("Hello inner world!\\n");
+        }
+        '''
 
         from pycuda.compiler import DynamicModule
         mod = DynamicModule()
-        mod.add_source(
-            test_outer_cu, nvcc_options=(
-                ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS))
-        mod.add_source(
-            test_inner_cu, nvcc_options=(
-                ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS))
+        mod.add_source_nvcc(
+            test_outer_cu,
+            options=['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)
+        mod.add_source_nvcc(
+            test_inner_cu,
+            options=['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)
+        mod.add_stdlib('cudadevrt')
         mod.link()
 
         test_kernel = mod.get_function('test_kernel')
         test_kernel(grid=(2, 1), block=(1, 1, 1))
 
 
 def test_import_pyopencl_before_pycuda():