diff --git a/doc/source/driver.rst b/doc/source/driver.rst
index 4eaa839db1c42376076bc43d93e5ae0ab5d0fcbe..09f389fa8754580ad15bddbb7a9f5c603761e4d7 100644
--- a/doc/source/driver.rst
+++ b/doc/source/driver.rst
@@ -513,6 +513,36 @@ Constants
     .. attribute:: PREFER_PTX
     .. attribute:: PREFER_BINARY
 
+.. class:: jit_input_type
+
+    Supported input data types to CUDA's JIT linker.
+    The attribute names and values of this class correspond to CUDA's
+    `enum CUjitInputType`.
+
+    CUDA 3.5 and above.
+
+    .. versionadded:: 2017.?
+
+    .. attribute:: CUBIN
+
+        Data contains compiled device-class-specific device code.
+
+    .. attribute:: PTX
+
+        Data contains PTX (Parallel Thread eXecution) source code.
+
+    .. attribute:: FATBINARY
+
+        Data contains a bundle of multiple cubins and/or PTX of some device code.
+
+    .. attribute:: OBJECT
+
+        Data contains a host object with embedded device code.
+
+    .. attribute:: LIBRARY
+
+        Data contains an archive of host objects with embedded device code.
+
 .. class:: host_alloc_flags
 
     Flags to be used to allocate :ref:`pagelocked_memory`.
@@ -558,6 +588,14 @@ Constants
 
     .. attribute:: LAZY_ENABLE_PEER_ACCESS
 
+    .. attribute:: DEV_RUNTIME_SYNC_DEPTH
+
+        CUDA 3.5 and above.
+
+    .. attribute:: DEV_RUNTIME_PENDING_LAUNCH_COUNT
+
+        CUDA 3.5 and above.
+
 Graphics-related constants
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -1993,6 +2031,290 @@ CUDA 4.0 and newer.
 Just-in-time Compilation
 ========================
 
+PyCUDA provides several tools to build a :class:`Module` from CUDA C++
+sources at run-time. A Module encapsulates binary GPU code and, once loaded
+on a device, acts as an interface to the functions and data pointers it
+holds in device memory.
+
+For the typical case of a single CUDA C++ source, see:
+
+- :class:`~pycuda.compiler.SourceModule` - A Module created using NVIDIA's
+  command line compiler NVCC, compiled and linked against CUDA's static
+  runtime. Unless you need any of the other features described below, this
+  should be your choice.
+- :class:`~pycuda.compiler.DynamicSourceModule` - A Module created using NVCC
+  or NVRTC and CUDA's Just-In-Time (JIT) linker. Facilitates linking against
+  CUDA's dynamic runtime, which is required for dynamic parallelism /
+  recursive kernels.
+
+When you need to compile from multiple input sources, see:
+
+- :class:`~pycuda.compiler.DynamicModule` - A Module created from an arbitrary
+  mix of CUDA C++ sources (compiled with either NVCC or NVRTC), PTX sources
+  and CUDA libraries with CUDA's Just-In-Time (JIT) linker.
+
+All classes exhibit the same public interface as :class:`Module`, but do not
+inherit from it. To compile a CUDA C++ source into a plain string without
+building a Module, see:
+
+- :func:`compiler.compile() <pycuda.compiler.compile>` for NVCC, returns
+  CUBIN binary code
+- :meth:`NvrtcCompiler.compile` for NVRTC, returns PTX source code
+
+NVRTC
+-----
+
+NVIDIA's Run-Time-Compiler
+`NVRTC <https://docs.nvidia.com/cuda/nvrtc/index.html>`_
+is a light-weight alternative to their Command-Line-Compiler
+`NVCC <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_.
+Some of the differences:
+
+- NVRTC is a pure CUDA C++ source to PTX (Parallel Thread eXecution) code
+  compiler and thus covers a much smaller scope than NVCC.
+- NVRTC is an API, whereas NVCC is an executable.
+- NVRTC supports only a `small subset
+  <https://docs.nvidia.com/cuda/nvrtc/index.html>`_
+  of NVCC's `compiler options
+  <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_.
+- NVRTC does not support certain preprocessor directives and, for example,
+  fails to parse some STL header files; just like in NVCC, however, the
+  ``printf()`` function is directly available in CUDA sources.
+- NVRTC requires extra steps to access functions in compiled CUDA
+  sources from the host. The names of these functions must be declared
+  to NVRTC before compiling, and after compilation they can only be
+  looked up in the Module by using their C++ mangled names.
+
+PyCUDA already reduces the computational overhead involved with NVCC by
+maintaining a file-system based cache, which means that it invokes NVCC
+only once for each unique combination of CUDA source and compiler setup.
+For applications that generate volatile CUDA C++ sources dynamically at
+run-time and thereby cause overly many cache misses in PyCUDA's NVCC
+cache, using NVRTC instead of NVCC should be considered if CUDA compile
+time is a problem.
+
+See also:
+
+    :class:`~pycuda.compiler.DynamicSourceModule`,
+    :meth:`DynamicModule.add_source_nvrtc() <pycuda.compiler.DynamicModule.add_source_nvrtc>` and
+    :class:`NvrtcCompiler`
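+
+Example: compiling a CUDA C++ source to PTX with NVRTC and resolving an
+exported function's mangled name (a minimal sketch based on the interfaces
+documented below; the kernel itself is made up for illustration):
+
+.. code-block:: python
+
+    import pycuda.autoinit  # NvrtcCompiler needs an active context
+    from pycuda.driver import NvrtcCompiler
+
+    source = """
+    __global__ void scale(float *v, float a)
+    {
+        v[threadIdx.x] *= a;
+    }
+    """
+
+    mangled_exports = {}
+    compiler = NvrtcCompiler()
+    # Exported names must be declared before compilation (see above);
+    # their mangled counterparts are collected in mangled_exports.
+    ptx = compiler.compile(source, exports=["scale"],
+                           mangled_exports=mangled_exports)
+    print(mangled_exports["scale"])  # the C++ mangled name of "scale"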
+
+Dynamic Runtime
+---------------
+
+Be aware that merely linking against CUDA's dynamic device runtime library
+*cudadevrt* has an overall negative performance impact.
+
+.. warning::
+
+    From CUDA's `Dynamic Parallelism Programming Guide
+    <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-dynamic-parallelism>`_
+    (CUDA 8.0):
+
+        *C.4.2.2. Dynamic-parallelism-enabled Kernel Overhead*
+
+        System software which is active when controlling dynamic launches
+        may impose an overhead on any kernel which is running at the time,
+        whether or not it invokes kernel launches of its own. This
+        overhead arises from the device runtime's execution tracking and
+        management software and may result in decreased performance for
+        e.g., library calls when made from the device compared to from the
+        host side. This overhead is, in general, incurred for applications
+        that link against the device runtime library.
+
+It is therefore recommended to use *cudadevrt* only when needed.
+
+.. class:: JitLinker([link_options [, message_handler [, log_verbose]]])
+
+    .. versionadded:: 2017.?
+
+    Create an object that provides access to CUDA's Just-In-Time (JIT) linker.
+
+    :param link_options:
+        JIT linker options
+    :type link_options:
+        list(jit_option) or None
+    :param message_handler:
+        Optional callback on success or failure (see below)
+    :type message_handler:
+        callable or None
+    :param bool log_verbose:
+        If *True*, instruct linker to create detailed log output.
+        Default: *False*
+
+    The *message_handler* callback's Python function prototype is
+
+    .. code-block:: python
+
+        def message_handler(bool succeeded, str info_str, str error_str)
+
+    where *succeeded* indicates success (*True*) or failure (*False*) of the
+    linker invocation, *info_str* contains informational linker log output and
+    *error_str* any error output. Note that in case of a linker error, an
+    exception containing *error_str* is raised after *message_handler* has
+    been called.
+
+    CUDA 3.5 and above.
+
+    .. method:: add_data(data, input_type, name=None)
+
+        Add *data* with type *input_type* to this linker session.
+
+        :param data:
+            A string (Python 2) or bytes (Python 3) object containing the
+            data to add
+        :type data:
+            str or bytes
+        :param input_type:
+            JIT linker input type of *data*
+        :type input_type:
+            jit_input_type
+        :param name:
+            Specifies to the linker under which name to refer to *data* in
+            its warning and error messages. If *None*, defaults to ``"unknown"``
+        :type name:
+            str or None
+
+    .. method:: add_file(filename, input_type)
+
+        Add the content of file *filename* with type *input_type* to this
+        linker session.
+
+        :param str filename:
+            The name of the file to add
+        :param input_type:
+            JIT linker input type of the file's content
+        :type input_type:
+            jit_input_type
+
+    .. method:: add_source_nvrtc(source, compiler, name=None,\
+        options=None, exports=None, mangled_exports=None)
+
+        Compile CUDA source code *source* using NVRTC *compiler* with options
+        *options*, then add the resulting PTX code to this linker
+        session.
+
+        :param source:
+            CUDA C++ source code
+        :type source:
+            str
+        :param compiler:
+            The NVRTC compiler to use to compile *source*
+        :type compiler:
+            NvrtcCompiler
+        :param name:
+            Specifies to the compiler and linker under which name to refer to
+            *source* in their warning and error messages. If *None*, defaults
+            to ``"kernel.ptx"``
+        :type name:
+            str or None
+        :param options:
+            List of NVRTC compile options
+        :type options:
+            list(str) or None
+        :param exports:
+            Function names in *source* that the caller wants to be exported
+        :type exports:
+            list(str) or None
+        :param mangled_exports:
+            Output variable, receives pairs of function names from *exports*
+            and their mangled counterparts
+        :type mangled_exports:
+            dict(str, str) or None
+        :see:
+            :meth:`NvrtcCompiler.compile`
+
+        CUDA 8.0 and above.
+
+    .. method:: link_module()
+
+        Close this linker session and return the new :class:`Module` object.
+
+        :return:
+            the newly created :class:`Module` instance
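+
+    Example: linking a module from a prebuilt PTX file (a minimal sketch;
+    the file name and the callback are made up for illustration):
+
+    .. code-block:: python
+
+        import pycuda.autoinit
+        from pycuda.driver import JitLinker, jit_input_type
+
+        def on_link(succeeded, info_str, error_str):
+            # called before any exception is raised on failure
+            print(info_str if succeeded else error_str)
+
+        # positional arguments: link_options, message_handler, log_verbose
+        linker = JitLinker(None, on_link, True)
+        linker.add_file("kernels.ptx", jit_input_type.PTX)
+        mod = linker.link_module()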
+
+.. class:: NvrtcCompiler([options[, include_dirs [, arch [, headers]]]])
+
+    .. versionadded:: 2017.?
+
+    Create an object that provides access to NVIDIA's Run-Time-Compiler (NVRTC).
+
+    Configure the compiler to add the CUDA SDK include path, the PyCUDA
+    include path and all directory names in *include_dirs* to its include
+    search paths. The global compiler options *options* provided in
+    this constructor can later be extended per CUDA source, see
+    :meth:`compile`.
+
+    Note that NVRTC `supports
+    <https://docs.nvidia.com/cuda/nvrtc/index.html>`_
+    only a small subset of `NVCC's
+    <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_
+    compiler options.
+
+    :param options:
+        List of NVRTC compile options to use with this compiler instance.
+    :type options:
+        list(str) or None
+    :param include_dirs:
+        List of extra directory names to use when processing ``#include``
+        preprocessor directives in CUDA sources.
+    :type include_dirs:
+        list(str) or None
+    :param arch:
+        String of the form ``compute_<major><minor>`` where ``<major>`` and
+        ``<minor>`` are the two digits of a compute capability
+        pair ``(<major>, <minor>)``. If *None*, defaults to the current
+        context's device's compute capability.
+    :type arch:
+        str or None
+    :param headers:
+        A `dict` that maps virtual header file names to their respective
+        content. These extra in-memory headers are made visible to and can
+        be included in CUDA sources processed by this compiler just like
+        regular include files by using the ``#include`` preprocessor
+        directive.
+    :type headers:
+        dict(str, str) or None
+
+    .. method:: compile(source, name=None, options=None, exports=None,\
+        mangled_exports=None)
+
+        Compile CUDA source code *source* with NVRTC using the compile options
+        specified earlier in the constructor combined with *options*
+        and return the resulting PTX code to the caller. Do not upload the
+        code to the GPU.
+
+        Declare all function names in *exports* to the compiler, and after
+        successful compilation, insert each pair of exported function name
+        in *exports* and its C++ mangled function name into *mangled_exports*.
+
+        :param str source:
+            CUDA C++ source code
+        :param name:
+            Specifies to the compiler under which name to refer to *source*
+            in its warning and error messages. If *None*, defaults to
+            ``"kernel.ptx"``
+        :type name:
+            str or None
+        :param options:
+            List of additional NVRTC compile options, combined with those
+            supplied in the constructor
+        :type options:
+            list(str) or None
+        :param exports:
+            Function names in *source* that the caller wants to be exported
+        :type exports:
+            list(str) or None
+        :param mangled_exports:
+            Output variable, receives pairs of function names from *exports*
+            and their mangled counterparts
+        :type mangled_exports:
+            dict(str, str) or None
+        :return:
+            the compiled PTX code string
+
+        CUDA 8.0 and above.
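+
+    Example: supplying an in-memory header to the compiler (a minimal
+    sketch; the header name and contents are made up for illustration):
+
+    .. code-block:: python
+
+        import pycuda.autoinit
+        from pycuda.driver import NvrtcCompiler
+
+        headers = {"block_size.h": "#define BLOCK_SIZE 128\n"}
+        # positional arguments: options, include_dirs, arch, headers
+        compiler = NvrtcCompiler(None, None, None, headers)
+
+        source = """
+        #include <block_size.h>
+        __global__ void fill(int *v)
+        {
+            v[threadIdx.x] = BLOCK_SIZE;
+        }
+        """
+        ptx = compiler.compile(source, exports=["fill"], mangled_exports={})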
+
 .. module:: pycuda.compiler
 
 .. data:: DEFAULT_NVCC_FLAGS
@@ -2042,11 +2364,351 @@ Just-in-time Compilation
     *Change note:* :class:`SourceModule` was moved from :mod:`pycuda.driver` to
     :mod:`pycuda.compiler` in version 0.93.
 
-.. function:: compile(source, nvcc="nvcc", options=None, keep=False,
-    no_extern_c=False, arch=None, code=None, cache_dir=None,
+.. function:: compile(source, nvcc="nvcc", options=None, keep=False,\
+    no_extern_c=False, arch=None, code=None, cache_dir=None,\
     include_dirs=[])
 
     Perform the same compilation as the corresponding
     :class:`SourceModule` constructor, but only return resulting *cubin*
     file as a string. In particular, do not upload the code to the GPU.
+
+.. class:: DynamicSourceModule(source, nvcc=None, options=None, keep=False,\
+    no_extern_c=False, arch=None, code=None, cache_dir=None,\
+    include_dirs=None, use_cudadevrt=False, cuda_libdir=None,\
+    exports=None)
+
+    .. versionadded:: 2017.?
+
+    Create a :class:`~pycuda.driver.Module` from CUDA source code *source*
+    using either NVCC or NVRTC with optional support for the dynamic CUDA
+    runtime `cudadevrt` (needed for recursive launches/dynamic parallelism).
+
+    :param source:
+        CUDA C++ source code
+    :type source:
+        str
+    :param nvcc:
+        File system path of the NVCC executable. If *None*, defaults to ``nvcc``
+    :type nvcc:
+        str or None
+    :param options:
+        List of NVCC or NVRTC compile options
+    :type options:
+        list(str) or None
+    :param keep:
+        If *True*, keep compiler output and print a line indicating its
+        location in the file system for debugging purposes.
+        Default: *False* (NVCC only)
+    :type keep:
+        bool
+    :param no_extern_c:
+        If *False*, wrap the given source code in ``extern "C" { ... }`` to
+        prevent C++ name mangling. Default: *False* (NVCC only)
+    :type no_extern_c:
+        bool
+    :param arch:
+        The name of the class of NVIDIA virtual GPU architecture for which to
+        compile the CUDA C++ source. If *None*, defaults to the current
+        context's device's compute capability
+    :type arch:
+        str or None
+    :param code:
+        The name of the NVIDIA GPU to assemble and optimize PTX code for. If
+        *None*, leave it unspecified (NVCC only)
+    :type code:
+        str or None
+    :param cache_dir:
+        The directory used by NVCC for compiler caching. If *None* (default),
+        then `cache_dir` is taken to be :envvar:`PYCUDA_CACHE_DIR` if set or a
+        sensible per-user default. If *False*, caching is disabled (NVCC only)
+    :type cache_dir:
+        str or None
+    :param include_dirs:
+        List of extra directory names to use when processing ``#include``
+        preprocessor directives in the CUDA source.
+    :type include_dirs:
+        list(str) or None
+    :param use_cudadevrt:
+        If *True*, link to the dynamic CUDA runtime, meaning: instruct the
+        compiler to generate relocatable PTX device code from *source* that
+        can be linked with other relocatable device code, and statically link
+        the standard library `cudadevrt` to this module. Default: *False*
+        (link static runtime)
+    :type use_cudadevrt:
+        bool
+    :param cuda_libdir:
+        The absolute path in the local file system of the CUDA SDK 64-bit
+        library directory
+    :type cuda_libdir:
+        str or None
+    :param exports:
+        If not *None*, use NVRTC instead of NVCC to compile *source*.
+        This argument contains the list of function names in *source* that
+        the caller later wants to access using :meth:`get_function`
+    :type exports:
+        list(str) or None
+
+    This class exhibits the same public interface as
+    :class:`~pycuda.driver.Module`, but does not inherit from it.
+
+    CUDA 3.5 and above, 8.0 and above when using NVRTC.
+
+    .. method:: get_function(name)
+
+        :see:
+            :meth:`Module.get_function() <pycuda.driver.Module.get_function>`
+
+    .. method:: get_global(name)
+
+        :see:
+            :meth:`Module.get_global() <pycuda.driver.Module.get_global>`
+
+    .. method:: get_texref(name)
+
+        :see:
+            :meth:`Module.get_texref() <pycuda.driver.Module.get_texref>`
+
+    .. method:: get_surfref(name)
+
+        :see:
+            :meth:`Module.get_surfref() <pycuda.driver.Module.get_surfref>`
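+
+    Example: a kernel that launches itself from the device, which requires
+    the dynamic runtime (a minimal sketch; the kernel is made up for
+    illustration):
+
+    .. code-block:: python
+
+        import numpy
+        import pycuda.autoinit
+        from pycuda.compiler import DynamicSourceModule
+
+        source = """
+        #include <cstdio>
+        __global__ void hello(int depth)
+        {
+            printf("hello at depth %d\\n", depth);
+            if (depth < 2)
+                hello<<<1, 1>>>(depth + 1);  // device-side launch
+        }
+        """
+
+        # use_cudadevrt=True enables relocatable device code and links cudadevrt
+        mod = DynamicSourceModule(source, use_cudadevrt=True)
+        kernel = mod.get_function("hello")
+        kernel(numpy.int32(0), grid=(1, 1), block=(1, 1, 1))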
+
+.. class:: DynamicModule(nvcc=None, keep=False, no_extern_c=False,\
+    arch=None, code=None, cache_dir=None, include_dirs=None,\
+    link_options=None, message_handler=None, log_verbose=False,\
+    cuda_libdir=None)
+
+    .. versionadded:: 2017.?
+
+    Build a :class:`~pycuda.driver.Module` from an arbitrary mix of CUDA C++
+    sources (compiled with either NVCC or NVRTC), PTX sources and CUDA
+    libraries with CUDA's Just-In-Time (JIT) linker.
+
+    General usage pattern for this class (a complete example follows the
+    parameter list below):
+
+    1. Create a module builder using constructor :meth:`DynamicModule`
+    2. Use the ``add_...()`` methods as often as needed and in any order:
+
+       - :meth:`add_source_nvcc(source) <DynamicModule.add_source_nvcc>` - to compile *source* with NVCC and add the result
+       - :meth:`add_source_nvrtc(source) <DynamicModule.add_source_nvrtc>` - to compile *source* with NVRTC and add the result
+       - :meth:`add_data(data) <DynamicModule.add_data>` - to add in-memory data to the module
+       - :meth:`add_file(filename) <DynamicModule.add_file>` - to add files to the module
+       - :meth:`add_stdlib(libname) <DynamicModule.add_stdlib>` - to add SDK libraries to the module
+
+    3. Link and upload the module code to the GPU by calling :meth:`link` once
+    4. Access the module's kernel functions on the GPU using :meth:`get_function`
+
+    :param nvcc:
+        File system path of the NVCC executable. If *None*, defaults to ``nvcc``
+    :type nvcc:
+        str or None
+    :param keep:
+        If *True*, keep compiler outputs and print a line indicating their
+        location in the file system for debugging purposes.
+        Default: *False* (NVCC only)
+    :type keep:
+        bool
+    :param no_extern_c:
+        If *False*, wrap the given source codes in ``extern "C" { ... }`` to
+        prevent C++ name mangling. Default: *False* (NVCC only)
+    :type no_extern_c:
+        bool
+    :param arch:
+        The name of the class of NVIDIA virtual GPU architecture for which to
+        compile the CUDA C++ sources. If *None*, defaults to the current
+        context's device's compute capability
+    :type arch:
+        str or None
+    :param code:
+        The name of the NVIDIA GPU to assemble and optimize PTX code for. If
+        *None*, leave it unspecified (NVCC only)
+    :type code:
+        str or None
+    :param cache_dir:
+        The directory used by NVCC for compiler caching. If *None* (default),
+        then `cache_dir` is taken to be :envvar:`PYCUDA_CACHE_DIR` if set or a
+        sensible per-user default. If *False*, caching is disabled (NVCC only)
+    :type cache_dir:
+        str or None
+    :param include_dirs:
+        List of extra directory names to use when processing ``#include``
+        preprocessor directives in CUDA sources.
+    :type include_dirs:
+        list(str) or None
+    :param link_options:
+        Options for the JIT linker :class:`~pycuda.driver.JitLinker`
+    :type link_options:
+        list(jit_option) or None
+    :param message_handler:
+        Optional callback on success or failure
+        (see :class:`~pycuda.driver.JitLinker`)
+    :type message_handler:
+        callable or None
+    :param bool log_verbose:
+        If *True*, instruct linker to create detailed log output.
+        Default: *False*
+    :param cuda_libdir:
+        The absolute path in the local file system of the CUDA SDK 64-bit
+        library directory
+    :type cuda_libdir:
+        str or None
+
+    After :meth:`link` has been called, objects of this class exhibit
+    the same public interface as :class:`~pycuda.driver.Module`, but do not
+    inherit from it.
+
+    CUDA 3.5 and above.
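+
+    Example: building one module from two separately compiled sources
+    (a minimal sketch; the kernels are made up for illustration):
+
+    .. code-block:: python
+
+        import pycuda.autoinit
+        from pycuda.compiler import DynamicModule, DEFAULT_NVCC_FLAGS
+
+        outer_cu = """
+        __global__ void outer()
+        {
+            extern __global__ void inner();
+            inner<<<1, 1>>>();
+        }
+        """
+        inner_cu = """
+        __global__ void inner() { }
+        """
+
+        opts = ['-rdc=true', '-lcudadevrt'] + DEFAULT_NVCC_FLAGS
+        mod = DynamicModule() \
+            .add_source_nvcc(outer_cu, options=opts) \
+            .add_source_nvcc(inner_cu, options=opts) \
+            .add_stdlib('cudadevrt') \
+            .link()
+        outer_kernel = mod.get_function('outer')
+        outer_kernel(grid=(1, 1), block=(1, 1, 1))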
+
+    .. method:: add_source_nvcc(source, options=None, name=None)
+
+        Compile CUDA source code *source* with NVIDIA's Command-Line-Compiler
+        NVCC using compiler options *options* and add the resulting
+        PTX code to the linker session of this module.
+
+        :param str source:
+            CUDA C++ source code
+        :param options:
+            List of NVCC compiler options
+        :type options:
+            list(str) or None
+        :param name:
+            Specifies to the compiler and linker under which name to refer to
+            *source* in their warning and error messages. If *None*, defaults
+            to ``"kernel.ptx"``
+        :type name:
+            str or None
+        :return:
+            *self*
+
+    .. method:: add_source_nvrtc(source, options=None,\
+        name=None, exports=None, compiler=None)
+
+        Compile CUDA source code *source* with NVIDIA's Run-Time-Compiler
+        NVRTC using compiler options *options* and add the resulting
+        PTX code to the linker session of this module.
+
+        :param str source:
+            CUDA C++ source code
+        :param options:
+            List of NVRTC compile options
+        :type options:
+            list(str) or None
+        :param name:
+            Specifies to the compiler and linker under which name to refer to
+            *source* in their warning and error messages. If *None*, defaults
+            to ``"kernel.ptx"``
+        :type name:
+            str or None
+        :param exports:
+            Function names in *source* that the caller wants to be exported
+        :type exports:
+            list(str) or None
+        :param compiler:
+            The NVRTC compiler to use to compile *source*. If *None*, a
+            default compiler instance is created and used
+        :type compiler:
+            NvrtcCompiler or None
+        :return:
+            *self*
+        :see:
+            :meth:`JitLinker.add_source_nvrtc() <pycuda.driver.JitLinker.add_source_nvrtc>`
+
+        CUDA 8.0 and above.
+
+    .. method:: add_data(data, input_type, name=None)
+
+        Add the binary data in *data* to the linker session of this module.
+
+        :param data:
+            A string (Python 2) or bytes (Python 3) object containing the data to add
+        :type data:
+            str or bytes
+        :param input_type:
+            JIT linker input type of *data*
+        :type input_type:
+            jit_input_type
+        :param name:
+            Specifies to the linker under which name to refer to *data* in
+            its warning and error messages. If *None*, defaults
+            to ``"unknown"``
+        :type name:
+            str or None
+        :return:
+            *self*
+        :see:
+            :meth:`JitLinker.add_data() <pycuda.driver.JitLinker.add_data>`
+
+    .. method:: add_file(filename, input_type)
+
+        Add the content of file *filename* to the linker session of this
+        module.
+
+        :param str filename:
+            The name of the file to add
+        :param input_type:
+            JIT linker input type of the file's content
+        :type input_type:
+            jit_input_type
+        :return:
+            *self*
+        :see:
+            :meth:`JitLinker.add_file() <pycuda.driver.JitLinker.add_file>`
+
+    .. method:: add_stdlib(libname)
+
+        Add the standard library *libname* to the linker session of this
+        module.
+
+        A standard library is part of the CUDA SDK. If this method fails
+        to auto-locate the correct CUDA SDK installation path on your system,
+        use argument *cuda_libdir* in the constructor to set this path
+        manually to the 64-bit library directory of your local CUDA SDK
+        installation.
+
+        :param str libname:
+            The shortened library file name, which is expanded internally
+            to the correct library file name for the local system; for
+            example, `cudadevrt` expands to `cudadevrt.lib` on Windows and
+            to `libcudadevrt.a` on Linux.
+        :return:
+            *self*
+
+    .. method:: link()
+
+        Complete the linker session and create the :class:`~pycuda.driver.Module`.
+
+        After calling this method, calling the *add_...* methods of this
+        instance is no longer allowed; accessing module functions
+        through this instance using :meth:`get_function` is only
+        allowed after calling this method.
+
+        :return:
+            *self*
+
+    .. method:: get_function(name)
+
+        This method is not defined before calling :meth:`link`.
+
+        :see:
+            :meth:`Module.get_function() <pycuda.driver.Module.get_function>`
+
+    .. method:: get_global(name)
+
+        This method is not defined before calling :meth:`link`.
+
+        :see:
+            :meth:`Module.get_global() <pycuda.driver.Module.get_global>`
+
+    .. method:: get_texref(name)
+
+        This method is not defined before calling :meth:`link`.
+
+        :see:
+            :meth:`Module.get_texref() <pycuda.driver.Module.get_texref>`
+
+    .. method:: get_surfref(name)
+
+        This method is not defined before calling :meth:`link`.
+
+        :see:
+            :meth:`Module.get_surfref() <pycuda.driver.Module.get_surfref>`
diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py
index d5435987420dec7fa4542b5c11667dfda9d3f9dd..b2c0b4e6e65621022561ef69938a5b7387f74123 100644
--- a/examples/demo_cdpSimplePrint.py
+++ b/examples/demo_cdpSimplePrint.py
@@ -95,7 +95,7 @@ def main(argv):
 
     print("starting Simple Print (CUDA Dynamic Parallelism)")
 
-    mod = DynamicSourceModule(cdpSimplePrint_cu)
+    mod = DynamicSourceModule(cdpSimplePrint_cu, use_cudadevrt=True)
     cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call
 
     print("***************************************************************************")
diff --git a/pycuda/compiler.py b/pycuda/compiler.py
index b73793b6dee779f268cab3f80929f30012c3c3f5..7ad26ae0e36b2afbc61bad642c6ae42dcc4c8964 100644
--- a/pycuda/compiler.py
+++ b/pycuda/compiler.py
@@ -278,10 +278,6 @@ class CudaModule(object):
         return self.module.get_function(name)
 
 class SourceModule(CudaModule):
-    '''
-    Creates a Module from a single .cu source object linked against the
-    static CUDA runtime.
-    '''
     def __init__(self, source, nvcc="nvcc", options=None, keep=False,
             no_extern_c=False, arch=None, code=None, cache_dir=None,
             include_dirs=[]):
@@ -318,22 +314,20 @@ def _find_nvcc_on_path():
 
 
 class DynamicModule(CudaModule):
-    '''
-    Creates a Module from multiple .cu source, library file and/or data
-    objects linked against the static or dynamic CUDA runtime.
-    '''
-    def __init__(self, nvcc='nvcc', link_options=None, keep=False,
-            no_extern_c=False, arch=None, code=None, cache_dir=None,
-            include_dirs=[], message_handler=None, log_verbose=False,
-            cuda_libdir=None):
+    def __init__(self, nvcc=None, keep=False, no_extern_c=False, arch=None,
+            code=None, cache_dir=None, include_dirs=None, link_options=None,
+            message_handler=None, log_verbose=False, cuda_libdir=None):
         from pycuda.driver import Context
         compute_capability = Context.get_device().compute_capability()
         if compute_capability < (3,5):
-            raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' %
+            raise RuntimeError('Minimum compute capability for JIT linker is 3.5 (found: %u.%u)!' %
                 (compute_capability[0], compute_capability[1]))
         else:
-            from pycuda.driver import Linker
-            self.linker = Linker(message_handler, link_options, log_verbose)
+            from pycuda.driver import JitLinker
+            self.linker = JitLinker(link_options, message_handler, log_verbose)
         self._check_arch(arch)
         self.nvcc = nvcc
         self.keep = keep
@@ -344,6 +338,7 @@ class DynamicModule(CudaModule):
         self.include_dirs = include_dirs
         self.cuda_libdir = cuda_libdir
         self.libdir, self.libptn = None, None
+        self.mangled_exports = None
         self.module = None
 
     def _locate_cuda_libdir(self):
@@ -407,16 +402,41 @@ class DynamicModule(CudaModule):
                 'directory, set CUDA library path manually')
         return libdir, libptn
 
-    def add_source(self, source, nvcc_options=None, name='kernel.ptx'):
-        ptx = compile(source, nvcc=self.nvcc, options=nvcc_options,
+    def add_source_nvcc(self, source, options=None, name=None):
+        if self.nvcc is None:
+            nvcc = 'nvcc'
+        else:
+            nvcc = self.nvcc
+        if self.include_dirs is None:
+            include_dirs = []
+        else:
+            include_dirs = self.include_dirs
+        ptx = compile(source, nvcc=nvcc, options=options,
             keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch,
             code=self.code, cache_dir=self.cache_dir,
-            include_dirs=self.include_dirs, target="ptx")
+            include_dirs=include_dirs, target="ptx")
         from pycuda.driver import jit_input_type
+        if name is None:
+            name = 'kernel.ptx'
         self.linker.add_data(ptx, jit_input_type.PTX, name)
         return self
 
-    def add_data(self, data, input_type, name='unknown'):
+    def add_source_nvrtc(self, source, options=None,
+            name=None, exports=None, compiler=None):
+        if not hasattr(self.linker, "add_source_nvrtc"):
+            raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!')
+        if self.mangled_exports is None and exports:
+            self.mangled_exports = {}
+        if compiler is None:
+            from pycuda.driver import NvrtcCompiler
+            compiler = NvrtcCompiler(options, self.include_dirs, self.arch)
+            options = None  # options already applied via the compiler instance
+        self.linker.add_source_nvrtc(source, compiler, name=name,
+            options=options, exports=exports,
+            mangled_exports=self.mangled_exports)
+        return self
+
+    def add_data(self, data, input_type, name=None):
         self.linker.add_data(data, input_type, name)
         return self
 
@@ -441,6 +461,12 @@ class DynamicModule(CudaModule):
         self._bind_module()
         return self
 
+    def get_function(self, name):
+        if self.mangled_exports is None or name not in self.mangled_exports:
+            return self.module.get_function(name)
+        else:
+            return self.module.get_function(self.mangled_exports[name])
+
 
 class DynamicSourceModule(DynamicModule):
     '''
     Creates a Module from a single .cu source object linked against the
     dynamic CUDA runtime.
     - compiler generates PTX relocatable device code
     - source is linked against the CUDA device runtime library cudadevrt
     - library cudadevrt is statically linked into the generated Module
     '''
-    def __init__(self, source, nvcc="nvcc", options=None, keep=False,
+    def __init__(self, source, nvcc=None, options=None, keep=False,
             no_extern_c=False, arch=None, code=None, cache_dir=None,
-            include_dirs=[], cuda_libdir=None):
+            include_dirs=None, use_cudadevrt=False, cuda_libdir=None,
+            exports=None):
         super(DynamicSourceModule, self).__init__(nvcc=nvcc,
-            link_options=None, keep=keep, no_extern_c=no_extern_c,
-            arch=arch, code=code, cache_dir=cache_dir,
-            include_dirs=include_dirs, cuda_libdir=cuda_libdir)
+            keep=keep, no_extern_c=no_extern_c, arch=arch, code=code,
+            cache_dir=cache_dir, include_dirs=include_dirs, link_options=None,
+            cuda_libdir=cuda_libdir)
         if options is None:
-            options = DEFAULT_NVCC_FLAGS
-        options = options[:]
-        if '-rdc=true' not in options:
+            options = DEFAULT_NVCC_FLAGS[:]
+        else:
+            options = options[:]
+        if use_cudadevrt and '-rdc=true' not in options:
             options.append('-rdc=true')
-        if '-lcudadevrt' not in options:
-            options.append('-lcudadevrt')
-        self.add_source(source, nvcc_options=options)
-        self.add_stdlib('cudadevrt')
+        if exports is None:
+            if use_cudadevrt and '-lcudadevrt' not in options:
+                options.append('-lcudadevrt')
+            self.add_source_nvcc(source, options=options)
+        else:
+            self.add_source_nvrtc(source, options=options, exports=exports)
+        if use_cudadevrt:
+            self.add_stdlib('cudadevrt')
+        self.link()
diff --git a/setup.py b/setup.py
index 775141b0d91db62fafd716cb66e9124fa10e628f..a398920a03047441d13d9a2c111864f997b7bcec 100644
--- a/setup.py
+++ b/setup.py
@@ -91,6 +91,9 @@ def get_config_schema():
         LibraryDir("CURAND", default_lib_dirs),
         Libraries("CURAND", ["curand"]),
 
+        LibraryDir("NVRTC", default_lib_dirs),
+        Libraries("NVRTC", ["nvrtc"]),
+
         StringListOption("CXXFLAGS", cxxflags_default,
             help="Any extra C++ compiler options to include"),
         StringListOption("LDFLAGS", ldflags_default,
@@ -117,7 +120,7 @@ def main():
     LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"]
     LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"]
-            + conf["CUDADRV_LIBNAME"])
+            + conf["CUDADRV_LIBNAME"]) + conf["NVRTC_LIBNAME"]
 
     if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]:
         conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")]
diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index 8a2488deccf6fecfa6350faaa013246de2a2c360..eead66450bdb5eff3e3a79ce81fa0567e1bb5c62 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -9,6 +9,8 @@
 #include
 
+#define USE_JIT
+
 #if CUDAPP_CUDA_VERSION < 1010
@@ -410,10 +412,273 @@ namespace
 
   // }}}
 
-  // {{{ linker
+  // {{{ nvrtc compiler
+
+#ifdef USE_JIT
+
+#if CUDAPP_CUDA_VERSION >= 8000
+  #include <nvrtc.h> // NOTE: this requires us to link static SDK library "nvrtc"
+
+  class NvrtcProgram
+  {
+    private:
+      nvrtcProgram m_prog;
+
+      const std::string compilation_log_str() const {
+        size_t result_length;
+        nvrtcResult nvrtc_result = nvrtcGetProgramLogSize(m_prog, &result_length);
+        check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result);
+        if (result_length == 0) {
+          return std::string();
+        }
+        std::vector<char> result_vec(result_length);
+        nvrtc_result = nvrtcGetProgramLog(m_prog, &result_vec[0]);
+        check_nvrtc_result("nvrtcGetProgramLog", nvrtc_result);
+        return std::string(&result_vec[0], result_length);
+      }
+
+    public:
+      NvrtcProgram(const char* source, const char* name,
+          const std::vector<const char*>& header_names_p,
+          const std::vector<const char*>& header_contents_p)
+        : m_prog()
+      {
+        const size_t header_count = header_names_p.size();
+        const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog,
+            source, name, (int)header_count,
+            (header_count > 0)? &header_contents_p[0] : NULL,
+            (header_count > 0)? &header_names_p[0] : NULL);
+        check_nvrtc_result("nvrtcCreateProgram", nvrtc_result);
+      }
+
+      ~NvrtcProgram() {
+        if (m_prog != NULL) {
+          nvrtcDestroyProgram(&m_prog);
+          m_prog = NULL;
+        }
+      }
+
+      nvrtcProgram prog() const {
+        return m_prog;
+      }
+
+      void check_nvrtc_result(const char* nvrtc_function_name, nvrtcResult nvrtc_result) const {
+        if (nvrtc_result != NVRTC_SUCCESS) {
+          std::string error_msg = nvrtc_function_name;
+          error_msg += " failed: ";
+          error_msg += nvrtcGetErrorString(nvrtc_result);
+          const std::string compilation_log = compilation_log_str();
+          if (compilation_log.length() > 0) {
+            error_msg += " - ";
+            error_msg += compilation_log;
+          }
+          throw std::runtime_error(error_msg);
+        }
+      }
+  };
+
+  class nvrtc_compiler : public boost::noncopyable
+  {
+    private:
+      std::vector<std::string> m_compile_options;
+      std::vector<std::string> m_header_names;
+      std::vector<std::string> m_header_contents;
+      std::vector<const char*> m_compile_options_p;
+      std::vector<const char*> m_header_names_p;
+      std::vector<const char*> m_header_contents_p;
+#ifdef _WIN32
+      std::vector<char> m_win32_getenv_buf;
+#endif
+
+      char* get_env(const char* varname) {
+#ifdef _WIN32
+        // Visual Studio 2015: std::getenv() causes compiler warning C4996
+        size_t required_size = 0;
+        getenv_s(&required_size, NULL, 0, varname);
+        if (required_size > 0) {
+          m_win32_getenv_buf.resize(required_size);
+          getenv_s(&required_size, &m_win32_getenv_buf[0], required_size, varname);
+          return &m_win32_getenv_buf[0];
+        }
+        else {
+          return NULL;
+        }
+#else
+        return std::getenv(varname);
+#endif
+      }
+
+      std::string find_pycuda_include_path() const {
+        // pycuda.compiler._find_pycuda_include_path():
+        //   from pkg_resources import Requirement, resource_filename
+        //   return resource_filename(Requirement.parse("pycuda"), "pycuda/cuda")
+        const py::object pkg_resources = py::import("pkg_resources");
+        const py::object resource_filename = pkg_resources.attr("resource_filename");
+        const py::object Requirement_parse = pkg_resources.attr("Requirement").attr("parse");
+        const py::object py_result = resource_filename(Requirement_parse("pycuda"), "pycuda/cuda");
+        return py::extract<std::string>(py_result);
+      }
+
+      void options_changed() {
+        const size_t n = m_compile_options.size();
+        m_compile_options_p.resize(n);
+        for (size_t i=0; i<n; i++) {
+          m_compile_options_p[i] = m_compile_options[i].c_str();
+        }
+      }
+
+      void header_changed() {
+        const size_t n = m_header_names.size();
+        m_header_names_p.resize(n);
+        m_header_contents_p.resize(n);
+        for (size_t i=0; i<n; i++) {
+          m_header_names_p[i] = m_header_names[i].c_str();
+          m_header_contents_p[i] = m_header_contents[i].c_str();
+        }
+      }
+
+    public:
+      nvrtc_compiler(py::object py_nvrtc_options = py::object(),
+          py::object py_include_dirs = py::object(),
+          py::object py_arch = py::object(),
+          py::object py_headers = py::object())
+      {
+        // Architecture (-arch) compiler option
+        std::string arch_compute_capability;
+        if (py_arch.ptr() != Py_None) {
+          const char* arch = py::extract<const char*>(py_arch);
+          if (strlen(arch) == 5 && std::memcmp(arch, "sm_", 3) == 0) {
+            arch_compute_capability = std::string("compute_") + &arch[3];
+          }
+          else {
+            arch_compute_capability = arch;
+          }
+        }
+        else {
+          const py::tuple py_compute_capability = context::get_device().compute_capability();
+          const char arch_major = '0' + py::extract<int>(py_compute_capability[0]);
+          const char arch_minor = '0' + py::extract<int>(py_compute_capability[1]);
+          arch_compute_capability = std::string("compute_") + arch_major + arch_minor;
+        }
+        m_compile_options.push_back("-arch");
+        m_compile_options.push_back(arch_compute_capability);
+
+        // Include directory (-I) compiler options
+        m_compile_options.push_back(std::string("-I") + find_pycuda_include_path());
+        const char* CUDA_SDK;
+        if ((CUDA_SDK = get_env("CUDA_PATH")) != NULL) {
+          m_compile_options.push_back(std::string("-I") + CUDA_SDK + "\\include");
+        }
+        else if ((CUDA_SDK = get_env("CUDA_ROOT")) != NULL) {
+          m_compile_options.push_back(std::string("-I") + CUDA_SDK + "/include");
+        }
+        if (py_include_dirs.ptr() != Py_None) {
+          PYTHON_FOREACH(py_include_dir, py_include_dirs) {
+            m_compile_options.push_back(std::string("-I") + py::extract<std::string>(py_include_dir)());
+          }
+        }
+
+        // Other compiler options specified by the caller
+        if (py_nvrtc_options.ptr() != Py_None) {
+          PYTHON_FOREACH(py_compiler_option, py_nvrtc_options) {
+            m_compile_options.push_back(py::extract<std::string>(py_compiler_option));
+          }
+        }
+
+        options_changed();
+
+        // In-memory NVRTC header files
+        if (py_headers.ptr() != Py_None) {
+          const py::dict py_headers_dict = py::extract<py::dict>(py_headers);
+          if (py::len(py_headers_dict) > 0) {
+            const py::list py_keys = py_headers_dict.keys();
+            PYTHON_FOREACH(py_key, py_keys) {
+              m_header_names.push_back(py::extract<std::string>(py_key));
+              m_header_contents.push_back(py::extract<std::string>(py_headers_dict[py_key]));
+            }
+            header_changed();
+          }
+        }
+      }
+
+      std::vector<char> compile(
+          const py::str& py_source,
+          const py::object& py_name,
+          const py::object& py_nvrtc_options,
+          const py::object& py_exports,
+          py::object& py_mangled_exports)
+      {
+        // Combine compile option lists into compile_options_p
+        std::vector<const char*> compile_options_p(m_compile_options_p);
+        if (py_nvrtc_options.ptr() != Py_None) {
+          PYTHON_FOREACH(py_compile_option, py_nvrtc_options) {
+            compile_options_p.push_back(py::extract<const char*>(py_compile_option));
+          }
+        }
+
+        // Create NVRTC program
+        const char* name = (py_name.ptr() != Py_None)?
+            py::extract<const char*>(py_name) : "kernel.ptx";
+        NvrtcProgram prog(py::extract<const char*>(py_source), name,
+            m_header_names_p, m_header_contents_p);
+
+        // Declare exported function names
+        nvrtcResult nvrtc_result;
+        if (py_exports.ptr() != Py_None) {
+          PYTHON_FOREACH(py_export, py_exports) {
+            nvrtc_result = nvrtcAddNameExpression(prog.prog(), py::extract<const char*>(py_export));
+            prog.check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result);
+          }
+        }
+
+        // Compile .cu => .ptx
+        nvrtc_result = nvrtcCompileProgram(prog.prog(),
+            (int)compile_options_p.size(), &compile_options_p[0]);
+        prog.check_nvrtc_result("nvrtcCompileProgram", nvrtc_result);
+
+        // Map exported function names to their mangled representation
+        if (py_exports.ptr() != Py_None && py_mangled_exports.ptr() != Py_None) {
+          py::dict py_mangled_exports_dict = py::extract<py::dict>(py_mangled_exports);
+          PYTHON_FOREACH(py_export, py_exports) {
+            const char* plain_name = py::extract<const char*>(py_export);
+            const char* mangled_name = NULL;
+            nvrtc_result = nvrtcGetLoweredName(prog.prog(), plain_name, &mangled_name);
+            prog.check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result);
+            py_mangled_exports_dict[py_export] = mangled_name;
+          }
+        }
+
+        // Copy compiled ptx result into ptx_buffer
+        size_t ptx_size;
+        nvrtc_result = nvrtcGetPTXSize(prog.prog(), &ptx_size);
+        prog.check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result);
+        std::vector<char> ptx_buffer(ptx_size);
+        nvrtc_result = nvrtcGetPTX(prog.prog(), &ptx_buffer[0]);
+        prog.check_nvrtc_result("nvrtcGetPTX", nvrtc_result);
+        return ptx_buffer;
+      }
+
+      std::string py_compile(
+          py::str py_source,
+          py::object py_name,
+          py::object py_nvrtc_options,
+          py::object py_exports,
+          py::object py_mangled_exports)
+      {
+        const std::vector<char> ptx = compile(py_source, py_name,
+            py_nvrtc_options, py_exports, py_mangled_exports);
+        return std::string(ptx.begin(), ptx.end());
+      }
+  };
+#endif
+
+  // }}}
+
+  // {{{ jit linker
 
 #if CUDAPP_CUDA_VERSION >= 5050
-  class Linker : public boost::noncopyable
+  class jit_linker : public boost::noncopyable
   {
     private:
       py::object m_message_handler;
@@ -459,8 +724,8 @@ namespace
       }
 
     public:
-      Linker(py::object message_handler = py::object(),
-          py::object py_options = py::object(),
+      jit_linker(py::object py_link_options = py::object(),
+          py::object message_handler = py::object(),
           py::object py_log_verbose = py::object(false))
         : m_message_handler(message_handler),
           m_link_state(),
@@ -472,8 +737,8 @@ namespace
         add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, sizeof(m_error_buf));
         add_option(CU_JIT_LOG_VERBOSE, m_log_verbose? 1ull : 0ull);
 
-        if (py_options.ptr() != Py_None) {
-          PYTHON_FOREACH(key_value, py_options) {
+        if (py_link_options.ptr() != Py_None) {
+          PYTHON_FOREACH(key_value, py_link_options) {
             add_option(
                 py::extract<CUjit_option>(key_value[0]),
                 py::extract<unsigned int>(key_value[1])());
@@ -488,12 +753,12 @@ namespace
         check_cu_result("cuLinkCreate", cu_result);
       }
 
-      ~Linker()
+      ~jit_linker()
       {
         close();
       }
 
-      void add_data(py::object py_data, CUjitInputType input_type, py::str py_name)
+      void add_data(py::object py_data, CUjitInputType input_type, py::object py_name)
       {
         const char *data_buf;
         PYCUDA_BUFFER_SIZE_T data_buf_len;
         if (PyObject_AsCharBuffer(py_data.ptr(), &data_buf, &data_buf_len) != 0) {
           throw py::error_already_set();
         }
         const char* name = (py_name.ptr() != Py_None)?
-            py::extract<const char*>(py_name) : NULL;
+            py::extract<const char*>(py_name) : "unknown";
         const CUresult cu_result = cuLinkAddData(m_link_state, input_type,
             static_cast<void*>(const_cast<char*>(data_buf)),
             data_buf_len, name, 0, NULL, NULL);
@@ -510,12 +775,29 @@ namespace
 
       void add_file(py::str py_filename, CUjitInputType input_type)
       {
         const char* filename = py::extract<const char*>(py_filename);
         const CUresult cu_result = cuLinkAddFile(m_link_state, input_type,
             filename, 0, NULL, NULL);
         check_cu_result("cuLinkAddFile", cu_result);
       }
 
+#if CUDAPP_CUDA_VERSION >= 8000
+      void add_source_nvrtc(py::str py_source, nvrtc_compiler* compiler,
+          py::object py_name, py::object py_nvrtc_options,
+          py::object py_exports, py::object py_mangled_exports)
+      {
+        // compile source into buffer ptx
+        std::vector<char> ptx = compiler->compile(py_source, py_name,
+            py_nvrtc_options, py_exports, py_mangled_exports);
+        // link buffer ptx
+        const char* name = (py_name.ptr() != Py_None)?
+            py::extract<const char*>(py_name) : "kernel.ptx";
+        const CUresult cu_result = cuLinkAddData(m_link_state,
+            CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL);
+        check_cu_result("cuLinkAddData", cu_result);
+      }
+#endif
+
       module* link_module()
       {
         char* cubin_data = NULL;
@@ -536,6 +818,8 @@ namespace
   };
 #endif
 
+#endif /* #ifdef USE_JIT */
+
   // }}}
 
   template <class T>
@@ -1192,7 +1476,28 @@ BOOST_PYTHON_MODULE(_driver)
 
   // }}}
 
-  // {{{ linker
+  // {{{ nvrtc compiler
+
+#ifdef USE_JIT
+
+#if CUDAPP_CUDA_VERSION >= 8000
+  py::class_<nvrtc_compiler, boost::noncopyable>("NvrtcCompiler")
+    .def(py::init<py::object>())
+    .def(py::init<py::object, py::object>())
+    .def(py::init<py::object, py::object, py::object>())
+    .def(py::init<py::object, py::object, py::object, py::object>())
+    .def("compile", &nvrtc_compiler::py_compile, (
+      py::arg("source"),
+      py::arg("name")=py::object(),
+      py::arg("options")=py::object(),
+      py::arg("exports")=py::object(),
+      py::arg("mangled_exports")=py::object()));
+#endif
+
+  // }}}
+
+  // {{{ jit linker
 
 #if CUDAPP_CUDA_VERSION >= 5050
   py::enum_<CUjitInputType>("jit_input_type")
     .value("CUBIN", CU_JIT_INPUT_CUBIN)
     .value("PTX", CU_JIT_INPUT_PTX)
     .value("FATBINARY", CU_JIT_INPUT_FATBINARY)
     .value("OBJECT", CU_JIT_INPUT_OBJECT)
     .value("LIBRARY", CU_JIT_INPUT_LIBRARY);
 
-  py::class_<Linker, shared_ptr<Linker> >("Linker")
+  py::class_<jit_linker, boost::noncopyable>("JitLinker")
     .def(py::init<py::object>())
     .def(py::init<py::object, py::object>())
     .def(py::init<py::object, py::object, py::object>())
-    .def("add_data", &Linker::add_data, (py::arg("data"), py::arg("input_type"), py::arg("name")=py::str("unknown")))
-    .def("add_file", &Linker::add_file, (py::arg("filename"), py::arg("input_type")))
-    .def("link_module", &Linker::link_module, py::return_value_policy<py::manage_new_object>());
-#endif
+    .def("add_data", &jit_linker::add_data, (
+      py::arg("data"),
+      py::arg("input_type"),
+      py::arg("name")=py::object()))
+    .def("add_file", &jit_linker::add_file, (
+      py::arg("filename"),
+      py::arg("input_type")))
+#if CUDAPP_CUDA_VERSION >= 8000
+    .def("add_source_nvrtc", &jit_linker::add_source_nvrtc, (
+      py::arg("source"),
+      py::arg("compiler"),
+      py::arg("name")=py::object(),
+      py::arg("options")=py::object(),
+      py::arg("exports")=py::object(),
+      py::arg("mangled_exports")=py::object()))
+#endif
+    .def("link_module", &jit_linker::link_module,
+      py::return_value_policy<py::manage_new_object>());
+#endif
+
+#endif /* #ifdef USE_JIT */
 
   // }}}
diff --git a/test/test_driver.py b/test/test_driver.py
index b440eff09e9791ea7977afd1cba550f0cebe173f..b3deed408c21d2afb52d68f6e957feb21e2a9d76 100644
--- a/test/test_driver.py
+++ b/test/test_driver.py
@@ -891,12 +891,14 @@ class TestDriver:
         drv.memcpy_htod(a_gpu, a)
         drv.memcpy_htod(b_gpu, b)
 
         from pycuda.compiler import DynamicSourceModule
-        mod = DynamicSourceModule(cuda_string, keep=True)
+        mod = DynamicSourceModule(cuda_string, keep=True, use_cudadevrt=True)
         func = mod.get_function("math")
         func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu,
              block=(100, 1, 1), grid=(1, 1, 1))
 
         drv.memcpy_dtoh(c, c_gpu)
         drv.memcpy_dtoh(d, d_gpu)
@@ -921,31 +923,36 @@ class TestDriver:
             from pytest import skip
             skip("need compute capability 3.5 or higher for dynamic parallelism")
 
-        test_outer_cu = '''#include <cstdio>
-        __global__ void test_kernel() {
-            extern __global__ void test_kernel_inner();
-            printf("Hello outer world!\\n");
-            test_kernel_inner<<<2, 1>>>();
-        }'''
+        test_outer_cu = '''
+        #include <cstdio>
+        __global__ void test_kernel() {
+            extern __global__ void test_kernel_inner();
+            printf("Hello outer world!\\n");
+            test_kernel_inner<<<2, 1>>>();
+        }
+        '''
 
-        test_inner_cu = '''#include <cstdio>
-        __global__ void test_kernel_inner() {
-            printf("  Hello inner world!\\n");
-        }'''
+        test_inner_cu = '''
+        #include <cstdio>
+        __global__ void test_kernel_inner() {
+            printf("Hello inner world!\\n");
+        }
+        '''
 
         from pycuda.compiler import DynamicModule
         mod = DynamicModule()
-        mod.add_source(
-            test_outer_cu, nvcc_options=(
-                ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS))
-        mod.add_source(
-            test_inner_cu, nvcc_options=(
-                ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS))
+        mod.add_source_nvcc(
+            test_outer_cu,
+            options=['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)
+        mod.add_source_nvcc(
+            test_inner_cu,
+            options=['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)
+        mod.add_stdlib('cudadevrt')
         mod.link()
 
         test_kernel = mod.get_function('test_kernel')
         test_kernel(grid=(2, 1), block=(1, 1, 1))
 
 
 def test_import_pyopencl_before_pycuda():