diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py new file mode 100644 index 0000000000000000000000000000000000000000..d5435987420dec7fa4542b5c11667dfda9d3f9dd --- /dev/null +++ b/examples/demo_cdpSimplePrint.py @@ -0,0 +1,120 @@ +''' + * demo_cdpSimplePrint.py + * + * Adapted from NVIDIA's "cdpSimplePrint - Simple Print (CUDA Dynamic Parallelism)" sample + * http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-print--cuda-dynamic-parallelism- + * http://ecee.colorado.edu/~siewerts/extra/code/example_code_archive/a490dmis_code/CUDA/cuda_work/samples/0_Simple/cdpSimplePrint/cdpSimplePrint.cu + * + * From cdpSimplePrint.cu (not sure if this is Ok with NVIDIA's 38-page EULA though...): + * --------------------------------------------------------------------------- + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * --------------------------------------------------------------------------- +''' + +import sys, os +import pycuda.autoinit +import pycuda.driver as cuda +from pycuda.compiler import DynamicSourceModule + +cdpSimplePrint_cu = ''' +#include <cstdio> + +//////////////////////////////////////////////////////////////////////////////// +// Variable on the GPU used to generate unique identifiers of blocks. +//////////////////////////////////////////////////////////////////////////////// +__device__ int g_uids = 0; + +//////////////////////////////////////////////////////////////////////////////// +// Print a simple message to signal the block which is currently executing. 
+//////////////////////////////////////////////////////////////////////////////// +__device__ void print_info( int depth, int thread, int uid, int parent_uid ) +{ + if( threadIdx.x == 0 ) + { + if( depth == 0 ) + printf( "BLOCK %d launched by the host\\n", uid ); + else + { + char buffer[32]; + for( int i = 0 ; i < depth ; ++i ) + { + buffer[3*i+0] = '|'; + buffer[3*i+1] = ' '; + buffer[3*i+2] = ' '; + } + buffer[3*depth] = '\\0'; + printf( "%sBLOCK %d launched by thread %d of block %d\\n", buffer, uid, thread, parent_uid ); + } + } + __syncthreads(); +} + +//////////////////////////////////////////////////////////////////////////////// +// The kernel using CUDA dynamic parallelism. +// +// It generates a unique identifier for each block. Prints the information +// about that block. Finally, if the 'max_depth' has not been reached, the +// block launches new blocks directly from the GPU. +//////////////////////////////////////////////////////////////////////////////// +__global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid ) +{ + // We create a unique ID per block. Thread 0 does that and shares the value with the other threads. + __shared__ int s_uid; + if( threadIdx.x == 0 ) + { + s_uid = atomicAdd( &g_uids, 1 ); + } + __syncthreads(); + + // We print the ID of the block and information about its parent. + print_info( depth, thread, s_uid, parent_uid ); + + // We launch new blocks if we haven't reached the max_depth yet. + if( ++depth >= max_depth ) + { + return; + } + cdp_kernel<<<gridDim.x, blockDim.x>>>( max_depth, depth, threadIdx.x, s_uid ); +} +''' + +def main(argv): + max_depth = 2 + if len(argv) > 1: + if len(argv) == 2 and argv[1].isdigit() and int(argv[1]) >= 1 and int(argv[1]) <= 8: + max_depth = int(argv[1]) + else: + print("Usage: %s <max_depth>\t(where max_depth is a value between 1 and 8)." 
% argv[0]) + sys.exit(0) + + print("starting Simple Print (CUDA Dynamic Parallelism)") + + mod = DynamicSourceModule(cdpSimplePrint_cu) + cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call + + print("***************************************************************************") + print("The CPU launches 2 blocks of 2 threads each. On the device each thread will") + print("launch 2 blocks of 2 threads each. The GPU we will do that recursively") + print("until it reaches max_depth=%d\n" % max_depth) + print("In total 2") + num_blocks, sum = 2, 2 + for i in range(1, max_depth): + num_blocks *= 4 + print("+%d" % num_blocks) + sum += num_blocks + print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum-2)) + print("***************************************************************************\n") + + pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH, max_depth) + + print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n") + cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1) + +if __name__ == "__main__": + main(sys.argv) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index c0b5ba6368950e186c73ad85f0efd9b9bbbb22a6..d9f25752a3c55bc8562b163ec77e2e8636c6b25e 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -254,24 +254,7 @@ def compile(source, nvcc="nvcc", options=None, keep=False, return compile_plain(source, options, keep, nvcc, cache_dir, target) - -class SourceModule(object): - def __init__(self, source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[]): - self._check_arch(arch) - - cubin = compile(source, nvcc, options, keep, no_extern_c, - arch, code, cache_dir, include_dirs) - - from pycuda.driver import module_from_buffer - self.module = module_from_buffer(cubin) - - self.get_global = self.module.get_global - self.get_texref = self.module.get_texref - if hasattr(self.module, "get_surfref"): - self.get_surfref = self.module.get_surfref - 
+class CudaModule(object): def _check_arch(self, arch): if arch is None: return @@ -285,5 +268,171 @@ class SourceModule(object): except: pass + def _bind_module(self): + self.get_global = self.module.get_global + self.get_texref = self.module.get_texref + if hasattr(self.module, "get_surfref"): + self.get_surfref = self.module.get_surfref + def get_function(self, name): return self.module.get_function(name) + +class SourceModule(CudaModule): + ''' + Creates a Module from a single .cu source object linked against the + static CUDA runtime. + ''' + def __init__(self, source, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[]): + self._check_arch(arch) + + cubin = compile(source, nvcc, options, keep, no_extern_c, + arch, code, cache_dir, include_dirs) + + from pycuda.driver import module_from_buffer + self.module = module_from_buffer(cubin) + + self._bind_module() + +class DynamicModule(CudaModule): + ''' + Creates a Module from multiple .cu source, library file and/or data + objects linked against the static or dynamic CUDA runtime. + ''' + def __init__(self, nvcc='nvcc', link_options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[], message_handler=None, log_verbose=False, + cuda_libdir=None): + from pycuda.driver import Context + compute_capability = Context.get_device().compute_capability() + if compute_capability < (3,5): + raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' 
% + (compute_capability[0], compute_capability[1])) + else: + from pycuda.driver import Linker + self.linker = Linker(message_handler, link_options, log_verbose) + self._check_arch(arch) + self.nvcc = nvcc + self.keep = keep + self.no_extern_c = no_extern_c + self.arch = arch + self.code = code + self.cache_dir = cache_dir + self.include_dirs = include_dirs + self.cuda_libdir = cuda_libdir + self.libdir, self.libptn = None, None + self.module = None + + def _locate_cuda_libdir(self): + ''' + Locate the "standard" CUDA SDK library directory in the local + file system. Supports 64-Bit Windows, Linux and Mac OS X. + In case the caller supplied cuda_libdir in the constructor + other than None that value is returned unchecked, else a + best-effort attempt is made. + Precedence: + Windows: cuda_libdir > %CUDA_PATH% + Linux: cuda_libdir > $CUDA_ROOT > $LD_LIBRARY_PATH > '/usr/lib/x86_64-linux-gnu' + Returns a pair (libdir, libptn) where libdir is None in case + of failure or a string containing the absolute path of the + directory, and libptn is the %-format pattern to construct + library file names from library names on the local system. + Raises a RuntimeError in case of failure. + Links: + - Post-installation Actions + http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions + TODO: + - Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries on Linux? + - Mac OS X (Darwin) is currently treated like Linux, is that correct? 
+ - Check CMake's FindCUDA module, it might contain some helpful clues in its sources + https://cmake.org/cmake/help/v3.0/module/FindCUDA.html + https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake + - Verify all Linux code paths somehow + ''' + from os.path import isfile, join + from platform import system as platform_system + system = platform_system() + libdir, libptn = None, None + if system == 'Windows': + if self.cuda_libdir is not None: + libdir = self.cuda_libdir + elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib\\x64\\cudadevrt.lib')): + libdir = join(os.environ['CUDA_PATH'], 'lib\\x64') + libptn = '%s.lib' + elif system == 'Linux' or system == 'Darwin': + if self.cuda_libdir is not None: + libdir = self.cuda_libdir + elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): + libdir = join(os.environ['CUDA_ROOT'], 'lib64') + elif 'LD_LIBRARY_PATH' in os.environ: + for ld_path in os.environ['LD_LIBRARY_PATH'].split(':'): + if isfile(join(ld_path, 'libcudadevrt.a')): + libdir = ld_path + break + if libdir is None and isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): + libdir = '/usr/lib/x86_64-linux-gnu' + libptn = 'lib%s.a' + if libdir is None: + raise RuntimeError('Unable to locate the CUDA SDK installation ' + 'directory, set CUDA library path manually') + return libdir, libptn + + def add_source(self, source, nvcc_options=None, name='kernel.ptx'): + ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, + keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch, + code=self.code, cache_dir=self.cache_dir, + include_dirs=self.include_dirs, target="ptx") + from pycuda.driver import jit_input_type + self.linker.add_data(ptx, jit_input_type.PTX, name) + return self + + def add_data(self, data, input_type, name='unknown'): + self.linker.add_data(data, input_type, name) + return self + + def add_file(self, filename, input_type): + self.linker.add_file(filename, 
input_type) + return self + + def add_stdlib(self, libname): + if self.libdir is None: + self.libdir, self.libptn = self._locate_cuda_libdir() + from os.path import isfile, join + libpath = join(self.libdir, self.libptn % libname) + if not isfile(libpath): + raise FileNotFoundError('CUDA SDK library file "%s" not found' % libpath) + from pycuda.driver import jit_input_type + self.linker.add_file(libpath, jit_input_type.LIBRARY) + return self + + def link(self): + self.module = self.linker.link_module() + self.linker = None + self._bind_module() + return self + +class DynamicSourceModule(DynamicModule): + ''' + Creates a Module from a single .cu source object linked against the + dynamic CUDA runtime. + - compiler generates PTX relocatable device code (rdc) from source that + can be linked with other relocatable device code + - source is linked against the CUDA device runtime library cudadevrt + - library cudadevrt is statically linked into the generated Module + ''' + def __init__(self, source, nvcc="nvcc", options=[], keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[], cuda_libdir=None): + super(DynamicSourceModule, self).__init__(nvcc=nvcc, + link_options=None, keep=keep, no_extern_c=no_extern_c, + arch=arch, code=code, cache_dir=cache_dir, + include_dirs=include_dirs, cuda_libdir=cuda_libdir) + options = options[:] + if not '-rdc=true' in options: + options.append('-rdc=true') + if not '-lcudadevrt' in options: + options.append('-lcudadevrt') + self.add_source(source, nvcc_options=options) + self.add_stdlib('cudadevrt') + self.link() diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 72175979e8ee851fc3d4ee2abfb08b152297c86e..7bb23d4a7be6e8a2517cc31a5b758b6cc948f5d7 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -410,6 +410,134 @@ namespace // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 + class Linker : public boost::noncopyable + { + private: + 
py::object m_message_handler; + CUlinkState m_link_state; + bool m_log_verbose; + std::vector<CUjit_option> m_options; + std::vector<void*> m_values; + char m_info_buf[32768]; + char m_error_buf[32768]; + + void close() { + if (m_link_state != NULL) { + cuLinkDestroy(m_link_state); + m_link_state = NULL; + } + } + + template <typename T> + void add_option(CUjit_option option, T value) { + m_options.push_back(option); + m_values.push_back(reinterpret_cast<void*>(value)); + } + + void check_cu_result(const char* cu_function_name, CUresult cu_result) const { + if (cu_result != CUDA_SUCCESS) { + call_message_handler(cu_result); + throw pycuda::error(cu_function_name, cu_result, error_str().c_str()); + } + } + + void call_message_handler(CUresult cu_result) const { + if (m_message_handler != py::object()) { + m_message_handler(cu_result == CUDA_SUCCESS, info_str(), error_str()); + } + } + + const std::string info_str() const { + return std::string(m_info_buf, size_t(m_values[1])); + } + + const std::string error_str() const { + return std::string(m_error_buf, size_t(m_values[3])); + } + + public: + Linker(py::object message_handler = py::object(), + py::object py_options = py::object(), + py::object py_log_verbose = py::object(false)) + : m_message_handler(message_handler), + m_link_state(), + m_log_verbose(py::extract<bool>(py_log_verbose)) + { + add_option(CU_JIT_INFO_LOG_BUFFER, m_info_buf); + add_option(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, sizeof(m_info_buf)); + add_option(CU_JIT_ERROR_LOG_BUFFER, m_error_buf); + add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, sizeof(m_error_buf)); + add_option(CU_JIT_LOG_VERBOSE, m_log_verbose? 
1ull : 0ull); + + if (py_options.ptr() != Py_None) { + PYTHON_FOREACH(key_value, py_options) { + add_option( + py::extract<CUjit_option>(key_value[0]), + py::extract<void*>(key_value[1])()); + } + } + + const CUresult cu_result = cuLinkCreate( + (unsigned int) m_options.size(), + const_cast<CUjit_option*>(&*m_options.begin()), + const_cast<void**>(&*m_values.begin()), + &m_link_state); + check_cu_result("cuLinkCreate", cu_result); + } + + ~Linker() + { + close(); + } + + void add_data(py::object py_data, CUjitInputType input_type, py::str py_name) + { + const char *data_buf; + PYCUDA_BUFFER_SIZE_T data_buf_len; + if (PyObject_AsCharBuffer(py_data.ptr(), &data_buf, &data_buf_len) != 0) { + throw py::error_already_set(); + } + const char* name = (py_name.ptr() != Py_None)? + py::extract<const char*>(py_name) : NULL; + const CUresult cu_result = cuLinkAddData(m_link_state, input_type, + static_cast<void*>(const_cast<char*>(data_buf)), + data_buf_len, name, 0, NULL, NULL); + check_cu_result("cuLinkAddData", cu_result); + } + + void add_file(py::str py_filename, CUjitInputType input_type) + { + const char* filename = py::extract<const char*>(py_filename); + const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, + filename, 0, NULL, NULL); + check_cu_result("cuLinkAddFile", cu_result); + } + + module* link_module() + { + char* cubin_data = NULL; + size_t cubin_size = 0; + CUresult cu_result = cuLinkComplete(m_link_state, + reinterpret_cast<void**>(&cubin_data), &cubin_size); + check_cu_result("cuLinkComplete", cu_result); + + CUmodule cu_module = NULL; + cu_result = cuModuleLoadData(&cu_module, cubin_data); + check_cu_result("cuModuleLoadData", cu_result); + + call_message_handler(cu_result); + close(); + + return new module(cu_module); + } + }; +#endif + + // }}} + template <class T> PyObject *mem_obj_to_long(T const &mo) { @@ -888,6 +1016,10 @@ BOOST_PYTHON_MODULE(_driver) .value("PRINTF_FIFO_SIZE", CU_LIMIT_PRINTF_FIFO_SIZE) #if CUDAPP_CUDA_VERSION >= 3020 .value("MALLOC_HEAP_SIZE", CU_LIMIT_MALLOC_HEAP_SIZE) +#endif +#if CUDAPP_CUDA_VERSION >= 3050 + 
.value("DEV_RUNTIME_SYNC_DEPTH", CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH) + .value("DEV_RUNTIME_PENDING_LAUNCH_COUNT", CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT) #endif ; #endif @@ -1058,6 +1190,27 @@ BOOST_PYTHON_MODULE(_driver) // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 + py::enum_<CUjitInputType>("jit_input_type") + .value("CUBIN", CU_JIT_INPUT_CUBIN) + .value("PTX", CU_JIT_INPUT_PTX) + .value("FATBINARY", CU_JIT_INPUT_FATBINARY) + .value("OBJECT", CU_JIT_INPUT_OBJECT) + .value("LIBRARY", CU_JIT_INPUT_LIBRARY); + + py::class_<Linker, boost::shared_ptr<Linker> >("Linker") + .def(py::init<py::object>()) + .def(py::init<py::object, py::object>()) + .def(py::init<py::object, py::object, py::object>()) + .def("add_data", &Linker::add_data, (py::arg("data"), py::arg("input_type"), py::arg("name")=py::str("unknown"))) + .def("add_file", &Linker::add_file, (py::arg("filename"), py::arg("input_type"))) + .def("link_module", &Linker::link_module, py::return_value_policy<py::manage_new_object>()); +#endif + + // }}} + // {{{ function { typedef function cl; diff --git a/test/test_driver.py b/test/test_driver.py index 56d936589d4983788b1942c215fa222758a339a2..f88a1d67108ae91450b5af411ccc87f784bc45cd 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -838,7 +838,6 @@ class TestDriver: drv.memcpy_htod_async(gpu_ary, a_pin, stream) drv.Context.synchronize() - @pytest.mark.xfail @mark_cuda_test # https://github.com/inducer/pycuda/issues/45 def test_recursive_launch(self): @@ -894,9 +893,8 @@ class TestDriver: drv.memcpy_htod(a_gpu, a) drv.memcpy_htod(b_gpu, b) - mod = SourceModule(cuda_string, - options=['-rdc=true', '-lcudadevrt'], - keep=True) + from pycuda.compiler import DynamicSourceModule + mod = DynamicSourceModule(cuda_string, keep=True) func = mod.get_function("math") func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, @@ -918,6 +916,34 @@ class TestDriver: math(a, b, c, d, e, f) + @mark_cuda_test + def test_jit_link_module(self): + if drv.Context.get_device().compute_capability() < (3, 5): + from pytest import skip + skip("need compute capability 3.5 or higher for dynamic parallelism") + + 
test_outer_cu = '''#include <cstdio> + __global__ void test_kernel() { + extern __global__ void test_kernel_inner(); + printf("Hello outer world!\\n"); + test_kernel_inner<<<2, 1>>>(); + }''' + + test_inner_cu = '''#include <cstdio> + __global__ void test_kernel_inner() { + printf(" Hello inner world!\\n"); + }''' + + from pycuda.compiler import DynamicModule + mod = DynamicModule() + mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_source(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_stdlib('cudadevrt') + mod.link() + + test_kernel = mod.get_function('test_kernel') + test_kernel(grid=(2,1), block=(1,1,1)) + def test_import_pyopencl_before_pycuda(): try: