From 99f864919e273e700f6cef1133319e5fbd3a0f74 Mon Sep 17 00:00:00 2001 From: Lurch Date: Tue, 10 Jan 2017 17:51:34 +0100 Subject: [PATCH 01/15] Added Python class pycuda.compiler.JitLinkModule, C++ class Linker and Boost.Python wrappers for class Linker and enum CUjitInputType. --- pycuda/compiler.py | 54 +++++++++++++ src/wrapper/wrap_cudadrv.cpp | 143 +++++++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index c0b5ba63..fe0260f0 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -287,3 +287,57 @@ class SourceModule(object): def get_function(self, name): return self.module.get_function(name) + +class JitLinkModule(object): + def __init__(self, nvcc='nvcc', options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[], message_handler=None, log_verbose=False): + self._check_arch(arch) + self.nvcc = nvcc + self.keep = keep + self.no_extern_c = no_extern_c + self.arch = arch + self.code = code + self.cache_dir = cache_dir + self.include_dirs = include_dirs + self.module = None + from pycuda.driver import Linker + self.linker = Linker(message_handler, options, log_verbose) + + def add_source(self, source, nvcc_options=None, name='unknown'): + cubin = compile(source, self.nvcc, nvcc_options, self.keep, + self.no_extern_c, self.arch, self.code, self.cache_dir, + self.include_dirs) + from pycuda.driver import jit_input_type + self.linker.add_data(cubin, jit_input_type.PTX, name=name) + + def add_data(self, data, cu_jit_input_type, name='unknown'): + self.linker.add_data(data, cu_jit_input_type, name) + + def add_file(self, filename, cu_jit_input_type): + self.linker.add_file(filename, cu_jit_input_type) + + def link(self): + self.module = self.linker.link_module() + self.linker = None + + self.get_global = self.module.get_global + self.get_texref = self.module.get_texref + if hasattr(self.module, "get_surfref"): + self.get_surfref = 
self.module.get_surfref + + def _check_arch(self, arch): + if arch is None: + return + try: + from pycuda.driver import Context + capability = Context.get_device().compute_capability() + if tuple(map(int, tuple(arch.split("_")[1]))) > capability: + from warnings import warn + warn("trying to compile for a compute capability " + "higher than selected GPU") + except: + pass + + def get_function(self, name): + return self.module.get_function(name) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 72175979..4f126065 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -410,6 +410,128 @@ namespace // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 + class Linker : public boost::noncopyable + { + private: + py::object m_message_handler; + CUlinkState m_link_state; + bool m_log_verbose; + std::vector m_options; + std::vector m_values; + char m_info_buf[32768]; + char m_error_buf[32768]; + + void close() { + if (m_link_state != NULL) { + cuLinkDestroy(m_link_state); + m_link_state = NULL; + } + } + + void add_option(CUjit_option option, void* value) { + m_options.push_back(option); + m_values.push_back(value); + } + + void check_cu_result(const char* cu_function_name, CUresult cu_result) const { + if (cu_result != CUDA_SUCCESS) { + call_message_handler(cu_result); + throw pycuda::error(cu_function_name, cu_result, error_str().c_str()); + } + } + + void call_message_handler(CUresult cu_result) const { + if (m_message_handler != py::object()) { + m_message_handler(cu_result == CUDA_SUCCESS, info_str(), error_str()); + } + } + + const std::string info_str() const { + return std::string(m_info_buf, size_t(m_values[1])); + } + + const std::string error_str() const { + return std::string(m_error_buf, size_t(m_values[3])); + } + + public: + Linker(py::object message_handler = py::object(), + py::object py_options = py::object(), + py::object py_log_verbose = py::object(false)) + : m_message_handler(message_handler), + 
m_link_state(), + m_log_verbose(py::extract(py_log_verbose)) + { + add_option(CU_JIT_INFO_LOG_BUFFER, m_info_buf); + add_option(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, (void*) sizeof(m_info_buf)); + add_option(CU_JIT_ERROR_LOG_BUFFER, m_error_buf); + add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, (void*) sizeof(m_error_buf)); + add_option(CU_JIT_LOG_VERBOSE, (void*) (m_log_verbose? 1ull : 0ull)); + + if (py_options.ptr() != Py_None) { + PYTHON_FOREACH(key_value, py_options) { + add_option( + py::extract(key_value[0]), + (void*) py::extract(key_value[1])()); + } + } + + const CUresult cu_result = cuLinkCreate( + (unsigned int) m_options.size(), + const_cast(&*m_options.begin()), + const_cast(&*m_values.begin()), + &m_link_state); + check_cu_result("cuLinkCreate", cu_result); + } + + ~Linker() + { + close(); + } + + void add_data(py::object py_data, CUjitInputType input_type, py::str py_name) + { + const char *data_buf; + PYCUDA_BUFFER_SIZE_T data_buf_len; + if (PyObject_AsCharBuffer(py_data.ptr(), &data_buf, &data_buf_len) != 0) { + throw py::error_already_set(); + } + const char* name = (py_name.ptr() != Py_None)? 
py::extract(py_name) : NULL; + const CUresult cu_result = cuLinkAddData(m_link_state, input_type, (void*)data_buf, data_buf_len, name, 0, NULL, NULL); + check_cu_result("cuLinkAddData", cu_result); + } + + void add_file(py::str py_filename, CUjitInputType input_type) + { + const char* filename = py::extract(py_filename); + const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, filename, 0, NULL, NULL); + check_cu_result("cuLinkAddFile", cu_result); + } + + module* link_module() + { + const char* cubin_data = NULL; + size_t cubin_size = 0; + CUresult cu_result = cuLinkComplete(m_link_state, (void**)&cubin_data, &cubin_size); + check_cu_result("cuLinkComplete", cu_result); + + CUmodule cu_module = NULL; + cu_result = cuModuleLoadData(&cu_module, (void*)cubin_data); + check_cu_result("cuModuleLoadData", cu_result); + + call_message_handler(cu_result); + close(); + + return new module(cu_module); + } + }; +#endif + + // }}} + template PyObject *mem_obj_to_long(T const &mo) { @@ -1058,6 +1180,27 @@ BOOST_PYTHON_MODULE(_driver) // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 + py::enum_("jit_input_type") + .value("CUBIN", CU_JIT_INPUT_CUBIN) + .value("PTX", CU_JIT_INPUT_PTX) + .value("FATBINARY", CU_JIT_INPUT_FATBINARY) + .value("OBJECT", CU_JIT_INPUT_OBJECT) + .value("LIBRARY", CU_JIT_INPUT_LIBRARY); + + py::class_ >("Linker") + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def("add_data", &Linker::add_data, (py::arg("data"), py::arg("input_type"), py::arg("name")=py::str("unknown"))) + .def("add_file", &Linker::add_file, (py::arg("filename"), py::arg("input_type"))) + .def("link_module", &Linker::link_module, py::return_value_policy()); +#endif + + // }}} + // {{{ function { typedef function cl; -- GitLab From a69e5398b28bbaddf0bd9b3e7376b0b41b93fb25 Mon Sep 17 00:00:00 2001 From: Lurch Date: Tue, 10 Jan 2017 21:22:38 +0100 Subject: [PATCH 02/15] Minor name changes --- pycuda/compiler.py | 10 +++++----- 1 file changed, 5 insertions(+), 
5 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index fe0260f0..af684923 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -309,13 +309,13 @@ class JitLinkModule(object): self.no_extern_c, self.arch, self.code, self.cache_dir, self.include_dirs) from pycuda.driver import jit_input_type - self.linker.add_data(cubin, jit_input_type.PTX, name=name) + self.linker.add_data(cubin, jit_input_type.PTX, name) - def add_data(self, data, cu_jit_input_type, name='unknown'): - self.linker.add_data(data, cu_jit_input_type, name) + def add_data(self, data, input_type, name='unknown'): + self.linker.add_data(data, input_type, name) - def add_file(self, filename, cu_jit_input_type): - self.linker.add_file(filename, cu_jit_input_type) + def add_file(self, filename, input_type): + self.linker.add_file(filename, input_type) def link(self): self.module = self.linker.link_module() -- GitLab From 7240a3e0c6a6eeba6c1f4931e3d2684deb776462 Mon Sep 17 00:00:00 2001 From: Lurch Date: Tue, 10 Jan 2017 23:37:09 +0100 Subject: [PATCH 03/15] Switched from target "CUBIN" to "PTX" --- pycuda/compiler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index af684923..58a87602 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -305,11 +305,11 @@ class JitLinkModule(object): self.linker = Linker(message_handler, options, log_verbose) def add_source(self, source, nvcc_options=None, name='unknown'): - cubin = compile(source, self.nvcc, nvcc_options, self.keep, + ptx = compile(source, self.nvcc, nvcc_options, self.keep, self.no_extern_c, self.arch, self.code, self.cache_dir, - self.include_dirs) + self.include_dirs, target="ptx") from pycuda.driver import jit_input_type - self.linker.add_data(cubin, jit_input_type.PTX, name) + self.linker.add_data(ptx, jit_input_type.PTX, name) def add_data(self, data, input_type, name='unknown'): self.linker.add_data(data, input_type, name) -- GitLab From 
be87fc2054c5ab164cc085cda426de41b23de61f Mon Sep 17 00:00:00 2001 From: Lurch Date: Wed, 11 Jan 2017 00:07:22 +0100 Subject: [PATCH 04/15] Default compiled source name improved --- pycuda/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 58a87602..484c197d 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -304,7 +304,7 @@ class JitLinkModule(object): from pycuda.driver import Linker self.linker = Linker(message_handler, options, log_verbose) - def add_source(self, source, nvcc_options=None, name='unknown'): + def add_source(self, source, nvcc_options=None, name='kernel.ptx'): ptx = compile(source, self.nvcc, nvcc_options, self.keep, self.no_extern_c, self.arch, self.code, self.cache_dir, self.include_dirs, target="ptx") -- GitLab From bafba75e217a2525dc2d3b0bc7395963724ccb82 Mon Sep 17 00:00:00 2001 From: Lurch Date: Wed, 11 Jan 2017 17:20:09 +0100 Subject: [PATCH 05/15] Refactored class SourceModule, now derived from class JitLinkModule. 
--- pycuda/compiler.py | 53 +++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 38 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 484c197d..ebeeb41a 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -254,40 +254,6 @@ def compile(source, nvcc="nvcc", options=None, keep=False, return compile_plain(source, options, keep, nvcc, cache_dir, target) - -class SourceModule(object): - def __init__(self, source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[]): - self._check_arch(arch) - - cubin = compile(source, nvcc, options, keep, no_extern_c, - arch, code, cache_dir, include_dirs) - - from pycuda.driver import module_from_buffer - self.module = module_from_buffer(cubin) - - self.get_global = self.module.get_global - self.get_texref = self.module.get_texref - if hasattr(self.module, "get_surfref"): - self.get_surfref = self.module.get_surfref - - def _check_arch(self, arch): - if arch is None: - return - try: - from pycuda.driver import Context - capability = Context.get_device().compute_capability() - if tuple(map(int, tuple(arch.split("_")[1]))) > capability: - from warnings import warn - warn("trying to compile for a compute capability " - "higher than selected GPU") - except: - pass - - def get_function(self, name): - return self.module.get_function(name) - class JitLinkModule(object): def __init__(self, nvcc='nvcc', options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, @@ -305,9 +271,10 @@ class JitLinkModule(object): self.linker = Linker(message_handler, options, log_verbose) def add_source(self, source, nvcc_options=None, name='kernel.ptx'): - ptx = compile(source, self.nvcc, nvcc_options, self.keep, - self.no_extern_c, self.arch, self.code, self.cache_dir, - self.include_dirs, target="ptx") + ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, + keep=self.keep, no_extern_c=self.no_extern_c, 
arch=self.arch, + code=self.code, cache_dir=self.cache_dir, + include_dirs=self.include_dirs, target="ptx") from pycuda.driver import jit_input_type self.linker.add_data(ptx, jit_input_type.PTX, name) @@ -320,7 +287,6 @@ class JitLinkModule(object): def link(self): self.module = self.linker.link_module() self.linker = None - self.get_global = self.module.get_global self.get_texref = self.module.get_texref if hasattr(self.module, "get_surfref"): @@ -341,3 +307,14 @@ class JitLinkModule(object): def get_function(self, name): return self.module.get_function(name) + +class SourceModule(JitLinkModule): + def __init__(self, source, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[], message_handler=None, log_verbose=False): + super(SourceModule, self).__init__(nvcc=nvcc, options=None, + keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, + cache_dir=cache_dir, include_dirs=include_dirs, + message_handler=message_handler, log_verbose=log_verbose) + self.add_source(source, nvcc_options=options) + self.link() -- GitLab From f7223616d5e5d7fc2ebf8ad01c656a7d79978e20 Mon Sep 17 00:00:00 2001 From: Lurch Date: Wed, 11 Jan 2017 19:03:15 +0100 Subject: [PATCH 06/15] Added adaption of NVIDIA's "cdpSimplePrint - Simple Print (CUDA Dynamic Parallelism)" sample that demonstrates dynamic parallelism with a recursive kernel --- examples/demo_cdpSimplePrint.py | 131 ++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 examples/demo_cdpSimplePrint.py diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py new file mode 100644 index 00000000..ab78b917 --- /dev/null +++ b/examples/demo_cdpSimplePrint.py @@ -0,0 +1,131 @@ +''' + * demo_cdpSimplePrint.py + * + * Adapted from NVIDIA's "cdpSimplePrint - Simple Print (CUDA Dynamic Parallelism)" sample + * http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-print--cuda-dynamic-parallelism- + * 
http://ecee.colorado.edu/~siewerts/extra/code/example_code_archive/a490dmis_code/CUDA/cuda_work/samples/0_Simple/cdpSimplePrint/cdpSimplePrint.cu + * + * From cdpSimplePrint.cu (not sure if this is Ok with NVIDIA's 38-page EULA though...): + * --------------------------------------------------------------------------- + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * --------------------------------------------------------------------------- +''' + +import sys, os +import pycuda.autoinit +import pycuda.driver as cuda +from pycuda.compiler import JitLinkModule +from pycuda.driver import jit_input_type + +cdpSimplePrint_cu = ''' +#include + +//////////////////////////////////////////////////////////////////////////////// +// Variable on the GPU used to generate unique identifiers of blocks. +//////////////////////////////////////////////////////////////////////////////// +__device__ int g_uids = 0; + +//////////////////////////////////////////////////////////////////////////////// +// Print a simple message to signal the block which is currently executing. 
+//////////////////////////////////////////////////////////////////////////////// +__device__ void print_info( int depth, int thread, int uid, int parent_uid ) +{ + if( threadIdx.x == 0 ) + { + if( depth == 0 ) + printf( "BLOCK %d launched by the host\\n", uid ); + else + { + char buffer[32]; + for( int i = 0 ; i < depth ; ++i ) + { + buffer[3*i+0] = '|'; + buffer[3*i+1] = ' '; + buffer[3*i+2] = ' '; + } + buffer[3*depth] = '\\0'; + printf( "%sBLOCK %d launched by thread %d of block %d\\n", buffer, uid, thread, parent_uid ); + } + } + __syncthreads(); +} + +//////////////////////////////////////////////////////////////////////////////// +// The kernel using CUDA dynamic parallelism. +// +// It generates a unique identifier for each block. Prints the information +// about that block. Finally, if the 'max_depth' has not been reached, the +// block launches new blocks directly from the GPU. +//////////////////////////////////////////////////////////////////////////////// +__global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid ) +{ + // We create a unique ID per block. Thread 0 does that and shares the value with the other threads. + __shared__ int s_uid; + if( threadIdx.x == 0 ) + { + s_uid = atomicAdd( &g_uids, 1 ); + } + __syncthreads(); + + // We print the ID of the block and information about its parent. + print_info( depth, thread, s_uid, parent_uid ); + + // We launch new blocks if we haven't reached the max_depth yet. + if( ++depth >= max_depth ) + { + return; + } + cdp_kernel<<>>( max_depth, depth, threadIdx.x, s_uid ); +} +''' + +def main(argv): + max_depth = 2 + if len(argv) > 1: + if len(argv) == 2 and argv[1].isdigit() and int(argv[1]) >= 1 and int(argv[1]) <= 8: + max_depth = int(argv[1]) + else: + print("Usage: %s \t(where max_depth is a value between 1 and 8)." 
% argv[0]) + sys.exit(0) + + print("starting Simple Print (CUDA Dynamic Parallelism)") + + # TODO: this add_file('cudadevrt.lib') probably works under Windows only, + # not sure about Linux, maybe "/usr/lib/x86_64-linux-gnu/libcudadevrt.a"? + + mod = JitLinkModule() + mod.add_source(cdpSimplePrint_cu, nvcc_options=['-O3', '-rdc=true', '-lcudadevrt']) + mod.add_file(os.environ['CUDA_PATH'] + '\\lib\\x64\\cudadevrt.lib', jit_input_type.LIBRARY) + mod.link() + + cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call + + print("***************************************************************************") + print("The CPU launches 2 blocks of 2 threads each. On the device each thread will") + print("launch 2 blocks of 2 threads each. The GPU we will do that recursively") + print("until it reaches max_depth=%d\n" % max_depth) + print("In total 2") + num_blocks, sum = 2, 2 + for i in range(1, max_depth): + num_blocks *= 4 + print("+%d" % num_blocks) + sum += num_blocks + + print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum-2)) + print("***************************************************************************\n") + + # TODO: cudaDeviceSetLimit() is not available on PyCuda, works anyway on my GeForce GTX 980; maybe add that function? + #cudaDeviceSetLimit( cudaLimitDevRuntimeSyncDepth, max_depth ) + + # Launch the kernel from the CPU. 
+ print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n") + cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1) + +if __name__ == "__main__": + main(sys.argv) -- GitLab From 23132b47c6c84bb5335ed48a2389943d96b19f49 Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 12 Jan 2017 16:53:51 +0100 Subject: [PATCH 07/15] Added 3 unit tests for jit-link in test_jit_link.py --- test/test_jit_link.py | 78 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 test/test_jit_link.py diff --git a/test/test_jit_link.py b/test/test_jit_link.py new file mode 100644 index 00000000..a0728991 --- /dev/null +++ b/test/test_jit_link.py @@ -0,0 +1,78 @@ +import sys +from pycuda.tools import mark_cuda_test + +def have_pycuda(): + try: + import pycuda # noqa + return True + except: + return False + +if have_pycuda(): + import pycuda.driver as drv # noqa + from pycuda.compiler import SourceModule + from pycuda.compiler import JitLinkModule + from pycuda.driver import jit_input_type + +class TestJitLink: + @mark_cuda_test + def test_create(self): + mod = JitLinkModule() + + @mark_cuda_test + def test_static_parallelism(self): + test_cu = '''#include + __global__ void test_kernel() { + printf("Hello world!\\n"); + }''' + + mod = SourceModule(test_cu) + test_kernel = mod.get_function('test_kernel') + test_kernel(grid=(2,1), block=(1,1,1)) + + @mark_cuda_test + def test_dynamic_parallelism(self): + # nvcc error: + # calling a __global__ function("test_kernel_inner") from a + # __global__ function("test_kernel") is only allowed on the + # compute_35 architecture or above + import pycuda.autoinit + compute_capability = pycuda.autoinit.device.compute_capability() + if compute_capability[0] < 3 or (compute_capability[0] == 3 and compute_capability[1] < 5): + raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' 
% + (compute_capability[0], compute_capability[1])) + + import os, os.path + from platform import system + if system() == 'Windows': + cudadevrt = os.path.join(os.environ['CUDA_PATH'], 'lib/x64/cudadevrt.lib') + else: + cudadevrt = '/usr/lib/x86_64-linux-gnu/libcudadevrt.a' # TODO: this is just an untested guess! + if not os.path.isfile(cudadevrt): + raise Exception('Cannot locate library cudadevrt!') + + test_cu = '''#include + __global__ void test_kernel_inner() { + printf(" Hello inner world!\\n"); + } + __global__ void test_kernel() { + printf("Hello outer world!\\n"); + test_kernel_inner<<<2, 1>>>(); + }''' + + mod = JitLinkModule() + mod.add_source(test_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_file(cudadevrt, jit_input_type.LIBRARY) + mod.link() + test_kernel = mod.get_function('test_kernel') + test_kernel(grid=(2,1), block=(1,1,1)) + +if __name__ == "__main__": + # make sure that import failures get reported, instead of skipping the tests. + import pycuda.autoinit # noqa + + if len(sys.argv) > 1: + exec (sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) -- GitLab From 68dcd6ec1cd1b41155284b1cbea471b7427a5430 Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 12 Jan 2017 20:43:05 +0100 Subject: [PATCH 08/15] Added method JitLinkModule.add_stdlib() for static linking of CUDA standard libraries like "cudadevrt". --- examples/demo_cdpSimplePrint.py | 5 +---- pycuda/compiler.py | 28 +++++++++++++++++++++++++++- test/test_jit_link.py | 12 ++---------- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py index ab78b917..d00ba750 100644 --- a/examples/demo_cdpSimplePrint.py +++ b/examples/demo_cdpSimplePrint.py @@ -96,12 +96,9 @@ def main(argv): print("starting Simple Print (CUDA Dynamic Parallelism)") - # TODO: this add_file('cudadevrt.lib') probably works under Windows only, - # not sure about Linux, maybe "/usr/lib/x86_64-linux-gnu/libcudadevrt.a"? 
- mod = JitLinkModule() mod.add_source(cdpSimplePrint_cu, nvcc_options=['-O3', '-rdc=true', '-lcudadevrt']) - mod.add_file(os.environ['CUDA_PATH'] + '\\lib\\x64\\cudadevrt.lib', jit_input_type.LIBRARY) + mod.add_stdlib('cudadevrt') mod.link() cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call diff --git a/pycuda/compiler.py b/pycuda/compiler.py index ebeeb41a..6296ca25 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -257,7 +257,8 @@ def compile(source, nvcc="nvcc", options=None, keep=False, class JitLinkModule(object): def __init__(self, nvcc='nvcc', options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], message_handler=None, log_verbose=False): + include_dirs=[], message_handler=None, log_verbose=False, + cudalib_dir=None): self._check_arch(arch) self.nvcc = nvcc self.keep = keep @@ -266,6 +267,7 @@ class JitLinkModule(object): self.code = code self.cache_dir = cache_dir self.include_dirs = include_dirs + self.cudalib_dir = cudalib_dir self.module = None from pycuda.driver import Linker self.linker = Linker(message_handler, options, log_verbose) @@ -284,6 +286,30 @@ class JitLinkModule(object): def add_file(self, filename, input_type): self.linker.add_file(filename, input_type) + def add_stdlib(self, libname): + import os, os.path + from pycuda.driver import jit_input_type + from platform import system + syst = system() + if syst == 'Windows': + if self.cudalib_dir is None: + libdir = os.path.join(os.environ['CUDA_PATH'], 'lib/x64') + else: + libdir = self.cudalib_dir + libpath = os.path.join(libdir, '%s.lib' % libname) + elif syst == 'Linux': + if self.cudalib_dir is None: + libdir = '/usr/lib/x86_64-linux-gnu' + else: + libdir = self.cudalib_dir + libpath = os.path.join(libdir, 'lib%s.a' % libname) + else: + raise Exception('System "%s" currently not supported, use add_file() instead' % syst) + if os.path.isfile(libpath): + self.linker.add_file(libpath, jit_input_type.LIBRARY) + 
else: + raise Exception('Library file "%s" not found, use add_file() instead' % libpath) + def link(self): self.module = self.linker.link_module() self.linker = None diff --git a/test/test_jit_link.py b/test/test_jit_link.py index a0728991..43b54942 100644 --- a/test/test_jit_link.py +++ b/test/test_jit_link.py @@ -42,15 +42,6 @@ class TestJitLink: raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % (compute_capability[0], compute_capability[1])) - import os, os.path - from platform import system - if system() == 'Windows': - cudadevrt = os.path.join(os.environ['CUDA_PATH'], 'lib/x64/cudadevrt.lib') - else: - cudadevrt = '/usr/lib/x86_64-linux-gnu/libcudadevrt.a' # TODO: this is just an untested guess! - if not os.path.isfile(cudadevrt): - raise Exception('Cannot locate library cudadevrt!') - test_cu = '''#include __global__ void test_kernel_inner() { printf(" Hello inner world!\\n"); @@ -62,8 +53,9 @@ class TestJitLink: mod = JitLinkModule() mod.add_source(test_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) - mod.add_file(cudadevrt, jit_input_type.LIBRARY) + mod.add_stdlib('cudadevrt') mod.link() + test_kernel = mod.get_function('test_kernel') test_kernel(grid=(2,1), block=(1,1,1)) -- GitLab From e990a4e67cd84464cc43be72be737202a2f4c47d Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 13 Jan 2017 19:32:49 +0100 Subject: [PATCH 09/15] Refactored SourceModule again, new class hierarchy: + CudaModule + SourceModule + JitLinkModule + DynamicSourceModule Splitted the "old" class SourceModule in two: class CudaModule and the "new" SourceModule. - CudaModule is now the common base class for module loading and provides common methods. All methods here were moved from the old SourceModule. - The "new" SourceModule's interface and system requirements are 100% unchanged, it will work under all previous configuration scenarios. 
- JitLinkModule requires at least CUDA 5.5 and Compute Capabilty 3.5 (that is now guarded in the constructor), it's the swiss-army-knife for non-trivial linker invocations. - DynamicSourceModule is a special case of JitLinkModule, it exposes the same interface as SourceModule but enables dynamic parallelism (it comes with one extra optional argument in the constructor, cudalib_dir). It's meant for the trivial cases where the user has a single source file, as before with SourceModule. So if a PyCuda user only wants to activate dynamic parallelism, all that's required is to replace "SourceModule" with "DynamicSourceModule" given that we're able to locate "cudadevrt" automagically in method _locate_cuda_libdir(), otherwise the caller must provide the CUDA library path manually in constructor argument "cudalib_dir". I do not think this can be reduced any further. Other changes in class JitLinkModule: - Made all add_* methods and the link() method return self - Moved CUDA library path detection logic into method JitLinkModule._locate_cuda_libdir(), gets called only once from constructor --- pycuda/compiler.py | 180 +++++++++++++++++++++++++++++++-------------- 1 file changed, 126 insertions(+), 54 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 6296ca25..09674535 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -254,11 +254,59 @@ def compile(source, nvcc="nvcc", options=None, keep=False, return compile_plain(source, options, keep, nvcc, cache_dir, target) -class JitLinkModule(object): - def __init__(self, nvcc='nvcc', options=None, keep=False, +class CudaModule(object): + def _check_arch(self, arch): + if arch is None: + return + try: + from pycuda.driver import Context + capability = Context.get_device().compute_capability() + if tuple(map(int, tuple(arch.split("_")[1]))) > capability: + from warnings import warn + warn("trying to compile for a compute capability " + "higher than selected GPU") + except: + pass + + def 
_bind_module(self): + self.get_global = self.module.get_global + self.get_texref = self.module.get_texref + if hasattr(self.module, "get_surfref"): + self.get_surfref = self.module.get_surfref + + def get_function(self, name): + return self.module.get_function(name) + +class SourceModule(CudaModule): + def __init__(self, source, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[]): + self._check_arch(arch) + + cubin = compile(source, nvcc, options, keep, no_extern_c, + arch, code, cache_dir, include_dirs) + + from pycuda.driver import module_from_buffer + self.module = module_from_buffer(cubin) + + self._bind_module() + +class JitLinkModule(CudaModule): + # TODO: + # - How do we handle multiple CUDA devices? Currently using + # pycuda.autoinit.device + def __init__(self, nvcc='nvcc', link_options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], message_handler=None, log_verbose=False, - cudalib_dir=None): + cuda_libdir=None): + from pycuda.autoinit import device + compute_capability = device.compute_capability() + if compute_capability < (3,5): + raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % + (compute_capability[0], compute_capability[1])) + else: + from pycuda.driver import Linker + self.linker = Linker(message_handler, link_options, log_verbose) self._check_arch(arch) self.nvcc = nvcc self.keep = keep @@ -267,10 +315,56 @@ class JitLinkModule(object): self.code = code self.cache_dir = cache_dir self.include_dirs = include_dirs - self.cudalib_dir = cudalib_dir self.module = None - from pycuda.driver import Linker - self.linker = Linker(message_handler, options, log_verbose) + self.libdir, self.libptn = self._locate_cuda_libdir(cuda_libdir) + + def _locate_cuda_libdir(self, cuda_libdir=None): + ''' + Locate the CUDA "standard" libraries directory in the local + file system. 
Supports 64-Bit Windows, Linux and Mac OS X. + In case the caller supplied cuda_libdir other than None that + value is returned unchecked, else make a best-effort attempt. + Precedence: + Windows: cuda_libdir > %CUDA_PATH% + Linux: cuda_libdir > $CUDA_ROOT > $LD_LIBRARY_PATH > '/usr/lib/x86_64-linux-gnu' + Returns a pair (libdir, libptn) where libdir is None in case + of failure or a string containing the absolute path of the + directory, and libptn is the %-format pattern to construct + library file names from library names on the local system. + Does not raise an Excpetion in case of failure. + TODO: + - Mac OS X (Darwin) is currently treated like Linux, is that correct? + - Check CMake's FindCUDA module, it might contain some helpful clues in its sources + https://cmake.org/cmake/help/v3.0/module/FindCUDA.html + https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake + - Verify all Linux code paths somehow + ''' + from os.path import isfile, join + from platform import system as platform_system # TODO: Only since Pyhton 2.3., future or is 2.3 fine? + system = platform_system() + libdir, libptn = None, None + if system == 'Windows': + if cuda_libdir is not None: + libdir = cuda_libdir + elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib/x64/cudadevrt.lib')): + libdir = join(os.environ['CUDA_PATH'], 'lib/x64') + libptn = '%s.lib' + elif system == 'Linux' or system == 'Darwin': + if cuda_libdir is not None: + libdir = cuda_libdir + elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): + # TODO: Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries? 
+ libdir = join(os.environ['CUDA_ROOT'], 'lib64') + elif 'LD_LIBRARY_PATH' in os.environ: + # see: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions + for ld_path in os.environ['LD_LIBRARY_PATH'].split(':'): + if isfile(join(ld_path, 'libcudadevrt.a')): + libdir = ld_path + break + if libdir is None and isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): + libdir = '/usr/lib/x86_64-linux-gnu' + libptn = 'lib%s.a' + return libdir, libptn def add_source(self, source, nvcc_options=None, name='kernel.ptx'): ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, @@ -279,68 +373,46 @@ class JitLinkModule(object): include_dirs=self.include_dirs, target="ptx") from pycuda.driver import jit_input_type self.linker.add_data(ptx, jit_input_type.PTX, name) + return self def add_data(self, data, input_type, name='unknown'): self.linker.add_data(data, input_type, name) + return self def add_file(self, filename, input_type): self.linker.add_file(filename, input_type) + return self def add_stdlib(self, libname): - import os, os.path + # TODO: which error class to raise best here? 
+ if self.libdir is None: + raise Exception('Unable to find CUDA installation path, please set CUDA library path manually') + from os.path import isfile, join + libpath = join(self.libdir, self.libptn % libname) + if not isfile(libpath): + raise Exception('CUDA library file "%s" not found' % libpath) from pycuda.driver import jit_input_type - from platform import system - syst = system() - if syst == 'Windows': - if self.cudalib_dir is None: - libdir = os.path.join(os.environ['CUDA_PATH'], 'lib/x64') - else: - libdir = self.cudalib_dir - libpath = os.path.join(libdir, '%s.lib' % libname) - elif syst == 'Linux': - if self.cudalib_dir is None: - libdir = '/usr/lib/x86_64-linux-gnu' - else: - libdir = self.cudalib_dir - libpath = os.path.join(libdir, 'lib%s.a' % libname) - else: - raise Exception('System "%s" currently not supported, use add_file() instead' % syst) - if os.path.isfile(libpath): - self.linker.add_file(libpath, jit_input_type.LIBRARY) - else: - raise Exception('Library file "%s" not found, use add_file() instead' % libpath) + self.linker.add_file(libpath, jit_input_type.LIBRARY) + return self def link(self): self.module = self.linker.link_module() self.linker = None - self.get_global = self.module.get_global - self.get_texref = self.module.get_texref - if hasattr(self.module, "get_surfref"): - self.get_surfref = self.module.get_surfref + self._bind_module() + return self - def _check_arch(self, arch): - if arch is None: - return - try: - from pycuda.driver import Context - capability = Context.get_device().compute_capability() - if tuple(map(int, tuple(arch.split("_")[1]))) > capability: - from warnings import warn - warn("trying to compile for a compute capability " - "higher than selected GPU") - except: - pass - - def get_function(self, name): - return self.module.get_function(name) - -class SourceModule(JitLinkModule): - def __init__(self, source, nvcc="nvcc", options=None, keep=False, +class DynamicSourceModule(JitLinkModule): + def __init__(self, 
source, nvcc="nvcc", options=[], keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], message_handler=None, log_verbose=False): - super(SourceModule, self).__init__(nvcc=nvcc, options=None, - keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, - cache_dir=cache_dir, include_dirs=include_dirs, - message_handler=message_handler, log_verbose=log_verbose) + include_dirs=[], cuda_libdir=None): + super(DynamicSourceModule, self).__init__(nvcc=nvcc, + link_options=None, keep=keep, no_extern_c=no_extern_c, + arch=arch, code=code, cache_dir=cache_dir, + include_dirs=include_dirs, cuda_libdir=cuda_libdir) + if not '-rdc=true' in options: + options.append('-rdc=true') + if not '-lcudadevrt' in options: + options.append('-lcudadevrt') self.add_source(source, nvcc_options=options) + self.add_stdlib('cudadevrt') self.link() -- GitLab From e03e9f74b667acdcfccd7a05abe568b95a12ac53 Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 13 Jan 2017 20:06:44 +0100 Subject: [PATCH 10/15] Changed demo_cdpSimplePrint.py to use DynamicSourceModule. Using Context.set_limit() now. Needed a missing limit enum CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH (in CUDA8/v8.0/include/cuda.h[975], available since 3.5), added CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH and CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT (same) to pycuda.driver.limit. 
--- examples/demo_cdpSimplePrint.py | 14 +++----------- src/wrapper/wrap_cudadrv.cpp | 4 ++++ 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py index d00ba750..d5435987 100644 --- a/examples/demo_cdpSimplePrint.py +++ b/examples/demo_cdpSimplePrint.py @@ -20,8 +20,7 @@ import sys, os import pycuda.autoinit import pycuda.driver as cuda -from pycuda.compiler import JitLinkModule -from pycuda.driver import jit_input_type +from pycuda.compiler import DynamicSourceModule cdpSimplePrint_cu = ''' #include @@ -96,11 +95,7 @@ def main(argv): print("starting Simple Print (CUDA Dynamic Parallelism)") - mod = JitLinkModule() - mod.add_source(cdpSimplePrint_cu, nvcc_options=['-O3', '-rdc=true', '-lcudadevrt']) - mod.add_stdlib('cudadevrt') - mod.link() - + mod = DynamicSourceModule(cdpSimplePrint_cu) cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call print("***************************************************************************") @@ -113,14 +108,11 @@ def main(argv): num_blocks *= 4 print("+%d" % num_blocks) sum += num_blocks - print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum-2)) print("***************************************************************************\n") - # TODO: cudaDeviceSetLimit() is not available on PyCuda, works anyway on my GeForce GTX 980; maybe add that function? - #cudaDeviceSetLimit( cudaLimitDevRuntimeSyncDepth, max_depth ) + pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH, max_depth) - # Launch the kernel from the CPU. 
print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n") cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 4f126065..76f30df4 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -1010,6 +1010,10 @@ BOOST_PYTHON_MODULE(_driver) .value("PRINTF_FIFO_SIZE", CU_LIMIT_PRINTF_FIFO_SIZE) #if CUDAPP_CUDA_VERSION >= 3020 .value("MALLOC_HEAP_SIZE", CU_LIMIT_MALLOC_HEAP_SIZE) +#endif +#if CUDAPP_CUDA_VERSION >= 3050 + .value("DEV_RUNTIME_SYNC_DEPTH", CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH) + .value("DEV_RUNTIME_PENDING_LAUNCH_COUNT", CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT) #endif ; #endif -- GitLab From d0666aed20c8da41e83d83a4c0f7859687a662bb Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 13 Jan 2017 20:48:05 +0100 Subject: [PATCH 11/15] Changed TestDriver.test_recursive_launch() in test/test_driver.py to use pycuda.compiler.DynamicSourceModule, removed xfail marker Added a test case for pycuda.compiler.JitLinkModule: compiles and links two .cu files. Removed test/test_jit_link.py, all further tests in test/test_driver.py from now on Removed bad "import pycuda.autoinit" (how did this even get there?) 
--- test/test_driver.py | 33 ++++++++++++++++++-- test/test_jit_link.py | 70 ------------------------------------------- 2 files changed, 30 insertions(+), 73 deletions(-) delete mode 100644 test/test_jit_link.py diff --git a/test/test_driver.py b/test/test_driver.py index 56d93658..e951134d 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -894,9 +894,8 @@ class TestDriver: drv.memcpy_htod(a_gpu, a) drv.memcpy_htod(b_gpu, b) - mod = SourceModule(cuda_string, - options=['-rdc=true', '-lcudadevrt'], - keep=True) + from pycuda.compiler import DynamicSourceModule + mod = DynamicSourceModule(cuda_string, keep=True) func = mod.get_function("math") func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, @@ -918,6 +917,34 @@ class TestDriver: math(a, b, c, d, e, f) + @mark_cuda_test + def test_jit_link_module(self): + if drv.Context.get_device().compute_capability() < (3, 5): + from pytest import skip + skip("need compute capability 3.5 or higher for dynamic parallelism") + + test_outer_cu = '''#include + __global__ void test_kernel() { + extern __global__ void test_kernel_inner(); + printf("Hello outer world!\\n"); + test_kernel_inner<<<2, 1>>>(); + }''' + + test_inner_cu = '''#include + __global__ void test_kernel_inner() { + printf(" Hello inner world!\\n"); + }''' + + from pycuda.compiler import JitLinkModule + mod = JitLinkModule() + mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_source(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_stdlib('cudadevrt') + mod.link() + + test_kernel = mod.get_function('test_kernel') + test_kernel(grid=(2,1), block=(1,1,1)) + def test_import_pyopencl_before_pycuda(): try: diff --git a/test/test_jit_link.py b/test/test_jit_link.py deleted file mode 100644 index 43b54942..00000000 --- a/test/test_jit_link.py +++ /dev/null @@ -1,70 +0,0 @@ -import sys -from pycuda.tools import mark_cuda_test - -def have_pycuda(): - try: - import pycuda # noqa - return True - except: - return False 
- -if have_pycuda(): - import pycuda.driver as drv # noqa - from pycuda.compiler import SourceModule - from pycuda.compiler import JitLinkModule - from pycuda.driver import jit_input_type - -class TestJitLink: - @mark_cuda_test - def test_create(self): - mod = JitLinkModule() - - @mark_cuda_test - def test_static_parallelism(self): - test_cu = '''#include - __global__ void test_kernel() { - printf("Hello world!\\n"); - }''' - - mod = SourceModule(test_cu) - test_kernel = mod.get_function('test_kernel') - test_kernel(grid=(2,1), block=(1,1,1)) - - @mark_cuda_test - def test_dynamic_parallelism(self): - # nvcc error: - # calling a __global__ function("test_kernel_inner") from a - # __global__ function("test_kernel") is only allowed on the - # compute_35 architecture or above - import pycuda.autoinit - compute_capability = pycuda.autoinit.device.compute_capability() - if compute_capability[0] < 3 or (compute_capability[0] == 3 and compute_capability[1] < 5): - raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % - (compute_capability[0], compute_capability[1])) - - test_cu = '''#include - __global__ void test_kernel_inner() { - printf(" Hello inner world!\\n"); - } - __global__ void test_kernel() { - printf("Hello outer world!\\n"); - test_kernel_inner<<<2, 1>>>(); - }''' - - mod = JitLinkModule() - mod.add_source(test_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) - mod.add_stdlib('cudadevrt') - mod.link() - - test_kernel = mod.get_function('test_kernel') - test_kernel(grid=(2,1), block=(1,1,1)) - -if __name__ == "__main__": - # make sure that import failures get reported, instead of skipping the tests. 
- import pycuda.autoinit # noqa - - if len(sys.argv) > 1: - exec (sys.argv[1]) - else: - from py.test.cmdline import main - main([__file__]) -- GitLab From f6b37048d6412af4b60a0da6bbe4f3ebea834bf2 Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 13 Jan 2017 20:50:21 +0100 Subject: [PATCH 12/15] Removed xfail marker from TestDriver.test_recursive_launch() in test/test_driver.py --- test/test_driver.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_driver.py b/test/test_driver.py index e951134d..6e705ff7 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -838,7 +838,6 @@ class TestDriver: drv.memcpy_htod_async(gpu_ary, a_pin, stream) drv.Context.synchronize() - @pytest.mark.xfail @mark_cuda_test # https://github.com/inducer/pycuda/issues/45 def test_recursive_launch(self): -- GitLab From 179bba661a5c5494fc1f4ad1b4a8986077749eb0 Mon Sep 17 00:00:00 2001 From: Lurch Date: Sat, 14 Jan 2017 09:32:24 +0100 Subject: [PATCH 13/15] Fixed some minor TODOs --- pycuda/compiler.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 09674535..9b325410 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -292,15 +292,12 @@ class SourceModule(CudaModule): self._bind_module() class JitLinkModule(CudaModule): - # TODO: - # - How do we handle multiple CUDA devices? Currently using - # pycuda.autoinit.device def __init__(self, nvcc='nvcc', link_options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], message_handler=None, log_verbose=False, cuda_libdir=None): - from pycuda.autoinit import device - compute_capability = device.compute_capability() + from pycuda.driver import Context + compute_capability = Context.get_device().compute_capability() if compute_capability < (3,5): raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' 
% (compute_capability[0], compute_capability[1])) @@ -332,7 +329,11 @@ class JitLinkModule(CudaModule): directory, and libptn is the %-format pattern to construct library file names from library names on the local system. Does not raise an Excpetion in case of failure. + Links + - Post-installation Actions + http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions TODO: + - Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries on Linux? - Mac OS X (Darwin) is currently treated like Linux, is that correct? - Check CMake's FindCUDA module, it might contain some helpful clues in its sources https://cmake.org/cmake/help/v3.0/module/FindCUDA.html @@ -340,7 +341,7 @@ class JitLinkModule(CudaModule): - Verify all Linux code paths somehow ''' from os.path import isfile, join - from platform import system as platform_system # TODO: Only since Pyhton 2.3., future or is 2.3 fine? + from platform import system as platform_system system = platform_system() libdir, libptn = None, None if system == 'Windows': @@ -353,10 +354,8 @@ class JitLinkModule(CudaModule): if cuda_libdir is not None: libdir = cuda_libdir elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): - # TODO: Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries? libdir = join(os.environ['CUDA_ROOT'], 'lib64') elif 'LD_LIBRARY_PATH' in os.environ: - # see: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions for ld_path in os.environ['LD_LIBRARY_PATH'].split(':'): if isfile(join(ld_path, 'libcudadevrt.a')): libdir = ld_path @@ -384,13 +383,12 @@ class JitLinkModule(CudaModule): return self def add_stdlib(self, libname): - # TODO: which error class to raise best here? 
if self.libdir is None: - raise Exception('Unable to find CUDA installation path, please set CUDA library path manually') + raise RuntimeError('Unable to find CUDA installation path, please set CUDA library path manually') from os.path import isfile, join libpath = join(self.libdir, self.libptn % libname) if not isfile(libpath): - raise Exception('CUDA library file "%s" not found' % libpath) + raise FileNotFoundError('CUDA library file "%s" not found' % libpath) from pycuda.driver import jit_input_type self.linker.add_file(libpath, jit_input_type.LIBRARY) return self @@ -409,6 +407,7 @@ class DynamicSourceModule(JitLinkModule): link_options=None, keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, cache_dir=cache_dir, include_dirs=include_dirs, cuda_libdir=cuda_libdir) + options = options[:] if not '-rdc=true' in options: options.append('-rdc=true') if not '-lcudadevrt' in options: -- GitLab From 1a25b19853c003945e60941d9231ee00672475fd Mon Sep 17 00:00:00 2001 From: Lurch Date: Sat, 14 Jan 2017 10:44:25 +0100 Subject: [PATCH 14/15] Removed the pointer type conversion hacks from C++ Linker class --- src/wrapper/wrap_cudadrv.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 76f30df4..9756b035 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -431,9 +431,10 @@ namespace } } - void add_option(CUjit_option option, void* value) { + template + void add_option(CUjit_option option, T value) { m_options.push_back(option); - m_values.push_back(value); + m_values.push_back(reinterpret_cast(value)); } void check_cu_result(const char* cu_function_name, CUresult cu_result) const { @@ -466,16 +467,16 @@ namespace m_log_verbose(py::extract(py_log_verbose)) { add_option(CU_JIT_INFO_LOG_BUFFER, m_info_buf); - add_option(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, (void*) sizeof(m_info_buf)); + add_option(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, 
sizeof(m_info_buf)); add_option(CU_JIT_ERROR_LOG_BUFFER, m_error_buf); - add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, (void*) sizeof(m_error_buf)); - add_option(CU_JIT_LOG_VERBOSE, (void*) (m_log_verbose? 1ull : 0ull)); + add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, sizeof(m_error_buf)); + add_option(CU_JIT_LOG_VERBOSE, m_log_verbose? 1ull : 0ull); if (py_options.ptr() != Py_None) { PYTHON_FOREACH(key_value, py_options) { add_option( py::extract(key_value[0]), - (void*) py::extract(key_value[1])()); + py::extract(key_value[1])()); } } @@ -499,8 +500,11 @@ namespace if (PyObject_AsCharBuffer(py_data.ptr(), &data_buf, &data_buf_len) != 0) { throw py::error_already_set(); } - const char* name = (py_name.ptr() != Py_None)? py::extract(py_name) : NULL; - const CUresult cu_result = cuLinkAddData(m_link_state, input_type, (void*)data_buf, data_buf_len, name, 0, NULL, NULL); + const char* name = (py_name.ptr() != Py_None)? + py::extract(py_name) : NULL; + const CUresult cu_result = cuLinkAddData(m_link_state, input_type, + static_cast(const_cast(data_buf)), + data_buf_len, name, 0, NULL, NULL); check_cu_result("cuLinkAddData", cu_result); } @@ -513,13 +517,14 @@ namespace module* link_module() { - const char* cubin_data = NULL; + char* cubin_data = NULL; size_t cubin_size = 0; - CUresult cu_result = cuLinkComplete(m_link_state, (void**)&cubin_data, &cubin_size); + CUresult cu_result = cuLinkComplete(m_link_state, + reinterpret_cast(&cubin_data), &cubin_size); check_cu_result("cuLinkComplete", cu_result); CUmodule cu_module = NULL; - cu_result = cuModuleLoadData(&cu_module, (void*)cubin_data); + cu_result = cuModuleLoadData(&cu_module, cubin_data); check_cu_result("cuModuleLoadData", cu_result); call_message_handler(cu_result); -- GitLab From 16fc15a0e479b3a6e7f3cde4ce54d33f6aef6207 Mon Sep 17 00:00:00 2001 From: Lurch Date: Sat, 14 Jan 2017 13:08:32 +0100 Subject: [PATCH 15/15] Moved invocation of _locate_cuda_libdir() from constructor to add_stdlib(), it should be 
only called when actually needed Renamed JitLinkModule to DynamicModule --- pycuda/compiler.py | 55 +++++++++++++++++++++++++----------- src/wrapper/wrap_cudadrv.cpp | 3 +- test/test_driver.py | 4 +-- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 9b325410..d9f25752 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -278,6 +278,10 @@ class CudaModule(object): return self.module.get_function(name) class SourceModule(CudaModule): + ''' + Creates a Module from a single .cu source object linked against the + static CUDA runtime. + ''' def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): @@ -291,7 +295,11 @@ class SourceModule(CudaModule): self._bind_module() -class JitLinkModule(CudaModule): +class DynamicModule(CudaModule): + ''' + Creates a Module from multiple .cu source, library file and/or data + objects linked against the static or dynamic CUDA runtime. + ''' def __init__(self, nvcc='nvcc', link_options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], message_handler=None, log_verbose=False, @@ -312,15 +320,17 @@ class JitLinkModule(CudaModule): self.code = code self.cache_dir = cache_dir self.include_dirs = include_dirs + self.cuda_libdir = cuda_libdir + self.libdir, self.libptn = None, None self.module = None - self.libdir, self.libptn = self._locate_cuda_libdir(cuda_libdir) - def _locate_cuda_libdir(self, cuda_libdir=None): + def _locate_cuda_libdir(self): ''' - Locate the CUDA "standard" libraries directory in the local + Locate the "standard" CUDA SDK library directory in the local file system. Supports 64-Bit Windows, Linux and Mac OS X. - In case the caller supplied cuda_libdir other than None that - value is returned unchecked, else make a best-effort attempt. 
+ In case the caller supplied cuda_libdir in the constructor + other than None that value is returned unchecked, else a + best-effort attempt is made. Precedence: Windows: cuda_libdir > %CUDA_PATH% Linux: cuda_libdir > $CUDA_ROOT > $LD_LIBRARY_PATH > '/usr/lib/x86_64-linux-gnu' @@ -328,8 +338,8 @@ class JitLinkModule(CudaModule): of failure or a string containing the absolute path of the directory, and libptn is the %-format pattern to construct library file names from library names on the local system. - Does not raise an Excpetion in case of failure. - Links + Raises a RuntimeError in case of failure. + Links: - Post-installation Actions http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions TODO: @@ -345,14 +355,14 @@ class JitLinkModule(CudaModule): system = platform_system() libdir, libptn = None, None if system == 'Windows': - if cuda_libdir is not None: - libdir = cuda_libdir - elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib/x64/cudadevrt.lib')): - libdir = join(os.environ['CUDA_PATH'], 'lib/x64') + if self.cuda_libdir is not None: + libdir = self.cuda_libdir + elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib\\x64\\cudadevrt.lib')): + libdir = join(os.environ['CUDA_PATH'], 'lib\\x64') libptn = '%s.lib' elif system == 'Linux' or system == 'Darwin': - if cuda_libdir is not None: - libdir = cuda_libdir + if self.cuda_libdir is not None: + libdir = self.cuda_libdir elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): libdir = join(os.environ['CUDA_ROOT'], 'lib64') elif 'LD_LIBRARY_PATH' in os.environ: @@ -363,6 +373,9 @@ class JitLinkModule(CudaModule): if libdir is None and isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): libdir = '/usr/lib/x86_64-linux-gnu' libptn = 'lib%s.a' + if libdir is None: + raise RuntimeError('Unable to locate the CUDA SDK installation ' + 'directory, set CUDA library path manually') return 
libdir, libptn def add_source(self, source, nvcc_options=None, name='kernel.ptx'): @@ -384,11 +397,11 @@ class JitLinkModule(CudaModule): def add_stdlib(self, libname): if self.libdir is None: - raise RuntimeError('Unable to find CUDA installation path, please set CUDA library path manually') + self.libdir, self.libptn = self._locate_cuda_libdir() from os.path import isfile, join libpath = join(self.libdir, self.libptn % libname) if not isfile(libpath): - raise FileNotFoundError('CUDA library file "%s" not found' % libpath) + raise FileNotFoundError('CUDA SDK library file "%s" not found' % libpath) from pycuda.driver import jit_input_type self.linker.add_file(libpath, jit_input_type.LIBRARY) return self @@ -399,7 +412,15 @@ class JitLinkModule(CudaModule): self._bind_module() return self -class DynamicSourceModule(JitLinkModule): +class DynamicSourceModule(DynamicModule): + ''' + Creates a Module from a single .cu source object linked against the + dynamic CUDA runtime. + - compiler generates PTX relocatable device code (rdc) from source that + can be linked with other relocatable device code + - source is linked against the CUDA device runtime library cudadevrt + - library cudadevrt is statically linked into the generated Module + ''' def __init__(self, source, nvcc="nvcc", options=[], keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], cuda_libdir=None): diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 9756b035..7bb23d4a 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -511,7 +511,8 @@ namespace void add_file(py::str py_filename, CUjitInputType input_type) { const char* filename = py::extract(py_filename); - const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, filename, 0, NULL, NULL); + const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, + filename, 0, NULL, NULL); check_cu_result("cuLinkAddFile", cu_result); } diff --git 
a/test/test_driver.py b/test/test_driver.py index 6e705ff7..f88a1d67 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -934,8 +934,8 @@ class TestDriver: printf(" Hello inner world!\\n"); }''' - from pycuda.compiler import JitLinkModule - mod = JitLinkModule() + from pycuda.compiler import DynamicModule + mod = DynamicModule() mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) mod.add_source(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) mod.add_stdlib('cudadevrt') -- GitLab