From ebf0a6e0fb0a3a1b38fc08e155b23ff50eb81bce Mon Sep 17 00:00:00 2001 From: Lurch Date: Sun, 15 Jan 2017 12:26:12 +0100 Subject: [PATCH 01/16] Created new experimental branch for NVRTC --- pycuda/compiler.py | 2 +- setup.py | 2 +- src/wrapper/wrap_cudadrv.cpp | 106 +++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index d9f25752..494dd330 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -378,7 +378,7 @@ class DynamicModule(CudaModule): 'directory, set CUDA library path manually') return libdir, libptn - def add_source(self, source, nvcc_options=None, name='kernel.ptx'): + def add_source(self, source, nvcc_options=[], name='kernel.ptx'): ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch, code=self.code, cache_dir=self.cache_dir, diff --git a/setup.py b/setup.py index 775141b0..2f909cb3 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ def main(): LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"] LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"] - + conf["CUDADRV_LIBNAME"]) + + conf["CUDADRV_LIBNAME"]) + ["nvrtc"] if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 7bb23d4a..74067b7b 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -413,6 +413,106 @@ namespace // {{{ linker #if CUDAPP_CUDA_VERSION >= 5050 + #include // NOTE: this requires us to link static SDK library "nvrtc" + + class CudaCompiler : public boost::noncopyable + { + private: + nvrtcProgram m_prog; + std::vector m_ptx_vec; + + void check_nvrtc_result(const char* nvrtc_function_name, nvrtcResult nvrtc_result) const { + if (nvrtc_result != NVRTC_SUCCESS) { + std::string error_msg = nvrtc_function_name; + error_msg += " failed: "; + error_msg += nvrtcGetErrorString(nvrtc_result); + const std::string compilation_log = compilation_log_str(); + if (compilation_log.length() > 0) { + error_msg += " - "; + error_msg += compilation_log; + } + throw std::runtime_error(error_msg); + } + } + + const std::string compilation_log_str() const { + size_t result_length; + nvrtcResult nvrtc_result = nvrtcGetProgramLogSize(m_prog, &result_length); + check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); + if (result_length > 0) { + std::vector result_vec(result_length); + nvrtc_result = nvrtcGetProgramLog(m_prog, &result_vec[0]); + check_nvrtc_result("nvrtcGetProgramLog", nvrtc_result); + return std::string(&result_vec[0], result_length); + } + else { + return std::string(); + } + } + + public: + CudaCompiler(py::object py_src) // , py::object py_name=py::object()) + : m_prog() + { + const char* src = py::extract(py_src); + const char* name = NULL; + nvrtcResult nvrtc_result = nvrtcCreateProgram( + &m_prog, // Pointer to the result CUDA Runtime Compilation program + src, // CUDA program source, 0-terminated string + name, // CUDA program name, can be NULL (defaults to "default_program") + 0, // Number of headers >= 0 + NULL, // const char** headers, - Contents of the headers, can be NULL when numHeaders == 0 + NULL); // const char** includeNames) - Name of each header by which they can be included in the CUDA program source, can be NULL + check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); + } + + ~CudaCompiler() + { + if (m_prog != NULL) { + 
nvrtcDestroyProgram(&m_prog); + m_prog = NULL; + } + } + + void declare_function(py::object py_function_name) { + const char* function_name = py::extract(py_function_name); + const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); + check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + } + + py::object compile(py::object py_compile_options) { + std::vector compile_options; + PYTHON_FOREACH(py_option, py_compile_options) { + compile_options.push_back(py::extract(py_option)); + } + + nvrtcResult nvrtc_result; + nvrtc_result = nvrtcCompileProgram(m_prog, (int)compile_options.size(), &compile_options[0]); + check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); + + size_t ptx_size; + nvrtc_result = nvrtcGetPTXSize(m_prog, &ptx_size); + check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); + + m_ptx_vec = std::vector(ptx_size); + nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); + check_nvrtc_result("nvrtcGetPTX", nvrtc_result); + + // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) + // http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes + PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); + return py::object(py::handle<>(memory_view)); + } + + std::string mangled_function(py::object py_function_name) { + const char* function_name = py::extract(py_function_name); + const char* lowered_name = NULL; + const nvrtcResult nvrtc_result = nvrtcGetLoweredName(m_prog, function_name, &lowered_name); + check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); + return std::string(lowered_name); + } + }; + class Linker : public boost::noncopyable { private: @@ -1200,6 +1300,12 @@ BOOST_PYTHON_MODULE(_driver) .value("OBJECT", CU_JIT_INPUT_OBJECT) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); + py::class_ >("CudaCompiler", py::init()) +// .def(py::init()) + .def("declare_function", &CudaCompiler::declare_function, (py::arg("py_function_name"))) + .def("compile", &CudaCompiler::compile, (py::arg("compile_options")=py::object())) + .def("mangled_function", &CudaCompiler::mangled_function, (py::arg("py_function_name"))); + py::class_ >("Linker") .def(py::init()) .def(py::init()) -- GitLab From a408c26b7531dc709f95497b600a010c6bcd2d7f Mon Sep 17 00:00:00 2001 From: Lurch Date: Sun, 15 Jan 2017 12:54:50 +0100 Subject: [PATCH 02/16] Fixed Python version compatibility issue with PyMemoryView_FromMemory() --- src/wrapper/wrap_cudadrv.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 74067b7b..0d2ef389 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -498,10 +498,15 @@ namespace nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); check_nvrtc_result("nvrtcGetPTX", nvrtc_result); +#if PY_VERSION_HEX >= 0x03030000 // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) // http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes + // new since Python 3.3 PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); return py::object(py::handle<>(memory_view)); +#else + return py::str(std::string(&m_ptx_vec[0], ptx_size)); +#endif } std::string mangled_function(py::object py_function_name) { -- GitLab From 5e0b780b9366442fc2fa88c045ae8074c45bebed Mon Sep 17 00:00:00 2001 From: Lurch Date: Mon, 16 Jan 2017 06:38:49 +0100 Subject: [PATCH 03/16] Several improvements 
to the Compiler class --- src/wrapper/wrap_cudadrv.cpp | 74 ++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 0d2ef389..febe57a9 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -451,11 +451,15 @@ namespace } public: - CudaCompiler(py::object py_src) // , py::object py_name=py::object()) + CudaCompiler(py::object py_src, py::object py_name=py::object()) : m_prog() { const char* src = py::extract(py_src); const char* name = NULL; + if (py_name.ptr() != Py_None) { + name = py::extract(py_name); + } + nvrtcResult nvrtc_result = nvrtcCreateProgram( &m_prog, // Pointer to the result CUDA Runtime Compilation program src, // CUDA program source, 0-terminated string @@ -475,46 +479,58 @@ namespace } void declare_function(py::object py_function_name) { - const char* function_name = py::extract(py_function_name); - const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); - check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + const char* function_name = py::extract(py_function_name); + const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); + check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); } - py::object compile(py::object py_compile_options) { - std::vector compile_options; + py::object compile(py::object py_compile_options=py::object(), py::object py_function_names=py::object()) { + std::vector compile_options; + if (py_compile_options.ptr() != Py_None) { PYTHON_FOREACH(py_option, py_compile_options) { compile_options.push_back(py::extract(py_option)); } + } - nvrtcResult nvrtc_result; - nvrtc_result = nvrtcCompileProgram(m_prog, (int)compile_options.size(), &compile_options[0]); - check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); + std::vector function_names; + if (py_function_names.ptr() != Py_None) { + PYTHON_FOREACH(py_function_name, py_function_names) { + const char* function_name = py::extract(py_function_name); + const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); + check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + function_names.push_back(function_name); + } + } - size_t ptx_size; - nvrtc_result = nvrtcGetPTXSize(m_prog, &ptx_size); - check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); + nvrtcResult nvrtc_result; + nvrtc_result = nvrtcCompileProgram(m_prog, (int)compile_options.size(), &compile_options[0]); + check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); - m_ptx_vec = std::vector(ptx_size); - nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); - check_nvrtc_result("nvrtcGetPTX", nvrtc_result); + size_t ptx_size; + nvrtc_result = nvrtcGetPTXSize(m_prog, &ptx_size); + check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); + + m_ptx_vec = std::vector(ptx_size); + nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); + check_nvrtc_result("nvrtcGetPTX", nvrtc_result); #if PY_VERSION_HEX >= 0x03030000 - // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) - // http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes - // new since Python 3.3 - PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); - return py::object(py::handle<>(memory_view)); + // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) + // 
http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes + // new since Python 3.3 + PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); + return py::object(py::handle<>(memory_view)); #else - return py::str(std::string(&m_ptx_vec[0], ptx_size)); + return py::str(std::string(&m_ptx_vec[0], ptx_size)); #endif - } + } std::string mangled_function(py::object py_function_name) { - const char* function_name = py::extract(py_function_name); - const char* lowered_name = NULL; - const nvrtcResult nvrtc_result = nvrtcGetLoweredName(m_prog, function_name, &lowered_name); - check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); - return std::string(lowered_name); + const char* function_name = py::extract(py_function_name); + const char* lowered_name = NULL; + const nvrtcResult nvrtc_result = nvrtcGetLoweredName(m_prog, function_name, &lowered_name); + check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); + return std::string(lowered_name); } }; @@ -1306,9 +1322,9 @@ BOOST_PYTHON_MODULE(_driver) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); py::class_ >("CudaCompiler", py::init()) -// .def(py::init()) + .def(py::init()) .def("declare_function", &CudaCompiler::declare_function, (py::arg("py_function_name"))) - .def("compile", &CudaCompiler::compile, (py::arg("compile_options")=py::object())) + .def("compile", &CudaCompiler::compile, (py::arg("compile_options")=py::object(), py::arg("function_names")=py::object())) .def("mangled_function", &CudaCompiler::mangled_function, (py::arg("py_function_name"))); py::class_ >("Linker") -- GitLab From 6075cad00a8072d1b9862f49fb3d8e4097260e5f Mon Sep 17 00:00:00 2001 From: Lurch Date: Tue, 17 Jan 2017 19:44:36 +0100 Subject: [PATCH 04/16] Completed NVRTC implementation --- pycuda/compiler.py | 54 ++++++++ src/wrapper/wrap_cudadrv.cpp | 253 ++++++++++++++++++++++------------- 2 files changed, 213 insertions(+), 94 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 494dd330..f38ad95d 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -322,6 +322,7 @@ class DynamicModule(CudaModule): self.include_dirs = include_dirs self.cuda_libdir = cuda_libdir self.libdir, self.libptn = None, None + self.mangled_exports = None self.module = None def _locate_cuda_libdir(self): @@ -387,6 +388,53 @@ class DynamicModule(CudaModule): self.linker.add_data(ptx, jit_input_type.PTX, name) return self + def add_source_nvrtc(self, source, exports=[], compile_options=[], + name='kernel.ptx', compiler=None): + ''' + NOTE: + - self.code is not supported with NVRTC and ignored here + - headers in _find_pycuda_include_path() are supported with NVRTC: + pycuda-helpers.hpp, pycuda-complex.hpp tested (they need -I/include) + TODO: + - the include paths _find_pycuda_include_path() and CUDA_PATH/CUDA_ROOT are constant, they could be moved to NvrtcCompiler (C++) + - the compiler option '-arch' could also be moved to NvrtcCompiler + ''' + if not hasattr(self.linker, "add_nvrtc"): + raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') + + # append -arch compiler option, if not already exists + compile_options = compile_options[:] + if not '-arch' in compile_options: + if self.arch is not None: + # nvcc: "-arch sm_%d%d" + # nvrtc: "-arch compute_%d%d" + # self.arch: "sm_%d%d" or None + arch_compute_capability = (int(self.arch[3]), int(self.arch[4])) + else: + from pycuda.driver import Context + arch_compute_capability = Context.get_device().compute_capability() + compile_options.extend(['-arch', 
'compute_%u%u' % arch_compute_capability]) + + # append -I include directory options + compile_options.append('-I%s' % _find_pycuda_include_path()) + if 'CUDA_PATH' in os.environ: + compile_options.append('-I%s\\include' % os.environ['CUDA_PATH']) + elif 'CUDA_ROOT' in os.environ: + compile_options.append('-I%s/include' % os.environ['CUDA_ROOT']) + for include_dir in self.include_dirs: + compile_options.append('-I%s' % include_dir) + + # compile and link + if self.mangled_exports is None and len(exports) > 0: + self.mangled_exports = {} + if compiler is None: + from pycuda.driver import NvrtcCompiler + compiler = NvrtcCompiler() + self.linker.add_nvrtc(source, compiler, name=name, + compile_options=compile_options, exports=exports, + mangled_exports=self.mangled_exports) + return self + def add_data(self, data, input_type, name='unknown'): self.linker.add_data(data, input_type, name) return self @@ -412,6 +460,12 @@ class DynamicModule(CudaModule): self._bind_module() return self + def get_function(self, name): + if self.mangled_exports is None or name not in self.mangled_exports: + return self.module.get_function(name) + else: + return self.module.get_function(self.mangled_exports[name]) + class DynamicSourceModule(DynamicModule): ''' Creates a Module from a single .cu source object linked against the diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index febe57a9..ecd04a3c 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -410,16 +410,49 @@ namespace // }}} - // {{{ linker + // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 5050 +#if CUDAPP_CUDA_VERSION >= 8000 #include // NOTE: this requires us to link static SDK library "nvrtc" - class CudaCompiler : public boost::noncopyable + class NvrtcProgram { private: nvrtcProgram m_prog; - std::vector m_ptx_vec; + + const std::string compilation_log_str() const { + size_t result_length; + nvrtcResult nvrtc_result = nvrtcGetProgramLogSize(m_prog, &result_length); + check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); + if (result_length == 0) { + return std::string(); + } + std::vector result_vec(result_length); + nvrtc_result = nvrtcGetProgramLog(m_prog, &result_vec[0]); + check_nvrtc_result("nvrtcGetProgramLog", nvrtc_result); + return std::string(&result_vec[0], result_length); + } + + public: + NvrtcProgram(const char* src, const char* name=NULL, int numHeaders=0, + const char** includeNames=NULL, const char** headers=NULL) + : m_prog() + { + const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog, src, + name, numHeaders, headers, includeNames); + check_nvrtc_result("nvrtcCreateProgram", nvrtc_result); + } + + ~NvrtcProgram() { + if (m_prog != NULL) { + nvrtcDestroyProgram(&m_prog); + m_prog = NULL; + } + } + + nvrtcProgram prog() const { + return m_prog; + } void check_nvrtc_result(const char* nvrtc_function_name, nvrtcResult nvrtc_result) const { if (nvrtc_result != NVRTC_SUCCESS) { @@ -434,106 +467,84 @@ namespace throw std::runtime_error(error_msg); } } + }; - const std::string compilation_log_str() const { - size_t result_length; - nvrtcResult nvrtc_result = nvrtcGetProgramLogSize(m_prog, &result_length); - check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); - if (result_length > 0) { - std::vector result_vec(result_length); - nvrtc_result = nvrtcGetProgramLog(m_prog, &result_vec[0]); - check_nvrtc_result("nvrtcGetProgramLog", nvrtc_result); - return std::string(&result_vec[0], result_length); - } - else { - return std::string(); - } - } + class NvrtcCompiler : 
public boost::noncopyable + { + private: + std::vector m_compile_options; + std::vector m_header_names; + std::vector m_header_contents; public: - CudaCompiler(py::object py_src, py::object py_name=py::object()) - : m_prog() - { - const char* src = py::extract(py_src); - const char* name = NULL; - if (py_name.ptr() != Py_None) { - name = py::extract(py_name); - } + void add_header(py::str py_header_name, py::str py_header_content) { + m_header_names.push_back(py::extract(py_header_name)); + m_header_contents.push_back(py::extract(py_header_content)); + } - nvrtcResult nvrtc_result = nvrtcCreateProgram( - &m_prog, // Pointer to the result CUDA Runtime Compilation program - src, // CUDA program source, 0-terminated string - name, // CUDA program name, can be NULL (defaults to "default_program") - 0, // Number of headers >= 0 - NULL, // const char** headers, - Contents of the headers, can be NULL when numHeaders == 0 - NULL); // const char** includeNames) - Name of each header by which they can be included in the CUDA program source, can be NULL - check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); + void add_compile_options(py::list py_compile_options) { + PYTHON_FOREACH(py_compile_option, py_compile_options) { + const char* compile_option = py::extract(py_compile_option); + m_compile_options.push_back(compile_option); + } } - ~CudaCompiler() + std::vector compile(const char* src, const char* name, + const std::vector* compile_options, + const std::vector* exports, + py::dict mangled_exports_dict) { - if (m_prog != NULL) { - nvrtcDestroyProgram(&m_prog); - m_prog = NULL; + // combine compile option lists into all_compile_options + std::vector all_compile_options(m_compile_options); + if (!compile_options->empty()) { + all_compile_options.insert(all_compile_options.end(), compile_options->begin(), compile_options->end()); } - } - void declare_function(py::object py_function_name) { - const char* function_name = py::extract(py_function_name); - const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); - check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); - } + // create NVRTC program + const char** header_names = m_header_names.empty()? NULL : &m_header_names[0]; + const char** header_contents = m_header_contents.empty()? 
NULL : &m_header_contents[0]; + NvrtcProgram prog(src, name, (int)m_header_names.size(), header_names, header_contents); - py::object compile(py::object py_compile_options=py::object(), py::object py_function_names=py::object()) { - std::vector compile_options; - if (py_compile_options.ptr() != Py_None) { - PYTHON_FOREACH(py_option, py_compile_options) { - compile_options.push_back(py::extract(py_option)); + // declare exported function names + nvrtcResult nvrtc_result; + for (std::vector::const_iterator it = exports->begin(); it != exports->end(); ++it) { + nvrtc_result = nvrtcAddNameExpression(prog.prog(), *it); + prog.check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); } - } - std::vector function_names; - if (py_function_names.ptr() != Py_None) { - PYTHON_FOREACH(py_function_name, py_function_names) { - const char* function_name = py::extract(py_function_name); - const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); - check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); - function_names.push_back(function_name); + // compile .cu => .ptx + nvrtc_result = nvrtcCompileProgram(prog.prog(), + (int)all_compile_options.size(), &all_compile_options[0]); + prog.check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); + + // map exported function names to their mangled representation + if (mangled_exports_dict != NULL) { + for (std::vector::const_iterator it = exports->begin(); it != exports->end(); ++it) { + const char* lowered_name = NULL; + nvrtc_result = nvrtcGetLoweredName(prog.prog(), *it, &lowered_name); + prog.check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); + mangled_exports_dict[*it] = std::string(lowered_name); + } } - } - - nvrtcResult nvrtc_result; - nvrtc_result = nvrtcCompileProgram(m_prog, (int)compile_options.size(), &compile_options[0]); - check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); - - size_t ptx_size; - nvrtc_result = nvrtcGetPTXSize(m_prog, &ptx_size); - check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); - m_ptx_vec = std::vector(ptx_size); - nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); - check_nvrtc_result("nvrtcGetPTX", nvrtc_result); + // copy compiled ptx result into ptx_buffer + size_t ptx_size; + nvrtc_result = nvrtcGetPTXSize(prog.prog(), &ptx_size); + prog.check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); -#if PY_VERSION_HEX >= 0x03030000 - // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) - // http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes - // new since Python 3.3 - PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); - return py::object(py::handle<>(memory_view)); -#else - return py::str(std::string(&m_ptx_vec[0], ptx_size)); + std::vector ptx_buffer(ptx_size); + nvrtc_result = nvrtcGetPTX(prog.prog(), &ptx_buffer[0]); + prog.check_nvrtc_result("nvrtcGetPTX", nvrtc_result); + return ptx_buffer; + } + }; #endif - } - std::string mangled_function(py::object py_function_name) { - const char* function_name = py::extract(py_function_name); - const char* lowered_name = NULL; - const nvrtcResult nvrtc_result = nvrtcGetLoweredName(m_prog, function_name, &lowered_name); - check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); - return std::string(lowered_name); - } - }; + // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 class Linker : public boost::noncopyable { private: @@ -637,6 +648,41 @@ namespace check_cu_result("cuLinkAddFile", cu_result); } +#if 
CUDAPP_CUDA_VERSION >= 8000 + void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::str py_name, + py::list py_compile_options, py::list py_exports, + py::dict py_mangled_exports) + { + // extract Python arguments + const char* src = py::extract(py_src); + const char* name = (py_name.ptr() != Py_None)? + py::extract(py_name) : "kernel.ptx"; + std::vector compile_options; + if (py_compile_options.ptr() != Py_None) { + PYTHON_FOREACH(py_option, py_compile_options) { + compile_options.push_back(py::extract(py_option)); + } + } + std::vector exports; + if (py_exports.ptr() != Py_None) { + PYTHON_FOREACH(py_function_name, py_exports) { + exports.push_back(py::extract(py_function_name)); + } + } + py::dict mangled_exports_dict = (py_mangled_exports.ptr() != Py_None)? + py::extract(py_mangled_exports) : NULL; + + // compile src into buffer ptx + std::vector ptx = compiler->compile(src, name, + &compile_options, &exports, mangled_exports_dict); + + // link buffer ptx + const CUresult cu_result = cuLinkAddData(m_link_state, + CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL); + check_cu_result("cuLinkAddData", cu_result); + } +#endif + module* link_module() { char* cubin_data = NULL; @@ -1311,6 +1357,19 @@ BOOST_PYTHON_MODULE(_driver) // }}} + // {{{ nvrtc compiler + +#if CUDAPP_CUDA_VERSION >= 8000 + py::class_ >("NvrtcCompiler") + .def("add_header", &NvrtcCompiler::add_header, ( + py::arg("header_name"), + py::arg("header_content"))) + .def("add_compile_options", &NvrtcCompiler::add_compile_options, ( + py::arg("compile_options"))); +#endif + + // }}} + // {{{ linker #if CUDAPP_CUDA_VERSION >= 5050 @@ -1321,19 +1380,25 @@ BOOST_PYTHON_MODULE(_driver) .value("OBJECT", CU_JIT_INPUT_OBJECT) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); - py::class_ >("CudaCompiler", py::init()) - .def(py::init()) - .def("declare_function", &CudaCompiler::declare_function, (py::arg("py_function_name"))) - .def("compile", &CudaCompiler::compile, (py::arg("compile_options")=py::object(), py::arg("function_names")=py::object())) - .def("mangled_function", &CudaCompiler::mangled_function, (py::arg("py_function_name"))); - py::class_ >("Linker") .def(py::init()) .def(py::init()) .def(py::init()) - .def("add_data", &Linker::add_data, (py::arg("data"), py::arg("input_type"), py::arg("name")=py::str("unknown"))) - .def("add_file", &Linker::add_file, (py::arg("filename"), py::arg("input_type"))) + .def("add_data", &Linker::add_data, ( + py::arg("data"), + py::arg("input_type"), + py::arg("name")=py::str("unknown"))) + .def("add_file", &Linker::add_file, ( + py::arg("filename"), + py::arg("input_type"))) + .def("add_nvrtc", &Linker::add_nvrtc, ( + py::arg("src"), + py::arg("compiler"), py::arg("name")=py::object(), + py::arg("compile_options")=py::object(), + py::arg("exports")=py::object(), + py::arg("mangled_exports")=py::object())) .def("link_module", &Linker::link_module, py::return_value_policy()); + #endif // }}} -- GitLab From ce8228bed917ca4fe70c75aea75413a01479eb9e Mon Sep 17 00:00:00 2001 From: Lurch Date: Wed, 18 Jan 2017 20:08:22 +0100 Subject: [PATCH 05/16] Improvements and bug-fixes: - Moved all constant NVRTC compiler configuration from DynamicSourceModule.add_source_nvrtc() to constructor of NvrtcCompiler - Added support for NVRTC in-memory header files - Added support to let the PyCUDA user create and preconfigure a NvrtcCompiler instance and pass it to add_source_nvrtc() - Added argument "exports" to constructor of DynamicSourceModule, triggers NVRTC compilation instead of NVCC - Fixed C++ buffer 
handling --- pycuda/compiler.py | 53 ++------ src/wrapper/wrap_cudadrv.cpp | 245 ++++++++++++++++++++++++----------- 2 files changed, 184 insertions(+), 114 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index f38ad95d..f9ebbe14 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -307,7 +307,7 @@ class DynamicModule(CudaModule): from pycuda.driver import Context compute_capability = Context.get_device().compute_capability() if compute_capability < (3,5): - raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % + raise RuntimeError('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % (compute_capability[0], compute_capability[1])) else: from pycuda.driver import Linker @@ -390,46 +390,15 @@ class DynamicModule(CudaModule): def add_source_nvrtc(self, source, exports=[], compile_options=[], name='kernel.ptx', compiler=None): - ''' - NOTE: - - self.code is not supported with NVRTC and ignored here - - headers in _find_pycuda_include_path() are supported with NVRTC: - pycuda-helpers.hpp, pycuda-complex.hpp tested (they need -I/include) - TODO: - - the include paths _find_pycuda_include_path() and CUDA_PATH/CUDA_ROOT are constant, they could be moved to NvrtcCompiler (C++) - - the compiler option '-arch' could also be moved to NvrtcCompiler - ''' + # NOTE: - self.code is not supported with NVRTC and ignored here if not hasattr(self.linker, "add_nvrtc"): raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') - - # append -arch compiler option, if not already exists - compile_options = compile_options[:] - if not '-arch' in compile_options: - if self.arch is not None: - # nvcc: "-arch sm_%d%d" - # nvrtc: "-arch compute_%d%d" - # self.arch: "sm_%d%d" or None - arch_compute_capability = (int(self.arch[3]), int(self.arch[4])) - else: - from pycuda.driver import Context - arch_compute_capability = Context.get_device().compute_capability() - compile_options.extend(['-arch', 'compute_%u%u' % arch_compute_capability]) - - # append -I include directory options - compile_options.append('-I%s' % _find_pycuda_include_path()) - if 'CUDA_PATH' in os.environ: - compile_options.append('-I%s\\include' % os.environ['CUDA_PATH']) - elif 'CUDA_ROOT' in os.environ: - compile_options.append('-I%s/include' % os.environ['CUDA_ROOT']) - for include_dir in self.include_dirs: - compile_options.append('-I%s' % include_dir) - - # compile and link if self.mangled_exports is None and len(exports) > 0: self.mangled_exports = {} if compiler is None: from pycuda.driver import NvrtcCompiler - compiler = NvrtcCompiler() + compiler = NvrtcCompiler(compile_options, self.include_dirs, self.arch) + compile_options = None self.linker.add_nvrtc(source, compiler, name=name, compile_options=compile_options, exports=exports, mangled_exports=self.mangled_exports) @@ -477,16 +446,20 @@ class DynamicSourceModule(DynamicModule): ''' def __init__(self, source, nvcc="nvcc", options=[], keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], cuda_libdir=None): + include_dirs=[], cuda_libdir=None, exports=None): super(DynamicSourceModule, self).__init__(nvcc=nvcc, link_options=None, keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, cache_dir=cache_dir, include_dirs=include_dirs, cuda_libdir=cuda_libdir) options = options[:] - if not '-rdc=true' in options: + if '-rdc=true' not in options: options.append('-rdc=true') - if not '-lcudadevrt' in options: - options.append('-lcudadevrt') - 
self.add_source(source, nvcc_options=options) + if exports is None: + if '-lcudadevrt' not in options: + options.append('-lcudadevrt') + self.add_source(source, nvcc_options=options) + else: + self.add_source_nvrtc(source, exports=exports, + compile_options=options) self.add_stdlib('cudadevrt') self.link() diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index ecd04a3c..09fb57be 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -434,12 +434,16 @@ namespace } public: - NvrtcProgram(const char* src, const char* name=NULL, int numHeaders=0, - const char** includeNames=NULL, const char** headers=NULL) + NvrtcProgram(const char* src, const char* name, + const std::vector& header_names_p, + const std::vector& header_contents_p) : m_prog() { - const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog, src, - name, numHeaders, headers, includeNames); + const size_t header_count = header_names_p.size(); + const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog, + src, name, (int)header_count, + (header_count > 0)? &header_contents_p[0] : NULL, + (header_count > 0)? &header_names_p[0] : NULL); check_nvrtc_result("nvrtcCreateProgram", nvrtc_result); } @@ -472,66 +476,177 @@ namespace class NvrtcCompiler : public boost::noncopyable { private: - std::vector m_compile_options; - std::vector m_header_names; - std::vector m_header_contents; + std::vector m_compile_options; + std::vector m_header_names; + std::vector m_header_contents; + std::vector m_compile_options_p; + std::vector m_header_names_p; + std::vector m_header_contents_p; +#ifdef _WIN32 + std::vector m_win32_getenv_buf; +#endif + + char* get_env(const char* varname) { +#ifdef _WIN32 + // Visual Studio 2015: std::getenv() causes compiler warning C4996 + size_t required_size = 0; + getenv_s(&required_size, NULL, 0, varname); + if (required_size > 0) { + m_win32_getenv_buf.resize(required_size); + getenv_s(&required_size, &m_win32_getenv_buf[0], required_size, varname); + return &m_win32_getenv_buf[0]; + } + else { + return NULL; + } +#else + return std::getenv(varname); +#endif + } - public: - void add_header(py::str py_header_name, py::str py_header_content) { - m_header_names.push_back(py::extract(py_header_name)); - m_header_contents.push_back(py::extract(py_header_content)); + std::string find_pycuda_include_path() const { + // pycuda.compiler._find_pycuda_include_path(): + // from pkg_resources import Requirement, resource_filename + // return resource_filename(Requirement.parse("pycuda"), "pycuda/cuda") + const py::object pkg_resources = py::import("pkg_resources"); + const py::object resource_filename = pkg_resources.attr("resource_filename"); + const py::object Requirement_parse = pkg_resources.attr("Requirement").attr("parse"); + const py::object py_result = resource_filename(Requirement_parse("pycuda"), "pycuda/cuda"); + return py::extract(py_result); + } + + void options_changed() { + const size_t n = m_compile_options.size(); + m_compile_options_p.resize(n); + for (size_t i=0; i(py_compile_option); - m_compile_options.push_back(compile_option); + public: + NvrtcCompiler( + const py::object py_compiler_options=py::object(), + const py::object py_include_dirs=py::object(), + const py::object py_arch=py::object(), + const py::object py_headers=py::object()) + { + // Target GPU architecture (-arch) compiler option + std::string arch_compute_capability; + if (py_arch.ptr() != Py_None) { + const char* arch = py::extract(py_arch); + if (strlen(arch) == 5 && std::memcmp(arch, 
"sm_", 3) == 0) { + arch_compute_capability = std::string("compute_") + &arch[3]; + } + else { + arch_compute_capability = arch; + } + } + else { + const py::tuple py_compute_capability = context::get_device().compute_capability(); + const char arch_major = '0' + py::extract(py_compute_capability[0]); + const char arch_minor = '0' + py::extract(py_compute_capability[1]); + arch_compute_capability = std::string("compute_") + arch_major + arch_minor; + } + m_compile_options.push_back("-arch"); + m_compile_options.push_back(arch_compute_capability); + + // Include directory (-I) compiler options + m_compile_options.push_back(std::string("-I") + find_pycuda_include_path()); + const char* CUDA_SDK; + if ((CUDA_SDK = get_env("CUDA_PATH")) != NULL) { + m_compile_options.push_back(std::string("-I") + CUDA_SDK + "\\include"); + } + else if ((CUDA_SDK = get_env("CUDA_ROOT")) != NULL) { + m_compile_options.push_back(std::string("-I") + CUDA_SDK + "/include"); + } + if (py_include_dirs.ptr() != Py_None) { + PYTHON_FOREACH(py_include_dir, py_include_dirs) { + m_compile_options.push_back(std::string("-I") + py::extract(py_include_dir)()); + } + } + + // Other compiler options specified by the caller + if (py_include_dirs.ptr() != Py_None) { + PYTHON_FOREACH(py_compiler_option, py_compiler_options) { + m_compile_options.push_back(py::extract(py_compiler_option)); + } + } + + options_changed(); + + // In-memory NVRCT header files + if (py_headers.ptr() != Py_None) { + const py::dict py_headers_dict = py::extract(py_headers); + if (py::len(py_headers_dict) > 0) { + const py::list py_keys = py_headers_dict.keys(); + PYTHON_FOREACH(py_key, py_keys) { + m_header_names.push_back(py::extract(py_key)); + m_header_contents.push_back(py::extract(py_headers_dict[py_key])); + } + header_changed(); + } } } - std::vector compile(const char* src, const char* name, - const std::vector* compile_options, - const std::vector* exports, - py::dict mangled_exports_dict) + std::vector compile( + const py::str& py_src, + const char* name, + const py::object& py_compile_options, + const py::object& py_exports, + py::object& py_mangled_exports) { - // combine compile option lists into all_compile_options - std::vector all_compile_options(m_compile_options); - if (!compile_options->empty()) { - all_compile_options.insert(all_compile_options.end(), compile_options->begin(), compile_options->end()); + // Combine compile option lists into compile_options_p + std::vector compile_options_p(m_compile_options_p); + if (py_compile_options.ptr() != Py_None) { + PYTHON_FOREACH(py_compile_option, py_compile_options) { + compile_options_p.push_back(py::extract(py_compile_option)); + } } - // create NVRTC program - const char** header_names = m_header_names.empty()? NULL : &m_header_names[0]; - const char** header_contents = m_header_contents.empty()? 
NULL : &m_header_contents[0]; - NvrtcProgram prog(src, name, (int)m_header_names.size(), header_names, header_contents); + // Create NVRTC program + NvrtcProgram prog(py::extract(py_src), name, + m_header_names_p, m_header_contents_p); - // declare exported function names + // Declare exported function names nvrtcResult nvrtc_result; - for (std::vector::const_iterator it = exports->begin(); it != exports->end(); ++it) { - nvrtc_result = nvrtcAddNameExpression(prog.prog(), *it); - prog.check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + if (py_exports.ptr() != Py_None) { + PYTHON_FOREACH(py_export, py_exports) { + nvrtc_result = nvrtcAddNameExpression(prog.prog(), py::extract(py_export)); + prog.check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + } } - // compile .cu => .ptx + // Compile .cu => .ptx nvrtc_result = nvrtcCompileProgram(prog.prog(), - (int)all_compile_options.size(), &all_compile_options[0]); + (int)compile_options_p.size(), &compile_options_p[0]); prog.check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); - // map exported function names to their mangled representation - if (mangled_exports_dict != NULL) { - for (std::vector::const_iterator it = exports->begin(); it != exports->end(); ++it) { - const char* lowered_name = NULL; - nvrtc_result = nvrtcGetLoweredName(prog.prog(), *it, &lowered_name); + // Map exported function names to their mangled representation + if (py_exports.ptr() != Py_None && py_mangled_exports.ptr() != Py_None) { + py::dict py_mangled_exports_dict = py::extract(py_mangled_exports); + PYTHON_FOREACH(py_export, py_exports) { + const char* plain_name = py::extract(py_export); + const char* mangled_name = NULL; + nvrtc_result = nvrtcGetLoweredName(prog.prog(), plain_name, &mangled_name); prog.check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); - mangled_exports_dict[*it] = std::string(lowered_name); + py_mangled_exports_dict[py_export] = mangled_name; } } - // copy compiled ptx result into ptx_buffer + // Copy compiled ptx result into ptx_buffer size_t ptx_size; nvrtc_result = nvrtcGetPTXSize(prog.prog(), &ptx_size); prog.check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); - std::vector ptx_buffer(ptx_size); nvrtc_result = nvrtcGetPTX(prog.prog(), &ptx_buffer[0]); prog.check_nvrtc_result("nvrtcGetPTX", nvrtc_result); @@ -633,7 +748,7 @@ namespace throw py::error_already_set(); } const char* name = (py_name.ptr() != Py_None)? - py::extract(py_name) : NULL; + py::extract(py_name) : NULL; const CUresult cu_result = cuLinkAddData(m_link_state, input_type, static_cast(const_cast(data_buf)), data_buf_len, name, 0, NULL, NULL); @@ -642,40 +757,22 @@ namespace void add_file(py::str py_filename, CUjitInputType input_type) { - const char* filename = py::extract(py_filename); + const char* filename = py::extract(py_filename); const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, filename, 0, NULL, NULL); check_cu_result("cuLinkAddFile", cu_result); } #if CUDAPP_CUDA_VERSION >= 8000 - void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::str py_name, - py::list py_compile_options, py::list py_exports, - py::dict py_mangled_exports) + void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::object py_name, + py::object py_compile_options, py::object py_exports, + py::object py_mangled_exports) { - // extract Python arguments - const char* src = py::extract(py_src); const char* name = (py_name.ptr() != Py_None)? 
py::extract(py_name) : "kernel.ptx"; - std::vector compile_options; - if (py_compile_options.ptr() != Py_None) { - PYTHON_FOREACH(py_option, py_compile_options) { - compile_options.push_back(py::extract(py_option)); - } - } - std::vector exports; - if (py_exports.ptr() != Py_None) { - PYTHON_FOREACH(py_function_name, py_exports) { - exports.push_back(py::extract(py_function_name)); - } - } - py::dict mangled_exports_dict = (py_mangled_exports.ptr() != Py_None)? - py::extract(py_mangled_exports) : NULL; - // compile src into buffer ptx - std::vector ptx = compiler->compile(src, name, - &compile_options, &exports, mangled_exports_dict); - + std::vector ptx = compiler->compile(py_src, name, + py_compile_options, py_exports, py_mangled_exports); // link buffer ptx const CUresult cu_result = cuLinkAddData(m_link_state, CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL); @@ -1361,11 +1458,10 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 8000 py::class_ >("NvrtcCompiler") - .def("add_header", &NvrtcCompiler::add_header, ( - py::arg("header_name"), - py::arg("header_content"))) - .def("add_compile_options", &NvrtcCompiler::add_compile_options, ( - py::arg("compile_options"))); + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def(py::init()); #endif // }}} @@ -1393,12 +1489,13 @@ BOOST_PYTHON_MODULE(_driver) py::arg("input_type"))) .def("add_nvrtc", &Linker::add_nvrtc, ( py::arg("src"), - py::arg("compiler"), py::arg("name")=py::object(), + py::arg("compiler"), + py::arg("name")=py::object(), py::arg("compile_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) - .def("link_module", &Linker::link_module, py::return_value_policy()); - + .def("link_module", &Linker::link_module, + py::return_value_policy()); #endif // }}} -- GitLab From e04c1948ca356036cd63cc8d562f37563a5f6139 Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 19 Jan 2017 10:09:27 +0100 Subject: [PATCH 06/16] Temporary test to investigate crash --- src/wrapper/wrap_cudadrv.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 09fb57be..a194fd6f 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -412,7 +412,7 @@ namespace // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 8000 +#if CUDAPP_CUDA_VERSION >= 18000 #include // NOTE: this requires us to link static SDK library "nvrtc" class NvrtcProgram @@ -763,7 +763,7 @@ namespace check_cu_result("cuLinkAddFile", cu_result); } -#if CUDAPP_CUDA_VERSION >= 8000 +#if CUDAPP_CUDA_VERSION >= 18000 void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::object py_name, py::object py_compile_options, py::object py_exports, py::object py_mangled_exports) @@ -1456,7 +1456,7 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 8000 +#if CUDAPP_CUDA_VERSION >= 18000 py::class_ >("NvrtcCompiler") .def(py::init()) .def(py::init()) @@ -1487,6 +1487,7 @@ BOOST_PYTHON_MODULE(_driver) .def("add_file", &Linker::add_file, ( py::arg("filename"), py::arg("input_type"))) +#if CUDAPP_CUDA_VERSION >= 18000 .def("add_nvrtc", &Linker::add_nvrtc, ( py::arg("src"), py::arg("compiler"), @@ -1494,6 +1495,7 @@ BOOST_PYTHON_MODULE(_driver) py::arg("compile_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) +#endif .def("link_module", &Linker::link_module, py::return_value_policy()); #endif -- GitLab From dad4bdba958c03030724d7daba9fada8b78ae4e2 Mon 
Sep 17 00:00:00 2001 From: Lurch Date: Thu, 19 Jan 2017 11:08:21 +0100 Subject: [PATCH 07/16] 2nd temporary test to investigate crash: removed nvrtc library --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2f909cb3..f6433a84 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ def main(): LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"] LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"] - + conf["CUDADRV_LIBNAME"]) + ["nvrtc"] + + conf["CUDADRV_LIBNAME"]) # + ["nvrtc"] if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] -- GitLab From b33a50803f67fd0d0d057cf187a3b218850f79d7 Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 19 Jan 2017 21:59:47 +0100 Subject: [PATCH 08/16] Reverted temporary tests, modified static linking of nvrtc.lib in setup.py to proper syntax, normalized a couple of method prototypes after drafting the documentation. --- examples/demo_cdpSimplePrint.py | 2 +- pycuda/compiler.py | 66 ++++++++++++------------------ setup.py | 5 ++- src/wrapper/wrap_cudadrv.cpp | 72 +++++++++++++++++++++------------ test/test_driver.py | 2 +- 5 files changed, 78 insertions(+), 69 deletions(-) diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py index d5435987..b2c0b4e6 100644 --- a/examples/demo_cdpSimplePrint.py +++ b/examples/demo_cdpSimplePrint.py @@ -95,7 +95,7 @@ def main(argv): print("starting Simple Print (CUDA Dynamic Parallelism)") - mod = DynamicSourceModule(cdpSimplePrint_cu) + mod = DynamicSourceModule(cdpSimplePrint_cu, use_cudadevrt=True) cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call print("***************************************************************************") diff --git a/pycuda/compiler.py b/pycuda/compiler.py index f9ebbe14..94c78352 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -278,10 +278,6 @@ class CudaModule(object): return self.module.get_function(name) class SourceModule(CudaModule): - ''' - Creates a Module from a single .cu source object linked against the - static CUDA runtime. - ''' def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): @@ -296,22 +292,17 @@ class SourceModule(CudaModule): self._bind_module() class DynamicModule(CudaModule): - ''' - Creates a Module from multiple .cu source, library file and/or data - objects linked against the static or dynamic CUDA runtime. - ''' - def __init__(self, nvcc='nvcc', link_options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], message_handler=None, log_verbose=False, - cuda_libdir=None): + def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, + code=None, cache_dir=None, include_dirs=None, link_options=None, + message_handler=None, log_verbose=False, cuda_libdir=None): from pycuda.driver import Context compute_capability = Context.get_device().compute_capability() if compute_capability < (3,5): - raise RuntimeError('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % + raise RuntimeError('Minimum compute capability for JIT linker is 3.5 (found: %u.%u)!' 
% (compute_capability[0], compute_capability[1])) else: from pycuda.driver import Linker - self.linker = Linker(message_handler, link_options, log_verbose) + self.linker = Linker(link_options, message_handler, log_verbose) self._check_arch(arch) self.nvcc = nvcc self.keep = keep @@ -388,19 +379,18 @@ class DynamicModule(CudaModule): self.linker.add_data(ptx, jit_input_type.PTX, name) return self - def add_source_nvrtc(self, source, exports=[], compile_options=[], - name='kernel.ptx', compiler=None): - # NOTE: - self.code is not supported with NVRTC and ignored here + def add_source_nvrtc(self, source, nvrtc_options=None, + name='kernel.ptx', exports=None, compiler=None): if not hasattr(self.linker, "add_nvrtc"): raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') if self.mangled_exports is None and len(exports) > 0: self.mangled_exports = {} if compiler is None: from pycuda.driver import NvrtcCompiler - compiler = NvrtcCompiler(compile_options, self.include_dirs, self.arch) - compile_options = None - self.linker.add_nvrtc(source, compiler, name=name, - compile_options=compile_options, exports=exports, + compiler = NvrtcCompiler(nvrtc_options, self.include_dirs, self.arch) + nvrtc_options = None + self.linker.add_nvrtc(compiler, source, name=name, + nvrtc_options=nvrtc_options, exports=exports, mangled_exports=self.mangled_exports) return self @@ -436,30 +426,26 @@ class DynamicModule(CudaModule): return self.module.get_function(self.mangled_exports[name]) class DynamicSourceModule(DynamicModule): - ''' - Creates a Module from a single .cu source object linked against the - dynamic CUDA runtime. - - compiler generates PTX relocatable device code (rdc) from source that - can be linked with other relocatable device code - - source is linked against the CUDA device runtime library cudadevrt - - library cudadevrt is statically linked into the generated Module - ''' - def __init__(self, source, nvcc="nvcc", options=[], keep=False, + def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], cuda_libdir=None, exports=None): + include_dirs=None, use_cudadevrt=False, cuda_libdir=None, + exports=None): super(DynamicSourceModule, self).__init__(nvcc=nvcc, - link_options=None, keep=keep, no_extern_c=no_extern_c, - arch=arch, code=code, cache_dir=cache_dir, - include_dirs=include_dirs, cuda_libdir=cuda_libdir) - options = options[:] - if '-rdc=true' not in options: + keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, + cache_dir=cache_dir, include_dirs=include_dirs, link_options=None, + cuda_libdir=cuda_libdir) + if options is None: + options = [] + else: + options = options[:] + if use_cudadevrt and '-rdc=true' not in options: options.append('-rdc=true') if exports is None: - if '-lcudadevrt' not in options: + if use_cudadevrt and '-lcudadevrt' not in options: options.append('-lcudadevrt') self.add_source(source, nvcc_options=options) else: - self.add_source_nvrtc(source, exports=exports, - compile_options=options) - self.add_stdlib('cudadevrt') + self.add_source_nvrtc(source, nvrtc_options=options, exports=exports) + if use_cudadevrt: + self.add_stdlib('cudadevrt') self.link() diff --git a/setup.py b/setup.py index f6433a84..a398920a 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,9 @@ def get_config_schema(): LibraryDir("CURAND", default_lib_dirs), Libraries("CURAND", ["curand"]), + LibraryDir("NVRTC", default_lib_dirs), + Libraries("NVRTC", ["nvrtc"]), + StringListOption("CXXFLAGS", 
cxxflags_default, help="Any extra C++ compiler options to include"), StringListOption("LDFLAGS", ldflags_default, @@ -117,7 +120,7 @@ def main(): LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"] LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"] - + conf["CUDADRV_LIBNAME"]) # + ["nvrtc"] + + conf["CUDADRV_LIBNAME"]) + conf["NVRTC_LIBNAME"] if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index a194fd6f..46b3299c 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -412,7 +412,7 @@ namespace // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 18000 +#if CUDAPP_CUDA_VERSION >= 8000 #include // NOTE: this requires us to link static SDK library "nvrtc" class NvrtcProgram @@ -434,14 +434,14 @@ namespace } public: - NvrtcProgram(const char* src, const char* name, + NvrtcProgram(const char* source, const char* name, const std::vector& header_names_p, const std::vector& header_contents_p) : m_prog() { const size_t header_count = header_names_p.size(); const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog, - src, name, (int)header_count, + source, name, (int)header_count, (header_count > 0)? &header_contents_p[0] : NULL, (header_count > 0)? &header_names_p[0] : NULL); check_nvrtc_result("nvrtcCreateProgram", nvrtc_result); @@ -535,7 +535,7 @@ namespace public: NvrtcCompiler( - const py::object py_compiler_options=py::object(), + const py::object py_nvrtc_options=py::object(), const py::object py_include_dirs=py::object(), const py::object py_arch=py::object(), const py::object py_headers=py::object()) @@ -577,7 +577,7 @@ namespace // Other compiler options specified by the caller if (py_include_dirs.ptr() != Py_None) { - PYTHON_FOREACH(py_compiler_option, py_compiler_options) { + PYTHON_FOREACH(py_compiler_option, py_nvrtc_options) { m_compile_options.push_back(py::extract(py_compiler_option)); } } @@ -599,22 +599,22 @@ namespace } std::vector compile( - const py::str& py_src, + const py::str& py_source, const char* name, - const py::object& py_compile_options, + const py::object& py_nvrtc_options, const py::object& py_exports, py::object& py_mangled_exports) { // Combine compile option lists into compile_options_p std::vector compile_options_p(m_compile_options_p); - if (py_compile_options.ptr() != Py_None) { - PYTHON_FOREACH(py_compile_option, py_compile_options) { + if (py_nvrtc_options.ptr() != Py_None) { + PYTHON_FOREACH(py_compile_option, py_nvrtc_options) { compile_options_p.push_back(py::extract(py_compile_option)); } } // Create NVRTC program - NvrtcProgram prog(py::extract(py_src), name, + NvrtcProgram prog(py::extract(py_source), name, m_header_names_p, m_header_contents_p); // Declare exported function names @@ -652,6 +652,20 @@ namespace prog.check_nvrtc_result("nvrtcGetPTX", nvrtc_result); return ptx_buffer; } + + std::string py_compile( + py::str py_source, + py::object py_name, + py::object py_nvrtc_options, + py::object py_exports, + py::object py_mangled_exports) + { + const char* name = (py_name.ptr() != Py_None)? 
+ py::extract(py_name) : "kernel.ptx"; + const std::vector ptx = compile(py_source, name, + py_nvrtc_options, py_exports, py_mangled_exports); + return std::string(ptx.begin(), ptx.end()); + } }; #endif @@ -706,8 +720,8 @@ namespace } public: - Linker(py::object message_handler = py::object(), - py::object py_options = py::object(), + Linker(py::object py_link_options = py::object(), + py::object message_handler = py::object(), py::object py_log_verbose = py::object(false)) : m_message_handler(message_handler), m_link_state(), @@ -719,8 +733,8 @@ namespace add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, sizeof(m_error_buf)); add_option(CU_JIT_LOG_VERBOSE, m_log_verbose? 1ull : 0ull); - if (py_options.ptr() != Py_None) { - PYTHON_FOREACH(key_value, py_options) { + if (py_link_options.ptr() != Py_None) { + PYTHON_FOREACH(key_value, py_link_options) { add_option( py::extract(key_value[0]), py::extract(key_value[1])()); @@ -763,16 +777,16 @@ namespace check_cu_result("cuLinkAddFile", cu_result); } -#if CUDAPP_CUDA_VERSION >= 18000 - void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::object py_name, - py::object py_compile_options, py::object py_exports, +#if CUDAPP_CUDA_VERSION >= 8000 + void add_nvrtc(NvrtcCompiler* compiler, py::str py_source, py::object py_name, + py::object py_nvrtc_options, py::object py_exports, py::object py_mangled_exports) { const char* name = (py_name.ptr() != Py_None)? py::extract(py_name) : "kernel.ptx"; - // compile src into buffer ptx - std::vector ptx = compiler->compile(py_src, name, - py_compile_options, py_exports, py_mangled_exports); + // compile source into buffer ptx + std::vector ptx = compiler->compile(py_source, name, + py_nvrtc_options, py_exports, py_mangled_exports); // link buffer ptx const CUresult cu_result = cuLinkAddData(m_link_state, CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL); @@ -1456,12 +1470,18 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 18000 +#if CUDAPP_CUDA_VERSION >= 8000 py::class_ >("NvrtcCompiler") .def(py::init()) .def(py::init()) .def(py::init()) - .def(py::init()); + .def(py::init()) + .def("compile", &NvrtcCompiler::py_compile, ( + py::arg("source"), + py::arg("name")=py::str("kernel.ptx"), + py::arg("nvrtc_options")=py::object(), + py::arg("exports")=py::object(), + py::arg("mangled_exports")=py::object())); #endif // }}} @@ -1487,12 +1507,12 @@ BOOST_PYTHON_MODULE(_driver) .def("add_file", &Linker::add_file, ( py::arg("filename"), py::arg("input_type"))) -#if CUDAPP_CUDA_VERSION >= 18000 +#if CUDAPP_CUDA_VERSION >= 8000 .def("add_nvrtc", &Linker::add_nvrtc, ( - py::arg("src"), py::arg("compiler"), - py::arg("name")=py::object(), - py::arg("compile_options")=py::object(), + py::arg("source"), + py::arg("name")=py::str("kernel.ptx"), + py::arg("nvrtc_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) #endif diff --git a/test/test_driver.py b/test/test_driver.py index f88a1d67..d7e68d77 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -894,7 +894,7 @@ class TestDriver: drv.memcpy_htod(b_gpu, b) from pycuda.compiler import DynamicSourceModule - mod = DynamicSourceModule(cuda_string, keep=True) + mod = DynamicSourceModule(cuda_string, keep=True, use_cudadevrt=True) func = mod.get_function("math") func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, -- GitLab From d0c479a20e0e882459e52bf49bc3f9273a3149cc Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 19 Jan 2017 22:40:41 +0100 Subject: [PATCH 09/16] Fixed bug, 
include_dirs must not be None --- pycuda/compiler.py | 4 ++-- src/wrapper/wrap_cudadrv.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 94c78352..7d08ce2b 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -293,7 +293,7 @@ class SourceModule(CudaModule): class DynamicModule(CudaModule): def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, - code=None, cache_dir=None, include_dirs=None, link_options=None, + code=None, cache_dir=None, include_dirs=[], link_options=None, message_handler=None, log_verbose=False, cuda_libdir=None): from pycuda.driver import Context compute_capability = Context.get_device().compute_capability() @@ -428,7 +428,7 @@ class DynamicModule(CudaModule): class DynamicSourceModule(DynamicModule): def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=None, use_cudadevrt=False, cuda_libdir=None, + include_dirs=[], use_cudadevrt=False, cuda_libdir=None, exports=None): super(DynamicSourceModule, self).__init__(nvcc=nvcc, keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 46b3299c..68e2fa5c 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -576,7 +576,7 @@ namespace } // Other compiler options specified by the caller - if (py_include_dirs.ptr() != Py_None) { + if (py_nvrtc_options.ptr() != Py_None) { PYTHON_FOREACH(py_compiler_option, py_nvrtc_options) { m_compile_options.push_back(py::extract(py_compiler_option)); } -- GitLab From 784f8c2f867418636d4d21cbbc31386779c18fd4 Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 20 Jan 2017 17:54:19 +0100 Subject: [PATCH 10/16] Minor modifications to a few prototypes, removed shared_ptr<> from Boost::Python declaration of C++ classes --- pycuda/compiler.py | 2 +- src/wrapper/wrap_cudadrv.cpp | 32 +++++++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 7d08ce2b..a6da5c89 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -380,7 +380,7 @@ class DynamicModule(CudaModule): return self def add_source_nvrtc(self, source, nvrtc_options=None, - name='kernel.ptx', exports=None, compiler=None): + name=None, exports=None, compiler=None): if not hasattr(self.linker, "add_nvrtc"): raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') if self.mangled_exports is None and len(exports) > 0: diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 68e2fa5c..bb1ffc00 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -584,7 +584,7 @@ namespace options_changed(); - // In-memory NVRCT header files + // In-memory NVRTC header files if (py_headers.ptr() != Py_None) { const py::dict py_headers_dict = py::extract(py_headers); if (py::len(py_headers_dict) > 0) { @@ -600,7 +600,7 @@ namespace std::vector compile( const py::str& py_source, - const char* name, + const py::object& py_name, const py::object& py_nvrtc_options, const py::object& py_exports, py::object& py_mangled_exports) @@ -614,6 +614,8 @@ namespace } // Create NVRTC program + const char* name = (py_name.ptr() != Py_None)? 
+ py::extract(py_name) : "kernel.ptx"; NvrtcProgram prog(py::extract(py_source), name, m_header_names_p, m_header_contents_p); @@ -660,9 +662,7 @@ namespace py::object py_exports, py::object py_mangled_exports) { - const char* name = (py_name.ptr() != Py_None)? - py::extract(py_name) : "kernel.ptx"; - const std::vector ptx = compile(py_source, name, + const std::vector ptx = compile(py_source, py_name, py_nvrtc_options, py_exports, py_mangled_exports); return std::string(ptx.begin(), ptx.end()); } @@ -754,7 +754,7 @@ namespace close(); } - void add_data(py::object py_data, CUjitInputType input_type, py::str py_name) + void add_data(py::object py_data, CUjitInputType input_type, py::object py_name) { const char *data_buf; PYCUDA_BUFFER_SIZE_T data_buf_len; @@ -762,7 +762,7 @@ namespace throw py::error_already_set(); } const char* name = (py_name.ptr() != Py_None)? - py::extract(py_name) : NULL; + py::extract(py_name) : "unknown"; const CUresult cu_result = cuLinkAddData(m_link_state, input_type, static_cast(const_cast(data_buf)), data_buf_len, name, 0, NULL, NULL); @@ -782,12 +782,12 @@ namespace py::object py_nvrtc_options, py::object py_exports, py::object py_mangled_exports) { - const char* name = (py_name.ptr() != Py_None)? - py::extract(py_name) : "kernel.ptx"; // compile source into buffer ptx - std::vector ptx = compiler->compile(py_source, name, + std::vector ptx = compiler->compile(py_source, py_name, py_nvrtc_options, py_exports, py_mangled_exports); // link buffer ptx + const char* name = (py_name.ptr() != Py_None)? + py::extract(py_name) : "kernel.ptx"; const CUresult cu_result = cuLinkAddData(m_link_state, CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL); check_cu_result("cuLinkAddData", cu_result); @@ -1471,14 +1471,15 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler #if CUDAPP_CUDA_VERSION >= 8000 - py::class_ >("NvrtcCompiler") +// py::class_ >("NvrtcCompiler") + py::class_("NvrtcCompiler") .def(py::init()) .def(py::init()) .def(py::init()) .def(py::init()) .def("compile", &NvrtcCompiler::py_compile, ( py::arg("source"), - py::arg("name")=py::str("kernel.ptx"), + py::arg("name")=py::object(), py::arg("nvrtc_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())); @@ -1496,14 +1497,15 @@ BOOST_PYTHON_MODULE(_driver) .value("OBJECT", CU_JIT_INPUT_OBJECT) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); - py::class_ >("Linker") +// py::class_ >("Linker") + py::class_("Linker") .def(py::init()) .def(py::init()) .def(py::init()) .def("add_data", &Linker::add_data, ( py::arg("data"), py::arg("input_type"), - py::arg("name")=py::str("unknown"))) + py::arg("name")=py::object())) .def("add_file", &Linker::add_file, ( py::arg("filename"), py::arg("input_type"))) @@ -1511,7 +1513,7 @@ BOOST_PYTHON_MODULE(_driver) .def("add_nvrtc", &Linker::add_nvrtc, ( py::arg("compiler"), py::arg("source"), - py::arg("name")=py::str("kernel.ptx"), + py::arg("name")=py::object(), py::arg("nvrtc_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) -- GitLab From 715ef3d2ec7cee2b2d5b6196265bf8840ee3e8d6 Mon Sep 17 00:00:00 2001 From: chschnell Date: Sun, 22 Jan 2017 18:48:53 +0100 Subject: [PATCH 11/16] Final tuning of prototypes, changed C++ class names to properly match Boost class naming convention --- pycuda/compiler.py | 49 +++++++++++++++++++++++------------- src/wrapper/wrap_cudadrv.cpp | 44 ++++++++++++++++---------------- 2 files changed, 53 insertions(+), 40 deletions(-) diff --git 
a/pycuda/compiler.py b/pycuda/compiler.py index a6da5c89..2ebf178a 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -292,8 +292,11 @@ class SourceModule(CudaModule): self._bind_module() class DynamicModule(CudaModule): - def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, - code=None, cache_dir=None, include_dirs=[], link_options=None, + #def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, + # code=None, cache_dir=None, include_dirs=[], link_options=None, + # message_handler=None, log_verbose=False, cuda_libdir=None): + def __init__(self, nvcc=None, keep=False, no_extern_c=False, arch=None, + code=None, cache_dir=None, include_dirs=None, link_options=None, message_handler=None, log_verbose=False, cuda_libdir=None): from pycuda.driver import Context compute_capability = Context.get_device().compute_capability() @@ -301,8 +304,8 @@ class DynamicModule(CudaModule): raise RuntimeError('Minimum compute capability for JIT linker is 3.5 (found: %u.%u)!' % (compute_capability[0], compute_capability[1])) else: - from pycuda.driver import Linker - self.linker = Linker(link_options, message_handler, log_verbose) + from pycuda.driver import JitLinker + self.linker = JitLinker(link_options, message_handler, log_verbose) self._check_arch(arch) self.nvcc = nvcc self.keep = keep @@ -370,31 +373,41 @@ class DynamicModule(CudaModule): 'directory, set CUDA library path manually') return libdir, libptn - def add_source(self, source, nvcc_options=[], name='kernel.ptx'): - ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, + def add_source_nvcc(self, source, options=None, name=None): + if self.nvcc is None: + nvcc = 'nvcc' + else: + nvcc = self.nvcc + if self.include_dirs is None: + include_dirs = [] + else: + include_dirs = self.include_dirs + ptx = compile(source, nvcc=nvcc, options=options, keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch, code=self.code, cache_dir=self.cache_dir, - include_dirs=self.include_dirs, target="ptx") + include_dirs=include_dirs, target="ptx") from pycuda.driver import jit_input_type + if name is None: + name = 'kernel.ptx' self.linker.add_data(ptx, jit_input_type.PTX, name) return self - def add_source_nvrtc(self, source, nvrtc_options=None, + def add_source_nvrtc(self, source, options=None, name=None, exports=None, compiler=None): - if not hasattr(self.linker, "add_nvrtc"): + if not hasattr(self.linker, "add_source_nvrtc"): raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') if self.mangled_exports is None and len(exports) > 0: self.mangled_exports = {} if compiler is None: from pycuda.driver import NvrtcCompiler - compiler = NvrtcCompiler(nvrtc_options, self.include_dirs, self.arch) - nvrtc_options = None - self.linker.add_nvrtc(compiler, source, name=name, - nvrtc_options=nvrtc_options, exports=exports, + compiler = NvrtcCompiler(options, self.include_dirs, self.arch) + options = None + self.linker.add_source_nvrtc(source, compiler, name=name, + options=options, exports=exports, mangled_exports=self.mangled_exports) return self - def add_data(self, data, input_type, name='unknown'): + def add_data(self, data, input_type, name=None): self.linker.add_data(data, input_type, name) return self @@ -426,9 +439,9 @@ class DynamicModule(CudaModule): return self.module.get_function(self.mangled_exports[name]) class DynamicSourceModule(DynamicModule): - def __init__(self, source, nvcc="nvcc", options=None, keep=False, + def __init__(self, source, nvcc=None, options=None, keep=False, 
no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], use_cudadevrt=False, cuda_libdir=None, + include_dirs=None, use_cudadevrt=False, cuda_libdir=None, exports=None): super(DynamicSourceModule, self).__init__(nvcc=nvcc, keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, @@ -443,9 +456,9 @@ class DynamicSourceModule(DynamicModule): if exports is None: if use_cudadevrt and '-lcudadevrt' not in options: options.append('-lcudadevrt') - self.add_source(source, nvcc_options=options) + self.add_source_nvcc(source, options=options) else: - self.add_source_nvrtc(source, nvrtc_options=options, exports=exports) + self.add_source_nvrtc(source, options=options, exports=exports) if use_cudadevrt: self.add_stdlib('cudadevrt') self.link() diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index bb1ffc00..1988d492 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -473,7 +473,7 @@ namespace } }; - class NvrtcCompiler : public boost::noncopyable + class nvrtc_compiler : public boost::noncopyable { private: std::vector m_compile_options; @@ -534,7 +534,7 @@ namespace } public: - NvrtcCompiler( + nvrtc_compiler( const py::object py_nvrtc_options=py::object(), const py::object py_include_dirs=py::object(), const py::object py_arch=py::object(), @@ -671,10 +671,10 @@ namespace // }}} - // {{{ linker + // {{{ jit linker #if CUDAPP_CUDA_VERSION >= 5050 - class Linker : public boost::noncopyable + class jit_linker : public boost::noncopyable { private: py::object m_message_handler; @@ -720,7 +720,7 @@ namespace } public: - Linker(py::object py_link_options = py::object(), + jit_linker(py::object py_link_options = py::object(), py::object message_handler = py::object(), py::object py_log_verbose = py::object(false)) : m_message_handler(message_handler), @@ -749,7 +749,7 @@ namespace check_cu_result("cuLinkCreate", cu_result); } - ~Linker() + ~jit_linker() { close(); } @@ -778,9 +778,9 @@ namespace } #if CUDAPP_CUDA_VERSION >= 8000 - void add_nvrtc(NvrtcCompiler* compiler, py::str py_source, py::object py_name, - py::object py_nvrtc_options, py::object py_exports, - py::object py_mangled_exports) + void add_source_nvrtc(py::str py_source, nvrtc_compiler* compiler, + py::object py_name, py::object py_nvrtc_options, + py::object py_exports, py::object py_mangled_exports) { // compile source into buffer ptx std::vector ptx = compiler->compile(py_source, py_name, @@ -1471,23 +1471,23 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler #if CUDAPP_CUDA_VERSION >= 8000 -// py::class_ >("NvrtcCompiler") - py::class_("NvrtcCompiler") +// py::class_ >("NvrtcCompiler") + py::class_("NvrtcCompiler") .def(py::init()) .def(py::init()) .def(py::init()) .def(py::init()) - .def("compile", &NvrtcCompiler::py_compile, ( + .def("compile", &nvrtc_compiler::py_compile, ( py::arg("source"), py::arg("name")=py::object(), - py::arg("nvrtc_options")=py::object(), + py::arg("options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())); #endif // }}} - // {{{ linker + // {{{ jit linker #if CUDAPP_CUDA_VERSION >= 5050 py::enum_("jit_input_type") @@ -1497,28 +1497,28 @@ BOOST_PYTHON_MODULE(_driver) .value("OBJECT", CU_JIT_INPUT_OBJECT) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); -// py::class_ >("Linker") - py::class_("Linker") +// py::class_ >("JitLinker") + py::class_("JitLinker") .def(py::init()) .def(py::init()) .def(py::init()) - .def("add_data", &Linker::add_data, ( + .def("add_data", &jit_linker::add_data, ( 
py::arg("data"), py::arg("input_type"), py::arg("name")=py::object())) - .def("add_file", &Linker::add_file, ( + .def("add_file", &jit_linker::add_file, ( py::arg("filename"), py::arg("input_type"))) #if CUDAPP_CUDA_VERSION >= 8000 - .def("add_nvrtc", &Linker::add_nvrtc, ( - py::arg("compiler"), + .def("add_source_nvrtc", &jit_linker::add_source_nvrtc, ( py::arg("source"), + py::arg("compiler"), py::arg("name")=py::object(), - py::arg("nvrtc_options")=py::object(), + py::arg("options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) #endif - .def("link_module", &Linker::link_module, + .def("link_module", &jit_linker::link_module, py::return_value_policy()); #endif -- GitLab From c2a5f761106d8c376dacb5843f85b23e12c0ad99 Mon Sep 17 00:00:00 2001 From: chschnell Date: Sun, 22 Jan 2017 18:54:02 +0100 Subject: [PATCH 12/16] Temporarily disabled dynamic compiler tests --- test/test_driver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_driver.py b/test/test_driver.py index d7e68d77..d73d1712 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -893,12 +893,14 @@ class TestDriver: drv.memcpy_htod(a_gpu, a) drv.memcpy_htod(b_gpu, b) + ''' from pycuda.compiler import DynamicSourceModule mod = DynamicSourceModule(cuda_string, keep=True, use_cudadevrt=True) func = mod.get_function("math") func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, block=(100, 1, 1), grid=(1, 1, 1)) + ''' drv.memcpy_dtoh(c, c_gpu) drv.memcpy_dtoh(d, d_gpu) @@ -934,6 +936,7 @@ class TestDriver: printf(" Hello inner world!\\n"); }''' + ''' from pycuda.compiler import DynamicModule mod = DynamicModule() mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) @@ -943,6 +946,7 @@ class TestDriver: test_kernel = mod.get_function('test_kernel') test_kernel(grid=(2,1), block=(1,1,1)) + ''' def test_import_pyopencl_before_pycuda(): -- GitLab From b75c2d0c810f6117de17972ee20c6a3c7be38974 Mon Sep 17 00:00:00 2001 From: chschnell Date: Sun, 22 Jan 2017 19:12:09 +0100 Subject: [PATCH 13/16] Temporarily disabled all JIT C++ sources --- src/wrapper/wrap_cudadrv.cpp | 12 +++++++++++- test/test_driver.py | 4 ++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 1988d492..988d88b9 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -8,7 +8,9 @@ #include "wrap_helpers.hpp" #include - +/* +#DEFINE USE_JIT +*/ #if CUDAPP_CUDA_VERSION < 1010 @@ -412,6 +414,8 @@ namespace // {{{ nvrtc compiler +#ifdef USE_JIT + #if CUDAPP_CUDA_VERSION >= 8000 #include // NOTE: this requires us to link static SDK library "nvrtc" @@ -814,6 +818,8 @@ namespace }; #endif +#endif /* #ifdef USE_JIT */ + // }}} template @@ -1470,6 +1476,8 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler +#ifdef USE_JIT + #if CUDAPP_CUDA_VERSION >= 8000 // py::class_ >("NvrtcCompiler") py::class_("NvrtcCompiler") @@ -1522,6 +1530,8 @@ BOOST_PYTHON_MODULE(_driver) py::return_value_policy()); #endif +#endif /* #ifdef USE_JIT */ + // }}} // {{{ function diff --git a/test/test_driver.py b/test/test_driver.py index d73d1712..e0edfc3c 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -939,8 +939,8 @@ class TestDriver: ''' from pycuda.compiler import DynamicModule mod = DynamicModule() - mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) - mod.add_source(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_source_nvcc(test_outer_cu, 
nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_source_nvcc(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) mod.add_stdlib('cudadevrt') mod.link() -- GitLab From 8542e59bf509051fa2f464c822d90b3e4c428954 Mon Sep 17 00:00:00 2001 From: chschnell Date: Mon, 23 Jan 2017 08:05:19 +0100 Subject: [PATCH 14/16] Added documentation --- doc/source/driver.rst | 668 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 666 insertions(+), 2 deletions(-) diff --git a/doc/source/driver.rst b/doc/source/driver.rst index 296782da..a4f171ab 100644 --- a/doc/source/driver.rst +++ b/doc/source/driver.rst @@ -513,6 +513,36 @@ Constants .. attribute:: PREFER_PTX .. attribute:: PREFER_BINARY +.. class:: jit_input_type + + Supported input data types to CUDA's JIT linker. + The attribute names and values of this class correspond to CUDA's + `enum CUjitInputType`. + + CUDA 3.5 and above. + + .. versionadded:: 2017.? + + .. attribute:: CUBIN + + Data contains compiled device-class-specific device code. + + .. attribute:: PTX + + Data contains PTX (Parallel Thread eXecution) source code. + + .. attribute:: FATBINARY + + Data contains a bundle of multiple cubins and/or PTX of some device code. + + .. attribute:: OBJECT + + Data contains host object with embedded device code. + + .. attribute:: LIBRARY + + Data contains an archive of host objects with embedded device code. + .. class:: host_alloc_flags Flags to be used to allocate :ref:`pagelocked_memory`. @@ -558,6 +588,14 @@ Constants .. attribute:: LAZY_ENABLE_PEER_ACCESS + .. attribute:: DEV_RUNTIME_SYNC_DEPTH + + CUDA 3.5 and above. + + .. attribute:: DEV_RUNTIME_PENDING_LAUNCH_COUNT + + CUDA 3.5 and above. + Graphics-related constants ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1993,6 +2031,290 @@ CUDA 4.0 and newer. Just-in-time Compilation ======================== +PyCUDA provides several tools to build a :class:`Module` from CUDA C++ +sources at run-time. A Module encapsulates binary GPU code and acts as an +interface to function and data pointers in device memory when it is executed +on a device. + +For a typical case with a single CUDA C++ source, see: + +- :class:`~pycuda.compiler.SourceModule` - A Module created using NVIDIA's + command line compiler NVCC, compiled and linked against CUDA's static + runtime. Unless you need any of the other features described below, this + should be your choice. +- :class:`~pycuda.compiler.DynamicSourceModule` - A Module created using NVCC + or NVRTC and CUDA's Just-In-Time (JIT) linker. Facilitates linking against + CUDA's dynamic runtime, which is required for dynamic parallelism / + recursive kernels. + +and when you need to compile from multiple input sources, see: + +- :class:`~pycuda.compiler.DynamicModule` - A Module created from an arbitrary + mix of CUDA C++ sources (compiled with either NVCC or NVRTC), PTX sources + and CUDA libraries with CUDA's Just-In-Time (JIT) linker. + +All classes exhibit the same public interface as :class:`Module`, but do not +inherit from it. In order to compile a CUDA C++ source into a plain string +without building a Module, see: + +- :func:`compiler.compile() ` for NVCC, returns + CUBIN binary code +- :meth:`NvrtcCompiler.compile` for NVRTC, returns PTX source code + +NVRTC +----- + +NVIDIA's Run-Time-Compiler +`NVRTC `_ +is a light-weight alternative to their Command-Line-Compiler +`NVCC `_, +some of the differences include: + +- NVRTC is a pure CUDA C++ source to PTX (Parallel Thread eXecution) code + compiler and thus covers a lot smaller scope than NVCC. 
+- NVRTC is an API, whereas NVCC is an executable. +- NVRTC supports only a `small subset + `_ + of NVCC's `compiler options + `_. +- NVRTC does not support certain preprocessor directives and for example + fails to parse some STL header files, but just like in NVCC, the + ``printf()`` function is directly available in CUDA sources. +- NVRTC requires extra steps to access functions in compiled CUDA + sources from the host. The names of these functions must be declared + to NVRTC before compiling, and after compilation they can only be + looked up in the Module by using their C++ mangled names. + +PyCUDA already reduces the computational overhead involved with NVCC by +maintaining a file-system based cache, which means that it invokes NVCC +only once for each unique combination of CUDA source and compiler setup. +For applications that generate volatile CUDA C++ sources dynamically at +run-time and thereby cause too many cache misses in PyCUDA's NVCC +cache, using NVRTC instead of NVCC should be considered if CUDA compile +time is a problem. + +See also: + + :class:`~pycuda.compiler.DynamicSourceModule`, + :meth:`DynamicModule.add_source_nvrtc() ` and + :class:`NvrtcCompiler` + +Dynamic Runtime +--------------- + +Be aware that there is an overall negative performance impact caused by +simply linking to CUDA's dynamic device runtime library *cudadevrt*. + +.. warning:: + + From CUDA's `Dynamic Parallelism Programming Guide + `_ + (CUDA 8.0): + + *C.4.2.2. Dynamic-parallelism-enabled Kernel Overhead* + + System software which is active when controlling dynamic launches + may impose an overhead on any kernel which is running at the time, + whether or not it invokes kernel launches of its own. This + overhead arises from the device runtime's execution tracking and + management software and may result in decreased performance for + e.g., library calls when made from the device compared to from the + host side. This overhead is, in general, incurred for applications + that link against the device runtime library. + +It is therefore recommended to use *cudadevrt* only when needed. + +.. class:: JitLinker([link_options [, message_handler [, log_verbose]]]) + + .. versionadded:: 2017.? + + Create an object that provides access to CUDA's Just-In-Time (JIT) linker. + + :param link_options: + JIT linker options + :type link_options: + list(jit_option) or None + :param message_handler: + Optional callback on success or failure (see below) + :type message_handler: + callable or None + :param bool log_verbose: + If *True*, instruct linker to create detailed log output. + Default: *False* + + The *message_handler* callback's Python function prototype is + + .. code-block:: python + + def message_handler(bool succeeded, str info_str, str error_str) + + where *succeeded* informs on success (*True*) or failure (*False*) of the + linker invocation, *info_str* contains informational linker log output and + *error_str* any error output. Note that in case of a linker error an + Exception is raised containing *error_str* after *message_handler* was + called. + + CUDA 3.5 and above. + + .. method:: add_data(data, input_type, name=None) + + Add *data* with type *input_type* to this linker session.
+ + :param data: + A string (Python 2) or bytes (Python 3) object containing the + data to add + :type data: + str or bytes + :param input_type: + JIT linker input type of *data* + :type input_type: + jit_input_type + :param name: + Specifies to the linker under which name to refer to *data* in + its warning and error messages. If *None*, defaults to ``"unknown"`` + :type name: + str or None + + .. method:: add_file(filename, input_type) + + Add content of file *filename* with type *input_type* to this linker + session. + + :param str filename: + The name of the file to add + :param input_type: + JIT linker input type of the file's content + :type input_type: + jit_input_type + + .. method:: add_source_nvrtc(source, compiler, name=None,\ + options=None, exports=None, mangled_exports=None) + + Compile CUDA source code *source* using NVRTC *compiler* with options + *options*, then add the resulting PTX code to this linker + session. + + :param source: + CUDA C++ source code + :type source: + str + :param compiler: + The NVRTC compiler to use to compile *source* + :type compiler: + NvrtcCompiler + :param name: + Specifies to the compiler and linker under which name to refer to + *source* in their warning and error messages. If *None*, defaults + to ``"kernel.ptx"`` + :type name: + str or None + :param options: + List of NVRTC compile options + :type options: + list(str) or None + :param exports: + Function names in *source* that the caller wants to be exported + :type exports: + list(str) or None + :param mangled_exports: + Output variable, receives pairs of function names from *exports* + and their mangled counterparts + :type mangled_exports: + dict(str, str) or None + :see: + :meth:`NvrtcCompiler.compile` + + CUDA 8.0 and above. + + .. method:: link_module() + + Closes this linker session and returns the new :class:`Module` object. + + :return: + the newly created :class:`Module` instance + +.. class:: NvrtcCompiler([options[, include_dirs [, arch [, headers]]]]) + + .. versionadded:: 2017.? + + Create an object that provides access to NVIDIA's Run-Time-Compiler (NVRTC). + + Configure the compiler to add the CUDA SDK include path, the PyCUDA + include path and all directory names in *include_dirs* to its include + search paths. The global compiler options *options* provided in + this constructor can later be extended per CUDA source, see + :meth:`compile`. + + Note that NVRTC `supports + `_ + only a small subset of `NVCC's + `_ + compiler options. + + :param options: + List of NVRTC compile options to use with this compiler instance. + :type options: + list(str) or None + :param include_dirs: + List of extra directory names to use when processing ``#include`` + preprocessor directives in CUDA sources. + :type include_dirs: + list(str) or None + :param arch: + String of the form ``compute_`` where `` and + `` are the two digits of a compute capability + pair `(, )`. If *None*, defaults to the current + context's device's compute capability. + :type arch: + str or None + :param headers: + A `dict` that maps virtual header file names to their respective + content. These extra in-memory headers are made visible to and can + be included in CUDA sources processed by this compiler just like + regular include files by using the ``#include`` preprocessor + directive. + :type headers: + dict(str, str) or None + + .. 
method:: compile(source, name=None, options=None, exports=None,\ + mangled_exports=None) + + Compile CUDA source code *source* with NVRTC using the compile options + specified earlier in the constructor combined with *options* + and return the resulting PTX code to the caller. Do not upload the + code to the GPU. + + Declare all function names in *exports* to the compiler, and after + successfull compilation, insert each pair of exported function name + in *exports* and its C++ mangled function name into *mangled_exports*. + + :param str source: + CUDA C++ source code + :param name: + Specifies to the compiler under which name to refer to *source* + in its warning and error messages. If *None*, defaults to + ``"kernel.ptx"`` + :type name: + str or None + :param options: + List of additional NVRTC compile options, combine with those + supplied in the constructor + :type options: + list(str) or None + :param exports: + Function names in *source* that the caller wants to be exported + :type exports: + list(str) or None + :param mangled_exports: + Output variable, receives pairs of function names from *exports* + and their mangled counterparts + :type mangled_exports: + dict(str, str) or None + :return: + the compiled PTX code string + + CUDA 8.0 and above. + .. module:: pycuda.compiler .. data:: DEFAULT_NVCC_FLAGS @@ -2042,11 +2364,353 @@ Just-in-time Compilation *Change note:* :class:`SourceModule` was moved from :mod:`pycuda.driver` to :mod:`pycuda.compiler` in version 0.93. -.. function:: compile(source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, +.. function:: compile(source, nvcc="nvcc", options=None, keep=False,\ + no_extern_c=False, arch=None, code=None, cache_dir=None,\ include_dirs=[]) Perform the same compilation as the corresponding :class:`SourceModule` constructor, but only return resulting *cubin* file as a string. In particular, do not upload the code to the GPU. + +.. class:: DynamicSourceModule(source, nvcc=None, options=None, keep=False,\ + no_extern_c=False, arch=None, code=None, cache_dir=None,\ + include_dirs=None, use_cudadevrt=False, cuda_libdir=None,\ + exports=None) + + .. versionadded:: 2017.? + + Create a :class:`~pycuda.driver.Module` from CUDA source code *source* + using either NVCC or NVRTC with optional support for the dynamic CUDA + runtime `cudadevrt` (needed for recursive launches/dynamic parallelism). + + :param source: + CUDA C++ source code + :type source: + str + :param nvcc: + File system path of the NVCC executable. If *None*, defaults to ``nvcc`` + :type nvcc: + str or None + :param options: + List of NVCC or NVRTC compile options + :type options: + list(str) or None + :param keep: + If *True*, keep compiler output and print a line indicating its + location in the file system for debugging purposes. + Default: *False* (NVCC only) + :type keep: + bool + :param no_extern_c: + If *False*, wrap the given source code in ``extern "C" { ... }`` to + prevent C++ name mangling. Default: *False* (NVCC only) + :type no_extern_c: + bool + :param arch: + The name of the class of NVIDIA virtual GPU architecture for which to + compile the CUDA C++ source. If *None*, defaults to the current + context's device's compute capability + :type arch: + str or None + :param code: + The name of the NVIDIA GPU to assemble and optimize PTX code for. If + *None*, it leave unspecified (NVCC only) + :type code: + str or None + :param cache_dir: + The directory used by NVCC for compiler caching. 
If *None* (default), + then `cache_dir` is taken to be :envvar:`PYCUDA_CACHE_DIR` if set or a + sensible per-user default. If *False*, caching is disabled (NVCC only) + :type cache_dir: + str or None + :param include_dirs: + List of extra directory names to use when processing ``#include`` + preprocessor directives in the CUDA source. + :type include_dirs: + list(str) or None + :param use_cudadevrt: + If *True*, link to the dynamic CUDA runtime, meaning: instruct the + compiler to generate relocatable PTX device code from *source* that + can be linked with other relocatable device code, and statically link + standard library `cudadevrt` to this module. Default: *False* (link + static runtime) + :type use_cudadevrt: + bool + :param cuda_libdir: + The absolute path in the local file system of the CUDA SDK 64-Bit + library directory + :type cuda_libdir: + str or None + :param exports: + If not *None*, use NVRTC instead of NVCC to compile *source*. + This argument contains the list of function names in *source* that + the caller later wants to access using :meth:`get_function()` + :type exports: + list(str) or None + + This class exhibits the same public interface as + :class:`~pycuda.driver.Module`, but does not inherit from it. + + CUDA 3.5 and above, 8.0 and above when using NVRTC. + + .. method:: get_function(name) + + :see: + :meth:`Module.get_function() ` + + .. method:: get_global(name) + + :see: + :meth:`Module.get_global() ` + + .. method:: get_texref(name) + + :see: + :meth:`Module.get_texref() ` + + .. method:: get_surfref(name) + + :see: + :meth:`Module.get_surfref() ` + +.. class:: DynamicModule(nvcc=None, keep=False, no_extern_c=False,\ + arch=None, code=None, cache_dir=None, include_dirs=None,\ + link_options=None, message_handler=None, log_verbose=False,\ + cuda_libdir=None) + + .. versionadded:: 2017.? + + Build a :class:`~pycuda.driver.Module` from an arbitrary mix of CUDA C++ + sources (compiled with either NVCC or NVRTC), PTX sources and CUDA + libraries with CUDA's Just-In-Time (JIT) linker. + + General usage pattern for this class: + + 1. Create a module builder using constructor :meth:`DynamicModule` + 2. Use the ``add_...()`` methods as often as needed and in any order: + + - :meth:`add_source_nvcc(source) ` - to compile *source* with NVCC and add result + - :meth:`add_source_nvrtc(source) ` - to compile *source* with NVRTC and add result + - :meth:`add_data(data) ` - to add in-memory data to the module + - :meth:`add_file(filename) ` - to add files to the module + - :meth:`add_stdlib(libanem) ` - to add SDK libraries to the module + + 3. Link and upload module code to the GPU by calling :meth:`link` once + 4. Access module's kernel functions on the GPU using :meth:`get_function` + + :param nvcc: + File system path of the NVCC executable. If *None*, defaults to ``nvcc`` + :type nvcc: + str or None + :param keep: + If *True*, keep compiler outputs and print a line indicating their + location in the file system for debugging purposes. + Default: *False* (NVCC only) + :type keep: + bool + :param no_extern_c: + If *False*, wrap the given source codes in ``extern "C" { ... }`` to + prevent C++ name mangling. Default: *False* (NVCC only) + :type no_extern_c: + bool + :param arch: + The name of the class of NVIDIA virtual GPU architecture for which to + compile the CUDA C++ sources. If *None*, defaults to the current + context's device's compute capability + :type arch: + str or None + :param code: + The name of the NVIDIA GPU to assemble and optimize PTX code for. 
If + *None*, it leave unspecified (NVCC only) + :type code: + str or None + :param cache_dir: + The directory used by NVCC for compiler caching. If *None* (default), + then `cache_dir` is taken to be :envvar:`PYCUDA_CACHE_DIR` if set or a + sensible per-user default. If *False*, caching is disabled (NVCC only) + :type cache_dir: + str or None + :param include_dirs: + List of extra directory names to use when processing ``#include`` + preprocessor directives in CUDA sources. + :type include_dirs: + list(str) or None + :param link_options: + Options for the JIT linker :class:`~pycuda.driver.JitLinker` + :type link_options: + list(jit_option) or None + :param message_handler: + Optional callback on success or failure + :type message_handler: + callable or None + :param bool log_verbose: + If *True*, instruct linker to create detailed log output. + Default: *False* + :param cuda_libdir: + The absolute path in the local file system of the CUDA SDK 64-Bit + library directory + :type cuda_libdir: + str or None + + After :meth:`link` has been called, objects of this class exhibit + the same public interface as :class:`~pycuda.driver.Module`, but do not + inherit from it. + + CUDA 3.5 and above. + + .. method:: add_source_nvcc(source, options=None, name=None) + + Compile CUDA source code *source* with NVIDIA's Command-Line-Compiler + NVCC using compiler options *options* and add the resulting + PTX code to the linker session of this module. + + :param str source: + CUDA C++ source code + :param options: + List of NVCC compiler options + :type options: + list(str) or None + :param name: + Specifies to the compiler and linker under which name to refer to + *source* in their warning and error messages. If *None*, defaults + to ``"kernel.ptx"`` + :type name: + str or None + :return: + *self* + + .. method:: add_source_nvrtc(source, options=None,\ + name=None, exports=None, compiler=None) + + Compile CUDA source code *source* with NVIDIA's Run-Time-Compiler + NVRTC using compiler options *options* and add the resulting + PTX code to the linker session of this module. + + NOTE: self.code is not supported with NVRTC and ignored here + + :param str source: + CUDA C++ source code + :param options: + List of NVRTC compile options + :type options: + list(str) or None + :param name: + Specifies to the compiler and linker under which name to refer to + *source* in their warning and error messages. If *None*, defaults + to ``"kernel.ptx"`` + :type name: + str or None + :param exports: + Function names in *source* that the caller wants to be exported + :type exports: + list(str) or None + :param compiler: + The NVRTC compiler to use to compile *source*. If *None*, a + default compiler instance is created and used + :type compiler: + NvrtcCompiler or None + :return: + *self* + :see: + :meth:`JitLinker.add_source_nvrtc() ` + + CUDA 8.0 and above. + + .. method:: add_data(data, input_type, name=None) + + Add the binary data in *data* to the linker session of this module. + + :param data: + A string (Python 2) or bytes (Python 3) object containing the data to add + :type data: + str or bytes + :param input_type: + JIT linker input type of *data* + :type input_type: + jit_input_type + :param name: + Specifies to the linker under which name to refer to *data* in + its warning and error messages. If *None*, defaults + to ``"kernel.ptx"`` + :type name: + str or None + :return: + *self* + :see: + :meth:`JitLinker.add_data() ` + + .. 
method:: add_file(filename, input_type) + + Add the content of file *filename* to the linker session of this + module. + + :param str filename: + The name of the file to add + :param input_type: + JIT linker input type of the file's content + :type input_type: + jit_input_type + :return: + *self* + :see: + :meth:`JitLinker.add_file() ` + + .. method:: add_stdlib(libname) + + Add the standard library *libname* to the linker session of this + module. + + A standard library is part of the CUDA SDK. If this method fails + to auto-locate the correct CUDA SDK installation path on your system, + use argument *cuda_libdir* in the constructor to set this path + manually to the 64-Bit library directory of your local CUDA SDK + installation. + + :param str libname: + specifies the shortened library file name which gets expanded + internally to match the correct library file name for the local + system, for example `cudadevrt` is the shortened form of + `cudadevrt.lib` on Windows and `libcudadevrt.a` on Linux. + :return: + *self* + + .. method:: link() + + Complete the linker session and create the :class:`~pycuda.driver.Module`. + + After calling this method, calling the *add_...* methods of this + instance is no longer allowed, whereas accessing module functions + through this instance using :meth:`get_function` is only + allowed after calling this method. + + :return: + *self* + + .. method:: get_function(name) + + This method is not defined before calling :meth:`link`. + + :see: + :meth:`Module.get_function() ` + + .. method:: get_global(name) + + This method is not defined before calling :meth:`link`. + + :see: + :meth:`Module.get_global() ` + + .. method:: get_texref(name) + + This method is not defined before calling :meth:`link`. + + :see: + :meth:`Module.get_texref() ` + + .. method:: get_surfref(name) + + This method is not defined before calling :meth:`link`. + + :see: + :meth:`Module.get_surfref() ` -- GitLab From 2d823d589bf9c7a006e95cecf49da17fdd4cfd0e Mon Sep 17 00:00:00 2001 From: chschnell Date: Tue, 24 Jan 2017 05:10:19 +0100 Subject: [PATCH 15/16] Test --- doc/source/driver.rst | 8 +++----- src/wrapper/wrap_cudadrv.cpp | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/doc/source/driver.rst b/doc/source/driver.rst index a4f171ab..415a1272 100644 --- a/doc/source/driver.rst +++ b/doc/source/driver.rst @@ -2415,7 +2415,7 @@ It is therefore recommended to use *cudadevrt* only when needed. str or None :param code: The name of the NVIDIA GPU to assemble and optimize PTX code for. If - *None*, it leave unspecified (NVCC only) + *None*, leave it unspecified (NVCC only) :type code: str or None :param cache_dir: @@ -2522,7 +2522,7 @@ It is therefore recommended to use *cudadevrt* only when needed. str or None :param code: The name of the NVIDIA GPU to assemble and optimize PTX code for. If - *None*, it leave unspecified (NVCC only) + *None*, leave it unspecified (NVCC only) :type code: str or None :param cache_dir: @@ -2587,8 +2587,6 @@ It is therefore recommended to use *cudadevrt* only when needed. NVRTC using compiler options *options* and add the resulting PTX code to the linker session of this module. - NOTE: self.code is not supported with NVRTC and ignored here - :param str source: CUDA C++ source code :param options: @@ -2632,7 +2630,7 @@ It is therefore recommended to use *cudadevrt* only when needed. :param name: Specifies to the linker under which name to refer to *data* in its warning and error messages. 
If *None*, defaults - to ``"kernel.ptx"`` + to ``"unknown"`` :type name: str or None :return: diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 988d88b9..c97f4042 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -8,9 +8,9 @@ #include "wrap_helpers.hpp" #include -/* -#DEFINE USE_JIT -*/ + +#define USE_JIT + #if CUDAPP_CUDA_VERSION < 1010 -- GitLab From 21a21f97d884651c688c5c529d70258f5455c5fe Mon Sep 17 00:00:00 2001 From: chschnell Date: Tue, 24 Jan 2017 05:51:01 +0100 Subject: [PATCH 16/16] Test --- pycuda/compiler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 2ebf178a..646930ad 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -291,6 +291,7 @@ class SourceModule(CudaModule): self._bind_module() +''' class DynamicModule(CudaModule): #def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, # code=None, cache_dir=None, include_dirs=[], link_options=None, @@ -320,7 +321,7 @@ class DynamicModule(CudaModule): self.module = None def _locate_cuda_libdir(self): - ''' + '' ' Locate the "standard" CUDA SDK library directory in the local file system. Supports 64-Bit Windows, Linux and Mac OS X. In case the caller supplied cuda_libdir in the constructor @@ -344,7 +345,7 @@ class DynamicModule(CudaModule): https://cmake.org/cmake/help/v3.0/module/FindCUDA.html https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake - Verify all Linux code paths somehow - ''' + '' ' from os.path import isfile, join from platform import system as platform_system system = platform_system() @@ -462,3 +463,4 @@ class DynamicSourceModule(DynamicModule): if use_cudadevrt: self.add_stdlib('cudadevrt') self.link() +''' -- GitLab
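For reference, a minimal usage sketch of the NVRTC path introduced by this series. It is not part of the patches themselves; it assumes the experimental JIT/NVRTC wrapper code is built and enabled (USE_JIT defined, CUDA 8.0 or newer, compute capability 3.5 or higher) and that the DynamicModule class temporarily commented out in patch 16 is re-enabled. The `scale` kernel and array sizes are illustrative only:

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
    import pycuda.driver as drv
    from pycuda.compiler import DynamicModule

    # Illustrative kernel, not taken from the patch series.
    source = """
    __global__ void scale(float *x, float a)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        x[i] *= a;
    }
    """

    mod = DynamicModule()
    # Passing exports= selects the NVRTC code path; the mangled names are
    # recorded internally so get_function() can be called with the plain name.
    mod.add_source_nvrtc(source, exports=['scale'])
    mod.link()

    scale = mod.get_function('scale')
    x = np.arange(256, dtype=np.float32)
    x_gpu = drv.to_device(x)
    scale(x_gpu, np.float32(2.0), block=(256, 1, 1), grid=(1, 1))
    assert np.allclose(drv.from_device_like(x_gpu, x), 2.0 * x)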