From ebf0a6e0fb0a3a1b38fc08e155b23ff50eb81bce Mon Sep 17 00:00:00 2001 From: Lurch Date: Sun, 15 Jan 2017 12:26:12 +0100 Subject: [PATCH 01/16] Created new experimental branch for NVRTC --- pycuda/compiler.py | 2 +- setup.py | 2 +- src/wrapper/wrap_cudadrv.cpp | 106 +++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index d9f25752..494dd330 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -378,7 +378,7 @@ class DynamicModule(CudaModule): 'directory, set CUDA library path manually') return libdir, libptn - def add_source(self, source, nvcc_options=None, name='kernel.ptx'): + def add_source(self, source, nvcc_options=[], name='kernel.ptx'): ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch, code=self.code, cache_dir=self.cache_dir, diff --git a/setup.py b/setup.py index 775141b0..2f909cb3 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ def main(): LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"] LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"] - + conf["CUDADRV_LIBNAME"]) + + conf["CUDADRV_LIBNAME"]) + ["nvrtc"] if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 7bb23d4a..74067b7b 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -413,6 +413,106 @@ namespace // {{{ linker #if CUDAPP_CUDA_VERSION >= 5050 + #include // NOTE: this requires us to link static SDK library "nvrtc" + + class CudaCompiler : public boost::noncopyable + { + private: + nvrtcProgram m_prog; + std::vector m_ptx_vec; + + void check_nvrtc_result(const char* nvrtc_function_name, nvrtcResult nvrtc_result) const { + if (nvrtc_result != NVRTC_SUCCESS) { + std::string error_msg = nvrtc_function_name; + error_msg += " failed: "; + error_msg += nvrtcGetErrorString(nvrtc_result); + const std::string compilation_log = compilation_log_str(); + if (compilation_log.length() > 0) { + error_msg += " - "; + error_msg += compilation_log; + } + throw std::runtime_error(error_msg); + } + } + + const std::string compilation_log_str() const { + size_t result_length; + nvrtcResult nvrtc_result = nvrtcGetProgramLogSize(m_prog, &result_length); + check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); + if (result_length > 0) { + std::vector result_vec(result_length); + nvrtc_result = nvrtcGetProgramLog(m_prog, &result_vec[0]); + check_nvrtc_result("nvrtcGetProgramLog", nvrtc_result); + return std::string(&result_vec[0], result_length); + } + else { + return std::string(); + } + } + + public: + CudaCompiler(py::object py_src) // , py::object py_name=py::object()) + : m_prog() + { + const char* src = py::extract(py_src); + const char* name = NULL; + nvrtcResult nvrtc_result = nvrtcCreateProgram( + &m_prog, // Pointer to the result CUDA Runtime Compilation program + src, // CUDA program source, 0-terminated string + name, // CUDA program name, can be NULL (defaults to "default_program") + 0, // Number of headers >= 0 + NULL, // const char** headers, - Contents of the headers, can be NULL when numHeaders == 0 + NULL); // const char** includeNames) - Name of each header by which they can be included in the CUDA program source, can be NULL + check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); + } + + ~CudaCompiler() + { + if (m_prog != NULL) { + 
nvrtcDestroyProgram(&m_prog); + m_prog = NULL; + } + } + + void declare_function(py::object py_function_name) { + const char* function_name = py::extract(py_function_name); + const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); + check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + } + + py::object compile(py::object py_compile_options) { + std::vector compile_options; + PYTHON_FOREACH(py_option, py_compile_options) { + compile_options.push_back(py::extract(py_option)); + } + + nvrtcResult nvrtc_result; + nvrtc_result = nvrtcCompileProgram(m_prog, (int)compile_options.size(), &compile_options[0]); + check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); + + size_t ptx_size; + nvrtc_result = nvrtcGetPTXSize(m_prog, &ptx_size); + check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); + + m_ptx_vec = std::vector(ptx_size); + nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); + check_nvrtc_result("nvrtcGetPTX", nvrtc_result); + + // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) + // http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes + PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); + return py::object(py::handle<>(memory_view)); + } + + std::string mangled_function(py::object py_function_name) { + const char* function_name = py::extract(py_function_name); + const char* lowered_name = NULL; + const nvrtcResult nvrtc_result = nvrtcGetLoweredName(m_prog, function_name, &lowered_name); + check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); + return std::string(lowered_name); + } + }; + class Linker : public boost::noncopyable { private: @@ -1200,6 +1300,12 @@ BOOST_PYTHON_MODULE(_driver) .value("OBJECT", CU_JIT_INPUT_OBJECT) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); + py::class_ >("CudaCompiler", py::init()) +// .def(py::init()) + .def("declare_function", &CudaCompiler::declare_function, (py::arg("py_function_name"))) + .def("compile", &CudaCompiler::compile, (py::arg("compile_options")=py::object())) + .def("mangled_function", &CudaCompiler::mangled_function, (py::arg("py_function_name"))); + py::class_ >("Linker") .def(py::init()) .def(py::init()) -- GitLab From a408c26b7531dc709f95497b600a010c6bcd2d7f Mon Sep 17 00:00:00 2001 From: Lurch Date: Sun, 15 Jan 2017 12:54:50 +0100 Subject: [PATCH 02/16] Fixed Python version compatibility issue with PyMemoryView_FromMemory() --- src/wrapper/wrap_cudadrv.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 74067b7b..0d2ef389 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -498,10 +498,15 @@ namespace nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); check_nvrtc_result("nvrtcGetPTX", nvrtc_result); +#if PY_VERSION_HEX >= 0x03030000 // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) // http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes + // new since Python 3.3 PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); return py::object(py::handle<>(memory_view)); +#else + return py::str(std::string(&m_ptx_vec[0], ptx_size)); +#endif } std::string mangled_function(py::object py_function_name) { -- GitLab From 5e0b780b9366442fc2fa88c045ae8074c45bebed Mon Sep 17 00:00:00 2001 From: Lurch Date: Mon, 16 Jan 2017 06:38:49 +0100 Subject: [PATCH 03/16] Several improvements 
to the Compiler class --- src/wrapper/wrap_cudadrv.cpp | 74 ++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 0d2ef389..febe57a9 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -451,11 +451,15 @@ namespace } public: - CudaCompiler(py::object py_src) // , py::object py_name=py::object()) + CudaCompiler(py::object py_src, py::object py_name=py::object()) : m_prog() { const char* src = py::extract(py_src); const char* name = NULL; + if (py_name.ptr() != Py_None) { + name = py::extract(py_name); + } + nvrtcResult nvrtc_result = nvrtcCreateProgram( &m_prog, // Pointer to the result CUDA Runtime Compilation program src, // CUDA program source, 0-terminated string @@ -475,46 +479,58 @@ namespace } void declare_function(py::object py_function_name) { - const char* function_name = py::extract(py_function_name); - const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); - check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + const char* function_name = py::extract(py_function_name); + const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); + check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); } - py::object compile(py::object py_compile_options) { - std::vector compile_options; + py::object compile(py::object py_compile_options=py::object(), py::object py_function_names=py::object()) { + std::vector compile_options; + if (py_compile_options.ptr() != Py_None) { PYTHON_FOREACH(py_option, py_compile_options) { compile_options.push_back(py::extract(py_option)); } + } - nvrtcResult nvrtc_result; - nvrtc_result = nvrtcCompileProgram(m_prog, (int)compile_options.size(), &compile_options[0]); - check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); + std::vector function_names; + if (py_function_names.ptr() != Py_None) { + PYTHON_FOREACH(py_function_name, py_function_names) { + const char* function_name = py::extract(py_function_name); + const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); + check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + function_names.push_back(function_name); + } + } - size_t ptx_size; - nvrtc_result = nvrtcGetPTXSize(m_prog, &ptx_size); - check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); + nvrtcResult nvrtc_result; + nvrtc_result = nvrtcCompileProgram(m_prog, (int)compile_options.size(), &compile_options[0]); + check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); - m_ptx_vec = std::vector(ptx_size); - nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); - check_nvrtc_result("nvrtcGetPTX", nvrtc_result); + size_t ptx_size; + nvrtc_result = nvrtcGetPTXSize(m_prog, &ptx_size); + check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); + + m_ptx_vec = std::vector(ptx_size); + nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); + check_nvrtc_result("nvrtcGetPTX", nvrtc_result); #if PY_VERSION_HEX >= 0x03030000 - // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) - // http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes - // new since Python 3.3 - PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); - return py::object(py::handle<>(memory_view)); + // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) + // 
http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes + // new since Python 3.3 + PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); + return py::object(py::handle<>(memory_view)); #else - return py::str(std::string(&m_ptx_vec[0], ptx_size)); + return py::str(std::string(&m_ptx_vec[0], ptx_size)); #endif - } + } std::string mangled_function(py::object py_function_name) { - const char* function_name = py::extract(py_function_name); - const char* lowered_name = NULL; - const nvrtcResult nvrtc_result = nvrtcGetLoweredName(m_prog, function_name, &lowered_name); - check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); - return std::string(lowered_name); + const char* function_name = py::extract(py_function_name); + const char* lowered_name = NULL; + const nvrtcResult nvrtc_result = nvrtcGetLoweredName(m_prog, function_name, &lowered_name); + check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); + return std::string(lowered_name); } }; @@ -1306,9 +1322,9 @@ BOOST_PYTHON_MODULE(_driver) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); py::class_ >("CudaCompiler", py::init()) -// .def(py::init()) + .def(py::init()) .def("declare_function", &CudaCompiler::declare_function, (py::arg("py_function_name"))) - .def("compile", &CudaCompiler::compile, (py::arg("compile_options")=py::object())) + .def("compile", &CudaCompiler::compile, (py::arg("compile_options")=py::object(), py::arg("function_names")=py::object())) .def("mangled_function", &CudaCompiler::mangled_function, (py::arg("py_function_name"))); py::class_ >("Linker") -- GitLab From 6075cad00a8072d1b9862f49fb3d8e4097260e5f Mon Sep 17 00:00:00 2001 From: Lurch Date: Tue, 17 Jan 2017 19:44:36 +0100 Subject: [PATCH 04/16] Completed NVRTC implementation --- pycuda/compiler.py | 54 ++++++++ src/wrapper/wrap_cudadrv.cpp | 253 ++++++++++++++++++++++------------- 2 files changed, 213 insertions(+), 94 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 494dd330..f38ad95d 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -322,6 +322,7 @@ class DynamicModule(CudaModule): self.include_dirs = include_dirs self.cuda_libdir = cuda_libdir self.libdir, self.libptn = None, None + self.mangled_exports = None self.module = None def _locate_cuda_libdir(self): @@ -387,6 +388,53 @@ class DynamicModule(CudaModule): self.linker.add_data(ptx, jit_input_type.PTX, name) return self + def add_source_nvrtc(self, source, exports=[], compile_options=[], + name='kernel.ptx', compiler=None): + ''' + NOTE: + - self.code is not supported with NVRTC and ignored here + - headers in _find_pycuda_include_path() are supported with NVRTC: + pycuda-helpers.hpp, pycuda-complex.hpp tested (they need -I/include) + TODO: + - the include paths _find_pycuda_include_path() and CUDA_PATH/CUDA_ROOT are constant, they could be moved to NvrtcCompiler (C++) + - the compiler option '-arch' could also be moved to NvrtcCompiler + ''' + if not hasattr(self.linker, "add_nvrtc"): + raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') + + # append -arch compiler option, if not already exists + compile_options = compile_options[:] + if not '-arch' in compile_options: + if self.arch is not None: + # nvcc: "-arch sm_%d%d" + # nvrtc: "-arch compute_%d%d" + # self.arch: "sm_%d%d" or None + arch_compute_capability = (int(self.arch[3]), int(self.arch[4])) + else: + from pycuda.driver import Context + arch_compute_capability = Context.get_device().compute_capability() + compile_options.extend(['-arch', 
'compute_%u%u' % arch_compute_capability]) + + # append -I include directory options + compile_options.append('-I%s' % _find_pycuda_include_path()) + if 'CUDA_PATH' in os.environ: + compile_options.append('-I%s\\include' % os.environ['CUDA_PATH']) + elif 'CUDA_ROOT' in os.environ: + compile_options.append('-I%s/include' % os.environ['CUDA_ROOT']) + for include_dir in self.include_dirs: + compile_options.append('-I%s' % include_dir) + + # compile and link + if self.mangled_exports is None and len(exports) > 0: + self.mangled_exports = {} + if compiler is None: + from pycuda.driver import NvrtcCompiler + compiler = NvrtcCompiler() + self.linker.add_nvrtc(source, compiler, name=name, + compile_options=compile_options, exports=exports, + mangled_exports=self.mangled_exports) + return self + def add_data(self, data, input_type, name='unknown'): self.linker.add_data(data, input_type, name) return self @@ -412,6 +460,12 @@ class DynamicModule(CudaModule): self._bind_module() return self + def get_function(self, name): + if self.mangled_exports is None or name not in self.mangled_exports: + return self.module.get_function(name) + else: + return self.module.get_function(self.mangled_exports[name]) + class DynamicSourceModule(DynamicModule): ''' Creates a Module from a single .cu source object linked against the diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index febe57a9..ecd04a3c 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -410,16 +410,49 @@ namespace // }}} - // {{{ linker + // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 5050 +#if CUDAPP_CUDA_VERSION >= 8000 #include // NOTE: this requires us to link static SDK library "nvrtc" - class CudaCompiler : public boost::noncopyable + class NvrtcProgram { private: nvrtcProgram m_prog; - std::vector m_ptx_vec; + + const std::string compilation_log_str() const { + size_t result_length; + nvrtcResult nvrtc_result = nvrtcGetProgramLogSize(m_prog, &result_length); + check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); + if (result_length == 0) { + return std::string(); + } + std::vector result_vec(result_length); + nvrtc_result = nvrtcGetProgramLog(m_prog, &result_vec[0]); + check_nvrtc_result("nvrtcGetProgramLog", nvrtc_result); + return std::string(&result_vec[0], result_length); + } + + public: + NvrtcProgram(const char* src, const char* name=NULL, int numHeaders=0, + const char** includeNames=NULL, const char** headers=NULL) + : m_prog() + { + const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog, src, + name, numHeaders, headers, includeNames); + check_nvrtc_result("nvrtcCreateProgram", nvrtc_result); + } + + ~NvrtcProgram() { + if (m_prog != NULL) { + nvrtcDestroyProgram(&m_prog); + m_prog = NULL; + } + } + + nvrtcProgram prog() const { + return m_prog; + } void check_nvrtc_result(const char* nvrtc_function_name, nvrtcResult nvrtc_result) const { if (nvrtc_result != NVRTC_SUCCESS) { @@ -434,106 +467,84 @@ namespace throw std::runtime_error(error_msg); } } + }; - const std::string compilation_log_str() const { - size_t result_length; - nvrtcResult nvrtc_result = nvrtcGetProgramLogSize(m_prog, &result_length); - check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); - if (result_length > 0) { - std::vector result_vec(result_length); - nvrtc_result = nvrtcGetProgramLog(m_prog, &result_vec[0]); - check_nvrtc_result("nvrtcGetProgramLog", nvrtc_result); - return std::string(&result_vec[0], result_length); - } - else { - return std::string(); - } - } + class NvrtcCompiler : 
public boost::noncopyable + { + private: + std::vector m_compile_options; + std::vector m_header_names; + std::vector m_header_contents; public: - CudaCompiler(py::object py_src, py::object py_name=py::object()) - : m_prog() - { - const char* src = py::extract(py_src); - const char* name = NULL; - if (py_name.ptr() != Py_None) { - name = py::extract(py_name); - } + void add_header(py::str py_header_name, py::str py_header_content) { + m_header_names.push_back(py::extract(py_header_name)); + m_header_contents.push_back(py::extract(py_header_content)); + } - nvrtcResult nvrtc_result = nvrtcCreateProgram( - &m_prog, // Pointer to the result CUDA Runtime Compilation program - src, // CUDA program source, 0-terminated string - name, // CUDA program name, can be NULL (defaults to "default_program") - 0, // Number of headers >= 0 - NULL, // const char** headers, - Contents of the headers, can be NULL when numHeaders == 0 - NULL); // const char** includeNames) - Name of each header by which they can be included in the CUDA program source, can be NULL - check_nvrtc_result("nvrtcGetProgramLogSize", nvrtc_result); + void add_compile_options(py::list py_compile_options) { + PYTHON_FOREACH(py_compile_option, py_compile_options) { + const char* compile_option = py::extract(py_compile_option); + m_compile_options.push_back(compile_option); + } } - ~CudaCompiler() + std::vector compile(const char* src, const char* name, + const std::vector* compile_options, + const std::vector* exports, + py::dict mangled_exports_dict) { - if (m_prog != NULL) { - nvrtcDestroyProgram(&m_prog); - m_prog = NULL; + // combine compile option lists into all_compile_options + std::vector all_compile_options(m_compile_options); + if (!compile_options->empty()) { + all_compile_options.insert(all_compile_options.end(), compile_options->begin(), compile_options->end()); } - } - void declare_function(py::object py_function_name) { - const char* function_name = py::extract(py_function_name); - const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); - check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); - } + // create NVRTC program + const char** header_names = m_header_names.empty()? NULL : &m_header_names[0]; + const char** header_contents = m_header_contents.empty()? 
NULL : &m_header_contents[0]; + NvrtcProgram prog(src, name, (int)m_header_names.size(), header_names, header_contents); - py::object compile(py::object py_compile_options=py::object(), py::object py_function_names=py::object()) { - std::vector compile_options; - if (py_compile_options.ptr() != Py_None) { - PYTHON_FOREACH(py_option, py_compile_options) { - compile_options.push_back(py::extract(py_option)); + // declare exported function names + nvrtcResult nvrtc_result; + for (std::vector::const_iterator it = exports->begin(); it != exports->end(); ++it) { + nvrtc_result = nvrtcAddNameExpression(prog.prog(), *it); + prog.check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); } - } - std::vector function_names; - if (py_function_names.ptr() != Py_None) { - PYTHON_FOREACH(py_function_name, py_function_names) { - const char* function_name = py::extract(py_function_name); - const nvrtcResult nvrtc_result = nvrtcAddNameExpression(m_prog, function_name); - check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); - function_names.push_back(function_name); + // compile .cu => .ptx + nvrtc_result = nvrtcCompileProgram(prog.prog(), + (int)all_compile_options.size(), &all_compile_options[0]); + prog.check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); + + // map exported function names to their mangled representation + if (mangled_exports_dict != NULL) { + for (std::vector::const_iterator it = exports->begin(); it != exports->end(); ++it) { + const char* lowered_name = NULL; + nvrtc_result = nvrtcGetLoweredName(prog.prog(), *it, &lowered_name); + prog.check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); + mangled_exports_dict[*it] = std::string(lowered_name); + } } - } - - nvrtcResult nvrtc_result; - nvrtc_result = nvrtcCompileProgram(m_prog, (int)compile_options.size(), &compile_options[0]); - check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); - - size_t ptx_size; - nvrtc_result = nvrtcGetPTXSize(m_prog, &ptx_size); - check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); - m_ptx_vec = std::vector(ptx_size); - nvrtc_result = nvrtcGetPTX(m_prog, &m_ptx_vec[0]); - check_nvrtc_result("nvrtcGetPTX", nvrtc_result); + // copy compiled ptx result into ptx_buffer + size_t ptx_size; + nvrtc_result = nvrtcGetPTXSize(prog.prog(), &ptx_size); + prog.check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); -#if PY_VERSION_HEX >= 0x03030000 - // PyMemoryView_FromMemory returns a memoryview object (https://docs.python.org/3/library/stdtypes.html#memoryview) - // http://stackoverflow.com/questions/23064407/expose-c-buffer-as-python-3-bytes - // new since Python 3.3 - PyObject* memory_view = PyMemoryView_FromMemory(&m_ptx_vec[0], ptx_size, PyBUF_READ); - return py::object(py::handle<>(memory_view)); -#else - return py::str(std::string(&m_ptx_vec[0], ptx_size)); + std::vector ptx_buffer(ptx_size); + nvrtc_result = nvrtcGetPTX(prog.prog(), &ptx_buffer[0]); + prog.check_nvrtc_result("nvrtcGetPTX", nvrtc_result); + return ptx_buffer; + } + }; #endif - } - std::string mangled_function(py::object py_function_name) { - const char* function_name = py::extract(py_function_name); - const char* lowered_name = NULL; - const nvrtcResult nvrtc_result = nvrtcGetLoweredName(m_prog, function_name, &lowered_name); - check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); - return std::string(lowered_name); - } - }; + // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 class Linker : public boost::noncopyable { private: @@ -637,6 +648,41 @@ namespace check_cu_result("cuLinkAddFile", cu_result); } +#if 
CUDAPP_CUDA_VERSION >= 8000 + void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::str py_name, + py::list py_compile_options, py::list py_exports, + py::dict py_mangled_exports) + { + // extract Python arguments + const char* src = py::extract(py_src); + const char* name = (py_name.ptr() != Py_None)? + py::extract(py_name) : "kernel.ptx"; + std::vector compile_options; + if (py_compile_options.ptr() != Py_None) { + PYTHON_FOREACH(py_option, py_compile_options) { + compile_options.push_back(py::extract(py_option)); + } + } + std::vector exports; + if (py_exports.ptr() != Py_None) { + PYTHON_FOREACH(py_function_name, py_exports) { + exports.push_back(py::extract(py_function_name)); + } + } + py::dict mangled_exports_dict = (py_mangled_exports.ptr() != Py_None)? + py::extract(py_mangled_exports) : NULL; + + // compile src into buffer ptx + std::vector ptx = compiler->compile(src, name, + &compile_options, &exports, mangled_exports_dict); + + // link buffer ptx + const CUresult cu_result = cuLinkAddData(m_link_state, + CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL); + check_cu_result("cuLinkAddData", cu_result); + } +#endif + module* link_module() { char* cubin_data = NULL; @@ -1311,6 +1357,19 @@ BOOST_PYTHON_MODULE(_driver) // }}} + // {{{ nvrtc compiler + +#if CUDAPP_CUDA_VERSION >= 8000 + py::class_ >("NvrtcCompiler") + .def("add_header", &NvrtcCompiler::add_header, ( + py::arg("header_name"), + py::arg("header_content"))) + .def("add_compile_options", &NvrtcCompiler::add_compile_options, ( + py::arg("compile_options"))); +#endif + + // }}} + // {{{ linker #if CUDAPP_CUDA_VERSION >= 5050 @@ -1321,19 +1380,25 @@ BOOST_PYTHON_MODULE(_driver) .value("OBJECT", CU_JIT_INPUT_OBJECT) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); - py::class_ >("CudaCompiler", py::init()) - .def(py::init()) - .def("declare_function", &CudaCompiler::declare_function, (py::arg("py_function_name"))) - .def("compile", &CudaCompiler::compile, (py::arg("compile_options")=py::object(), py::arg("function_names")=py::object())) - .def("mangled_function", &CudaCompiler::mangled_function, (py::arg("py_function_name"))); - py::class_ >("Linker") .def(py::init()) .def(py::init()) .def(py::init()) - .def("add_data", &Linker::add_data, (py::arg("data"), py::arg("input_type"), py::arg("name")=py::str("unknown"))) - .def("add_file", &Linker::add_file, (py::arg("filename"), py::arg("input_type"))) + .def("add_data", &Linker::add_data, ( + py::arg("data"), + py::arg("input_type"), + py::arg("name")=py::str("unknown"))) + .def("add_file", &Linker::add_file, ( + py::arg("filename"), + py::arg("input_type"))) + .def("add_nvrtc", &Linker::add_nvrtc, ( + py::arg("src"), + py::arg("compiler"), py::arg("name")=py::object(), + py::arg("compile_options")=py::object(), + py::arg("exports")=py::object(), + py::arg("mangled_exports")=py::object())) .def("link_module", &Linker::link_module, py::return_value_policy()); + #endif // }}} -- GitLab From ce8228bed917ca4fe70c75aea75413a01479eb9e Mon Sep 17 00:00:00 2001 From: Lurch Date: Wed, 18 Jan 2017 20:08:22 +0100 Subject: [PATCH 05/16] Improvements and bug-fixes: - Moved all constant NVRTC compiler configuration from DynamicSourceModule.add_source_nvrtc() to constructor of NvrtcCompiler - Added support for NVRTC in-memory header files - Added support to let the PyCUDA user create and preconfigure a NvrtcCompiler instance and pass it to add_source_nvrtc() - Added argument "exports" to constructor of DynamicSourceModule, triggers NVRTC compilation instead of NVCC - Fixed C++ buffer 
handling --- pycuda/compiler.py | 53 ++------ src/wrapper/wrap_cudadrv.cpp | 245 ++++++++++++++++++++++++----------- 2 files changed, 184 insertions(+), 114 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index f38ad95d..f9ebbe14 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -307,7 +307,7 @@ class DynamicModule(CudaModule): from pycuda.driver import Context compute_capability = Context.get_device().compute_capability() if compute_capability < (3,5): - raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % + raise RuntimeError('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % (compute_capability[0], compute_capability[1])) else: from pycuda.driver import Linker @@ -390,46 +390,15 @@ class DynamicModule(CudaModule): def add_source_nvrtc(self, source, exports=[], compile_options=[], name='kernel.ptx', compiler=None): - ''' - NOTE: - - self.code is not supported with NVRTC and ignored here - - headers in _find_pycuda_include_path() are supported with NVRTC: - pycuda-helpers.hpp, pycuda-complex.hpp tested (they need -I/include) - TODO: - - the include paths _find_pycuda_include_path() and CUDA_PATH/CUDA_ROOT are constant, they could be moved to NvrtcCompiler (C++) - - the compiler option '-arch' could also be moved to NvrtcCompiler - ''' + # NOTE: - self.code is not supported with NVRTC and ignored here if not hasattr(self.linker, "add_nvrtc"): raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') - - # append -arch compiler option, if not already exists - compile_options = compile_options[:] - if not '-arch' in compile_options: - if self.arch is not None: - # nvcc: "-arch sm_%d%d" - # nvrtc: "-arch compute_%d%d" - # self.arch: "sm_%d%d" or None - arch_compute_capability = (int(self.arch[3]), int(self.arch[4])) - else: - from pycuda.driver import Context - arch_compute_capability = Context.get_device().compute_capability() - compile_options.extend(['-arch', 'compute_%u%u' % arch_compute_capability]) - - # append -I include directory options - compile_options.append('-I%s' % _find_pycuda_include_path()) - if 'CUDA_PATH' in os.environ: - compile_options.append('-I%s\\include' % os.environ['CUDA_PATH']) - elif 'CUDA_ROOT' in os.environ: - compile_options.append('-I%s/include' % os.environ['CUDA_ROOT']) - for include_dir in self.include_dirs: - compile_options.append('-I%s' % include_dir) - - # compile and link if self.mangled_exports is None and len(exports) > 0: self.mangled_exports = {} if compiler is None: from pycuda.driver import NvrtcCompiler - compiler = NvrtcCompiler() + compiler = NvrtcCompiler(compile_options, self.include_dirs, self.arch) + compile_options = None self.linker.add_nvrtc(source, compiler, name=name, compile_options=compile_options, exports=exports, mangled_exports=self.mangled_exports) @@ -477,16 +446,20 @@ class DynamicSourceModule(DynamicModule): ''' def __init__(self, source, nvcc="nvcc", options=[], keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], cuda_libdir=None): + include_dirs=[], cuda_libdir=None, exports=None): super(DynamicSourceModule, self).__init__(nvcc=nvcc, link_options=None, keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, cache_dir=cache_dir, include_dirs=include_dirs, cuda_libdir=cuda_libdir) options = options[:] - if not '-rdc=true' in options: + if '-rdc=true' not in options: options.append('-rdc=true') - if not '-lcudadevrt' in options: - options.append('-lcudadevrt') - 
self.add_source(source, nvcc_options=options) + if exports is None: + if '-lcudadevrt' not in options: + options.append('-lcudadevrt') + self.add_source(source, nvcc_options=options) + else: + self.add_source_nvrtc(source, exports=exports, + compile_options=options) self.add_stdlib('cudadevrt') self.link() diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index ecd04a3c..09fb57be 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -434,12 +434,16 @@ namespace } public: - NvrtcProgram(const char* src, const char* name=NULL, int numHeaders=0, - const char** includeNames=NULL, const char** headers=NULL) + NvrtcProgram(const char* src, const char* name, + const std::vector& header_names_p, + const std::vector& header_contents_p) : m_prog() { - const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog, src, - name, numHeaders, headers, includeNames); + const size_t header_count = header_names_p.size(); + const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog, + src, name, (int)header_count, + (header_count > 0)? &header_contents_p[0] : NULL, + (header_count > 0)? &header_names_p[0] : NULL); check_nvrtc_result("nvrtcCreateProgram", nvrtc_result); } @@ -472,66 +476,177 @@ namespace class NvrtcCompiler : public boost::noncopyable { private: - std::vector m_compile_options; - std::vector m_header_names; - std::vector m_header_contents; + std::vector m_compile_options; + std::vector m_header_names; + std::vector m_header_contents; + std::vector m_compile_options_p; + std::vector m_header_names_p; + std::vector m_header_contents_p; +#ifdef _WIN32 + std::vector m_win32_getenv_buf; +#endif + + char* get_env(const char* varname) { +#ifdef _WIN32 + // Visual Studio 2015: std::getenv() causes compiler warning C4996 + size_t required_size = 0; + getenv_s(&required_size, NULL, 0, varname); + if (required_size > 0) { + m_win32_getenv_buf.resize(required_size); + getenv_s(&required_size, &m_win32_getenv_buf[0], required_size, varname); + return &m_win32_getenv_buf[0]; + } + else { + return NULL; + } +#else + return std::getenv(varname); +#endif + } - public: - void add_header(py::str py_header_name, py::str py_header_content) { - m_header_names.push_back(py::extract(py_header_name)); - m_header_contents.push_back(py::extract(py_header_content)); + std::string find_pycuda_include_path() const { + // pycuda.compiler._find_pycuda_include_path(): + // from pkg_resources import Requirement, resource_filename + // return resource_filename(Requirement.parse("pycuda"), "pycuda/cuda") + const py::object pkg_resources = py::import("pkg_resources"); + const py::object resource_filename = pkg_resources.attr("resource_filename"); + const py::object Requirement_parse = pkg_resources.attr("Requirement").attr("parse"); + const py::object py_result = resource_filename(Requirement_parse("pycuda"), "pycuda/cuda"); + return py::extract(py_result); + } + + void options_changed() { + const size_t n = m_compile_options.size(); + m_compile_options_p.resize(n); + for (size_t i=0; i(py_compile_option); - m_compile_options.push_back(compile_option); + public: + NvrtcCompiler( + const py::object py_compiler_options=py::object(), + const py::object py_include_dirs=py::object(), + const py::object py_arch=py::object(), + const py::object py_headers=py::object()) + { + // Target GPU architecture (-arch) compiler option + std::string arch_compute_capability; + if (py_arch.ptr() != Py_None) { + const char* arch = py::extract(py_arch); + if (strlen(arch) == 5 && std::memcmp(arch, 
"sm_", 3) == 0) { + arch_compute_capability = std::string("compute_") + &arch[3]; + } + else { + arch_compute_capability = arch; + } + } + else { + const py::tuple py_compute_capability = context::get_device().compute_capability(); + const char arch_major = '0' + py::extract(py_compute_capability[0]); + const char arch_minor = '0' + py::extract(py_compute_capability[1]); + arch_compute_capability = std::string("compute_") + arch_major + arch_minor; + } + m_compile_options.push_back("-arch"); + m_compile_options.push_back(arch_compute_capability); + + // Include directory (-I) compiler options + m_compile_options.push_back(std::string("-I") + find_pycuda_include_path()); + const char* CUDA_SDK; + if ((CUDA_SDK = get_env("CUDA_PATH")) != NULL) { + m_compile_options.push_back(std::string("-I") + CUDA_SDK + "\\include"); + } + else if ((CUDA_SDK = get_env("CUDA_ROOT")) != NULL) { + m_compile_options.push_back(std::string("-I") + CUDA_SDK + "/include"); + } + if (py_include_dirs.ptr() != Py_None) { + PYTHON_FOREACH(py_include_dir, py_include_dirs) { + m_compile_options.push_back(std::string("-I") + py::extract(py_include_dir)()); + } + } + + // Other compiler options specified by the caller + if (py_include_dirs.ptr() != Py_None) { + PYTHON_FOREACH(py_compiler_option, py_compiler_options) { + m_compile_options.push_back(py::extract(py_compiler_option)); + } + } + + options_changed(); + + // In-memory NVRCT header files + if (py_headers.ptr() != Py_None) { + const py::dict py_headers_dict = py::extract(py_headers); + if (py::len(py_headers_dict) > 0) { + const py::list py_keys = py_headers_dict.keys(); + PYTHON_FOREACH(py_key, py_keys) { + m_header_names.push_back(py::extract(py_key)); + m_header_contents.push_back(py::extract(py_headers_dict[py_key])); + } + header_changed(); + } } } - std::vector compile(const char* src, const char* name, - const std::vector* compile_options, - const std::vector* exports, - py::dict mangled_exports_dict) + std::vector compile( + const py::str& py_src, + const char* name, + const py::object& py_compile_options, + const py::object& py_exports, + py::object& py_mangled_exports) { - // combine compile option lists into all_compile_options - std::vector all_compile_options(m_compile_options); - if (!compile_options->empty()) { - all_compile_options.insert(all_compile_options.end(), compile_options->begin(), compile_options->end()); + // Combine compile option lists into compile_options_p + std::vector compile_options_p(m_compile_options_p); + if (py_compile_options.ptr() != Py_None) { + PYTHON_FOREACH(py_compile_option, py_compile_options) { + compile_options_p.push_back(py::extract(py_compile_option)); + } } - // create NVRTC program - const char** header_names = m_header_names.empty()? NULL : &m_header_names[0]; - const char** header_contents = m_header_contents.empty()? 
NULL : &m_header_contents[0]; - NvrtcProgram prog(src, name, (int)m_header_names.size(), header_names, header_contents); + // Create NVRTC program + NvrtcProgram prog(py::extract(py_src), name, + m_header_names_p, m_header_contents_p); - // declare exported function names + // Declare exported function names nvrtcResult nvrtc_result; - for (std::vector::const_iterator it = exports->begin(); it != exports->end(); ++it) { - nvrtc_result = nvrtcAddNameExpression(prog.prog(), *it); - prog.check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + if (py_exports.ptr() != Py_None) { + PYTHON_FOREACH(py_export, py_exports) { + nvrtc_result = nvrtcAddNameExpression(prog.prog(), py::extract(py_export)); + prog.check_nvrtc_result("nvrtcAddNameExpression", nvrtc_result); + } } - // compile .cu => .ptx + // Compile .cu => .ptx nvrtc_result = nvrtcCompileProgram(prog.prog(), - (int)all_compile_options.size(), &all_compile_options[0]); + (int)compile_options_p.size(), &compile_options_p[0]); prog.check_nvrtc_result("nvrtcCompileProgram", nvrtc_result); - // map exported function names to their mangled representation - if (mangled_exports_dict != NULL) { - for (std::vector::const_iterator it = exports->begin(); it != exports->end(); ++it) { - const char* lowered_name = NULL; - nvrtc_result = nvrtcGetLoweredName(prog.prog(), *it, &lowered_name); + // Map exported function names to their mangled representation + if (py_exports.ptr() != Py_None && py_mangled_exports.ptr() != Py_None) { + py::dict py_mangled_exports_dict = py::extract(py_mangled_exports); + PYTHON_FOREACH(py_export, py_exports) { + const char* plain_name = py::extract(py_export); + const char* mangled_name = NULL; + nvrtc_result = nvrtcGetLoweredName(prog.prog(), plain_name, &mangled_name); prog.check_nvrtc_result("nvrtcGetLoweredName", nvrtc_result); - mangled_exports_dict[*it] = std::string(lowered_name); + py_mangled_exports_dict[py_export] = mangled_name; } } - // copy compiled ptx result into ptx_buffer + // Copy compiled ptx result into ptx_buffer size_t ptx_size; nvrtc_result = nvrtcGetPTXSize(prog.prog(), &ptx_size); prog.check_nvrtc_result("nvrtcGetPTXSize", nvrtc_result); - std::vector ptx_buffer(ptx_size); nvrtc_result = nvrtcGetPTX(prog.prog(), &ptx_buffer[0]); prog.check_nvrtc_result("nvrtcGetPTX", nvrtc_result); @@ -633,7 +748,7 @@ namespace throw py::error_already_set(); } const char* name = (py_name.ptr() != Py_None)? - py::extract(py_name) : NULL; + py::extract(py_name) : NULL; const CUresult cu_result = cuLinkAddData(m_link_state, input_type, static_cast(const_cast(data_buf)), data_buf_len, name, 0, NULL, NULL); @@ -642,40 +757,22 @@ namespace void add_file(py::str py_filename, CUjitInputType input_type) { - const char* filename = py::extract(py_filename); + const char* filename = py::extract(py_filename); const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, filename, 0, NULL, NULL); check_cu_result("cuLinkAddFile", cu_result); } #if CUDAPP_CUDA_VERSION >= 8000 - void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::str py_name, - py::list py_compile_options, py::list py_exports, - py::dict py_mangled_exports) + void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::object py_name, + py::object py_compile_options, py::object py_exports, + py::object py_mangled_exports) { - // extract Python arguments - const char* src = py::extract(py_src); const char* name = (py_name.ptr() != Py_None)? 
py::extract(py_name) : "kernel.ptx"; - std::vector compile_options; - if (py_compile_options.ptr() != Py_None) { - PYTHON_FOREACH(py_option, py_compile_options) { - compile_options.push_back(py::extract(py_option)); - } - } - std::vector exports; - if (py_exports.ptr() != Py_None) { - PYTHON_FOREACH(py_function_name, py_exports) { - exports.push_back(py::extract(py_function_name)); - } - } - py::dict mangled_exports_dict = (py_mangled_exports.ptr() != Py_None)? - py::extract(py_mangled_exports) : NULL; - // compile src into buffer ptx - std::vector ptx = compiler->compile(src, name, - &compile_options, &exports, mangled_exports_dict); - + std::vector ptx = compiler->compile(py_src, name, + py_compile_options, py_exports, py_mangled_exports); // link buffer ptx const CUresult cu_result = cuLinkAddData(m_link_state, CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL); @@ -1361,11 +1458,10 @@ BOOST_PYTHON_MODULE(_driver) #if CUDAPP_CUDA_VERSION >= 8000 py::class_ >("NvrtcCompiler") - .def("add_header", &NvrtcCompiler::add_header, ( - py::arg("header_name"), - py::arg("header_content"))) - .def("add_compile_options", &NvrtcCompiler::add_compile_options, ( - py::arg("compile_options"))); + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def(py::init()); #endif // }}} @@ -1393,12 +1489,13 @@ BOOST_PYTHON_MODULE(_driver) py::arg("input_type"))) .def("add_nvrtc", &Linker::add_nvrtc, ( py::arg("src"), - py::arg("compiler"), py::arg("name")=py::object(), + py::arg("compiler"), + py::arg("name")=py::object(), py::arg("compile_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) - .def("link_module", &Linker::link_module, py::return_value_policy()); - + .def("link_module", &Linker::link_module, + py::return_value_policy()); #endif // }}} -- GitLab From e04c1948ca356036cd63cc8d562f37563a5f6139 Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 19 Jan 2017 10:09:27 +0100 Subject: [PATCH 06/16] Temporary test to investigate crash --- src/wrapper/wrap_cudadrv.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 09fb57be..a194fd6f 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -412,7 +412,7 @@ namespace // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 8000 +#if CUDAPP_CUDA_VERSION >= 18000 #include // NOTE: this requires us to link static SDK library "nvrtc" class NvrtcProgram @@ -763,7 +763,7 @@ namespace check_cu_result("cuLinkAddFile", cu_result); } -#if CUDAPP_CUDA_VERSION >= 8000 +#if CUDAPP_CUDA_VERSION >= 18000 void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::object py_name, py::object py_compile_options, py::object py_exports, py::object py_mangled_exports) @@ -1456,7 +1456,7 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 8000 +#if CUDAPP_CUDA_VERSION >= 18000 py::class_ >("NvrtcCompiler") .def(py::init()) .def(py::init()) @@ -1487,6 +1487,7 @@ BOOST_PYTHON_MODULE(_driver) .def("add_file", &Linker::add_file, ( py::arg("filename"), py::arg("input_type"))) +#if CUDAPP_CUDA_VERSION >= 18000 .def("add_nvrtc", &Linker::add_nvrtc, ( py::arg("src"), py::arg("compiler"), @@ -1494,6 +1495,7 @@ BOOST_PYTHON_MODULE(_driver) py::arg("compile_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) +#endif .def("link_module", &Linker::link_module, py::return_value_policy()); #endif -- GitLab From dad4bdba958c03030724d7daba9fada8b78ae4e2 Mon 
Sep 17 00:00:00 2001 From: Lurch Date: Thu, 19 Jan 2017 11:08:21 +0100 Subject: [PATCH 07/16] 2nd temporary test to investigate crash: removed nvrtc library --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2f909cb3..f6433a84 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ def main(): LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"] LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"] - + conf["CUDADRV_LIBNAME"]) + ["nvrtc"] + + conf["CUDADRV_LIBNAME"]) # + ["nvrtc"] if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] -- GitLab From b33a50803f67fd0d0d057cf187a3b218850f79d7 Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 19 Jan 2017 21:59:47 +0100 Subject: [PATCH 08/16] Reverted temporary tests, modified static linking of nvrtc.lib in setup.py to proper syntax, normalized a couple of method prototypes after drafting the documentation. --- examples/demo_cdpSimplePrint.py | 2 +- pycuda/compiler.py | 66 ++++++++++++------------------ setup.py | 5 ++- src/wrapper/wrap_cudadrv.cpp | 72 +++++++++++++++++++++------------ test/test_driver.py | 2 +- 5 files changed, 78 insertions(+), 69 deletions(-) diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py index d5435987..b2c0b4e6 100644 --- a/examples/demo_cdpSimplePrint.py +++ b/examples/demo_cdpSimplePrint.py @@ -95,7 +95,7 @@ def main(argv): print("starting Simple Print (CUDA Dynamic Parallelism)") - mod = DynamicSourceModule(cdpSimplePrint_cu) + mod = DynamicSourceModule(cdpSimplePrint_cu, use_cudadevrt=True) cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call print("***************************************************************************") diff --git a/pycuda/compiler.py b/pycuda/compiler.py index f9ebbe14..94c78352 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -278,10 +278,6 @@ class CudaModule(object): return self.module.get_function(name) class SourceModule(CudaModule): - ''' - Creates a Module from a single .cu source object linked against the - static CUDA runtime. - ''' def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): @@ -296,22 +292,17 @@ class SourceModule(CudaModule): self._bind_module() class DynamicModule(CudaModule): - ''' - Creates a Module from multiple .cu source, library file and/or data - objects linked against the static or dynamic CUDA runtime. - ''' - def __init__(self, nvcc='nvcc', link_options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], message_handler=None, log_verbose=False, - cuda_libdir=None): + def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, + code=None, cache_dir=None, include_dirs=None, link_options=None, + message_handler=None, log_verbose=False, cuda_libdir=None): from pycuda.driver import Context compute_capability = Context.get_device().compute_capability() if compute_capability < (3,5): - raise RuntimeError('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % + raise RuntimeError('Minimum compute capability for JIT linker is 3.5 (found: %u.%u)!' 
% (compute_capability[0], compute_capability[1])) else: from pycuda.driver import Linker - self.linker = Linker(message_handler, link_options, log_verbose) + self.linker = Linker(link_options, message_handler, log_verbose) self._check_arch(arch) self.nvcc = nvcc self.keep = keep @@ -388,19 +379,18 @@ class DynamicModule(CudaModule): self.linker.add_data(ptx, jit_input_type.PTX, name) return self - def add_source_nvrtc(self, source, exports=[], compile_options=[], - name='kernel.ptx', compiler=None): - # NOTE: - self.code is not supported with NVRTC and ignored here + def add_source_nvrtc(self, source, nvrtc_options=None, + name='kernel.ptx', exports=None, compiler=None): if not hasattr(self.linker, "add_nvrtc"): raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') if self.mangled_exports is None and len(exports) > 0: self.mangled_exports = {} if compiler is None: from pycuda.driver import NvrtcCompiler - compiler = NvrtcCompiler(compile_options, self.include_dirs, self.arch) - compile_options = None - self.linker.add_nvrtc(source, compiler, name=name, - compile_options=compile_options, exports=exports, + compiler = NvrtcCompiler(nvrtc_options, self.include_dirs, self.arch) + nvrtc_options = None + self.linker.add_nvrtc(compiler, source, name=name, + nvrtc_options=nvrtc_options, exports=exports, mangled_exports=self.mangled_exports) return self @@ -436,30 +426,26 @@ class DynamicModule(CudaModule): return self.module.get_function(self.mangled_exports[name]) class DynamicSourceModule(DynamicModule): - ''' - Creates a Module from a single .cu source object linked against the - dynamic CUDA runtime. - - compiler generates PTX relocatable device code (rdc) from source that - can be linked with other relocatable device code - - source is linked against the CUDA device runtime library cudadevrt - - library cudadevrt is statically linked into the generated Module - ''' - def __init__(self, source, nvcc="nvcc", options=[], keep=False, + def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], cuda_libdir=None, exports=None): + include_dirs=None, use_cudadevrt=False, cuda_libdir=None, + exports=None): super(DynamicSourceModule, self).__init__(nvcc=nvcc, - link_options=None, keep=keep, no_extern_c=no_extern_c, - arch=arch, code=code, cache_dir=cache_dir, - include_dirs=include_dirs, cuda_libdir=cuda_libdir) - options = options[:] - if '-rdc=true' not in options: + keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, + cache_dir=cache_dir, include_dirs=include_dirs, link_options=None, + cuda_libdir=cuda_libdir) + if options is None: + options = [] + else: + options = options[:] + if use_cudadevrt and '-rdc=true' not in options: options.append('-rdc=true') if exports is None: - if '-lcudadevrt' not in options: + if use_cudadevrt and '-lcudadevrt' not in options: options.append('-lcudadevrt') self.add_source(source, nvcc_options=options) else: - self.add_source_nvrtc(source, exports=exports, - compile_options=options) - self.add_stdlib('cudadevrt') + self.add_source_nvrtc(source, nvrtc_options=options, exports=exports) + if use_cudadevrt: + self.add_stdlib('cudadevrt') self.link() diff --git a/setup.py b/setup.py index f6433a84..a398920a 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,9 @@ def get_config_schema(): LibraryDir("CURAND", default_lib_dirs), Libraries("CURAND", ["curand"]), + LibraryDir("NVRTC", default_lib_dirs), + Libraries("NVRTC", ["nvrtc"]), + StringListOption("CXXFLAGS", 
cxxflags_default, help="Any extra C++ compiler options to include"), StringListOption("LDFLAGS", ldflags_default, @@ -117,7 +120,7 @@ def main(): LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"] LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"] - + conf["CUDADRV_LIBNAME"]) # + ["nvrtc"] + + conf["CUDADRV_LIBNAME"]) + conf["NVRTC_LIBNAME"] if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index a194fd6f..46b3299c 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -412,7 +412,7 @@ namespace // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 18000 +#if CUDAPP_CUDA_VERSION >= 8000 #include // NOTE: this requires us to link static SDK library "nvrtc" class NvrtcProgram @@ -434,14 +434,14 @@ namespace } public: - NvrtcProgram(const char* src, const char* name, + NvrtcProgram(const char* source, const char* name, const std::vector& header_names_p, const std::vector& header_contents_p) : m_prog() { const size_t header_count = header_names_p.size(); const nvrtcResult nvrtc_result = nvrtcCreateProgram(&m_prog, - src, name, (int)header_count, + source, name, (int)header_count, (header_count > 0)? &header_contents_p[0] : NULL, (header_count > 0)? &header_names_p[0] : NULL); check_nvrtc_result("nvrtcCreateProgram", nvrtc_result); @@ -535,7 +535,7 @@ namespace public: NvrtcCompiler( - const py::object py_compiler_options=py::object(), + const py::object py_nvrtc_options=py::object(), const py::object py_include_dirs=py::object(), const py::object py_arch=py::object(), const py::object py_headers=py::object()) @@ -577,7 +577,7 @@ namespace // Other compiler options specified by the caller if (py_include_dirs.ptr() != Py_None) { - PYTHON_FOREACH(py_compiler_option, py_compiler_options) { + PYTHON_FOREACH(py_compiler_option, py_nvrtc_options) { m_compile_options.push_back(py::extract(py_compiler_option)); } } @@ -599,22 +599,22 @@ namespace } std::vector compile( - const py::str& py_src, + const py::str& py_source, const char* name, - const py::object& py_compile_options, + const py::object& py_nvrtc_options, const py::object& py_exports, py::object& py_mangled_exports) { // Combine compile option lists into compile_options_p std::vector compile_options_p(m_compile_options_p); - if (py_compile_options.ptr() != Py_None) { - PYTHON_FOREACH(py_compile_option, py_compile_options) { + if (py_nvrtc_options.ptr() != Py_None) { + PYTHON_FOREACH(py_compile_option, py_nvrtc_options) { compile_options_p.push_back(py::extract(py_compile_option)); } } // Create NVRTC program - NvrtcProgram prog(py::extract(py_src), name, + NvrtcProgram prog(py::extract(py_source), name, m_header_names_p, m_header_contents_p); // Declare exported function names @@ -652,6 +652,20 @@ namespace prog.check_nvrtc_result("nvrtcGetPTX", nvrtc_result); return ptx_buffer; } + + std::string py_compile( + py::str py_source, + py::object py_name, + py::object py_nvrtc_options, + py::object py_exports, + py::object py_mangled_exports) + { + const char* name = (py_name.ptr() != Py_None)? 
+ py::extract(py_name) : "kernel.ptx"; + const std::vector ptx = compile(py_source, name, + py_nvrtc_options, py_exports, py_mangled_exports); + return std::string(ptx.begin(), ptx.end()); + } }; #endif @@ -706,8 +720,8 @@ namespace } public: - Linker(py::object message_handler = py::object(), - py::object py_options = py::object(), + Linker(py::object py_link_options = py::object(), + py::object message_handler = py::object(), py::object py_log_verbose = py::object(false)) : m_message_handler(message_handler), m_link_state(), @@ -719,8 +733,8 @@ namespace add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, sizeof(m_error_buf)); add_option(CU_JIT_LOG_VERBOSE, m_log_verbose? 1ull : 0ull); - if (py_options.ptr() != Py_None) { - PYTHON_FOREACH(key_value, py_options) { + if (py_link_options.ptr() != Py_None) { + PYTHON_FOREACH(key_value, py_link_options) { add_option( py::extract(key_value[0]), py::extract(key_value[1])()); @@ -763,16 +777,16 @@ namespace check_cu_result("cuLinkAddFile", cu_result); } -#if CUDAPP_CUDA_VERSION >= 18000 - void add_nvrtc(py::str py_src, NvrtcCompiler* compiler, py::object py_name, - py::object py_compile_options, py::object py_exports, +#if CUDAPP_CUDA_VERSION >= 8000 + void add_nvrtc(NvrtcCompiler* compiler, py::str py_source, py::object py_name, + py::object py_nvrtc_options, py::object py_exports, py::object py_mangled_exports) { const char* name = (py_name.ptr() != Py_None)? py::extract(py_name) : "kernel.ptx"; - // compile src into buffer ptx - std::vector ptx = compiler->compile(py_src, name, - py_compile_options, py_exports, py_mangled_exports); + // compile source into buffer ptx + std::vector ptx = compiler->compile(py_source, name, + py_nvrtc_options, py_exports, py_mangled_exports); // link buffer ptx const CUresult cu_result = cuLinkAddData(m_link_state, CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL); @@ -1456,12 +1470,18 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler -#if CUDAPP_CUDA_VERSION >= 18000 +#if CUDAPP_CUDA_VERSION >= 8000 py::class_ >("NvrtcCompiler") .def(py::init()) .def(py::init()) .def(py::init()) - .def(py::init()); + .def(py::init()) + .def("compile", &NvrtcCompiler::py_compile, ( + py::arg("source"), + py::arg("name")=py::str("kernel.ptx"), + py::arg("nvrtc_options")=py::object(), + py::arg("exports")=py::object(), + py::arg("mangled_exports")=py::object())); #endif // }}} @@ -1487,12 +1507,12 @@ BOOST_PYTHON_MODULE(_driver) .def("add_file", &Linker::add_file, ( py::arg("filename"), py::arg("input_type"))) -#if CUDAPP_CUDA_VERSION >= 18000 +#if CUDAPP_CUDA_VERSION >= 8000 .def("add_nvrtc", &Linker::add_nvrtc, ( - py::arg("src"), py::arg("compiler"), - py::arg("name")=py::object(), - py::arg("compile_options")=py::object(), + py::arg("source"), + py::arg("name")=py::str("kernel.ptx"), + py::arg("nvrtc_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) #endif diff --git a/test/test_driver.py b/test/test_driver.py index f88a1d67..d7e68d77 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -894,7 +894,7 @@ class TestDriver: drv.memcpy_htod(b_gpu, b) from pycuda.compiler import DynamicSourceModule - mod = DynamicSourceModule(cuda_string, keep=True) + mod = DynamicSourceModule(cuda_string, keep=True, use_cudadevrt=True) func = mod.get_function("math") func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, -- GitLab From d0c479a20e0e882459e52bf49bc3f9273a3149cc Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 19 Jan 2017 22:40:41 +0100 Subject: [PATCH 09/16] Fixed bug, 
include_dirs must not be None --- pycuda/compiler.py | 4 ++-- src/wrapper/wrap_cudadrv.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 94c78352..7d08ce2b 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -293,7 +293,7 @@ class SourceModule(CudaModule): class DynamicModule(CudaModule): def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, - code=None, cache_dir=None, include_dirs=None, link_options=None, + code=None, cache_dir=None, include_dirs=[], link_options=None, message_handler=None, log_verbose=False, cuda_libdir=None): from pycuda.driver import Context compute_capability = Context.get_device().compute_capability() @@ -428,7 +428,7 @@ class DynamicModule(CudaModule): class DynamicSourceModule(DynamicModule): def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=None, use_cudadevrt=False, cuda_libdir=None, + include_dirs=[], use_cudadevrt=False, cuda_libdir=None, exports=None): super(DynamicSourceModule, self).__init__(nvcc=nvcc, keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 46b3299c..68e2fa5c 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -576,7 +576,7 @@ namespace } // Other compiler options specified by the caller - if (py_include_dirs.ptr() != Py_None) { + if (py_nvrtc_options.ptr() != Py_None) { PYTHON_FOREACH(py_compiler_option, py_nvrtc_options) { m_compile_options.push_back(py::extract(py_compiler_option)); } -- GitLab From 784f8c2f867418636d4d21cbbc31386779c18fd4 Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 20 Jan 2017 17:54:19 +0100 Subject: [PATCH 10/16] Minor modifications to a few prototypes, removed shared_ptr<> from Boost::Python declaration of C++ classes --- pycuda/compiler.py | 2 +- src/wrapper/wrap_cudadrv.cpp | 32 +++++++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 7d08ce2b..a6da5c89 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -380,7 +380,7 @@ class DynamicModule(CudaModule): return self def add_source_nvrtc(self, source, nvrtc_options=None, - name='kernel.ptx', exports=None, compiler=None): + name=None, exports=None, compiler=None): if not hasattr(self.linker, "add_nvrtc"): raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') if self.mangled_exports is None and len(exports) > 0: diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 68e2fa5c..bb1ffc00 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -584,7 +584,7 @@ namespace options_changed(); - // In-memory NVRCT header files + // In-memory NVRTC header files if (py_headers.ptr() != Py_None) { const py::dict py_headers_dict = py::extract(py_headers); if (py::len(py_headers_dict) > 0) { @@ -600,7 +600,7 @@ namespace std::vector compile( const py::str& py_source, - const char* name, + const py::object& py_name, const py::object& py_nvrtc_options, const py::object& py_exports, py::object& py_mangled_exports) @@ -614,6 +614,8 @@ namespace } // Create NVRTC program + const char* name = (py_name.ptr() != Py_None)? 
+ py::extract(py_name) : "kernel.ptx"; NvrtcProgram prog(py::extract(py_source), name, m_header_names_p, m_header_contents_p); @@ -660,9 +662,7 @@ namespace py::object py_exports, py::object py_mangled_exports) { - const char* name = (py_name.ptr() != Py_None)? - py::extract(py_name) : "kernel.ptx"; - const std::vector ptx = compile(py_source, name, + const std::vector ptx = compile(py_source, py_name, py_nvrtc_options, py_exports, py_mangled_exports); return std::string(ptx.begin(), ptx.end()); } @@ -754,7 +754,7 @@ namespace close(); } - void add_data(py::object py_data, CUjitInputType input_type, py::str py_name) + void add_data(py::object py_data, CUjitInputType input_type, py::object py_name) { const char *data_buf; PYCUDA_BUFFER_SIZE_T data_buf_len; @@ -762,7 +762,7 @@ namespace throw py::error_already_set(); } const char* name = (py_name.ptr() != Py_None)? - py::extract(py_name) : NULL; + py::extract(py_name) : "unknown"; const CUresult cu_result = cuLinkAddData(m_link_state, input_type, static_cast(const_cast(data_buf)), data_buf_len, name, 0, NULL, NULL); @@ -782,12 +782,12 @@ namespace py::object py_nvrtc_options, py::object py_exports, py::object py_mangled_exports) { - const char* name = (py_name.ptr() != Py_None)? - py::extract(py_name) : "kernel.ptx"; // compile source into buffer ptx - std::vector ptx = compiler->compile(py_source, name, + std::vector ptx = compiler->compile(py_source, py_name, py_nvrtc_options, py_exports, py_mangled_exports); // link buffer ptx + const char* name = (py_name.ptr() != Py_None)? + py::extract(py_name) : "kernel.ptx"; const CUresult cu_result = cuLinkAddData(m_link_state, CU_JIT_INPUT_PTX, &ptx[0], ptx.size(), name, 0, NULL, NULL); check_cu_result("cuLinkAddData", cu_result); @@ -1471,14 +1471,15 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler #if CUDAPP_CUDA_VERSION >= 8000 - py::class_ >("NvrtcCompiler") +// py::class_ >("NvrtcCompiler") + py::class_("NvrtcCompiler") .def(py::init()) .def(py::init()) .def(py::init()) .def(py::init()) .def("compile", &NvrtcCompiler::py_compile, ( py::arg("source"), - py::arg("name")=py::str("kernel.ptx"), + py::arg("name")=py::object(), py::arg("nvrtc_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())); @@ -1496,14 +1497,15 @@ BOOST_PYTHON_MODULE(_driver) .value("OBJECT", CU_JIT_INPUT_OBJECT) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); - py::class_ >("Linker") +// py::class_ >("Linker") + py::class_("Linker") .def(py::init()) .def(py::init()) .def(py::init()) .def("add_data", &Linker::add_data, ( py::arg("data"), py::arg("input_type"), - py::arg("name")=py::str("unknown"))) + py::arg("name")=py::object())) .def("add_file", &Linker::add_file, ( py::arg("filename"), py::arg("input_type"))) @@ -1511,7 +1513,7 @@ BOOST_PYTHON_MODULE(_driver) .def("add_nvrtc", &Linker::add_nvrtc, ( py::arg("compiler"), py::arg("source"), - py::arg("name")=py::str("kernel.ptx"), + py::arg("name")=py::object(), py::arg("nvrtc_options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) -- GitLab From 715ef3d2ec7cee2b2d5b6196265bf8840ee3e8d6 Mon Sep 17 00:00:00 2001 From: chschnell Date: Sun, 22 Jan 2017 18:48:53 +0100 Subject: [PATCH 11/16] Final tuning of prototypes, changed C++ class names to properly match Boost class naming convention --- pycuda/compiler.py | 49 +++++++++++++++++++++++------------- src/wrapper/wrap_cudadrv.cpp | 44 ++++++++++++++++---------------- 2 files changed, 53 insertions(+), 40 deletions(-) diff --git 
a/pycuda/compiler.py b/pycuda/compiler.py index a6da5c89..2ebf178a 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -292,8 +292,11 @@ class SourceModule(CudaModule): self._bind_module() class DynamicModule(CudaModule): - def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, - code=None, cache_dir=None, include_dirs=[], link_options=None, + #def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, + # code=None, cache_dir=None, include_dirs=[], link_options=None, + # message_handler=None, log_verbose=False, cuda_libdir=None): + def __init__(self, nvcc=None, keep=False, no_extern_c=False, arch=None, + code=None, cache_dir=None, include_dirs=None, link_options=None, message_handler=None, log_verbose=False, cuda_libdir=None): from pycuda.driver import Context compute_capability = Context.get_device().compute_capability() @@ -301,8 +304,8 @@ class DynamicModule(CudaModule): raise RuntimeError('Minimum compute capability for JIT linker is 3.5 (found: %u.%u)!' % (compute_capability[0], compute_capability[1])) else: - from pycuda.driver import Linker - self.linker = Linker(link_options, message_handler, log_verbose) + from pycuda.driver import JitLinker + self.linker = JitLinker(link_options, message_handler, log_verbose) self._check_arch(arch) self.nvcc = nvcc self.keep = keep @@ -370,31 +373,41 @@ class DynamicModule(CudaModule): 'directory, set CUDA library path manually') return libdir, libptn - def add_source(self, source, nvcc_options=[], name='kernel.ptx'): - ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, + def add_source_nvcc(self, source, options=None, name=None): + if self.nvcc is None: + nvcc = 'nvcc' + else: + nvcc = self.nvcc + if self.include_dirs is None: + include_dirs = [] + else: + include_dirs = self.include_dirs + ptx = compile(source, nvcc=nvcc, options=options, keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch, code=self.code, cache_dir=self.cache_dir, - include_dirs=self.include_dirs, target="ptx") + include_dirs=include_dirs, target="ptx") from pycuda.driver import jit_input_type + if name is None: + name = 'kernel.ptx' self.linker.add_data(ptx, jit_input_type.PTX, name) return self - def add_source_nvrtc(self, source, nvrtc_options=None, + def add_source_nvrtc(self, source, options=None, name=None, exports=None, compiler=None): - if not hasattr(self.linker, "add_nvrtc"): + if not hasattr(self.linker, "add_source_nvrtc"): raise RuntimeError('NVRTC not supported, CUDA 8.0 or higher required!') if self.mangled_exports is None and len(exports) > 0: self.mangled_exports = {} if compiler is None: from pycuda.driver import NvrtcCompiler - compiler = NvrtcCompiler(nvrtc_options, self.include_dirs, self.arch) - nvrtc_options = None - self.linker.add_nvrtc(compiler, source, name=name, - nvrtc_options=nvrtc_options, exports=exports, + compiler = NvrtcCompiler(options, self.include_dirs, self.arch) + options = None + self.linker.add_source_nvrtc(source, compiler, name=name, + options=options, exports=exports, mangled_exports=self.mangled_exports) return self - def add_data(self, data, input_type, name='unknown'): + def add_data(self, data, input_type, name=None): self.linker.add_data(data, input_type, name) return self @@ -426,9 +439,9 @@ class DynamicModule(CudaModule): return self.module.get_function(self.mangled_exports[name]) class DynamicSourceModule(DynamicModule): - def __init__(self, source, nvcc="nvcc", options=None, keep=False, + def __init__(self, source, nvcc=None, options=None, keep=False, 
no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], use_cudadevrt=False, cuda_libdir=None, + include_dirs=None, use_cudadevrt=False, cuda_libdir=None, exports=None): super(DynamicSourceModule, self).__init__(nvcc=nvcc, keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, @@ -443,9 +456,9 @@ class DynamicSourceModule(DynamicModule): if exports is None: if use_cudadevrt and '-lcudadevrt' not in options: options.append('-lcudadevrt') - self.add_source(source, nvcc_options=options) + self.add_source_nvcc(source, options=options) else: - self.add_source_nvrtc(source, nvrtc_options=options, exports=exports) + self.add_source_nvrtc(source, options=options, exports=exports) if use_cudadevrt: self.add_stdlib('cudadevrt') self.link() diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index bb1ffc00..1988d492 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -473,7 +473,7 @@ namespace } }; - class NvrtcCompiler : public boost::noncopyable + class nvrtc_compiler : public boost::noncopyable { private: std::vector m_compile_options; @@ -534,7 +534,7 @@ namespace } public: - NvrtcCompiler( + nvrtc_compiler( const py::object py_nvrtc_options=py::object(), const py::object py_include_dirs=py::object(), const py::object py_arch=py::object(), @@ -671,10 +671,10 @@ namespace // }}} - // {{{ linker + // {{{ jit linker #if CUDAPP_CUDA_VERSION >= 5050 - class Linker : public boost::noncopyable + class jit_linker : public boost::noncopyable { private: py::object m_message_handler; @@ -720,7 +720,7 @@ namespace } public: - Linker(py::object py_link_options = py::object(), + jit_linker(py::object py_link_options = py::object(), py::object message_handler = py::object(), py::object py_log_verbose = py::object(false)) : m_message_handler(message_handler), @@ -749,7 +749,7 @@ namespace check_cu_result("cuLinkCreate", cu_result); } - ~Linker() + ~jit_linker() { close(); } @@ -778,9 +778,9 @@ namespace } #if CUDAPP_CUDA_VERSION >= 8000 - void add_nvrtc(NvrtcCompiler* compiler, py::str py_source, py::object py_name, - py::object py_nvrtc_options, py::object py_exports, - py::object py_mangled_exports) + void add_source_nvrtc(py::str py_source, nvrtc_compiler* compiler, + py::object py_name, py::object py_nvrtc_options, + py::object py_exports, py::object py_mangled_exports) { // compile source into buffer ptx std::vector ptx = compiler->compile(py_source, py_name, @@ -1471,23 +1471,23 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler #if CUDAPP_CUDA_VERSION >= 8000 -// py::class_ >("NvrtcCompiler") - py::class_("NvrtcCompiler") +// py::class_ >("NvrtcCompiler") + py::class_("NvrtcCompiler") .def(py::init()) .def(py::init()) .def(py::init()) .def(py::init()) - .def("compile", &NvrtcCompiler::py_compile, ( + .def("compile", &nvrtc_compiler::py_compile, ( py::arg("source"), py::arg("name")=py::object(), - py::arg("nvrtc_options")=py::object(), + py::arg("options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())); #endif // }}} - // {{{ linker + // {{{ jit linker #if CUDAPP_CUDA_VERSION >= 5050 py::enum_("jit_input_type") @@ -1497,28 +1497,28 @@ BOOST_PYTHON_MODULE(_driver) .value("OBJECT", CU_JIT_INPUT_OBJECT) .value("LIBRARY", CU_JIT_INPUT_LIBRARY); -// py::class_ >("Linker") - py::class_("Linker") +// py::class_ >("JitLinker") + py::class_("JitLinker") .def(py::init()) .def(py::init()) .def(py::init()) - .def("add_data", &Linker::add_data, ( + .def("add_data", &jit_linker::add_data, ( 
py::arg("data"), py::arg("input_type"), py::arg("name")=py::object())) - .def("add_file", &Linker::add_file, ( + .def("add_file", &jit_linker::add_file, ( py::arg("filename"), py::arg("input_type"))) #if CUDAPP_CUDA_VERSION >= 8000 - .def("add_nvrtc", &Linker::add_nvrtc, ( - py::arg("compiler"), + .def("add_source_nvrtc", &jit_linker::add_source_nvrtc, ( py::arg("source"), + py::arg("compiler"), py::arg("name")=py::object(), - py::arg("nvrtc_options")=py::object(), + py::arg("options")=py::object(), py::arg("exports")=py::object(), py::arg("mangled_exports")=py::object())) #endif - .def("link_module", &Linker::link_module, + .def("link_module", &jit_linker::link_module, py::return_value_policy()); #endif -- GitLab From c2a5f761106d8c376dacb5843f85b23e12c0ad99 Mon Sep 17 00:00:00 2001 From: chschnell Date: Sun, 22 Jan 2017 18:54:02 +0100 Subject: [PATCH 12/16] Temporarily disabled dynamic compiler tests --- test/test_driver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_driver.py b/test/test_driver.py index d7e68d77..d73d1712 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -893,12 +893,14 @@ class TestDriver: drv.memcpy_htod(a_gpu, a) drv.memcpy_htod(b_gpu, b) + ''' from pycuda.compiler import DynamicSourceModule mod = DynamicSourceModule(cuda_string, keep=True, use_cudadevrt=True) func = mod.get_function("math") func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, block=(100, 1, 1), grid=(1, 1, 1)) + ''' drv.memcpy_dtoh(c, c_gpu) drv.memcpy_dtoh(d, d_gpu) @@ -934,6 +936,7 @@ class TestDriver: printf(" Hello inner world!\\n"); }''' + ''' from pycuda.compiler import DynamicModule mod = DynamicModule() mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) @@ -943,6 +946,7 @@ class TestDriver: test_kernel = mod.get_function('test_kernel') test_kernel(grid=(2,1), block=(1,1,1)) + ''' def test_import_pyopencl_before_pycuda(): -- GitLab From b75c2d0c810f6117de17972ee20c6a3c7be38974 Mon Sep 17 00:00:00 2001 From: chschnell Date: Sun, 22 Jan 2017 19:12:09 +0100 Subject: [PATCH 13/16] Temporarily disabled all JIT C++ sources --- src/wrapper/wrap_cudadrv.cpp | 12 +++++++++++- test/test_driver.py | 4 ++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 1988d492..988d88b9 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -8,7 +8,9 @@ #include "wrap_helpers.hpp" #include - +/* +#DEFINE USE_JIT +*/ #if CUDAPP_CUDA_VERSION < 1010 @@ -412,6 +414,8 @@ namespace // {{{ nvrtc compiler +#ifdef USE_JIT + #if CUDAPP_CUDA_VERSION >= 8000 #include // NOTE: this requires us to link static SDK library "nvrtc" @@ -814,6 +818,8 @@ namespace }; #endif +#endif /* #ifdef USE_JIT */ + // }}} template @@ -1470,6 +1476,8 @@ BOOST_PYTHON_MODULE(_driver) // {{{ nvrtc compiler +#ifdef USE_JIT + #if CUDAPP_CUDA_VERSION >= 8000 // py::class_ >("NvrtcCompiler") py::class_("NvrtcCompiler") @@ -1522,6 +1530,8 @@ BOOST_PYTHON_MODULE(_driver) py::return_value_policy()); #endif +#endif /* #ifdef USE_JIT */ + // }}} // {{{ function diff --git a/test/test_driver.py b/test/test_driver.py index d73d1712..e0edfc3c 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -939,8 +939,8 @@ class TestDriver: ''' from pycuda.compiler import DynamicModule mod = DynamicModule() - mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) - mod.add_source(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_source_nvcc(test_outer_cu, 
nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_source_nvcc(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) mod.add_stdlib('cudadevrt') mod.link() -- GitLab From 8542e59bf509051fa2f464c822d90b3e4c428954 Mon Sep 17 00:00:00 2001 From: chschnell Date: Mon, 23 Jan 2017 08:05:19 +0100 Subject: [PATCH 14/16] Added documentation --- doc/source/driver.rst | 668 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 666 insertions(+), 2 deletions(-) diff --git a/doc/source/driver.rst b/doc/source/driver.rst index 296782da..a4f171ab 100644 --- a/doc/source/driver.rst +++ b/doc/source/driver.rst @@ -513,6 +513,36 @@ Constants .. attribute:: PREFER_PTX .. attribute:: PREFER_BINARY +.. class:: jit_input_type + + Supported input data types to CUDA's JIT linker. + The attribute names and values of this class correspond to CUDA's + `enum CUjitInputType`. + + CUDA 3.5 and above. + + .. versionadded:: 2017.? + + .. attribute:: CUBIN + + Data contains compiled device-class-specific device code. + + .. attribute:: PTX + + Data contains PTX (Parallel Thread eXecution) source code. + + .. attribute:: FATBINARY + + Data contains a bundle of multiple cubins and/or PTX of some device code. + + .. attribute:: OBJECT + + Data contains host object with embedded device code. + + .. attribute:: LIBRARY + + Data contains an archive of host objects with embedded device code. + .. class:: host_alloc_flags Flags to be used to allocate :ref:`pagelocked_memory`. @@ -558,6 +588,14 @@ Constants .. attribute:: LAZY_ENABLE_PEER_ACCESS + .. attribute:: DEV_RUNTIME_SYNC_DEPTH + + CUDA 3.5 and above. + + .. attribute:: DEV_RUNTIME_PENDING_LAUNCH_COUNT + + CUDA 3.5 and above. + Graphics-related constants ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1993,6 +2031,290 @@ CUDA 4.0 and newer. Just-in-time Compilation ======================== +PyCUDA provides several tools to build a :class:`Module` from CUDA C++ +sources at run-time. A Module encapsulates binary GPU code and acts as an +interface to function and data pointers in device memory when it is executed +on a device. + +For a typical case with a single CUDA C++ source, see: + +- :class:`~pycuda.compiler.SourceModule` - A Module created using NVIDIA's + command line compiler NVCC, compiled and linked against CUDA's static + runtime. Unless you need any of the other features described below, this + should be your choice. +- :class:`~pycuda.compiler.DynamicSourceModule` - A Module created using NVCC + or NVRTC and CUDA's Just-In-Time (JIT) linker. Facilitates linking against + CUDA's dynamic runtime, which is required for dynamic parallelism / + recursive kernels. + +and when you need to compile from multiple input sources, see: + +- :class:`~pycuda.compiler.DynamicModule` - A Module created from an arbitrary + mix of CUDA C++ sources (compiled with either NVCC or NVRTC), PTX sources + and CUDA libraries with CUDA's Just-In-Time (JIT) linker. + +All classes exhibit the same public interface as :class:`Module`, but do not +inherit from it. In order to compile a CUDA C++ source into a plain string +without building a Module, see: + +- :func:`compiler.compile() ` for NVCC, returns + CUBIN binary code +- :meth:`NvrtcCompiler.compile` for NVRTC, returns PTX source code + +NVRTC +----- + +NVIDIA's Run-Time-Compiler +`NVRTC `_ +is a light-weight alternative to their Command-Line-Compiler +`NVCC `_, +some of the differences include: + +- NVRTC is a pure CUDA C++ source to PTX (Parallel Thread eXecution) code + compiler and thus covers a lot smaller scope than NVCC. 
+- NVRTC is an API, whereas NVCC is an executable. +- NVRTC supports only a `small subset + `_ + of NVCC's `compiler options + `_. +- NVRTC does not support certain preprocessor directives and for example + fails to parse some STL header files, but just like in NVCC, the + ``printf()`` function is directly available in CUDA sources. +- NVRTC requires extra steps to access functions in compiled CUDA + sources from the host. The names of these functions must be declared + to NVRTC before compiling, and after compilation they can only be + looked up in the Module by using their C++ mangled names. + +PyCUDA already reduces the computational overhead involved with NVCC by +maintaining a file-system based cache, which means that it invokes NVCC +only once for each unique combination of CUDA source and compiler setup. +For applications that generate volatile CUDA C++ sources dynamically at +run-time and thereby cause too many cache misses in PyCUDA's NVCC +cache, using NVRTC instead of NVCC should be considered if CUDA compile +time is a problem. + +See also: + + :class:`~pycuda.compiler.DynamicSourceModule`, + :meth:`DynamicModule.add_source_nvrtc() ` and + :class:`NvrtcCompiler` + +Dynamic Runtime +--------------- + +Be aware that there is an overall negative performance impact caused by +simply linking to CUDA's dynamic device runtime library *cudadevrt*. + +.. warning:: + + From CUDA's `Dynamic Parallelism Programming Guide + `_ + (CUDA 8.0): + + *C.4.2.2. Dynamic-parallelism-enabled Kernel Overhead* + + System software which is active when controlling dynamic launches + may impose an overhead on any kernel which is running at the time, + whether or not it invokes kernel launches of its own. This + overhead arises from the device runtime's execution tracking and + management software and may result in decreased performance for + e.g., library calls when made from the device compared to from the + host side. This overhead is, in general, incurred for applications + that link against the device runtime library. + +It is therefore recommended to use *cudadevrt* only when needed. + +.. class:: JitLinker([link_options [, message_handler [, log_verbose]]]) + + .. versionadded:: 2017.? + + Create an object that provides access to CUDA's Just-In-Time (JIT) linker. + + :param link_options: + JIT linker options + :type link_options: + list(jit_option) or None + :param message_handler: + Optional callback on success or failure (see below) + :type message_handler: + callable or None + :param bool log_verbose: + If *True*, instruct linker to create detailed log output. + Default: *False* + + The *message_handler* callback's Python function prototype is + + .. code-block:: python + + def message_handler(bool succeeded, str info_str, str error_str) + + where *succeeded* informs on success (*True*) or failure (*False*) of the + linker invocation, *info_str* contains informational linker log output and + *error_str* any error output. Note that in case of a linker error an + Exception is raised containing *error_str* after *message_handler* was + called. + + CUDA 3.5 and above. + + .. method:: add_data(data, input_type, name=None) + + Add *data* with type *input_type* to this linker session.
+ + :param data: + A string (Python 2) or bytes (Python 3) object containing the + data to add + :type data: + str or bytes + :param input_type: + JIT linker input type of *data* + :type input_type: + jit_input_type + :param name: + Specifies to the linker under which name to refer to *data* in + its warning and error messages. If *None*, defaults to ``"unknown"`` + :type name: + str or None + + .. method:: add_file(filename, input_type) + + Add content of file *filename* with type *input_type* to this linker + session. + + :param str filename: + The name of the file to add + :param input_type: + JIT linker input type of the file's content + :type input_type: + jit_input_type + + .. method:: add_source_nvrtc(source, compiler, name=None,\ + options=None, exports=None, mangled_exports=None) + + Compile CUDA source code *source* using NVRTC *compiler* with options + *options*, then add the resulting PTX code to this linker + session. + + :param source: + CUDA C++ source code + :type source: + str + :param compiler: + The NVRTC compiler to use to compile *source* + :type compiler: + NvrtcCompiler + :param name: + Specifies to the compiler and linker under which name to refer to + *source* in their warning and error messages. If *None*, defaults + to ``"kernel.ptx"`` + :type name: + str or None + :param options: + List of NVRTC compile options + :type options: + list(str) or None + :param exports: + Function names in *source* that the caller wants to be exported + :type exports: + list(str) or None + :param mangled_exports: + Output variable, receives pairs of function names from *exports* + and their mangled counterparts + :type mangled_exports: + dict(str, str) or None + :see: + :meth:`NvrtcCompiler.compile` + + CUDA 8.0 and above. + + .. method:: link_module() + + Closes this linker session and returns the new :class:`Module` object. + + :return: + the newly created :class:`Module` instance + +.. class:: NvrtcCompiler([options[, include_dirs [, arch [, headers]]]]) + + .. versionadded:: 2017.? + + Create an object that provides access to NVIDIA's Run-Time-Compiler (NVRTC). + + Configure the compiler to add the CUDA SDK include path, the PyCUDA + include path and all directory names in *include_dirs* to its include + search paths. The global compiler options *options* provided in + this constructor can later be extended per CUDA source, see + :meth:`compile`. + + Note that NVRTC `supports + `_ + only a small subset of `NVCC's + `_ + compiler options. + + :param options: + List of NVRTC compile options to use with this compiler instance. + :type options: + list(str) or None + :param include_dirs: + List of extra directory names to use when processing ``#include`` + preprocessor directives in CUDA sources. + :type include_dirs: + list(str) or None + :param arch: + String of the form ``compute_`` where `` and + `` are the two digits of a compute capability + pair `(, )`. If *None*, defaults to the current + context's device's compute capability. + :type arch: + str or None + :param headers: + A `dict` that maps virtual header file names to their respective + content. These extra in-memory headers are made visible to and can + be included in CUDA sources processed by this compiler just like + regular include files by using the ``#include`` preprocessor + directive. + :type headers: + dict(str, str) or None + + .. 
method:: compile(source, name=None, options=None, exports=None,\ + mangled_exports=None) + + Compile CUDA source code *source* with NVRTC using the compile options + specified earlier in the constructor combined with *options* + and return the resulting PTX code to the caller. Do not upload the + code to the GPU. + + Declare all function names in *exports* to the compiler, and after + successfull compilation, insert each pair of exported function name + in *exports* and its C++ mangled function name into *mangled_exports*. + + :param str source: + CUDA C++ source code + :param name: + Specifies to the compiler under which name to refer to *source* + in its warning and error messages. If *None*, defaults to + ``"kernel.ptx"`` + :type name: + str or None + :param options: + List of additional NVRTC compile options, combine with those + supplied in the constructor + :type options: + list(str) or None + :param exports: + Function names in *source* that the caller wants to be exported + :type exports: + list(str) or None + :param mangled_exports: + Output variable, receives pairs of function names from *exports* + and their mangled counterparts + :type mangled_exports: + dict(str, str) or None + :return: + the compiled PTX code string + + CUDA 8.0 and above. + .. module:: pycuda.compiler .. data:: DEFAULT_NVCC_FLAGS @@ -2042,11 +2364,353 @@ Just-in-time Compilation *Change note:* :class:`SourceModule` was moved from :mod:`pycuda.driver` to :mod:`pycuda.compiler` in version 0.93. -.. function:: compile(source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, +.. function:: compile(source, nvcc="nvcc", options=None, keep=False,\ + no_extern_c=False, arch=None, code=None, cache_dir=None,\ include_dirs=[]) Perform the same compilation as the corresponding :class:`SourceModule` constructor, but only return resulting *cubin* file as a string. In particular, do not upload the code to the GPU. + +.. class:: DynamicSourceModule(source, nvcc=None, options=None, keep=False,\ + no_extern_c=False, arch=None, code=None, cache_dir=None,\ + include_dirs=None, use_cudadevrt=False, cuda_libdir=None,\ + exports=None) + + .. versionadded:: 2017.? + + Create a :class:`~pycuda.driver.Module` from CUDA source code *source* + using either NVCC or NVRTC with optional support for the dynamic CUDA + runtime `cudadevrt` (needed for recursive launches/dynamic parallelism). + + :param source: + CUDA C++ source code + :type source: + str + :param nvcc: + File system path of the NVCC executable. If *None*, defaults to ``nvcc`` + :type nvcc: + str or None + :param options: + List of NVCC or NVRTC compile options + :type options: + list(str) or None + :param keep: + If *True*, keep compiler output and print a line indicating its + location in the file system for debugging purposes. + Default: *False* (NVCC only) + :type keep: + bool + :param no_extern_c: + If *False*, wrap the given source code in ``extern "C" { ... }`` to + prevent C++ name mangling. Default: *False* (NVCC only) + :type no_extern_c: + bool + :param arch: + The name of the class of NVIDIA virtual GPU architecture for which to + compile the CUDA C++ source. If *None*, defaults to the current + context's device's compute capability + :type arch: + str or None + :param code: + The name of the NVIDIA GPU to assemble and optimize PTX code for. If + *None*, it leave unspecified (NVCC only) + :type code: + str or None + :param cache_dir: + The directory used by NVCC for compiler caching. 
If *None* (default), + then `cache_dir` is taken to be :envvar:`PYCUDA_CACHE_DIR` if set or a + sensible per-user default. If *False*, caching is disabled (NVCC only) + :type cache_dir: + str or None + :param include_dirs: + List of extra directory names to use when processing ``#include`` + preprocessor directives in the CUDA source. + :type include_dirs: + list(str) or None + :param use_cudadevrt: + If *True*, link to the dynamic CUDA runtime, meaning: instruct the + compiler to generate relocatable PTX device code from *source* that + can be linked with other relocatable device code, and statically link + standard library `cudadevrt` to this module. Default: *False* (link + static runtime) + :type use_cudadevrt: + bool + :param cuda_libdir: + The absolute path in the local file system of the CUDA SDK 64-Bit + library directory + :type cuda_libdir: + str or None + :param exports: + If not *None*, use NVRTC instead of NVCC to compile *source*. + This argument contains the list of function names in *source* that + the caller later wants to access using :meth:`get_function()` + :type exports: + list(str) or None + + This class exhibits the same public interface as + :class:`~pycuda.driver.Module`, but does not inherit from it. + + CUDA 3.5 and above, 8.0 and above when using NVRTC. + + .. method:: get_function(name) + + :see: + :meth:`Module.get_function() ` + + .. method:: get_global(name) + + :see: + :meth:`Module.get_global() ` + + .. method:: get_texref(name) + + :see: + :meth:`Module.get_texref() ` + + .. method:: get_surfref(name) + + :see: + :meth:`Module.get_surfref() ` + +.. class:: DynamicModule(nvcc=None, keep=False, no_extern_c=False,\ + arch=None, code=None, cache_dir=None, include_dirs=None,\ + link_options=None, message_handler=None, log_verbose=False,\ + cuda_libdir=None) + + .. versionadded:: 2017.? + + Build a :class:`~pycuda.driver.Module` from an arbitrary mix of CUDA C++ + sources (compiled with either NVCC or NVRTC), PTX sources and CUDA + libraries with CUDA's Just-In-Time (JIT) linker. + + General usage pattern for this class: + + 1. Create a module builder using constructor :meth:`DynamicModule` + 2. Use the ``add_...()`` methods as often as needed and in any order: + + - :meth:`add_source_nvcc(source) ` - to compile *source* with NVCC and add result + - :meth:`add_source_nvrtc(source) ` - to compile *source* with NVRTC and add result + - :meth:`add_data(data) ` - to add in-memory data to the module + - :meth:`add_file(filename) ` - to add files to the module + - :meth:`add_stdlib(libanem) ` - to add SDK libraries to the module + + 3. Link and upload module code to the GPU by calling :meth:`link` once + 4. Access module's kernel functions on the GPU using :meth:`get_function` + + :param nvcc: + File system path of the NVCC executable. If *None*, defaults to ``nvcc`` + :type nvcc: + str or None + :param keep: + If *True*, keep compiler outputs and print a line indicating their + location in the file system for debugging purposes. + Default: *False* (NVCC only) + :type keep: + bool + :param no_extern_c: + If *False*, wrap the given source codes in ``extern "C" { ... }`` to + prevent C++ name mangling. Default: *False* (NVCC only) + :type no_extern_c: + bool + :param arch: + The name of the class of NVIDIA virtual GPU architecture for which to + compile the CUDA C++ sources. If *None*, defaults to the current + context's device's compute capability + :type arch: + str or None + :param code: + The name of the NVIDIA GPU to assemble and optimize PTX code for. 
If + *None*, it leave unspecified (NVCC only) + :type code: + str or None + :param cache_dir: + The directory used by NVCC for compiler caching. If *None* (default), + then `cache_dir` is taken to be :envvar:`PYCUDA_CACHE_DIR` if set or a + sensible per-user default. If *False*, caching is disabled (NVCC only) + :type cache_dir: + str or None + :param include_dirs: + List of extra directory names to use when processing ``#include`` + preprocessor directives in CUDA sources. + :type include_dirs: + list(str) or None + :param link_options: + Options for the JIT linker :class:`~pycuda.driver.JitLinker` + :type link_options: + list(jit_option) or None + :param message_handler: + Optional callback on success or failure + :type message_handler: + callable or None + :param bool log_verbose: + If *True*, instruct linker to create detailed log output. + Default: *False* + :param cuda_libdir: + The absolute path in the local file system of the CUDA SDK 64-Bit + library directory + :type cuda_libdir: + str or None + + After :meth:`link` has been called, objects of this class exhibit + the same public interface as :class:`~pycuda.driver.Module`, but do not + inherit from it. + + CUDA 3.5 and above. + + .. method:: add_source_nvcc(source, options=None, name=None) + + Compile CUDA source code *source* with NVIDIA's Command-Line-Compiler + NVCC using compiler options *options* and add the resulting + PTX code to the linker session of this module. + + :param str source: + CUDA C++ source code + :param options: + List of NVCC compiler options + :type options: + list(str) or None + :param name: + Specifies to the compiler and linker under which name to refer to + *source* in their warning and error messages. If *None*, defaults + to ``"kernel.ptx"`` + :type name: + str or None + :return: + *self* + + .. method:: add_source_nvrtc(source, options=None,\ + name=None, exports=None, compiler=None) + + Compile CUDA source code *source* with NVIDIA's Run-Time-Compiler + NVRTC using compiler options *options* and add the resulting + PTX code to the linker session of this module. + + NOTE: self.code is not supported with NVRTC and ignored here + + :param str source: + CUDA C++ source code + :param options: + List of NVRTC compile options + :type options: + list(str) or None + :param name: + Specifies to the compiler and linker under which name to refer to + *source* in their warning and error messages. If *None*, defaults + to ``"kernel.ptx"`` + :type name: + str or None + :param exports: + Function names in *source* that the caller wants to be exported + :type exports: + list(str) or None + :param compiler: + The NVRTC compiler to use to compile *source*. If *None*, a + default compiler instance is created and used + :type compiler: + NvrtcCompiler or None + :return: + *self* + :see: + :meth:`JitLinker.add_source_nvrtc() ` + + CUDA 8.0 and above. + + .. method:: add_data(data, input_type, name=None) + + Add the binary data in *data* to the linker session of this module. + + :param data: + A string (Python 2) or bytes (Python 3) object containing the data to add + :type data: + str or bytes + :param input_type: + JIT linker input type of *data* + :type input_type: + jit_input_type + :param name: + Specifies to the linker under which name to refer to *data* in + its warning and error messages. If *None*, defaults + to ``"kernel.ptx"`` + :type name: + str or None + :return: + *self* + :see: + :meth:`JitLinker.add_data() ` + + .. 
method:: add_file(filename, input_type) + + Add the content of file *filename* to the linker session of this + module. + + :param str filename: + The name of the file to add + :param input_type: + JIT linker input type of the file's content + :type input_type: + jit_input_type + :return: + *self* + :see: + :meth:`JitLinker.add_file() ` + + .. method:: add_stdlib(libname) + + Add the standard library *libname* to the linker session of this + module. + + A standard library is part of the CUDA SDK. If this method fails + to auto-locate the correct CUDA SDK installation path on your system, + use argument *cuda_libdir* in the constructor to set this path + manually to the 64-Bit library directory of your local CUDA SDK + installation. + + :param str libname: + specifies the shortened library file name which gets expanded + internally to match the correct library file name for the local + system, for example `cudadevrt` is the shortened form of + `cudadevrt.lib` on Windows and `libcudadevrt.a` on Linux. + :return: + *self* + + .. method:: link() + + Complete the linker session and create the :class:`~pycuda.driver.Module`. + + After calling this method, calling the *add_...* methods of this + instance is no longer allowed, whereas accessing module functions + through this instance using :meth:`get_function` is only + allowed after calling this method. + + :return: + *self* + + .. method:: get_function(name) + + This method is not defined before calling :meth:`link`. + + :see: + :meth:`Module.get_function() ` + + .. method:: get_global(name) + + This method is not defined before calling :meth:`link`. + + :see: + :meth:`Module.get_global() ` + + .. method:: get_texref(name) + + This method is not defined before calling :meth:`link`. + + :see: + :meth:`Module.get_texref() ` + + .. method:: get_surfref(name) + + This method is not defined before calling :meth:`link`. + + :see: + :meth:`Module.get_surfref() ` -- GitLab From 2d823d589bf9c7a006e95cecf49da17fdd4cfd0e Mon Sep 17 00:00:00 2001 From: chschnell Date: Tue, 24 Jan 2017 05:10:19 +0100 Subject: [PATCH 15/16] Test --- doc/source/driver.rst | 8 +++----- src/wrapper/wrap_cudadrv.cpp | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/doc/source/driver.rst b/doc/source/driver.rst index a4f171ab..415a1272 100644 --- a/doc/source/driver.rst +++ b/doc/source/driver.rst @@ -2415,7 +2415,7 @@ It is therefore recommended to use *cudadevrt* only when needed. str or None :param code: The name of the NVIDIA GPU to assemble and optimize PTX code for. If - *None*, it leave unspecified (NVCC only) + *None*, leave it unspecified (NVCC only) :type code: str or None :param cache_dir: @@ -2522,7 +2522,7 @@ It is therefore recommended to use *cudadevrt* only when needed. str or None :param code: The name of the NVIDIA GPU to assemble and optimize PTX code for. If - *None*, it leave unspecified (NVCC only) + *None*, leave it unspecified (NVCC only) :type code: str or None :param cache_dir: @@ -2587,8 +2587,6 @@ It is therefore recommended to use *cudadevrt* only when needed. NVRTC using compiler options *options* and add the resulting PTX code to the linker session of this module. - NOTE: self.code is not supported with NVRTC and ignored here - :param str source: CUDA C++ source code :param options: @@ -2632,7 +2630,7 @@ It is therefore recommended to use *cudadevrt* only when needed. :param name: Specifies to the linker under which name to refer to *data* in its warning and error messages. 
If *None*, defaults - to ``"kernel.ptx"`` + to ``"unknown"`` :type name: str or None :return: diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 988d88b9..c97f4042 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -8,9 +8,9 @@ #include "wrap_helpers.hpp" #include -/* -#DEFINE USE_JIT -*/ + +#define USE_JIT + #if CUDAPP_CUDA_VERSION < 1010 -- GitLab From 21a21f97d884651c688c5c529d70258f5455c5fe Mon Sep 17 00:00:00 2001 From: chschnell Date: Tue, 24 Jan 2017 05:51:01 +0100 Subject: [PATCH 16/16] Test --- pycuda/compiler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 2ebf178a..646930ad 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -291,6 +291,7 @@ class SourceModule(CudaModule): self._bind_module() +''' class DynamicModule(CudaModule): #def __init__(self, nvcc='nvcc', keep=False, no_extern_c=False, arch=None, # code=None, cache_dir=None, include_dirs=[], link_options=None, @@ -320,7 +321,7 @@ class DynamicModule(CudaModule): self.module = None def _locate_cuda_libdir(self): - ''' + '' ' Locate the "standard" CUDA SDK library directory in the local file system. Supports 64-Bit Windows, Linux and Mac OS X. In case the caller supplied cuda_libdir in the constructor @@ -344,7 +345,7 @@ class DynamicModule(CudaModule): https://cmake.org/cmake/help/v3.0/module/FindCUDA.html https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake - Verify all Linux code paths somehow - ''' + '' ' from os.path import isfile, join from platform import system as platform_system system = platform_system() @@ -462,3 +463,4 @@ class DynamicSourceModule(DynamicModule): if use_cudadevrt: self.add_stdlib('cudadevrt') self.link() +''' -- GitLab
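For reference, a minimal usage sketch of the NVRTC path introduced by this series. It is not part of the patches themselves; it assumes the experimental JIT/NVRTC wrapper code is built and enabled (USE_JIT defined, CUDA 8.0 or newer, compute capability 3.5 or higher) and that the DynamicModule class temporarily commented out in patch 16 is re-enabled. The `scale` kernel and array sizes are illustrative only:

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
    import pycuda.driver as drv
    from pycuda.compiler import DynamicModule

    # Illustrative kernel, not taken from the patch series.
    source = """
    __global__ void scale(float *x, float a)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        x[i] *= a;
    }
    """

    mod = DynamicModule()
    # Passing exports= selects the NVRTC code path; the mangled names are
    # recorded internally so get_function() can be called with the plain name.
    mod.add_source_nvrtc(source, exports=['scale'])
    mod.link()

    scale = mod.get_function('scale')
    x = np.arange(256, dtype=np.float32)
    x_gpu = drv.to_device(x)
    scale(x_gpu, np.float32(2.0), block=(256, 1, 1), grid=(1, 1))
    assert np.allclose(drv.from_device_like(x_gpu, x), 2.0 * x)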