From 99f864919e273e700f6cef1133319e5fbd3a0f74 Mon Sep 17 00:00:00 2001 From: Lurch Date: Tue, 10 Jan 2017 17:51:34 +0100 Subject: [PATCH 01/15] Added Python class pycuda.compiler.JitLinkModule, C++ class Linker and Boost.Python wrappers for class Linker and enum CUjitInputType. --- pycuda/compiler.py | 54 +++++++++++++ src/wrapper/wrap_cudadrv.cpp | 143 +++++++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index c0b5ba63..fe0260f0 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -287,3 +287,57 @@ class SourceModule(object): def get_function(self, name): return self.module.get_function(name) + +class JitLinkModule(object): + def __init__(self, nvcc='nvcc', options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[], message_handler=None, log_verbose=False): + self._check_arch(arch) + self.nvcc = nvcc + self.keep = keep + self.no_extern_c = no_extern_c + self.arch = arch + self.code = code + self.cache_dir = cache_dir + self.include_dirs = include_dirs + self.module = None + from pycuda.driver import Linker + self.linker = Linker(message_handler, options, log_verbose) + + def add_source(self, source, nvcc_options=None, name='unknown'): + cubin = compile(source, self.nvcc, nvcc_options, self.keep, + self.no_extern_c, self.arch, self.code, self.cache_dir, + self.include_dirs) + from pycuda.driver import jit_input_type + self.linker.add_data(cubin, jit_input_type.PTX, name=name) + + def add_data(self, data, cu_jit_input_type, name='unknown'): + self.linker.add_data(data, cu_jit_input_type, name) + + def add_file(self, filename, cu_jit_input_type): + self.linker.add_file(filename, cu_jit_input_type) + + def link(self): + self.module = self.linker.link_module() + self.linker = None + + self.get_global = self.module.get_global + self.get_texref = self.module.get_texref + if hasattr(self.module, "get_surfref"): + self.get_surfref = 
self.module.get_surfref + + def _check_arch(self, arch): + if arch is None: + return + try: + from pycuda.driver import Context + capability = Context.get_device().compute_capability() + if tuple(map(int, tuple(arch.split("_")[1]))) > capability: + from warnings import warn + warn("trying to compile for a compute capability " + "higher than selected GPU") + except: + pass + + def get_function(self, name): + return self.module.get_function(name) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 72175979..4f126065 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -410,6 +410,128 @@ namespace // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 + class Linker : public boost::noncopyable + { + private: + py::object m_message_handler; + CUlinkState m_link_state; + bool m_log_verbose; + std::vector m_options; + std::vector m_values; + char m_info_buf[32768]; + char m_error_buf[32768]; + + void close() { + if (m_link_state != NULL) { + cuLinkDestroy(m_link_state); + m_link_state = NULL; + } + } + + void add_option(CUjit_option option, void* value) { + m_options.push_back(option); + m_values.push_back(value); + } + + void check_cu_result(const char* cu_function_name, CUresult cu_result) const { + if (cu_result != CUDA_SUCCESS) { + call_message_handler(cu_result); + throw pycuda::error(cu_function_name, cu_result, error_str().c_str()); + } + } + + void call_message_handler(CUresult cu_result) const { + if (m_message_handler != py::object()) { + m_message_handler(cu_result == CUDA_SUCCESS, info_str(), error_str()); + } + } + + const std::string info_str() const { + return std::string(m_info_buf, size_t(m_values[1])); + } + + const std::string error_str() const { + return std::string(m_error_buf, size_t(m_values[3])); + } + + public: + Linker(py::object message_handler = py::object(), + py::object py_options = py::object(), + py::object py_log_verbose = py::object(false)) + : m_message_handler(message_handler), + 
m_link_state(), + m_log_verbose(py::extract(py_log_verbose)) + { + add_option(CU_JIT_INFO_LOG_BUFFER, m_info_buf); + add_option(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, (void*) sizeof(m_info_buf)); + add_option(CU_JIT_ERROR_LOG_BUFFER, m_error_buf); + add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, (void*) sizeof(m_error_buf)); + add_option(CU_JIT_LOG_VERBOSE, (void*) (m_log_verbose? 1ull : 0ull)); + + if (py_options.ptr() != Py_None) { + PYTHON_FOREACH(key_value, py_options) { + add_option( + py::extract(key_value[0]), + (void*) py::extract(key_value[1])()); + } + } + + const CUresult cu_result = cuLinkCreate( + (unsigned int) m_options.size(), + const_cast(&*m_options.begin()), + const_cast(&*m_values.begin()), + &m_link_state); + check_cu_result("cuLinkCreate", cu_result); + } + + ~Linker() + { + close(); + } + + void add_data(py::object py_data, CUjitInputType input_type, py::str py_name) + { + const char *data_buf; + PYCUDA_BUFFER_SIZE_T data_buf_len; + if (PyObject_AsCharBuffer(py_data.ptr(), &data_buf, &data_buf_len) != 0) { + throw py::error_already_set(); + } + const char* name = (py_name.ptr() != Py_None)? 
py::extract(py_name) : NULL; + const CUresult cu_result = cuLinkAddData(m_link_state, input_type, (void*)data_buf, data_buf_len, name, 0, NULL, NULL); + check_cu_result("cuLinkAddData", cu_result); + } + + void add_file(py::str py_filename, CUjitInputType input_type) + { + const char* filename = py::extract(py_filename); + const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, filename, 0, NULL, NULL); + check_cu_result("cuLinkAddFile", cu_result); + } + + module* link_module() + { + const char* cubin_data = NULL; + size_t cubin_size = 0; + CUresult cu_result = cuLinkComplete(m_link_state, (void**)&cubin_data, &cubin_size); + check_cu_result("cuLinkComplete", cu_result); + + CUmodule cu_module = NULL; + cu_result = cuModuleLoadData(&cu_module, (void*)cubin_data); + check_cu_result("cuModuleLoadData", cu_result); + + call_message_handler(cu_result); + close(); + + return new module(cu_module); + } + }; +#endif + + // }}} + template PyObject *mem_obj_to_long(T const &mo) { @@ -1058,6 +1180,27 @@ BOOST_PYTHON_MODULE(_driver) // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 + py::enum_("jit_input_type") + .value("CUBIN", CU_JIT_INPUT_CUBIN) + .value("PTX", CU_JIT_INPUT_PTX) + .value("FATBINARY", CU_JIT_INPUT_FATBINARY) + .value("OBJECT", CU_JIT_INPUT_OBJECT) + .value("LIBRARY", CU_JIT_INPUT_LIBRARY); + + py::class_ >("Linker") + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def("add_data", &Linker::add_data, (py::arg("data"), py::arg("input_type"), py::arg("name")=py::str("unknown"))) + .def("add_file", &Linker::add_file, (py::arg("filename"), py::arg("input_type"))) + .def("link_module", &Linker::link_module, py::return_value_policy()); +#endif + + // }}} + // {{{ function { typedef function cl; -- GitLab From a69e5398b28bbaddf0bd9b3e7376b0b41b93fb25 Mon Sep 17 00:00:00 2001 From: Lurch Date: Tue, 10 Jan 2017 21:22:38 +0100 Subject: [PATCH 02/15] Minor name changes --- pycuda/compiler.py | 10 +++++----- 1 file changed, 5 insertions(+), 
5 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index fe0260f0..af684923 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -309,13 +309,13 @@ class JitLinkModule(object): self.no_extern_c, self.arch, self.code, self.cache_dir, self.include_dirs) from pycuda.driver import jit_input_type - self.linker.add_data(cubin, jit_input_type.PTX, name=name) + self.linker.add_data(cubin, jit_input_type.PTX, name) - def add_data(self, data, cu_jit_input_type, name='unknown'): - self.linker.add_data(data, cu_jit_input_type, name) + def add_data(self, data, input_type, name='unknown'): + self.linker.add_data(data, input_type, name) - def add_file(self, filename, cu_jit_input_type): - self.linker.add_file(filename, cu_jit_input_type) + def add_file(self, filename, input_type): + self.linker.add_file(filename, input_type) def link(self): self.module = self.linker.link_module() -- GitLab From 7240a3e0c6a6eeba6c1f4931e3d2684deb776462 Mon Sep 17 00:00:00 2001 From: Lurch Date: Tue, 10 Jan 2017 23:37:09 +0100 Subject: [PATCH 03/15] Switched from target "CUBIN" to "PTX" --- pycuda/compiler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index af684923..58a87602 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -305,11 +305,11 @@ class JitLinkModule(object): self.linker = Linker(message_handler, options, log_verbose) def add_source(self, source, nvcc_options=None, name='unknown'): - cubin = compile(source, self.nvcc, nvcc_options, self.keep, + ptx = compile(source, self.nvcc, nvcc_options, self.keep, self.no_extern_c, self.arch, self.code, self.cache_dir, - self.include_dirs) + self.include_dirs, target="ptx") from pycuda.driver import jit_input_type - self.linker.add_data(cubin, jit_input_type.PTX, name) + self.linker.add_data(ptx, jit_input_type.PTX, name) def add_data(self, data, input_type, name='unknown'): self.linker.add_data(data, input_type, name) -- GitLab From 
be87fc2054c5ab164cc085cda426de41b23de61f Mon Sep 17 00:00:00 2001 From: Lurch Date: Wed, 11 Jan 2017 00:07:22 +0100 Subject: [PATCH 04/15] Default compiled source name improved --- pycuda/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 58a87602..484c197d 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -304,7 +304,7 @@ class JitLinkModule(object): from pycuda.driver import Linker self.linker = Linker(message_handler, options, log_verbose) - def add_source(self, source, nvcc_options=None, name='unknown'): + def add_source(self, source, nvcc_options=None, name='kernel.ptx'): ptx = compile(source, self.nvcc, nvcc_options, self.keep, self.no_extern_c, self.arch, self.code, self.cache_dir, self.include_dirs, target="ptx") -- GitLab From bafba75e217a2525dc2d3b0bc7395963724ccb82 Mon Sep 17 00:00:00 2001 From: Lurch Date: Wed, 11 Jan 2017 17:20:09 +0100 Subject: [PATCH 05/15] Refactored class SourceModule, now derived from class JitLinkModule. 
--- pycuda/compiler.py | 53 +++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 38 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 484c197d..ebeeb41a 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -254,40 +254,6 @@ def compile(source, nvcc="nvcc", options=None, keep=False, return compile_plain(source, options, keep, nvcc, cache_dir, target) - -class SourceModule(object): - def __init__(self, source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[]): - self._check_arch(arch) - - cubin = compile(source, nvcc, options, keep, no_extern_c, - arch, code, cache_dir, include_dirs) - - from pycuda.driver import module_from_buffer - self.module = module_from_buffer(cubin) - - self.get_global = self.module.get_global - self.get_texref = self.module.get_texref - if hasattr(self.module, "get_surfref"): - self.get_surfref = self.module.get_surfref - - def _check_arch(self, arch): - if arch is None: - return - try: - from pycuda.driver import Context - capability = Context.get_device().compute_capability() - if tuple(map(int, tuple(arch.split("_")[1]))) > capability: - from warnings import warn - warn("trying to compile for a compute capability " - "higher than selected GPU") - except: - pass - - def get_function(self, name): - return self.module.get_function(name) - class JitLinkModule(object): def __init__(self, nvcc='nvcc', options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, @@ -305,9 +271,10 @@ class JitLinkModule(object): self.linker = Linker(message_handler, options, log_verbose) def add_source(self, source, nvcc_options=None, name='kernel.ptx'): - ptx = compile(source, self.nvcc, nvcc_options, self.keep, - self.no_extern_c, self.arch, self.code, self.cache_dir, - self.include_dirs, target="ptx") + ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, + keep=self.keep, no_extern_c=self.no_extern_c, 
arch=self.arch, + code=self.code, cache_dir=self.cache_dir, + include_dirs=self.include_dirs, target="ptx") from pycuda.driver import jit_input_type self.linker.add_data(ptx, jit_input_type.PTX, name) @@ -320,7 +287,6 @@ class JitLinkModule(object): def link(self): self.module = self.linker.link_module() self.linker = None - self.get_global = self.module.get_global self.get_texref = self.module.get_texref if hasattr(self.module, "get_surfref"): @@ -341,3 +307,14 @@ class JitLinkModule(object): def get_function(self, name): return self.module.get_function(name) + +class SourceModule(JitLinkModule): + def __init__(self, source, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[], message_handler=None, log_verbose=False): + super(SourceModule, self).__init__(nvcc=nvcc, options=None, + keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, + cache_dir=cache_dir, include_dirs=include_dirs, + message_handler=message_handler, log_verbose=log_verbose) + self.add_source(source, nvcc_options=options) + self.link() -- GitLab From f7223616d5e5d7fc2ebf8ad01c656a7d79978e20 Mon Sep 17 00:00:00 2001 From: Lurch Date: Wed, 11 Jan 2017 19:03:15 +0100 Subject: [PATCH 06/15] Added adaption of NVIDIA's "cdpSimplePrint - Simple Print (CUDA Dynamic Parallelism)" sample that demonstrates dynamic parallelism with a recursive kernel --- examples/demo_cdpSimplePrint.py | 131 ++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 examples/demo_cdpSimplePrint.py diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py new file mode 100644 index 00000000..ab78b917 --- /dev/null +++ b/examples/demo_cdpSimplePrint.py @@ -0,0 +1,131 @@ +''' + * demo_cdpSimplePrint.py + * + * Adapted from NVIDIA's "cdpSimplePrint - Simple Print (CUDA Dynamic Parallelism)" sample + * http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-print--cuda-dynamic-parallelism- + * 
http://ecee.colorado.edu/~siewerts/extra/code/example_code_archive/a490dmis_code/CUDA/cuda_work/samples/0_Simple/cdpSimplePrint/cdpSimplePrint.cu + * + * From cdpSimplePrint.cu (not sure if this is Ok with NVIDIA's 38-page EULA though...): + * --------------------------------------------------------------------------- + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * --------------------------------------------------------------------------- +''' + +import sys, os +import pycuda.autoinit +import pycuda.driver as cuda +from pycuda.compiler import JitLinkModule +from pycuda.driver import jit_input_type + +cdpSimplePrint_cu = ''' +#include + +//////////////////////////////////////////////////////////////////////////////// +// Variable on the GPU used to generate unique identifiers of blocks. +//////////////////////////////////////////////////////////////////////////////// +__device__ int g_uids = 0; + +//////////////////////////////////////////////////////////////////////////////// +// Print a simple message to signal the block which is currently executing. 
+//////////////////////////////////////////////////////////////////////////////// +__device__ void print_info( int depth, int thread, int uid, int parent_uid ) +{ + if( threadIdx.x == 0 ) + { + if( depth == 0 ) + printf( "BLOCK %d launched by the host\\n", uid ); + else + { + char buffer[32]; + for( int i = 0 ; i < depth ; ++i ) + { + buffer[3*i+0] = '|'; + buffer[3*i+1] = ' '; + buffer[3*i+2] = ' '; + } + buffer[3*depth] = '\\0'; + printf( "%sBLOCK %d launched by thread %d of block %d\\n", buffer, uid, thread, parent_uid ); + } + } + __syncthreads(); +} + +//////////////////////////////////////////////////////////////////////////////// +// The kernel using CUDA dynamic parallelism. +// +// It generates a unique identifier for each block. Prints the information +// about that block. Finally, if the 'max_depth' has not been reached, the +// block launches new blocks directly from the GPU. +//////////////////////////////////////////////////////////////////////////////// +__global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid ) +{ + // We create a unique ID per block. Thread 0 does that and shares the value with the other threads. + __shared__ int s_uid; + if( threadIdx.x == 0 ) + { + s_uid = atomicAdd( &g_uids, 1 ); + } + __syncthreads(); + + // We print the ID of the block and information about its parent. + print_info( depth, thread, s_uid, parent_uid ); + + // We launch new blocks if we haven't reached the max_depth yet. + if( ++depth >= max_depth ) + { + return; + } + cdp_kernel<<>>( max_depth, depth, threadIdx.x, s_uid ); +} +''' + +def main(argv): + max_depth = 2 + if len(argv) > 1: + if len(argv) == 2 and argv[1].isdigit() and int(argv[1]) >= 1 and int(argv[1]) <= 8: + max_depth = int(argv[1]) + else: + print("Usage: %s \t(where max_depth is a value between 1 and 8)." 
% argv[0]) + sys.exit(0) + + print("starting Simple Print (CUDA Dynamic Parallelism)") + + # TODO: this add_file('cudadevrt.lib') probably works under Windows only, + # not sure about Linux, maybe "/usr/lib/x86_64-linux-gnu/libcudadevrt.a"? + + mod = JitLinkModule() + mod.add_source(cdpSimplePrint_cu, nvcc_options=['-O3', '-rdc=true', '-lcudadevrt']) + mod.add_file(os.environ['CUDA_PATH'] + '\\lib\\x64\\cudadevrt.lib', jit_input_type.LIBRARY) + mod.link() + + cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call + + print("***************************************************************************") + print("The CPU launches 2 blocks of 2 threads each. On the device each thread will") + print("launch 2 blocks of 2 threads each. The GPU we will do that recursively") + print("until it reaches max_depth=%d\n" % max_depth) + print("In total 2") + num_blocks, sum = 2, 2 + for i in range(1, max_depth): + num_blocks *= 4 + print("+%d" % num_blocks) + sum += num_blocks + + print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum-2)) + print("***************************************************************************\n") + + # TODO: cudaDeviceSetLimit() is not available on PyCuda, works anyway on my GeForce GTX 980; maybe add that function? + #cudaDeviceSetLimit( cudaLimitDevRuntimeSyncDepth, max_depth ) + + # Launch the kernel from the CPU. 
+ print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n") + cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1) + +if __name__ == "__main__": + main(sys.argv) -- GitLab From 23132b47c6c84bb5335ed48a2389943d96b19f49 Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 12 Jan 2017 16:53:51 +0100 Subject: [PATCH 07/15] Added 3 unit tests for jit-link in test_jit_link.py --- test/test_jit_link.py | 78 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 test/test_jit_link.py diff --git a/test/test_jit_link.py b/test/test_jit_link.py new file mode 100644 index 00000000..a0728991 --- /dev/null +++ b/test/test_jit_link.py @@ -0,0 +1,78 @@ +import sys +from pycuda.tools import mark_cuda_test + +def have_pycuda(): + try: + import pycuda # noqa + return True + except: + return False + +if have_pycuda(): + import pycuda.driver as drv # noqa + from pycuda.compiler import SourceModule + from pycuda.compiler import JitLinkModule + from pycuda.driver import jit_input_type + +class TestJitLink: + @mark_cuda_test + def test_create(self): + mod = JitLinkModule() + + @mark_cuda_test + def test_static_parallelism(self): + test_cu = '''#include + __global__ void test_kernel() { + printf("Hello world!\\n"); + }''' + + mod = SourceModule(test_cu) + test_kernel = mod.get_function('test_kernel') + test_kernel(grid=(2,1), block=(1,1,1)) + + @mark_cuda_test + def test_dynamic_parallelism(self): + # nvcc error: + # calling a __global__ function("test_kernel_inner") from a + # __global__ function("test_kernel") is only allowed on the + # compute_35 architecture or above + import pycuda.autoinit + compute_capability = pycuda.autoinit.device.compute_capability() + if compute_capability[0] < 3 or (compute_capability[0] == 3 and compute_capability[1] < 5): + raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' 
% + (compute_capability[0], compute_capability[1])) + + import os, os.path + from platform import system + if system() == 'Windows': + cudadevrt = os.path.join(os.environ['CUDA_PATH'], 'lib/x64/cudadevrt.lib') + else: + cudadevrt = '/usr/lib/x86_64-linux-gnu/libcudadevrt.a' # TODO: this is just an untested guess! + if not os.path.isfile(cudadevrt): + raise Exception('Cannot locate library cudadevrt!') + + test_cu = '''#include + __global__ void test_kernel_inner() { + printf(" Hello inner world!\\n"); + } + __global__ void test_kernel() { + printf("Hello outer world!\\n"); + test_kernel_inner<<<2, 1>>>(); + }''' + + mod = JitLinkModule() + mod.add_source(test_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_file(cudadevrt, jit_input_type.LIBRARY) + mod.link() + test_kernel = mod.get_function('test_kernel') + test_kernel(grid=(2,1), block=(1,1,1)) + +if __name__ == "__main__": + # make sure that import failures get reported, instead of skipping the tests. + import pycuda.autoinit # noqa + + if len(sys.argv) > 1: + exec (sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) -- GitLab From 68dcd6ec1cd1b41155284b1cbea471b7427a5430 Mon Sep 17 00:00:00 2001 From: Lurch Date: Thu, 12 Jan 2017 20:43:05 +0100 Subject: [PATCH 08/15] Added method JitLinkModule.add_stdlib() for static linking of CUDA standard libraries like "cudadevrt". --- examples/demo_cdpSimplePrint.py | 5 +---- pycuda/compiler.py | 28 +++++++++++++++++++++++++++- test/test_jit_link.py | 12 ++---------- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py index ab78b917..d00ba750 100644 --- a/examples/demo_cdpSimplePrint.py +++ b/examples/demo_cdpSimplePrint.py @@ -96,12 +96,9 @@ def main(argv): print("starting Simple Print (CUDA Dynamic Parallelism)") - # TODO: this add_file('cudadevrt.lib') probably works under Windows only, - # not sure about Linux, maybe "/usr/lib/x86_64-linux-gnu/libcudadevrt.a"? 
- mod = JitLinkModule() mod.add_source(cdpSimplePrint_cu, nvcc_options=['-O3', '-rdc=true', '-lcudadevrt']) - mod.add_file(os.environ['CUDA_PATH'] + '\\lib\\x64\\cudadevrt.lib', jit_input_type.LIBRARY) + mod.add_stdlib('cudadevrt') mod.link() cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call diff --git a/pycuda/compiler.py b/pycuda/compiler.py index ebeeb41a..6296ca25 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -257,7 +257,8 @@ def compile(source, nvcc="nvcc", options=None, keep=False, class JitLinkModule(object): def __init__(self, nvcc='nvcc', options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], message_handler=None, log_verbose=False): + include_dirs=[], message_handler=None, log_verbose=False, + cudalib_dir=None): self._check_arch(arch) self.nvcc = nvcc self.keep = keep @@ -266,6 +267,7 @@ class JitLinkModule(object): self.code = code self.cache_dir = cache_dir self.include_dirs = include_dirs + self.cudalib_dir = cudalib_dir self.module = None from pycuda.driver import Linker self.linker = Linker(message_handler, options, log_verbose) @@ -284,6 +286,30 @@ class JitLinkModule(object): def add_file(self, filename, input_type): self.linker.add_file(filename, input_type) + def add_stdlib(self, libname): + import os, os.path + from pycuda.driver import jit_input_type + from platform import system + syst = system() + if syst == 'Windows': + if self.cudalib_dir is None: + libdir = os.path.join(os.environ['CUDA_PATH'], 'lib/x64') + else: + libdir = self.cudalib_dir + libpath = os.path.join(libdir, '%s.lib' % libname) + elif syst == 'Linux': + if self.cudalib_dir is None: + libdir = '/usr/lib/x86_64-linux-gnu' + else: + libdir = self.cudalib_dir + libpath = os.path.join(libdir, 'lib%s.a' % libname) + else: + raise Exception('System "%s" currently not supported, use add_file() instead' % syst) + if os.path.isfile(libpath): + self.linker.add_file(libpath, jit_input_type.LIBRARY) + 
else: + raise Exception('Library file "%s" not found, use add_file() instead' % libpath) + def link(self): self.module = self.linker.link_module() self.linker = None diff --git a/test/test_jit_link.py b/test/test_jit_link.py index a0728991..43b54942 100644 --- a/test/test_jit_link.py +++ b/test/test_jit_link.py @@ -42,15 +42,6 @@ class TestJitLink: raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % (compute_capability[0], compute_capability[1])) - import os, os.path - from platform import system - if system() == 'Windows': - cudadevrt = os.path.join(os.environ['CUDA_PATH'], 'lib/x64/cudadevrt.lib') - else: - cudadevrt = '/usr/lib/x86_64-linux-gnu/libcudadevrt.a' # TODO: this is just an untested guess! - if not os.path.isfile(cudadevrt): - raise Exception('Cannot locate library cudadevrt!') - test_cu = '''#include __global__ void test_kernel_inner() { printf(" Hello inner world!\\n"); @@ -62,8 +53,9 @@ class TestJitLink: mod = JitLinkModule() mod.add_source(test_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) - mod.add_file(cudadevrt, jit_input_type.LIBRARY) + mod.add_stdlib('cudadevrt') mod.link() + test_kernel = mod.get_function('test_kernel') test_kernel(grid=(2,1), block=(1,1,1)) -- GitLab From e990a4e67cd84464cc43be72be737202a2f4c47d Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 13 Jan 2017 19:32:49 +0100 Subject: [PATCH 09/15] Refactored SourceModule again, new class hierarchy: + CudaModule + SourceModule + JitLinkModule + DynamicSourceModule Splitted the "old" class SourceModule in two: class CudaModule and the "new" SourceModule. - CudaModule is now the common base class for module loading and provides common methods. All methods here were moved from the old SourceModule. - The "new" SourceModule's interface and system requirements are 100% unchanged, it will work under all previous configuration scenarios. 
- JitLinkModule requires at least CUDA 5.5 and Compute Capabilty 3.5 (that is now guarded in the constructor), it's the swiss-army-knife for non-trivial linker invocations. - DynamicSourceModule is a special case of JitLinkModule, it exposes the same interface as SourceModule but enables dynamic parallelism (it comes with one extra optional argument in the constructor, cudalib_dir). It's meant for the trivial cases where the user has a single source file, as before with SourceModule. So if a PyCuda user only wants to activate dynamic parallelism, all that's required is to replace "SourceModule" with "DynamicSourceModule" given that we're able to locate "cudadevrt" automagically in method _locate_cuda_libdir(), otherwise the caller must provide the CUDA library path manually in constructor argument "cudalib_dir". I do not think this can be reduced any further. Other changes in class JitLinkModule: - Made all add_* methods and the link() method return self - Moved CUDA library path detection logic into method JitLinkModule._locate_cuda_libdir(), gets called only once from constructor --- pycuda/compiler.py | 180 +++++++++++++++++++++++++++++++-------------- 1 file changed, 126 insertions(+), 54 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 6296ca25..09674535 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -254,11 +254,59 @@ def compile(source, nvcc="nvcc", options=None, keep=False, return compile_plain(source, options, keep, nvcc, cache_dir, target) -class JitLinkModule(object): - def __init__(self, nvcc='nvcc', options=None, keep=False, +class CudaModule(object): + def _check_arch(self, arch): + if arch is None: + return + try: + from pycuda.driver import Context + capability = Context.get_device().compute_capability() + if tuple(map(int, tuple(arch.split("_")[1]))) > capability: + from warnings import warn + warn("trying to compile for a compute capability " + "higher than selected GPU") + except: + pass + + def 
_bind_module(self): + self.get_global = self.module.get_global + self.get_texref = self.module.get_texref + if hasattr(self.module, "get_surfref"): + self.get_surfref = self.module.get_surfref + + def get_function(self, name): + return self.module.get_function(name) + +class SourceModule(CudaModule): + def __init__(self, source, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[]): + self._check_arch(arch) + + cubin = compile(source, nvcc, options, keep, no_extern_c, + arch, code, cache_dir, include_dirs) + + from pycuda.driver import module_from_buffer + self.module = module_from_buffer(cubin) + + self._bind_module() + +class JitLinkModule(CudaModule): + # TODO: + # - How do we handle multiple CUDA devices? Currently using + # pycuda.autoinit.device + def __init__(self, nvcc='nvcc', link_options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], message_handler=None, log_verbose=False, - cudalib_dir=None): + cuda_libdir=None): + from pycuda.autoinit import device + compute_capability = device.compute_capability() + if compute_capability < (3,5): + raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % + (compute_capability[0], compute_capability[1])) + else: + from pycuda.driver import Linker + self.linker = Linker(message_handler, link_options, log_verbose) self._check_arch(arch) self.nvcc = nvcc self.keep = keep @@ -267,10 +315,56 @@ class JitLinkModule(object): self.code = code self.cache_dir = cache_dir self.include_dirs = include_dirs - self.cudalib_dir = cudalib_dir self.module = None - from pycuda.driver import Linker - self.linker = Linker(message_handler, options, log_verbose) + self.libdir, self.libptn = self._locate_cuda_libdir(cuda_libdir) + + def _locate_cuda_libdir(self, cuda_libdir=None): + ''' + Locate the CUDA "standard" libraries directory in the local + file system. 
Supports 64-Bit Windows, Linux and Mac OS X. + In case the caller supplied cuda_libdir other than None that + value is returned unchecked, else make a best-effort attempt. + Precedence: + Windows: cuda_libdir > %CUDA_PATH% + Linux: cuda_libdir > $CUDA_ROOT > $LD_LIBRARY_PATH > '/usr/lib/x86_64-linux-gnu' + Returns a pair (libdir, libptn) where libdir is None in case + of failure or a string containing the absolute path of the + directory, and libptn is the %-format pattern to construct + library file names from library names on the local system. + Does not raise an Excpetion in case of failure. + TODO: + - Mac OS X (Darwin) is currently treated like Linux, is that correct? + - Check CMake's FindCUDA module, it might contain some helpful clues in its sources + https://cmake.org/cmake/help/v3.0/module/FindCUDA.html + https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake + - Verify all Linux code paths somehow + ''' + from os.path import isfile, join + from platform import system as platform_system # TODO: Only since Pyhton 2.3., future or is 2.3 fine? + system = platform_system() + libdir, libptn = None, None + if system == 'Windows': + if cuda_libdir is not None: + libdir = cuda_libdir + elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib/x64/cudadevrt.lib')): + libdir = join(os.environ['CUDA_PATH'], 'lib/x64') + libptn = '%s.lib' + elif system == 'Linux' or system == 'Darwin': + if cuda_libdir is not None: + libdir = cuda_libdir + elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): + # TODO: Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries? 
+ libdir = join(os.environ['CUDA_ROOT'], 'lib64') + elif 'LD_LIBRARY_PATH' in os.environ: + # see: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions + for ld_path in os.environ['LD_LIBRARY_PATH'].split(':'): + if isfile(join(ld_path, 'libcudadevrt.a')): + libdir = ld_path + break + if libdir is None and isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): + libdir = '/usr/lib/x86_64-linux-gnu' + libptn = 'lib%s.a' + return libdir, libptn def add_source(self, source, nvcc_options=None, name='kernel.ptx'): ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, @@ -279,68 +373,46 @@ class JitLinkModule(object): include_dirs=self.include_dirs, target="ptx") from pycuda.driver import jit_input_type self.linker.add_data(ptx, jit_input_type.PTX, name) + return self def add_data(self, data, input_type, name='unknown'): self.linker.add_data(data, input_type, name) + return self def add_file(self, filename, input_type): self.linker.add_file(filename, input_type) + return self def add_stdlib(self, libname): - import os, os.path + # TODO: which error class to raise best here? 
+ if self.libdir is None: + raise Exception('Unable to find CUDA installation path, please set CUDA library path manually') + from os.path import isfile, join + libpath = join(self.libdir, self.libptn % libname) + if not isfile(libpath): + raise Exception('CUDA library file "%s" not found' % libpath) from pycuda.driver import jit_input_type - from platform import system - syst = system() - if syst == 'Windows': - if self.cudalib_dir is None: - libdir = os.path.join(os.environ['CUDA_PATH'], 'lib/x64') - else: - libdir = self.cudalib_dir - libpath = os.path.join(libdir, '%s.lib' % libname) - elif syst == 'Linux': - if self.cudalib_dir is None: - libdir = '/usr/lib/x86_64-linux-gnu' - else: - libdir = self.cudalib_dir - libpath = os.path.join(libdir, 'lib%s.a' % libname) - else: - raise Exception('System "%s" currently not supported, use add_file() instead' % syst) - if os.path.isfile(libpath): - self.linker.add_file(libpath, jit_input_type.LIBRARY) - else: - raise Exception('Library file "%s" not found, use add_file() instead' % libpath) + self.linker.add_file(libpath, jit_input_type.LIBRARY) + return self def link(self): self.module = self.linker.link_module() self.linker = None - self.get_global = self.module.get_global - self.get_texref = self.module.get_texref - if hasattr(self.module, "get_surfref"): - self.get_surfref = self.module.get_surfref + self._bind_module() + return self - def _check_arch(self, arch): - if arch is None: - return - try: - from pycuda.driver import Context - capability = Context.get_device().compute_capability() - if tuple(map(int, tuple(arch.split("_")[1]))) > capability: - from warnings import warn - warn("trying to compile for a compute capability " - "higher than selected GPU") - except: - pass - - def get_function(self, name): - return self.module.get_function(name) - -class SourceModule(JitLinkModule): - def __init__(self, source, nvcc="nvcc", options=None, keep=False, +class DynamicSourceModule(JitLinkModule): + def __init__(self, 
source, nvcc="nvcc", options=[], keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], message_handler=None, log_verbose=False): - super(SourceModule, self).__init__(nvcc=nvcc, options=None, - keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, - cache_dir=cache_dir, include_dirs=include_dirs, - message_handler=message_handler, log_verbose=log_verbose) + include_dirs=[], cuda_libdir=None): + super(DynamicSourceModule, self).__init__(nvcc=nvcc, + link_options=None, keep=keep, no_extern_c=no_extern_c, + arch=arch, code=code, cache_dir=cache_dir, + include_dirs=include_dirs, cuda_libdir=cuda_libdir) + if not '-rdc=true' in options: + options.append('-rdc=true') + if not '-lcudadevrt' in options: + options.append('-lcudadevrt') self.add_source(source, nvcc_options=options) + self.add_stdlib('cudadevrt') self.link() -- GitLab From e03e9f74b667acdcfccd7a05abe568b95a12ac53 Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 13 Jan 2017 20:06:44 +0100 Subject: [PATCH 10/15] Changed demo_cdpSimplePrint.py to use DynamicSourceModule. Using Context.set_limit() now. Needed a missing limit enum CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH (in CUDA8/v8.0/include/cuda.h[975], available since 3.5), added CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH and CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT (same) to pycuda.driver.limit. 
--- examples/demo_cdpSimplePrint.py | 14 +++----------- src/wrapper/wrap_cudadrv.cpp | 4 ++++ 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py index d00ba750..d5435987 100644 --- a/examples/demo_cdpSimplePrint.py +++ b/examples/demo_cdpSimplePrint.py @@ -20,8 +20,7 @@ import sys, os import pycuda.autoinit import pycuda.driver as cuda -from pycuda.compiler import JitLinkModule -from pycuda.driver import jit_input_type +from pycuda.compiler import DynamicSourceModule cdpSimplePrint_cu = ''' #include @@ -96,11 +95,7 @@ def main(argv): print("starting Simple Print (CUDA Dynamic Parallelism)") - mod = JitLinkModule() - mod.add_source(cdpSimplePrint_cu, nvcc_options=['-O3', '-rdc=true', '-lcudadevrt']) - mod.add_stdlib('cudadevrt') - mod.link() - + mod = DynamicSourceModule(cdpSimplePrint_cu) cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call print("***************************************************************************") @@ -113,14 +108,11 @@ def main(argv): num_blocks *= 4 print("+%d" % num_blocks) sum += num_blocks - print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum-2)) print("***************************************************************************\n") - # TODO: cudaDeviceSetLimit() is not available on PyCuda, works anyway on my GeForce GTX 980; maybe add that function? - #cudaDeviceSetLimit( cudaLimitDevRuntimeSyncDepth, max_depth ) + pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH, max_depth) - # Launch the kernel from the CPU. 
print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n") cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 4f126065..76f30df4 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -1010,6 +1010,10 @@ BOOST_PYTHON_MODULE(_driver) .value("PRINTF_FIFO_SIZE", CU_LIMIT_PRINTF_FIFO_SIZE) #if CUDAPP_CUDA_VERSION >= 3020 .value("MALLOC_HEAP_SIZE", CU_LIMIT_MALLOC_HEAP_SIZE) +#endif +#if CUDAPP_CUDA_VERSION >= 3050 + .value("DEV_RUNTIME_SYNC_DEPTH", CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH) + .value("DEV_RUNTIME_PENDING_LAUNCH_COUNT", CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT) #endif ; #endif -- GitLab From d0666aed20c8da41e83d83a4c0f7859687a662bb Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 13 Jan 2017 20:48:05 +0100 Subject: [PATCH 11/15] Changed TestDriver.test_recursive_launch() in test/test_driver.py to use pycuda.compiler.DynamicSourceModule, removed xfail marker Added a test case for pycuda.compiler.JitLinkModule: compiles and links two .cu files. Removed test/test_jit_link.py, all further tests in test/test_driver.py from now on Removed bad "import pycuda.autoinit" (how did this even get there?) 
--- test/test_driver.py | 33 ++++++++++++++++++-- test/test_jit_link.py | 70 ------------------------------------------- 2 files changed, 30 insertions(+), 73 deletions(-) delete mode 100644 test/test_jit_link.py diff --git a/test/test_driver.py b/test/test_driver.py index 56d93658..e951134d 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -894,9 +894,8 @@ class TestDriver: drv.memcpy_htod(a_gpu, a) drv.memcpy_htod(b_gpu, b) - mod = SourceModule(cuda_string, - options=['-rdc=true', '-lcudadevrt'], - keep=True) + from pycuda.compiler import DynamicSourceModule + mod = DynamicSourceModule(cuda_string, keep=True) func = mod.get_function("math") func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, @@ -918,6 +917,34 @@ class TestDriver: math(a, b, c, d, e, f) + @mark_cuda_test + def test_jit_link_module(self): + if drv.Context.get_device().compute_capability() < (3, 5): + from pytest import skip + skip("need compute capability 3.5 or higher for dynamic parallelism") + + test_outer_cu = '''#include + __global__ void test_kernel() { + extern __global__ void test_kernel_inner(); + printf("Hello outer world!\\n"); + test_kernel_inner<<<2, 1>>>(); + }''' + + test_inner_cu = '''#include + __global__ void test_kernel_inner() { + printf(" Hello inner world!\\n"); + }''' + + from pycuda.compiler import JitLinkModule + mod = JitLinkModule() + mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_source(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_stdlib('cudadevrt') + mod.link() + + test_kernel = mod.get_function('test_kernel') + test_kernel(grid=(2,1), block=(1,1,1)) + def test_import_pyopencl_before_pycuda(): try: diff --git a/test/test_jit_link.py b/test/test_jit_link.py deleted file mode 100644 index 43b54942..00000000 --- a/test/test_jit_link.py +++ /dev/null @@ -1,70 +0,0 @@ -import sys -from pycuda.tools import mark_cuda_test - -def have_pycuda(): - try: - import pycuda # noqa - return True - except: - return False 
- -if have_pycuda(): - import pycuda.driver as drv # noqa - from pycuda.compiler import SourceModule - from pycuda.compiler import JitLinkModule - from pycuda.driver import jit_input_type - -class TestJitLink: - @mark_cuda_test - def test_create(self): - mod = JitLinkModule() - - @mark_cuda_test - def test_static_parallelism(self): - test_cu = '''#include - __global__ void test_kernel() { - printf("Hello world!\\n"); - }''' - - mod = SourceModule(test_cu) - test_kernel = mod.get_function('test_kernel') - test_kernel(grid=(2,1), block=(1,1,1)) - - @mark_cuda_test - def test_dynamic_parallelism(self): - # nvcc error: - # calling a __global__ function("test_kernel_inner") from a - # __global__ function("test_kernel") is only allowed on the - # compute_35 architecture or above - import pycuda.autoinit - compute_capability = pycuda.autoinit.device.compute_capability() - if compute_capability[0] < 3 or (compute_capability[0] == 3 and compute_capability[1] < 5): - raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % - (compute_capability[0], compute_capability[1])) - - test_cu = '''#include - __global__ void test_kernel_inner() { - printf(" Hello inner world!\\n"); - } - __global__ void test_kernel() { - printf("Hello outer world!\\n"); - test_kernel_inner<<<2, 1>>>(); - }''' - - mod = JitLinkModule() - mod.add_source(test_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) - mod.add_stdlib('cudadevrt') - mod.link() - - test_kernel = mod.get_function('test_kernel') - test_kernel(grid=(2,1), block=(1,1,1)) - -if __name__ == "__main__": - # make sure that import failures get reported, instead of skipping the tests. 
- import pycuda.autoinit # noqa - - if len(sys.argv) > 1: - exec (sys.argv[1]) - else: - from py.test.cmdline import main - main([__file__]) -- GitLab From f6b37048d6412af4b60a0da6bbe4f3ebea834bf2 Mon Sep 17 00:00:00 2001 From: Lurch Date: Fri, 13 Jan 2017 20:50:21 +0100 Subject: [PATCH 12/15] Removed xfail marker from TestDriver.test_recursive_launch() in test/test_driver.py --- test/test_driver.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_driver.py b/test/test_driver.py index e951134d..6e705ff7 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -838,7 +838,6 @@ class TestDriver: drv.memcpy_htod_async(gpu_ary, a_pin, stream) drv.Context.synchronize() - @pytest.mark.xfail @mark_cuda_test # https://github.com/inducer/pycuda/issues/45 def test_recursive_launch(self): -- GitLab From 179bba661a5c5494fc1f4ad1b4a8986077749eb0 Mon Sep 17 00:00:00 2001 From: Lurch Date: Sat, 14 Jan 2017 09:32:24 +0100 Subject: [PATCH 13/15] Fixed some minor TODOs --- pycuda/compiler.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 09674535..9b325410 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -292,15 +292,12 @@ class SourceModule(CudaModule): self._bind_module() class JitLinkModule(CudaModule): - # TODO: - # - How do we handle multiple CUDA devices? Currently using - # pycuda.autoinit.device def __init__(self, nvcc='nvcc', link_options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], message_handler=None, log_verbose=False, cuda_libdir=None): - from pycuda.autoinit import device - compute_capability = device.compute_capability() + from pycuda.driver import Context + compute_capability = Context.get_device().compute_capability() if compute_capability < (3,5): raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' 
% (compute_capability[0], compute_capability[1])) @@ -332,7 +329,11 @@ class JitLinkModule(CudaModule): directory, and libptn is the %-format pattern to construct library file names from library names on the local system. Does not raise an Excpetion in case of failure. + Links + - Post-installation Actions + http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions TODO: + - Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries on Linux? - Mac OS X (Darwin) is currently treated like Linux, is that correct? - Check CMake's FindCUDA module, it might contain some helpful clues in its sources https://cmake.org/cmake/help/v3.0/module/FindCUDA.html @@ -340,7 +341,7 @@ class JitLinkModule(CudaModule): - Verify all Linux code paths somehow ''' from os.path import isfile, join - from platform import system as platform_system # TODO: Only since Pyhton 2.3., future or is 2.3 fine? + from platform import system as platform_system system = platform_system() libdir, libptn = None, None if system == 'Windows': @@ -353,10 +354,8 @@ class JitLinkModule(CudaModule): if cuda_libdir is not None: libdir = cuda_libdir elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): - # TODO: Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries? libdir = join(os.environ['CUDA_ROOT'], 'lib64') elif 'LD_LIBRARY_PATH' in os.environ: - # see: http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions for ld_path in os.environ['LD_LIBRARY_PATH'].split(':'): if isfile(join(ld_path, 'libcudadevrt.a')): libdir = ld_path @@ -384,13 +383,12 @@ class JitLinkModule(CudaModule): return self def add_stdlib(self, libname): - # TODO: which error class to raise best here? 
if self.libdir is None: - raise Exception('Unable to find CUDA installation path, please set CUDA library path manually') + raise RuntimeError('Unable to find CUDA installation path, please set CUDA library path manually') from os.path import isfile, join libpath = join(self.libdir, self.libptn % libname) if not isfile(libpath): - raise Exception('CUDA library file "%s" not found' % libpath) + raise FileNotFoundError('CUDA library file "%s" not found' % libpath) from pycuda.driver import jit_input_type self.linker.add_file(libpath, jit_input_type.LIBRARY) return self @@ -409,6 +407,7 @@ class DynamicSourceModule(JitLinkModule): link_options=None, keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, cache_dir=cache_dir, include_dirs=include_dirs, cuda_libdir=cuda_libdir) + options = options[:] if not '-rdc=true' in options: options.append('-rdc=true') if not '-lcudadevrt' in options: -- GitLab From 1a25b19853c003945e60941d9231ee00672475fd Mon Sep 17 00:00:00 2001 From: Lurch Date: Sat, 14 Jan 2017 10:44:25 +0100 Subject: [PATCH 14/15] Removed the pointer type conversion hacks from C++ Linker class --- src/wrapper/wrap_cudadrv.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 76f30df4..9756b035 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -431,9 +431,10 @@ namespace } } - void add_option(CUjit_option option, void* value) { + template + void add_option(CUjit_option option, T value) { m_options.push_back(option); - m_values.push_back(value); + m_values.push_back(reinterpret_cast(value)); } void check_cu_result(const char* cu_function_name, CUresult cu_result) const { @@ -466,16 +467,16 @@ namespace m_log_verbose(py::extract(py_log_verbose)) { add_option(CU_JIT_INFO_LOG_BUFFER, m_info_buf); - add_option(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, (void*) sizeof(m_info_buf)); + add_option(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, 
sizeof(m_info_buf)); add_option(CU_JIT_ERROR_LOG_BUFFER, m_error_buf); - add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, (void*) sizeof(m_error_buf)); - add_option(CU_JIT_LOG_VERBOSE, (void*) (m_log_verbose? 1ull : 0ull)); + add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, sizeof(m_error_buf)); + add_option(CU_JIT_LOG_VERBOSE, m_log_verbose? 1ull : 0ull); if (py_options.ptr() != Py_None) { PYTHON_FOREACH(key_value, py_options) { add_option( py::extract(key_value[0]), - (void*) py::extract(key_value[1])()); + py::extract(key_value[1])()); } } @@ -499,8 +500,11 @@ namespace if (PyObject_AsCharBuffer(py_data.ptr(), &data_buf, &data_buf_len) != 0) { throw py::error_already_set(); } - const char* name = (py_name.ptr() != Py_None)? py::extract(py_name) : NULL; - const CUresult cu_result = cuLinkAddData(m_link_state, input_type, (void*)data_buf, data_buf_len, name, 0, NULL, NULL); + const char* name = (py_name.ptr() != Py_None)? + py::extract(py_name) : NULL; + const CUresult cu_result = cuLinkAddData(m_link_state, input_type, + static_cast(const_cast(data_buf)), + data_buf_len, name, 0, NULL, NULL); check_cu_result("cuLinkAddData", cu_result); } @@ -513,13 +517,14 @@ namespace module* link_module() { - const char* cubin_data = NULL; + char* cubin_data = NULL; size_t cubin_size = 0; - CUresult cu_result = cuLinkComplete(m_link_state, (void**)&cubin_data, &cubin_size); + CUresult cu_result = cuLinkComplete(m_link_state, + reinterpret_cast(&cubin_data), &cubin_size); check_cu_result("cuLinkComplete", cu_result); CUmodule cu_module = NULL; - cu_result = cuModuleLoadData(&cu_module, (void*)cubin_data); + cu_result = cuModuleLoadData(&cu_module, cubin_data); check_cu_result("cuModuleLoadData", cu_result); call_message_handler(cu_result); -- GitLab From 16fc15a0e479b3a6e7f3cde4ce54d33f6aef6207 Mon Sep 17 00:00:00 2001 From: Lurch Date: Sat, 14 Jan 2017 13:08:32 +0100 Subject: [PATCH 15/15] Moved invocation of _locate_cuda_libdir() from constructor to add_stdlib(), it should be 
only called when actually needed Renamed JitLinkModule to DynamicModule --- pycuda/compiler.py | 55 +++++++++++++++++++++++++----------- src/wrapper/wrap_cudadrv.cpp | 3 +- test/test_driver.py | 4 +-- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 9b325410..d9f25752 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -278,6 +278,10 @@ class CudaModule(object): return self.module.get_function(name) class SourceModule(CudaModule): + ''' + Creates a Module from a single .cu source object linked against the + static CUDA runtime. + ''' def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[]): @@ -291,7 +295,11 @@ class SourceModule(CudaModule): self._bind_module() -class JitLinkModule(CudaModule): +class DynamicModule(CudaModule): + ''' + Creates a Module from multiple .cu source, library file and/or data + objects linked against the static or dynamic CUDA runtime. + ''' def __init__(self, nvcc='nvcc', link_options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], message_handler=None, log_verbose=False, @@ -312,15 +320,17 @@ class JitLinkModule(CudaModule): self.code = code self.cache_dir = cache_dir self.include_dirs = include_dirs + self.cuda_libdir = cuda_libdir + self.libdir, self.libptn = None, None self.module = None - self.libdir, self.libptn = self._locate_cuda_libdir(cuda_libdir) - def _locate_cuda_libdir(self, cuda_libdir=None): + def _locate_cuda_libdir(self): ''' - Locate the CUDA "standard" libraries directory in the local + Locate the "standard" CUDA SDK library directory in the local file system. Supports 64-Bit Windows, Linux and Mac OS X. - In case the caller supplied cuda_libdir other than None that - value is returned unchecked, else make a best-effort attempt. 
+ In case the caller supplied cuda_libdir in the constructor + other than None that value is returned unchecked, else a + best-effort attempt is made. Precedence: Windows: cuda_libdir > %CUDA_PATH% Linux: cuda_libdir > $CUDA_ROOT > $LD_LIBRARY_PATH > '/usr/lib/x86_64-linux-gnu' @@ -328,8 +338,8 @@ class JitLinkModule(CudaModule): of failure or a string containing the absolute path of the directory, and libptn is the %-format pattern to construct library file names from library names on the local system. - Does not raise an Excpetion in case of failure. - Links + Raises a RuntimeError in case of failure. + Links: - Post-installation Actions http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions TODO: @@ -345,14 +355,14 @@ class JitLinkModule(CudaModule): system = platform_system() libdir, libptn = None, None if system == 'Windows': - if cuda_libdir is not None: - libdir = cuda_libdir - elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib/x64/cudadevrt.lib')): - libdir = join(os.environ['CUDA_PATH'], 'lib/x64') + if self.cuda_libdir is not None: + libdir = self.cuda_libdir + elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib\\x64\\cudadevrt.lib')): + libdir = join(os.environ['CUDA_PATH'], 'lib\\x64') libptn = '%s.lib' elif system == 'Linux' or system == 'Darwin': - if cuda_libdir is not None: - libdir = cuda_libdir + if self.cuda_libdir is not None: + libdir = self.cuda_libdir elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): libdir = join(os.environ['CUDA_ROOT'], 'lib64') elif 'LD_LIBRARY_PATH' in os.environ: @@ -363,6 +373,9 @@ class JitLinkModule(CudaModule): if libdir is None and isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): libdir = '/usr/lib/x86_64-linux-gnu' libptn = 'lib%s.a' + if libdir is None: + raise RuntimeError('Unable to locate the CUDA SDK installation ' + 'directory, set CUDA library path manually') return 
libdir, libptn def add_source(self, source, nvcc_options=None, name='kernel.ptx'): @@ -384,11 +397,11 @@ class JitLinkModule(CudaModule): def add_stdlib(self, libname): if self.libdir is None: - raise RuntimeError('Unable to find CUDA installation path, please set CUDA library path manually') + self.libdir, self.libptn = self._locate_cuda_libdir() from os.path import isfile, join libpath = join(self.libdir, self.libptn % libname) if not isfile(libpath): - raise FileNotFoundError('CUDA library file "%s" not found' % libpath) + raise FileNotFoundError('CUDA SDK library file "%s" not found' % libpath) from pycuda.driver import jit_input_type self.linker.add_file(libpath, jit_input_type.LIBRARY) return self @@ -399,7 +412,15 @@ class JitLinkModule(CudaModule): self._bind_module() return self -class DynamicSourceModule(JitLinkModule): +class DynamicSourceModule(DynamicModule): + ''' + Creates a Module from a single .cu source object linked against the + dynamic CUDA runtime. + - compiler generates PTX relocatable device code (rdc) from source that + can be linked with other relocatable device code + - source is linked against the CUDA device runtime library cudadevrt + - library cudadevrt is statically linked into the generated Module + ''' def __init__(self, source, nvcc="nvcc", options=[], keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], cuda_libdir=None): diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 9756b035..7bb23d4a 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -511,7 +511,8 @@ namespace void add_file(py::str py_filename, CUjitInputType input_type) { const char* filename = py::extract(py_filename); - const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, filename, 0, NULL, NULL); + const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, + filename, 0, NULL, NULL); check_cu_result("cuLinkAddFile", cu_result); } diff --git 
a/test/test_driver.py b/test/test_driver.py index 6e705ff7..f88a1d67 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -934,8 +934,8 @@ class TestDriver: printf(" Hello inner world!\\n"); }''' - from pycuda.compiler import JitLinkModule - mod = JitLinkModule() + from pycuda.compiler import DynamicModule + mod = DynamicModule() mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) mod.add_source(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) mod.add_stdlib('cudadevrt') -- GitLab