diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py new file mode 100644 index 0000000000000000000000000000000000000000..d5435987420dec7fa4542b5c11667dfda9d3f9dd --- /dev/null +++ b/examples/demo_cdpSimplePrint.py @@ -0,0 +1,120 @@ +''' + * demo_cdpSimplePrint.py + * + * Adapted from NVIDIA's "cdpSimplePrint - Simple Print (CUDA Dynamic Parallelism)" sample + * http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-print--cuda-dynamic-parallelism- + * http://ecee.colorado.edu/~siewerts/extra/code/example_code_archive/a490dmis_code/CUDA/cuda_work/samples/0_Simple/cdpSimplePrint/cdpSimplePrint.cu + * + * From cdpSimplePrint.cu (not sure if this is Ok with NVIDIA's 38-page EULA though...): + * --------------------------------------------------------------------------- + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * --------------------------------------------------------------------------- +''' + +import sys, os +import pycuda.autoinit +import pycuda.driver as cuda +from pycuda.compiler import DynamicSourceModule + +cdpSimplePrint_cu = ''' +#include <cstdio> + +//////////////////////////////////////////////////////////////////////////////// +// Variable on the GPU used to generate unique identifiers of blocks. +//////////////////////////////////////////////////////////////////////////////// +__device__ int g_uids = 0; + +//////////////////////////////////////////////////////////////////////////////// +// Print a simple message to signal the block which is currently executing. 
+//////////////////////////////////////////////////////////////////////////////// +__device__ void print_info( int depth, int thread, int uid, int parent_uid ) +{ + if( threadIdx.x == 0 ) + { + if( depth == 0 ) + printf( "BLOCK %d launched by the host\\n", uid ); + else + { + char buffer[32]; + for( int i = 0 ; i < depth ; ++i ) + { + buffer[3*i+0] = '|'; + buffer[3*i+1] = ' '; + buffer[3*i+2] = ' '; + } + buffer[3*depth] = '\\0'; + printf( "%sBLOCK %d launched by thread %d of block %d\\n", buffer, uid, thread, parent_uid ); + } + } + __syncthreads(); +} + +//////////////////////////////////////////////////////////////////////////////// +// The kernel using CUDA dynamic parallelism. +// +// It generates a unique identifier for each block. Prints the information +// about that block. Finally, if the 'max_depth' has not been reached, the +// block launches new blocks directly from the GPU. +//////////////////////////////////////////////////////////////////////////////// +__global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid ) +{ + // We create a unique ID per block. Thread 0 does that and shares the value with the other threads. + __shared__ int s_uid; + if( threadIdx.x == 0 ) + { + s_uid = atomicAdd( &g_uids, 1 ); + } + __syncthreads(); + + // We print the ID of the block and information about its parent. + print_info( depth, thread, s_uid, parent_uid ); + + // We launch new blocks if we haven't reached the max_depth yet. + if( ++depth >= max_depth ) + { + return; + } + cdp_kernel<<<gridDim.x, blockDim.x>>>( max_depth, depth, threadIdx.x, s_uid ); +} +''' + +def main(argv): + max_depth = 2 + if len(argv) > 1: + if len(argv) == 2 and argv[1].isdigit() and int(argv[1]) >= 1 and int(argv[1]) <= 8: + max_depth = int(argv[1]) + else: + print("Usage: %s <max_depth>\t(where max_depth is a value between 1 and 8)." 
% argv[0]) + sys.exit(0) + + print("starting Simple Print (CUDA Dynamic Parallelism)") + + mod = DynamicSourceModule(cdpSimplePrint_cu) + cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call + + print("***************************************************************************") + print("The CPU launches 2 blocks of 2 threads each. On the device each thread will") + print("launch 2 blocks of 2 threads each. The GPU we will do that recursively") + print("until it reaches max_depth=%d\n" % max_depth) + print("In total 2") + num_blocks, sum = 2, 2 + for i in range(1, max_depth): + num_blocks *= 4 + print("+%d" % num_blocks) + sum += num_blocks + print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum-2)) + print("***************************************************************************\n") + + pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH, max_depth) + + print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n") + cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1) + +if __name__ == "__main__": + main(sys.argv) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index c0b5ba6368950e186c73ad85f0efd9b9bbbb22a6..d9f25752a3c55bc8562b163ec77e2e8636c6b25e 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -254,24 +254,7 @@ def compile(source, nvcc="nvcc", options=None, keep=False, return compile_plain(source, options, keep, nvcc, cache_dir, target) - -class SourceModule(object): - def __init__(self, source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[]): - self._check_arch(arch) - - cubin = compile(source, nvcc, options, keep, no_extern_c, - arch, code, cache_dir, include_dirs) - - from pycuda.driver import module_from_buffer - self.module = module_from_buffer(cubin) - - self.get_global = self.module.get_global - self.get_texref = self.module.get_texref - if hasattr(self.module, "get_surfref"): - self.get_surfref = self.module.get_surfref - 
+class CudaModule(object): def _check_arch(self, arch): if arch is None: return @@ -285,5 +268,171 @@ class SourceModule(object): except: pass + def _bind_module(self): + self.get_global = self.module.get_global + self.get_texref = self.module.get_texref + if hasattr(self.module, "get_surfref"): + self.get_surfref = self.module.get_surfref + def get_function(self, name): return self.module.get_function(name) + +class SourceModule(CudaModule): + ''' + Creates a Module from a single .cu source object linked against the + static CUDA runtime. + ''' + def __init__(self, source, nvcc="nvcc", options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[]): + self._check_arch(arch) + + cubin = compile(source, nvcc, options, keep, no_extern_c, + arch, code, cache_dir, include_dirs) + + from pycuda.driver import module_from_buffer + self.module = module_from_buffer(cubin) + + self._bind_module() + +class DynamicModule(CudaModule): + ''' + Creates a Module from multiple .cu source, library file and/or data + objects linked against the static or dynamic CUDA runtime. + ''' + def __init__(self, nvcc='nvcc', link_options=None, keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[], message_handler=None, log_verbose=False, + cuda_libdir=None): + from pycuda.driver import Context + compute_capability = Context.get_device().compute_capability() + if compute_capability < (3,5): + raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' 
% + (compute_capability[0], compute_capability[1])) + else: + from pycuda.driver import Linker + self.linker = Linker(message_handler, link_options, log_verbose) + self._check_arch(arch) + self.nvcc = nvcc + self.keep = keep + self.no_extern_c = no_extern_c + self.arch = arch + self.code = code + self.cache_dir = cache_dir + self.include_dirs = include_dirs + self.cuda_libdir = cuda_libdir + self.libdir, self.libptn = None, None + self.module = None + + def _locate_cuda_libdir(self): + ''' + Locate the "standard" CUDA SDK library directory in the local + file system. Supports 64-Bit Windows, Linux and Mac OS X. + In case the caller supplied cuda_libdir in the constructor + other than None that value is returned unchecked, else a + best-effort attempt is made. + Precedence: + Windows: cuda_libdir > %CUDA_PATH% + Linux: cuda_libdir > $CUDA_ROOT > $LD_LIBRARY_PATH > '/usr/lib/x86_64-linux-gnu' + Returns a pair (libdir, libptn) where libdir is None in case + of failure or a string containing the absolute path of the + directory, and libptn is the %-format pattern to construct + library file names from library names on the local system. + Raises a RuntimeError in case of failure. + Links: + - Post-installation Actions + http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions + TODO: + - Is $CUDA_ROOT/lib64 the correct path to assume for 64-Bit CUDA libraries on Linux? + - Mac OS X (Darwin) is currently treated like Linux, is that correct? 
+ - Check CMake's FindCUDA module, it might contain some helpful clues in its sources + https://cmake.org/cmake/help/v3.0/module/FindCUDA.html + https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake + - Verify all Linux code paths somehow + ''' + from os.path import isfile, join + from platform import system as platform_system + system = platform_system() + libdir, libptn = None, None + if system == 'Windows': + if self.cuda_libdir is not None: + libdir = self.cuda_libdir + elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib\\x64\\cudadevrt.lib')): + libdir = join(os.environ['CUDA_PATH'], 'lib\\x64') + libptn = '%s.lib' + elif system == 'Linux' or system == 'Darwin': + if self.cuda_libdir is not None: + libdir = self.cuda_libdir + elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): + libdir = join(os.environ['CUDA_ROOT'], 'lib64') + elif 'LD_LIBRARY_PATH' in os.environ: + for ld_path in os.environ['LD_LIBRARY_PATH'].split(':'): + if isfile(join(ld_path, 'libcudadevrt.a')): + libdir = ld_path + break + if libdir is None and isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): + libdir = '/usr/lib/x86_64-linux-gnu' + libptn = 'lib%s.a' + if libdir is None: + raise RuntimeError('Unable to locate the CUDA SDK installation ' + 'directory, set CUDA library path manually') + return libdir, libptn + + def add_source(self, source, nvcc_options=None, name='kernel.ptx'): + ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, + keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch, + code=self.code, cache_dir=self.cache_dir, + include_dirs=self.include_dirs, target="ptx") + from pycuda.driver import jit_input_type + self.linker.add_data(ptx, jit_input_type.PTX, name) + return self + + def add_data(self, data, input_type, name='unknown'): + self.linker.add_data(data, input_type, name) + return self + + def add_file(self, filename, input_type): + self.linker.add_file(filename, 
input_type) + return self + + def add_stdlib(self, libname): + if self.libdir is None: + self.libdir, self.libptn = self._locate_cuda_libdir() + from os.path import isfile, join + libpath = join(self.libdir, self.libptn % libname) + if not isfile(libpath): + raise FileNotFoundError('CUDA SDK library file "%s" not found' % libpath) + from pycuda.driver import jit_input_type + self.linker.add_file(libpath, jit_input_type.LIBRARY) + return self + + def link(self): + self.module = self.linker.link_module() + self.linker = None + self._bind_module() + return self + +class DynamicSourceModule(DynamicModule): + ''' + Creates a Module from a single .cu source object linked against the + dynamic CUDA runtime. + - compiler generates PTX relocatable device code (rdc) from source that + can be linked with other relocatable device code + - source is linked against the CUDA device runtime library cudadevrt + - library cudadevrt is statically linked into the generated Module + ''' + def __init__(self, source, nvcc="nvcc", options=[], keep=False, + no_extern_c=False, arch=None, code=None, cache_dir=None, + include_dirs=[], cuda_libdir=None): + super(DynamicSourceModule, self).__init__(nvcc=nvcc, + link_options=None, keep=keep, no_extern_c=no_extern_c, + arch=arch, code=code, cache_dir=cache_dir, + include_dirs=include_dirs, cuda_libdir=cuda_libdir) + options = options[:] + if not '-rdc=true' in options: + options.append('-rdc=true') + if not '-lcudadevrt' in options: + options.append('-lcudadevrt') + self.add_source(source, nvcc_options=options) + self.add_stdlib('cudadevrt') + self.link() diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 72175979e8ee851fc3d4ee2abfb08b152297c86e..7bb23d4a7be6e8a2517cc31a5b758b6cc948f5d7 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -410,6 +410,134 @@ namespace // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 + class Linker : public boost::noncopyable + { + private: + 
py::object m_message_handler; + CUlinkState m_link_state; + bool m_log_verbose; + std::vector<CUjit_option> m_options; + std::vector<void*> m_values; + char m_info_buf[32768]; + char m_error_buf[32768]; + + void close() { + if (m_link_state != NULL) { + cuLinkDestroy(m_link_state); + m_link_state = NULL; + } + } + + template <typename T> + void add_option(CUjit_option option, T value) { + m_options.push_back(option); + m_values.push_back(reinterpret_cast<void*>(value)); + } + + void check_cu_result(const char* cu_function_name, CUresult cu_result) const { + if (cu_result != CUDA_SUCCESS) { + call_message_handler(cu_result); + throw pycuda::error(cu_function_name, cu_result, error_str().c_str()); + } + } + + void call_message_handler(CUresult cu_result) const { + if (m_message_handler != py::object()) { + m_message_handler(cu_result == CUDA_SUCCESS, info_str(), error_str()); + } + } + + const std::string info_str() const { + return std::string(m_info_buf, size_t(m_values[1])); + } + + const std::string error_str() const { + return std::string(m_error_buf, size_t(m_values[3])); + } + + public: + Linker(py::object message_handler = py::object(), + py::object py_options = py::object(), + py::object py_log_verbose = py::object(false)) + : m_message_handler(message_handler), + m_link_state(), + m_log_verbose(py::extract<bool>(py_log_verbose)) + { + add_option(CU_JIT_INFO_LOG_BUFFER, m_info_buf); + add_option(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, sizeof(m_info_buf)); + add_option(CU_JIT_ERROR_LOG_BUFFER, m_error_buf); + add_option(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, sizeof(m_error_buf)); + add_option(CU_JIT_LOG_VERBOSE, m_log_verbose? 
1ull : 0ull); + + if (py_options.ptr() != Py_None) { + PYTHON_FOREACH(key_value, py_options) { + add_option( + py::extract<CUjit_option>(key_value[0]), + py::extract<void*>(key_value[1])()); + } + } + + const CUresult cu_result = cuLinkCreate( + (unsigned int) m_options.size(), + const_cast<CUjit_option*>(&*m_options.begin()), + const_cast<void**>(&*m_values.begin()), + &m_link_state); + check_cu_result("cuLinkCreate", cu_result); + } + + ~Linker() + { + close(); + } + + void add_data(py::object py_data, CUjitInputType input_type, py::str py_name) + { + const char *data_buf; + PYCUDA_BUFFER_SIZE_T data_buf_len; + if (PyObject_AsCharBuffer(py_data.ptr(), &data_buf, &data_buf_len) != 0) { + throw py::error_already_set(); + } + const char* name = (py_name.ptr() != Py_None)? + py::extract<const char*>(py_name) : NULL; + const CUresult cu_result = cuLinkAddData(m_link_state, input_type, + static_cast<void*>(const_cast<char*>(data_buf)), + data_buf_len, name, 0, NULL, NULL); + check_cu_result("cuLinkAddData", cu_result); + } + + void add_file(py::str py_filename, CUjitInputType input_type) + { + const char* filename = py::extract<const char*>(py_filename); + const CUresult cu_result = cuLinkAddFile(m_link_state, input_type, + filename, 0, NULL, NULL); + check_cu_result("cuLinkAddFile", cu_result); + } + + module* link_module() + { + char* cubin_data = NULL; + size_t cubin_size = 0; + CUresult cu_result = cuLinkComplete(m_link_state, + reinterpret_cast<void**>(&cubin_data), &cubin_size); + check_cu_result("cuLinkComplete", cu_result); + + CUmodule cu_module = NULL; + cu_result = cuModuleLoadData(&cu_module, cubin_data); + check_cu_result("cuModuleLoadData", cu_result); + + call_message_handler(cu_result); + close(); + + return new module(cu_module); + } + }; +#endif + + // }}} + template <class T> PyObject *mem_obj_to_long(T const &mo) { @@ -888,6 +1016,10 @@ BOOST_PYTHON_MODULE(_driver) .value("PRINTF_FIFO_SIZE", CU_LIMIT_PRINTF_FIFO_SIZE) #if CUDAPP_CUDA_VERSION >= 3020 .value("MALLOC_HEAP_SIZE", CU_LIMIT_MALLOC_HEAP_SIZE) +#endif +#if CUDAPP_CUDA_VERSION >= 3050 + 
.value("DEV_RUNTIME_SYNC_DEPTH", CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH) + .value("DEV_RUNTIME_PENDING_LAUNCH_COUNT", CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT) #endif ; #endif @@ -1058,6 +1190,27 @@ BOOST_PYTHON_MODULE(_driver) // }}} + // {{{ linker + +#if CUDAPP_CUDA_VERSION >= 5050 + py::enum_<CUjitInputType>("jit_input_type") + .value("CUBIN", CU_JIT_INPUT_CUBIN) + .value("PTX", CU_JIT_INPUT_PTX) + .value("FATBINARY", CU_JIT_INPUT_FATBINARY) + .value("OBJECT", CU_JIT_INPUT_OBJECT) + .value("LIBRARY", CU_JIT_INPUT_LIBRARY); + + py::class_<Linker, boost::shared_ptr<Linker> >("Linker") + .def(py::init<py::object>()) + .def(py::init<py::object, py::object>()) + .def(py::init<py::object, py::object, py::object>()) + .def("add_data", &Linker::add_data, (py::arg("data"), py::arg("input_type"), py::arg("name")=py::str("unknown"))) + .def("add_file", &Linker::add_file, (py::arg("filename"), py::arg("input_type"))) + .def("link_module", &Linker::link_module, py::return_value_policy<py::manage_new_object>()); +#endif + + // }}} + // {{{ function { typedef function cl; diff --git a/test/test_driver.py b/test/test_driver.py index 56d936589d4983788b1942c215fa222758a339a2..f88a1d67108ae91450b5af411ccc87f784bc45cd 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -838,7 +838,6 @@ class TestDriver: drv.memcpy_htod_async(gpu_ary, a_pin, stream) drv.Context.synchronize() - @pytest.mark.xfail @mark_cuda_test # https://github.com/inducer/pycuda/issues/45 def test_recursive_launch(self): @@ -894,9 +893,8 @@ class TestDriver: drv.memcpy_htod(a_gpu, a) drv.memcpy_htod(b_gpu, b) - mod = SourceModule(cuda_string, - options=['-rdc=true', '-lcudadevrt'], - keep=True) + from pycuda.compiler import DynamicSourceModule + mod = DynamicSourceModule(cuda_string, keep=True) func = mod.get_function("math") func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, @@ -918,6 +916,34 @@ class TestDriver: math(a, b, c, d, e, f) + @mark_cuda_test + def test_jit_link_module(self): + if drv.Context.get_device().compute_capability() < (3, 5): + from pytest import skip + skip("need compute capability 3.5 or higher for dynamic parallelism") + + 
test_outer_cu = '''#include <cstdio> + __global__ void test_kernel() { + extern __global__ void test_kernel_inner(); + printf("Hello outer world!\\n"); + test_kernel_inner<<<2, 1>>>(); + }''' + + test_inner_cu = '''#include <cstdio> + __global__ void test_kernel_inner() { + printf(" Hello inner world!\\n"); + }''' + + from pycuda.compiler import DynamicModule + mod = DynamicModule() + mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_source(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_stdlib('cudadevrt') + mod.link() + + test_kernel = mod.get_function('test_kernel') + test_kernel(grid=(2,1), block=(1,1,1)) + def test_import_pyopencl_before_pycuda(): try: