// A C++ wrapper for CUDA

#ifndef _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_HPP
#define _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_HPP

#include <cuda.h>

#include <stdexcept>
#include <string>
#include <iostream>
#include <stack>
#include <memory>

#include <boost/python.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/utility.hpp>
#include <boost/thread/thread.hpp>
#include <boost/thread/tss.hpp>
#include <boost/version.hpp>

#if (BOOST_VERSION/100) < 1035
#warning *****************************************************************
#warning **** Your version of Boost C++ is likely too old for PyCUDA. ****
#warning *****************************************************************
#endif

// TODO: cuCtxSetCurrent, cuCtxGetCurrent, cuMemHostRegister, cuMemHostUnregister
// TODO: cuMemcpy, cuMemcpyPeer, cuMemcpyPeerAsync
// TODO: in structured memcpy: set_{src,dest}_unified()
// TODO: cuPointerGetAttribute, cuLaunchKernel, deprecation of other launch functions
// TODO: cuMemPeerRegister, cuMemPeerUnregister, cuMemPeerGetDevicePointer

// #define CUDAPP_TRACE_CUDA
#define CUDAPP_POST_30_BETA

#ifdef CUDAPP_PRETEND_CUDA_VERSION
#define CUDAPP_CUDA_VERSION CUDAPP_PRETEND_CUDA_VERSION
#else
#define CUDAPP_CUDA_VERSION CUDA_VERSION
#endif

// {{{ tracing and error guards

#ifdef CUDAPP_TRACE_CUDA
  #define CUDAPP_PRINT_CALL_TRACE(NAME) \
    std::cerr << NAME << std::endl;
  #define CUDAPP_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) \
    std::cerr << NAME << " (" << EXTRA_INFO << ')' << std::endl;
  #define CUDAPP_PRINT_ERROR_TRACE(NAME, CODE) \
    if (CODE != CUDA_SUCCESS) \
      std::cerr << NAME << " failed with code " << CODE << std::endl;
#else
  #define CUDAPP_PRINT_CALL_TRACE(NAME) /*nothing*/
  #define CUDAPP_PRINT_CALL_TRACE_INFO(NAME, EXTRA_INFO) /*nothing*/
  #define CUDAPP_PRINT_ERROR_TRACE(NAME, CODE) /*nothing*/
#endif

#define CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \
  { \
    CUDAPP_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \
    CUresult cu_status_code; \
    Py_BEGIN_ALLOW_THREADS \
      cu_status_code = NAME ARGLIST; \
    Py_END_ALLOW_THREADS \
    if (cu_status_code != CUDA_SUCCESS) \
      throw pycuda::error(#NAME, cu_status_code); \
  }

#define CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(NAME, ARGLIST, TRACE_INFO) \
  { \
    CUDAPP_PRINT_CALL_TRACE_INFO(#NAME, TRACE_INFO); \
    CUresult cu_status_code; \
    cu_status_code = NAME ARGLIST; \
    CUDAPP_PRINT_ERROR_TRACE(#NAME, cu_status_code); \
    if (cu_status_code != CUDA_SUCCESS) \
      throw pycuda::error(#NAME, cu_status_code); \
  }

#define CUDAPP_CALL_GUARDED_THREADED(NAME, ARGLIST) \
  { \
    CUDAPP_PRINT_CALL_TRACE(#NAME); \
    CUresult cu_status_code; \
    Py_BEGIN_ALLOW_THREADS \
      cu_status_code = NAME ARGLIST; \
    Py_END_ALLOW_THREADS \
    CUDAPP_PRINT_ERROR_TRACE(#NAME, cu_status_code); \
    if (cu_status_code != CUDA_SUCCESS) \
      throw pycuda::error(#NAME, cu_status_code); \
  }

#define CUDAPP_CALL_GUARDED(NAME, ARGLIST) \
  { \
    CUDAPP_PRINT_CALL_TRACE(#NAME); \
    CUresult cu_status_code; \
    cu_status_code = NAME ARGLIST; \
    CUDAPP_PRINT_ERROR_TRACE(#NAME, cu_status_code); \
    if (cu_status_code != CUDA_SUCCESS) \
      throw pycuda::error(#NAME, cu_status_code); \
  }

#define CUDAPP_CALL_GUARDED_CLEANUP(NAME, ARGLIST) \
  { \
    CUDAPP_PRINT_CALL_TRACE(#NAME); \
    CUresult cu_status_code; \
    cu_status_code = NAME ARGLIST; \
    CUDAPP_PRINT_ERROR_TRACE(#NAME, cu_status_code); \
    if (cu_status_code != CUDA_SUCCESS) \
      std::cerr \
        << "PyCUDA WARNING: a clean-up operation failed (dead context maybe?)" \
        << std::endl \
        << pycuda::error::make_message(#NAME, cu_status_code) \
        << std::endl; \
  }

#define CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(TYPE) \
  catch (pycuda::cannot_activate_out_of_thread_context) \
  { } \
  catch (pycuda::cannot_activate_dead_context) \
  { \
    /* PyErr_Warn( \
        PyExc_UserWarning, #TYPE " in dead context was implicitly cleaned \
up");*/ \ } // In all likelihood, this TYPE's managing thread has exited, and // therefore its context has already been deleted. No need to harp // on the fact that we still thought there was cleanup to do. // }}} namespace pycuda { namespace py = boost::python; typedef #if CUDAPP_CUDA_VERSION >= 3020 size_t #else unsigned int #endif pycuda_size_t; typedef #if defined(_WIN32) && defined(_WIN64) long long #else long #endif hash_type; // {{{ error reporting class error : public std::runtime_error { private: const char *m_routine; CUresult m_code; public: static std::string make_message(const char *rout, CUresult c, const char *msg=0) { std::string result = rout; result += " failed: "; result += curesult_to_str(c); if (msg) { result += " - "; result += msg; } return result; } error(const char *rout, CUresult c, const char *msg=0) : std::runtime_error(make_message(rout, c, msg)), m_routine(rout), m_code(c) { } const char *routine() const { return m_routine; } CUresult code() const { return m_code; } bool is_out_of_memory() const { return code() == CUDA_ERROR_OUT_OF_MEMORY; } static const char *curesult_to_str(CUresult e) { switch (e) { case CUDA_SUCCESS: return "success"; case CUDA_ERROR_INVALID_VALUE: return "invalid value"; case CUDA_ERROR_OUT_OF_MEMORY: return "out of memory"; case CUDA_ERROR_NOT_INITIALIZED: return "not initialized"; #if CUDAPP_CUDA_VERSION >= 2000 case CUDA_ERROR_DEINITIALIZED: return "deinitialized"; #endif #if CUDAPP_CUDA_VERSION >= 4000 case CUDA_ERROR_PROFILER_DISABLED: return "profiler disabled"; case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "profiler not initialized"; case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "profiler already started"; case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "profiler already stopped"; #endif case CUDA_ERROR_NO_DEVICE: return "no device"; case CUDA_ERROR_INVALID_DEVICE: return "invalid device"; case CUDA_ERROR_INVALID_IMAGE: return "invalid image"; case CUDA_ERROR_INVALID_CONTEXT: return "invalid context"; case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "context already current"; case CUDA_ERROR_MAP_FAILED: return "map failed"; case CUDA_ERROR_UNMAP_FAILED: return "unmap failed"; case CUDA_ERROR_ARRAY_IS_MAPPED: return "array is mapped"; case CUDA_ERROR_ALREADY_MAPPED: return "already mapped"; case CUDA_ERROR_NO_BINARY_FOR_GPU: return "no binary for gpu"; case CUDA_ERROR_ALREADY_ACQUIRED: return "already acquired"; case CUDA_ERROR_NOT_MAPPED: return "not mapped"; #if CUDAPP_CUDA_VERSION >= 3000 case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "not mapped as array"; case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "not mapped as pointer"; #ifdef CUDAPP_POST_30_BETA case CUDA_ERROR_ECC_UNCORRECTABLE: return "ECC uncorrectable"; #endif #endif #if CUDAPP_CUDA_VERSION >= 3010 case CUDA_ERROR_UNSUPPORTED_LIMIT: return "unsupported limit"; #endif #if CUDAPP_CUDA_VERSION >= 4000 case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "context already in use"; #endif case CUDA_ERROR_INVALID_SOURCE: return "invalid source"; case CUDA_ERROR_FILE_NOT_FOUND: return "file not found"; #if CUDAPP_CUDA_VERSION >= 3010 case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "shared object symbol not found"; case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "shared object init failed"; #endif case CUDA_ERROR_INVALID_HANDLE: return "invalid handle"; case CUDA_ERROR_NOT_FOUND: return "not found"; case CUDA_ERROR_NOT_READY: return "not ready"; case CUDA_ERROR_LAUNCH_FAILED: return "launch failed"; case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "launch out of 
resources"; case CUDA_ERROR_LAUNCH_TIMEOUT: return "launch timeout"; case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "launch incompatible texturing"; #if CUDAPP_CUDA_VERSION >= 4000 case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "peer access already enabled"; case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "peer access not enabled"; case CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED: return "peer memory already registered"; case CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED: return "peer memory not registered"; case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "primary context active"; case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "context is destroyed"; #endif #if (CUDAPP_CUDA_VERSION >= 3000) && (CUDAPP_CUDA_VERSION < 3020) case CUDA_ERROR_POINTER_IS_64BIT: return "attempted to retrieve 64-bit pointer via 32-bit api function"; case CUDA_ERROR_SIZE_IS_64BIT: return "attempted to retrieve 64-bit size via 32-bit api function"; #endif case CUDA_ERROR_UNKNOWN: return "unknown"; default: return "invalid error code"; } } }; struct cannot_activate_out_of_thread_context : public std::logic_error { cannot_activate_out_of_thread_context(std::string const &w) : std::logic_error(w) { } }; struct cannot_activate_dead_context : public std::logic_error { cannot_activate_dead_context(std::string const &w) : std::logic_error(w) { } }; // }}} // {{{ version query ------------------------------------------------------------ #if CUDAPP_CUDA_VERSION >= 2020 inline int get_driver_version() { int result; CUDAPP_CALL_GUARDED(cuDriverGetVersion, (&result)); return result; } #endif // }}} // {{{ device class context; class device { private: CUdevice m_device; public: device(CUdevice dev) : m_device(dev) { } static int count() { int result; CUDAPP_CALL_GUARDED(cuDeviceGetCount, (&result)); return result; } std::string name() { char buffer[1024]; CUDAPP_CALL_GUARDED(cuDeviceGetName, (buffer, sizeof(buffer), m_device)); return buffer; } py::tuple compute_capability() { int major, minor; CUDAPP_CALL_GUARDED(cuDeviceComputeCapability, (&major, &minor, m_device)); return py::make_tuple(major, minor); } pycuda_size_t total_memory() { pycuda_size_t bytes; CUDAPP_CALL_GUARDED(cuDeviceTotalMem, (&bytes, m_device)); return bytes; } int get_attribute(CUdevice_attribute attr) const { int result; CUDAPP_CALL_GUARDED(cuDeviceGetAttribute, (&result, attr, m_device)); return result; } bool operator==(const device &other) const { return m_device == other.m_device; } bool operator!=(const device &other) const { return m_device != other.m_device; } hash_type hash() const { return m_device; } boost::shared_ptr make_context(unsigned int flags); CUdevice handle() const { return m_device; } #if CUDAPP_CUDA_VERSION >= 4000 bool can_access_peer(device const &other) { int result; CUDAPP_CALL_GUARDED(cuDeviceCanAccessPeer, (&result, handle(), other.handle())); return result; } #endif }; inline void init(unsigned int flags) { CUDAPP_CALL_GUARDED(cuInit, (flags)); } inline device *make_device(int ordinal) { CUdevice result; CUDAPP_CALL_GUARDED(cuDeviceGet, (&result, ordinal)); return new device(result); } // }}} // {{{ context /* A word on context management: We don't let CUDA's context stack get more * than one deep. CUDA only supports pushing floating contexts. We may wish * to push contexts that are already active at a deeper stack level, so we * maintain all contexts floating other than the top one. 
   */

  // for friend decl
  namespace gl {
    boost::shared_ptr<context>
        make_gl_context(device const &dev, unsigned int flags);
  }

  class context_stack;
  extern boost::thread_specific_ptr<context_stack> context_stack_ptr;

  class context_stack
  {
      /* This wrapper is necessary because we need to pop the contents
       * off the stack before we destroy each of the contexts. This, in turn,
       * is because the contexts need to be able to access the stack in order
       * to be destroyed.
       */
    private:
      typedef std::stack<boost::shared_ptr<context> > stack_t;
      typedef stack_t::value_type value_type;
      stack_t m_stack;

    public:
      ~context_stack();

      bool empty() const
      { return m_stack.empty(); }

      value_type &top()
      { return m_stack.top(); }

      void pop()
      { m_stack.pop(); }

      void push(value_type v)
      { m_stack.push(v); }

      static context_stack &get()
      {
        if (context_stack_ptr.get() == 0)
          context_stack_ptr.reset(new context_stack);

        return *context_stack_ptr;
      }
  };

  class context : boost::noncopyable
  {
    private:
      CUcontext m_context;
      bool m_valid;
      unsigned m_use_count;
      boost::thread::id m_thread;

    public:
      context(CUcontext ctx)
        : m_context(ctx), m_valid(true), m_use_count(1),
        m_thread(boost::this_thread::get_id())
      { }

      ~context()
      {
        if (m_valid)
        {
          /* It's possible that we get here with a non-zero m_use_count. Since the context
           * stack holds shared_ptrs, this must mean that the context stack itself is getting
           * destroyed, which means it's ok for this context to sign off, too.
           */
          detach();
        }
      }

      CUcontext handle() const
      { return m_context; }

      bool operator==(const context &other) const
      { return m_context == other.m_context; }

      bool operator!=(const context &other) const
      { return m_context != other.m_context; }

      hash_type hash() const
      { return hash_type(m_context) ^ hash_type(this); }

      boost::thread::id thread_id() const
      { return m_thread; }

      bool is_valid() const
      { return m_valid; }

      void detach()
      {
        if (m_valid)
        {
          bool active_before_destruction = current_context().get() == this;
          if (active_before_destruction)
          {
            CUDAPP_CALL_GUARDED_CLEANUP(cuCtxDetach, (m_context));
          }
          else
          {
            if (m_thread == boost::this_thread::get_id())
            {
              CUDAPP_CALL_GUARDED_CLEANUP(cuCtxPushCurrent, (m_context));
              CUDAPP_CALL_GUARDED_CLEANUP(cuCtxDetach, (m_context));
              /* pop is implicit in detach */
            }
            else
            {
              // In all likelihood, this context's managing thread has exited, and
              // therefore this context has already been deleted. No need to harp
              // on the fact that we still thought there was cleanup to do.
// std::cerr << "PyCUDA WARNING: leaked out-of-thread context " << std::endl; } } m_valid = false; if (active_before_destruction) { boost::shared_ptr new_active = current_context(this); if (new_active.get()) { CUDAPP_CALL_GUARDED(cuCtxPushCurrent, (new_active->m_context)); } } } else throw error("context::detach", CUDA_ERROR_INVALID_CONTEXT, "cannot detach from invalid context"); } static device get_device() { CUdevice dev; CUDAPP_CALL_GUARDED(cuCtxGetDevice, (&dev)); return device(dev); } #if CUDAPP_CUDA_VERSION >= 2000 static void prepare_context_switch() { if (!context_stack::get().empty()) { CUcontext popped; CUDAPP_CALL_GUARDED(cuCtxPopCurrent, (&popped)); } } static void pop() { prepare_context_switch(); context_stack &ctx_stack = context_stack::get(); if (ctx_stack.empty()) { throw error("context::pop", CUDA_ERROR_INVALID_CONTEXT, "cannot pop non-current context"); } boost::shared_ptr current = current_context(); if (current) --current->m_use_count; ctx_stack.pop(); current = current_context(); if (current) CUDAPP_CALL_GUARDED(cuCtxPushCurrent, (current_context()->m_context)); } #else static void prepare_context_switch() { } #endif static void synchronize() { CUDAPP_CALL_GUARDED_THREADED(cuCtxSynchronize, ()); } static boost::shared_ptr current_context(context *except=0) { while (true) { if (context_stack::get().empty()) return boost::shared_ptr(); boost::shared_ptr result(context_stack::get().top()); if (result.get() != except && result->is_valid()) { // good, weak pointer didn't expire return result; } // context invalid, pop it and try again. context_stack::get().pop(); } } #if CUDAPP_CUDA_VERSION >= 3010 static void set_limit(CUlimit limit, size_t value) { CUDAPP_CALL_GUARDED(cuCtxSetLimit, (limit, value)); } static size_t get_limit(CUlimit limit) { size_t value; CUDAPP_CALL_GUARDED(cuCtxGetLimit, (&value, limit)); return value; } #endif #if CUDAPP_CUDA_VERSION >= 3020 static CUfunc_cache get_cache_config() { CUfunc_cache value; CUDAPP_CALL_GUARDED(cuCtxGetCacheConfig, (&value)); return value; } static void set_cache_config(CUfunc_cache cc) { CUDAPP_CALL_GUARDED(cuCtxSetCacheConfig, (cc)); } unsigned int get_api_version() { unsigned int value; CUDAPP_CALL_GUARDED(cuCtxGetApiVersion, (m_context, &value)); return value; } #endif #if CUDAPP_CUDA_VERSION >= 4000 static void enable_peer_access(context const &peer, unsigned int flags) { CUDAPP_CALL_GUARDED(cuCtxEnablePeerAccess, (peer.handle(), flags)); } static void disable_peer_access(context const &peer) { CUDAPP_CALL_GUARDED(cuCtxDisablePeerAccess, (peer.handle())); } #endif friend class device; friend void context_push(boost::shared_ptr ctx); friend boost::shared_ptr gl::make_gl_context(device const &dev, unsigned int flags); }; inline boost::shared_ptr device::make_context(unsigned int flags) { context::prepare_context_switch(); CUcontext ctx; CUDAPP_CALL_GUARDED(cuCtxCreate, (&ctx, flags, m_device)); boost::shared_ptr result(new context(ctx)); context_stack::get().push(result); return result; } #if CUDAPP_CUDA_VERSION >= 2000 inline void context_push(boost::shared_ptr ctx) { context::prepare_context_switch(); CUDAPP_CALL_GUARDED(cuCtxPushCurrent, (ctx->m_context)); context_stack::get().push(ctx); ++ctx->m_use_count; } #endif inline context_stack::~context_stack() { if (!m_stack.empty()) { std::cerr << "-------------------------------------------------------------------" << std::endl << "PyCUDA ERROR: The context stack was not empty upon module cleanup." 
<< std::endl << "-------------------------------------------------------------------" << std::endl << "A context was still active when the context stack was being" << std::endl << "cleaned up. At this point in our execution, CUDA may already" << std::endl << "have been deinitialized, so there is no way we can finish" << std::endl << "cleanly. The program will be aborted now." << std::endl << "Use Context.pop() to avoid this problem." << std::endl << "-------------------------------------------------------------------" << std::endl; abort(); } } class explicit_context_dependent { private: boost::shared_ptr m_ward_context; public: void acquire_context() { m_ward_context = context::current_context(); if (m_ward_context.get() == 0) throw error("explicit_context_dependent", CUDA_ERROR_INVALID_CONTEXT, "no currently active context?"); } void release_context() { m_ward_context.reset(); } boost::shared_ptr get_context() { return m_ward_context; } }; class context_dependent : public explicit_context_dependent { private: boost::shared_ptr m_ward_context; public: context_dependent() { acquire_context(); } }; class scoped_context_activation { private: boost::shared_ptr m_context; bool m_did_switch; public: scoped_context_activation(boost::shared_ptr ctx) : m_context(ctx) { if (!m_context->is_valid()) throw pycuda::cannot_activate_dead_context( "cannot activate dead context"); m_did_switch = context::current_context() != m_context; if (m_did_switch) { if (boost::this_thread::get_id() != m_context->thread_id()) throw pycuda::cannot_activate_out_of_thread_context( "cannot activate out-of-thread context"); #if CUDAPP_CUDA_VERSION >= 2000 context_push(m_context); #else throw pycuda::error("scoped_context_activation", CUDA_ERROR_INVALID_CONTEXT, "not available in CUDA < 2.0"); #endif } } ~scoped_context_activation() { #if CUDAPP_CUDA_VERSION >= 2000 if (m_did_switch) m_context->pop(); #endif } }; // }}} // {{{ stream class event; class stream : public boost::noncopyable, public context_dependent { private: CUstream m_stream; public: stream(unsigned int flags=0) { CUDAPP_CALL_GUARDED(cuStreamCreate, (&m_stream, flags)); } ~stream() { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuStreamDestroy, (m_stream)); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(stream); } void synchronize() { CUDAPP_CALL_GUARDED_THREADED(cuStreamSynchronize, (m_stream)); } CUstream handle() const { return m_stream; } #if CUDAPP_CUDA_VERSION >= 3020 void wait_for_event(const event &evt); #endif bool is_done() const { CUDAPP_PRINT_CALL_TRACE("cuStreamQuery"); CUresult result = cuStreamQuery(m_stream); switch (result) { case CUDA_SUCCESS: return true; case CUDA_ERROR_NOT_READY: return false; default: CUDAPP_PRINT_ERROR_TRACE("cuStreamQuery", result); throw error("cuStreamQuery", result); } } }; // }}} // {{{ array class array : public boost::noncopyable, public context_dependent { private: CUarray m_array; bool m_managed; public: array(const CUDA_ARRAY_DESCRIPTOR &descr) : m_managed(true) { CUDAPP_CALL_GUARDED(cuArrayCreate, (&m_array, &descr)); } #if CUDAPP_CUDA_VERSION >= 2000 array(const CUDA_ARRAY3D_DESCRIPTOR &descr) : m_managed(true) { CUDAPP_CALL_GUARDED(cuArray3DCreate, (&m_array, &descr)); } #endif array(CUarray ary, bool managed) : m_array(ary), m_managed(managed) { } ~array() { free(); } void free() { if (m_managed) { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuArrayDestroy, (m_array)); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(array); m_managed = false; 
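      // the context was kept alive (via context_dependent) only so that the
      // CUDA array could be freed inside it; drop that reference now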
release_context(); } } CUDA_ARRAY_DESCRIPTOR get_descriptor() { CUDA_ARRAY_DESCRIPTOR result; CUDAPP_CALL_GUARDED(cuArrayGetDescriptor, (&result, m_array)); return result; } #if CUDAPP_CUDA_VERSION >= 2000 CUDA_ARRAY3D_DESCRIPTOR get_descriptor_3d() { CUDA_ARRAY3D_DESCRIPTOR result; CUDAPP_CALL_GUARDED(cuArray3DGetDescriptor, (&result, m_array)); return result; } #endif CUarray handle() const { return m_array; } }; // }}} // {{{ texture reference class module; class texture_reference : public boost::noncopyable { private: CUtexref m_texref; bool m_managed; // life support for array and module boost::shared_ptr m_array; boost::shared_ptr m_module; public: texture_reference() : m_managed(true) { CUDAPP_CALL_GUARDED(cuTexRefCreate, (&m_texref)); } texture_reference(CUtexref tr, bool managed) : m_texref(tr), m_managed(managed) { } ~texture_reference() { if (m_managed) { CUDAPP_CALL_GUARDED_CLEANUP(cuTexRefDestroy, (m_texref)); } } void set_module(boost::shared_ptr mod) { m_module = mod; } CUtexref handle() const { return m_texref; } void set_array(boost::shared_ptr ary) { CUDAPP_CALL_GUARDED(cuTexRefSetArray, (m_texref, ary->handle(), CU_TRSA_OVERRIDE_FORMAT)); m_array = ary; } pycuda_size_t set_address(CUdeviceptr dptr, unsigned int bytes, bool allow_offset=false) { pycuda_size_t byte_offset; CUDAPP_CALL_GUARDED(cuTexRefSetAddress, (&byte_offset, m_texref, dptr, bytes)); if (!allow_offset && byte_offset != 0) throw pycuda::error("texture_reference::set_address", CUDA_ERROR_INVALID_VALUE, "texture binding resulted in offset, but allow_offset was false"); m_array.reset(); return byte_offset; } #if CUDAPP_CUDA_VERSION >= 2020 void set_address_2d(CUdeviceptr dptr, const CUDA_ARRAY_DESCRIPTOR &descr, unsigned int pitch) { CUDAPP_CALL_GUARDED(cuTexRefSetAddress2D, (m_texref, &descr, dptr, pitch)); } #endif void set_format(CUarray_format fmt, int num_packed_components) { CUDAPP_CALL_GUARDED(cuTexRefSetFormat, (m_texref, fmt, num_packed_components)); } void set_address_mode(int dim, CUaddress_mode am) { CUDAPP_CALL_GUARDED(cuTexRefSetAddressMode, (m_texref, dim, am)); } void set_filter_mode(CUfilter_mode fm) { CUDAPP_CALL_GUARDED(cuTexRefSetFilterMode, (m_texref, fm)); } void set_flags(unsigned int flags) { CUDAPP_CALL_GUARDED(cuTexRefSetFlags, (m_texref, flags)); } CUdeviceptr get_address() { CUdeviceptr result; CUDAPP_CALL_GUARDED(cuTexRefGetAddress, (&result, m_texref)); return result; } array *get_array() { CUarray result; CUDAPP_CALL_GUARDED(cuTexRefGetArray, (&result, m_texref)); return new array(result, false); } CUaddress_mode get_address_mode(int dim) { CUaddress_mode result; CUDAPP_CALL_GUARDED(cuTexRefGetAddressMode, (&result, m_texref, dim)); return result; } CUfilter_mode get_filter_mode() { CUfilter_mode result; CUDAPP_CALL_GUARDED(cuTexRefGetFilterMode, (&result, m_texref)); return result; } #if CUDAPP_CUDA_VERSION >= 2000 py::tuple get_format() { CUarray_format fmt; int num_channels; CUDAPP_CALL_GUARDED(cuTexRefGetFormat, (&fmt, &num_channels, m_texref)); return py::make_tuple(fmt, num_channels); } #endif unsigned int get_flags() { unsigned int result; CUDAPP_CALL_GUARDED(cuTexRefGetFlags, (&result, m_texref)); return result; } }; // }}} // {{{ surface reference #if CUDAPP_CUDA_VERSION >= 3010 class module; class surface_reference : public boost::noncopyable { private: CUsurfref m_surfref; // life support for array and module boost::shared_ptr m_array; boost::shared_ptr m_module; public: surface_reference(CUsurfref sr) : m_surfref(sr) { } void set_module(boost::shared_ptr mod) { 
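      // keep the module that owns this surface reference alive for as long
      // as the reference itself may still be used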
m_module = mod; } CUsurfref handle() const { return m_surfref; } void set_array(boost::shared_ptr ary, unsigned int flags) { CUDAPP_CALL_GUARDED(cuSurfRefSetArray, (m_surfref, ary->handle(), flags)); m_array = ary; } array *get_array() { CUarray result; CUDAPP_CALL_GUARDED(cuSurfRefGetArray, (&result, m_surfref)); return new array(result, false); } }; #endif // }}} // {{{ module class function; class module : public boost::noncopyable, public context_dependent { private: CUmodule m_module; public: module(CUmodule mod) : m_module(mod) { } ~module() { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuModuleUnload, (m_module)); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(module); } CUmodule handle() const { return m_module; } function get_function(const char *name); py::tuple get_global(const char *name) { CUdeviceptr devptr; pycuda_size_t bytes; CUDAPP_CALL_GUARDED(cuModuleGetGlobal, (&devptr, &bytes, m_module, name)); return py::make_tuple(devptr, bytes); } }; inline module *module_from_file(const char *filename) { CUmodule mod; CUDAPP_CALL_GUARDED(cuModuleLoad, (&mod, filename)); return new module(mod); } inline texture_reference *module_get_texref( boost::shared_ptr mod, const char *name) { CUtexref tr; CUDAPP_CALL_GUARDED(cuModuleGetTexRef, (&tr, mod->handle(), name)); std::auto_ptr result( new texture_reference(tr, false)); result->set_module(mod); return result.release(); } #if CUDAPP_CUDA_VERSION >= 3010 inline surface_reference *module_get_surfref( boost::shared_ptr mod, const char *name) { CUsurfref sr; CUDAPP_CALL_GUARDED(cuModuleGetSurfRef, (&sr, mod->handle(), name)); std::auto_ptr result( new surface_reference(sr)); result->set_module(mod); return result.release(); } #endif // }}} // {{{ function class function { private: CUfunction m_function; std::string m_symbol; public: function(CUfunction func, std::string const &sym) : m_function(func), m_symbol(sym) { } void set_block_shape(int x, int y, int z) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuFuncSetBlockShape, (m_function, x, y, z), m_symbol); } void set_shared_size(unsigned int bytes) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuFuncSetSharedSize, (m_function, bytes), m_symbol); } void param_set_size(unsigned int bytes) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuParamSetSize, (m_function, bytes), m_symbol); } void param_set(int offset, unsigned int value) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuParamSeti, (m_function, offset, value), m_symbol); } void param_set(int offset, float value) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuParamSetf, (m_function, offset, value), m_symbol); } void param_setv(int offset, void *buf, size_t len) { // maybe the unsigned int will change, it does not seem right CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuParamSetv, (m_function, offset, buf, (unsigned int) len), m_symbol); } void param_set_texref(const texture_reference &tr) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(cuParamSetTexRef, (m_function, CU_PARAM_TR_DEFAULT, tr.handle()), m_symbol); } void launch() { CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO( cuLaunch, (m_function), m_symbol); } void launch_grid(int grid_width, int grid_height) { CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO( cuLaunchGrid, (m_function, grid_width, grid_height), m_symbol); } void launch_grid_async(int grid_width, int grid_height, const stream &s) { CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO( cuLaunchGridAsync, (m_function, grid_width, grid_height, s.handle()), m_symbol); } #if CUDAPP_CUDA_VERSION >= 2020 int get_attribute(CUfunction_attribute attr) const { 
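        // query a single launch-related property of this kernel (e.g.
        // CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK) via cuFuncGetAttribute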
int result; CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuFuncGetAttribute, (&result, attr, m_function), m_symbol); return result; } #endif #if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA) void set_cache_config(CUfunc_cache fc) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( cuFuncSetCacheConfig, (m_function, fc), m_symbol); } #endif }; inline function module::get_function(const char *name) { CUfunction func; CUDAPP_CALL_GUARDED(cuModuleGetFunction, (&func, m_module, name)); return function(func, name); } // }}} // {{{ device memory inline py::tuple mem_get_info() { pycuda_size_t free, total; CUDAPP_CALL_GUARDED(cuMemGetInfo, (&free, &total)); return py::make_tuple(free, total); } inline CUdeviceptr mem_alloc(size_t bytes) { CUdeviceptr devptr; CUDAPP_CALL_GUARDED(cuMemAlloc, (&devptr, bytes)); return devptr; } inline void mem_free(CUdeviceptr devptr) { CUDAPP_CALL_GUARDED_CLEANUP(cuMemFree, (devptr)); } // A class the user can override to make device_allocation- // workalikes. class pointer_holder_base { public: virtual ~pointer_holder_base() { } virtual CUdeviceptr get_pointer() = 0; operator CUdeviceptr() { return get_pointer(); } }; class device_allocation : public boost::noncopyable, public context_dependent { private: bool m_valid; protected: CUdeviceptr m_devptr; public: device_allocation(CUdeviceptr devptr) : m_valid(true), m_devptr(devptr) { } void free() { if (m_valid) { try { scoped_context_activation ca(get_context()); mem_free(m_devptr); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(device_allocation); release_context(); m_valid = false; } else throw pycuda::error("device_allocation::free", CUDA_ERROR_INVALID_HANDLE); } ~device_allocation() { if (m_valid) free(); } operator CUdeviceptr() const { return m_devptr; } }; inline Py_ssize_t mem_alloc_pitch( std::auto_ptr &da, unsigned int width, unsigned int height, unsigned int access_size) { CUdeviceptr devptr; pycuda_size_t pitch; CUDAPP_CALL_GUARDED(cuMemAllocPitch, (&devptr, &pitch, width, height, access_size)); da = std::auto_ptr(new device_allocation(devptr)); return pitch; } inline py::tuple mem_get_address_range(CUdeviceptr ptr) { CUdeviceptr base; pycuda_size_t size; CUDAPP_CALL_GUARDED(cuMemGetAddressRange, (&base, &size, ptr)); return py::make_tuple(base, size); } inline void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, unsigned int len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoA, (ary.handle(), index, src, len)); } inline void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, unsigned int len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoD, (dst, ary.handle(), index, len)); } inline void memcpy_atoa( array const &dst, unsigned int dst_index, array const &src, unsigned int src_index, unsigned int len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoA, (dst.handle(), dst_index, src.handle(), src_index, len)); } // }}} // {{{ structured memcpy #if PY_VERSION_HEX >= 0x02050000 typedef Py_ssize_t PYCUDA_BUFFER_SIZE_T; #else typedef int PYCUDA_BUFFER_SIZE_T; #endif #define MEMCPY_SETTERS \ void set_src_host(py::object buf_py) \ { \ srcMemoryType = CU_MEMORYTYPE_HOST; \ PYCUDA_BUFFER_SIZE_T len; \ if (PyObject_AsReadBuffer(buf_py.ptr(), &srcHost, &len)) \ throw py::error_already_set(); \ } \ \ void set_src_array(array const &ary) \ { \ srcMemoryType = CU_MEMORYTYPE_ARRAY; \ srcArray = ary.handle(); \ } \ \ void set_src_device(CUdeviceptr devptr) \ { \ srcMemoryType = CU_MEMORYTYPE_DEVICE; \ srcDevice = devptr; \ } \ \ void set_dst_host(py::object buf_py) \ { \ dstMemoryType = CU_MEMORYTYPE_HOST; \ 
PYCUDA_BUFFER_SIZE_T len; \ if (PyObject_AsWriteBuffer(buf_py.ptr(), &dstHost, &len)) \ throw py::error_already_set(); \ } \ \ void set_dst_array(array const &ary) \ { \ dstMemoryType = CU_MEMORYTYPE_ARRAY; \ dstArray = ary.handle(); \ } \ \ void set_dst_device(CUdeviceptr devptr) \ { \ dstMemoryType = CU_MEMORYTYPE_DEVICE; \ dstDevice = devptr; \ } struct memcpy_2d : public CUDA_MEMCPY2D { memcpy_2d() { srcXInBytes = 0; srcY = 0; dstXInBytes = 0; dstY = 0; } MEMCPY_SETTERS; void execute(bool aligned=false) const { if (aligned) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2D, (this)); } else { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DUnaligned, (this)); } } void execute_async(const stream &s) const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DAsync, (this, s.handle())); } }; #if CUDAPP_CUDA_VERSION >= 2000 struct memcpy_3d : public CUDA_MEMCPY3D { memcpy_3d() { reserved0 = 0; reserved1 = 0; srcXInBytes = 0; srcY = 0; srcZ = 0; srcLOD = 0; dstXInBytes = 0; dstY = 0; dstZ = 0; dstLOD = 0; } MEMCPY_SETTERS; void execute() const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3D, (this)); } void execute_async(const stream &s) const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DAsync, (this, s.handle())); } }; #endif #if CUDAPP_CUDA_VERSION >= 4000 struct memcpy_3d_peer : public CUDA_MEMCPY3D_PEER { memcpy_3d_peer() { srcXInBytes = 0; srcY = 0; srcZ = 0; srcLOD = 0; dstXInBytes = 0; dstY = 0; dstZ = 0; dstLOD = 0; } MEMCPY_SETTERS; void set_src_context(context const &ctx) { srcContext = ctx.handle(); } void set_dst_context(context const &ctx) { dstContext = ctx.handle(); } void execute() const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeer, (this)); } void execute_async(const stream &s) const { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeerAsync, (this, s.handle())); } }; #endif // }}} // {{{ host memory inline void *mem_alloc_host(size_t size, unsigned flags=0) { void *m_data; #if CUDAPP_CUDA_VERSION >= 2020 CUDAPP_CALL_GUARDED(cuMemHostAlloc, (&m_data, size, flags)); #else if (flags != 0) throw pycuda::error("mem_alloc_host", CUDA_ERROR_INVALID_VALUE, "nonzero flags in mem_alloc_host not allowed in CUDA 2.1 and older"); CUDAPP_CALL_GUARDED(cuMemAllocHost, (&m_data, size)); #endif return m_data; } inline void mem_free_host(void *ptr) { CUDAPP_CALL_GUARDED_CLEANUP(cuMemFreeHost, (ptr)); } struct host_allocation : public boost::noncopyable, public context_dependent { private: bool m_valid; void *m_data; public: host_allocation(size_t bytesize, unsigned flags=0) : m_valid(true), m_data(mem_alloc_host(bytesize, flags)) { } ~host_allocation() { if (m_valid) free(); } void free() { if (m_valid) { try { scoped_context_activation ca(get_context()); mem_free_host(m_data); } CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(host_allocation); release_context(); m_valid = false; } else throw pycuda::error("host_allocation::free", CUDA_ERROR_INVALID_HANDLE); } void *data() { return m_data; } #if CUDAPP_CUDA_VERSION >= 2020 CUdeviceptr get_device_pointer() { CUdeviceptr result; CUDAPP_CALL_GUARDED(cuMemHostGetDevicePointer, (&result, m_data, 0)); return result; } #endif #if CUDAPP_CUDA_VERSION >= 3020 unsigned int get_flags() { unsigned int flags; CUDAPP_CALL_GUARDED(cuMemHostGetFlags, (&flags, m_data)); return flags; } #endif }; // }}} // {{{ event class event : public boost::noncopyable, public context_dependent { private: CUevent m_event; public: event(unsigned int flags=0) { CUDAPP_CALL_GUARDED(cuEventCreate, (&m_event, flags)); } ~event() { try { scoped_context_activation ca(get_context()); CUDAPP_CALL_GUARDED_CLEANUP(cuEventDestroy, 
            (m_event));
      }
      CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(event);
    }

    event *record(py::object stream_py)
    {
      CUstream s_handle;
      if (stream_py.ptr() != Py_None)
      {
        const stream &s = py::extract<const stream &>(stream_py);
        s_handle = s.handle();
      }
      else
        s_handle = 0;

      CUDAPP_CALL_GUARDED(cuEventRecord, (m_event, s_handle));
      return this;
    }

    CUevent handle() const
    { return m_event; }

    event *synchronize()
    {
      CUDAPP_CALL_GUARDED_THREADED(cuEventSynchronize, (m_event));
      return this;
    }

    bool query() const
    {
      CUDAPP_PRINT_CALL_TRACE("cuEventQuery");
      CUresult result = cuEventQuery(m_event);
      switch (result)
      {
        case CUDA_SUCCESS:
          return true;
        case CUDA_ERROR_NOT_READY:
          return false;
        default:
          CUDAPP_PRINT_ERROR_TRACE("cuEventQuery", result);
          throw error("cuEventQuery", result);
      }
    }

    float time_since(event const &start)
    {
      float result;
      CUDAPP_CALL_GUARDED(cuEventElapsedTime, (&result, start.m_event, m_event));
      return result;
    }

    float time_till(event const &end)
    {
      float result;
      CUDAPP_CALL_GUARDED(cuEventElapsedTime, (&result, m_event, end.m_event));
      return result;
    }
  };

#if CUDAPP_CUDA_VERSION >= 3020
  inline void stream::wait_for_event(const event &evt)
  { CUDAPP_CALL_GUARDED(cuStreamWaitEvent, (m_stream, evt.handle(), 0)); }
#endif

  // }}}

  // {{{ profiler
#if CUDAPP_CUDA_VERSION >= 4000 && !defined(__APPLE__)
  inline void initialize_profiler(
      const char *config_file,
      const char *output_file,
      CUoutput_mode output_mode)
  {
    CUDAPP_CALL_GUARDED(cuProfilerInitialize, (config_file, output_file, output_mode));
  }

  inline void start_profiler()
  { CUDAPP_CALL_GUARDED(cuProfilerStart, ()); }

  inline void stop_profiler()
  { CUDAPP_CALL_GUARDED(cuProfilerStop, ()); }
#endif
  // }}}
}

#endif

// vim: foldmethod=marker
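/* Illustrative usage sketch (not part of the header): how the pieces above fit
 * together for a driver-API launch timed with events. The file name
 * "vecadd.cubin", the kernel name "vecadd" and the launch geometry are made-up
 * placeholders; error handling beyond the guard macros is omitted.
 *
 *   using namespace pycuda;
 *
 *   init(0);
 *   std::auto_ptr<device> dev(make_device(0));
 *   boost::shared_ptr<context> ctx = dev->make_context(0);
 *
 *   boost::shared_ptr<module> mod(module_from_file("vecadd.cubin"));
 *   function f = mod->get_function("vecadd");
 *
 *   size_t n = 1024;
 *   CUdeviceptr d_buf = mem_alloc(n * sizeof(float));
 *
 *   // pre-CUDA-4 launch interface wrapped above: explicit parameter buffer
 *   f.param_setv(0, &d_buf, sizeof(d_buf));
 *   f.param_set_size(sizeof(d_buf));
 *   f.set_block_shape(256, 1, 1);
 *
 *   event start, stop;
 *   start.record(boost::python::object());   // None selects the default stream
 *   f.launch_grid(int(n / 256), 1);
 *   stop.record(boost::python::object());
 *   stop.synchronize();
 *   std::cout << "kernel took " << stop.time_since(start) << " ms" << std::endl;
 *
 *   mem_free(d_buf);
 *   context::pop();
 */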