#include <vector>
#include <stdexcept>
#include <climits>
#include <cassert>

#include "tools.hpp"
#include "wrap_helpers.hpp"

#include <cuda.hpp>

#include <boost/cstdint.hpp>
#include <boost/foreach.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_map.hpp>




/* from http://graphics.stanford.edu/~seander/bithacks.html */
static const char log_table_8[] =
{
  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
};

static inline unsigned bitlog2_16(boost::uint16_t v)
{
  if (unsigned long t = v >> 8)
    return 8+log_table_8[t];
  else
    return log_table_8[v];
}

static inline unsigned bitlog2_32(boost::uint32_t v)
{
  if (boost::uint16_t t = v >> 16)
    return 16+bitlog2_16(t);
  else
    return bitlog2_16(v);
}

static inline unsigned bitlog2(unsigned long v)
{
#if (ULONG_MAX != 4294967295)
  if (boost::uint32_t t = v >> 32)
    return 32+bitlog2_32(t);
  else
#endif
    return bitlog2_32(v);
}




namespace
{
  class cuda_allocator : public cuda::context_dependent
  {
    public:
      typedef CUdeviceptr pointer;
      typedef unsigned long size_type;

      pointer allocate(size_type s)
      {
        cuda::scoped_context_activation ca(get_context());
        return cuda::mem_alloc(s);
      }

      void free(pointer p)
      {
        cuda::scoped_context_activation ca(get_context());
        cuda::mem_free(p);
      }
  };




  template <class Allocator>
  class memory_pool : public cuda::explicit_context_dependent
  {
    public:
      typedef typename Allocator::pointer pointer;
      typedef typename Allocator::size_type size_type;

    private:
      typedef boost::uint32_t bin_nr_t;
      typedef std::vector<pointer> bin_t;

      typedef boost::ptr_map<bin_nr_t, bin_t> container_t;
      container_t m_container;
      typedef typename container_t::value_type bin_pair_t;

      Allocator m_allocator;

      // A held block is one that's been released by the application, but that
      // we are keeping around to dish out again.
      unsigned m_held_blocks;

      // An active block is one that is in use by the application.
      unsigned m_active_blocks;

    public:
      memory_pool()
        : m_held_blocks(0), m_active_blocks(0)
      { }

      ~memory_pool()
      { free_held(); }

      static const unsigned mantissa_bits = 2;
      static const unsigned mantissa_mask = (1 << mantissa_bits) - 1;

      static size_type signed_left_shift(size_type x, signed shift_amount)
      {
        if (shift_amount < 0)
          return x >> -shift_amount;
        else
          return x << shift_amount;
      }

      static size_type signed_right_shift(size_type x, signed shift_amount)
      {
        if (shift_amount < 0)
          return x << -shift_amount;
        else
          return x >> shift_amount;
      }

      // Map a size onto a bin number: the position of the top set bit forms
      // the exponent, the next mantissa_bits bits below it form the mantissa.
      static bin_nr_t bin_number(size_type size)
      {
        signed l = bitlog2(size);
        size_type shifted = signed_right_shift(size, l-signed(mantissa_bits));
        if (size && (shifted & (1 << mantissa_bits)) == 0)
          throw std::runtime_error("memory_pool::bin_number: bitlog2 fault");
        size_type chopped = shifted & mantissa_mask;

        return l << mantissa_bits | chopped;
      }

      // The largest size that still maps onto the given bin, i.e. the size
      // that is actually allocated for any request landing in that bin.
      static size_type alloc_size(bin_nr_t bin)
      {
        bin_nr_t exponent = bin >> mantissa_bits;
        bin_nr_t mantissa = bin & mantissa_mask;

        size_type ones = signed_left_shift(1,
            signed(exponent)-signed(mantissa_bits)
            );
        if (ones) ones -= 1;

        size_type head = signed_left_shift(
            (1<<mantissa_bits) | mantissa,
            signed(exponent)-signed(mantissa_bits));
        return head | ones;
      }

    protected:
      bin_t &get_bin(bin_nr_t bin_nr)
      {
        typename container_t::iterator it = m_container.find(bin_nr);
        if (it == m_container.end())
        {
          bin_t *new_bin = new bin_t;
          m_container.insert(bin_nr, new_bin);
          return *new_bin;
        }
        else
          return *it->second;
      }

      void inc_held_blocks()
      {
        if (m_held_blocks == 0)
          acquire_context();
        ++m_held_blocks;
      }

      void dec_held_blocks()
      {
        --m_held_blocks;
        if (m_held_blocks == 0)
          release_context();
      }

    public:
      pointer allocate(size_type size)
      {
        bin_nr_t bin_nr = bin_number(size);
        bin_t &bin = get_bin(bin_nr);

        if (bin.size())
          return pop_block_from_bin(bin, size);

        size_type alloc_sz = alloc_size(bin_nr);
        assert(bin_number(alloc_sz) == bin_nr);

        try { return get_from_allocator(alloc_sz); }
        catch (cuda::error &e)
        {
          // Not OOM? Propagate.
          if (e.code() != CUDA_ERROR_OUT_OF_MEMORY)
            throw;
        }

        pycuda::run_python_gc();

        if (bin.size())
          return pop_block_from_bin(bin, size);

        while (try_to_free_memory())
        {
          try { return get_from_allocator(alloc_sz); }
          catch (cuda::error &e)
          {
            // Not OOM? Propagate.
            if (e.code() != CUDA_ERROR_OUT_OF_MEMORY)
              throw;
          }
        }

        throw cuda::error(
            "memory_pool::allocate",
            CUDA_ERROR_OUT_OF_MEMORY,
            "failed to free memory for allocation");
      }

      void free(pointer p, size_type size)
      {
        --m_active_blocks;

        inc_held_blocks();
        get_bin(bin_number(size)).push_back(p);

        if (m_active_blocks == 0)
        {
          // last deallocation, allow context to go away.
          free_held();
        }
      }

      void free_held()
      {
        BOOST_FOREACH(bin_pair_t bin_pair, m_container)
        {
          bin_t &bin = *bin_pair.second;

          while (bin.size())
          {
            m_allocator.free(bin.back());
            bin.pop_back();

            dec_held_blocks();
          }
        }

        assert(m_held_blocks == 0);
      }

      unsigned active_blocks()
      { return m_active_blocks; }

      unsigned held_blocks()
      { return m_held_blocks; }

      bool try_to_free_memory()
      {
        BOOST_FOREACH(bin_pair_t bin_pair,
            // free largest stuff first
            std::make_pair(m_container.rbegin(), m_container.rend()))
        {
          bin_t &bin = *bin_pair.second;

          if (bin.size())
          {
            m_allocator.free(bin.back());
            bin.pop_back();

            dec_held_blocks();

            return true;
          }
        }

        return false;
      }

    private:
      pointer get_from_allocator(size_type alloc_sz)
      {
        pointer result = m_allocator.allocate(alloc_sz);
        ++m_active_blocks;

        return result;
      }

      pointer pop_block_from_bin(bin_t &bin, size_type size)
      {
        pointer result = bin.back();
        bin.pop_back();

        dec_held_blocks();
        ++m_active_blocks;

        return result;
      }
  };




  class pooled_device_allocation
    : public cuda::context_dependent, public boost::noncopyable
  {
    private:
      typedef memory_pool<cuda_allocator> pool_type;
      boost::shared_ptr<pool_type> m_pool;

      CUdeviceptr m_devptr;
      unsigned long m_size;
      bool m_valid;

    public:
      typedef pool_type::size_type size_type;

      pooled_device_allocation(boost::shared_ptr<pool_type> p,
          CUdeviceptr devptr, size_type size)
        : m_pool(p), m_devptr(devptr), m_size(size), m_valid(true)
      { }

      void free()
      {
        if (m_valid)
        {
          m_pool->free(m_devptr, m_size);
          // mark as freed so the destructor does not free it a second time
          m_valid = false;
        }
        else
          throw cuda::error("pooled_device_allocation::free",
              CUDA_ERROR_INVALID_HANDLE);
      }

      ~pooled_device_allocation()
      {
        if (m_valid)
          m_pool->free(m_devptr, m_size);
      }

      operator CUdeviceptr() const
      { return m_devptr; }

      unsigned long size() const
      { return m_size; }
  };




  pooled_device_allocation *pool_allocate(
      boost::shared_ptr<memory_pool<cuda_allocator> > pool,
      memory_pool<cuda_allocator>::size_type sz)
  {
    return new pooled_device_allocation(pool, pool->allocate(sz), sz);
  }




  PyObject *pooled_device_allocation_to_long(pooled_device_allocation const &da)
  {
    return PyLong_FromUnsignedLong((CUdeviceptr) da);
  }
}




void pycuda_expose_tools()
{
  namespace py = boost::python;

  py::def("bitlog2", bitlog2);

  {
    typedef memory_pool<cuda_allocator> cl;
    py::class_<cl, boost::noncopyable, boost::shared_ptr<cl> >("DeviceMemoryPool")
      .DEF_SIMPLE_METHOD(free_held)
      .def("allocate", pool_allocate,
          py::return_value_policy<py::manage_new_object>())
      .add_property("held_blocks", &cl::held_blocks)
      .add_property("active_blocks", &cl::active_blocks)
      .DEF_SIMPLE_METHOD(bin_number)
      .DEF_SIMPLE_METHOD(alloc_size)
      .staticmethod("bin_number")
      .staticmethod("alloc_size")
      ;
  }

  {
    typedef pooled_device_allocation cl;
    py::class_<cl, boost::noncopyable>(
        "PooledDeviceAllocation", py::no_init)
      .DEF_SIMPLE_METHOD(free)
      .def("__int__", &cl::operator CUdeviceptr)
      .def("__long__", pooled_device_allocation_to_long)
      .def("__len__", &cl::size)
      ;

    py::implicitly_convertible<pooled_device_allocation, CUdeviceptr>();
  }
}
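// A worked example of the binning scheme above; the values follow directly
// from bin_number()/alloc_size() as defined in memory_pool, and the short
// usage sketch at the end is illustrative only (the variable names are
// hypothetical, not part of the exposed API).
//
//   bin_number(1000): bitlog2(1000) == 9, so the request is shifted right by
//   9 - mantissa_bits == 7 bits, giving 0b111; the low two bits (0b11) are
//   the mantissa, so the bin is (9 << 2) | 3 == 39.
//
//   alloc_size(39): exponent 9, mantissa 3, head == 0b111 << 7 == 896,
//   ones == 127, so every request in bin 39 is served by a 1023-byte block,
//   and bin_number(1023) == 39 again, which is exactly what allocate()
//   asserts.
//
//   memory_pool<cuda_allocator> pool;      // requires a current CUDA context
//   CUdeviceptr p = pool.allocate(1000);   // grabs a 1023-byte block
//   pool.free(p, 1000);                    // last active block, so held
//                                          // memory is released as well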