Newer
Older
#include "tools.hpp"
Andreas Kloeckner
committed
#include <boost/ptr_container/ptr_map.hpp>
#include <boost/cstdint.hpp>
#include <boost/foreach.hpp>
#include <climits>
/* Lookup table: entry i holds the index of the highest set bit of the byte
 * value i (entries 0 and 1 are both 0).
 * Taken from http://graphics.stanford.edu/~seander/bithacks.html */
static const char log_table_8[] =
{
  /*   0..1   */ 0, 0,
  /*   2..3   */ 1, 1,
  /*   4..7   */ 2, 2, 2, 2,
  /*   8..15  */ 3, 3, 3, 3,  3, 3, 3, 3,
  /*  16..31  */ 4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
  /*  32..63  */ 5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
  /*  64..95  */ 6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,
  /*  96..127 */ 6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,
  /* 128..159 */ 7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,
  /* 160..191 */ 7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,
  /* 192..223 */ 7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,
  /* 224..255 */ 7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7,  7, 7, 7, 7
};
Andreas Kloeckner
committed
// Index of the highest set bit of a 16-bit value, looked up one byte at a
// time in log_table_8.
static inline unsigned bitlog2_16(boost::uint16_t v)
{
  const unsigned long high_byte = v >> 8;

  // No bits in the upper byte: v itself is a valid table index.
  if (high_byte == 0)
    return log_table_8[v];

  return 8 + log_table_8[high_byte];
}
Andreas Kloeckner
committed
// Index of the highest set bit of a 32-bit value, obtained by delegating
// to bitlog2_16 on whichever 16-bit half holds the leading bit.
static inline unsigned bitlog2_32(boost::uint32_t v)
{
  // A non-zero upper half contains the leading bit; offset its result by 16.
  if (boost::uint16_t t = v >> 16)
    return 16+bitlog2_16(t);
  else
    return bitlog2_16(v);
}
Andreas Kloeckner
committed
static inline unsigned bitlog2(unsigned long v)
#if (ULONG_MAX != 4294967295)
Andreas Kloeckner
committed
if (boost::uint32_t t = v >> 32)
return bitlog2_32(v);
}
namespace
{
// Allocator used by the memory pool: forwards to cuda::mem_alloc and
// cuda::mem_free, each call made under a scoped_context_activation for the
// context returned by get_context().
class cuda_allocator : public cuda::context_dependent
{
  public:
    typedef CUdeviceptr pointer;
    typedef unsigned long size_type;

    // Obtain a block of size s from cuda::mem_alloc.
    pointer allocate(size_type s)
    {
      cuda::scoped_context_activation ca(get_context());
      return cuda::mem_alloc(s);
    }

    // Hand block p back through cuda::mem_free.
    void free(pointer p)
    {
      cuda::scoped_context_activation ca(get_context());
      cuda::mem_free(p);
    }
};
template<class Allocator>
class memory_pool : public cuda::explicit_context_dependent
{
public:
typedef typename Allocator::pointer pointer;
typedef typename Allocator::size_type size_type;
private:
Andreas Kloeckner
committed
typedef boost::uint32_t bin_nr_t;
typedef std::vector<pointer> bin_t;
typedef boost::ptr_map<bin_nr_t, bin_t > container_t;
container_t m_container;
typedef typename container_t::value_type bin_pair_t;
// A held block is one that's been released by the application, but that
// we are keeping around to dish out again.
unsigned m_held_blocks;
// An active block is one that is in use by the application.
unsigned m_active_blocks;
: m_held_blocks(0), m_active_blocks(0)
// Destructor: calls free_held() before the pool goes away.
~memory_pool()
{
free_held();
}
Andreas Kloeckner
committed
static const unsigned mantissa_bits = 2;
static const unsigned mantissa_mask = (1 << mantissa_bits) - 1;
// Shift x left by shift_amount bits; a negative shift_amount shifts right
// by its magnitude instead.
static size_type signed_left_shift(size_type x, signed shift_amount)
{
  if (shift_amount < 0)
    return x >> -shift_amount;
  else
    return x << shift_amount;
}
// Shift x right by shift_amount bits; a negative shift_amount shifts left
// by its magnitude instead.
static size_type signed_right_shift(size_type x, signed shift_amount)
{
  return (shift_amount < 0)
    ? x << -shift_amount
    : x >> shift_amount;
}
// Map a size to its bin number: the position of the leading bit goes into
// the upper bits, and the mantissa_bits bits that follow the leading bit go
// into the lower bits.
// Throws std::runtime_error if, for a non-zero size, the leading bit does
// not end up where bitlog2 said it would be.
static bin_nr_t bin_number(size_type size)
{
  const signed exponent = bitlog2(size);

  // Align the value so that its leading bit sits at position mantissa_bits.
  const size_type aligned =
    signed_right_shift(size, exponent - signed(mantissa_bits));

  const bool leading_bit_missing = (aligned & (1 << mantissa_bits)) == 0;
  if (size && leading_bit_missing)
    throw std::runtime_error("memory_pool::bin_number: bitlog2 fault");

  const size_type mantissa = aligned & mantissa_mask;
  return exponent << mantissa_bits | mantissa;
}
// Compute the size to request from the allocator for a given bin number:
// the leading bit and mantissa encoded in `bin` are shifted back into
// position and every bit below them is set to 1.
// Throws std::runtime_error if the two bit groups overlap.
static size_type alloc_size(bin_nr_t bin)
{
  bin_nr_t exponent = bin >> mantissa_bits;
  bin_nr_t mantissa = bin & mantissa_mask;

  // All-ones filler for the bits below the mantissa (stays 0 when the
  // shift leaves no such bits).
  size_type ones = signed_left_shift(1,
      signed(exponent)-signed(mantissa_bits)
      );
  if (ones) ones -= 1;

  // Leading bit followed by the mantissa, moved to its original position.
  size_type head = signed_left_shift(
      (1<<mantissa_bits) | mantissa,
      signed(exponent)-signed(mantissa_bits));
  if (ones & head)
    throw std::runtime_error("memory_pool::alloc_size: bit-counting fault");
  return head | ones;
}
protected:
// Return the bin registered under bin_nr, creating and registering an empty
// one on first use.
bin_t &get_bin(bin_nr_t bin_nr)
{
  typename container_t::iterator pos = m_container.find(bin_nr);
  if (pos != m_container.end())
    return *pos->second;

  // Not present yet: m_container takes over the freshly created bin.
  bin_t *fresh_bin = new bin_t;
  m_container.insert(bin_nr, fresh_bin);
  return *fresh_bin;
}
// Count one more held block; the transition from zero held blocks calls
// acquire_context() first.
void inc_held_blocks()
{
  const bool first_held_block = (m_held_blocks == 0);
  if (first_held_block)
    acquire_context();

  m_held_blocks += 1;
}
// Count one held block less; when the count reaches zero,
// release_context() is called.
void dec_held_blocks()
{
  if (--m_held_blocks == 0)
    release_context();
}
Andreas Kloeckner
committed
bin_nr_t bin_nr = bin_number(size);
bin_t &bin = get_bin(bin_nr);
if (bin.size())
return pop_block_from_bin(bin, size);
size_type alloc_sz = alloc_size(bin_nr);
assert(bin_number(alloc_sz) == bin);
try { return get_from_allocator(alloc_sz); }
catch (cuda::error &e)
// Not OOM? Propagate.
if (e.code() != CUDA_ERROR_OUT_OF_MEMORY)
throw;
pycuda::run_python_gc();
if (bin.size())
return pop_block_from_bin(bin, size);
while (try_to_free_memory())
{
try { return get_from_allocator(alloc_sz); }
catch (cuda::error &e)
// Not OOM? Propagate.
if (e.code() != CUDA_ERROR_OUT_OF_MEMORY)
throw;
throw cuda::error(
"memory_pool::allocate",
CUDA_ERROR_OUT_OF_MEMORY,
"failed to free memory for allocation");
}
// Store block `p` in the bin selected by bin_number(size); when
// m_active_blocks is 0 afterwards, free_held() is called.
// NOTE(review): this extract appears to have lost lines (no visible update
// of m_active_blocks or of the held-block count, and no closing brace for
// the function) -- restore them from version control before relying on it.
void free(pointer p, size_type size)
{
Andreas Kloeckner
committed
get_bin(bin_number(size)).push_back(p);
if (m_active_blocks == 0)
{
// last deallocation, allow context to go away.
free_held();
}
Andreas Kloeckner
committed
// Walks every bin in m_container and passes each stored block to
// m_allocator.free(), then asserts that no held blocks remain.
// NOTE(review): presumably the body of free_held(); the function header,
// the braces and (judging by the final assert) the held-block bookkeeping
// are missing from this extract -- restore them from version control.
BOOST_FOREACH(bin_pair_t bin_pair, m_container)
Andreas Kloeckner
committed
bin_t &bin = *bin_pair.second;
while (bin.size())
Andreas Kloeckner
committed
m_allocator.free(bin.back());
bin.pop_back();
assert(m_held_blocks == 0);
// Number of blocks currently in use by the application.
unsigned active_blocks()
{
  return m_active_blocks;
}
// Number of released blocks the pool is keeping around for reuse.
unsigned held_blocks()
{
  return m_held_blocks;
}
Andreas Kloeckner
committed
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
// Release a single held block through the allocator.
// Bins are visited from the highest bin number downwards so that the
// largest blocks go first. Returns true if a block was released, false if
// every bin is empty.
bool try_to_free_memory()
{
  BOOST_FOREACH(bin_pair_t entry,
      std::make_pair(m_container.rbegin(), m_container.rend()))
  {
    bin_t &candidates = *entry.second;
    if (candidates.empty())
      continue;

    m_allocator.free(candidates.back());
    candidates.pop_back();
    dec_held_blocks();
    return true;
  }

  return false;
}
private:
// Request a new block of alloc_sz from the allocator and count it as
// active. The counter is left untouched if the allocator throws.
pointer get_from_allocator(size_type alloc_sz)
{
  const pointer fresh_block = m_allocator.allocate(alloc_sz);
  m_active_blocks += 1;
  return fresh_block;
}
// Take the last block stored in `bin` and move it from the held count to
// the active count. The `size` argument is not used.
pointer pop_block_from_bin(bin_t &bin, size_type size)
{
  const pointer reused_block = bin.back();
  bin.pop_back();

  dec_held_blocks();
  m_active_blocks += 1;

  return reused_block;
}
};
// Handle for one block obtained from a memory pool. The block is handed
// back to the pool either by an explicit free() or, if that never happened,
// by the destructor.
class pooled_device_allocation
  : public cuda::context_dependent, public boost::noncopyable
{
  private:
    typedef memory_pool<cuda_allocator> pool_type;

    boost::shared_ptr<pool_type> m_pool;
    CUdeviceptr m_devptr;
    unsigned long m_size;
    // True until the block has been handed back to the pool.
    bool m_valid;

  public:
    typedef pool_type::size_type size_type;

    // Wrap block `devptr`, requested from pool `p` with `size`.
    pooled_device_allocation(boost::shared_ptr<pool_type> p,
        CUdeviceptr devptr, size_type size)
      : m_pool(p), m_devptr(devptr), m_size(size), m_valid(true)
    {
    }

    // Hand the block back to the pool.
    // Throws cuda::error(CUDA_ERROR_INVALID_HANDLE) if it was already freed.
    void free()
    {
      if (!m_valid)
        throw cuda::error("pooled_device_allocation::free", CUDA_ERROR_INVALID_HANDLE);

      // Invalidate first: the destructor must never release the block a
      // second time, not even if the pool's free() throws.
      m_valid = false;
      m_pool->free(m_devptr, m_size);
    }

    ~pooled_device_allocation()
    {
      if (m_valid)
        m_pool->free(m_devptr, m_size);
    }

    // Size that was passed to the constructor.
    unsigned long size() const
    { return m_size; }

    // Conversion to the wrapped device pointer.
    operator CUdeviceptr() const
    { return m_devptr; }
};
// Allocate `sz` from `pool` and wrap the block in a new
// pooled_device_allocation, which the caller owns.
pooled_device_allocation *pool_allocate(
    boost::shared_ptr<memory_pool<cuda_allocator> > pool,
    memory_pool<cuda_allocator>::size_type sz)
{
  CUdeviceptr devptr = pool->allocate(sz);

  try
  {
    return new pooled_device_allocation(pool, devptr, sz);
  }
  catch (...)
  {
    // Constructing the wrapper failed: give the block back so the pool
    // does not count it as active forever.
    pool->free(devptr, sz);
    throw;
  }
}
// Build a Python long from the device pointer wrapped by `da`.
PyObject *pooled_device_allocation_to_long(pooled_device_allocation const &da)
{
  const CUdeviceptr devptr = static_cast<CUdeviceptr>(da);
  return PyLong_FromUnsignedLong(devptr);
}
}
void pycuda_expose_tools()
{
namespace py = boost::python;
py::def("bitlog2", bitlog2);
{
typedef memory_pool<cuda_allocator> cl;
py::class_<cl, boost::noncopyable, boost::shared_ptr<cl> >("DeviceMemoryPool")
.DEF_SIMPLE_METHOD(free_held)
.def("allocate", pool_allocate,
py::return_value_policy<py::manage_new_object>())
.add_property("held_blocks", &cl::held_blocks)
.add_property("active_blocks", &cl::active_blocks)
Andreas Kloeckner
committed
.DEF_SIMPLE_METHOD(bin_number)
.DEF_SIMPLE_METHOD(alloc_size)
.staticmethod("bin_number")
.staticmethod("alloc_size")
;
}
{
typedef pooled_device_allocation cl;
py::class_<cl, boost::noncopyable>(
"PooledDeviceAllocation", py::no_init)
.def("__int__", &cl::operator CUdeviceptr)
.def("__long__", pooled_device_allocation_to_long)
.def("__len__", &cl::size)
;
py::implicitly_convertible<pooled_device_allocation, CUdeviceptr>();
}
}