From 0860b1d1a0ed3cba3734c6043ba9a49f6c138842 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 31 Mar 2021 13:44:24 -0500 Subject: [PATCH 1/2] Update mempool to version from PyOpenCL (closes gh-282) --- doc/source/util.rst | 46 ++++++---- src/cpp/mempool.hpp | 186 ++++++++++++++++++++++++++++------------ src/wrapper/mempool.cpp | 24 +++--- 3 files changed, 173 insertions(+), 83 deletions(-) diff --git a/doc/source/util.rst b/doc/source/util.rst index f83907d6..c7998994 100644 --- a/doc/source/util.rst +++ b/doc/source/util.rst @@ -6,18 +6,18 @@ Automatic Initialization .. module:: pycuda.autoinit -The module :mod:`pycuda.autoinit`, when imported, automatically performs +The module :mod:`pycuda.autoinit`, when imported, automatically performs all the steps necessary to get CUDA ready for submission of compute kernels. It uses :func:`pycuda.tools.make_default_context` to create a compute context. .. data:: device An instance of :class:`pycuda.driver.Device` that was used for automatic - initialization. + initialization. .. data:: context - A default-constructed instance of :class:`pycuda.driver.Context` + A default-constructed instance of :class:`pycuda.driver.Context` on :data:`device`. This context is created by calling :func:`pycuda.tools.make_default_context`. @@ -74,7 +74,7 @@ Kernel Caching .. function:: context_dependent_memoize(func) - This decorator caches the result of the decorated function, *if* a + This decorator caches the result of the decorated function, *if* a subsequent occurs in the same :class:`pycuda.driver.Context`. This is useful for caching of kernels. @@ -98,7 +98,7 @@ Device Metadata and Occupancy ----------------------------- .. class:: DeviceData(dev=None) - + Gives access to more information on a device than is available through :meth:`pycuda.driver.Device.get_attribute`. If `dev` is `None`, it defaults to the device returned by :meth:`pycuda.driver.Context.get_device`. @@ -120,7 +120,7 @@ Device Metadata and Occupancy .. method:: align_bytes(word_size=4) - The distance between global memory base addresses that + The distance between global memory base addresses that allow accesses of word-size `word_size` bytes to get coalesced. .. method:: align(bytes, word_size=4) @@ -134,7 +134,7 @@ Device Metadata and Occupancy .. method:: align_dtype(elements, dtype_size) - Round up `elements` to the next alignment boundary + Round up `elements` to the next alignment boundary as given by :meth:`align_bytes`, where each element is assumed to be `dtype_size` bytes large. @@ -146,7 +146,7 @@ Device Metadata and Occupancy .. class:: OccupancyRecord(devdata, threads, shared_mem=0, registers=0) - Calculate occupancy for a given kernel workload characterized by + Calculate occupancy for a given kernel workload characterized by * thread count of `threads` * shared memory use of `shared_mem` bytes @@ -183,7 +183,7 @@ fresh memory area is allocated for each intermediate result. Memory pools are a remedy for this problem based on the observation that often many of the block allocations are of the same sizes as previously used ones. -Then, instead of fully returning the memory to the system and incurring the +Then, instead of fully returning the memory to the system and incurring the associated reallocation overhead, the pool holds on to the memory and uses it to satisfy future allocations of similarly-sized blocks. 
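[Note, not part of the patch] As a quick orientation for the documentation change above, here is a minimal usage sketch of the pool from Python. It assumes a CUDA-capable device, the standard pycuda.autoinit/gpuarray API, and the managed_bytes/active_bytes attributes introduced by this patch; names and sizes are illustrative only.

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a default context
    import pycuda.gpuarray as gpuarray
    from pycuda.tools import DeviceMemoryPool

    pool = DeviceMemoryPool()

    # Route gpuarray allocations through the pool instead of raw mem_alloc.
    a = gpuarray.to_gpu(np.arange(1 << 20, dtype=np.float32),
                        allocator=pool.allocate)

    # "active" memory is what the application currently uses, "held" memory
    # is what the pool keeps around for reuse; "managed" covers both.
    print(pool.active_blocks, pool.held_blocks)
    print(pool.active_bytes, pool.managed_bytes)  # attributes added in this patch

    del a             # block goes back to the pool (held), not to CUDA
    pool.free_held()  # actually return held memory to the driver
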
The pool reacts appropriately to out-of-memory conditions as long as all memory allocations @@ -198,8 +198,8 @@ Device-based Memory Pool An object representing a :class:`DeviceMemoryPool`-based allocation of linear device memory. Once this object is deleted, its associated device - memory is freed. - :class:`PooledDeviceAllocation` instances can be cast to :class:`int` + memory is freed. + :class:`PooledDeviceAllocation` instances can be cast to :class:`int` (and :class:`long`), yielding the starting address of the device memory allocated. @@ -213,7 +213,7 @@ Device-based Memory Pool .. class:: DeviceMemoryPool - A memory pool for linear device memory as allocated using + A memory pool for linear device memory as allocated using :func:`pycuda.driver.mem_alloc`. (see :ref:`mempool`) .. attribute:: held_blocks @@ -225,6 +225,20 @@ Device-based Memory Pool The number of blocks in active use that have been allocated through this pool. + .. attribute:: managed_bytes + + "Managed" memory is "active" and "held" memory. + + .. versionadded: 2021.1 + + .. attribute:: active_bytes + + "Active" bytes are bytes under the control of the application. + This may be smaller than the actual allocated size reflected + in :attr:`managed_bytes`. + + .. versionadded: 2021.1 + .. method:: allocate(size) Return a :class:`PooledDeviceAllocation` of *size* bytes. @@ -248,7 +262,7 @@ Memory Pool for pagelocked memory An object representing a :class:`PageLockedMemoryPool`-based allocation of linear device memory. Once this object is deleted, its associated device - memory is freed. + memory is freed. .. method:: free @@ -260,12 +274,12 @@ Memory Pool for pagelocked memory .. class:: PageLockedAllocator(flags=0) - Specifies the set of :class:`pycuda.driver.host_alloc_flags` used in its + Specifies the set of :class:`pycuda.driver.host_alloc_flags` used in its associated :class:`PageLockedMemoryPool`. .. class:: PageLockedMemoryPool(allocator=PageLockedAllocator()) - A memory pool for pagelocked host memory as allocated using + A memory pool for pagelocked host memory as allocated using :func:`pycuda.driver.pagelocked_empty`. (see :ref:`mempool`) .. attribute:: held_blocks @@ -279,7 +293,7 @@ Memory Pool for pagelocked memory .. method:: allocate(shape, dtype, order="C") - Return an uninitialized ("empty") :class:`numpy.ndarray` with the given + Return an uninitialized ("empty") :class:`numpy.ndarray` with the given *shape*, *dtype*, and *order*. This array will be backed by a :class:`PooledHostAllocation`, which can be found as the ``.base`` attribute of the array. diff --git a/src/cpp/mempool.hpp b/src/cpp/mempool.hpp index be88f13f..44f0fd64 100644 --- a/src/cpp/mempool.hpp +++ b/src/cpp/mempool.hpp @@ -1,24 +1,61 @@ // Abstract memory pool implementation - - +// +// Copyright (C) 2009-17 Andreas Kloeckner +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. #ifndef _AFJDFJSDFSD_PYGPU_HEADER_SEEN_MEMPOOL_HPP #define _AFJDFJSDFSD_PYGPU_HEADER_SEEN_MEMPOOL_HPP - - -#include -#include -#include +#include +#include +#include +#include +#include +#include #include "bitlog.hpp" - - namespace PYGPU_PACKAGE { + // https://stackoverflow.com/a/44175911 + class mp_noncopyable { + public: + mp_noncopyable() = default; + ~mp_noncopyable() = default; + + private: + mp_noncopyable(const mp_noncopyable&) = delete; + mp_noncopyable& operator=(const mp_noncopyable&) = delete; + }; + +#ifdef PYGPU_PYCUDA +#define PYGPU_SHARED_PTR boost::shared_ptr +#else +#define PYGPU_SHARED_PTR std::shared_ptr +#endif + template inline T signed_left_shift(T x, signed shift_amount) { @@ -43,38 +80,57 @@ namespace PYGPU_PACKAGE +#define always_assert(cond) \ + do { \ + if (!(cond)) \ + throw std::logic_error("mem pool assertion violated: " #cond); \ + } while (false); + + template - class memory_pool + class memory_pool : mp_noncopyable { public: typedef typename Allocator::pointer_type pointer_type; typedef typename Allocator::size_type size_type; private: - typedef boost::uint32_t bin_nr_t; + typedef uint32_t bin_nr_t; typedef std::vector bin_t; - typedef boost::ptr_map container_t; + typedef std::map container_t; container_t m_container; typedef typename container_t::value_type bin_pair_t; - std::auto_ptr m_allocator; + std::unique_ptr m_allocator; // A held block is one that's been released by the application, but that // we are keeping around to dish out again. - unsigned m_held_blocks; + size_type m_held_blocks; // An active block is one that is in use by the application. - unsigned m_active_blocks; + size_type m_active_blocks; + + // "Managed" memory is "active" and "held" memory. + size_type m_managed_bytes; + + // "Active" bytes are bytes under the control of the application. + // This may be smaller than the actual allocated size reflected + // in m_managed_bytes. 
+ size_type m_active_bytes; bool m_stop_holding; int m_trace; + unsigned m_leading_bits_in_bin_id; + public: - memory_pool(Allocator const &alloc=Allocator()) + memory_pool(Allocator const &alloc=Allocator(), unsigned leading_bits_in_bin_id=4) : m_allocator(alloc.copy()), - m_held_blocks(0), m_active_blocks(0), m_stop_holding(false), - m_trace(false) + m_held_blocks(0), m_active_blocks(0), + m_managed_bytes(0), m_active_bytes(0), + m_stop_holding(false), + m_trace(false), m_leading_bits_in_bin_id(leading_bits_in_bin_id) { if (m_allocator->is_deferred()) { @@ -88,17 +144,21 @@ namespace PYGPU_PACKAGE virtual ~memory_pool() { free_held(); } - static const unsigned mantissa_bits = 2; - static const unsigned mantissa_mask = (1 << mantissa_bits) - 1; + private: + unsigned mantissa_mask() const + { + return (1 << m_leading_bits_in_bin_id) - 1; + } - static bin_nr_t bin_number(size_type size) + public: + bin_nr_t bin_number(size_type size) { signed l = bitlog2(size); - size_type shifted = signed_right_shift(size, l-signed(mantissa_bits)); - if (size && (shifted & (1 << mantissa_bits)) == 0) + size_type shifted = signed_right_shift(size, l-signed(m_leading_bits_in_bin_id)); + if (size && (shifted & (1 << m_leading_bits_in_bin_id)) == 0) throw std::runtime_error("memory_pool::bin_number: bitlog2 fault"); - size_type chopped = shifted & mantissa_mask; - return l << mantissa_bits | chopped; + size_type chopped = shifted & mantissa_mask(); + return l << m_leading_bits_in_bin_id | chopped; } void set_trace(bool flag) @@ -109,19 +169,19 @@ namespace PYGPU_PACKAGE --m_trace; } - static size_type alloc_size(bin_nr_t bin) + size_type alloc_size(bin_nr_t bin) { - bin_nr_t exponent = bin >> mantissa_bits; - bin_nr_t mantissa = bin & mantissa_mask; + bin_nr_t exponent = bin >> m_leading_bits_in_bin_id; + bin_nr_t mantissa = bin & mantissa_mask(); - size_type ones = signed_left_shift(1, - signed(exponent)-signed(mantissa_bits) + size_type ones = signed_left_shift((size_type) 1, + signed(exponent)-signed(m_leading_bits_in_bin_id) ); if (ones) ones -= 1; size_type head = signed_left_shift( - (1<second; } else - return *it->second; + return it->second; } void inc_held_blocks() @@ -176,14 +236,15 @@ namespace PYGPU_PACKAGE return pop_block_from_bin(bin, size); } - size_type alloc_sz = alloc_size(bin_nr); + size_type alloc_sz = alloc_size(bin_nr); - assert(bin_number(alloc_sz) == bin_nr); + always_assert(bin_number(alloc_sz) == bin_nr); + always_assert(alloc_sz >= size); if (m_trace) std::cout << "[pool] allocation of size " << size << " required new memory" << std::endl; - try { return get_from_allocator(alloc_sz); } + try { return get_from_allocator(alloc_sz, size); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) @@ -202,7 +263,7 @@ namespace PYGPU_PACKAGE while (try_to_free_memory()) { - try { return get_from_allocator(alloc_sz); } + try { return get_from_allocator(alloc_sz, size); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) @@ -224,6 +285,7 @@ namespace PYGPU_PACKAGE void free(pointer_type p, size_type size) { --m_active_blocks; + m_active_bytes -= size; bin_nr_t bin_nr = bin_number(size); if (!m_stop_holding) @@ -237,18 +299,22 @@ namespace PYGPU_PACKAGE << " entries" << std::endl; } else + { m_allocator->free(p); + m_managed_bytes -= alloc_size(bin_nr); + } } void free_held() { - BOOST_FOREACH(bin_pair_t bin_pair, m_container) + for (bin_pair_t &bin_pair: m_container) { - bin_t &bin = *bin_pair.second; + bin_t &bin = bin_pair.second; while (bin.size()) { 
m_allocator->free(bin.back()); + m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); dec_held_blocks(); @@ -264,23 +330,31 @@ namespace PYGPU_PACKAGE free_held(); } - unsigned active_blocks() + size_type active_blocks() const { return m_active_blocks; } - unsigned held_blocks() + size_type held_blocks() const { return m_held_blocks; } + size_type managed_bytes() const + { return m_managed_bytes; } + + size_type active_bytes() const + { return m_active_bytes; } + bool try_to_free_memory() { - BOOST_FOREACH(bin_pair_t bin_pair, - // free largest stuff first - std::make_pair(m_container.rbegin(), m_container.rend())) + // free largest stuff first + for (typename container_t::reverse_iterator it = m_container.rbegin(); + it != m_container.rend(); ++it) { - bin_t &bin = *bin_pair.second; + bin_pair_t &bin_pair = *it; + bin_t &bin = bin_pair.second; if (bin.size()) { m_allocator->free(bin.back()); + m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); dec_held_blocks(); @@ -293,10 +367,12 @@ namespace PYGPU_PACKAGE } private: - pointer_type get_from_allocator(size_type alloc_sz) + pointer_type get_from_allocator(size_type alloc_sz, size_type size) { pointer_type result = m_allocator->allocate(alloc_sz); ++m_active_blocks; + m_managed_bytes += alloc_sz; + m_active_bytes += size; return result; } @@ -308,17 +384,15 @@ namespace PYGPU_PACKAGE dec_held_blocks(); ++m_active_blocks; + m_active_bytes += size; return result; } }; - - - template - class pooled_allocation : public boost::noncopyable + class pooled_allocation : public mp_noncopyable { public: typedef Pool pool_type; @@ -326,14 +400,14 @@ namespace PYGPU_PACKAGE typedef typename Pool::size_type size_type; private: - boost::shared_ptr m_pool; + PYGPU_SHARED_PTR m_pool; pointer_type m_ptr; size_type m_size; bool m_valid; public: - pooled_allocation(boost::shared_ptr p, size_type size) + pooled_allocation(PYGPU_SHARED_PTR p, size_type size) : m_pool(p), m_ptr(p->allocate(size)), m_size(size), m_valid(true) { } @@ -352,7 +426,7 @@ namespace PYGPU_PACKAGE } else throw PYGPU_PACKAGE::error( - "pooled_device_allocation::free", + "pooled_device_allocation::free", #ifdef PYGPU_PYCUDA CUDA_ERROR_INVALID_HANDLE #endif diff --git a/src/wrapper/mempool.cpp b/src/wrapper/mempool.cpp index 918d3d0d..77a4a737 100644 --- a/src/wrapper/mempool.cpp +++ b/src/wrapper/mempool.cpp @@ -102,7 +102,7 @@ namespace template - class context_dependent_memory_pool : + class context_dependent_memory_pool : public pycuda::memory_pool, public pycuda::explicit_context_dependent { @@ -117,12 +117,12 @@ namespace - class pooled_device_allocation - : public pycuda::context_dependent, + class pooled_device_allocation + : public pycuda::context_dependent, public pycuda::pooled_allocation > - { + { private: - typedef + typedef pycuda::pooled_allocation > super; @@ -159,12 +159,12 @@ namespace } - - class pooled_host_allocation + + class pooled_host_allocation : public pycuda::pooled_allocation > { private: - typedef + typedef pycuda::pooled_allocation > super; @@ -193,7 +193,7 @@ namespace back_inserter(dims)); std::auto_ptr alloc( - new pooled_host_allocation( + new pooled_host_allocation( pool, tp_descr->elsize*pycuda::size_from_dims(dims.size(), &dims.front()))); NPY_ORDER order = PyArray_CORDER; @@ -228,6 +228,8 @@ namespace wrapper .add_property("held_blocks", &cl::held_blocks) .add_property("active_blocks", &cl::active_blocks) + .add_property("managed_bytes", &cl::managed_bytes) + .add_property("active_bytes", &cl::active_bytes) 
.DEF_SIMPLE_METHOD(bin_number) .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) @@ -249,7 +251,7 @@ void pycuda_expose_tools() typedef context_dependent_memory_pool cl; py::class_< - cl, boost::noncopyable, + cl, boost::noncopyable, boost::shared_ptr > wrapper("DeviceMemoryPool"); wrapper .def("allocate", device_pool_allocate, @@ -269,7 +271,7 @@ void pycuda_expose_tools() typedef pycuda::memory_pool cl; py::class_< - cl, boost::noncopyable, + cl, boost::noncopyable, boost::shared_ptr > wrapper( "PageLockedMemoryPool", py::init >() -- GitLab From 695f429a47d1df392510ea61bf1c92077ee64cdc Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 31 Mar 2021 13:59:37 -0500 Subject: [PATCH 2/2] mempool: bin_number and alloc_size are no longer static --- src/wrapper/mempool.cpp | 2 -- test/test_driver.py | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/wrapper/mempool.cpp b/src/wrapper/mempool.cpp index 77a4a737..66f43f7e 100644 --- a/src/wrapper/mempool.cpp +++ b/src/wrapper/mempool.cpp @@ -234,8 +234,6 @@ namespace .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) .DEF_SIMPLE_METHOD(stop_holding) - .staticmethod("bin_number") - .staticmethod("alloc_size") ; } } diff --git a/test/test_driver.py b/test/test_driver.py index b022aa37..98f3c8aa 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -669,14 +669,15 @@ class TestDriver: def test_mempool_2(self): from pycuda.tools import DeviceMemoryPool from random import randrange + pool = DeviceMemoryPool() for i in range(2000): s = randrange(1 << 31) >> randrange(32) - bin_nr = DeviceMemoryPool.bin_number(s) - asize = DeviceMemoryPool.alloc_size(bin_nr) + bin_nr = pool.bin_number(s) + asize = pool.alloc_size(bin_nr) assert asize >= s, s - assert DeviceMemoryPool.bin_number(asize) == bin_nr, s + assert pool.bin_number(asize) == bin_nr, s assert asize < asize * (1 + 1 / 8) @mark_cuda_test -- GitLab
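
[Note, not part of the patch] For reference, the bin arithmetic that test_mempool_2 now exercises through pool.bin_number/pool.alloc_size can be sketched in pure Python as below. This is an illustrative reimplementation, not code from the patch: LEADING_BITS mirrors the new leading_bits_in_bin_id constructor argument (default 4), and int.bit_length stands in for bitlog2. A bin id packs the index of the size's highest set bit together with the next LEADING_BITS bits of the size; alloc_size returns the largest size that still maps to that bin, which is why alloc_size(bin_number(s)) >= s and why the round trip through bin_number is stable.

    LEADING_BITS = 4                # default leading_bits_in_bin_id
    MASK = (1 << LEADING_BITS) - 1

    def bin_number(size):
        # exponent = index of the highest set bit of size
        l = size.bit_length() - 1
        shift = l - LEADING_BITS
        shifted = size >> shift if shift >= 0 else size << -shift
        return l << LEADING_BITS | (shifted & MASK)

    def alloc_size(bin_nr):
        # largest size in the bin: keep the bin's leading bits, fill the rest with ones
        exponent = bin_nr >> LEADING_BITS
        mantissa = bin_nr & MASK
        shift = exponent - LEADING_BITS

        def sshift(x):
            return x << shift if shift >= 0 else x >> -shift

        ones = sshift(1)
        if ones:
            ones -= 1
        return sshift((1 << LEADING_BITS) | mantissa) | ones

    for s in [1, 100, 4097, 1 << 20, (1 << 20) + 1]:
        b = bin_number(s)
        a = alloc_size(b)
        assert a >= s and bin_number(a) == b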