diff --git a/src/mempool.hpp b/src/mempool.hpp
index 23e5758a9ea54117273810f74129e9d97be08c4b..44f0fd64398509132a1dfef917540a3f8fd6de77 100644
--- a/src/mempool.hpp
+++ b/src/mempool.hpp
@@ -34,12 +34,28 @@
 #include <memory>
 #include <ostream>
 #include <iostream>
-#include "wrap_cl.hpp"
 #include "bitlog.hpp"
 
 
 namespace PYGPU_PACKAGE
 {
+  // Base class that makes derived classes non-copyable; see https://stackoverflow.com/a/44175911
+  class mp_noncopyable {
+  public:
+    mp_noncopyable() = default;   // default construction remains available
+    ~mp_noncopyable() = default;  // non-virtual destructor
+
+  private:
+    mp_noncopyable(const mp_noncopyable&) = delete;             // copy construction disabled
+    mp_noncopyable& operator=(const mp_noncopyable&) = delete;  // copy assignment disabled
+  };
+
+#ifdef PYGPU_PYCUDA  // when PYGPU_PYCUDA is defined, PYGPU_SHARED_PTR names boost::shared_ptr
+#define PYGPU_SHARED_PTR boost::shared_ptr
+#else  // otherwise it names std::shared_ptr (from <memory>)
+#define PYGPU_SHARED_PTR std::shared_ptr
+#endif  // PYGPU_PYCUDA
+
   template <class T>
   inline T signed_left_shift(T x, signed shift_amount)
   {
@@ -64,8 +80,15 @@ namespace PYGPU_PACKAGE
 
 
 
+#define always_assert(cond) /* not tied to NDEBUG: throws std::logic_error when cond is false */ \
+  do { \
+    if (!(cond)) \
+      throw std::logic_error("mem pool assertion violated: " #cond); \
+  } while (false)  // no trailing ';' here: the call site supplies it, so if/else usage stays valid
+
+
   template<class Allocator>
-  class memory_pool : noncopyable
+  class memory_pool : mp_noncopyable
   {
     public:
       typedef typename Allocator::pointer_type pointer_type;
@@ -151,13 +174,13 @@ namespace PYGPU_PACKAGE
         bin_nr_t exponent = bin >> m_leading_bits_in_bin_id;
         bin_nr_t mantissa = bin & mantissa_mask();
 
-        size_type ones = signed_left_shift(1,
+        size_type ones = signed_left_shift((size_type) 1,
             signed(exponent)-signed(m_leading_bits_in_bin_id)
             );
         if (ones) ones -= 1;
 
         size_type head = signed_left_shift(
-           (1<<m_leading_bits_in_bin_id) | mantissa,
+           (size_type) ((1<<m_leading_bits_in_bin_id) | mantissa),
             signed(exponent)-signed(m_leading_bits_in_bin_id));
         if (ones & head)
           throw std::runtime_error("memory_pool::alloc_size: bit-counting fault");
@@ -213,9 +236,10 @@ namespace PYGPU_PACKAGE
           return pop_block_from_bin(bin, size);
         }
 
-        size_type alloc_sz = alloc_size(bin_nr);
+         size_type alloc_sz = alloc_size(bin_nr);
 
-        assert(bin_number(alloc_sz) == bin_nr);
+        always_assert(bin_number(alloc_sz) == bin_nr);
+        always_assert(alloc_sz >= size);
 
         if (m_trace)
           std::cout << "[pool] allocation of size " << size << " required new memory" << std::endl;
@@ -321,8 +345,10 @@ namespace PYGPU_PACKAGE
       bool try_to_free_memory()
       {
         // free largest stuff first
-        for (bin_pair_t &bin_pair: reverse(m_container))
+        for (typename container_t::reverse_iterator it = m_container.rbegin();
+            it != m_container.rend(); ++it)
         {
+          bin_pair_t &bin_pair = *it;
           bin_t &bin = bin_pair.second;
 
           if (bin.size())
@@ -366,7 +392,7 @@ namespace PYGPU_PACKAGE
 
 
   template <class Pool>
-  class pooled_allocation : public noncopyable
+  class pooled_allocation : public mp_noncopyable
   {
     public:
       typedef Pool pool_type;
@@ -374,14 +400,14 @@ namespace PYGPU_PACKAGE
       typedef typename Pool::size_type size_type;
 
     private:
-      std::shared_ptr<pool_type> m_pool;
+      PYGPU_SHARED_PTR<pool_type> m_pool;
 
       pointer_type m_ptr;
       size_type m_size;
       bool m_valid;
 
     public:
-      pooled_allocation(std::shared_ptr<pool_type> p, size_type size)
+      pooled_allocation(PYGPU_SHARED_PTR<pool_type> p, size_type size)
         : m_pool(p), m_ptr(p->allocate(size)), m_size(size), m_valid(true)
       { }
 
diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
index 04027b014b3b8c82bc1e9433d4453a82b3e7a052..6b014ba5e19ab025993ebd3aaa4c9dc73f217878 100644
--- a/src/wrap_mempool.cpp
+++ b/src/wrap_mempool.cpp
@@ -42,6 +42,34 @@
 
 namespace
 {
+  class test_allocator  // stub allocator: hands out no real memory (see allocate/free below)
+  {
+    public:
+      typedef void *pointer_type;  // handle type returned by allocate()
+      typedef size_t size_type;    // type used for requested sizes
+
+      virtual test_allocator *copy() const  // returns a fresh heap-allocated instance
+      {
+        return new test_allocator();  // raw new; presumably the caller takes ownership -- TODO confirm
+      }
+
+      virtual bool is_deferred() const  // always reports false
+      {
+        return false;
+      }
+      virtual pointer_type allocate(size_type s)  // ignores s and always yields nullptr
+      {
+        return nullptr;
+      }
+
+      void free(pointer_type p)  // no-op: nothing was allocated
+      { }
+
+      void try_release_blocks()  // no-op
+      { }
+  };
+
+
   class cl_allocator_base
   {
     protected:
@@ -297,6 +325,25 @@ void pyopencl_expose_mempool(py::module &m)
 
   }
 
+  {
+    typedef pyopencl::memory_pool<test_allocator> cls;
+
+    py::class_<cls, std::shared_ptr<cls>> wrapper( m, "_TestMemoryPool");
+    wrapper
+      .def(py::init([](unsigned leading_bits_in_bin_id)
+            { return new cls(test_allocator(), leading_bits_in_bin_id); }),
+          py::arg("leading_bits_in_bin_id")=4
+          )
+      .def("allocate", [](std::shared_ptr<cls> pool, cls::size_type sz)
+          {
+            pool->allocate(sz);
+            return py::none();
+          })
+      ;
+
+    expose_memory_pool(wrapper);
+  }
+
   {
     typedef cl_deferred_allocator cls;
     py::class_<cls, cl_allocator_base> wrapper(
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index debcb2b405c62f6828b8d5fa4efb35dbb3ddb865..a9863a40db25e243ad56cf83808b7e0e233fee30 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -571,8 +571,7 @@ def test_mempool_2(ctx_factory):
 
     pool = MemoryPool(ImmediateAllocator(queue))
 
-    for i in range(2000):
-        s = randrange(1 << 31) >> randrange(32)
+    for s in [randrange(1 << 31) >> randrange(32) for _ in range(2000)] + [2**30]:
         bin_nr = pool.bin_number(s)
         asize = pool.alloc_size(bin_nr)
 
@@ -581,6 +580,16 @@ def test_mempool_2(ctx_factory):
         assert asize < asize*(1+1/8)
 
 
+def test_mempool_32bit_issues():
+    # Regression check for https://github.com/inducer/pycuda/issues/282
+    from pyopencl._cl import _TestMemoryPool
+    test_pool = _TestMemoryPool()
+    # Request sizes at offsets -5..4 around each of 2**30 .. 2**34.
+    for exponent in range(30, 35):
+        for delta in range(-5, 5):
+            test_pool.allocate((1 << exponent) + delta)
+
+
 @pytest.mark.parametrize("allocator_cls", [ImmediateAllocator, DeferredAllocator])
 def test_allocator(ctx_factory, allocator_cls):
     context = ctx_factory()