diff --git a/doc/source/array.rst b/doc/source/array.rst
index c24304a7ba56001608f556ecfa7cbe8d6a6845b7..0e435a7c69b8954c53356d1d0d4dd54a4522611e 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -5,9 +5,7 @@ The :class:`Array` Class
 
 .. class:: DefaultAllocator(context, flags=pyopencl.mem_flags.READ_WRITE)
 
-    An Allocator that uses :class:`pyopencl.Buffer` with the given *flags*.
-
-    .. method:: __call__(self, size)
+    An alias for :class:`pyopencl.tools.CLAllocator`.
 
 .. class:: Array(cqa, shape, dtype, order="C", allocator=None, base=None, data=None, queue=None)
 
diff --git a/doc/source/index.rst b/doc/source/index.rst
index c3bca32474ce1b000205fe330b7da38bc0f622d7..457b4cdb44b37c7147b02c3b33e7e670ee43c872 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -71,6 +71,7 @@ Contents
 
     runtime
     array
+    tools
     misc
 
 Note that this guide does not explain OpenCL programming and technology. Please 
diff --git a/doc/source/misc.rst b/doc/source/misc.rst
index b2ff45a80facb4227d5272e724157f4cbe47c412..f94b716252cb0acb5af923806b690547e634b8a8 100644
--- a/doc/source/misc.rst
+++ b/doc/source/misc.rst
@@ -86,12 +86,13 @@ Version 2011.1
   :func:`pyopencl.enqueue_map_image`.
 * Add :mod:`pyopencl.reduction`.
 * Add :ref:`reductions`.
-* Add :meth:`MemoryObject.get_host_array`.
+* Add :meth:`pyopencl.MemoryObject.get_host_array`.
 * Deprecate context arguments of 
   :func:`pyopencl.array.to_device`,
   :func:`pyopencl.array.zeros`,
   :func:`pyopencl.array.arange`.
 * Make construction of :class:`pyopencl.array.Array` more flexible (*cqa* argument.)
+* Add :ref:`memory-pools`.
 
 Version 0.92
 ------------
diff --git a/doc/source/tools.rst b/doc/source/tools.rst
new file mode 100644
index 0000000000000000000000000000000000000000..feb65a7d93f99b4f0b4691d7e6ab67d27afb0517
--- /dev/null
+++ b/doc/source/tools.rst
@@ -0,0 +1,69 @@
+Built-in Utilities
+==================
+
+.. module:: pyopencl.tools
+
+.. _memory-pools:
+
+Memory Pools
+------------
+
+The constructor :func:`pyopencl.Buffer` can consume a fairly large amount of
+processing time if it is invoked very frequently. For example, code based on
+:class:`pyopencl.array.Array` can easily run into this issue because a
+fresh memory area is allocated for each intermediate result. Memory pools are a
+remedy for this problem, based on the observation that many block allocations
+are often of the same size as previously used ones.
+
+Then, instead of fully returning the memory to the system and incurring the 
+associated reallocation overhead, the pool holds on to the memory and uses it
+to satisfy future allocations of similarly-sized blocks. The pool reacts
+appropriately to out-of-memory conditions as long as all memory allocations
+are made through it. Allocations performed from outside of the pool may run
+into spurious out-of-memory conditions due to the pool owning much or all of
+the available memory.
+
+.. class:: PooledBuffer
+
+    An object representing a :class:`MemoryPool`-based allocation of
+    device memory.  Once this object is deleted, its associated device
+    memory is returned to the pool. This supports the same interface
+    as :class:`pyopencl.Buffer`.
+
+.. class:: CLAllocator(context, mem_flags=pyopencl.mem_flags.READ_WRITE)
+
+    *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds
+    to the *flags* argument of :class:`pyopencl.Buffer`.
+
+    .. method:: __call__(size)
+
+        Allocate a :class:`pyopencl.Buffer` of the given *size*.
+
+.. class:: MemoryPool(allocator=CLAllocator())
+
+    A memory pool for OpenCL device memory.
+
+    .. attribute:: held_blocks
+
+        The number of unused blocks being held by this pool.
+
+    .. attribute:: active_blocks
+
+        The number of blocks in active use that have been allocated
+        through this pool.
+
+    .. method:: allocate(size)
+
+        Return a :class:`PooledBuffer` of the given *size*.
+
+    .. method:: free_held
+
+        Free all unused memory that the pool is currently holding.
+
+    .. method:: stop_holding
+
+        Instruct the memory pool to start immediately freeing memory returned
+        to it, instead of holding it for future allocations.
+        Implicitly calls :meth:`free_held`.
+        This is useful as a cleanup action when a memory pool falls out
+        of use.
diff --git a/pyopencl/array.py b/pyopencl/array.py
index f3ae7c3f3a3ab38a90a34f9ba4211b3f4bd5f6c2..d068dfd5f457c7f6aa765016bef0162f600a015b 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -112,13 +112,7 @@ def elwise_kernel_runner(kernel_getter):
 
 
 
-class DefaultAllocator:
-    def __init__(self, context, flags=cl.mem_flags.READ_WRITE):
-        self.context = context
-        self.flags = flags
-
-    def __call__(self, size):
-        return cl.Buffer(self.context, self.flags, size)
+DefaultAllocator = cl.CLAllocator  # alias for the old class; NOTE(review): old keyword was flags=, docs now say mem_flags= -- confirm callers
 
 
 
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 17310a64da97b23723095de9c661ece5660b08c6..d3ac00ee162a2df06fdf9e21026ca716dd881767 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -1,4 +1,4 @@
-"""H."""
+"""Various helpful bits and pieces without much of a common theme."""
 
 from __future__ import division
 
@@ -36,6 +36,13 @@ import pyopencl as cl
 
 
 
+PooledBuffer = cl.PooledBuffer  # memory-pool classes re-exported under pyopencl.tools
+CLAllocator = cl.CLAllocator
+MemoryPool = cl.MemoryPool
+
+
+
+
 @decorator
 def context_dependent_memoize(func, context, *args):
     """Provides memoization for things that get created inside
diff --git a/src/wrapper/wrap_mempool.cpp b/src/wrapper/wrap_mempool.cpp
index 6cfa84b87b0b6e15591dce37a9251466bf7243da..31da32860174a210dc89ed5729b66d1b6e115dbf 100644
--- a/src/wrapper/wrap_mempool.cpp
+++ b/src/wrapper/wrap_mempool.cpp
@@ -22,7 +22,7 @@ namespace
 
     public:
       cl_allocator(boost::shared_ptr<pyopencl::context> const &ctx,
-          cl_mem_flags flags)
+          cl_mem_flags flags=CL_MEM_READ_WRITE)
         : m_context(ctx), m_flags(flags)
       {
         if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR))
@@ -52,6 +52,25 @@ namespace
 
 
 
+  inline
+  pyopencl::buffer *allocator_call(cl_allocator &alloc, size_t size)
+  {
+    cl_mem mem = alloc.allocate(size);
+    // Wrap the raw cl_mem in a heap-allocated pyopencl::buffer that is returned to the caller.
+    try
+    {
+      return new pyopencl::buffer(mem, false);
+    }
+    catch (...)  // wrapping failed: release mem so the allocation is not leaked, then rethrow
+    {
+      PYOPENCL_CALL_GUARDED(clReleaseMemObject, (mem));
+      throw;
+    }
+  }
+
+
+
+
   class pooled_buffer
     : public pyopencl::pooled_allocation<pyopencl::memory_pool<cl_allocator> >,
     public pyopencl::memory_object_holder
@@ -113,7 +132,12 @@ void pyopencl_expose_mempool()
     py::class_<cls> wrapper("CLAllocator",
         py::init<
           boost::shared_ptr<pyopencl::context> const &,
-          cl_mem_flags>());
+          py::optional<cl_mem_flags> >());
+    wrapper
+      .def("__call__", allocator_call,
+          py::return_value_policy<py::manage_new_object>())
+      ;
+
   }
 
   {
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index ddd7c0e149b54d74887e5a9b86ed18d38d271356..d312260a52049f5c2ca3e1f111d14807878a6ef9 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -290,6 +290,42 @@ class TestCL:
 
         assert la.norm(a - b) == 0
 
+    @pytools.test.mark_test.opencl
+    def test_mempool(self, ctx_getter):
+        """Allocate many buffers through a MemoryPool while keeping only a few alive."""
+        from pyopencl.tools import MemoryPool, CLAllocator
+        context = ctx_getter()
+
+        pool = MemoryPool(CLAllocator(context))
+        maxlen = 10
+        queue = []
+
+        e0 = 12
+        # Drop the oldest buffer once more than maxlen are alive, returning it to the pool.
+        for e in range(e0-6, e0-4):
+            for i in range(100):
+                queue.append(pool.allocate(1<<e))
+                if len(queue) > maxlen:
+                    queue.pop(0)
+        del queue
+        pool.stop_holding()
+
+    @pytools.test.mark_test.opencl
+    def test_mempool_2(self):
+        """Check that MemoryPool.bin_number and MemoryPool.alloc_size agree on random sizes."""
+        from pyopencl.tools import MemoryPool
+        from random import randrange
+        for i in range(2000):
+            s = randrange(1<<31) >> randrange(32)
+            bin_nr = MemoryPool.bin_number(s)
+            asize = MemoryPool.alloc_size(bin_nr)
+            # NOTE(review): the last assert compares asize with a multiple of itself, not with s -- confirm the intended bound.
+            assert asize >= s, s
+            assert MemoryPool.bin_number(asize) == bin_nr, s
+            assert asize < asize*(1+1/8)
+
+
+