diff --git a/.gitignore b/.gitignore
index b86d66515005120e7a2f4d6c5b6d10577cdf3b40..a0f9b24f2eeef75a2e4115c352eccefee957a077 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 build
 .*.swp
+tags
diff --git a/libviennacl/src/blas1.cpp b/libviennacl/src/blas1.cpp
index a7319d5704535976b277dff95196940769b0ccbe..7a69fd6b48068f2f299dd224786a7dcb977a3270 100644
--- a/libviennacl/src/blas1.cpp
+++ b/libviennacl/src/blas1.cpp
@@ -39,7 +39,7 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *index, ViennaCLVector x)
 {
-  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle<> v1_handle;
   if (init_vector(v1_handle, x) != ViennaCLSuccess)
     return ViennaCLGenericFailure;
@@ -79,7 +79,7 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha
   if ((*alpha)->precision != x->precision)
     return ViennaCLGenericFailure;
-  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle<> v1_handle;
   if (init_vector(v1_handle, x) != ViennaCLSuccess)
     return ViennaCLGenericFailure;
@@ -121,8 +121,8 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha,
   if (x->precision != y->precision)
     return ViennaCLGenericFailure;
-  viennacl::backend::mem_handle v1_handle;
-  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle<> v1_handle;
+  viennacl::backend::mem_handle<> v2_handle;
   if (init_vector(v1_handle, x) != ViennaCLSuccess)
     return ViennaCLGenericFailure;
@@ -165,8 +165,8 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaC
   if (x->precision != y->precision)
     return ViennaCLGenericFailure;
-  viennacl::backend::mem_handle v1_handle;
-  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle<> v1_handle;
+  viennacl::backend::mem_handle<> v2_handle;
   if (init_vector(v1_handle, x) != ViennaCLSuccess)
     return ViennaCLGenericFailure;
@@ -211,8 +211,8 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha,
   if (x->precision != y->precision)
     return ViennaCLGenericFailure;
-  viennacl::backend::mem_handle v1_handle;
-  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle<> v1_handle;
+  viennacl::backend::mem_handle<> v2_handle;
   if (init_vector(v1_handle, x) != ViennaCLSuccess)
     return ViennaCLGenericFailure;
@@ -254,7 +254,7 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha
   if ((*alpha)->precision != x->precision)
     return ViennaCLGenericFailure;
-  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle<> v1_handle;
   if (init_vector(v1_handle, x) != ViennaCLSuccess)
     return ViennaCLGenericFailure;
@@ -300,8 +300,8 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector x, Vien
   if (x->precision != y->precision)
     return ViennaCLGenericFailure;
-  viennacl::backend::mem_handle v1_handle;
-  viennacl::backend::mem_handle v2_handle;
+  viennacl::backend::mem_handle<> v1_handle;
+  viennacl::backend::mem_handle<> v2_handle;
   if (init_vector(v1_handle, x) != ViennaCLSuccess)
     return ViennaCLGenericFailure;
@@ -343,7 +343,7 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha,
   if (alpha->precision != x->precision)
     return ViennaCLGenericFailure;
-  viennacl::backend::mem_handle v1_handle;
+  viennacl::backend::mem_handle<> v1_handle;
   if (init_vector(v1_handle, x) != ViennaCLSuccess)
     return ViennaCLGenericFailure;
@@ -381,8 +381,8 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaC
   if (x->precision != 
y->precision) return ViennaCLGenericFailure; - viennacl::backend::mem_handle v1_handle; - viennacl::backend::mem_handle v2_handle; + viennacl::backend::mem_handle<> v1_handle; + viennacl::backend::mem_handle<> v2_handle; if (init_vector(v1_handle, x) != ViennaCLSuccess) return ViennaCLGenericFailure; diff --git a/libviennacl/src/blas2.cpp b/libviennacl/src/blas2.cpp index bc2c0952a0963f34f1c0e18dfa80424341a6a64b..d5014d2864ec612747142b31364852a2f43f9a4f 100644 --- a/libviennacl/src/blas2.cpp +++ b/libviennacl/src/blas2.cpp @@ -35,9 +35,9 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y) { - viennacl::backend::mem_handle v1_handle; - viennacl::backend::mem_handle v2_handle; - viennacl::backend::mem_handle A_handle; + viennacl::backend::mem_handle<> v1_handle; + viennacl::backend::mem_handle<> v2_handle; + viennacl::backend::mem_handle<> A_handle; if (init_vector(v1_handle, x) != ViennaCLSuccess) return ViennaCLGenericFailure; @@ -100,8 +100,8 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo) { - viennacl::backend::mem_handle v1_handle; - viennacl::backend::mem_handle A_handle; + viennacl::backend::mem_handle<> v1_handle; + viennacl::backend::mem_handle<> A_handle; if (init_vector(v1_handle, x) != ViennaCLSuccess) return ViennaCLGenericFailure; @@ -176,9 +176,9 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaC VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A) { - viennacl::backend::mem_handle v1_handle; - viennacl::backend::mem_handle v2_handle; - viennacl::backend::mem_handle A_handle; + viennacl::backend::mem_handle<> v1_handle; + viennacl::backend::mem_handle<> v2_handle; + viennacl::backend::mem_handle<> A_handle; if (init_vector(v1_handle, x) != ViennaCLSuccess) return ViennaCLGenericFailure; diff --git a/libviennacl/src/blas3.cpp b/libviennacl/src/blas3.cpp index bb6e03eb5fe3edb97569066cd6e1802bed97d9f4..f61c00b02af9d0bf2bb1bec4f73a3412e38edb64 100644 --- a/libviennacl/src/blas3.cpp +++ b/libviennacl/src/blas3.cpp @@ -34,9 +34,9 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C) { - viennacl::backend::mem_handle A_handle; - viennacl::backend::mem_handle B_handle; - viennacl::backend::mem_handle C_handle; + viennacl::backend::mem_handle<> A_handle; + viennacl::backend::mem_handle<> B_handle; + viennacl::backend::mem_handle<> C_handle; if (init_matrix(A_handle, A) != ViennaCLSuccess) return ViennaCLGenericFailure; @@ -117,8 +117,8 @@ VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B) { - viennacl::backend::mem_handle A_handle; - viennacl::backend::mem_handle B_handle; + viennacl::backend::mem_handle<> A_handle; + viennacl::backend::mem_handle<> B_handle; if (init_matrix(A_handle, A) != ViennaCLSuccess) return ViennaCLGenericFailure; diff --git a/libviennacl/src/init_matrix.hpp b/libviennacl/src/init_matrix.hpp index e463e88008dfe7a26e863cd2ab6422441ef7089e..c461c57e9fdbd559c0e73f06be9d79b8363b3d24 100644 --- a/libviennacl/src/init_matrix.hpp +++ 
b/libviennacl/src/init_matrix.hpp @@ -20,7 +20,7 @@ -static ViennaCLStatus init_cuda_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A) +static ViennaCLStatus init_cuda_matrix(viennacl::backend::mem_handle<> & h, ViennaCLMatrix A) { #ifdef VIENNACL_WITH_CUDA h.switch_active_handle_id(viennacl::CUDA_MEMORY); @@ -41,7 +41,7 @@ static ViennaCLStatus init_cuda_matrix(viennacl::backend::mem_handle & h, Vienna #endif } -static ViennaCLStatus init_opencl_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A) +static ViennaCLStatus init_opencl_matrix(viennacl::backend::mem_handle<> & h, ViennaCLMatrix A) { #ifdef VIENNACL_WITH_OPENCL h.switch_active_handle_id(viennacl::OPENCL_MEMORY); @@ -63,7 +63,7 @@ static ViennaCLStatus init_opencl_matrix(viennacl::backend::mem_handle & h, Vien } -static ViennaCLStatus init_host_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A) +static ViennaCLStatus init_host_matrix(viennacl::backend::mem_handle<> & h, ViennaCLMatrix A) { h.switch_active_handle_id(viennacl::MAIN_MEMORY); h.ram_handle().reset(A->host_mem); @@ -79,7 +79,7 @@ static ViennaCLStatus init_host_matrix(viennacl::backend::mem_handle & h, Vienna } -static ViennaCLStatus init_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A) +static ViennaCLStatus init_matrix(viennacl::backend::mem_handle<> & h, ViennaCLMatrix A) { switch (A->backend->backend_type) { diff --git a/libviennacl/src/init_vector.hpp b/libviennacl/src/init_vector.hpp index 8be00d730d94d9dd43d901b08ece0d55a371da9e..2ae9ebcc1d6497495d742f19356a05f59d00b622 100644 --- a/libviennacl/src/init_vector.hpp +++ b/libviennacl/src/init_vector.hpp @@ -20,7 +20,7 @@ -static ViennaCLStatus init_cuda_vector(viennacl::backend::mem_handle & h, ViennaCLVector x) +static ViennaCLStatus init_cuda_vector(viennacl::backend::mem_handle<> & h, ViennaCLVector x) { #ifdef VIENNACL_WITH_CUDA h.switch_active_handle_id(viennacl::CUDA_MEMORY); @@ -41,7 +41,7 @@ static ViennaCLStatus init_cuda_vector(viennacl::backend::mem_handle & h, Vienna #endif } -static ViennaCLStatus init_opencl_vector(viennacl::backend::mem_handle & h, ViennaCLVector x) +static ViennaCLStatus init_opencl_vector(viennacl::backend::mem_handle<> & h, ViennaCLVector x) { #ifdef VIENNACL_WITH_OPENCL h.switch_active_handle_id(viennacl::OPENCL_MEMORY); @@ -63,7 +63,7 @@ static ViennaCLStatus init_opencl_vector(viennacl::backend::mem_handle & h, Vien } -static ViennaCLStatus init_host_vector(viennacl::backend::mem_handle & h, ViennaCLVector x) +static ViennaCLStatus init_host_vector(viennacl::backend::mem_handle<> & h, ViennaCLVector x) { h.switch_active_handle_id(viennacl::MAIN_MEMORY); h.ram_handle().reset(x->host_mem); @@ -79,7 +79,7 @@ static ViennaCLStatus init_host_vector(viennacl::backend::mem_handle & h, Vienna } -static ViennaCLStatus init_vector(viennacl::backend::mem_handle & h, ViennaCLVector x) +static ViennaCLStatus init_vector(viennacl::backend::mem_handle<> & h, ViennaCLVector x) { switch (x->backend->backend_type) { diff --git a/viennacl/backend/mem_handle.hpp b/viennacl/backend/mem_handle.hpp index 37c680ba9c3942e2b1b07df9c173eb04dc691b2c..ed8aa8ab809d44a1007a691c36d784655293f476 100644 --- a/viennacl/backend/mem_handle.hpp +++ b/viennacl/backend/mem_handle.hpp @@ -86,6 +86,7 @@ inline memory_types default_memory_type(memory_types new_memory_type) { return d * Instead, this class collects all the necessary conditional compilations. 
 *
 */
+template
 class mem_handle
 {
 public:
@@ -102,9 +103,9 @@ public:
 #ifdef VIENNACL_WITH_OPENCL
   /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
-  viennacl::ocl::handle & opencl_handle() { return opencl_handle_; }
+  OCLHandle & opencl_handle() { return opencl_handle_; }
   /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
-  viennacl::ocl::handle const & opencl_handle() const { return opencl_handle_; }
+  OCLHandle const & opencl_handle() const { return opencl_handle_; }
 #endif
 #ifdef VIENNACL_WITH_CUDA
@@ -236,7 +237,7 @@ private:
   memory_types active_handle_;
   ram_handle_type ram_handle_;
 #ifdef VIENNACL_WITH_OPENCL
-  viennacl::ocl::handle opencl_handle_;
+  OCLHandle opencl_handle_;
 #endif
 #ifdef VIENNACL_WITH_CUDA
   cuda_handle_type cuda_handle_;
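A note on the hunk above: the template parameter list of mem_handle is not visible here (angle-bracket contents were lost), so the following is a hedged sketch of what the new class header amounts to. The default argument viennacl::ocl::handle<cl_mem> is an assumption, inferred from the OCLHandle uses above, the mem_handle<> defaults throughout this patch, and the new #include "CL/cl.h" in viennacl/forwards.h; it is not confirmed by the patch text itself.

// Hedged sketch, not part of the patch.
template<class OCLHandle = viennacl::ocl::handle<cl_mem> >   // default is an assumption
class mem_handle
{
public:
  // ... unchanged members ...
#ifdef VIENNACL_WITH_OPENCL
  OCLHandle       & opencl_handle()       { return opencl_handle_; }
  OCLHandle const & opencl_handle() const { return opencl_handle_; }
#endif
private:
#ifdef VIENNACL_WITH_OPENCL
  OCLHandle opencl_handle_;   // previously a plain viennacl::ocl::handle of cl_mem
#endif
};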
diff --git a/viennacl/backend/memory.hpp b/viennacl/backend/memory.hpp
index 1b1c6c5303a89f48db363393a12ec7dc8723d677..9513bbd84220e62b329255dc0b14cb8ecf0fdac2 100644
--- a/viennacl/backend/memory.hpp
+++ b/viennacl/backend/memory.hpp
@@ -84,7 +84,7 @@ namespace backend
   * @param host_ptr       Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
   *
   */
-  inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, viennacl::context const & ctx, const void * host_ptr = NULL)
+  inline void memory_create(mem_handle<> & handle, vcl_size_t size_in_bytes, viennacl::context const & ctx, const void * host_ptr = NULL)
   {
     if (size_in_bytes > 0)
     {
@@ -101,6 +101,7 @@ namespace backend
       case OPENCL_MEMORY:
         handle.opencl_handle().context(ctx.opencl_context());
         handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), size_in_bytes, host_ptr);
+        handle.raw_size(size_in_bytes);
         break;
 #endif
@@ -118,6 +119,37 @@ namespace backend
     }
   }
+  // Pooled version of the above function!
+  inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, viennacl::context const & ctx, const void * host_ptr = NULL)
+  {
+    if (size_in_bytes > 0)
+    {
+      if (handle.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
+        handle.switch_active_handle_id(ctx.memory_type());
+
+      switch (handle.get_active_handle_id())
+      {
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        handle.opencl_handle().context(ctx.opencl_context());
+        // If using memory pool then use a pooled handle
+        handle.opencl_handle() =
+          viennacl::ocl::pooled_clmem_handle(
+            opencl::pooled_memory_create(handle.opencl_handle().context(), size_in_bytes, host_ptr),
+            ctx.opencl_context(),
+            size_in_bytes);
+
+        handle.raw_size(size_in_bytes);
+        break;
+#endif
+      case MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("Pooled handle only available with OpenCL memory for now!");
+      }
+    }
+  }
+
   /*
   inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
   {
@@ -137,8 +169,9 @@ namespace backend
   * @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
   * @param bytes_to_copy  Number of bytes to be copied
   */
-  inline void memory_copy(mem_handle const & src_buffer,
-                          mem_handle & dst_buffer,
+  template >
+  inline void memory_copy(mem_handle const & src_buffer,
+                          mem_handle & dst_buffer,
                           vcl_size_t src_offset, vcl_size_t dst_offset, vcl_size_t bytes_to_copy)
@@ -174,8 +207,9 @@ namespace backend
   /** @brief A 'shallow' copy operation from an initialized buffer to an uninitialized buffer.
   * The uninitialized buffer just copies the raw handle.
   */
-  inline void memory_shallow_copy(mem_handle const & src_buffer,
-                                  mem_handle & dst_buffer)
+  template >
+  inline void memory_shallow_copy(mem_handle const & src_buffer,
+                                  mem_handle & dst_buffer)
   {
     assert( (dst_buffer.get_active_handle_id() == MEMORY_NOT_INITIALIZED) && bool("Shallow copy on already initialized memory not supported!"));
@@ -217,7 +251,8 @@ namespace backend
   * @param ptr            Pointer to the first byte to be written
   * @param async          Whether the operation should be asynchronous
   */
-  inline void memory_write(mem_handle & dst_buffer,
+  template >
+  inline void memory_write(mem_handle & dst_buffer,
                            vcl_size_t dst_offset,
                            vcl_size_t bytes_to_write,
                            const void * ptr,
                            bool async = false)
@@ -258,7 +293,8 @@ namespace backend
   * @param ptr            Location in main RAM where to read data should be written to
   * @param async          Whether the operation should be asynchronous
   */
-  inline void memory_read(mem_handle const & src_buffer,
+  template
+  inline void memory_read(mem_handle const & src_buffer,
                           vcl_size_t src_offset,
                           vcl_size_t bytes_to_read,
                           void * ptr,
                           bool async = false)
@@ -364,8 +400,8 @@ namespace backend
   /** @brief Switches the active memory domain within a memory handle. Data is copied if the new active domain differs from the old one. Memory in the source handle is not free'd. */
-  template
-  void switch_memory_context(mem_handle & handle, viennacl::context new_ctx)
+  template>
+  void switch_memory_context(mem_handle & handle, viennacl::context new_ctx)
   {
     if (handle.get_active_handle_id() == new_ctx.memory_type())
       return;
@@ -466,8 +502,8 @@ namespace backend
   /** @brief Copies data of the provided 'DataType' from 'handle_src' to 'handle_dst' and converts the data if the binary representation of 'DataType' among the memory domains differs. */
-  template
-  void typesafe_memory_copy(mem_handle const & handle_src, mem_handle & handle_dst)
+  template>
+  void typesafe_memory_copy(mem_handle const & handle_src, mem_handle & handle_dst)
   {
     if (handle_dst.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
       handle_dst.switch_active_handle_id(default_memory_type());
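The two memory_create() overloads above differ only in the handle type they take, so overload resolution picks the pooled allocation path automatically. A minimal usage sketch follows (not part of the patch); it assumes the pooled overload's stripped parameter type is mem_handle taking viennacl::ocl::pooled_clmem_handle, and the function name and buffer size here are illustrative only.

// Hedged sketch under the assumptions stated above; requires VIENNACL_WITH_OPENCL.
#include "viennacl/backend/memory.hpp"

void create_buffers(viennacl::context const & ctx)
{
  // Default handle type: plain (non-pooled) buffer, same behaviour as before this patch.
  viennacl::backend::mem_handle<> plain;
  viennacl::backend::memory_create(plain, 1024, ctx);

  // Pooled OpenCL buffer: routed through opencl::pooled_memory_create();
  // the pooled overload throws memory_exception for non-OpenCL memory types.
  viennacl::backend::mem_handle<viennacl::ocl::pooled_clmem_handle> pooled;
  viennacl::backend::memory_create(pooled, 1024, ctx);
}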
diff --git a/viennacl/backend/opencl.hpp b/viennacl/backend/opencl.hpp
index a8be55a7881b45972fcf0065105093696d5c31ec..10897fc13b34292fe5b86fa59687bf02c410c8ab 100644
--- a/viennacl/backend/opencl.hpp
+++ b/viennacl/backend/opencl.hpp
@@ -58,6 +58,21 @@ inline cl_mem memory_create(viennacl::ocl::context const & ctx, vcl_size_t size_
   return ctx.create_memory_without_smart_handle(CL_MEM_READ_WRITE, static_cast(size_in_bytes), const_cast(host_ptr));
 }
+
+/** @brief Creates an array of the specified size in the current OpenCL context. If the second argument is provided, the buffer is initialized with data from that pointer.
+ *
+ * @param size_in_bytes   Number of bytes to allocate
+ * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ * @param ctx             Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+ *
+ */
+inline cl_mem pooled_memory_create(viennacl::ocl::context const & ctx, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+  //std::cout << "Creating buffer (" << size_in_bytes << " bytes) host buffer " << host_ptr << " in context " << &ctx << std::endl;
+  return ctx.create_memory_without_smart_handle(CL_MEM_READ_WRITE, static_cast(size_in_bytes), const_cast(host_ptr), true);
+}
+
+
 /** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' in the OpenCL context to memory starting at address 'dst_buffer + dst_offset' in the same OpenCL context.
  *
  * @param src_buffer   A smart pointer to the begin of an allocated OpenCL buffer
diff --git a/viennacl/backend/util.hpp b/viennacl/backend/util.hpp
index 9aaeb2e95b7e6783925537721e405d62545d4b3a..b484bdea47a44ba23480d9f5072495eb26cf29f1 100644
--- a/viennacl/backend/util.hpp
+++ b/viennacl/backend/util.hpp
@@ -97,7 +97,8 @@ class typesafe_host_array
 public:
   explicit typesafe_host_array() : bytes_buffer_(NULL), buffer_size_(0) {}
-  explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
+  template >
+  explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
   {
     resize(handle, num);
   }
@@ -122,7 +123,8 @@ public:
   //
   /** @brief Resize without initializing the new memory */
-  void raw_resize(mem_handle const & /*handle*/, vcl_size_t num)
+  template >
+  void raw_resize(mem_handle const & /*handle*/, vcl_size_t num)
   {
     buffer_size_ = sizeof(cpu_type) * num;
@@ -135,7 +137,8 @@ public:
   }
   /** @brief Resize including initialization of new memory (cf. std::vector<>) */
-  void resize(mem_handle const & handle, vcl_size_t num)
+  template >
+  void resize(mem_handle const & handle, vcl_size_t num)
   {
     raw_resize(handle, num);
@@ -171,7 +174,8 @@ class typesafe_host_array
 public:
   explicit typesafe_host_array() : convert_to_opencl_( (default_memory_type() == OPENCL_MEMORY) ? 
true : false), bytes_buffer_(NULL), buffer_size_(0) {} - explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : convert_to_opencl_(false), bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num) + template > + explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : convert_to_opencl_(false), bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num) { resize(handle, num); } @@ -220,7 +224,8 @@ public: // /** @brief Resize without initializing the new memory */ - void raw_resize(mem_handle const & handle, vcl_size_t num) + template > + void raw_resize(mem_handle const & handle, vcl_size_t num) { buffer_size_ = sizeof(cpu_type) * num; (void)handle; //silence unused variable warning if compiled without OpenCL support @@ -246,7 +251,8 @@ public: } /** @brief Resize including initialization of new memory (cf. std::vector<>) */ - void resize(mem_handle const & handle, vcl_size_t num) + template > + void resize(mem_handle const & handle, vcl_size_t num) { raw_resize(handle, num); diff --git a/viennacl/circulant_matrix.hpp b/viennacl/circulant_matrix.hpp index 1ee13d50a9f06d4e32980682b66aeca67bed6dcf..a19fa80cca06968612a6f43edca7a87a8aa862d2 100644 --- a/viennacl/circulant_matrix.hpp +++ b/viennacl/circulant_matrix.hpp @@ -41,7 +41,7 @@ template class circulant_matrix { public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle<> handle_type; typedef scalar::ResultType> value_type; /** diff --git a/viennacl/compressed_compressed_matrix.hpp b/viennacl/compressed_compressed_matrix.hpp index f1719a2c3001b8b8eed7bd3b53406658e89bc033..ff3e5e68ed3d77328b331f402e977aa4d9ce4d70 100644 --- a/viennacl/compressed_compressed_matrix.hpp +++ b/viennacl/compressed_compressed_matrix.hpp @@ -265,7 +265,7 @@ template class compressed_compressed_matrix { public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle<> handle_type; typedef scalar::ResultType> value_type; typedef vcl_size_t size_type; diff --git a/viennacl/compressed_matrix.hpp b/viennacl/compressed_matrix.hpp index cdb12f44c407c0c8bed8fed5847dacf88f16c819..92fb25c5ce18352157c51e49137b4f792f9b4e83 100644 --- a/viennacl/compressed_matrix.hpp +++ b/viennacl/compressed_matrix.hpp @@ -630,7 +630,7 @@ class compressed_matrix { typedef compressed_matrix self_type; public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle<> handle_type; typedef scalar::ResultType> value_type; typedef vcl_size_t size_type; diff --git a/viennacl/coordinate_matrix.hpp b/viennacl/coordinate_matrix.hpp index 2a24a4edafaff7689b022347ceba5780f7dd7d94..3a9eb074484e2a48a56e68e33a0f1c4859347e35 100644 --- a/viennacl/coordinate_matrix.hpp +++ b/viennacl/coordinate_matrix.hpp @@ -186,7 +186,7 @@ template class coordinate_matrix { public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle<> handle_type; typedef scalar::ResultType> value_type; typedef vcl_size_t size_type; diff --git a/viennacl/detail/matrix_def.hpp b/viennacl/detail/matrix_def.hpp index c13ef01fe088a0949a79d81b21a835d867ba4f94..b85afe38d5d6a5a0a9969b5a59ef7b8fb05849ec 100644 --- a/viennacl/detail/matrix_def.hpp +++ b/viennacl/detail/matrix_def.hpp @@ -111,7 +111,7 @@ public: typedef NumericT cpu_value_type; typedef SizeT size_type; typedef DistanceT difference_type; - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle<> handle_type; /** @brief The default constructor. 
Does not allocate any memory. */ explicit matrix_base(): size1_(0), size2_(0), start1_(0), start2_(0), stride1_(1), stride2_(1), internal_size1_(0), internal_size2_(0), row_major_fixed_(false), row_major_(true) {} @@ -129,7 +129,7 @@ public: explicit matrix_base(size_type rows, size_type columns, bool is_row_major, viennacl::context ctx = viennacl::context()); /** @brief Constructor for creating a matrix_range or matrix_stride from some other matrix/matrix_range/matrix_stride */ - explicit matrix_base(viennacl::backend::mem_handle & h, + explicit matrix_base(handle_type & h, size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1, size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2, bool is_row_major): size1_(mat_size1), size2_(mat_size2), @@ -249,7 +249,7 @@ public: void switch_memory_context(viennacl::context new_ctx) { viennacl::backend::switch_memory_context(elements_, new_ctx); } protected: - void set_handle(viennacl::backend::mem_handle const & h); + void set_handle(viennacl::backend::mem_handle<> const & h); void resize(size_type rows, size_type columns, bool preserve = true); private: size_type size1_; diff --git a/viennacl/detail/vector_def.hpp b/viennacl/detail/vector_def.hpp index 4624b762fa2cdd46214043d29cfe500d19e79516..25a0bf6f48f4845d8a5213dc37145b4bae0f0c0c 100644 --- a/viennacl/detail/vector_def.hpp +++ b/viennacl/detail/vector_def.hpp @@ -100,19 +100,19 @@ struct zero_vector : public scalar_vector * * @tparam NumericT The floating point type, either 'float' or 'double' */ -template +template class vector_base { - typedef vector_base self_type; + typedef vector_base self_type; public: typedef scalar value_type; typedef NumericT cpu_value_type; - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle handle_type; typedef SizeT size_type; typedef DistanceT difference_type; - typedef const_vector_iterator const_iterator; - typedef vector_iterator iterator; + typedef const_vector_iterator const_iterator; + typedef vector_iterator iterator; /** @brief Returns the length of the vector (cf. 
std::vector) */ size_type size() const { return size_; } @@ -141,7 +141,7 @@ public: * @param vec_start The offset from the beginning of the buffer identified by 'h' * @param vec_stride Increment between two elements in the original buffer (in multiples of NumericT) */ - explicit vector_base(viennacl::backend::mem_handle & h, size_type vec_size, size_type vec_start, size_type vec_stride); + explicit vector_base(viennacl::backend::mem_handle & h, size_type vec_size, size_type vec_start, size_type vec_stride); /** @brief Creates a vector and allocates the necessary memory */ explicit vector_base(size_type vec_size, viennacl::context ctx = viennacl::context()); @@ -209,13 +209,13 @@ public: //read-write access to an element of the vector /** @brief Read-write access to a single element of the vector */ - entry_proxy operator()(size_type index); + entry_proxy operator()(size_type index); /** @brief Read-write access to a single element of the vector */ - entry_proxy operator[](size_type index); + entry_proxy operator[](size_type index); /** @brief Read access to a single element of the vector */ - const_entry_proxy operator()(size_type index) const; + const_entry_proxy operator()(size_type index) const; /** @brief Read access to a single element of the vector */ - const_entry_proxy operator[](size_type index) const; + const_entry_proxy operator[](size_type index) const; self_type & operator += (const self_type & vec); self_type & operator -= (const self_type & vec); @@ -302,7 +302,7 @@ public: protected: - void set_handle(viennacl::backend::mem_handle const & h) { elements_ = h; } + void set_handle(viennacl::backend::mem_handle const & h) { elements_ = h; } /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy */ self_type & fast_swap(self_type & other); diff --git a/viennacl/device_specific/forwards.h b/viennacl/device_specific/forwards.h index 11368bb4f9983e08a91d17e316738d3552403b33..9c635e0c80000f939ad0ac427d5f7a845c24f236 100644 --- a/viennacl/device_specific/forwards.h +++ b/viennacl/device_specific/forwards.h @@ -217,16 +217,18 @@ class symbolic_binder { public: virtual ~symbolic_binder(){ } - virtual bool bind(viennacl::backend::mem_handle const * ph) = 0; - virtual unsigned int get(viennacl::backend::mem_handle const * ph) = 0; + virtual bool bind(viennacl::backend::mem_handle<> const * ph) = 0; + virtual unsigned int get(viennacl::backend::mem_handle<> const * ph) = 0; }; class bind_to_handle : public symbolic_binder { public: bind_to_handle() : current_arg_(0){ } - bool bind(viennacl::backend::mem_handle const * ph) {return (ph==NULL)?true:memory.insert(std::make_pair((void*)ph, current_arg_)).second; } - unsigned int get(viennacl::backend::mem_handle const * ph){ return bind(ph) ? current_arg_++ : at(memory, (void*)ph); } + + bool bind(viennacl::backend::mem_handle<> const * ph) {return (ph==NULL)?true:memory.insert(std::make_pair((void*)ph, current_arg_)).second; } + + unsigned int get(viennacl::backend::mem_handle<> const * ph){ return bind(ph) ? 
current_arg_++ : at(memory, (void*)ph); } private: unsigned int current_arg_; std::map memory; @@ -236,8 +238,10 @@ class bind_all_unique : public symbolic_binder { public: bind_all_unique() : current_arg_(0){ } - bool bind(viennacl::backend::mem_handle const *) {return true; } - unsigned int get(viennacl::backend::mem_handle const *){ return current_arg_++; } + bool bind(viennacl::backend::mem_handle const *) {return true; } + bool bind(viennacl::backend::mem_handle<> const *) {return true; } + unsigned int get(viennacl::backend::mem_handle const *){ return current_arg_++; } + unsigned int get(viennacl::backend::mem_handle<> const *){ return current_arg_++; } private: unsigned int current_arg_; std::map memory; diff --git a/viennacl/ell_matrix.hpp b/viennacl/ell_matrix.hpp index 3c3a4282bbaae0312c73eeb19fee59097f0b8b22..fc584219de3d69617b68027f75fd679db5d72c22 100644 --- a/viennacl/ell_matrix.hpp +++ b/viennacl/ell_matrix.hpp @@ -53,7 +53,7 @@ template handle_type; typedef scalar::ResultType> value_type; typedef vcl_size_t size_type; diff --git a/viennacl/forwards.h b/viennacl/forwards.h index 092b6e5159ac9455b2e12f4b20f48e3801a39363..8e8a4e56a526df83e7410bea093a0c6e448dffd6 100644 --- a/viennacl/forwards.h +++ b/viennacl/forwards.h @@ -69,6 +69,8 @@ #include "viennacl/meta/enable_if.hpp" #include "viennacl/version.hpp" +#include "CL/cl.h" + /** @brief Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them. */ namespace viennacl { @@ -246,26 +248,34 @@ namespace viennacl /** @brief A tag class representing sign flips (for scalars only. Vectors and matrices use the standard multiplication by the scalar -1.0) */ struct op_flip_sign {}; + /** @brief OpenCL backend. Manages platforms, contexts, buffers, kernels, etc. */ + namespace ocl { + template + class handle; + + class pooled_clmem_handle; + } + //forward declaration of basic types: - template + template> class scalar; template class scalar_expression; - template + template> class entry_proxy; - template + template> class const_entry_proxy; template class vector_expression; - template + template> class vector_iterator; - template + template> class const_vector_iterator; template @@ -283,13 +293,13 @@ namespace viennacl template struct scalar_vector; - template + template, typename SizeType = vcl_size_t, typename DistanceType = vcl_ptrdiff_t> class vector_base; - template + template> class vector; - template + template> class vector_tuple; //the following forwards are needed for GMRES @@ -308,15 +318,15 @@ namespace viennacl const_vector_iterator const & gpu_src_end, const_vector_iterator gpu_dest_begin); - template - void fast_copy(const const_vector_iterator & gpu_begin, - const const_vector_iterator & gpu_end, + template + void fast_copy(const const_vector_iterator & gpu_begin, + const const_vector_iterator & gpu_end, CPU_ITERATOR cpu_begin ); - template + template void fast_copy(CPU_ITERATOR const & cpu_begin, CPU_ITERATOR const & cpu_end, - vector_iterator gpu_begin); + vector_iterator gpu_begin); /** @brief Tag class for indicating row-major layout of a matrix. Not passed to the matrix directly, see row_major type. 
*/ @@ -376,6 +386,7 @@ namespace viennacl namespace backend { + template > class mem_handle; } @@ -655,28 +666,27 @@ namespace viennacl viennacl::vector& input2, viennacl::vector& output); - template - viennacl::vector_expression, const vector_base, op_element_binary > - element_prod(vector_base const & v1, vector_base const & v2); - - template - viennacl::vector_expression, const vector_base, op_element_binary > - element_div(vector_base const & v1, vector_base const & v2); + template + viennacl::vector_expression, const vector_base, op_element_binary > + element_prod(vector_base const & v1, vector_base const & v2); + template + viennacl::vector_expression, const vector_base, op_element_binary > + element_div(vector_base const & v1, vector_base const & v2); - template - void inner_prod_impl(vector_base const & vec1, - vector_base const & vec2, + template + void inner_prod_impl(vector_base const & vec1, + vector_base const & vec2, scalar & result); - template + template void inner_prod_impl(viennacl::vector_expression const & vec1, - vector_base const & vec2, + vector_base const & vec2, scalar & result); - template - void inner_prod_impl(vector_base const & vec1, + template + void inner_prod_impl(vector_base const & vec1, viennacl::vector_expression const & vec2, scalar & result); @@ -688,18 +698,18 @@ namespace viennacl /////////////////////////// - template - void inner_prod_cpu(vector_base const & vec1, - vector_base const & vec2, + template + void inner_prod_cpu(vector_base const & vec1, + vector_base const & vec2, T & result); - template + template void inner_prod_cpu(viennacl::vector_expression const & vec1, - vector_base const & vec2, + vector_base const & vec2, T & result); - template - void inner_prod_cpu(vector_base const & vec1, + template + void inner_prod_cpu(vector_base const & vec1, viennacl::vector_expression const & vec2, T & result); @@ -712,16 +722,16 @@ namespace viennacl //forward definition of norm_1_impl function - template - void norm_1_impl(vector_base const & vec, scalar & result); + template + void norm_1_impl(vector_base const & vec, scalar & result); template void norm_1_impl(viennacl::vector_expression const & vec, scalar & result); - template - void norm_1_cpu(vector_base const & vec, + template + void norm_1_cpu(vector_base const & vec, T & result); template @@ -729,15 +739,15 @@ namespace viennacl S2 & result); //forward definition of norm_2_impl function - template - void norm_2_impl(vector_base const & vec, scalar & result); + template + void norm_2_impl(vector_base const & vec, scalar & result); template void norm_2_impl(viennacl::vector_expression const & vec, scalar & result); - template - void norm_2_cpu(vector_base const & vec, T & result); + template + void norm_2_cpu(vector_base const & vec, T & result); template void norm_2_cpu(viennacl::vector_expression const & vec, @@ -745,64 +755,64 @@ namespace viennacl //forward definition of norm_inf_impl function - template - void norm_inf_impl(vector_base const & vec, scalar & result); + template + void norm_inf_impl(vector_base const & vec, scalar & result); template void norm_inf_impl(viennacl::vector_expression const & vec, scalar & result); - template - void norm_inf_cpu(vector_base const & vec, T & result); + template + void norm_inf_cpu(vector_base const & vec, T & result); template void norm_inf_cpu(viennacl::vector_expression const & vec, S2 & result); //forward definition of max()-related functions - template - void max_impl(vector_base const & vec, scalar & result); + template + void 
max_impl(vector_base const & vec, scalar & result); template void max_impl(viennacl::vector_expression const & vec, scalar & result); - template - void max_cpu(vector_base const & vec, T & result); + template + void max_cpu(vector_base const & vec, T & result); template void max_cpu(viennacl::vector_expression const & vec, S2 & result); //forward definition of min()-related functions - template - void min_impl(vector_base const & vec, scalar & result); + template + void min_impl(vector_base const & vec, scalar & result); template void min_impl(viennacl::vector_expression const & vec, scalar & result); - template - void min_cpu(vector_base const & vec, T & result); + template + void min_cpu(vector_base const & vec, T & result); template void min_cpu(viennacl::vector_expression const & vec, S2 & result); //forward definition of sum()-related functions - template - void sum_impl(vector_base const & vec, scalar & result); + template + void sum_impl(vector_base const & vec, scalar & result); template void sum_impl(viennacl::vector_expression const & vec, scalar & result); - template - void sum_cpu(vector_base const & vec, T & result); + template + void sum_cpu(vector_base const & vec, T & result); template void sum_cpu(viennacl::vector_expression const & vec, @@ -817,8 +827,8 @@ namespace viennacl void norm_frobenius_cpu(matrix_base const & vec, T & result); - template - vcl_size_t index_norm_inf(vector_base const & vec); + template + vcl_size_t index_norm_inf(vector_base const & vec); template vcl_size_t index_norm_inf(viennacl::vector_expression const & vec); @@ -1023,8 +1033,6 @@ namespace viennacl } } - /** @brief OpenCL backend. Manages platforms, contexts, buffers, kernels, etc. */ - namespace ocl {} /** @brief Namespace containing many meta-functions. */ namespace result_of {} diff --git a/viennacl/hankel_matrix.hpp b/viennacl/hankel_matrix.hpp index 084e6c8717cdccc7eb35462b05ba5406f0165fd3..e50e04b8f05a94adf8c1a9df6e2134b9679cf8d2 100644 --- a/viennacl/hankel_matrix.hpp +++ b/viennacl/hankel_matrix.hpp @@ -43,7 +43,7 @@ template class hankel_matrix { public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle<> handle_type; typedef scalar::ResultType> value_type; /** diff --git a/viennacl/hyb_matrix.hpp b/viennacl/hyb_matrix.hpp index e93ede5f5fdf99b6c7d317752050051aefbdf493..d54f6db5cd1bfac4d77211306405c9959d836565 100644 --- a/viennacl/hyb_matrix.hpp +++ b/viennacl/hyb_matrix.hpp @@ -38,7 +38,7 @@ template handle_type; typedef scalar::ResultType> value_type; hyb_matrix() : csr_threshold_(NumericT(0.8)), rows_(0), cols_(0) {} diff --git a/viennacl/linalg/cuda/common.hpp b/viennacl/linalg/cuda/common.hpp index 562d558cf9faa739f7f5a91ca32e2514b01fde64..52811867a3a80337809b674d730c2ef09382dc84 100644 --- a/viennacl/linalg/cuda/common.hpp +++ b/viennacl/linalg/cuda/common.hpp @@ -103,14 +103,14 @@ const NumericT * cuda_arg(matrix_base const & obj) /** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Non-const version. */ template -ReturnT * cuda_arg(viennacl::backend::mem_handle & h) +ReturnT * cuda_arg(viennacl::backend::mem_handle<> & h) { return reinterpret_cast(h.cuda_handle().get()); } /** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Const-version. 
*/ template -ReturnT const * cuda_arg(viennacl::backend::mem_handle const & h) +ReturnT const * cuda_arg(viennacl::backend::mem_handle<> const & h) { return reinterpret_cast(h.cuda_handle().get()); } diff --git a/viennacl/linalg/cuda/ilu_operations.hpp b/viennacl/linalg/cuda/ilu_operations.hpp index 302a73cc7afefa2ba9ddbd55aca7cb15c8594344..f15eb944c80b96d85f128290b00be61522346bd2 100644 --- a/viennacl/linalg/cuda/ilu_operations.hpp +++ b/viennacl/linalg/cuda/ilu_operations.hpp @@ -285,7 +285,7 @@ template void icc_chow_patel_sweep(compressed_matrix & L, vector const & aij_L) { - viennacl::backend::mem_handle L_backup; + viennacl::backend::mem_handle<> L_backup; viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L)); viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size()); @@ -578,11 +578,11 @@ void ilu_chow_patel_sweep(compressed_matrix & L, compressed_matrix & U_trans, vector const & aij_U_trans) { - viennacl::backend::mem_handle L_backup; + viennacl::backend::mem_handle<> L_backup; viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L)); viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size()); - viennacl::backend::mem_handle U_backup; + viennacl::backend::mem_handle<> U_backup; viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans)); viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size()); diff --git a/viennacl/linalg/cuda/misc_operations.hpp b/viennacl/linalg/cuda/misc_operations.hpp index 4821f5b4eddad8b4f9b681b7f0b9bb1c401bb3ac..7b834904eafe9c87d876cb6bff835c46781bd91b 100644 --- a/viennacl/linalg/cuda/misc_operations.hpp +++ b/viennacl/linalg/cuda/misc_operations.hpp @@ -66,10 +66,10 @@ __global__ void level_scheduling_substitute_kernel( template void level_scheduling_substitute(vector & vec, - viennacl::backend::mem_handle const & row_index_array, - viennacl::backend::mem_handle const & row_buffer, - viennacl::backend::mem_handle const & col_buffer, - viennacl::backend::mem_handle const & element_buffer, + viennacl::backend::mem_handle<> const & row_index_array, + viennacl::backend::mem_handle<> const & row_buffer, + viennacl::backend::mem_handle<> const & col_buffer, + viennacl::backend::mem_handle<> const & element_buffer, vcl_size_t num_rows ) { diff --git a/viennacl/linalg/cuda/sparse_matrix_operations.hpp b/viennacl/linalg/cuda/sparse_matrix_operations.hpp index 51d99e135fbde609f190a7820b7e0f5982f3bb77..467f8772a706aaca7d6e6da15be34b8d1018d68a 100644 --- a/viennacl/linalg/cuda/sparse_matrix_operations.hpp +++ b/viennacl/linalg/cuda/sparse_matrix_operations.hpp @@ -970,7 +970,7 @@ namespace detail void block_inplace_solve(const matrix_expression, const compressed_matrix, op_trans> & L, - viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks, + viennacl::backend::mem_handle<> const & block_indices, vcl_size_t num_blocks, vector_base const & /* L_diagonal */, //ignored vector_base & vec, viennacl::linalg::unit_lower_tag) @@ -989,7 +989,7 @@ namespace detail void block_inplace_solve(const matrix_expression, const compressed_matrix, op_trans> & U, - viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks, + viennacl::backend::mem_handle<> const & block_indices, vcl_size_t num_blocks, vector_base const & U_diagonal, vector_base & vec, viennacl::linalg::upper_tag) diff --git a/viennacl/linalg/cuda/vector_operations.hpp 
b/viennacl/linalg/cuda/vector_operations.hpp index 61274b75f4cb62eca85e189d4099d3bb92cc6c80..026a3a98cee7e37a963e76d1681d45e62aac9259 100644 --- a/viennacl/linalg/cuda/vector_operations.hpp +++ b/viennacl/linalg/cuda/vector_operations.hpp @@ -2753,7 +2753,7 @@ vcl_size_t index_norm_inf(vector_base const & vec1) { typedef NumericT value_type; - viennacl::backend::mem_handle h; + viennacl::backend::mem_handle<> h; viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1)); index_norm_inf_kernel<<<1, 128>>>(viennacl::cuda_arg(vec1), @@ -2965,7 +2965,7 @@ namespace detail vcl_size_t block_num = 128; vcl_size_t threads_per_block = 128; - viennacl::backend::mem_handle cuda_carries; + viennacl::backend::mem_handle<> cuda_carries; viennacl::backend::memory_create(cuda_carries, sizeof(NumericT)*block_num, viennacl::traits::context(input)); // First step: Scan within each thread group and write carries diff --git a/viennacl/linalg/detail/ilu/block_ilu.hpp b/viennacl/linalg/detail/ilu/block_ilu.hpp index 1540e2dd7a505ad7c13c75fc6b5cff78a9f91a22..1cb2f4b29a890a74098502a54d3c7fe2fab92ee6 100644 --- a/viennacl/linalg/detail/ilu/block_ilu.hpp +++ b/viennacl/linalg/detail/ilu/block_ilu.hpp @@ -595,7 +595,7 @@ private: ILUTagT tag_; index_vector_type block_indices_; - viennacl::backend::mem_handle gpu_block_indices_; + viennacl::backend::mem_handle<> gpu_block_indices_; viennacl::compressed_matrix gpu_L_trans_; viennacl::compressed_matrix gpu_U_trans_; viennacl::vector gpu_D_; diff --git a/viennacl/linalg/detail/ilu/common.hpp b/viennacl/linalg/detail/ilu/common.hpp index 93b0cba88b842124f8ad00059412abfce891a47c..cb8b81f5061481f39a4fc060885f4fd2f4b514ba 100644 --- a/viennacl/linalg/detail/ilu/common.hpp +++ b/viennacl/linalg/detail/ilu/common.hpp @@ -50,10 +50,10 @@ namespace detail template void level_scheduling_setup_impl(viennacl::compressed_matrix const & LU, viennacl::vector const & diagonal_LU, - std::list & row_index_arrays, - std::list & row_buffers, - std::list & col_buffers, - std::list & element_buffers, + std::list> & row_index_arrays, + std::list> & row_buffers, + std::list> & col_buffers, + std::list> & element_buffers, std::list & row_elimination_num_list, bool setup_U) { @@ -119,19 +119,19 @@ void level_scheduling_setup_impl(viennacl::compressed_matrix 0) { - row_index_arrays.push_back(viennacl::backend::mem_handle()); + row_index_arrays.push_back(viennacl::backend::mem_handle<>()); viennacl::backend::switch_memory_context(row_index_arrays.back(), viennacl::traits::context(LU)); viennacl::backend::typesafe_host_array elim_row_index_array(row_index_arrays.back(), num_tainted_cols); - row_buffers.push_back(viennacl::backend::mem_handle()); + row_buffers.push_back(viennacl::backend::mem_handle<>()); viennacl::backend::switch_memory_context(row_buffers.back(), viennacl::traits::context(LU)); viennacl::backend::typesafe_host_array elim_row_buffer(row_buffers.back(), num_tainted_cols + 1); - col_buffers.push_back(viennacl::backend::mem_handle()); + col_buffers.push_back(viennacl::backend::mem_handle<>()); viennacl::backend::switch_memory_context(col_buffers.back(), viennacl::traits::context(LU)); viennacl::backend::typesafe_host_array elim_col_buffer(col_buffers.back(), num_entries); - element_buffers.push_back(viennacl::backend::mem_handle()); + element_buffers.push_back(viennacl::backend::mem_handle<>()); viennacl::backend::switch_memory_context(element_buffers.back(), viennacl::traits::context(LU)); std::vector elim_elements_buffer(num_entries); @@ -190,10 
+190,10 @@ void level_scheduling_setup_impl(viennacl::compressed_matrix void level_scheduling_setup_L(viennacl::compressed_matrix const & LU, viennacl::vector const & diagonal_LU, - std::list & row_index_arrays, - std::list & row_buffers, - std::list & col_buffers, - std::list & element_buffers, + std::list> & row_index_arrays, + std::list> & row_buffers, + std::list> & col_buffers, + std::list> & element_buffers, std::list & row_elimination_num_list) { level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, false); @@ -207,10 +207,10 @@ void level_scheduling_setup_L(viennacl::compressed_matrix template void level_scheduling_setup_U(viennacl::compressed_matrix const & LU, viennacl::vector const & diagonal_LU, - std::list & row_index_arrays, - std::list & row_buffers, - std::list & col_buffers, - std::list & element_buffers, + std::list> & row_index_arrays, + std::list> & row_buffers, + std::list> & col_buffers, + std::list> & element_buffers, std::list & row_elimination_num_list) { level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, true); @@ -222,13 +222,13 @@ void level_scheduling_setup_U(viennacl::compressed_matrix // template void level_scheduling_substitute(viennacl::vector & vec, - std::list const & row_index_arrays, - std::list const & row_buffers, - std::list const & col_buffers, - std::list const & element_buffers, + std::list> const & row_index_arrays, + std::list> const & row_buffers, + std::list> const & col_buffers, + std::list> const & element_buffers, std::list const & row_elimination_num_list) { - typedef typename std::list< viennacl::backend::mem_handle >::const_iterator ListIterator; + typedef typename std::list< viennacl::backend::mem_handle<> >::const_iterator ListIterator; ListIterator row_index_array_it = row_index_arrays.begin(); ListIterator row_buffers_it = row_buffers.begin(); ListIterator col_buffers_it = col_buffers.begin(); diff --git a/viennacl/linalg/detail/ilu/ilu0.hpp b/viennacl/linalg/detail/ilu/ilu0.hpp index 1c3191a7e5ca7b005bd52ebdca5b409a4373bc78..67f2f7fcfea3ddb0dfa1a1e384ae6e0e8bd1cdd0 100644 --- a/viennacl/linalg/detail/ilu/ilu0.hpp +++ b/viennacl/linalg/detail/ilu/ilu0.hpp @@ -304,22 +304,22 @@ private: // // L: - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_row_index_arrays_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_L_row_index_arrays_.begin(); it != multifrontal_L_row_index_arrays_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_row_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_L_row_buffers_.begin(); it != multifrontal_L_row_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_col_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_L_col_buffers_.begin(); it != multifrontal_L_col_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_element_buffers_.begin(); + for (typename std::list< 
viennacl::backend::mem_handle<> >::iterator it = multifrontal_L_element_buffers_.begin(); it != multifrontal_L_element_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); @@ -329,22 +329,22 @@ private: viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_row_index_arrays_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_U_row_index_arrays_.begin(); it != multifrontal_U_row_index_arrays_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_row_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_U_row_buffers_.begin(); it != multifrontal_U_row_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_col_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_U_col_buffers_.begin(); it != multifrontal_U_col_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_element_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_U_element_buffers_.begin(); it != multifrontal_U_element_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); @@ -354,17 +354,17 @@ private: ilu0_tag tag_; viennacl::compressed_matrix LU_; - std::list multifrontal_L_row_index_arrays_; - std::list multifrontal_L_row_buffers_; - std::list multifrontal_L_col_buffers_; - std::list multifrontal_L_element_buffers_; + std::list> multifrontal_L_row_index_arrays_; + std::list> multifrontal_L_row_buffers_; + std::list> multifrontal_L_col_buffers_; + std::list> multifrontal_L_element_buffers_; std::list multifrontal_L_row_elimination_num_list_; viennacl::vector multifrontal_U_diagonal_; - std::list multifrontal_U_row_index_arrays_; - std::list multifrontal_U_row_buffers_; - std::list multifrontal_U_col_buffers_; - std::list multifrontal_U_element_buffers_; + std::list> multifrontal_U_row_index_arrays_; + std::list> multifrontal_U_row_buffers_; + std::list> multifrontal_U_col_buffers_; + std::list> multifrontal_U_element_buffers_; std::list multifrontal_U_row_elimination_num_list_; }; diff --git a/viennacl/linalg/detail/ilu/ilut.hpp b/viennacl/linalg/detail/ilu/ilut.hpp index 9c0dd966537fd6dbdc528df40bb70c1a27602be7..650d45219424c19b462c6f3f73e788ab3cbb75bb 100644 --- a/viennacl/linalg/detail/ilu/ilut.hpp +++ b/viennacl/linalg/detail/ilu/ilut.hpp @@ -611,22 +611,22 @@ private: // L: - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_row_index_arrays_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_L_row_index_arrays_.begin(); it != multifrontal_L_row_index_arrays_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_row_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = 
multifrontal_L_row_buffers_.begin(); it != multifrontal_L_row_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_col_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_L_col_buffers_.begin(); it != multifrontal_L_col_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_element_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_L_element_buffers_.begin(); it != multifrontal_L_element_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); @@ -636,22 +636,22 @@ private: viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_row_index_arrays_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_U_row_index_arrays_.begin(); it != multifrontal_U_row_index_arrays_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_row_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_U_row_buffers_.begin(); it != multifrontal_U_row_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_col_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_U_col_buffers_.begin(); it != multifrontal_U_col_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); - for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_element_buffers_.begin(); + for (typename std::list< viennacl::backend::mem_handle<> >::iterator it = multifrontal_U_element_buffers_.begin(); it != multifrontal_U_element_buffers_.end(); ++it) viennacl::backend::switch_memory_context(*it, viennacl::traits::context(mat)); @@ -663,17 +663,17 @@ private: viennacl::compressed_matrix L_; viennacl::compressed_matrix U_; - std::list multifrontal_L_row_index_arrays_; - std::list multifrontal_L_row_buffers_; - std::list multifrontal_L_col_buffers_; - std::list multifrontal_L_element_buffers_; + std::list> multifrontal_L_row_index_arrays_; + std::list> multifrontal_L_row_buffers_; + std::list> multifrontal_L_col_buffers_; + std::list> multifrontal_L_element_buffers_; std::list multifrontal_L_row_elimination_num_list_; viennacl::vector multifrontal_U_diagonal_; - std::list multifrontal_U_row_index_arrays_; - std::list multifrontal_U_row_buffers_; - std::list multifrontal_U_col_buffers_; - std::list multifrontal_U_element_buffers_; + std::list> multifrontal_U_row_index_arrays_; + std::list> multifrontal_U_row_buffers_; + std::list> multifrontal_U_col_buffers_; + std::list> multifrontal_U_element_buffers_; std::list multifrontal_U_row_elimination_num_list_; mutable viennacl::vector x_k_; diff --git a/viennacl/linalg/detail/op_executor.hpp b/viennacl/linalg/detail/op_executor.hpp index 
bd49b3bdc308526e6486c47b364de2801cf37c7c..68c8a8679f52927776fad1c4a2209ea6a2cca553 100644 --- a/viennacl/linalg/detail/op_executor.hpp +++ b/viennacl/linalg/detail/op_executor.hpp @@ -32,27 +32,27 @@ namespace linalg namespace detail { -template -bool op_aliasing(vector_base const & /*lhs*/, B const & /*b*/) +template +bool op_aliasing(vector_base const & /*lhs*/, B const & /*b*/) { return false; } -template -bool op_aliasing(vector_base const & lhs, vector_base const & b) +template +bool op_aliasing(vector_base const & lhs, vector_base const & b) { return lhs.handle() == b.handle(); } -template -bool op_aliasing(vector_base const & lhs, vector_expression const & rhs) +template +bool op_aliasing(vector_base const & lhs, vector_expression const & rhs) { return op_aliasing(lhs, rhs.lhs()) || op_aliasing(lhs, rhs.rhs()); } -template -bool op_aliasing(matrix_base const & /*lhs*/, B const & /*b*/) +template +bool op_aliasing(matrix_base const & /*lhs*/, B const & /*b*/) { return false; } diff --git a/viennacl/linalg/host_based/misc_operations.hpp b/viennacl/linalg/host_based/misc_operations.hpp index 11061d935f290b4727f2c54fcfcf147c4605dc67..ce1f774faa7f3b38c4eb50ad54773ad32b57e7ca 100644 --- a/viennacl/linalg/host_based/misc_operations.hpp +++ b/viennacl/linalg/host_based/misc_operations.hpp @@ -40,10 +40,10 @@ namespace detail { template void level_scheduling_substitute(vector & vec, - viennacl::backend::mem_handle const & row_index_array, - viennacl::backend::mem_handle const & row_buffer, - viennacl::backend::mem_handle const & col_buffer, - viennacl::backend::mem_handle const & element_buffer, + viennacl::backend::mem_handle<> const & row_index_array, + viennacl::backend::mem_handle<> const & row_buffer, + viennacl::backend::mem_handle<> const & col_buffer, + viennacl::backend::mem_handle<> const & element_buffer, vcl_size_t num_rows ) { diff --git a/viennacl/linalg/host_based/sparse_matrix_operations.hpp b/viennacl/linalg/host_based/sparse_matrix_operations.hpp index 3cb738d14db36f7602043babb669570f8d838f9c..d535effd0ad9f768116d5afccdc839d62035b1f3 100644 --- a/viennacl/linalg/host_based/sparse_matrix_operations.hpp +++ b/viennacl/linalg/host_based/sparse_matrix_operations.hpp @@ -868,7 +868,7 @@ namespace detail void block_inplace_solve(const matrix_expression, const compressed_matrix, op_trans> & L, - viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */, + viennacl::backend::mem_handle<> const & /* block_indices */, vcl_size_t /* num_blocks */, vector_base const & /* L_diagonal */, //ignored vector_base & vec, viennacl::linalg::unit_lower_tag) @@ -899,7 +899,7 @@ namespace detail void block_inplace_solve(const matrix_expression, const compressed_matrix, op_trans> & L, - viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */, + viennacl::backend::mem_handle<> const & /*block_indices*/, vcl_size_t /* num_blocks */, vector_base const & L_diagonal, vector_base & vec, viennacl::linalg::lower_tag) @@ -935,7 +935,7 @@ namespace detail void block_inplace_solve(const matrix_expression, const compressed_matrix, op_trans> & U, - viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */, + viennacl::backend::mem_handle<> const & /*block_indices*/, vcl_size_t /* num_blocks */, vector_base const & /* U_diagonal */, //ignored vector_base & vec, viennacl::linalg::unit_upper_tag) @@ -968,7 +968,7 @@ namespace detail void block_inplace_solve(const matrix_expression, const compressed_matrix, op_trans> & U, - 
viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */, + viennacl::backend::mem_handle<> const & /* block_indices */, vcl_size_t /* num_blocks */, vector_base const & U_diagonal, vector_base & vec, viennacl::linalg::upper_tag) diff --git a/viennacl/linalg/host_based/vector_operations.hpp b/viennacl/linalg/host_based/vector_operations.hpp index 01d87166c93c899998ccc8e37e89610c968394a6..6c41ba6db81463bbf148b3f33ffa62a2099d8dfc 100644 --- a/viennacl/linalg/host_based/vector_operations.hpp +++ b/viennacl/linalg/host_based/vector_operations.hpp @@ -64,8 +64,8 @@ namespace detail // // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here! // -template -void convert(vector_base & dest, vector_base const & src) +template +void convert(vector_base & dest, vector_base const & src) { DestNumericT * data_dest = detail::extract_raw_pointer(dest); SrcNumericT const * data_src = detail::extract_raw_pointer(src); @@ -84,9 +84,9 @@ void convert(vector_base & dest, vector_base const & data_dest[static_cast(i)*inc_dest+start_dest] = static_cast(data_src[static_cast(i)*inc_src+start_src]); } -template -void av(vector_base & vec1, - vector_base const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha) +template +void av(vector_base & vec1, + vector_base const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha) { typedef NumericT value_type; @@ -123,10 +123,10 @@ void av(vector_base & vec1, } -template -void avbv(vector_base & vec1, - vector_base const & vec2, ScalarT1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha, - vector_base const & vec3, ScalarT2 const & beta, vcl_size_t /* len_beta */, bool reciprocal_beta, bool flip_sign_beta) +template +void avbv(vector_base & vec1, + vector_base const & vec2, ScalarT1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha, + vector_base const & vec3, ScalarT2 const & beta, vcl_size_t /* len_beta */, bool reciprocal_beta, bool flip_sign_beta) { typedef NumericT value_type; @@ -193,10 +193,10 @@ void avbv(vector_base & vec1, } -template -void avbv_v(vector_base & vec1, - vector_base const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha, - vector_base const & vec3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta) +template +void avbv_v(vector_base & vec1, + vector_base const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha, + vector_base const & vec3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta) { typedef NumericT value_type; @@ -271,8 +271,8 @@ void avbv_v(vector_base & vec1, * @param alpha The value to be assigned * @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer). 
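The host-backend kernels whose template heads are mangled above (convert, av, avbv, avbv_v, ...) all share one structure: extract raw pointers plus (start, stride) layout data, normalise the scaling coefficient according to the reciprocal/flip-sign flags, and run an OpenMP-parallel strided loop. A self-contained sketch of the core of av(), i.e. vec1 = alpha * vec2 (plain pointers and names are illustrative; the real code obtains them via detail::extract_raw_pointer() and viennacl::traits::start()/stride()):

    #include <cstddef>

    template<typename NumericT>
    void av_sketch(NumericT * v1, std::size_t start1, std::size_t inc1,
                   NumericT const * v2, std::size_t start2, std::size_t inc2,
                   std::size_t size, NumericT alpha,
                   bool reciprocal_alpha, bool flip_sign_alpha)
    {
      if (flip_sign_alpha)  alpha = -alpha;
      if (reciprocal_alpha) alpha = NumericT(1) / alpha;

    #ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for
    #endif
      for (long i = 0; i < static_cast<long>(size); ++i)
        v1[static_cast<std::size_t>(i) * inc1 + start1]
          = alpha * v2[static_cast<std::size_t>(i) * inc2 + start2];
    }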
*/ -template -void vector_assign(vector_base & vec1, const NumericT & alpha, bool up_to_internal_size = false) +template +void vector_assign(vector_base & vec1, const NumericT & alpha, bool up_to_internal_size = false) { typedef NumericT value_type; @@ -298,8 +298,8 @@ void vector_assign(vector_base & vec1, const NumericT & alpha, bool up * @param vec1 The first vector (or -range, or -slice) * @param vec2 The second vector (or -range, or -slice) */ -template -void vector_swap(vector_base & vec1, vector_base & vec2) +template +void vector_swap(vector_base & vec1, vector_base & vec2) { typedef NumericT value_type; @@ -332,9 +332,9 @@ void vector_swap(vector_base & vec1, vector_base & vec2) * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2, v3 and the operation */ -template -void element_op(vector_base & vec1, - vector_expression, const vector_base, op_element_binary > const & proxy) +template +void element_op(vector_base & vec1, + vector_expression, const vector_base, op_element_binary > const & proxy) { typedef NumericT value_type; typedef viennacl::linalg::detail::op_applier > OpFunctor; @@ -365,9 +365,9 @@ void element_op(vector_base & vec1, * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding alpha, v3 and the operation */ -template -void element_op(vector_base & vec1, - vector_expression, const NumericT, op_element_binary > const & proxy) +template +void element_op(vector_base & vec1, + vector_expression, const NumericT, op_element_binary > const & proxy) { typedef NumericT value_type; typedef viennacl::linalg::detail::op_applier > OpFunctor; @@ -394,9 +394,9 @@ void element_op(vector_base & vec1, * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2, alpha and the operation */ -template -void element_op(vector_base & vec1, - vector_expression, op_element_binary > const & proxy) +template +void element_op(vector_base & vec1, + vector_expression, op_element_binary > const & proxy) { typedef NumericT value_type; typedef viennacl::linalg::detail::op_applier > OpFunctor; @@ -425,9 +425,9 @@ void element_op(vector_base & vec1, * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2, v3 and the operation */ -template -void element_op(vector_base & vec1, - vector_expression, const vector_base, op_element_unary > const & proxy) +template +void element_op(vector_base & vec1, + vector_expression, const vector_base, op_element_unary > const & proxy) { typedef NumericT value_type; typedef viennacl::linalg::detail::op_applier > OpFunctor; @@ -553,9 +553,9 @@ VIENNACL_INNER_PROD_IMPL_2(double) * @param vec2 The second vector * @param result The result scalar (on the gpu) */ -template -void inner_prod_impl(vector_base const & vec1, - vector_base const & vec2, +template +void inner_prod_impl(vector_base const & vec1, + vector_base const & vec2, ScalarT & result) { typedef NumericT value_type; @@ -574,10 +574,10 @@ void inner_prod_impl(vector_base const & vec1, data_vec2, start2, inc2); //Note: Assignment to result might be expensive, thus a temporary is introduced here } -template -void inner_prod_impl(vector_base const & x, - vector_tuple const & vec_tuple, - vector_base & result) +template +void inner_prod_impl(vector_base const & x, + vector_tuple const & vec_tuple, + vector_base & result) { typedef NumericT value_type; @@ -704,8 +704,8 @@ VIENNACL_NORM_1_IMPL_2(double, double) * @param vec1 The vector * @param result The 
result scalar */ -template -void norm_1_impl(vector_base const & vec1, +template +void norm_1_impl(vector_base const & vec1, ScalarT & result) { typedef NumericT value_type; @@ -817,8 +817,8 @@ VIENNACL_NORM_2_IMPL_2(double, double) * @param vec1 The vector * @param result The result scalar */ -template -void norm_2_impl(vector_base const & vec1, +template +void norm_2_impl(vector_base const & vec1, ScalarT & result) { typedef NumericT value_type; @@ -837,8 +837,8 @@ void norm_2_impl(vector_base const & vec1, * @param vec1 The vector * @param result The result scalar */ -template -void norm_inf_impl(vector_base const & vec1, +template +void norm_inf_impl(vector_base const & vec1, ScalarT & result) { typedef NumericT value_type; @@ -887,8 +887,8 @@ void norm_inf_impl(vector_base const & vec1, * @param vec1 The vector * @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types. */ -template -vcl_size_t index_norm_inf(vector_base const & vec1) +template +vcl_size_t index_norm_inf(vector_base const & vec1) { typedef NumericT value_type; @@ -947,8 +947,8 @@ vcl_size_t index_norm_inf(vector_base const & vec1) * @param vec1 The vector * @param result The result scalar */ -template -void max_impl(vector_base const & vec1, +template +void max_impl(vector_base const & vec1, ScalarT & result) { typedef NumericT value_type; @@ -996,8 +996,8 @@ void max_impl(vector_base const & vec1, * @param vec1 The vector * @param result The result scalar */ -template -void min_impl(vector_base const & vec1, +template +void min_impl(vector_base const & vec1, ScalarT & result) { typedef NumericT value_type; @@ -1045,8 +1045,8 @@ void min_impl(vector_base const & vec1, * @param vec1 The vector * @param result The result scalar */ -template -void sum_impl(vector_base const & vec1, +template +void sum_impl(vector_base const & vec1, ScalarT & result) { typedef NumericT value_type; @@ -1076,9 +1076,9 @@ void sum_impl(vector_base const & vec1, * @param alpha The first transformation coefficient * @param beta The second transformation coefficient */ -template -void plane_rotation(vector_base & vec1, - vector_base & vec2, +template +void plane_rotation(vector_base & vec1, + vector_base & vec2, NumericT alpha, NumericT beta) { typedef NumericT value_type; @@ -1112,9 +1112,9 @@ void plane_rotation(vector_base & vec1, namespace detail { /** @brief Implementation of inclusive_scan and exclusive_scan for the host (OpenMP) backend. */ - template - void vector_scan_impl(vector_base const & vec1, - vector_base & vec2, + template + void vector_scan_impl(vector_base const & vec1, + vector_base & vec2, bool is_inclusive) { NumericT const * data_vec1 = detail::extract_raw_pointer(vec1); @@ -1217,9 +1217,9 @@ namespace detail * @param vec1 Input vector: Gets overwritten by the routine. * @param vec2 The output vector. Either idential to vec1 or non-overlapping. */ -template -void inclusive_scan(vector_base const & vec1, - vector_base & vec2) +template +void inclusive_scan(vector_base const & vec1, + vector_base & vec2) { detail::vector_scan_impl(vec1, vec2, true); } @@ -1232,9 +1232,9 @@ void inclusive_scan(vector_base const & vec1, * @param vec1 Input vector: Gets overwritten by the routine. * @param vec2 The output vector. Either idential to vec1 or non-overlapping. 
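The detail::vector_scan_impl() routine touched above implements both prefix-sum flavours through its is_inclusive flag, with inclusive_scan() and exclusive_scan() as thin wrappers around it. A sequential, self-contained sketch of the semantics (the real host backend additionally parallelises this in blocks with OpenMP):

    #include <cstddef>
    #include <vector>

    template<typename NumericT>
    void scan_sketch(std::vector<NumericT> const & in, std::vector<NumericT> & out, bool is_inclusive)
    {
      out.resize(in.size());
      NumericT running_sum = NumericT(0);
      for (std::size_t i = 0; i < in.size(); ++i)
      {
        if (is_inclusive) { running_sum += in[i]; out[i] = running_sum; }  // (x0, x0+x1, x0+x1+x2, ...)
        else              { out[i] = running_sum; running_sum += in[i]; }  // (0,  x0,    x0+x1,    ...)
      }
    }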
*/ -template -void exclusive_scan(vector_base const & vec1, - vector_base & vec2) +template +void exclusive_scan(vector_base const & vec1, + vector_base & vec2) { detail::vector_scan_impl(vec1, vec2, false); } diff --git a/viennacl/linalg/misc_operations.hpp b/viennacl/linalg/misc_operations.hpp index 208573fd49b1cbb88b7e32384533b1c485ced068..bf42cc9ac010c90a8c729175d975611ffec25ae1 100644 --- a/viennacl/linalg/misc_operations.hpp +++ b/viennacl/linalg/misc_operations.hpp @@ -47,10 +47,10 @@ namespace viennacl template void level_scheduling_substitute(vector & vec, - viennacl::backend::mem_handle const & row_index_array, - viennacl::backend::mem_handle const & row_buffer, - viennacl::backend::mem_handle const & col_buffer, - viennacl::backend::mem_handle const & element_buffer, + viennacl::backend::mem_handle<> const & row_index_array, + viennacl::backend::mem_handle<> const & row_buffer, + viennacl::backend::mem_handle<> const & col_buffer, + viennacl::backend::mem_handle<> const & element_buffer, vcl_size_t num_rows ) { diff --git a/viennacl/linalg/mixed_precision_cg.hpp b/viennacl/linalg/mixed_precision_cg.hpp index 78254b34a47b26759bb0c5be3b6d9a2f013e6952..fa4fc7a5bfdf7bc6c58b59aea32fcca0dc5e4fd6 100644 --- a/viennacl/linalg/mixed_precision_cg.hpp +++ b/viennacl/linalg/mixed_precision_cg.hpp @@ -128,10 +128,10 @@ namespace viennacl // transfer matrix to single precision: viennacl::compressed_matrix matrix_low_precision(matrix.size1(), matrix.size2(), matrix.nnz(), viennacl::traits::context(rhs)); - viennacl::backend::memory_copy(matrix.handle1(), const_cast(matrix_low_precision.handle1()), 0, 0, matrix_low_precision.handle1().raw_size() ); - viennacl::backend::memory_copy(matrix.handle2(), const_cast(matrix_low_precision.handle2()), 0, 0, matrix_low_precision.handle2().raw_size() ); + viennacl::backend::memory_copy(matrix.handle1(), const_cast &>(matrix_low_precision.handle1()), 0, 0, matrix_low_precision.handle1().raw_size() ); + viennacl::backend::memory_copy(matrix.handle2(), const_cast &>(matrix_low_precision.handle2()), 0, 0, matrix_low_precision.handle2().raw_size() ); - viennacl::vector_base matrix_elements_high_precision(const_cast(matrix.handle()), matrix.nnz(), 0, 1); + viennacl::vector_base matrix_elements_high_precision(const_cast &>(matrix.handle()), matrix.nnz(), 0, 1); viennacl::vector_base matrix_elements_low_precision(matrix_low_precision.handle(), matrix.nnz(), 0, 1); matrix_elements_low_precision = matrix_elements_high_precision; matrix_low_precision.generate_row_block_information(); diff --git a/viennacl/linalg/norm_2.hpp b/viennacl/linalg/norm_2.hpp index babb2856744141cf9789fe8ddbf80f3bd9e549ea..26a2da01f0f162f26172a0b1b38bd370fd9594ae 100644 --- a/viennacl/linalg/norm_2.hpp +++ b/viennacl/linalg/norm_2.hpp @@ -105,15 +105,15 @@ namespace viennacl // ---------------------------------------------------- // VIENNACL // - template< typename ScalarType> - viennacl::scalar_expression< const viennacl::vector_base, - const viennacl::vector_base, + template< typename ScalarType, typename H=viennacl::ocl::handle > + viennacl::scalar_expression< const viennacl::vector_base, + const viennacl::vector_base, viennacl::op_norm_2 > - norm_2(viennacl::vector_base const & v) + norm_2(viennacl::vector_base const & v) { //std::cout << "viennacl .. 
" << std::endl; - return viennacl::scalar_expression< const viennacl::vector_base, - const viennacl::vector_base, + return viennacl::scalar_expression< const viennacl::vector_base, + const viennacl::vector_base, viennacl::op_norm_2 >(v, v); } diff --git a/viennacl/linalg/opencl/ilu_operations.hpp b/viennacl/linalg/opencl/ilu_operations.hpp index 248a88ac8ab69f5bd8106dbcdbcaee26602b5350..e67b8c53fb93b61fb55cdfea2c29a1d3fb1977c0 100644 --- a/viennacl/linalg/opencl/ilu_operations.hpp +++ b/viennacl/linalg/opencl/ilu_operations.hpp @@ -119,7 +119,7 @@ void icc_chow_patel_sweep(compressed_matrix & L, viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(L).context()); viennacl::linalg::opencl::kernels::ilu::init(ctx); - viennacl::backend::mem_handle L_backup; + viennacl::backend::mem_handle<> L_backup; viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L)); viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size()); @@ -218,11 +218,11 @@ void ilu_chow_patel_sweep(compressed_matrix & L, viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(L).context()); viennacl::linalg::opencl::kernels::ilu::init(ctx); - viennacl::backend::mem_handle L_backup; + viennacl::backend::mem_handle<> L_backup; viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L)); viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size()); - viennacl::backend::mem_handle U_backup; + viennacl::backend::mem_handle<> U_backup; viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans)); viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size()); diff --git a/viennacl/linalg/opencl/misc_operations.hpp b/viennacl/linalg/opencl/misc_operations.hpp index 83a3db77f6a88134872abdddbb930d626a9c317d..38d9d1202ffcc2b1a310644fa27eab9793d52bf8 100644 --- a/viennacl/linalg/opencl/misc_operations.hpp +++ b/viennacl/linalg/opencl/misc_operations.hpp @@ -43,10 +43,10 @@ namespace detail template void level_scheduling_substitute(vector & x, - viennacl::backend::mem_handle const & row_index_array, - viennacl::backend::mem_handle const & row_buffer, - viennacl::backend::mem_handle const & col_buffer, - viennacl::backend::mem_handle const & element_buffer, + viennacl::backend::mem_handle<> const & row_index_array, + viennacl::backend::mem_handle<> const & row_buffer, + viennacl::backend::mem_handle<> const & col_buffer, + viennacl::backend::mem_handle<> const & element_buffer, vcl_size_t num_rows ) { diff --git a/viennacl/linalg/opencl/sparse_matrix_operations.hpp b/viennacl/linalg/opencl/sparse_matrix_operations.hpp index a8d1557b365c7f60f3ace797dae93c901a1495b4..6e55b2c50a177bfba7f79d61b84585959330d304 100644 --- a/viennacl/linalg/opencl/sparse_matrix_operations.hpp +++ b/viennacl/linalg/opencl/sparse_matrix_operations.hpp @@ -483,7 +483,7 @@ namespace detail void block_inplace_solve(const matrix_expression, const compressed_matrix, op_trans> & L, - viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks, + viennacl::backend::mem_handle<> const & block_indices, vcl_size_t num_blocks, vector_base const & /* L_diagonal */, //ignored vector_base & x, viennacl::linalg::unit_lower_tag) @@ -506,7 +506,7 @@ namespace detail void block_inplace_solve(matrix_expression, const compressed_matrix, op_trans> const & U, - viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks, + 
viennacl::backend::mem_handle<> const & block_indices, vcl_size_t num_blocks, vector_base const & U_diagonal, vector_base & x, viennacl::linalg::upper_tag) diff --git a/viennacl/linalg/opencl/vector_operations.hpp b/viennacl/linalg/opencl/vector_operations.hpp index 2b1c24ca42f4d42b1c5d95cf422a53d66accfe26..2b3423459fbee568ed1c671dfa35aca989f8861d 100644 --- a/viennacl/linalg/opencl/vector_operations.hpp +++ b/viennacl/linalg/opencl/vector_operations.hpp @@ -42,6 +42,12 @@ #include "viennacl/traits/handle.hpp" #include "viennacl/traits/stride.hpp" +#ifdef VIENNACL_WITH_OPENCL +#define TEMP_HANDLE viennacl::ocl::pooled_clmem_handle +#else +#define TEMP_HANDLE viennacl::ocl::handle +#endif + namespace viennacl { namespace linalg @@ -52,8 +58,8 @@ namespace opencl // // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here! // -template -void convert(vector_base & dest, vector_base const & src) +template +void convert(vector_base & dest, vector_base const & src) { assert(viennacl::traits::opencl_handle(dest).context() == viennacl::traits::opencl_handle(src).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -72,9 +78,9 @@ void convert(vector_base & dest, vector_base const & } -template -void av(vector_base & vec1, - vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha) +template +void av(vector_base & vec1, + vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -112,10 +118,10 @@ void av(vector_base & vec1, } -template -void avbv(vector_base & vec1, - vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, - vector_base const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) +template +void avbv(vector_base & vec1, + vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, + vector_base const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. 
Automatic migration not yet supported!")); @@ -174,10 +180,10 @@ void avbv(vector_base & vec1, } -template -void avbv_v(vector_base & vec1, - vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, - vector_base const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) +template +void avbv_v(vector_base & vec1, + vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, + vector_base const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -242,8 +248,8 @@ void avbv_v(vector_base & vec1, * @param alpha The value to be assigned * @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer). */ -template -void vector_assign(vector_base & vec1, const T & alpha, bool up_to_internal_size = false) +template +void vector_assign(vector_base & vec1, const T & alpha, bool up_to_internal_size = false) { viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(vec1).context()); viennacl::linalg::opencl::kernels::vector::init(ctx); @@ -268,8 +274,8 @@ void vector_assign(vector_base & vec1, const T & alpha, bool up_to_internal_s * @param vec1 The first vector (or -range, or -slice) * @param vec2 The second vector (or -range, or -slice) */ -template -void vector_swap(vector_base & vec1, vector_base & vec2) +template +void vector_swap(vector_base & vec1, vector_base & vec2) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -296,9 +302,9 @@ void vector_swap(vector_base & vec1, vector_base & vec2) * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2, v3 and the operation */ -template -void element_op(vector_base & vec1, - vector_expression, const vector_base, op_element_binary > const & proxy) +template +void element_op(vector_base & vec1, + vector_expression, const vector_base, op_element_binary > const & proxy) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. 
Automatic migration not yet supported!")); @@ -343,9 +349,9 @@ void element_op(vector_base & vec1, * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2, v3 and the operation */ -template -void element_op(vector_base & vec1, - vector_expression, const T, op_element_binary > const & proxy) +template +void element_op(vector_base & vec1, + vector_expression, const T, op_element_binary > const & proxy) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -387,9 +393,9 @@ void element_op(vector_base & vec1, * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2, v3 and the operation */ -template -void element_op(vector_base & vec1, - vector_expression, op_element_binary > const & proxy) +template +void element_op(vector_base & vec1, + vector_expression, op_element_binary > const & proxy) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -432,9 +438,9 @@ void element_op(vector_base & vec1, * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2 and the operation */ -template -void element_op(vector_base & vec1, - vector_expression, const vector_base, op_element_unary > const & proxy) +template +void element_op(vector_base & vec1, + vector_expression, const vector_base, op_element_unary > const & proxy) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -471,10 +477,10 @@ void element_op(vector_base & vec1, * @param vec2 The second vector * @param partial_result The results of each group */ -template -void inner_prod_impl(vector_base const & vec1, - vector_base const & vec2, - vector_base & partial_result) +template +void inner_prod_impl(vector_base const & vec1, + vector_base const & vec2, + vector_base & partial_result) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -520,9 +526,9 @@ void inner_prod_impl(vector_base const & vec1, * @param vec2 The second vector * @param result The result scalar (on the gpu) */ -template -void inner_prod_impl(vector_base const & vec1, - vector_base const & vec2, +template +void inner_prod_impl(vector_base const & vec1, + vector_base const & vec2, scalar & result) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. 
Automatic migration not yet supported!")); @@ -531,7 +537,7 @@ void inner_prod_impl(vector_base const & vec1, viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(vec1).context()); vcl_size_t work_groups = 128; - viennacl::vector temp(work_groups, viennacl::traits::context(vec1)); + viennacl::vector temp(work_groups, viennacl::traits::context(vec1)); temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size: // Step 1: Compute partial inner products for each work group: @@ -571,10 +577,10 @@ namespace detail * @param vec_tuple The tuple of vectors y1, y2, ..., yN * @param result The result vector */ -template -void inner_prod_impl(vector_base const & x, - vector_tuple const & vec_tuple, - vector_base & result) +template +void inner_prod_impl(vector_base const & x, + vector_tuple const & vec_tuple, + vector_base & result) { assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -604,10 +610,10 @@ void inner_prod_impl(vector_base const & x, case 5: case 4: { - vector_base const & y0 = vec_tuple.const_at(current_index ); - vector_base const & y1 = vec_tuple.const_at(current_index + 1); - vector_base const & y2 = vec_tuple.const_at(current_index + 2); - vector_base const & y3 = vec_tuple.const_at(current_index + 3); + vector_base const & y0 = vec_tuple.const_at(current_index ); + vector_base const & y1 = vec_tuple.const_at(current_index + 1); + vector_base const & y2 = vec_tuple.const_at(current_index + 2); + vector_base const & y3 = vec_tuple.const_at(current_index + 3); viennacl::ocl::enqueue(inner_prod_kernel_4( viennacl::traits::opencl_handle(x), layout_x, viennacl::traits::opencl_handle(y0), detail::make_layout(y0), viennacl::traits::opencl_handle(y1), detail::make_layout(y1), @@ -632,9 +638,9 @@ void inner_prod_impl(vector_base const & x, case 3: { - vector_base const & y0 = vec_tuple.const_at(current_index ); - vector_base const & y1 = vec_tuple.const_at(current_index + 1); - vector_base const & y2 = vec_tuple.const_at(current_index + 2); + vector_base const & y0 = vec_tuple.const_at(current_index ); + vector_base const & y1 = vec_tuple.const_at(current_index + 1); + vector_base const & y2 = vec_tuple.const_at(current_index + 2); viennacl::ocl::enqueue(inner_prod_kernel_3( viennacl::traits::opencl_handle(x), layout_x, viennacl::traits::opencl_handle(y0), detail::make_layout(y0), viennacl::traits::opencl_handle(y1), detail::make_layout(y1), @@ -658,8 +664,8 @@ void inner_prod_impl(vector_base const & x, case 2: { - vector_base const & y0 = vec_tuple.const_at(current_index ); - vector_base const & y1 = vec_tuple.const_at(current_index + 1); + vector_base const & y0 = vec_tuple.const_at(current_index ); + vector_base const & y1 = vec_tuple.const_at(current_index + 1); viennacl::ocl::enqueue(inner_prod_kernel_2( viennacl::traits::opencl_handle(x), layout_x, viennacl::traits::opencl_handle(y0), detail::make_layout(y0), viennacl::traits::opencl_handle(y1), detail::make_layout(y1), @@ -682,7 +688,7 @@ void inner_prod_impl(vector_base const & x, case 1: { - vector_base const & y0 = vec_tuple.const_at(current_index ); + vector_base const & y0 = vec_tuple.const_at(current_index ); viennacl::ocl::enqueue(inner_prod_kernel_1( viennacl::traits::opencl_handle(x), layout_x, viennacl::traits::opencl_handle(y0), detail::make_layout(y0), viennacl::ocl::local_mem(sizeof(typename 
viennacl::result_of::cl_type::type) * 1 * inner_prod_kernel_1.local_work_size()), @@ -704,14 +710,14 @@ void inner_prod_impl(vector_base const & x, default: //8 or more vectors { - vector_base const & y0 = vec_tuple.const_at(current_index ); - vector_base const & y1 = vec_tuple.const_at(current_index + 1); - vector_base const & y2 = vec_tuple.const_at(current_index + 2); - vector_base const & y3 = vec_tuple.const_at(current_index + 3); - vector_base const & y4 = vec_tuple.const_at(current_index + 4); - vector_base const & y5 = vec_tuple.const_at(current_index + 5); - vector_base const & y6 = vec_tuple.const_at(current_index + 6); - vector_base const & y7 = vec_tuple.const_at(current_index + 7); + vector_base const & y0 = vec_tuple.const_at(current_index ); + vector_base const & y1 = vec_tuple.const_at(current_index + 1); + vector_base const & y2 = vec_tuple.const_at(current_index + 2); + vector_base const & y3 = vec_tuple.const_at(current_index + 3); + vector_base const & y4 = vec_tuple.const_at(current_index + 4); + vector_base const & y5 = vec_tuple.const_at(current_index + 5); + vector_base const & y6 = vec_tuple.const_at(current_index + 6); + vector_base const & y7 = vec_tuple.const_at(current_index + 7); viennacl::ocl::enqueue(inner_prod_kernel_8( viennacl::traits::opencl_handle(x), layout_x, viennacl::traits::opencl_handle(y0), detail::make_layout(y0), viennacl::traits::opencl_handle(y1), detail::make_layout(y1), @@ -752,9 +758,9 @@ void inner_prod_impl(vector_base const & x, * @param vec2 The second vector * @param result The result scalar (on the gpu) */ -template -void inner_prod_cpu(vector_base const & vec1, - vector_base const & vec2, +template +void inner_prod_cpu(vector_base const & vec1, + vector_base const & vec2, T & result) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -762,7 +768,7 @@ void inner_prod_cpu(vector_base const & vec1, viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(vec1).context()); vcl_size_t work_groups = 128; - viennacl::vector temp(work_groups, viennacl::traits::context(vec1)); + viennacl::vector temp(work_groups, viennacl::traits::context(vec1)); temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size: // Step 1: Compute partial inner products for each work group: @@ -788,9 +794,9 @@ void inner_prod_cpu(vector_base const & vec1, * @param partial_result The result scalar * @param norm_id Norm selector. 0: norm_inf, 1: norm_1, 2: norm_2 */ -template -void norm_reduction_impl(vector_base const & vec, - vector_base & partial_result, +template +void norm_reduction_impl(vector_base const & vec, + vector_base & partial_result, cl_uint norm_id) { assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -820,8 +826,8 @@ void norm_reduction_impl(vector_base const & vec, * @param vec The vector * @param result The result scalar */ -template -void norm_1_impl(vector_base const & vec, +template +void norm_1_impl(vector_base const & vec, scalar & result) { assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. 
Automatic migration not yet supported!")); @@ -829,7 +835,7 @@ void norm_1_impl(vector_base const & vec, viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(vec).context()); vcl_size_t work_groups = 128; - viennacl::vector temp(work_groups, viennacl::traits::context(vec)); + viennacl::vector temp(work_groups, viennacl::traits::context(vec)); // Step 1: Compute the partial work group results norm_reduction_impl(vec, temp, 1); @@ -853,12 +859,12 @@ void norm_1_impl(vector_base const & vec, * @param vec The vector * @param result The result scalar */ -template -void norm_1_cpu(vector_base const & vec, +template +void norm_1_cpu(vector_base const & vec, T & result) { vcl_size_t work_groups = 128; - viennacl::vector temp(work_groups, viennacl::traits::context(vec)); + viennacl::vector temp(work_groups, viennacl::traits::context(vec)); // Step 1: Compute the partial work group results norm_reduction_impl(vec, temp, 1); @@ -884,8 +890,8 @@ void norm_1_cpu(vector_base const & vec, * @param vec The vector * @param result The result scalar */ -template -void norm_2_impl(vector_base const & vec, +template +void norm_2_impl(vector_base const & vec, scalar & result) { assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -893,7 +899,7 @@ void norm_2_impl(vector_base const & vec, viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(vec).context()); vcl_size_t work_groups = 128; - viennacl::vector temp(work_groups, viennacl::traits::context(vec)); + viennacl::vector temp(work_groups, viennacl::traits::context(vec)); // Step 1: Compute the partial work group results norm_reduction_impl(vec, temp, 2); @@ -917,12 +923,12 @@ void norm_2_impl(vector_base const & vec, * @param vec The vector * @param result The result scalar */ -template -void norm_2_cpu(vector_base const & vec, +template +void norm_2_cpu(vector_base const & vec, T & result) { vcl_size_t work_groups = 128; - viennacl::vector temp(work_groups, viennacl::traits::context(vec)); + viennacl::vector temp(work_groups, viennacl::traits::context(vec)); // Step 1: Compute the partial work group results norm_reduction_impl(vec, temp, 2); @@ -948,8 +954,8 @@ void norm_2_cpu(vector_base const & vec, * @param vec The vector * @param result The result scalar */ -template -void norm_inf_impl(vector_base const & vec, +template +void norm_inf_impl(vector_base const & vec, scalar & result) { assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. 
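The OpenCL norm_1/norm_2/norm_inf routines in this file all follow the two-stage reduction visible above: norm_reduction_impl() launches a kernel that leaves one partial result per work group (128 of them) in a temporary vector, and the *_cpu variants finish the reduction on the host. A self-contained sketch of that finishing step, using the norm selector convention documented above (0: norm_inf, 1: norm_1, 2: norm_2):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // One entry of 'partial' per work group.
    double finish_norm_sketch(std::vector<double> const & partial, int norm_id)
    {
      double result = 0;
      for (std::size_t i = 0; i < partial.size(); ++i)
      {
        if (norm_id == 0) result = (partial[i] > result) ? partial[i] : result;  // norm_inf: max of partial maxima
        else              result += partial[i];                                  // norm_1 / norm_2: sum of partial sums
      }
      return (norm_id == 2) ? std::sqrt(result) : result;                        // norm_2 partials are sums of squares
    }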
Automatic migration not yet supported!")); @@ -957,7 +963,7 @@ void norm_inf_impl(vector_base const & vec, viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(vec).context()); vcl_size_t work_groups = 128; - viennacl::vector temp(work_groups, viennacl::traits::context(vec)); + viennacl::vector temp(work_groups, viennacl::traits::context(vec)); // Step 1: Compute the partial work group results norm_reduction_impl(vec, temp, 0); @@ -981,12 +987,12 @@ void norm_inf_impl(vector_base const & vec, * @param vec The vector * @param result The result scalar */ -template -void norm_inf_cpu(vector_base const & vec, +template +void norm_inf_cpu(vector_base const & vec, T & result) { vcl_size_t work_groups = 128; - viennacl::vector temp(work_groups, viennacl::traits::context(vec)); + viennacl::vector temp(work_groups, viennacl::traits::context(vec)); // Step 1: Compute the partial work group results norm_reduction_impl(vec, temp, 0); @@ -1013,8 +1019,8 @@ void norm_inf_cpu(vector_base const & vec, * @param vec The vector * @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types. */ -template -cl_uint index_norm_inf(vector_base const & vec) +template +cl_uint index_norm_inf(vector_base const & vec) { viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(vec).context()); viennacl::linalg::opencl::kernels::vector::init(ctx); @@ -1049,8 +1055,8 @@ cl_uint index_norm_inf(vector_base const & vec) * @param x The vector * @param result The result scalar */ -template -void max_impl(vector_base const & x, +template +void max_impl(vector_base const & x, scalar & result) { assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -1087,8 +1093,8 @@ void max_impl(vector_base const & x, * @param x The vector * @param result The result scalar */ -template -void max_cpu(vector_base const & x, +template +void max_cpu(vector_base const & x, NumericT & result) { viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(x).context()); @@ -1128,8 +1134,8 @@ void max_cpu(vector_base const & x, * @param x The vector * @param result The result scalar */ -template -void min_impl(vector_base const & x, +template +void min_impl(vector_base const & x, scalar & result) { assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -1166,8 +1172,8 @@ void min_impl(vector_base const & x, * @param x The vector * @param result The result scalar */ -template -void min_cpu(vector_base const & x, +template +void min_cpu(vector_base const & x, NumericT & result) { viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(x).context()); @@ -1205,13 +1211,13 @@ void min_cpu(vector_base const & x, * @param x The vector * @param result The result scalar */ -template -void sum_impl(vector_base const & x, - scalar & result) +template +void sum_impl(vector_base const & x, + scalar & result) { assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. 
Automatic migration not yet supported!")); - viennacl::vector all_ones = viennacl::scalar_vector(x.size(), NumericT(1), viennacl::traits::context(x)); + viennacl::vector all_ones = viennacl::scalar_vector(x.size(), NumericT(1), viennacl::traits::context(x)); viennacl::linalg::opencl::inner_prod_impl(x, all_ones, result); } @@ -1239,9 +1245,9 @@ void sum_cpu(vector_base const & x, NumericT & result) * @param alpha The first transformation coefficient * @param beta The second transformation coefficient */ -template -void plane_rotation(vector_base & vec1, - vector_base & vec2, +template +void plane_rotation(vector_base & vec1, + vector_base & vec2, T alpha, T beta) { assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!")); @@ -1276,15 +1282,15 @@ namespace detail * Note on performance: For non-in-place scans one could optimize away the temporary 'opencl_carries'-array. * This, however, only provides small savings in the latency-dominated regime, yet would effectively double the amount of code to maintain. */ - template - void scan_impl(vector_base const & input, - vector_base & output, + template + void scan_impl(vector_base const & input, + vector_base & output, bool is_inclusive) { vcl_size_t local_worksize = 128; vcl_size_t workgroups = 128; - viennacl::backend::mem_handle opencl_carries; + viennacl::backend::mem_handle
<>
opencl_carries; viennacl::backend::memory_create(opencl_carries, sizeof(NumericT)*workgroups, viennacl::traits::context(input)); viennacl::ocl::context & ctx = const_cast(viennacl::traits::opencl_handle(input).context()); @@ -1321,9 +1327,9 @@ namespace detail * @param input Input vector. * @param output The output vector. Either idential to input or non-overlapping. */ -template -void inclusive_scan(vector_base const & input, - vector_base & output) +template +void inclusive_scan(vector_base const & input, + vector_base & output) { detail::scan_impl(input, output, true); } @@ -1334,9 +1340,9 @@ void inclusive_scan(vector_base const & input, * @param input Input vector * @param output The output vector. Either idential to input or non-overlapping. */ -template -void exclusive_scan(vector_base const & input, - vector_base & output) +template +void exclusive_scan(vector_base const & input, + vector_base & output) { detail::scan_impl(input, output, false); } diff --git a/viennacl/linalg/sparse_matrix_operations.hpp b/viennacl/linalg/sparse_matrix_operations.hpp index dccb330c2e59bd1f83c7b73140a209c43b6aa80c..e6ffdbbbc00dd882c38f7d4caab937f1db16db2e 100644 --- a/viennacl/linalg/sparse_matrix_operations.hpp +++ b/viennacl/linalg/sparse_matrix_operations.hpp @@ -330,7 +330,7 @@ namespace viennacl template typename viennacl::enable_if< viennacl::is_any_sparse_matrix::value>::type block_inplace_solve(const matrix_expression & mat, - viennacl::backend::mem_handle const & block_index_array, vcl_size_t num_blocks, + viennacl::backend::mem_handle<> const & block_index_array, vcl_size_t num_blocks, viennacl::vector_base const & mat_diagonal, viennacl::vector_base & vec, SOLVERTAG tag) diff --git a/viennacl/linalg/vector_operations.hpp b/viennacl/linalg/vector_operations.hpp index 5add9ceaddd3c0d378ebd3400f14f556ab76d385..3c75d03ff14b42fbaf10e26295480390c3a6ecc1 100644 --- a/viennacl/linalg/vector_operations.hpp +++ b/viennacl/linalg/vector_operations.hpp @@ -47,8 +47,8 @@ namespace viennacl { namespace linalg { - template - void convert(vector_base & dest, vector_base const & src) + template + void convert(vector_base & dest, vector_base const & src) { assert(viennacl::traits::size(dest) == viennacl::traits::size(src) && bool("Incompatible vector sizes in v1 = v2 (convert): size(v1) != size(v2)")); @@ -74,9 +74,9 @@ namespace viennacl } } - template - void av(vector_base & vec1, - vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha) + template + void av(vector_base & vec1, + vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha) { assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in v1 = v2 @ alpha: size(v1) != size(v2)")); @@ -103,10 +103,10 @@ namespace viennacl } - template - void avbv(vector_base & vec1, - vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, - vector_base const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) + template + void avbv(vector_base & vec1, + vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, + vector_base const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) { assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector 
sizes in v1 = v2 @ alpha + v3 @ beta: size(v1) != size(v2)")); assert(viennacl::traits::size(vec2) == viennacl::traits::size(vec3) && bool("Incompatible vector sizes in v1 = v2 @ alpha + v3 @ beta: size(v2) != size(v3)")); @@ -140,10 +140,10 @@ namespace viennacl } - template - void avbv_v(vector_base & vec1, - vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, - vector_base const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) + template + void avbv_v(vector_base & vec1, + vector_base const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, + vector_base const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) { assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in v1 += v2 @ alpha + v3 @ beta: size(v1) != size(v2)")); assert(viennacl::traits::size(vec2) == viennacl::traits::size(vec3) && bool("Incompatible vector sizes in v1 += v2 @ alpha + v3 @ beta: size(v2) != size(v3)")); @@ -183,8 +183,8 @@ namespace viennacl * @param alpha The value to be assigned * @param up_to_internal_size Whether 'alpha' should be written to padded memory as well. This is used for setting all entries to zero, including padded memory. */ - template - void vector_assign(vector_base & vec1, const T & alpha, bool up_to_internal_size = false) + template + void vector_assign(vector_base & vec1, const T & alpha, bool up_to_internal_size = false) { switch (viennacl::traits::handle(vec1).get_active_handle_id()) { @@ -214,8 +214,8 @@ namespace viennacl * @param vec1 The first vector (or -range, or -slice) * @param vec2 The second vector (or -range, or -slice) */ - template - void vector_swap(vector_base & vec1, vector_base & vec2) + template + void vector_swap(vector_base & vec1, vector_base & vec2) { assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2) && bool("Incompatible vector sizes in vector_swap()")); @@ -251,9 +251,9 @@ namespace viennacl * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2, v3 and the operation */ - template - void element_op(vector_base & vec1, - vector_expression, const vector_base, OP> const & proxy) + template + void element_op(vector_base & vec1, + vector_expression, const vector_base, OP> const & proxy) { assert(viennacl::traits::size(vec1) == viennacl::traits::size(proxy) && bool("Incompatible vector sizes in element_op()")); @@ -284,9 +284,9 @@ namespace viennacl * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2, v3 and the operation */ - template - void element_op(vector_base & vec1, - vector_expression, const T, OP> const & proxy) + template + void element_op(vector_base & vec1, + vector_expression, const T, OP> const & proxy) { assert(viennacl::traits::size(vec1) == viennacl::traits::size(proxy) && bool("Incompatible vector sizes in element_op()")); @@ -317,9 +317,9 @@ namespace viennacl * @param vec1 The result vector (or -range, or -slice) * @param proxy The proxy object holding v2, v3 and the operation */ - template - void element_op(vector_base & vec1, - vector_expression, OP> const & proxy) + template + void element_op(vector_base & vec1, + vector_expression, OP> const & proxy) { assert(viennacl::traits::size(vec1) == viennacl::traits::size(proxy) && bool("Incompatible vector sizes in element_op()")); @@ 
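Every frontend routine in viennacl/linalg/vector_operations.hpp follows the dispatch pattern visible in the context lines above: validate sizes, then branch on the active memory domain of the operand's handle. A condensed, illustrative reconstruction for av(), assuming the usual ViennaCL headers (the backend overloads, enum values and exception type are the existing ViennaCL ones; the exact error messages are not shown in this hunk):

    template<typename V1, typename V2, typename S>
    void av_dispatch_sketch(V1 & vec1, V2 const & vec2, S const & alpha,
                            vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
    {
      switch (viennacl::traits::handle(vec1).get_active_handle_id())
      {
      case viennacl::MAIN_MEMORY:
        viennacl::linalg::host_based::av(vec1, vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
        break;
    #ifdef VIENNACL_WITH_OPENCL
      case viennacl::OPENCL_MEMORY:
        viennacl::linalg::opencl::av(vec1, vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
        break;
    #endif
    #ifdef VIENNACL_WITH_CUDA
      case viennacl::CUDA_MEMORY:
        viennacl::linalg::cuda::av(vec1, vec2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
        break;
    #endif
      case viennacl::MEMORY_NOT_INITIALIZED:
        throw viennacl::memory_exception("not initialised!");
      default:
        throw viennacl::memory_exception("not implemented for this memory domain");
      }
    }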
-349,25 +349,25 @@ namespace viennacl // Helper macro for generating binary element-wise operations such as element_prod(), element_div(), element_pow() without unnecessary code duplication */ #define VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS(OPNAME) \ - template \ - viennacl::vector_expression, const vector_base, op_element_binary > \ - element_##OPNAME(vector_base const & v1, vector_base const & v2) \ + template \ + viennacl::vector_expression, const vector_base, op_element_binary > \ + element_##OPNAME(vector_base const & v1, vector_base const & v2) \ { \ - return viennacl::vector_expression, const vector_base, op_element_binary >(v1, v2); \ + return viennacl::vector_expression, const vector_base, op_element_binary >(v1, v2); \ } \ \ - template \ - viennacl::vector_expression, const vector_base, op_element_binary > \ - element_##OPNAME(vector_expression const & proxy, vector_base const & v2) \ + template \ + viennacl::vector_expression, const vector_base, op_element_binary > \ + element_##OPNAME(vector_expression const & proxy, vector_base const & v2) \ { \ - return viennacl::vector_expression, const vector_base, op_element_binary >(proxy, v2); \ + return viennacl::vector_expression, const vector_base, op_element_binary >(proxy, v2); \ } \ \ - template \ - viennacl::vector_expression, const vector_expression, op_element_binary > \ - element_##OPNAME(vector_base const & v1, vector_expression const & proxy) \ + template \ + viennacl::vector_expression, const vector_expression, op_element_binary > \ + element_##OPNAME(vector_base const & v1, vector_expression const & proxy) \ { \ - return viennacl::vector_expression, const vector_expression, op_element_binary >(v1, proxy); \ + return viennacl::vector_expression, const vector_expression, op_element_binary >(v1, proxy); \ } \ \ template >(proxy1, proxy2); \ }\ \ - template \ - viennacl::vector_expression, const T, op_element_binary > \ - element_##OPNAME(vector_base const & v1, T const & alpha) \ + template \ + viennacl::vector_expression, const T, op_element_binary > \ + element_##OPNAME(vector_base const & v1, T const & alpha) \ { \ - return viennacl::vector_expression, const T, op_element_binary >(v1, alpha); \ + return viennacl::vector_expression, const T, op_element_binary >(v1, alpha); \ } \ \ template \ @@ -397,11 +397,11 @@ namespace viennacl return viennacl::vector_expression, const typename viennacl::result_of::cpu_value_type::type, op_element_binary >(proxy, alpha); \ } \ \ - template \ - viennacl::vector_expression, op_element_binary > \ - element_##OPNAME(T const & alpha, vector_base const & v2) \ + template \ + viennacl::vector_expression, op_element_binary > \ + element_##OPNAME(T const & alpha, vector_base const & v2) \ { \ - return viennacl::vector_expression, op_element_binary >(alpha, v2); \ + return viennacl::vector_expression, op_element_binary >(alpha, v2); \ } \ \ template \ @@ -490,9 +490,9 @@ namespace viennacl * @param vec2 The second vector * @param result The result scalar (on the gpu) */ - template - void inner_prod_impl(vector_base const & vec1, - vector_base const & vec2, + template + void inner_prod_impl(vector_base const & vec1, + vector_base const & vec2, scalar & result) { assert( vec1.size() == vec2.size() && bool("Size mismatch") ); @@ -520,9 +520,9 @@ namespace viennacl } // vector expression on lhs - template + template void inner_prod_impl(viennacl::vector_expression const & vec1, - vector_base const & vec2, + vector_base const & vec2, scalar & result) { viennacl::vector temp = vec1; @@ 
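The VIENNACL_GENERATE_BINARY_ELEMENTOPERATION_OVERLOADS macro above stamps out the user-facing element_prod(), element_div(), element_pow(), ... overloads for each combination of vector and vector_expression arguments; the patch only threads the additional handle template parameter through them. A usage sketch of the resulting API (conventional ViennaCL calls, shown here only for orientation):

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/vector_operations.hpp"

    void elementwise_usage_sketch()
    {
      viennacl::vector<float> a(100), b(100), c(100);
      c = viennacl::linalg::element_prod(a, b);   // c_i = a_i * b_i
      c = viennacl::linalg::element_div(a, b);    // c_i = a_i / b_i
      c = viennacl::linalg::element_pow(a, b);    // c_i = pow(a_i, b_i)
    }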
-531,8 +531,8 @@ namespace viennacl // vector expression on rhs - template - void inner_prod_impl(vector_base const & vec1, + template + void inner_prod_impl(vector_base const & vec1, viennacl::vector_expression const & vec2, scalar & result) { @@ -562,9 +562,9 @@ namespace viennacl * @param vec2 The second vector * @param result The result scalar (on the gpu) */ - template - void inner_prod_cpu(vector_base const & vec1, - vector_base const & vec2, + template + void inner_prod_cpu(vector_base const & vec1, + vector_base const & vec2, T & result) { assert( vec1.size() == vec2.size() && bool("Size mismatch") ); @@ -592,9 +592,9 @@ namespace viennacl } // vector expression on lhs - template + template void inner_prod_cpu(viennacl::vector_expression const & vec1, - vector_base const & vec2, + vector_base const & vec2, T & result) { viennacl::vector temp = vec1; @@ -603,8 +603,8 @@ namespace viennacl // vector expression on rhs - template - void inner_prod_cpu(vector_base const & vec1, + template + void inner_prod_cpu(vector_base const & vec1, viennacl::vector_expression const & vec2, T & result) { @@ -633,10 +633,10 @@ namespace viennacl * @param y_tuple A collection of vector, all of the same size. * @param result The result scalar (on the gpu). Needs to match the number of elements in y_tuple */ - template - void inner_prod_impl(vector_base const & x, - vector_tuple const & y_tuple, - vector_base & result) + template + void inner_prod_impl(vector_base const & x, + vector_tuple const & y_tuple, + vector_base & result) { assert( x.size() == y_tuple.const_at(0).size() && bool("Size mismatch") ); assert( result.size() == y_tuple.const_size() && bool("Number of elements does not match result size") ); @@ -669,8 +669,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void norm_1_impl(vector_base const & vec, + template + void norm_1_impl(vector_base const & vec, scalar & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) @@ -716,8 +716,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void norm_1_cpu(vector_base const & vec, + template + void norm_1_cpu(vector_base const & vec, T & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) @@ -763,8 +763,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void norm_2_impl(vector_base const & vec, + template + void norm_2_impl(vector_base const & vec, scalar & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) @@ -808,8 +808,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void norm_2_cpu(vector_base const & vec, + template + void norm_2_cpu(vector_base const & vec, T & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) @@ -855,8 +855,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void norm_inf_impl(vector_base const & vec, + template + void norm_inf_impl(vector_base const & vec, scalar & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) @@ -900,8 +900,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void norm_inf_cpu(vector_base const & vec, + template + void norm_inf_cpu(vector_base const & vec, T & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) @@ -948,8 +948,8 @@ namespace viennacl * @param vec The vector * @return The result. 
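For orientation, the dispatchers above are what ultimately run when user code evaluates reductions on a vector: the scalar_expression returned by the convenience functions converts to a CPU value and thereby triggers the *_cpu paths. A usage sketch with the conventional ViennaCL API (its meaning is unchanged by this patch):

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/inner_prod.hpp"
    #include "viennacl/linalg/norm_1.hpp"
    #include "viennacl/linalg/norm_2.hpp"
    #include "viennacl/linalg/norm_inf.hpp"

    void norm_usage_sketch()
    {
      viennacl::vector<float> x(1000), y(1000);
      float s  = viennacl::linalg::inner_prod(x, y);  // dispatches to inner_prod_cpu()
      float n1 = viennacl::linalg::norm_1(x);         // dispatches to norm_1_cpu()
      float n2 = viennacl::linalg::norm_2(x);         // dispatches to norm_2_cpu()
      float ni = viennacl::linalg::norm_inf(x);       // dispatches to norm_inf_cpu()
      (void)s; (void)n1; (void)n2; (void)ni;
    }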
Note that the result must be a CPU scalar */ - template - vcl_size_t index_norm_inf(vector_base const & vec) + template + vcl_size_t index_norm_inf(vector_base const & vec) { switch (viennacl::traits::handle(vec).get_active_handle_id()) { @@ -988,8 +988,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void max_impl(vector_base const & vec, viennacl::scalar & result) + template + void max_impl(vector_base const & vec, viennacl::scalar & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) { @@ -1031,8 +1031,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void max_cpu(vector_base const & vec, T & result) + template + void max_cpu(vector_base const & vec, T & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) { @@ -1075,8 +1075,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void min_impl(vector_base const & vec, viennacl::scalar & result) + template + void min_impl(vector_base const & vec, viennacl::scalar & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) { @@ -1118,8 +1118,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void min_cpu(vector_base const & vec, T & result) + template + void min_cpu(vector_base const & vec, T & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) { @@ -1162,8 +1162,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void sum_impl(vector_base const & vec, viennacl::scalar & result) + template + void sum_impl(vector_base const & vec, viennacl::scalar & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) { @@ -1205,8 +1205,8 @@ namespace viennacl * @param vec The vector * @param result The result scalar */ - template - void sum_cpu(vector_base const & vec, T & result) + template + void sum_cpu(vector_base const & vec, T & result) { switch (viennacl::traits::handle(vec).get_active_handle_id()) { @@ -1255,9 +1255,9 @@ namespace viennacl * @param alpha The first transformation coefficient (CPU scalar) * @param beta The second transformation coefficient (CPU scalar) */ - template - void plane_rotation(vector_base & vec1, - vector_base & vec2, + template + void plane_rotation(vector_base & vec1, + vector_base & vec2, T alpha, T beta) { switch (viennacl::traits::handle(vec1).get_active_handle_id()) @@ -1293,9 +1293,9 @@ namespace viennacl * @param vec1 Input vector. * @param vec2 The output vector. */ - template - void inclusive_scan(vector_base & vec1, - vector_base & vec2) + template + void inclusive_scan(vector_base & vec1, + vector_base & vec2) { switch (viennacl::traits::handle(vec1).get_active_handle_id()) { @@ -1326,8 +1326,8 @@ namespace viennacl * Given an input element vector (x_0, x_1, ..., x_{n-1}), * this routine overwrites the vector with (x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-1}) */ - template - void inclusive_scan(vector_base & vec) + template + void inclusive_scan(vector_base & vec) { inclusive_scan(vec, vec); } @@ -1343,9 +1343,9 @@ namespace viennacl * @param vec1 Input vector. * @param vec2 The output vector. 
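plane_rotation() above is the BLAS-style Givens rotation applied element-wise to the two vectors. A self-contained, sequential sketch of the arithmetic it dispatches to (names illustrative):

    #include <cstddef>
    #include <vector>

    template<typename T>
    void plane_rotation_sketch(std::vector<T> & x, std::vector<T> & y, T alpha, T beta)
    {
      for (std::size_t i = 0; i < x.size(); ++i)
      {
        T x_old = x[i];
        x[i] = alpha * x_old + beta * y[i];   // x_i <- alpha*x_i + beta*y_i
        y[i] = alpha * y[i]  - beta * x_old;  // y_i <- alpha*y_i - beta*x_i (using the original x_i)
      }
    }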
*/ - template - void exclusive_scan(vector_base & vec1, - vector_base & vec2) + template + void exclusive_scan(vector_base & vec1, + vector_base & vec2) { switch (viennacl::traits::handle(vec1).get_active_handle_id()) { @@ -1376,15 +1376,15 @@ namespace viennacl * Given an element vector (x_0, x_1, ..., x_{n-1}), * this routine overwrites the input vector with (0, x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-2}) */ - template - void exclusive_scan(vector_base & vec) + template + void exclusive_scan(vector_base & vec) { exclusive_scan(vec, vec); } } //namespace linalg - template - vector_base & operator += (vector_base & v1, const vector_expression & proxy) + template + vector_base & operator += (vector_base & v1, const vector_expression & proxy) { assert( (viennacl::traits::size(proxy) == v1.size()) && bool("Incompatible vector sizes!")); assert( (v1.size() > 0) && bool("Vector not yet initialized!") ); @@ -1394,8 +1394,8 @@ namespace viennacl return v1; } - template - vector_base & operator -= (vector_base & v1, const vector_expression & proxy) + template + vector_base & operator -= (vector_base & v1, const vector_expression & proxy) { assert( (viennacl::traits::size(proxy) == v1.size()) && bool("Incompatible vector sizes!")); assert( (v1.size() > 0) && bool("Vector not yet initialized!") ); diff --git a/viennacl/meta/result_of.hpp b/viennacl/meta/result_of.hpp index 32a0491a2f87ea4ccf59b8314611bc9afef3b329..67cb5384e615215efb87886af8c628ee28c81449 100644 --- a/viennacl/meta/result_of.hpp +++ b/viennacl/meta/result_of.hpp @@ -79,8 +79,8 @@ struct alignment enum { value = alignment::value }; }; -template -struct alignment< vector > +template +struct alignment< vector > { enum { value = AlignmentV }; }; @@ -142,8 +142,8 @@ struct size_type }; /** \cond */ -template -struct size_type< vector_base > +template +struct size_type< vector_base > { typedef SizeType type; }; @@ -339,14 +339,14 @@ struct cpu_value_type typedef double type; }; -template -struct cpu_value_type > +template +struct cpu_value_type > { typedef T type; }; -template -struct cpu_value_type > +template +struct cpu_value_type > { typedef T type; }; diff --git a/viennacl/ocl/context.hpp b/viennacl/ocl/context.hpp index b8b7f19f2af668fc79fd1fcef98a9bcce2e0c877..52f638f08b51fc3715808edf69776816dc28401e 100644 --- a/viennacl/ocl/context.hpp +++ b/viennacl/ocl/context.hpp @@ -31,8 +31,15 @@ #include #include #include +#include #include #include +#include +#include +#include +#include +#include +#include #include "viennacl/ocl/forwards.h" #include "viennacl/ocl/error.hpp" #include "viennacl/ocl/handle.hpp" @@ -42,11 +49,102 @@ #include "viennacl/ocl/platform.hpp" #include "viennacl/ocl/command_queue.hpp" #include "viennacl/tools/sha1.hpp" +#include "viennacl/ocl/mempool/bitlog.hpp" #include "viennacl/tools/shared_ptr.hpp" +#include "viennacl/ocl/mempool/mempool.hpp" + + namespace viennacl { namespace ocl { + + // {{{ allocator class + + class cl_allocator_base + { + protected: + viennacl::ocl::context* m_context; + cl_mem_flags m_flags; + + public: + // CTOR + cl_allocator_base(viennacl::ocl::context* const &ctx, + cl_mem_flags flags=CL_MEM_READ_WRITE) + : m_context(ctx), m_flags(flags) + { + if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) + { + std::cerr << "[Allocator]: cannot specify USE_HOST_PTR or " + "COPY_HOST_PTR flags" << std::endl; + throw viennacl::ocl::invalid_value(); + } + } + + + // Copy CTOR + cl_allocator_base(cl_allocator_base const &src) + : m_context(src.m_context), m_flags(src.m_flags) + { } + + + 
~cl_allocator_base() + { } + + void free(cl_mem p) + { + cl_int err = clReleaseMemObject(p); + VIENNACL_ERR_CHECK(err); +#ifdef VIENNACL_DEBUG_ALL + std :: cout << "[allocator]: deallocating memory: " << p << std::endl; +#endif + } + + virtual cl_allocator_base *copy() const = 0; + virtual bool is_deferred() const = 0; + virtual cl_mem allocate(size_t) = 0; + }; + + class cl_immediate_allocator : public cl_allocator_base + { + private: + viennacl::ocl::command_queue* m_queue; + + public: + // NOTE: Changed the declaration as viennacl comman=d queue does nt store + // the context + // + + cl_immediate_allocator(viennacl::ocl::context* const &ctx, + viennacl::ocl::command_queue* const &queue, + cl_mem_flags flags=CL_MEM_READ_WRITE) + : cl_allocator_base(ctx, flags), + m_queue(queue) + { } + + cl_immediate_allocator(cl_immediate_allocator const &src) + : cl_allocator_base(src), m_queue(src.m_queue) + { } + + cl_immediate_allocator *copy() const + { + return new cl_immediate_allocator(*this); + } + + inline cl_mem allocate(size_t s); + bool is_deferred() const + { return false; } + + virtual ~cl_immediate_allocator() + {} + }; + + + // }}} + + + // }}} + /** @brief Manages an OpenCL context and provides the respective convenience functions for creating buffers, etc. * * This class was originally written before the OpenCL C++ bindings were standardized. @@ -84,6 +182,8 @@ public: /** @brief Sets the maximum number of devices to be set up for the context */ void default_device_num(vcl_size_t new_num) { default_device_num_ = new_num; } + /** Creating a memory pool */ + ////////// get and set preferred device type ///////////////////// /** @brief Returns the default device type for the context */ cl_device_type default_device_type() @@ -197,10 +297,23 @@ public: * @param flags OpenCL flags for the buffer creation * @param size Size of the memory buffer in bytes * @param ptr Optional pointer to CPU memory, with which the OpenCL memory should be initialized + * @param use_mempool Optional boolean to create memory through the memory pool. * @return A plain OpenCL handle. Either assign it to a viennacl::ocl::handle directly, or make sure that you free to memory manually if you no longer need the allocated memory. 
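For reference, a sketch of exercising the pool-backed allocation path (the create_memory_without_smart_handle() overload shown just below) directly. It assumes the context has already been set up with at least one device and queue, so that an allocator and a pool have been registered for it; the 1 KiB size is arbitrary.

#include "viennacl/ocl/backend.hpp"   // for viennacl::ocl::current_context()

void mempool_buffer_sketch()
{
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();

  // request 1 KiB through the memory pool instead of a fresh clCreateBuffer():
  cl_mem buf = ctx.create_memory_without_smart_handle(CL_MEM_READ_WRITE, 1024,
                                                      NULL, /* use_mempool = */ true);

  // pool-backed buffers are reference counted by the context's pool:
  ctx.increment_mem_ref_counter(buf, 1024);
  ctx.decrement_mem_ref_counter(buf, 1024);   // count reaches zero -> block returns to the pool
}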
*/ - cl_mem create_memory_without_smart_handle(cl_mem_flags flags, unsigned int size, void * ptr = NULL) const + cl_mem create_memory_without_smart_handle(cl_mem_flags flags, unsigned int size, void * ptr = NULL, bool use_mempool = false) const { + + if(use_mempool){ +#ifdef VIENNACL_DEBUG_ALL + std::cout << "[mempool]: querying for memory\n"; +#endif + cl_mem mem = get_mempool()->allocate(size); +#ifdef VIENNACL_DEBUG_ALL + std::cout << "[mempool]: gave memory at: " << mem << std::endl; + +#endif + return mem; + } #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT) std::cout << "ViennaCL: Creating memory of size " << size << " for context " << h_ << " (unsafe, returning cl_mem directly)" << std::endl; #endif @@ -212,6 +325,18 @@ public: return mem; } + /** @brief Decerements the reference count of the memory in the memory pool **/ + void decrement_mem_ref_counter(cl_mem p, vcl_size_t s) const + { + get_mempool()->decrement_ref_counter(p, s); + } + + /** @brief Incerements the reference count of the memory in the memory pool **/ + void increment_mem_ref_counter(cl_mem p, vcl_size_t s) const + { + get_mempool()->increment_ref_counter(p, s); + } + /** @brief Creates a memory buffer within the context * @@ -251,6 +376,7 @@ public: /** @brief Adds a queue for the given device to the context */ void add_queue(cl_device_id dev) { + #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_CONTEXT) std::cout << "ViennaCL: Adding new queue for device " << dev << " to context " << h_ << std::endl; #endif @@ -263,6 +389,18 @@ public: VIENNACL_ERR_CHECK(err); queues_[dev].push_back(viennacl::ocl::command_queue(temp)); + + // register the allocator for the device + if(allocators_.find(dev) == allocators_.end()) + { + // did not find an queue for the present device => allot one + allocators_[dev] = tools::shared_ptr(new + cl_immediate_allocator(this, + &(queues_[dev][0]), + CL_MEM_READ_WRITE)); + mempools_[dev] = tools::shared_ptr> (new + memory_pool(*allocators_[dev])); + } } /** @brief Adds a queue for the given device to the context */ @@ -279,6 +417,15 @@ public: return queues_[devices_[current_device_id_].id()][current_queue_id_]; } + //get current mempool + tools::shared_ptr> const& get_mempool() const + { + typedef std::map< cl_device_id, tools::shared_ptr> > MempoolContainer; + MempoolContainer::const_iterator it = mempools_.find(devices_[current_device_id_].id()); + assert (it != mempools_.end()&&bool("Did not find a memory pool.")); + return it->second; + } + viennacl::ocl::command_queue const & get_queue() const { typedef std::map< cl_device_id, std::vector > QueueContainer; @@ -765,6 +912,10 @@ private: std::string build_options_; vcl_size_t pf_index_; vcl_size_t current_queue_id_; + + // Memory pool + std::map< cl_device_id, tools::shared_ptr> allocators_; + std::map< cl_device_id, tools::shared_ptr>> mempools_; }; //context @@ -822,6 +973,53 @@ inline void viennacl::ocl::kernel::set_work_size_defaults() } } +// {{{ definition of cl_immediate_allocator::allocate + +cl_mem cl_immediate_allocator::allocate(size_t s) +{ + + cl_mem ptr = + m_context->create_memory_without_smart_handle(m_flags, s, NULL); + + // Make sure the buffer gets allocated right here and right now. + // This looks (and is) expensive. But immediate allocators + // have their main use in memory pools, whose basic assumption + // is that allocation is too expensive anyway--but they rely + // on exact 'out-of-memory' information. 
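+  // Only std::min(s, sizeof(zero)) bytes are written below, so requests
+  // smaller than sizeof(unsigned) are handled correctly as well.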
+ unsigned zero = 0; + cl_int err = clEnqueueWriteBuffer( + m_queue->handle().get(), + ptr, + /* is blocking */ CL_FALSE, + 0, std::min(s, sizeof(zero)), &zero, + 0, NULL, NULL + ); + VIENNACL_ERR_CHECK(err); + + // No need to wait for completion here. clWaitForEvents (e.g.) + // cannot return mem object allocation failures. This implies that + // the buffer is faulted onto the device on enqueue. + + return ptr; +} + +// }}} + +// {{{ pooled handle dec + +void pooled_clmem_handle::inc() +{ + p_context_->increment_mem_ref_counter(h_, m_size); +} + + +void pooled_clmem_handle::dec() +{ + p_context_->decrement_mem_ref_counter(h_, m_size); +} + +// }}} + } } diff --git a/viennacl/ocl/forwards.h b/viennacl/ocl/forwards.h index a3603e913e84eaf27dd3a1e537ecc5c9d1b72487..faa0d30a0b29ee7a53ecf2de83e559009c37de81 100644 --- a/viennacl/ocl/forwards.h +++ b/viennacl/ocl/forwards.h @@ -40,16 +40,17 @@ namespace viennacl /** @brief A tag denoting the default OpenCL device type (SDK-specific) */ struct default_tag {}; - class kernel; class device; class command_queue; class context; class program; - template + template class handle; + class pooled_clmem_handle; + template void enqueue(KernelType & k, viennacl::ocl::command_queue const & queue); diff --git a/viennacl/ocl/handle.hpp b/viennacl/ocl/handle.hpp index 8283f848cc0f9146b16e1e251176bb10e20a5904..e4e5f6ba0ddd4bb29427c34372ba7b3eccf0620c 100644 --- a/viennacl/ocl/handle.hpp +++ b/viennacl/ocl/handle.hpp @@ -33,6 +33,7 @@ #include #include "viennacl/ocl/forwards.h" #include "viennacl/ocl/error.hpp" +#include "viennacl/forwards.h" namespace viennacl { @@ -154,7 +155,7 @@ namespace viennacl handle() : h_(0), p_context_(NULL) {} handle(const OCL_TYPE & something, viennacl::ocl::context const & c) : h_(something), p_context_(&c) {} handle(const handle & other) : h_(other.h_), p_context_(other.p_context_) { if (h_ != 0) inc(); } - ~handle() { if (h_ != 0) dec(); } + virtual ~handle() { if (h_ != 0) dec(); } /** @brief Copies the OpenCL handle from the provided handle. Does not take ownership like e.g. std::auto_ptr<>, so both handle objects are valid (more like shared_ptr). */ handle & operator=(const handle & other) @@ -190,12 +191,12 @@ namespace viennacl const OCL_TYPE & get() const { return h_; } - viennacl::ocl::context const & context() const + virtual viennacl::ocl::context const & context() const { assert(p_context_ != NULL && bool("Logic error: Accessing dangling context from handle.")); return *p_context_; } - void context(viennacl::ocl::context const & c) { p_context_ = &c; } + virtual void context(viennacl::ocl::context const & c) { p_context_ = &c; } /** @brief Swaps the OpenCL handle of two handle objects */ @@ -208,19 +209,89 @@ namespace viennacl viennacl::ocl::context const * tmp2 = other.p_context_; other.p_context_ = this->p_context_; this->p_context_ = tmp2; - return *this; } /** @brief Manually increment the OpenCL reference count. Typically called automatically, but is necessary if user-supplied memory objects are wrapped. */ - void inc() { handle_inc_dec_helper::inc(h_); } + virtual void inc() { handle_inc_dec_helper::inc(h_); } /** @brief Manually decrement the OpenCL reference count. Typically called automatically, but might be useful with user-supplied memory objects. 
*/ - void dec() { handle_inc_dec_helper::dec(h_); } - private: + virtual void dec() { handle_inc_dec_helper::dec(h_); } + protected: OCL_TYPE h_; viennacl::ocl::context const * p_context_; }; + // {{{ pooled handle + // + + class pooled_clmem_handle: public handle + { + protected: + typedef handle super; + + public: + pooled_clmem_handle() : super(), m_size(0) {} + pooled_clmem_handle(const cl_mem & something, viennacl::ocl::context const & c, vcl_size_t & _s) : super(something, c), m_size(_s) + {if(h_!=0) + { + inc(); + cl_int err = clRetainMemObject(something); + VIENNACL_ERR_CHECK(err); + } + } + pooled_clmem_handle(const pooled_clmem_handle & other) : super(other), m_size(other.m_size) + { + if(h_!=0) + inc(); + } + + pooled_clmem_handle & operator=(const pooled_clmem_handle & other) + { + if (h_ != 0) + dec(); + h_ = other.h_; + p_context_ = other.p_context_; + m_size = other.m_size; + inc(); + return *this; + } + + pooled_clmem_handle & operator=(const cl_mem & something) + { + std::cerr << "[pooled_handle]: Pooled handle needs to know about size\n"; + throw std::exception(); + return *this; + } + + /** @brief Swaps the OpenCL handle of two handle objects */ + pooled_clmem_handle & swap(pooled_clmem_handle & other) + { + cl_mem tmp = other.h_; + other.h_ = this->h_; + this->h_ = tmp; + + viennacl::ocl::context const * tmp2 = other.p_context_; + other.p_context_ = this->p_context_; + this->p_context_ = tmp2; + + size_t tmp3 = other.m_size; + other.m_size = this->m_size; + this->m_size = tmp3; + + return *this; + } + + inline virtual void inc(); + inline virtual void dec(); + + virtual ~pooled_clmem_handle() { + if (h_!=0) dec(); + } + + private: + size_t m_size; + }; + } //namespace ocl } //namespace viennacl diff --git a/viennacl/ocl/kernel.hpp b/viennacl/ocl/kernel.hpp index 5f2cab139d87fa4410cf8ababe74822e952da8c4..900e6d80d2568502a2cb883663192b84ea10446d 100644 --- a/viennacl/ocl/kernel.hpp +++ b/viennacl/ocl/kernel.hpp @@ -223,6 +223,30 @@ namespace viennacl VIENNACL_ERR_CHECK(err); } + + //forward handles directly: + /** @brief Sets an OpenCL object at the provided position */ + template + void arg(unsigned int pos, viennacl::ocl::handle const & h) + { + CL_TYPE temp = h.get(); + #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) + std::cout << "ViennaCL: Setting handle kernel argument " << temp << " at pos " << pos << " for kernel " << name_ << std::endl; + #endif + cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(CL_TYPE), (void*)&temp); + VIENNACL_ERR_CHECK(err); + } + + void arg(unsigned int pos, viennacl::ocl::pooled_clmem_handle const & h) + { + cl_mem temp = h.get(); + #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) + std::cout << "ViennaCL: Setting handle kernel argument " << temp << " at pos " << pos << " for kernel " << name_ << std::endl; + #endif + cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(cl_mem), (void*)&temp); + VIENNACL_ERR_CHECK(err); + } + //generic handling: call .handle() member /** @brief Sets an OpenCL memory object at the provided position */ template @@ -238,19 +262,6 @@ namespace viennacl VIENNACL_ERR_CHECK(err); } - //forward handles directly: - /** @brief Sets an OpenCL object at the provided position */ - template - void arg(unsigned int pos, viennacl::ocl::handle const & h) - { - CL_TYPE temp = h.get(); - #if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_KERNEL) - std::cout << "ViennaCL: Setting handle kernel argument " << temp << " at pos " << pos << " for kernel " << name_ << std::endl; - 
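As an illustration of how the pooled handle interacts with the kernel::arg() overload added above, a sketch only: the program/kernel names, the buffer size, and the prior add_program()/build step are assumptions.

void pooled_kernel_arg_sketch()
{
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();
  // "my_program" is assumed to have been added to the context beforehand:
  viennacl::ocl::kernel & k = ctx.get_program("my_program").get_kernel("my_kernel");

  vcl_size_t bytes = 256 * sizeof(float);
  cl_mem raw = ctx.create_memory_without_smart_handle(CL_MEM_READ_WRITE,
                                                      static_cast<unsigned int>(bytes),
                                                      NULL, true /* use_mempool */);

  // the wrapper's inc()/dec() keep the pool's reference counter in sync:
  viennacl::ocl::pooled_clmem_handle buf(raw, ctx, bytes);

  k.arg(0, buf);                 // picks the pooled_clmem_handle overload
  viennacl::ocl::enqueue(k);     // assuming the kernel takes exactly this one argument
}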
#endif - cl_int err = clSetKernelArg(handle_.get(), pos, sizeof(CL_TYPE), (void*)&temp); - VIENNACL_ERR_CHECK(err); - } - //local buffer argument: /** @brief Sets an OpenCL local memory object at the provided position */ diff --git a/viennacl/ocl/mempool/bitlog.hpp b/viennacl/ocl/mempool/bitlog.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5a0ba633d455e2277a3ebbec4d1afd504c3b90d4 --- /dev/null +++ b/viennacl/ocl/mempool/bitlog.hpp @@ -0,0 +1,94 @@ +// Base-2 logarithm bithack. +// +// Copyright (C) 2009 Andreas Kloeckner +// Copyright (C) Sean Eron Anderson (in the public domain) +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + + +#ifndef _AFJDFJSDFSD_PYOPENCL_HEADER_SEEN_BITLOG_HPP +#define _AFJDFJSDFSD_PYOPENCL_HEADER_SEEN_BITLOG_HPP + + +#include +#include + +namespace viennacl +{ +namespace mempool +{ + /* from http://graphics.stanford.edu/~seander/bithacks.html */ + + + const char log_table_8[] = + { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 + }; + + inline unsigned bitlog2_16(uint16_t v) + { + if (unsigned long t = v >> 8) + return 8+log_table_8[t]; + else + return log_table_8[v]; + } + + inline unsigned bitlog2_32(uint32_t v) + { + if (uint16_t t = v >> 16) + return 16+bitlog2_16(t); + else + return bitlog2_16(v); + } + + inline unsigned bitlog2(unsigned long v) + { +#if (ULONG_MAX != 4294967295) + if (uint32_t t = v >> 32) + return 32+bitlog2_32(t); + else +#endif + return bitlog2_32(v); + } +} // namespace mempool +} // namespace viennacl + + + + + +#endif diff --git a/viennacl/ocl/mempool/mempool.hpp b/viennacl/ocl/mempool/mempool.hpp new file mode 100644 index 0000000000000000000000000000000000000000..12abdc62a56a0822e6bbf6a56d6b0023cdd28997 --- /dev/null +++ 
b/viennacl/ocl/mempool/mempool.hpp @@ -0,0 +1,357 @@ +// Abstract memory pool implementation +// +// Copyright (C) 2009-17 Andreas Kloeckner +// 2018-19 Kaushik Kulkarni +// +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + + +#ifndef VIENNACL_OCL_MEMPOOL_HPP_ +#define VIENNACL_OCL_MEMPOOL_HPP_ + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include +#include +#include +#include +#include + +#include "viennacl/ocl/mempool/bitlog.hpp" +#include "viennacl/ocl/mempool/mempool_utils.hpp" +#include "viennacl/ocl/error.hpp" + +namespace viennacl +{ +namespace ocl +{ + + template + class memory_pool : mempool::noncopyable + { + public: + typedef size_t size_type; + + private: + typedef uint32_t bin_nr_t; + typedef std::vector bin_t; + + typedef std::map container_t; + container_t m_container; + typedef typename container_t::value_type bin_pair_t; + + std::map m_reference_count; // ref counter + std::unique_ptr m_allocator; + + // A held block is one that's been released by the application, but that + // we are keeping around to dish out again. + unsigned m_held_blocks; + + // An active block is one that is in use by the application. + unsigned m_active_blocks; + + bool m_stop_holding; + int m_trace; + + public: + memory_pool(Allocator const &alloc=Allocator()) + : m_allocator(alloc.copy()), + m_held_blocks(0), m_active_blocks(0), m_stop_holding(false), + m_trace(false) + { + if (m_allocator->is_deferred()) + { + std::cerr << "Memory pools expect non-deferred " + "semantics from their allocators. You passed a deferred " + "allocator, i.e. 
an allocator whose allocations can turn out to " + "be unavailable long after allocation.\n"; + throw std::exception(); + } + } + + virtual ~memory_pool() + { + free_held(); + } + + static const unsigned mantissa_bits = 2; + static const unsigned mantissa_mask = (1 << mantissa_bits) - 1; + + static bin_nr_t bin_number(size_type size) + { + signed l = viennacl::mempool::bitlog2(size); + size_type shifted = viennacl::mempool::signed_right_shift(size, + l-signed(mantissa_bits)); + if (size && (shifted & (1 << mantissa_bits)) == 0) + throw std::runtime_error("memory_pool::bin_number: bitlog2 fault"); + size_type chopped = shifted & mantissa_mask; + return l << mantissa_bits | chopped; + } + + void set_trace(bool flag) + { + if (flag) + ++m_trace; + else + --m_trace; + } + + static size_type alloc_size(bin_nr_t bin) + { + bin_nr_t exponent = bin >> mantissa_bits; + bin_nr_t mantissa = bin & mantissa_mask; + + size_type ones = viennacl::mempool::signed_left_shift(1, + signed(exponent)-signed(mantissa_bits) + ); + if (ones) ones -= 1; + + size_type head = viennacl::mempool::signed_left_shift( + (1<second; + } + else + return it->second; + } + + void inc_held_blocks() + { + if (m_held_blocks == 0) + start_holding_blocks(); + ++m_held_blocks; + } + + void dec_held_blocks() + { + --m_held_blocks; + if (m_held_blocks == 0) + stop_holding_blocks(); + } + + virtual void start_holding_blocks() + { } + + virtual void stop_holding_blocks() + { } + + public: + cl_mem allocate(size_type size) + { + bin_nr_t bin_nr = bin_number(size); + bin_t &bin = get_bin(bin_nr); + + if (bin.size()) + { + if (m_trace) + std::cout + << "[pool] allocation of size " << size << " served from bin " << bin_nr + << " which contained " << bin.size() << " entries" << std::endl; + + cl_mem result = pop_block_from_bin(bin, size); + assert(m_reference_count.find(result) == m_reference_count.end() && bool("Memory already registered in reference counter.")); + m_reference_count[result] = 0; + cl_int err = clRetainMemObject(result); + VIENNACL_ERR_CHECK(err); + + return result; + } + + size_type alloc_sz = alloc_size(bin_nr); + + assert(bin_number(alloc_sz) == bin_nr); + + if (m_trace) + std::cout << "[pool] allocation of size " << size << " required new memory" << std::endl; + + try { + cl_mem result = get_from_allocator(alloc_sz); + + assert(m_reference_count.find(result) == m_reference_count.end() && bool("Memory already registered in reference counter.")); + + cl_int err = clRetainMemObject(result); + VIENNACL_ERR_CHECK(err); + m_reference_count[result] = 0; + + return result; + } + catch (viennacl::ocl::mem_object_allocation_failure &e) + { + throw; + } + + throw viennacl::ocl::mem_object_allocation_failure(); + } + + void free(cl_mem p, size_type size) + { +#ifdef VIENNACL_DEBUG_ALL + std::cout << "[mempool]: freeing the memory " << + p << ". 
So that it could be used again."<< std::endl; +#endif + --m_active_blocks; + bin_nr_t bin_nr = bin_number(size); + + if (!m_stop_holding) + { + inc_held_blocks(); + get_bin(bin_nr).push_back(p); + + if (m_trace) + std::cout << "[pool] block of size " << size << " returned to bin " + << bin_nr << " which now contains " << get_bin(bin_nr).size() + << " entries" << std::endl; + } + else + m_allocator->free(p); + } + + void free_held() + { + for (bin_pair_t &bin_pair: m_container) + { + bin_t &bin = bin_pair.second; + + while (bin.size()) + { + m_allocator->free(bin.back()); + bin.pop_back(); + + dec_held_blocks(); + } + } + + assert(m_held_blocks == 0); + } + + void stop_holding() + { + m_stop_holding = true; + free_held(); + } + + unsigned active_blocks() + { return m_active_blocks; } + + unsigned held_blocks() + { return m_held_blocks; } + + bool try_to_free_memory() + { + // free largest stuff first + for (bin_pair_t &bin_pair: viennacl::mempool::reverse(m_container)) + { + bin_t &bin = bin_pair.second; + + if (bin.size()) + { + m_allocator->free(bin.back()); + bin.pop_back(); + + dec_held_blocks(); + + return true; + } + } + + return false; + } + + void increment_ref_counter(cl_mem p, size_type s) + { +#ifdef VIENNACL_DEBUG_ALL + std::cout << "[mempool]: Incrementing for " << p << std::endl; +#endif + if(m_reference_count.find(p) == m_reference_count.end()) + { + std::cerr << "Did not find a memory to reference count.\n"; + throw std::exception(); + } + + ++m_reference_count[p]; + } + + void decrement_ref_counter(cl_mem p, size_type s) + { +#ifdef VIENNACL_DEBUG_ALL + std::cout << "[mempool]: Decrementing for " << p << std::endl; +#endif + if(m_reference_count.find(p) == m_reference_count.end()) + { + std::cerr << "Did not find a memory to reference count.\n"; + throw std::exception(); + } + + --m_reference_count[p]; + + if(m_reference_count[p] == 0) + { + // this is not longer useful => free it + free(p, s); + + // no longer need to store this in the map + m_reference_count.erase(p); + } + } + + private: + cl_mem get_from_allocator(size_type alloc_sz) + { + cl_mem result = m_allocator->allocate(alloc_sz); + ++m_active_blocks; + + return result; + } + + cl_mem pop_block_from_bin(bin_t &bin, size_type size) + { + cl_mem result = bin.back(); + bin.pop_back(); + + dec_held_blocks(); + ++m_active_blocks; + + return result; + } + }; +} +} + +#endif diff --git a/viennacl/ocl/mempool/mempool_utils.hpp b/viennacl/ocl/mempool/mempool_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..de04ee314aa1055ecee45faf7a41292d704f5b6c --- /dev/null +++ b/viennacl/ocl/mempool/mempool_utils.hpp @@ -0,0 +1,137 @@ +// Various odds and ends +// +// Copyright (C) 2009 Andreas Kloeckner +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
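To make the pool's bookkeeping concrete, a small sketch of driving memory_pool directly (it is normally owned by the context). The allocator type as template argument is inferred from the constructor shown above; ctx and queue are assumed to be an initialized context and one of its queues.

#include <cassert>
#include "viennacl/ocl/mempool/mempool.hpp"

void pool_sketch(viennacl::ocl::context & ctx, viennacl::ocl::command_queue & queue)
{
  viennacl::ocl::cl_immediate_allocator alloc(&ctx, &queue, CL_MEM_READ_WRITE);
  viennacl::ocl::memory_pool<viennacl::ocl::cl_immediate_allocator> pool(alloc);

  cl_mem buf = pool.allocate(1000);        // served from a bin-rounded 1023-byte block
  pool.increment_ref_counter(buf, 1000);
  pool.decrement_ref_counter(buf, 1000);   // count hits zero -> free(): block goes back into its bin

  assert(pool.active_blocks() == 0 && pool.held_blocks() == 1);
  pool.free_held();                        // hand held blocks back to the allocator
}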
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + + +#ifndef VIENNACL_MEMPOOL_UTILS_HPP +#define VIENNACL_MEMPOOL_UTILS_HPP + +#include +#include + +namespace viennacl +{ +namespace mempool +{ + + // {{{ error + class error : public std::runtime_error + { + private: + std::string m_routine; + cl_int m_code; + + // This is here because clLinkProgram returns a program + // object *just* so that there is somewhere for it to + // stuff the linker logs. :/ + bool m_program_initialized; + cl_program m_program; + + public: + error(const char *routine, cl_int c, const char *msg="") + : std::runtime_error(msg), m_routine(routine), m_code(c), + m_program_initialized(false), m_program(nullptr) + { } + + error(const char *routine, cl_program prg, cl_int c, + const char *msg="") + : std::runtime_error(msg), m_routine(routine), m_code(c), + m_program_initialized(true), m_program(prg) + { } + + virtual ~error() + { + if (m_program_initialized) + clReleaseProgram(m_program); + } + + const std::string &routine() const + { + return m_routine; + } + + cl_int code() const + { + return m_code; + } + + bool is_out_of_memory() const + { + return (code() == CL_MEM_OBJECT_ALLOCATION_FAILURE + || code() == CL_OUT_OF_RESOURCES + || code() == CL_OUT_OF_HOST_MEMORY); + } + }; + + // }}} + + template + inline T signed_left_shift(T x, signed shift_amount) + { + if (shift_amount < 0) + return x >> -shift_amount; + else + return x << shift_amount; + } + + template + inline T signed_right_shift(T x, signed shift_amount) + { + if (shift_amount < 0) + return x << -shift_amount; + else + return x >> shift_amount; + } + + // https://stackoverflow.com/a/28139075 + template + struct reversion_wrapper { T& iterable; }; + + template + auto begin (reversion_wrapper w) { return w.iterable.rbegin(); } + + template + auto end (reversion_wrapper w) { return w.iterable.rend(); } + + template + reversion_wrapper reverse (T&& iterable) { return { iterable }; } + + + // https://stackoverflow.com/a/44175911 + class noncopyable { + public: + noncopyable() = default; + ~noncopyable() = default; + + private: + noncopyable(const noncopyable&) = delete; + noncopyable& operator=(const noncopyable&) = delete; + }; + +} +} + +#endif + +// vim:foldmethod=marker diff --git a/viennacl/scalar.hpp b/viennacl/scalar.hpp index 4eaf7522a1ad9f5b04977fc2e053ab8e2b2c8c6d..9b08ca489aa931b0814555e3b1fc488bb0c5a5b7 100644 --- a/viennacl/scalar.hpp +++ b/viennacl/scalar.hpp @@ -343,12 +343,12 @@ private: * * @tparam NumericT Either float or double. Checked at compile time. */ -template +template class scalar { - typedef scalar self_type; + typedef scalar self_type; public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle handle_type; typedef vcl_size_t size_type; /** @brief Returns the underlying host scalar type. */ @@ -411,7 +411,7 @@ public: } /** @brief Assigns a vector entry. 
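A short sketch of the entry-proxy assignment documented just below; the float element type and the values are illustrative.

viennacl::vector<float> v = viennacl::scalar_vector<float>(10, 2.0f);
viennacl::scalar<float>  s = 0.0f;

s = v(3);            // copies the fourth vector entry into the device scalar via entry_proxy
float host_val = s;  // read the result back to the host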
*/ - self_type & operator= (entry_proxy const & other) + self_type & operator= (entry_proxy const & other) { init_if_necessary(viennacl::traits::context(other)); viennacl::backend::memory_copy(other.handle(), val_, other.index() * sizeof(NumericT), 0, sizeof(NumericT)); @@ -419,7 +419,7 @@ public: } /** @brief Assigns the value from another scalar. */ - self_type & operator= (scalar const & other) + self_type & operator= (scalar const & other) { init_if_necessary(viennacl::traits::context(other)); viennacl::backend::memory_copy(other.handle(), val_, 0, 0, sizeof(NumericT)); diff --git a/viennacl/sliced_ell_matrix.hpp b/viennacl/sliced_ell_matrix.hpp index f66b0d4d6807ad821c879d79be3a533f9d9a8361..3c7ab74e050daba04e9889c7de69eb2a09c99732 100644 --- a/viennacl/sliced_ell_matrix.hpp +++ b/viennacl/sliced_ell_matrix.hpp @@ -46,7 +46,7 @@ template class sliced_ell_matrix { public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle<> handle_type; typedef scalar::ResultType> value_type; typedef vcl_size_t size_type; diff --git a/viennacl/toeplitz_matrix.hpp b/viennacl/toeplitz_matrix.hpp index 1891a6aa24e0dd8573556d51f818ed4b0e15e3b7..00b998ec9856d052b7acd3754e2776cc8a4c1fcb 100644 --- a/viennacl/toeplitz_matrix.hpp +++ b/viennacl/toeplitz_matrix.hpp @@ -43,7 +43,7 @@ template class toeplitz_matrix { public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle<> handle_type; typedef scalar::ResultType> value_type; /** diff --git a/viennacl/tools/entry_proxy.hpp b/viennacl/tools/entry_proxy.hpp index 64114eb2f152d452f9034141527618f0ab409dd0..7b055794e05f2edf28e511ccd5e9903cddcc18b4 100644 --- a/viennacl/tools/entry_proxy.hpp +++ b/viennacl/tools/entry_proxy.hpp @@ -37,11 +37,11 @@ namespace viennacl * * @tparam NumericT Either float or double */ -template +template class entry_proxy { public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle handle_type; /** @brief The constructor for the proxy class. Declared explicit to avoid any surprises created by the compiler. * @@ -159,7 +159,7 @@ private: } vcl_size_t index_; - viennacl::backend::mem_handle & mem_handle_; + viennacl::backend::mem_handle & mem_handle_; }; //entry_proxy @@ -175,12 +175,12 @@ private: * * @tparam NumericT Either float or double */ -template +template class const_entry_proxy { - typedef const_entry_proxy self_type; + typedef const_entry_proxy self_type; public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle handle_type; /** @brief The constructor for the proxy class. Declared explicit to avoid any surprises created by the compiler. 
* @@ -226,7 +226,7 @@ private: } vcl_size_t index_; - viennacl::backend::mem_handle const & mem_handle_; + handle_type const & mem_handle_; }; //entry_proxy } diff --git a/viennacl/traits/context.hpp b/viennacl/traits/context.hpp index c84ab99c553a99a2b3fb5581949dc4871858f928..0613ee37b4552bbdc399d8e1b4cab19e217f7d79 100644 --- a/viennacl/traits/context.hpp +++ b/viennacl/traits/context.hpp @@ -48,7 +48,8 @@ viennacl::context context(T const & t) } /** @brief Returns an ID for the currently active memory domain of an object */ -inline viennacl::context context(viennacl::backend::mem_handle const & h) +template > +inline viennacl::context context(viennacl::backend::mem_handle const & h) { #ifdef VIENNACL_WITH_OPENCL if (h.get_active_handle_id() == OPENCL_MEMORY) diff --git a/viennacl/traits/handle.hpp b/viennacl/traits/handle.hpp index 7a2af9a864933bf7098b49f4b1c57e48140bcfc6..e1d69f3df77ad8278324ba7f982439f22ca6d541 100644 --- a/viennacl/traits/handle.hpp +++ b/viennacl/traits/handle.hpp @@ -38,14 +38,14 @@ namespace traits // /** @brief Returns the generic memory handle of an object. Non-const version. */ template -viennacl::backend::mem_handle & handle(T & obj) +typename T::handle_type & handle(T & obj) { return obj.handle(); } /** @brief Returns the generic memory handle of an object. Const-version. */ template -viennacl::backend::mem_handle const & handle(T const & obj) +typename T::handle_type const & handle(T const & obj) { return obj.handle(); } @@ -58,96 +58,96 @@ inline long handle(long val) { return val; } //for unification purposes whe inline float handle(float val) { return val; } //for unification purposes when passing CPU-scalars to kernels inline double handle(double val) { return val; } //for unification purposes when passing CPU-scalars to kernels -template -viennacl::backend::mem_handle & handle(viennacl::scalar_expression< const LHS, const RHS, OP> & obj) +template> +viennacl::backend::mem_handle & handle(viennacl::scalar_expression< const LHS, const RHS, OP> & obj) { return handle(obj.lhs()); } -template -viennacl::backend::mem_handle const & handle(viennacl::matrix_expression const & obj); +template> +viennacl::backend::mem_handle const & handle(viennacl::matrix_expression const & obj); -template -viennacl::backend::mem_handle const & handle(viennacl::vector_expression const & obj); +template> +viennacl::backend::mem_handle const & handle(viennacl::vector_expression const & obj); -template -viennacl::backend::mem_handle const & handle(viennacl::scalar_expression< const LHS, const RHS, OP> const & obj) +template> +viennacl::backend::mem_handle const & handle(viennacl::scalar_expression< const LHS, const RHS, OP> const & obj) { return handle(obj.lhs()); } // proxy objects require extra care (at the moment) -template -viennacl::backend::mem_handle & handle(viennacl::vector_base & obj) +template> +viennacl::backend::mem_handle & handle(viennacl::vector_base & obj) { return obj.handle(); } -template -viennacl::backend::mem_handle const & handle(viennacl::vector_base const & obj) +template> +viennacl::backend::mem_handle const & handle(viennacl::vector_base const & obj) { return obj.handle(); } -template -viennacl::backend::mem_handle & handle(viennacl::matrix_range & obj) +template> +viennacl::backend::mem_handle & handle(viennacl::matrix_range & obj) { return obj.get().handle(); } -template -viennacl::backend::mem_handle const & handle(viennacl::matrix_range const & obj) +template> +viennacl::backend::mem_handle const & handle(viennacl::matrix_range const & obj) { return 
obj.get().handle(); } -template -viennacl::backend::mem_handle & handle(viennacl::matrix_slice & obj) +template> +viennacl::backend::mem_handle & handle(viennacl::matrix_slice & obj) { return obj.get().handle(); } -template -viennacl::backend::mem_handle const & handle(viennacl::matrix_slice const & obj) +template> +viennacl::backend::mem_handle const & handle(viennacl::matrix_slice const & obj) { return obj.get().handle(); } -template -viennacl::backend::mem_handle const & handle(viennacl::vector_expression const & obj) +template +viennacl::backend::mem_handle const & handle(viennacl::vector_expression const & obj) { return handle(obj.lhs()); } -template -viennacl::backend::mem_handle const & handle(viennacl::vector_expression const & obj) +template> +viennacl::backend::mem_handle const & handle(viennacl::vector_expression const & obj) { return handle(obj.rhs()); } -template -viennacl::backend::mem_handle const & handle(viennacl::vector_expression const & obj) +template> +viennacl::backend::mem_handle const & handle(viennacl::vector_expression const & obj) { return handle(obj.rhs()); } -template -viennacl::backend::mem_handle const & handle(viennacl::matrix_expression const & obj) +template +viennacl::backend::mem_handle const & handle(viennacl::matrix_expression const & obj) { return handle(obj.lhs()); } -template -viennacl::backend::mem_handle const & handle(viennacl::matrix_expression const & obj) +template> +viennacl::backend::mem_handle const & handle(viennacl::matrix_expression const & obj) { return handle(obj.rhs()); } -template -viennacl::backend::mem_handle const & handle(viennacl::matrix_expression const & obj) +template> +viennacl::backend::mem_handle const & handle(viennacl::matrix_expression const & obj) { return handle(obj.rhs()); } @@ -158,36 +158,50 @@ viennacl::backend::mem_handle const & handle(viennacl::matrix_expression -typename viennacl::backend::mem_handle::ram_handle_type & ram_handle(T & obj) +template> +typename viennacl::backend::mem_handle::ram_handle_type & ram_handle(T & obj) { return viennacl::traits::handle(obj).ram_handle(); } /** @brief Generic helper routine for extracting the RAM handle of a ViennaCL object. Const version. */ -template -typename viennacl::backend::mem_handle::ram_handle_type const & ram_handle(T const & obj) +template> +typename viennacl::backend::mem_handle::ram_handle_type const & ram_handle(T const & obj) { return viennacl::traits::handle(obj).ram_handle(); } /** \cond */ -inline viennacl::backend::mem_handle::ram_handle_type & ram_handle(viennacl::backend::mem_handle & h) +template +inline viennacl::tools::shared_ptr & ram_handle(viennacl::backend::mem_handle & h) { return h.ram_handle(); } -inline viennacl::backend::mem_handle::ram_handle_type const & ram_handle(viennacl::backend::mem_handle const & h) +template > +inline viennacl::tools::shared_ptr const & ram_handle(viennacl::backend::mem_handle const & h) { return h.ram_handle(); } -/** \endcond */ +/** \endcond */ // // OpenCL handle extraction // #ifdef VIENNACL_WITH_OPENCL /** @brief Generic helper routine for extracting the OpenCL handle of a ViennaCL object. Non-const version. 
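The vector-specific overloads added just below expose a vector's buffer as a pooled handle. A sketch, assuming the OpenCL backend is enabled and that vector_base's OpenCL handle type is indeed pooled_clmem_handle (the template defaults are elided in this listing):

viennacl::vector<float> v(32);

viennacl::ocl::pooled_clmem_handle const & ph = viennacl::traits::opencl_handle(v);
cl_mem raw = ph.get();   // plain cl_mem, e.g. for handing to external OpenCL code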
*/ +template +viennacl::ocl::pooled_clmem_handle & opencl_handle(viennacl::vector_base & obj) +{ + return viennacl::traits::handle(obj).opencl_handle(); +} + +template +viennacl::ocl::pooled_clmem_handle const & opencl_handle(viennacl::vector_base const & obj) +{ + return viennacl::traits::handle(obj).opencl_handle(); +} + template viennacl::ocl::handle & opencl_handle(T & obj) { @@ -220,6 +234,12 @@ viennacl::ocl::handle const & opencl_handle(viennacl::vector_expression< return viennacl::traits::handle(obj.rhs()).opencl_handle(); } +template +viennacl::ocl::pooled_clmem_handle const & opencl_handle(viennacl::vector_expression, op_prod> const & obj) +{ + return viennacl::traits::handle(obj.rhs()).opencl_handle(); +} + template viennacl::ocl::context & opencl_context(T const & obj) { diff --git a/viennacl/traits/size.hpp b/viennacl/traits/size.hpp index 2e2e0d763f304c7b31e71dbac866379c0252f60f..3f3e1769b518458f8bc5f30dba809fe6fe97ac67 100644 --- a/viennacl/traits/size.hpp +++ b/viennacl/traits/size.hpp @@ -384,8 +384,8 @@ vcl_size_t size(vector_expression -vcl_size_t internal_size(vector_base const & vec) +template +vcl_size_t internal_size(vector_base const & vec) { return vec.internal_size(); } diff --git a/viennacl/traits/start.hpp b/viennacl/traits/start.hpp index c81a3b35d28c772a8368bea557be677e91d6ea94..a01581e2c45565a206a4a4c9247b7a815563ec3d 100644 --- a/viennacl/traits/start.hpp +++ b/viennacl/traits/start.hpp @@ -47,9 +47,9 @@ start(T const & obj) } //ViennaCL vector leads to start index 0: -template -typename result_of::size_type >::type -start(viennacl::vector const &) +template +typename result_of::size_type >::type +start(viennacl::vector const &) { return 0; } diff --git a/viennacl/traits/stride.hpp b/viennacl/traits/stride.hpp index 68c46814ae316fa222fc00bc167e9a9eee191c2b..dcc79268412fbea1a89d9e828dd2b1daa07efc1e 100644 --- a/viennacl/traits/stride.hpp +++ b/viennacl/traits/stride.hpp @@ -40,9 +40,9 @@ namespace traits // // inc: Increment for vectors. Defaults to 1 // -template -typename result_of::size_type< viennacl::vector_base >::type -stride(viennacl::vector_base const & s) { return s.stride(); } +template +typename result_of::size_type< viennacl::vector_base >::type +stride(viennacl::vector_base const & s) { return s.stride(); } // // inc1: Row increment for matrices. 
Defaults to 1 diff --git a/viennacl/vandermonde_matrix.hpp b/viennacl/vandermonde_matrix.hpp index d3f3a66c2abf542ddec2ec513c8681856068ba33..6a9eff153ff984e11662c2d629ecd6a146e28662 100644 --- a/viennacl/vandermonde_matrix.hpp +++ b/viennacl/vandermonde_matrix.hpp @@ -44,7 +44,7 @@ template class vandermonde_matrix { public: - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle<> handle_type; typedef scalar::ResultType> value_type; /** diff --git a/viennacl/vector.hpp b/viennacl/vector.hpp index 69bf3d4c38de643a235613628b09b9a28eac62bc..c735044f812aec747a586c59e613422c50a0d5d9 100644 --- a/viennacl/vector.hpp +++ b/viennacl/vector.hpp @@ -104,15 +104,15 @@ private: * @tparam NumericT The underlying floating point type (either float or double) * @tparam AlignmentV Alignment of the underlying vector, @see vector */ -template +template class const_vector_iterator { - typedef const_vector_iterator self_type; + typedef const_vector_iterator self_type; public: typedef scalar value_type; typedef vcl_size_t size_type; typedef vcl_ptrdiff_t difference_type; - typedef viennacl::backend::mem_handle handle_type; + typedef viennacl::backend::mem_handle handle_type; //const_vector_iterator() {} @@ -122,7 +122,7 @@ public: * @param start First index of the element in the vector pointed to be the iterator (for vector_range and vector_slice) * @param stride Stride for the support of vector_slice */ - const_vector_iterator(vector_base const & vec, + const_vector_iterator(vector_base const & vec, size_type index, size_type start = 0, size_type stride = 1) : elements_(vec.handle()), index_(index), start_(start), stride_(stride) {} @@ -142,7 +142,7 @@ public: value_type operator*(void) const { value_type result; - result = const_entry_proxy(start_ + index_ * stride(), elements_); + result = const_entry_proxy(start_ + index_ * stride(), elements_); return result; } self_type operator++(void) { ++index_; return *this; } @@ -201,11 +201,11 @@ protected: * @tparam NumericT The underlying floating point type (either float or double) * @tparam AlignmentV Alignment of the underlying vector, @see vector */ -template -class vector_iterator : public const_vector_iterator +template +class vector_iterator : public const_vector_iterator { - typedef const_vector_iterator base_type; - typedef vector_iterator self_type; + typedef const_vector_iterator base_type; + typedef vector_iterator self_type; public: typedef typename base_type::handle_type handle_type; typedef typename base_type::size_type size_type; @@ -221,15 +221,15 @@ public: * @param start Offset from the beginning of the underlying vector (for ranges and slices) * @param stride Stride for slices */ - vector_iterator(vector_base & vec, + vector_iterator(vector_base & vec, size_type index, size_type start = 0, size_type stride = 1) : base_type(vec, index, start, stride), elements_(vec.handle()) {} //vector_iterator(base_type const & b) : base_type(b) {} - entry_proxy operator*(void) + entry_proxy operator*(void) { - return entry_proxy(base_type::start_ + base_type::index_ * base_type::stride(), elements_); + return entry_proxy(base_type::start_ + base_type::index_ * base_type::stride(), elements_); } difference_type operator-(self_type const & other) const { difference_type result = base_type::index_; return (result - static_cast(other.index_)); } @@ -247,136 +247,138 @@ private: }; -template -vector_base::vector_base() : size_(0), start_(0), stride_(1), internal_size_(0) { /* Note: One must not call ::init() here because a 
vector might have been created globally before the backend has become available */ } +template +vector_base::vector_base() : size_(0), start_(0), stride_(1), internal_size_(0) { /* Note: One must not call ::init() here because a vector might have been created globally before the backend has become available */ } -template -vector_base::vector_base(viennacl::backend::mem_handle & h, - size_type vec_size, size_type vec_start, size_type vec_stride) - : size_(vec_size), start_(vec_start), stride_(vec_stride), internal_size_(vec_size), elements_(h) {} +template +vector_base::vector_base(viennacl::backend::mem_handle & h, + size_type vec_size, size_type vec_start, size_type vec_stride) +: size_(vec_size), start_(vec_start), stride_(vec_stride), internal_size_(vec_size), elements_(h) {} -template -vector_base::vector_base(size_type vec_size, viennacl::context ctx) - : size_(vec_size), start_(0), stride_(1), internal_size_(viennacl::tools::align_to_multiple(size_, dense_padding_size)) +template +vector_base::vector_base(size_type vec_size, viennacl::context ctx) +: size_(vec_size), start_(0), stride_(1), internal_size_(viennacl::tools::align_to_multiple(size_, dense_padding_size)) { - if (size_ > 0) - { - viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), ctx); - clear(); - } +if (size_ > 0) +{ + // [kk:] this is the constructor that we are concerned about + viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), ctx, NULL); + clear(); } +} + // CUDA or host memory: -template -vector_base::vector_base(NumericT * ptr_to_mem, viennacl::memory_types mem_type, size_type vec_size, vcl_size_t start, size_type stride) - : size_(vec_size), start_(start), stride_(stride), internal_size_(vec_size) +template +vector_base::vector_base(NumericT * ptr_to_mem, viennacl::memory_types mem_type, size_type vec_size, vcl_size_t start, size_type stride) +: size_(vec_size), start_(start), stride_(stride), internal_size_(vec_size) +{ +if (mem_type == viennacl::CUDA_MEMORY) { - if (mem_type == viennacl::CUDA_MEMORY) - { #ifdef VIENNACL_WITH_CUDA - elements_.switch_active_handle_id(viennacl::CUDA_MEMORY); - elements_.cuda_handle().reset(reinterpret_cast(ptr_to_mem)); - elements_.cuda_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed. + elements_.switch_active_handle_id(viennacl::CUDA_MEMORY); + elements_.cuda_handle().reset(reinterpret_cast(ptr_to_mem)); + elements_.cuda_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed. #else - throw cuda_not_available_exception(); + throw cuda_not_available_exception(); #endif - } - else if (mem_type == viennacl::MAIN_MEMORY) - { - elements_.switch_active_handle_id(viennacl::MAIN_MEMORY); - elements_.ram_handle().reset(reinterpret_cast(ptr_to_mem)); - elements_.ram_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed. - } +} +else if (mem_type == viennacl::MAIN_MEMORY) +{ + elements_.switch_active_handle_id(viennacl::MAIN_MEMORY); + elements_.ram_handle().reset(reinterpret_cast(ptr_to_mem)); + elements_.ram_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed. 
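The constructor around this point lets a user wrap memory they already own; the corresponding user-level sketch (C++11 std::vector::data() and the main-memory backend are assumptions):

std::vector<float> host(64, 1.0f);

// wrap the existing host buffer: no copy is made, and the inc() above prevents the
// user-owned memory from being freed when the ViennaCL object is destroyed
viennacl::vector<float> v(host.data(), viennacl::MAIN_MEMORY, host.size());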
+} - elements_.raw_size(sizeof(NumericT) * vec_size); +elements_.raw_size(sizeof(NumericT) * vec_size); } #ifdef VIENNACL_WITH_OPENCL -template -vector_base::vector_base(cl_mem existing_mem, size_type vec_size, size_type start, size_type stride, viennacl::context ctx) - : size_(vec_size), start_(start), stride_(stride), internal_size_(vec_size) +template +vector_base::vector_base(cl_mem existing_mem, size_type vec_size, size_type start, size_type stride, viennacl::context ctx) +: size_(vec_size), start_(start), stride_(stride), internal_size_(vec_size) { - elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY); - elements_.opencl_handle() = existing_mem; - elements_.opencl_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed. - elements_.opencl_handle().context(ctx.opencl_context()); - elements_.raw_size(sizeof(NumericT) * vec_size); +elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY); +elements_.opencl_handle() = existing_mem; +elements_.opencl_handle().inc(); //prevents that the user-provided memory is deleted once the vector object is destroyed. +elements_.opencl_handle().context(ctx.opencl_context()); +elements_.raw_size(sizeof(NumericT) * vec_size); } #endif -template +template template -vector_base::vector_base(vector_expression const & proxy) - : size_(viennacl::traits::size(proxy)), start_(0), stride_(1), internal_size_(viennacl::tools::align_to_multiple(size_, dense_padding_size)) +vector_base::vector_base(vector_expression const & proxy) +: size_(viennacl::traits::size(proxy)), start_(0), stride_(1), internal_size_(viennacl::tools::align_to_multiple(size_, dense_padding_size)) { - if (size_ > 0) - { - viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(proxy)); - clear(); - } - self_type::operator=(proxy); +if (size_ > 0) +{ + viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(proxy)); + clear(); +} +self_type::operator=(proxy); } // Copy CTOR: -template -vector_base::vector_base(const vector_base & other) : - size_(other.size_), start_(0), stride_(1), - internal_size_(viennacl::tools::align_to_multiple(other.size_, dense_padding_size)) +template +vector_base::vector_base(const vector_base & other) : +size_(other.size_), start_(0), stride_(1), +internal_size_(viennacl::tools::align_to_multiple(other.size_, dense_padding_size)) { - elements_.switch_active_handle_id(viennacl::traits::active_handle_id(other)); - if (internal_size() > 0) - { - viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(other)); - clear(); - self_type::operator=(other); - } +elements_.switch_active_handle_id(viennacl::traits::active_handle_id(other)); +if (internal_size() > 0) +{ + viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(other)); + clear(); + self_type::operator=(other); +} } // Conversion CTOR: -template +template template -vector_base::vector_base(const vector_base & other) : - size_(other.size()), start_(0), stride_(1), - internal_size_(viennacl::tools::align_to_multiple(other.size(), dense_padding_size)) +vector_base::vector_base(const vector_base & other) : +size_(other.size()), start_(0), stride_(1), +internal_size_(viennacl::tools::align_to_multiple(other.size(), dense_padding_size)) { - elements_.switch_active_handle_id(viennacl::traits::active_handle_id(other)); - if (internal_size() > 0) - { - 
viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(other)); - clear(); - self_type::operator=(other); - } +elements_.switch_active_handle_id(viennacl::traits::active_handle_id(other)); +if (internal_size() > 0) +{ + viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(other)); + clear(); + self_type::operator=(other); +} } -template -vector_base & vector_base::operator=(const self_type & vec) +template +vector_base & vector_base::operator=(const self_type & vec) { - assert( ( (vec.size() == size()) || (size() == 0) ) - && bool("Incompatible vector sizes!")); +assert( ( (vec.size() == size()) || (size() == 0) ) + && bool("Incompatible vector sizes!")); - if (&vec==this) - return *this; +if (&vec==this) + return *this; - if (vec.size() > 0) +if (vec.size() > 0) +{ + if (size_ == 0) { - if (size_ == 0) - { - size_ = vec.size(); - internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size); - elements_.switch_active_handle_id(vec.handle().get_active_handle_id()); - viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(vec)); - pad(); - } - - viennacl::linalg::av(*this, - vec, cpu_value_type(1.0), 1, false, false); + size_ = vec.size(); + internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size); + elements_.switch_active_handle_id(vec.handle().get_active_handle_id()); + viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(vec)); + pad(); } - return *this; + viennacl::linalg::av(*this, + vec, cpu_value_type(1.0), 1, false, false); +} + +return *this; } @@ -384,122 +386,122 @@ vector_base & vector_base +template template -vector_base & vector_base::operator=(const vector_expression & proxy) +vector_base & vector_base::operator=(const vector_expression & proxy) { - assert( ( (viennacl::traits::size(proxy) == size()) || (size() == 0) ) - && bool("Incompatible vector sizes!")); +assert( ( (viennacl::traits::size(proxy) == size()) || (size() == 0) ) + && bool("Incompatible vector sizes!")); - // initialize the necessary buffer - if (size() == 0) - { - size_ = viennacl::traits::size(proxy); - internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size); - viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(proxy)); - pad(); - } +// initialize the necessary buffer +if (size() == 0) +{ + size_ = viennacl::traits::size(proxy); + internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size); + viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(proxy)); + pad(); +} - linalg::detail::op_executor >::apply(*this, proxy); +linalg::detail::op_executor >::apply(*this, proxy); - return *this; +return *this; } // convert from vector with other numeric type -template +template template -vector_base & vector_base:: operator = (const vector_base & v1) +vector_base & vector_base:: operator = (const vector_base & v1) { - assert( ( (v1.size() == size()) || (size() == 0) ) - && bool("Incompatible vector sizes!")); +assert( ( (v1.size() == size()) || (size() == 0) ) + && bool("Incompatible vector sizes!")); - if (size() == 0) +if (size() == 0) +{ + size_ = v1.size(); + if (size_ > 0) { - size_ = v1.size(); - if (size_ > 0) - { - internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size); - 
- viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(v1));
- pad();
- }
+ internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size);
+ viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), viennacl::traits::context(v1));
+ pad();
 }
+}
- viennacl::linalg::convert(*this, v1);
+viennacl::linalg::convert(*this, v1);
- return *this;
+return *this;
 }
 /** @brief Creates the vector from the supplied unit vector. */
-template
-vector_base & vector_base::operator = (unit_vector const & v)
+template
+vector_base & vector_base::operator = (unit_vector const & v)
 {
- assert( ( (v.size() == size()) || (size() == 0) )
- && bool("Incompatible vector sizes!"));
+assert( ( (v.size() == size()) || (size() == 0) )
+ && bool("Incompatible vector sizes!"));
- if (size() == 0)
+if (size() == 0)
+{
+ size_ = v.size();
+ internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size);
+ if (size_ > 0)
 {
- size_ = v.size();
- internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size);
- if (size_ > 0)
- {
- viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), v.context());
- clear();
- }
+ viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), v.context());
+ clear();
 }
- else
- viennacl::linalg::vector_assign(*this, NumericT(0));
+}
+else
+ viennacl::linalg::vector_assign(*this, NumericT(0));
- if (size_ > 0)
- this->operator()(v.index()) = NumericT(1);
+if (size_ > 0)
+ this->operator()(v.index()) = NumericT(1);
- return *this;
+return *this;
 }
 /** @brief Creates the vector from the supplied zero vector. */
-template
-vector_base & vector_base::operator = (zero_vector const & v)
+template
+vector_base & vector_base::operator = (zero_vector const & v)
 {
- assert( ( (v.size() == size()) || (size() == 0) )
- && bool("Incompatible vector sizes!"));
+assert( ( (v.size() == size()) || (size() == 0) )
+ && bool("Incompatible vector sizes!"));
- if (size() == 0)
+if (size() == 0)
+{
+ size_ = v.size();
+ internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size);
+ if (size_ > 0)
 {
- size_ = v.size();
- internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size);
- if (size_ > 0)
- {
- viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), v.context());
- clear();
- }
+ viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), v.context());
+ clear();
 }
- else
- viennacl::linalg::vector_assign(*this, NumericT(0));
+}
+else
+ viennacl::linalg::vector_assign(*this, NumericT(0));
- return *this;
+return *this;
 }
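For reference, the unit_vector/zero_vector assignments in these hunks (and the scalar_vector one that follows) are typically used as in this hedged sketch; sizes and values are illustrative:

```cpp
#include "viennacl/vector.hpp"

void implicit_vector_assignments()
{
  viennacl::vector<float> x;                     // empty; sized by the first assignment
  x = viennacl::unit_vector<float>(10, 3);       // e_3: zeros everywhere except x[3] == 1
  x = viennacl::zero_vector<float>(10);          // all entries zero
  x = viennacl::scalar_vector<float>(10, 2.5f);  // every entry set to 2.5
}
```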
 /** @brief Creates the vector from the supplied scalar vector. */
-template
-vector_base & vector_base::operator = (scalar_vector const & v)
+template
+vector_base & vector_base::operator = (scalar_vector const & v)
 {
- assert( ( (v.size() == size()) || (size() == 0) )
- && bool("Incompatible vector sizes!"));
+assert( ( (v.size() == size()) || (size() == 0) )
+ && bool("Incompatible vector sizes!"));
- if (size() == 0)
+if (size() == 0)
+{
+ size_ = v.size();
+ internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size);
+ if (size_ > 0)
 {
- size_ = v.size();
- internal_size_ = viennacl::tools::align_to_multiple(size_, dense_padding_size);
- if (size_ > 0)
- {
- viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), v.context());
- pad();
- }
+ viennacl::backend::memory_create(elements_, sizeof(NumericT)*internal_size(), v.context());
+ pad();
 }
+}
- if (size_ > 0)
- viennacl::linalg::vector_assign(*this, v[0]);
+if (size_ > 0)
+ viennacl::linalg::vector_assign(*this, v[0]);
- return *this;
+return *this;
 }
@@ -510,46 +512,46 @@ vector_base & vector_base
-template
-vector_base & vector_base::operator=(const viennacl::vector_expression< const matrix_base, const vector_base, viennacl::op_prod> & proxy)
+template
+vector_base & vector_base::operator=(const viennacl::vector_expression< const matrix_base, const vector_base, viennacl::op_prod> & proxy)
 {
- assert(viennacl::traits::size1(proxy.lhs()) == size() && bool("Size check failed for v1 = A * v2: size1(A) != size(v1)"));
+assert(viennacl::traits::size1(proxy.lhs()) == size() && bool("Size check failed for v1 = A * v2: size1(A) != size(v1)"));
- // check for the special case x = A * x
- if (viennacl::traits::handle(proxy.rhs()) == viennacl::traits::handle(*this))
- {
- viennacl::vector result(viennacl::traits::size1(proxy.lhs()));
- viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
- *this = result;
- }
- else
- {
- viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
- }
- return *this;
+// check for the special case x = A * x
+if (viennacl::traits::handle(proxy.rhs()) == viennacl::traits::handle(*this))
+{
+ viennacl::vector result(viennacl::traits::size1(proxy.lhs()));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ *this = result;
+}
+else
+{
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
+}
+return *this;
 }
 //transposed_matrix_proxy:
-template
-vector_base & vector_base::operator=(const vector_expression< const matrix_expression< const matrix_base, const matrix_base, op_trans >,
- const vector_base,
- op_prod> & proxy)
+template
+vector_base & vector_base::operator=(const vector_expression< const matrix_expression< const matrix_base, const matrix_base, op_trans >,
+ const vector_base,
+ op_prod> & proxy)
 {
- assert(viennacl::traits::size1(proxy.lhs()) == size() && bool("Size check failed in v1 = trans(A) * v2: size2(A) != size(v1)"));
+assert(viennacl::traits::size1(proxy.lhs()) == size() && bool("Size check failed in v1 = trans(A) * v2: size2(A) != size(v1)"));
- // check for the special case x = trans(A) * x
- if (viennacl::traits::handle(proxy.rhs()) == viennacl::traits::handle(*this))
- {
- viennacl::vector result(viennacl::traits::size1(proxy.lhs()));
- viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
- *this = result;
- }
- else
- {
- viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
- }
- return *this;
+// check for the special case x = trans(A) * x
+if (viennacl::traits::handle(proxy.rhs()) == viennacl::traits::handle(*this))
+{
+ viennacl::vector result(viennacl::traits::size1(proxy.lhs()));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ *this = result;
+}
+else
+{
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
+}
+return *this;
 }
 ///////////////////////////// Matrix Vector interaction end ///////////////////////////////////
@@ -558,40 +560,40 @@ vector_base & vector_base
-entry_proxy vector_base::operator()(size_type index)
+template
+entry_proxy vector_base::operator()(size_type index)
 {
- assert( (size() > 0) && bool("Cannot apply operator() to vector of size zero!"));
- assert( index < size() && bool("Index out of bounds!") );
+assert( (size() > 0) && bool("Cannot apply operator() to vector of size zero!"));
+assert( index < size() && bool("Index out of bounds!") );
- return entry_proxy(start_ + stride_ * index, elements_);
+return entry_proxy(start_ + stride_ * index, elements_);
 }
-template
-entry_proxy vector_base::operator[](size_type index)
+template
+entry_proxy vector_base::operator[](size_type index)
 {
- assert( (size() > 0) && bool("Cannot apply operator() to vector of size zero!"));
- assert( index < size() && bool("Index out of bounds!") );
+assert( (size() > 0) && bool("Cannot apply operator() to vector of size zero!"));
+assert( index < size() && bool("Index out of bounds!") );
- return entry_proxy(start_ + stride_ * index, elements_);
+return entry_proxy(start_ + stride_ * index, elements_);
 }
-template
-const_entry_proxy vector_base::operator()(size_type index) const
+template
+const_entry_proxy vector_base::operator()(size_type index) const
 {
- assert( (size() > 0) && bool("Cannot apply operator() to vector of size zero!"));
- assert( index < size() && bool("Index out of bounds!") );
+assert( (size() > 0) && bool("Cannot apply operator() to vector of size zero!"));
+assert( index < size() && bool("Index out of bounds!") );
- return const_entry_proxy(start_ + stride_ * index, elements_);
+return const_entry_proxy(start_ + stride_ * index, elements_);
 }
-template
-const_entry_proxy vector_base::operator[](size_type index) const
+template
+const_entry_proxy vector_base::operator[](size_type index) const
 {
- assert( (size() > 0) && bool("Cannot apply operator() to vector of size zero!"));
- assert( index < size() && bool("Index out of bounds!") );
+assert( (size() > 0) && bool("Cannot apply operator() to vector of size zero!"));
+assert( index < size() && bool("Index out of bounds!") );
- return const_entry_proxy(start_ + stride_ * index, elements_);
+return const_entry_proxy(start_ + stride_ * index, elements_);
 }
 //////////////////////////// Read-write access to an element of the vector end ///////////////////
@@ -600,236 +602,236 @@ const_entry_proxy vector_base::operator[](
 //
 // Operator overloads with implicit conversion (thus cannot be made global without introducing additional headache)
 //
-template
-vector_base & vector_base::operator += (const self_type & vec)
+template
+vector_base & vector_base::operator += (const self_type & vec)
 {
- assert(vec.size() == size() && bool("Incompatible vector sizes!"));
+assert(vec.size() == size() && bool("Incompatible vector sizes!"));
- if (size() > 0)
- viennacl::linalg::avbv(*this,
- *this, NumericT(1.0), 1, false, false,
- vec, NumericT(1.0), 1, false, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::avbv(*this,
+ *this, NumericT(1.0), 1, false, false,
+ vec, NumericT(1.0), 1, false, false);
+return *this;
 }
-template
-vector_base & vector_base::operator -= (const self_type & vec)
 {
- assert(vec.size() == size() && bool("Incompatible vector sizes!"));
+assert(vec.size() == size() && bool("Incompatible vector sizes!"));
- if (size() > 0)
- viennacl::linalg::avbv(*this,
- *this, NumericT(1.0), 1, false, false,
- vec, NumericT(-1.0), 1, false, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::avbv(*this,
+ *this, NumericT(1.0), 1, false, false,
+ vec, NumericT(-1.0), 1, false, false);
+return *this;
 }
 /** @brief Scales a vector (or proxy) by a char (8-bit integer) value */
-template
-vector_base & vector_base::operator *= (char val)
+template
+vector_base & vector_base::operator *= (char val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, false, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, false, false);
+return *this;
 }
 /** @brief Scales a vector (or proxy) by a short integer value */
-template
-vector_base & vector_base::operator *= (short val)
+template
+vector_base & vector_base::operator *= (short val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, false, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, false, false);
+return *this;
 }
 /** @brief Scales a vector (or proxy) by an integer value */
-template
-vector_base & vector_base::operator *= (int val)
+template
+vector_base & vector_base::operator *= (int val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, false, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, false, false);
+return *this;
 }
 /** @brief Scales a vector (or proxy) by a long integer value */
-template
-vector_base & vector_base::operator *= (long val)
+template
+vector_base & vector_base::operator *= (long val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, false, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, false, false);
+return *this;
 }
 /** @brief Scales a vector (or proxy) by a single precision floating point value */
-template
-vector_base & vector_base::operator *= (float val)
+template
+vector_base & vector_base::operator *= (float val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, false, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, false, false);
+return *this;
 }
 /** @brief Scales a vector (or proxy) by a double precision floating point value */
-template
-vector_base & vector_base::operator *= (double val)
+template
+vector_base & vector_base::operator *= (double val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, false, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, false, false);
+return *this;
 }
 /** @brief Scales this vector by a char (8-bit) value */
-template
-vector_base & vector_base::operator /= (char val)
+template
+vector_base & vector_base::operator /= (char val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, true, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, true, false);
+return *this;
 }
 /** @brief Scales this vector by a short integer value */
-template
-vector_base & vector_base::operator /= (short val)
+template
+vector_base & vector_base::operator /= (short val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, true, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, true, false);
+return *this;
 }
 /** @brief Scales this vector by an integer value */
-template
-vector_base & vector_base::operator /= (int val)
+template
+vector_base & vector_base::operator /= (int val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, true, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, true, false);
+return *this;
 }
 /** @brief Scales this vector by a long integer value */
-template
-vector_base & vector_base::operator /= (long val)
+template
+vector_base & vector_base::operator /= (long val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, true, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, true, false);
+return *this;
 }
 /** @brief Scales this vector by a single precision floating point value */
-template
-vector_base & vector_base::operator /= (float val)
+template
+vector_base & vector_base::operator /= (float val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, true, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, true, false);
+return *this;
 }
 /** @brief Scales this vector by a double precision floating point value */
-template
-vector_base & vector_base::operator /= (double val)
+template
+vector_base & vector_base::operator /= (double val)
 {
- if (size() > 0)
- viennacl::linalg::av(*this,
- *this, NumericT(val), 1, true, false);
- return *this;
+if (size() > 0)
+ viennacl::linalg::av(*this,
+ *this, NumericT(val), 1, true, false);
+return *this;
 }
 /** @brief Scales the vector by a char (8-bit value) 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_mult>
-vector_base::operator * (char value) const
+template
+vector_expression< const vector_base, const NumericT, op_mult>
+vector_base::operator * (char value) const
 {
- return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
 }
 /** @brief Scales the vector by a short integer 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_mult>
-vector_base::operator * (short value) const
+template
+vector_expression< const vector_base, const NumericT, op_mult>
+vector_base::operator * (short value) const
 {
- return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
 }
 /** @brief Scales the vector by an integer 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_mult>
-vector_base::operator * (int value) const
+template
+vector_expression< const vector_base, const NumericT, op_mult>
+vector_base::operator * (int value) const
 {
- return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
 }
 /** @brief Scales the vector by a long integer 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_mult>
-vector_base::operator * (long value) const
+template
+vector_expression< const vector_base, const NumericT, op_mult>
+vector_base::operator * (long value) const
 {
- return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
 }
 /** @brief Scales the vector by a single precision floating point number 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_mult>
-vector_base::operator * (float value) const
+template
+vector_expression< const vector_base, const NumericT, op_mult>
+vector_base::operator * (float value) const
 {
- return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
 }
 /** @brief Scales the vector by a single precision floating point number 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_mult>
-vector_base::operator * (double value) const
+template
+vector_expression< const vector_base, const NumericT, op_mult>
+vector_base::operator * (double value) const
 {
- return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_mult>(*this, NumericT(value));
 }
 /** @brief Scales the vector by a char (8-bit value) 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_div>
-vector_base::operator / (char value) const
+template
+vector_expression< const vector_base, const NumericT, op_div>
+vector_base::operator / (char value) const
 {
- return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
 }
 /** @brief Scales the vector by a short integer 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_div>
-vector_base::operator / (short value) const
+template
+vector_expression< const vector_base, const NumericT, op_div>
+vector_base::operator / (short value) const
 {
- return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
 }
 /** @brief Scales the vector by an integer 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_div>
-vector_base::operator / (int value) const
+template
+vector_expression< const vector_base, const NumericT, op_div>
+vector_base::operator / (int value) const
 {
- return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
 }
 /** @brief Scales the vector by a long integer 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_div>
-vector_base::operator / (long value) const
+template
+vector_expression< const vector_base, const NumericT, op_div>
+vector_base::operator / (long value) const
 {
- return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
 }
 /** @brief Scales the vector by a single precision floating point number 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_div>
-vector_base::operator / (float value) const
+template
+vector_expression< const vector_base, const NumericT, op_div>
+vector_base::operator / (float value) const
 {
- return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
 }
 /** @brief Scales the vector by a double precision floating point number 'alpha' and returns an expression template */
-template
-vector_expression< const vector_base, const NumericT, op_div>
-vector_base::operator / (double value) const
+template
+vector_expression< const vector_base, const NumericT, op_div>
+vector_base::operator / (double value) const
 {
- return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
+return vector_expression< const self_type, const NumericT, op_div>(*this, NumericT(value));
 }
 /** @brief Sign flip for the vector. Emulated to be equivalent to -1.0 * vector */
-template
-vector_expression, const NumericT, op_mult>
-vector_base::operator-() const
+template
+vector_expression, const NumericT, op_mult>
+vector_base::operator-() const
 {
- return vector_expression(*this, NumericT(-1.0));
+return vector_expression(*this, NumericT(-1.0));
 }
 //
@@ -837,88 +839,88 @@ vector_base::operator-() const
 //
 /** @brief Returns an iterator pointing to the beginning of the vector (STL like)*/
-template
-typename vector_base::iterator vector_base::begin()
+template
+typename vector_base::iterator vector_base::begin()
 {
- return iterator(*this, 0, start_, stride_);
+return iterator(*this, 0, start_, stride_);
 }
 /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
-template
-typename vector_base::iterator vector_base::end()
+template
+typename vector_base::iterator vector_base::end()
 {
- return iterator(*this, size(), start_, stride_);
+return iterator(*this, size(), start_, stride_);
 }
 /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
-template
-typename vector_base::const_iterator vector_base::begin() const
+template
+typename vector_base::const_iterator vector_base::begin() const
 {
- return const_iterator(*this, 0, start_, stride_);
+return const_iterator(*this, 0, start_, stride_);
 }
-template
-typename vector_base::const_iterator vector_base::end() const
+template
+typename vector_base::const_iterator vector_base::end() const
 {
- return const_iterator(*this, size(), start_, stride_);
+return const_iterator(*this, size(), start_, stride_);
 }
-template
-vector_base & vector_base::swap(self_type & other)
+template
+vector_base & vector_base::swap(self_type & other)
 {
- viennacl::linalg::vector_swap(*this, other);
- return *this;
+viennacl::linalg::vector_swap(*this, other);
+return *this;
 }
-template
-void vector_base::clear()
+template
+void vector_base::clear()
 {
- viennacl::linalg::vector_assign(*this, cpu_value_type(0.0), true);
+viennacl::linalg::vector_assign(*this, cpu_value_type(0.0), true);
 }
-template
-vector_base & vector_base::fast_swap(self_type & other)
+template
+vector_base & vector_base::fast_swap(self_type & other)
 {
- assert(this->size_ == other.size_ && bool("Vector size mismatch"));
- this->elements_.swap(other.elements_);
- return *this;
+assert(this->size_ == other.size_ && bool("Vector size mismatch"));
+this->elements_.swap(other.elements_);
+return *this;
 }
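The compound and free scaling operators changed above keep their semantics: *= and /= act in place, while operator*, operator/ and unary minus return expression templates that are evaluated on assignment. A short hedged sketch (names and values illustrative):

```cpp
#include "viennacl/vector.hpp"

void vector_operator_examples()
{
  viennacl::vector<float> v = viennacl::scalar_vector<float>(8, 1.0f);
  viennacl::vector<float> w = viennacl::scalar_vector<float>(8, 2.0f);

  v += w;                               // element-wise update, dispatched to avbv()
  v *= 2;                               // integer overloads convert to NumericT
  v /= 4.0f;
  v(0) = 3.5f;                          // element access through entry_proxy

  viennacl::vector<float> s = v * 3.0;  // expression template, evaluated here
  viennacl::vector<float> u = -s;       // emulated as -1.0 * s
}
```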
-template
-void vector_base::pad()
+template
+void vector_base::pad()
 {
- if (internal_size() != size())
- {
- std::vector pad(internal_size() - size());
- viennacl::backend::memory_write(elements_, sizeof(NumericT) * size(), sizeof(NumericT) * pad.size(), &(pad[0]));
- }
+if (internal_size() != size())
+{
+ std::vector pad(internal_size() - size());
+ viennacl::backend::memory_write(elements_, sizeof(NumericT) * size(), sizeof(NumericT) * pad.size(), &(pad[0]));
+}
 }
-template
-void vector_base::switch_memory_context(viennacl::context new_ctx)
+template
+void vector_base::switch_memory_context(viennacl::context new_ctx)
 {
- viennacl::backend::switch_memory_context(elements_, new_ctx);
+viennacl::backend::switch_memory_context(elements_, new_ctx);
 }
 //TODO: Think about implementing the following public member functions
 //void insert_element(unsigned int i, NumericT val){}
 //void erase_element(unsigned int i){}
-template
-void vector_base::resize(size_type new_size, bool preserve)
+template
+void vector_base::resize(size_type new_size, bool preserve)
 {
- resize_impl(new_size, viennacl::traits::context(*this), preserve);
+resize_impl(new_size, viennacl::traits::context(*this), preserve);
 }
-template
-void vector_base::resize(size_type new_size, viennacl::context ctx, bool preserve)
+template
+void vector_base::resize(size_type new_size, viennacl::context ctx, bool preserve)
 {
- resize_impl(new_size, ctx, preserve);
+resize_impl(new_size, ctx, preserve);
 }
-template
-void vector_base::resize_impl(size_type new_size, viennacl::context ctx, bool preserve)
+template
+void vector_base::resize_impl(size_type new_size, viennacl::context ctx, bool preserve)
 {
 assert(new_size > 0 && bool("Positive size required when resizing vector!"));
@@ -946,11 +948,11 @@ void vector_base::resize_impl(size_type new_size, vi
 }
-template
-class vector : public vector_base
+template
+class vector : public vector_base
 {
- typedef vector self_type;
- typedef vector_base base_type;
+ typedef vector self_type;
+ typedef vector_base base_type;
 public:
 typedef typename base_type::size_type size_type;
@@ -1069,10 +1071,10 @@ public:
 }; //vector
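resize() and switch_memory_context() from the hunks above belong to the public interface; a hedged usage sketch (the target context is illustrative):

```cpp
#include "viennacl/vector.hpp"

void resize_and_move(viennacl::vector<float> & v)
{
  v.resize(2 * v.size(), true);   // grow, preserving existing entries
  v.resize(16, false);            // shrink, discarding old contents

  // Move the underlying buffer to another memory domain, e.g. host main memory:
  v.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
}
```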
 /** @brief Tuple class holding pointers to multiple vectors. Mainly used as a temporary object returned from viennacl::tie().
 */
-template
+template
 class vector_tuple
 {
- typedef vector_base VectorType;
+ typedef vector_base VectorType;
 public:
 // 2 vectors
@@ -1151,65 +1153,65 @@ private:
 };
 // 2 args
-template
-vector_tuple tie(vector_base const & v0, vector_base const & v1) { return vector_tuple(v0, v1); }
+template
+vector_tuple tie(vector_base const & v0, vector_base const & v1) { return vector_tuple(v0, v1); }
-template
-vector_tuple tie(vector_base & v0, vector_base & v1) { return vector_tuple(v0, v1); }
+template
+vector_tuple tie(vector_base & v0, vector_base & v1) { return vector_tuple(v0, v1); }
 // 3 args
-template
-vector_tuple tie(vector_base const & v0, vector_base const & v1, vector_base const & v2) { return vector_tuple(v0, v1, v2); }
+template
+vector_tuple tie(vector_base const & v0, vector_base const & v1, vector_base const & v2) { return vector_tuple(v0, v1, v2); }
-template
-vector_tuple tie(vector_base & v0, vector_base & v1, vector_base & v2) { return vector_tuple(v0, v1, v2); }
+template
+vector_tuple tie(vector_base & v0, vector_base & v1, vector_base & v2) { return vector_tuple(v0, v1, v2); }
 // 4 args
-template
-vector_tuple tie(vector_base const & v0, vector_base const & v1, vector_base const & v2, vector_base const & v3)
+template
+vector_tuple tie(vector_base const & v0, vector_base const & v1, vector_base const & v2, vector_base const & v3)
 {
- return vector_tuple(v0, v1, v2, v3);
+ return vector_tuple(v0, v1, v2, v3);
 }
-template
-vector_tuple tie(vector_base & v0, vector_base & v1, vector_base & v2, vector_base & v3)
+template
+vector_tuple tie(vector_base & v0, vector_base & v1, vector_base & v2, vector_base & v3)
 {
- return vector_tuple(v0, v1, v2, v3);
+ return vector_tuple(v0, v1, v2, v3);
 }
 // 5 args
-template
-vector_tuple tie(vector_base const & v0,
- vector_base const & v1,
- vector_base const & v2,
- vector_base const & v3,
- vector_base const & v4)
-{
- typedef vector_base const * VectorPointerType;
+template
+vector_tuple tie(vector_base const & v0,
+ vector_base const & v1,
+ vector_base const & v2,
+ vector_base const & v3,
+ vector_base const & v4)
+{
+ typedef vector_base const * VectorPointerType;
 std::vector vec(5);
 vec[0] = &v0; vec[1] = &v1; vec[2] = &v2; vec[3] = &v3; vec[4] = &v4;
- return vector_tuple(vec);
+ return vector_tuple(vec);
 }
-template
-vector_tuple tie(vector_base & v0,
- vector_base & v1,
- vector_base & v2,
- vector_base & v3,
- vector_base & v4)
+template
+vector_tuple tie(vector_base & v0,
+ vector_base & v1,
+ vector_base & v2,
+ vector_base & v3,
+ vector_base & v4)
 {
- typedef vector_base * VectorPointerType;
+ typedef vector_base * VectorPointerType;
 std::vector vec(5);
 vec[0] = &v0; vec[1] = &v1; vec[2] = &v2; vec[3] = &v3; vec[4] = &v4;
- return vector_tuple(vec);
+ return vector_tuple(vec);
 }
 // TODO: Add more arguments to tie() here. Maybe use some preprocessor magic to accomplish this.
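vector_tuple objects created via tie() are consumed, for instance, by the fused inner product whose op_executor appears further down (x = inner_prod(z, {y0, y1, ...})). A hedged sketch; the exact availability of the multi-argument inner_prod overload depends on the ViennaCL version:

```cpp
#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"

void fused_inner_products(viennacl::vector<float> const & x,
                          viennacl::vector<float> const & y0,
                          viennacl::vector<float> const & y1,
                          viennacl::vector<float> const & y2)
{
  // result[i] = <x, y_i>, computed in a single fused sweep over x
  viennacl::vector<float> result(3);
  result = viennacl::linalg::inner_prod(x, viennacl::tie(y0, y1, y2));
}
```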
@@ -1230,9 +1232,9 @@ vector_tuple tie(vector_base & v0,
 * @param gpu_end GPU iterator pointing to the end of the vector (STL-like)
 * @param cpu_begin Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
 */
-template
-void fast_copy(const const_vector_iterator & gpu_begin,
- const const_vector_iterator & gpu_end,
+template
+void fast_copy(const const_vector_iterator & gpu_begin,
+ const const_vector_iterator & gpu_end,
 CPU_ITERATOR cpu_begin )
 {
 if (gpu_begin != gpu_end)
@@ -1263,8 +1265,8 @@ void fast_copy(const const_vector_iterator & gpu_begin,
 * @param gpu_vec A gpu vector.
 * @param cpu_vec The cpu vector. Type requirements: Output iterator pointing to entries linear in memory can be obtained via member function .begin()
 */
-template
-void fast_copy(vector_base const & gpu_vec, CPUVECTOR & cpu_vec )
+template
+void fast_copy(vector_base const & gpu_vec, CPUVECTOR & cpu_vec )
 {
 viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), cpu_vec.begin());
 }
@@ -1280,9 +1282,9 @@ void fast_copy(vector_base const & gpu_vec, CPUVECTOR & cpu_vec )
 * @param gpu_end GPU iterator pointing to the end of the vector (STL-like)
 * @param cpu_begin Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
 */
-template
-void async_copy(const const_vector_iterator & gpu_begin,
- const const_vector_iterator & gpu_end,
+template
+void async_copy(const const_vector_iterator & gpu_begin,
+ const const_vector_iterator & gpu_end,
 CPU_ITERATOR cpu_begin )
 {
 if (gpu_begin != gpu_end)
@@ -1318,9 +1320,9 @@ void async_copy(vector_base const & gpu_vec, CPUVECTOR & cpu_vec )
 * @param gpu_end GPU constant iterator pointing to the end of the vector (STL-like)
 * @param cpu_begin Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
 */
-template
-void copy(const const_vector_iterator & gpu_begin,
- const const_vector_iterator & gpu_end,
+template
+void copy(const const_vector_iterator & gpu_begin,
+ const const_vector_iterator & gpu_end,
 CPU_ITERATOR cpu_begin )
 {
 assert(gpu_end - gpu_begin >= 0 && bool("Iterators incompatible"));
@@ -1340,14 +1342,14 @@ void copy(const const_vector_iterator & gpu_begin,
 * @param gpu_end GPU iterator pointing to the end of the vector (STL-like)
 * @param cpu_begin Output iterator for the cpu vector. The cpu vector must be at least as long as the gpu vector!
 */
-template
-void copy(const vector_iterator & gpu_begin,
- const vector_iterator & gpu_end,
+template
+void copy(const vector_iterator & gpu_begin,
+ const vector_iterator & gpu_end,
 CPU_ITERATOR cpu_begin )
 {
- viennacl::copy(const_vector_iterator(gpu_begin),
- const_vector_iterator(gpu_end),
+ viennacl::copy(const_vector_iterator(gpu_begin),
+ const_vector_iterator(gpu_end),
 cpu_begin);
 }
@@ -1396,10 +1398,10 @@ void copy(vector const & gpu_vec,
 * @param cpu_end CPU iterator pointing to the end of the vector (STL-like)
 * @param gpu_begin Output iterator for the gpu vector. The gpu iterator must be incrementable (cpu_end - cpu_begin) times, otherwise the result is undefined.
 */
-template
+template
 void fast_copy(CPU_ITERATOR const & cpu_begin,
 CPU_ITERATOR const & cpu_end,
- vector_iterator gpu_begin)
+ vector_iterator gpu_begin)
 {
 if (cpu_end - cpu_begin > 0)
 {
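The host/device transfer helpers touched in these hunks are used as in the following sketch: copy() works with generic containers and iterators, while fast_copy() additionally assumes contiguous host storage:

```cpp
#include <vector>
#include "viennacl/vector.hpp"

void transfer_examples()
{
  std::vector<float> host(100, 1.0f);
  viennacl::vector<float> device(100);

  viennacl::copy(host, device);        // host -> device, generic element-wise transfer
  viennacl::fast_copy(host, device);   // host -> device, single contiguous block
  viennacl::fast_copy(device, host);   // device -> host
}
```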
@@ -1430,8 +1432,8 @@ void fast_copy(CPU_ITERATOR const & cpu_begin,
 * @param cpu_vec A cpu vector. Type requirements: Iterator can be obtained via member function .begin() and .end()
 * @param gpu_vec The gpu vector.
 */
-template
-void fast_copy(const CPUVECTOR & cpu_vec, vector_base & gpu_vec)
+template
+void fast_copy(const CPUVECTOR & cpu_vec, vector_base & gpu_vec)
 {
 viennacl::fast_copy(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin());
 }
@@ -2056,10 +2058,10 @@ namespace detail
 };
 // x = inner_prod(z, {y0, y1, ...})
- template
- struct op_executor, op_assign, vector_expression, const vector_tuple, op_inner_prod> >
+ template
+ struct op_executor, op_assign, vector_expression, const vector_tuple, op_inner_prod> >
 {
- static void apply(vector_base & lhs, vector_expression, const vector_tuple, op_inner_prod> const & rhs)
+ static void apply(vector_base & lhs, vector_expression, const vector_tuple, op_inner_prod> const & rhs)
 {
 viennacl::linalg::inner_prod_impl(rhs.lhs(), rhs.rhs(), lhs);
 }