From 8a858f1292ec922b0c8e415196404756adb07f78 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 12 Aug 2019 11:20:54 -0500
Subject: [PATCH 1/2] Support for bindless textures (thanks Binu
 Mathew/Kraenion Labs for the patch)

---
 pycuda/driver.py             |   2 +-
 src/cpp/cuda.hpp             | 303 ++++++++++++++++++++++++++++++++++-
 src/wrapper/wrap_cudadrv.cpp |  34 +++-
 test/test_driver.py          |  80 +++++++++
 4 files changed, 414 insertions(+), 5 deletions(-)
diff --git a/pycuda/driver.py b/pycuda/driver.py
index fd042a75..916d0980 100644
--- a/pycuda/driver.py
+++ b/pycuda/driver.py
@@ -188,7 +188,7 @@ def _add_functionality():
             if isinstance(arg, np.number):
                 arg_data.append(arg)
                 format += arg.dtype.char
-            elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
+            elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation, TextureObject, SurfaceObject)):
                 arg_data.append(int(arg))
                 format += "P"
             elif isinstance(arg, ArgumentHandler):
diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp
index da60bd57..e7bcdbf5 100644
--- a/src/cpp/cuda.hpp
+++ b/src/cpp/cuda.hpp
@@ -37,6 +37,7 @@
 #include <boost/thread/thread.hpp>
 #include <boost/thread/tss.hpp>
 #include <boost/version.hpp>
+#include <numpy/arrayobject.h>
 
 #if (BOOST_VERSION/100) < 1035
 #warning *****************************************************************
@@ -1190,6 +1191,305 @@ namespace pycuda
 
   // }}}
 
+  // {{{ bindless textures
+
+#if CUDAPP_CUDA_VERSION >= 6000
+
+  template <class R, class T>
+  void tuple_to_array(T *dst, py::tuple src_py, unsigned n)
+  {
+    pycuda_size_t src_length = py::len(src_py);
+    if (src_length != n)
+        throw pycuda::error(
+            "setter:", CUDA_ERROR_INVALID_VALUE,
+            "incorrect number of elements in tuple");
+
+    for (unsigned i = 0; i < src_length; ++i)
+      {
+        R value = py::extract<R>(src_py[i]);
+        dst[i] = (T)(value);
+      }
+  }
+
+  template <class R, class T>
+  py::tuple array_to_tuple_3(T *src)
+  {
+    return py::make_tuple((R)src[0], (R)src[1], (R)src[2]);
+  }
+
+  template <class R, class T>
+  py::tuple array_to_tuple_4(T *src)
+  {
+    return py::make_tuple((R)src[0], (R)src[1], (R)src[2], (R)src[3]);
+  }
+
+  // {{{ texture descriptor
+
+  struct WRAP_CUDA_TEXTURE_DESC : public CUDA_TEXTURE_DESC
+  {
+      WRAP_CUDA_TEXTURE_DESC()
+      {
+        // This class is a standard layout type. CUDA_TEXTURE_DESC
+        // needs to be memset to 0. When I made it a member
+        // of this class, wrapping of members such as .fields
+        // failed. So unorthodox as this might be, deriving
+        // a class and memset(this) works and is permissible because
+        // of standard layout.
+        // It can be verified that this class is standard layout by:
+        // #include <type_traits>
+        //printf("traits: %d\n", std::is_standard_layout<WRAP_CUDA_TEXTURE_DESC>::value);
+        // Do not add *any* member variables or virtual functions.
+        // That will make it not standard layout.
+        memset(this, 0, sizeof(CUDA_TEXTURE_DESC));
+      }
+
+      py::tuple get_address_mode()
+      {
+        return array_to_tuple_3<int>(addressMode);
+      }
+
+      void set_address_mode(py::tuple src_py)
+      {
+        tuple_to_array<int>(addressMode, src_py, 3);
+      }
+
+      py::tuple get_border_color()
+      {
+        return array_to_tuple_4<float>(borderColor);
+      }
+
+      void set_border_color(py::tuple src_py)
+      {
+        tuple_to_array<float>(borderColor, src_py, 4);
+      }
+  };
+
+  // }}}
+
+  // {{{ surface object
+
+  class surface_object : public boost::noncopyable
+  {
+    private:
+      PyObject* m_pyobj;
+      CUsurfObject m_cusurf;
+
+    public:
+      surface_object(py::object ary)
+        : m_cusurf(0), m_pyobj(NULL)
+      {
+
+        m_pyobj = ary.ptr();
+
+        CUDA_RESOURCE_DESC resdesc;
+        memset(&resdesc, 0, sizeof(resdesc));
+
+        py::extract<array> extract_array(ary);
+        if (!extract_array.check())
+          throw pycuda::error("surface_object", CUDA_ERROR_INVALID_VALUE,
+              "ary argument is not an instance of Array");
+
+
+        const array & pycuda_array = extract_array();
+        resdesc.resType = CU_RESOURCE_TYPE_ARRAY;
+        resdesc.res.array.hArray = pycuda_array.handle();
+        CUDAPP_CALL_GUARDED(cuSurfObjectCreate, (&m_cusurf, &resdesc));
+        Py_INCREF(m_pyobj); // Prevent backing array from being garbage collected
+      }
+
+      CUsurfObject to_int()
+      {
+        // CUsurfObject is a typedef for unsigned long long
+        return m_cusurf;
+      }
+
+      ~surface_object()
+      {
+        cuSurfObjectDestroy(m_cusurf);
+        Py_XDECREF(m_pyobj);
+      }
+
+  };
+
+  // }}}
+
+  // {{{ texture object
+
+  class texture_object : public boost::noncopyable
+  {
+     private:
+       PyObject* m_pyobj;
+       CUtexObject m_cutex;
+
+    public:
+       texture_object(py::object ary, const CUDA_TEXTURE_DESC & texdesc)
+         : m_cutex(0), m_pyobj(NULL)
+       {
+         // Tried two different constructors earlier. One had signature
+         // texture_object(const array &, const cudaTextureDesc &texdesc)
+         // However, with that signature it was not distinguishable from
+         // this constructor. C++ treated both differently, but boost
+         // routed calls to this constructor. So we just use one constructor
+         // and check argument type.
+
+         m_pyobj = ary.ptr();
+
+         CUDA_RESOURCE_DESC resdesc;
+         memset(&resdesc, 0, sizeof(resdesc));
+
+         py::extract<array> extract_array(ary);
+         if (extract_array.check())
+           {
+             const array & pycuda_array = extract_array();
+             resdesc.resType = CU_RESOURCE_TYPE_ARRAY;
+             resdesc.res.array.hArray = pycuda_array.handle();
+             CUDAPP_CALL_GUARDED(cuTexObjectCreate, (&m_cutex, &resdesc, &texdesc, NULL));
+             Py_INCREF(m_pyobj); // Prevent backing array from being garbage collected
+             return;
+           }
+
+         if (!PyArray_Check(ary.ptr()))
+           throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+                               "ary argument is not a numpy array");
+
+
+
+         PyArrayObject* np_a = (PyArrayObject*)(m_pyobj);
+         void * data = (float*)PyArray_DATA(np_a);
+         unsigned memtype;
+         CUDAPP_CALL_GUARDED(cuPointerGetAttribute, (&memtype, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)data));
+         if(memtype != CU_MEMORYTYPE_DEVICE)
+           throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+                               "ary is not device memory");
+
+         int ndim = PyArray_NDIM(np_a);
+         int item_type = PyArray_TYPE(np_a);
+         int item_size = PyArray_ITEMSIZE(np_a);
+         npy_intp* shape_v = PyArray_SHAPE(np_a);
+         npy_intp* stride_v = PyArray_STRIDES(np_a);
+
+         // This used to be a check for !PyArray_ISCONTIGUOUS(ary.ptr())
+         // However, we use views into padded arrays to create the equivalent
+         // of pitch2d memory from managed memory. All views count as non-contiguous.
+         // So we do a different check here just to see if the last dimension stride is
+         // greater than the itemsize. Also as of Jan 2019, textures do not allow
+         // double precision.
+         if ((stride_v[ndim-1] != item_size) || (item_size > sizeof(float)))
+           throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+                               "ary argument is not contiguous/texture incompatible");
+
+         int nchan, ndim_effective;
+         if (ndim == 1)
+           {
+             nchan = 1;
+             ndim_effective = 1;
+           }
+         else
+           {
+             nchan = shape_v[ndim - 1];
+             ndim_effective = ndim - 1;
+           }
+
+         if ((nchan < 1) || (nchan > 4))
+           throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+                               "number of texture channels should be in {1,2,3,4}");
+         if (!((ndim_effective == 1) || (ndim_effective == 2)))
+           throw pycuda::error(
+               "texture_object", CUDA_ERROR_INVALID_VALUE,
+               "textures are limited to 1 or 2 dimensions of texels");
+
+
+         if (ndim_effective == 1)
+           {
+             resdesc.resType = CU_RESOURCE_TYPE_LINEAR;
+             resdesc.res.linear.format = get_cuarray_format(item_type);
+             resdesc.res.linear.devPtr = (CUdeviceptr)data;
+             resdesc.res.linear.numChannels = nchan;
+             resdesc.res.linear.sizeInBytes = PyArray_NBYTES(np_a);
+           }
+         else
+           {
+             resdesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+             resdesc.res.pitch2D.format = get_cuarray_format(item_type);
+             resdesc.res.pitch2D.devPtr = (CUdeviceptr)data;
+             resdesc.res.pitch2D.height = shape_v[0];
+             if(nchan == 3)
+               {
+                 // CUDA does not support 3 channel images even though RGB/BGR
+                 // is a common case. For convolutional kernels we can still use
+                 // the 3channel image without copying, but we need to consider
+                 // it a 3x wide image of 1 channel.
+                 resdesc.res.pitch2D.numChannels = 1;
+                 resdesc.res.pitch2D.width = shape_v[1] * 3;
+               }
+             else
+               {
+                 resdesc.res.pitch2D.numChannels = nchan;
+                 resdesc.res.pitch2D.width = shape_v[1];
+               }
+             resdesc.res.pitch2D.pitchInBytes = stride_v[0];
+           }
+
+         CUDAPP_CALL_GUARDED(cuTexObjectCreate, (&m_cutex, &resdesc, &texdesc, NULL));
+         Py_INCREF(m_pyobj);  // Prevent backing array from being garbage collected
+       }
+
+
+       CUarray_format get_cuarray_format(int type)
+       {
+         switch (type)
+         {
+             case NPY_INT8:
+               return CU_AD_FORMAT_SIGNED_INT8;
+
+             case NPY_UINT8:
+               return CU_AD_FORMAT_UNSIGNED_INT8;
+
+             case NPY_INT16:
+               return CU_AD_FORMAT_SIGNED_INT16;
+
+             case NPY_UINT16:
+               return CU_AD_FORMAT_UNSIGNED_INT16;
+
+             case NPY_INT32:
+               return CU_AD_FORMAT_SIGNED_INT32;
+
+             case NPY_UINT32:
+               return CU_AD_FORMAT_UNSIGNED_INT32;
+
+             case NPY_FLOAT16:
+               return CU_AD_FORMAT_HALF;
+
+             case NPY_FLOAT32:
+               return CU_AD_FORMAT_FLOAT;
+
+             default:
+               throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+                                   "Unsupported numpy type");
+               break;
+         }
+       }
+
+       CUtexObject to_int()
+       {
+         // CUtexObject is a typedef for unsigned long long
+         return m_cutex;
+       }
+
+     ~texture_object()
+     {
+       cuTexObjectDestroy(m_cutex);
+       Py_XDECREF(m_pyobj);
+     }
+
+  };
+
+  // }}}
+
+#endif // CUDAPP_CUDA_VERSION >= 6000
+
+  // }}}
+
   // {{{ surface reference
 #if CUDAPP_CUDA_VERSION >= 3010
   class module;
@@ -2194,9 +2494,8 @@ namespace pycuda
   }
 #endif
   // }}}
-}
-
 
+}
 
 
 #endif
diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index dfa3d1cd..07c9b0b5 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -260,7 +260,7 @@ namespace
     py_buffer_wrapper buf_wrapper;
     buf_wrapper.get(dest.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
 
-    CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoH, 
+    CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoH,
         (buf_wrapper.m_buf.buf, ary.handle(), index, buf_wrapper.m_buf.len));
   }
 
@@ -1348,7 +1348,7 @@ BOOST_PYTHON_MODULE(_driver)
 
     wrp
       .DEF_SIMPLE_METHOD(get_device_pointer)
-      .def("attach", &cl::attach, 
+      .def("attach", &cl::attach,
         (py::arg("mem_flags"), py::arg("stream")=py::object()))
       ;
   }
@@ -1633,6 +1633,35 @@ BOOST_PYTHON_MODULE(_driver)
   }
   // }}}
 
+#if CUDAPP_CUDA_VERSION >= 6000
+  // {{{ texture descriptor
+  {
+    typedef WRAP_CUDA_TEXTURE_DESC cl;
+    py::class_<cl>("TextureDescriptor")
+    .add_property("address_mode", &cl::get_address_mode, &cl::set_address_mode)
+    .def_readwrite("filter_mode", &cl::filterMode)
+    .def_readwrite("flags", &cl::flags)
+    .add_property("border_color", &cl::get_border_color, &cl::set_border_color);
+  }
+  // }}}
+
+  // {{{ texture object
+  {
+    typedef texture_object cl;
+    py::class_<cl, boost::noncopyable>("TextureObject", py::init<py::object, const WRAP_CUDA_TEXTURE_DESC &>())
+    .def("__int__", &cl::to_int);
+  }
+  // }}}
+
+    // {{{ surface object
+  {
+    typedef surface_object cl;
+    py::class_<cl, boost::noncopyable>("SurfaceObject", py::init<py::object>())
+    .def("__int__", &cl::to_int);
+  }
+    // }}}
+#endif
+
   // {{{ surface reference
 #if CUDAPP_CUDA_VERSION >= 3010
   {
@@ -1669,6 +1698,7 @@ BOOST_PYTHON_MODULE(_driver)
 #ifdef HAVE_CURAND
   pycuda_expose_curand();
 #endif
+
 }
 
 // vim: foldmethod=marker
diff --git a/test/test_driver.py b/test/test_driver.py
index b440eff0..8425e7e3 100644
--- a/test/test_driver.py
+++ b/test/test_driver.py
@@ -947,6 +947,86 @@ class TestDriver:
         test_kernel = mod.get_function('test_kernel')
         test_kernel(grid=(2, 1), block=(1, 1, 1))
 
+    @mark_cuda_test
+    def test_bindless_textures(self):
+        if drv.get_version() < (6, 0, 0):
+            pytest.skip("bindless textures not supported on CUDA < 6")
+
+        mod = SourceModule("""
+                __global__ void gpu_add(float *a, float *b, float *res)
+                {
+                    int idx = threadIdx.x + threadIdx.y*16;
+                    res[idx] = a[idx] + b[idx];
+                }
+
+                // texture object is a kernel argument
+                __global__ void tex_kernel1(
+                        cudaTextureObject_t tex, cudaSurfaceObject_t surf)
+                {
+                  int i = blockIdx.x *blockDim.x + threadIdx.x;
+                  float x = tex1Dfetch<float>(tex, i);
+                  // do some work using x ...
+                }
+                """)
+
+        print('TextureObject exists. Members:', dir(drv.TextureObject))
+
+        tdesc = drv.TextureDescriptor()
+        print('TextureDescriptor exists. Members:', dir(tdesc))
+
+        print('Setting address mode')
+        print('before', tdesc.address_mode)
+        tdesc.address_mode = (1, 2, 3)
+        print('after', tdesc.address_mode)
+
+        adesc = drv.ArrayDescriptor()
+        adesc.width = 1024
+        adesc.height = 768
+        adesc.format = drv.array_format.UNSIGNED_INT8
+        adesc.num_channels = 4
+        sarray = drv.Array(adesc)
+        print('sarray made')
+        surf1 = drv.SurfaceObject(sarray)
+        print('tex1 made')
+
+        a_um_a = drv.managed_empty(
+                (16, 16, 4), np.float32, mem_flags=drv.mem_attach_flags.GLOBAL)
+        a_um_a[:] = np.random.randn(*a_um_a.shape)
+
+        import sys
+        print('pre ref_count', sys.getrefcount(a_um_a))
+        tex2 = drv.TextureObject(a_um_a, tdesc)
+        print('post ref_count', sys.getrefcount(a_um_a))
+        print('tex2 made')
+
+        print('post ref_count', sys.getrefcount(a_um_a))
+        print('{0:016x}'.format(id(tex2)), '{0:016x}'.format(int(tex2)))
+
+        func = mod.get_function("tex_kernel1")
+        func(a_um_a, tex2, surf1, block=(16, 16, 1))
+        drv.Context.synchronize()
+
+        # Uniform memory version of hello world.
+        a_um_a = drv.managed_empty(
+                (16, 16), np.float32, mem_flags=drv.mem_attach_flags.GLOBAL)
+        a_um_a[:] = np.random.randn(*a_um_a.shape)
+
+        b_um_a = drv.managed_empty(
+                (16, 16), np.float32, mem_flags=drv.mem_attach_flags.GLOBAL)
+        b_um_a[:] = np.random.randn(*b_um_a.shape)
+
+        res_um_a = drv.managed_empty(
+                (16, 16), np.float32, mem_flags=drv.mem_attach_flags.GLOBAL)
+
+        func = mod.get_function("gpu_add")
+        func(a_um_a, b_um_a, res_um_a, block=(16, 16, 1))
+        drv.Context.synchronize()
+
+        res_um_a = res_um_a
+        print(res_um_a)
+
+        assert(np.allclose(res_um_a, a_um_a+b_um_a))
+
 
 def test_import_pyopencl_before_pycuda():
     try:
-- 
GitLab


From ac18f220b7097fb294c84ec286ab0d80f3f3c5ad Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 12 Aug 2019 11:43:54 -0500
Subject: [PATCH 2/2] Improve bindless texture code

---
 src/cpp/cuda.hpp             | 137 ++++++++++++++---------------------
 src/wrapper/wrap_cudadrv.cpp |  12 ++-
 2 files changed, 64 insertions(+), 85 deletions(-)

diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp
index e7bcdbf5..d04111c7 100644
--- a/src/cpp/cuda.hpp
+++ b/src/cpp/cuda.hpp
@@ -1271,30 +1271,19 @@ namespace pycuda
   class surface_object : public boost::noncopyable
   {
     private:
-      PyObject* m_pyobj;
+      boost::shared_ptr<array> m_array;
       CUsurfObject m_cusurf;
 
     public:
-      surface_object(py::object ary)
-        : m_cusurf(0), m_pyobj(NULL)
+      surface_object(boost::shared_ptr<array> ary)
+        : m_array(ary), m_cusurf(0)
       {
-
-        m_pyobj = ary.ptr();
-
         CUDA_RESOURCE_DESC resdesc;
         memset(&resdesc, 0, sizeof(resdesc));
 
-        py::extract<array> extract_array(ary);
-        if (!extract_array.check())
-          throw pycuda::error("surface_object", CUDA_ERROR_INVALID_VALUE,
-              "ary argument is not an instance of Array");
-
-
-        const array & pycuda_array = extract_array();
         resdesc.resType = CU_RESOURCE_TYPE_ARRAY;
-        resdesc.res.array.hArray = pycuda_array.handle();
+        resdesc.res.array.hArray = m_array->handle();
         CUDAPP_CALL_GUARDED(cuSurfObjectCreate, (&m_cusurf, &resdesc));
-        Py_INCREF(m_pyobj); // Prevent backing array from being garbage collected
       }
 
       CUsurfObject to_int()
@@ -1305,8 +1294,7 @@ namespace pycuda
 
       ~surface_object()
       {
-        cuSurfObjectDestroy(m_cusurf);
-        Py_XDECREF(m_pyobj);
+        CUDAPP_CALL_GUARDED_CLEANUP(cuSurfObjectDestroy, (m_cusurf));
       }
 
   };
@@ -1318,49 +1306,38 @@ namespace pycuda
   class texture_object : public boost::noncopyable
   {
      private:
-       PyObject* m_pyobj;
+       // either a numpy array or a CUDA array
+       py::object m_array;
        CUtexObject m_cutex;
 
     public:
        texture_object(py::object ary, const CUDA_TEXTURE_DESC & texdesc)
-         : m_cutex(0), m_pyobj(NULL)
+         : m_array(ary), m_cutex(0)
        {
-         // Tried two different constructors earlier. One had signature
-         // texture_object(const array &, const cudaTextureDesc &texdesc)
-         // However, with that signature it was not distinguishable from
-         // this constructor. C++ treated both differently, but boost
-         // routed calls to this constructor. So we just use one constructor
-         // and check argument type.
-
-         m_pyobj = ary.ptr();
-
          CUDA_RESOURCE_DESC resdesc;
          memset(&resdesc, 0, sizeof(resdesc));
 
          py::extract<array> extract_array(ary);
          if (extract_array.check())
-           {
-             const array & pycuda_array = extract_array();
-             resdesc.resType = CU_RESOURCE_TYPE_ARRAY;
-             resdesc.res.array.hArray = pycuda_array.handle();
-             CUDAPP_CALL_GUARDED(cuTexObjectCreate, (&m_cutex, &resdesc, &texdesc, NULL));
-             Py_INCREF(m_pyobj); // Prevent backing array from being garbage collected
-             return;
-           }
+         {
+           const array & pycuda_array = extract_array();
+           resdesc.resType = CU_RESOURCE_TYPE_ARRAY;
+           resdesc.res.array.hArray = pycuda_array.handle();
+           CUDAPP_CALL_GUARDED(cuTexObjectCreate, (&m_cutex, &resdesc, &texdesc, NULL));
+           return;
+         }
 
-         if (!PyArray_Check(ary.ptr()))
+         if (!PyArray_Check(m_array.ptr()))
            throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
-                               "ary argument is not a numpy array");
+               "ary argument is not a numpy array");
 
-
-
-         PyArrayObject* np_a = (PyArrayObject*)(m_pyobj);
+         PyArrayObject* np_a = (PyArrayObject*)(m_array.ptr());
          void * data = (float*)PyArray_DATA(np_a);
          unsigned memtype;
          CUDAPP_CALL_GUARDED(cuPointerGetAttribute, (&memtype, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)data));
          if(memtype != CU_MEMORYTYPE_DEVICE)
            throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
-                               "ary is not device memory");
+               "ary is not device memory");
 
          int ndim = PyArray_NDIM(np_a);
          int item_type = PyArray_TYPE(np_a);
@@ -1376,23 +1353,23 @@ namespace pycuda
          // double precision.
          if ((stride_v[ndim-1] != item_size) || (item_size > sizeof(float)))
            throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
-                               "ary argument is not contiguous/texture incompatible");
+               "ary argument is not contiguous/texture incompatible");
 
          int nchan, ndim_effective;
          if (ndim == 1)
-           {
-             nchan = 1;
-             ndim_effective = 1;
-           }
+         {
+           nchan = 1;
+           ndim_effective = 1;
+         }
          else
-           {
-             nchan = shape_v[ndim - 1];
-             ndim_effective = ndim - 1;
-           }
+         {
+           nchan = shape_v[ndim - 1];
+           ndim_effective = ndim - 1;
+         }
 
          if ((nchan < 1) || (nchan > 4))
            throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
-                               "number of texture channels should be in {1,2,3,4}");
+               "number of texture channels should be in {1,2,3,4}");
          if (!((ndim_effective == 1) || (ndim_effective == 2)))
            throw pycuda::error(
                "texture_object", CUDA_ERROR_INVALID_VALUE,
@@ -1400,38 +1377,37 @@ namespace pycuda
 
 
          if (ndim_effective == 1)
+         {
+           resdesc.resType = CU_RESOURCE_TYPE_LINEAR;
+           resdesc.res.linear.format = get_cuarray_format(item_type);
+           resdesc.res.linear.devPtr = (CUdeviceptr)data;
+           resdesc.res.linear.numChannels = nchan;
+           resdesc.res.linear.sizeInBytes = PyArray_NBYTES(np_a);
+         }
+         else
+         {
+           resdesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+           resdesc.res.pitch2D.format = get_cuarray_format(item_type);
+           resdesc.res.pitch2D.devPtr = (CUdeviceptr)data;
+           resdesc.res.pitch2D.height = shape_v[0];
+           if(nchan == 3)
            {
-             resdesc.resType = CU_RESOURCE_TYPE_LINEAR;
-             resdesc.res.linear.format = get_cuarray_format(item_type);
-             resdesc.res.linear.devPtr = (CUdeviceptr)data;
-             resdesc.res.linear.numChannels = nchan;
-             resdesc.res.linear.sizeInBytes = PyArray_NBYTES(np_a);
+             // CUDA does not support 3 channel images even though RGB/BGR
+             // is a common case. For convolutional kernels we can still use
+             // the 3channel image without copying, but we need to consider
+             // it a 3x wide image of 1 channel.
+             resdesc.res.pitch2D.numChannels = 1;
+             resdesc.res.pitch2D.width = shape_v[1] * 3;
            }
-         else
+           else
            {
-             resdesc.resType = CU_RESOURCE_TYPE_PITCH2D;
-             resdesc.res.pitch2D.format = get_cuarray_format(item_type);
-             resdesc.res.pitch2D.devPtr = (CUdeviceptr)data;
-             resdesc.res.pitch2D.height = shape_v[0];
-             if(nchan == 3)
-               {
-                 // CUDA does not support 3 channel images even though RGB/BGR
-                 // is a common case. For convolutional kernels we can still use
-                 // the 3channel image without copying, but we need to consider
-                 // it a 3x wide image of 1 channel.
-                 resdesc.res.pitch2D.numChannels = 1;
-                 resdesc.res.pitch2D.width = shape_v[1] * 3;
-               }
-             else
-               {
-                 resdesc.res.pitch2D.numChannels = nchan;
-                 resdesc.res.pitch2D.width = shape_v[1];
-               }
-             resdesc.res.pitch2D.pitchInBytes = stride_v[0];
+             resdesc.res.pitch2D.numChannels = nchan;
+             resdesc.res.pitch2D.width = shape_v[1];
            }
+           resdesc.res.pitch2D.pitchInBytes = stride_v[0];
+         }
 
          CUDAPP_CALL_GUARDED(cuTexObjectCreate, (&m_cutex, &resdesc, &texdesc, NULL));
-         Py_INCREF(m_pyobj);  // Prevent backing array from being garbage collected
        }
 
 
@@ -1472,14 +1448,12 @@ namespace pycuda
 
        CUtexObject to_int()
        {
-         // CUtexObject is a typedef for unsigned long long
          return m_cutex;
        }
 
      ~texture_object()
      {
-       cuTexObjectDestroy(m_cutex);
-       Py_XDECREF(m_pyobj);
+       CUDAPP_CALL_GUARDED_CLEANUP(cuTexObjectDestroy, (m_cutex));
      }
 
   };
@@ -1499,7 +1473,6 @@ namespace pycuda
     private:
       CUsurfref m_surfref;
 
-      // life support for array and module
       boost::shared_ptr<array> m_array;
       boost::shared_ptr<module> m_module;
 
diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index 07c9b0b5..39467d94 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -1635,6 +1635,7 @@ BOOST_PYTHON_MODULE(_driver)
 
 #if CUDAPP_CUDA_VERSION >= 6000
   // {{{ texture descriptor
+
   {
     typedef WRAP_CUDA_TEXTURE_DESC cl;
     py::class_<cl>("TextureDescriptor")
@@ -1643,23 +1644,28 @@ BOOST_PYTHON_MODULE(_driver)
     .def_readwrite("flags", &cl::flags)
     .add_property("border_color", &cl::get_border_color, &cl::set_border_color);
   }
+
   // }}}
 
   // {{{ texture object
+
   {
     typedef texture_object cl;
     py::class_<cl, boost::noncopyable>("TextureObject", py::init<py::object, const WRAP_CUDA_TEXTURE_DESC &>())
     .def("__int__", &cl::to_int);
   }
+
   // }}}
 
-    // {{{ surface object
+  // {{{ surface object
+
   {
     typedef surface_object cl;
-    py::class_<cl, boost::noncopyable>("SurfaceObject", py::init<py::object>())
+    py::class_<cl, boost::noncopyable>("SurfaceObject", py::init<shared_ptr<array> >())
     .def("__int__", &cl::to_int);
   }
-    // }}}
+
+  // }}}
 #endif
 
   // {{{ surface reference
-- 
GitLab