diff --git a/pycuda/driver.py b/pycuda/driver.py
index fd042a758436fcbbd3a658f47ad71442a5fa62ef..916d0980e5c6f686b0a9c58a01ccfa9e3b1f66aa 100644
--- a/pycuda/driver.py
+++ b/pycuda/driver.py
@@ -188,7 +188,7 @@ def _add_functionality():
             if isinstance(arg, np.number):
                 arg_data.append(arg)
                 format += arg.dtype.char
-            elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
+            elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation, TextureObject, SurfaceObject)):
                 arg_data.append(int(arg))
                 format += "P"
             elif isinstance(arg, ArgumentHandler):
diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp
index da60bd5756508e599cb2c54d643e4c0f60d17036..d04111c720e304a1c9a8dc9334624eb9ca456454 100644
--- a/src/cpp/cuda.hpp
+++ b/src/cpp/cuda.hpp
@@ -37,6 +37,7 @@
 #include <boost/thread/thread.hpp>
 #include <boost/thread/tss.hpp>
 #include <boost/version.hpp>
+#include <cstring>
 
 #if (BOOST_VERSION/100) < 1035
 #warning *****************************************************************
@@ -1190,6 +1191,279 @@ namespace pycuda
 
   // }}}
 
+  // {{{ bindless textures
+
+#if CUDAPP_CUDA_VERSION >= 6000
+
+  template <class T, class R>
+  void tuple_to_array(T *dst, py::tuple src_py, unsigned n)
+  {
+    pycuda_size_t src_length = py::len(src_py);
+    if (src_length != n)
+      throw pycuda::error(
+          "setter:", CUDA_ERROR_INVALID_VALUE,
+          "incorrect number of elements in tuple");
+
+    for (unsigned i = 0; i < src_length; ++i)
+    {
+      R value = py::extract<R>(src_py[i]);
+      dst[i] = (T) value;
+    }
+  }
+
+  template <class T, class R>
+  py::tuple array_to_tuple_3(T *src)
+  {
+    return py::make_tuple((R) src[0], (R) src[1], (R) src[2]);
+  }
+
+  template <class T, class R>
+  py::tuple array_to_tuple_4(T *src)
+  {
+    return py::make_tuple((R) src[0], (R) src[1], (R) src[2], (R) src[3]);
+  }
+
+  // {{{ texture descriptor
+
+  struct WRAP_CUDA_TEXTURE_DESC : public CUDA_TEXTURE_DESC
+  {
+    WRAP_CUDA_TEXTURE_DESC()
+    {
+      // This class is a standard-layout type. CUDA_TEXTURE_DESC needs to
+      // be memset to 0. When it was a member of this class instead of a
+      // base, wrapping of its member fields failed. So, unorthodox as
+      // this might be, deriving from it and calling memset(this, ...)
+      // works and is permissible because of standard layout.
+      // This can be verified with:
+      //   #include <type_traits>
+      //   printf("traits: %d\n",
+      //       std::is_standard_layout<WRAP_CUDA_TEXTURE_DESC>::value);
+      // Do not add *any* member variables or virtual functions to this
+      // class -- that would make it non-standard-layout.
+      memset(this, 0, sizeof(CUDA_TEXTURE_DESC));
+    }
+
+    py::tuple get_address_mode()
+    {
+      return array_to_tuple_3<CUaddress_mode, unsigned>(addressMode);
+    }
+
+    void set_address_mode(py::tuple src_py)
+    {
+      tuple_to_array<CUaddress_mode, unsigned>(addressMode, src_py, 3);
+    }
+
+    py::tuple get_border_color()
+    {
+      return array_to_tuple_4<float, float>(borderColor);
+    }
+
+    void set_border_color(py::tuple src_py)
+    {
+      tuple_to_array<float, float>(borderColor, src_py, 4);
+    }
+  };
+
+  // }}}
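[Usage sketch, not part of the patch: how the descriptor wrapper above is meant to be driven from Python once the `TextureDescriptor` binding further down is registered. `drv.filter_mode` and the `TRSF_*` constants already exist in PyCUDA; the integer address-mode values mirror the driver's `CU_TR_ADDRESS_MODE_*` enum (0=WRAP, 1=CLAMP, 2=MIRROR, 3=BORDER).]

```python
import pycuda.autoinit  # noqa: F401 -- creates a context
import pycuda.driver as drv

tdesc = drv.TextureDescriptor()            # zero-initialized CUDA_TEXTURE_DESC
tdesc.address_mode = (1, 1, 1)             # CU_TR_ADDRESS_MODE_CLAMP per axis
tdesc.filter_mode = drv.filter_mode.LINEAR      # maps onto CUfilter_mode
tdesc.flags = drv.TRSF_NORMALIZED_COORDINATES   # CU_TRSF_* bit flags
tdesc.border_color = (0.0, 0.0, 0.0, 1.0)  # only consulted in BORDER mode
```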
+  // {{{ surface object
+
+  class surface_object : public boost::noncopyable
+  {
+    private:
+      boost::shared_ptr<array> m_array;
+      CUsurfObject m_cusurf;
+
+    public:
+      surface_object(boost::shared_ptr<array> ary)
+        : m_array(ary), m_cusurf(0)
+      {
+        CUDA_RESOURCE_DESC resdesc;
+        memset(&resdesc, 0, sizeof(resdesc));
+
+        resdesc.resType = CU_RESOURCE_TYPE_ARRAY;
+        resdesc.res.array.hArray = m_array->handle();
+        CUDAPP_CALL_GUARDED(cuSurfObjectCreate, (&m_cusurf, &resdesc));
+      }
+
+      CUsurfObject to_int()
+      {
+        // CUsurfObject is a typedef for unsigned long long
+        return m_cusurf;
+      }
+
+      ~surface_object()
+      {
+        CUDAPP_CALL_GUARDED_CLEANUP(cuSurfObjectDestroy, (m_cusurf));
+      }
+  };
+
+  // }}}
+
+  // {{{ texture object
+
+  class texture_object : public boost::noncopyable
+  {
+    private:
+      // either a numpy array or a CUDA array
+      py::object m_array;
+      CUtexObject m_cutex;
+
+    public:
+      texture_object(py::object ary, const CUDA_TEXTURE_DESC &texdesc)
+        : m_array(ary), m_cutex(0)
+      {
+        CUDA_RESOURCE_DESC resdesc;
+        memset(&resdesc, 0, sizeof(resdesc));
+
+        py::extract<const array &> extract_array(ary);
+        if (extract_array.check())
+        {
+          const array &pycuda_array = extract_array();
+          resdesc.resType = CU_RESOURCE_TYPE_ARRAY;
+          resdesc.res.array.hArray = pycuda_array.handle();
+          CUDAPP_CALL_GUARDED(cuTexObjectCreate,
+              (&m_cutex, &resdesc, &texdesc, NULL));
+          return;
+        }
+
+        if (!PyArray_Check(m_array.ptr()))
+          throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+              "ary argument is not a numpy array");
+
+        PyArrayObject *np_a = (PyArrayObject *) m_array.ptr();
+        void *data = PyArray_DATA(np_a);
+        unsigned memtype;
+        CUDAPP_CALL_GUARDED(cuPointerGetAttribute,
+            (&memtype, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) data));
+        if (memtype != CU_MEMORYTYPE_DEVICE)
+          throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+              "ary is not device memory");
+
+        int ndim = PyArray_NDIM(np_a);
+        int item_type = PyArray_TYPE(np_a);
+        int item_size = PyArray_ITEMSIZE(np_a);
+        npy_intp *shape_v = PyArray_SHAPE(np_a);
+        npy_intp *stride_v = PyArray_STRIDES(np_a);
+
+        // This used to be a check for !PyArray_ISCONTIGUOUS(ary.ptr()).
+        // However, we use views into padded arrays to create the
+        // equivalent of pitch2d memory from managed memory, and all views
+        // count as non-contiguous. So we instead check that the stride of
+        // the last dimension equals the item size. Also, as of Jan 2019,
+        // textures do not allow double precision.
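[Usage sketch, not part of the patch: the pitch2D branch below is what the "views into padded arrays" comment refers to. Shapes are (height, width, channels), so the last axis is the channel count; the 128-byte row alignment is illustrative only -- the actual requirement is the device's CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT.]

```python
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv

h, w, align = 480, 500, 128                       # align: illustrative
row_items = ((w*4 + align - 1)//align*align)//4   # float32 items per padded row
padded = drv.managed_empty((h, row_items, 1), np.float32,
                           mem_flags=drv.mem_attach_flags.GLOBAL)
view = padded[:, :w, :]  # non-contiguous view, unit stride on the last axis
# the view's stride_v[0] (padded row size in bytes) becomes pitchInBytes
tex = drv.TextureObject(view, drv.TextureDescriptor())
```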
+        if ((stride_v[ndim-1] != item_size) || (item_size > (int) sizeof(float)))
+          throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+              "ary argument is not contiguous or not texture-compatible");
+
+        int nchan, ndim_effective;
+        if (ndim == 1)
+        {
+          nchan = 1;
+          ndim_effective = 1;
+        }
+        else
+        {
+          // the last axis is interpreted as the channel count
+          nchan = shape_v[ndim - 1];
+          ndim_effective = ndim - 1;
+        }
+
+        if ((nchan < 1) || (nchan > 4))
+          throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+              "number of texture channels should be in {1,2,3,4}");
+        if (!((ndim_effective == 1) || (ndim_effective == 2)))
+          throw pycuda::error(
+              "texture_object", CUDA_ERROR_INVALID_VALUE,
+              "textures are limited to 1 or 2 dimensions of texels");
+
+        if (ndim_effective == 1)
+        {
+          resdesc.resType = CU_RESOURCE_TYPE_LINEAR;
+          resdesc.res.linear.format = get_cuarray_format(item_type);
+          resdesc.res.linear.devPtr = (CUdeviceptr) data;
+          resdesc.res.linear.numChannels = nchan;
+          resdesc.res.linear.sizeInBytes = PyArray_NBYTES(np_a);
+        }
+        else
+        {
+          resdesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+          resdesc.res.pitch2D.format = get_cuarray_format(item_type);
+          resdesc.res.pitch2D.devPtr = (CUdeviceptr) data;
+          resdesc.res.pitch2D.height = shape_v[0];
+          if (nchan == 3)
+          {
+            // CUDA does not support 3-channel images, even though RGB/BGR
+            // is a common case. For convolution kernels we can still use
+            // the 3-channel image without copying, but we need to treat
+            // it as a 3x-wide image with 1 channel.
+            resdesc.res.pitch2D.numChannels = 1;
+            resdesc.res.pitch2D.width = shape_v[1] * 3;
+          }
+          else
+          {
+            resdesc.res.pitch2D.numChannels = nchan;
+            resdesc.res.pitch2D.width = shape_v[1];
+          }
+          resdesc.res.pitch2D.pitchInBytes = stride_v[0];
+        }
+
+        CUDAPP_CALL_GUARDED(cuTexObjectCreate,
+            (&m_cutex, &resdesc, &texdesc, NULL));
+      }
+
+      CUarray_format get_cuarray_format(int type)
+      {
+        switch (type)
+        {
+          case NPY_INT8:
+            return CU_AD_FORMAT_SIGNED_INT8;
+          case NPY_UINT8:
+            return CU_AD_FORMAT_UNSIGNED_INT8;
+          case NPY_INT16:
+            return CU_AD_FORMAT_SIGNED_INT16;
+          case NPY_UINT16:
+            return CU_AD_FORMAT_UNSIGNED_INT16;
+          case NPY_INT32:
+            return CU_AD_FORMAT_SIGNED_INT32;
+          case NPY_UINT32:
+            return CU_AD_FORMAT_UNSIGNED_INT32;
+          case NPY_FLOAT16:
+            return CU_AD_FORMAT_HALF;
+          case NPY_FLOAT32:
+            return CU_AD_FORMAT_FLOAT;
+          default:
+            throw pycuda::error("texture_object", CUDA_ERROR_INVALID_VALUE,
+                "unsupported numpy type");
+        }
+      }
+
+      CUtexObject to_int()
+      {
+        return m_cutex;
+      }
+
+      ~texture_object()
+      {
+        CUDAPP_CALL_GUARDED_CLEANUP(cuTexObjectDestroy, (m_cutex));
+      }
+  };
+
+  // }}}
+
+#endif // CUDAPP_CUDA_VERSION >= 6000
+
+  // }}}
+
   // {{{ surface reference
 #if CUDAPP_CUDA_VERSION >= 3010
   class module;
@@ -1199,7 +1473,6 @@ namespace pycuda
   {
     private:
       CUsurfref m_surfref;
 
-      // life support for array and module
       boost::shared_ptr<array> m_array;
       boost::shared_ptr<module> m_module;
@@ -2194,9 +2467,8 @@ namespace pycuda
   }
 #endif
   // }}}
-}
-
+}
 
 #endif
diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index dfa3d1cda929f86a186fb6788bd0e438128a0f6f..39467d940208246bd29caf56c4eb2442bf5bdd5d 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -260,7 +260,7 @@ namespace
     py_buffer_wrapper buf_wrapper;
     buf_wrapper.get(dest.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
 
-    CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoH, 
+    CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoH,
         (buf_wrapper.m_buf.buf, ary.handle(), index, buf_wrapper.m_buf.len));
   }
 
@@ -1348,7 +1348,7 @@ BOOST_PYTHON_MODULE(_driver)
     wrp
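[Usage sketch, not part of the patch: combined with the driver.py change at the top, which marshals TextureObject/SurfaceObject as "P" arguments, the bindings above support a round trip like the following. `copy_tex` is a made-up kernel, not part of this patch.]

```python
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void copy_tex(cudaTextureObject_t tex, float *out)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    out[i] = tex1Dfetch<float>(tex, i);
}
""")

n = 256
src = drv.managed_empty((n, 1), np.float32,  # (n, 1): one channel
                        mem_flags=drv.mem_attach_flags.GLOBAL)
src[:] = np.random.randn(n, 1)
out = drv.managed_empty(n, np.float32, mem_flags=drv.mem_attach_flags.GLOBAL)

tex = drv.TextureObject(src, drv.TextureDescriptor())
mod.get_function("copy_tex")(tex, out, block=(n, 1, 1))
drv.Context.synchronize()
assert np.allclose(out, src.ravel())
```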
       .DEF_SIMPLE_METHOD(get_device_pointer)
-      .def("attach", &cl::attach, 
+      .def("attach", &cl::attach,
           (py::arg("mem_flags"), py::arg("stream")=py::object()))
       ;
   }
@@ -1633,6 +1633,41 @@ BOOST_PYTHON_MODULE(_driver)
   }
   // }}}
 
+#if CUDAPP_CUDA_VERSION >= 6000
+  // {{{ texture descriptor
+
+  {
+    typedef WRAP_CUDA_TEXTURE_DESC cl;
+    py::class_<cl>("TextureDescriptor")
+      .add_property("address_mode", &cl::get_address_mode, &cl::set_address_mode)
+      .def_readwrite("filter_mode", &cl::filterMode)
+      .def_readwrite("flags", &cl::flags)
+      .add_property("border_color", &cl::get_border_color, &cl::set_border_color)
+      ;
+  }
+
+  // }}}
+
+  // {{{ texture object
+
+  {
+    typedef texture_object cl;
+    py::class_<cl, boost::noncopyable>("TextureObject",
+        py::init<py::object, const CUDA_TEXTURE_DESC &>())
+      .def("__int__", &cl::to_int)
+      ;
+  }
+
+  // }}}
+
+  // {{{ surface object
+
+  {
+    typedef surface_object cl;
+    py::class_<cl, boost::noncopyable>("SurfaceObject",
+        py::init<boost::shared_ptr<array> >())
+      .def("__int__", &cl::to_int)
+      ;
+  }
+
+  // }}}
+#endif
+
   // {{{ surface reference
 #if CUDAPP_CUDA_VERSION >= 3010
   {
@@ -1669,6 +1704,7 @@ BOOST_PYTHON_MODULE(_driver)
 #ifdef HAVE_CURAND
   pycuda_expose_curand();
 #endif
+
 }
 
 // vim: foldmethod=marker
diff --git a/test/test_driver.py b/test/test_driver.py
index b440eff09e9791ea7977afd1cba550f0cebe173f..8425e7e38f90789fdae09d76a6a1dfbb9ea78ef0 100644
--- a/test/test_driver.py
+++ b/test/test_driver.py
@@ -947,6 +947,86 @@ class TestDriver:
         test_kernel = mod.get_function('test_kernel')
         test_kernel(grid=(2, 1), block=(1, 1, 1))
 
+    @mark_cuda_test
+    def test_bindless_textures(self):
+        if drv.get_version() < (6, 0, 0):
+            pytest.skip("bindless textures not supported on CUDA < 6")
+
+        mod = SourceModule("""
+        __global__ void gpu_add(float *a, float *b, float *res)
+        {
+            int idx = threadIdx.x + threadIdx.y*16;
+            res[idx] = a[idx] + b[idx];
+        }
+
+        // the texture object is an ordinary kernel argument
+        __global__ void tex_kernel1(
+            cudaTextureObject_t tex, cudaSurfaceObject_t surf)
+        {
+            int i = blockIdx.x*blockDim.x + threadIdx.x;
+            float x = tex1Dfetch<float>(tex, i);
+            // do some work using x ...
+        }
+        """)
+
+        print('TextureObject exists. Members:', dir(drv.TextureObject))
+
+        tdesc = drv.TextureDescriptor()
+        print('TextureDescriptor exists. Members:', dir(tdesc))
+
+        print('Setting address mode')
+        print('before', tdesc.address_mode)
+        tdesc.address_mode = (1, 2, 3)
+        print('after', tdesc.address_mode)
+
+        adesc = drv.ArrayDescriptor()
+        adesc.width = 1024
+        adesc.height = 768
+        adesc.format = drv.array_format.UNSIGNED_INT8
+        adesc.num_channels = 4
+        sarray = drv.Array(adesc)
+        print('sarray made')
+        surf1 = drv.SurfaceObject(sarray)
+        print('surf1 made')
+
+        a_um_a = drv.managed_empty(
+            (16, 16, 4), np.float32, mem_flags=drv.mem_attach_flags.GLOBAL)
+        a_um_a[:] = np.random.randn(*a_um_a.shape)
+
+        import sys
+        print('pre ref_count', sys.getrefcount(a_um_a))
+        tex2 = drv.TextureObject(a_um_a, tdesc)
+        print('post ref_count', sys.getrefcount(a_um_a))
+        print('tex2 made')
+
+        print('{0:016x}'.format(id(tex2)), '{0:016x}'.format(int(tex2)))
+
+        func = mod.get_function("tex_kernel1")
+        func(tex2, surf1, block=(16, 16, 1))
+        drv.Context.synchronize()
+
+        # Unified-memory version of hello world.
+        a_um_a = drv.managed_empty(
+            (16, 16), np.float32, mem_flags=drv.mem_attach_flags.GLOBAL)
+        a_um_a[:] = np.random.randn(*a_um_a.shape)
+
+        b_um_a = drv.managed_empty(
+            (16, 16), np.float32, mem_flags=drv.mem_attach_flags.GLOBAL)
+        b_um_a[:] = np.random.randn(*b_um_a.shape)
+
+        res_um_a = drv.managed_empty(
+            (16, 16), np.float32, mem_flags=drv.mem_attach_flags.GLOBAL)
+
+        func = mod.get_function("gpu_add")
+        func(a_um_a, b_um_a, res_um_a, block=(16, 16, 1))
+        drv.Context.synchronize()
+
+        print(res_um_a)
+
+        assert np.allclose(res_um_a, a_um_a + b_um_a)
+
 
 def test_import_pyopencl_before_pycuda():
     try:
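[Usage sketch, not part of the patch: the nchan == 3 branch in texture_object presents an RGB image of shape (h, w, 3) to CUDA as an (h, 3*w) single-channel texture, so channel c of texel (x, y) lives at x-coordinate 3*x + c. The `gray` kernel below is illustrative only.]

```python
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void gray(cudaTextureObject_t rgb, float *out, int w)
{
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    // point filtering: +0.5f lands in the middle of the addressed texel
    float r = tex2D<float>(rgb, 3*x + 0.5f, y + 0.5f);
    float g = tex2D<float>(rgb, 3*x + 1.5f, y + 0.5f);
    float b = tex2D<float>(rgb, 3*x + 2.5f, y + 0.5f);
    out[y*w + x] = 0.299f*r + 0.587f*g + 0.114f*b;
}
""")

h, w = 16, 16
img = drv.managed_empty((h, w, 3), np.float32,
                        mem_flags=drv.mem_attach_flags.GLOBAL)
img[:] = np.random.rand(h, w, 3)
out = drv.managed_empty((h, w), np.float32,
                        mem_flags=drv.mem_attach_flags.GLOBAL)

tdesc = drv.TextureDescriptor()
tdesc.address_mode = (1, 1, 1)  # clamp; wrap requires normalized coordinates
tex = drv.TextureObject(img, tdesc)
mod.get_function("gray")(tex, out, np.int32(w), block=(w, h, 1))
drv.Context.synchronize()
assert np.allclose(out, img @ np.float32([0.299, 0.587, 0.114]), atol=1e-5)
```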