class array : public boost::noncopyable, public context_dependent
{
private:
CUarray m_array;
bool m_managed;
public:
array(const CUDA_ARRAY_DESCRIPTOR &descr)
  : m_managed(true)
{ CUDAPP_CALL_GUARDED(cuArrayCreate, (&m_array, &descr)); }
#if CUDAPP_CUDA_VERSION >= 2000
array(const CUDA_ARRAY3D_DESCRIPTOR &descr)
  : m_managed(true)
{ CUDAPP_CALL_GUARDED(cuArray3DCreate, (&m_array, &descr)); }
#endif
array(CUarray ary, bool managed)
  : m_array(ary), m_managed(managed)
{ }
~array()
{ free(); }
void free()
{
if (m_managed)
{
try
{
scoped_context_activation ca(get_context());
CUDAPP_CALL_GUARDED_CLEANUP(cuArrayDestroy, (m_array));
}
CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(array);
m_managed = false;
release_context();
}
}
CUDA_ARRAY_DESCRIPTOR get_descriptor()
{
CUDA_ARRAY_DESCRIPTOR result;
CUDAPP_CALL_GUARDED(cuArrayGetDescriptor, (&result, m_array));
return result;
}
#if CUDAPP_CUDA_VERSION >= 2000
CUDA_ARRAY3D_DESCRIPTOR get_descriptor_3d()
{
CUDA_ARRAY3D_DESCRIPTOR result;
CUDAPP_CALL_GUARDED(cuArray3DGetDescriptor, (&result, m_array));
return result;
}
#endif
CUarray handle() const
{ return m_array; }
intptr_t handle_int() const
{ return (intptr_t) m_array; }
};
class module;
class texture_reference : public boost::noncopyable
{
private:
CUtexref m_texref;
bool m_managed;
// life support for array and module
boost::shared_ptr<array> m_array;
boost::shared_ptr<module> m_module;
public:
texture_reference()
: m_managed(true)
{ CUDAPP_CALL_GUARDED(cuTexRefCreate, (&m_texref)); }
texture_reference(CUtexref tr, bool managed)
: m_texref(tr), m_managed(managed)
{ }
~texture_reference()
{
  if (m_managed)
{
CUDAPP_CALL_GUARDED_CLEANUP(cuTexRefDestroy, (m_texref));
}
}
void set_module(boost::shared_ptr<module> mod)
{ m_module = mod; }
CUtexref handle() const
{ return m_texref; }
void set_array(boost::shared_ptr<array> ary)
{
CUDAPP_CALL_GUARDED(cuTexRefSetArray, (m_texref,
ary->handle(), CU_TRSA_OVERRIDE_FORMAT));
m_array = ary;
}
pycuda_size_t set_address(CUdeviceptr dptr, unsigned int bytes, bool allow_offset=false)
{
  pycuda_size_t byte_offset;
  CUDAPP_CALL_GUARDED(cuTexRefSetAddress, (&byte_offset,
      m_texref, dptr, bytes));
if (!allow_offset && byte_offset != 0)
throw pycuda::error("texture_reference::set_address", CUDA_ERROR_INVALID_VALUE,
"texture binding resulted in offset, but allow_offset was false");
m_array.reset();
return byte_offset;
}
#if CUDAPP_CUDA_VERSION >= 2020
void set_address_2d(CUdeviceptr dptr,
    const CUDA_ARRAY_DESCRIPTOR &descr, unsigned int pitch)
{
CUDAPP_CALL_GUARDED(cuTexRefSetAddress2D, (m_texref, &descr, dptr, pitch));
}
#endif
void set_format(CUarray_format fmt, int num_packed_components)
{ CUDAPP_CALL_GUARDED(cuTexRefSetFormat, (m_texref, fmt, num_packed_components)); }
void set_address_mode(int dim, CUaddress_mode am)
{ CUDAPP_CALL_GUARDED(cuTexRefSetAddressMode, (m_texref, dim, am)); }
void set_filter_mode(CUfilter_mode fm)
{ CUDAPP_CALL_GUARDED(cuTexRefSetFilterMode, (m_texref, fm)); }
void set_flags(unsigned int flags)
{ CUDAPP_CALL_GUARDED(cuTexRefSetFlags, (m_texref, flags)); }
CUdeviceptr get_address()
{
CUdeviceptr result;
CUDAPP_CALL_GUARDED(cuTexRefGetAddress, (&result, m_texref));
return result;
}
array *get_array()
{
CUarray result;
CUDAPP_CALL_GUARDED(cuTexRefGetArray, (&result, m_texref));
return new array(result, false);
}
CUaddress_mode get_address_mode(int dim)
{
CUaddress_mode result;
CUDAPP_CALL_GUARDED(cuTexRefGetAddressMode, (&result, m_texref, dim));
return result;
}
CUfilter_mode get_filter_mode()
{
CUfilter_mode result;
CUDAPP_CALL_GUARDED(cuTexRefGetFilterMode, (&result, m_texref));
return result;
}
#if CUDAPP_CUDA_VERSION >= 2000
py::tuple get_format()
{
CUarray_format fmt;
int num_channels;
CUDAPP_CALL_GUARDED(cuTexRefGetFormat, (&fmt, &num_channels, m_texref));
return py::make_tuple(fmt, num_channels);
}
#endif
unsigned int get_flags()
{
unsigned int result;
CUDAPP_CALL_GUARDED(cuTexRefGetFlags, (&result, m_texref));
return result;
}
};
#if CUDAPP_CUDA_VERSION >= 3010
class module;
class surface_reference : public boost::noncopyable
{
private:
CUsurfref m_surfref;
// life support for array and module
boost::shared_ptr<array> m_array;
boost::shared_ptr<module> m_module;
public:
surface_reference(CUsurfref sr)
: m_surfref(sr)
{ }
void set_module(boost::shared_ptr<module> mod)
{ m_module = mod; }
CUsurfref handle() const
{ return m_surfref; }
void set_array(boost::shared_ptr<array> ary, unsigned int flags)
{
CUDAPP_CALL_GUARDED(cuSurfRefSetArray, (m_surfref, ary->handle(), flags));
m_array = ary;
}
array *get_array()
{
CUarray result;
CUDAPP_CALL_GUARDED(cuSurfRefGetArray, (&result, m_surfref));
return new array(result, false);
}
};
#endif
class function;
class module : public boost::noncopyable, public context_dependent
{
private:
CUmodule m_module;
public:
module(CUmodule mod)
  : m_module(mod)
{ }
~module()
{
try
{
scoped_context_activation ca(get_context());
CUDAPP_CALL_GUARDED_CLEANUP(cuModuleUnload, (m_module));
}
CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(module);
release_context();
}
CUmodule handle() const
{ return m_module; }
function get_function(const char *name);
py::tuple get_global(const char *name)
{
CUdeviceptr devptr;
pycuda_size_t bytes;
CUDAPP_CALL_GUARDED(cuModuleGetGlobal, (&devptr, &bytes, m_module, name));
return py::make_tuple(devptr, bytes);
}
};
module *module_from_file(const char *filename)
{
CUmodule mod;
CUDAPP_CALL_GUARDED(cuModuleLoad, (&mod, filename));
return new module(mod);
}
texture_reference *module_get_texref(
boost::shared_ptr<module> mod, const char *name)
{
CUtexref tr;
CUDAPP_CALL_GUARDED(cuModuleGetTexRef, (&tr, mod->handle(), name));
std::auto_ptr<texture_reference> result(
new texture_reference(tr, false));
result->set_module(mod);
return result.release();
}
#if CUDAPP_CUDA_VERSION >= 3010
inline
surface_reference *module_get_surfref(
boost::shared_ptr<module> mod, const char *name)
{
CUsurfref sr;
CUDAPP_CALL_GUARDED(cuModuleGetSurfRef, (&sr, mod->handle(), name));
std::auto_ptr<surface_reference> result(
new surface_reference(sr));
result->set_module(mod);
return result.release();
}
#endif
class function
{
private:
CUfunction m_function;
std::string m_symbol;
public:
function(CUfunction func, std::string const &sym)
: m_function(func), m_symbol(sym)
{ }
void set_block_shape(int x, int y, int z)
{
  CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
      cuFuncSetBlockShape, (m_function, x, y, z), m_symbol);
}
void set_shared_size(unsigned int bytes)
{
  CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
      cuFuncSetSharedSize, (m_function, bytes), m_symbol);
}
void param_set_size(unsigned int bytes)
{
  CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
      cuParamSetSize, (m_function, bytes), m_symbol);
}
void param_set(int offset, unsigned int value)
{
  CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
      cuParamSeti, (m_function, offset, value), m_symbol);
}
void param_set(int offset, float value)
{
  CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
      cuParamSetf, (m_function, offset, value), m_symbol);
}
void param_setv(int offset, void *buf, size_t len)
{
  // maybe the unsigned int will change, it does not seem right
  CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
      cuParamSetv, (m_function, offset, buf, (unsigned int) len), m_symbol);
}
void param_set_texref(const texture_reference &tr)
{
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(cuParamSetTexRef, (m_function,
CU_PARAM_TR_DEFAULT, tr.handle()), m_symbol);
}
void launch()
{
  CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
      cuLaunch, (m_function), m_symbol);
}
void launch_grid(int grid_width, int grid_height)
{
  CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
      cuLaunchGrid, (m_function, grid_width, grid_height), m_symbol);
}
void launch_grid_async(int grid_width, int grid_height, const stream &s)
{
  CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
      cuLaunchGridAsync, (m_function, grid_width, grid_height, s.handle()),
      m_symbol);
}
#if CUDAPP_CUDA_VERSION >= 2020
int get_attribute(CUfunction_attribute attr) const
{
int result;
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuFuncGetAttribute, (&result, attr, m_function), m_symbol);
return result;
}
#endif
#if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA)
void set_cache_config(CUfunc_cache fc)
{
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuFuncSetCacheConfig, (m_function, fc), m_symbol);
}
#endif
#if CUDAPP_CUDA_VERSION >= 4000
void launch_kernel(py::tuple grid_dim_py, py::tuple block_dim_py,
py::object parameter_buffer,
unsigned shared_mem_bytes, py::object stream_py)
{
const unsigned axis_count = 3;
unsigned grid_dim[axis_count];
unsigned block_dim[axis_count];
for (unsigned i = 0; i < axis_count; ++i)
{
grid_dim[i] = 1;
block_dim[i] = 1;
}
pycuda_size_t gd_length = py::len(grid_dim_py);
if (gd_length > axis_count)
throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
"too many grid dimensions in kernel launch");
for (unsigned i = 0; i < gd_length; ++i)
grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);
pycuda_size_t bd_length = py::len(block_dim_py);
if (bd_length > axis_count)
throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
"too many block dimensions in kernel launch");
for (unsigned i = 0; i < bd_length; ++i)
block_dim[i] = py::extract<unsigned>(block_dim_py[i]);
PYCUDA_PARSE_STREAM_PY;
py_buffer_wrapper par_buf_wrapper;
par_buf_wrapper.get(parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
size_t par_len = par_buf_wrapper.m_buf.len;
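// cuLaunchKernel's "extra" mechanism: instead of setting each kernel argument
// individually, the packed argument buffer and its size are passed as a
// CU_LAUNCH_PARAM_* key/value list terminated by CU_LAUNCH_PARAM_END.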
void *config[] = {
  CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast<void *>(par_buf_wrapper.m_buf.buf),
CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len,
CU_LAUNCH_PARAM_END
};
CUDAPP_CALL_GUARDED(
cuLaunchKernel, (m_function,
grid_dim[0], grid_dim[1], grid_dim[2],
block_dim[0], block_dim[1], block_dim[2],
shared_mem_bytes, s_handle, 0, config
));
}
#endif
#if CUDAPP_CUDA_VERSION >= 4020
void set_shared_config(CUsharedconfig config)
{
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuFuncSetSharedMemConfig, (m_function, config), m_symbol);
}
#endif
};
inline function module::get_function(const char *name)
{
  CUfunction func;
  CUDAPP_CALL_GUARDED(cuModuleGetFunction, (&func, m_module, name));
  return function(func, name);
}
inline py::tuple mem_get_info()
{
  pycuda_size_t free, total;
  CUDAPP_CALL_GUARDED(cuMemGetInfo, (&free, &total));
  return py::make_tuple(free, total);
}
CUdeviceptr mem_alloc(size_t bytes)
{
CUdeviceptr devptr;
CUDAPP_CALL_GUARDED(cuMemAlloc, (&devptr, bytes));
return devptr;
}
inline void mem_free(CUdeviceptr devptr)
{
  CUDAPP_CALL_GUARDED_CLEANUP(cuMemFree, (devptr));
}
// A class the user can override to make device_allocation-
// workalikes.
class pointer_holder_base
{
public:
virtual ~pointer_holder_base() { }
virtual CUdeviceptr get_pointer() = 0;
operator CUdeviceptr()
{ return get_pointer(); }
py::object as_buffer(size_t size, size_t offset)
{
return py::object(
py::handle<>(
#if PY_VERSION_HEX >= 0x03030000
PyMemoryView_FromMemory((char *) (get_pointer() + offset), size,
PyBUF_READ | PyBUF_WRITE)
#else /* Py2 */
PyBuffer_FromReadWriteMemory((void *) (get_pointer() + offset), size)
#endif
));
}
};
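// Illustrative sketch (not part of the original source): the simplest
// possible pointer_holder_base workalike, handing PyCUDA a device pointer
// that is owned and freed elsewhere. The class name is made up for the
// example.
class example_borrowed_pointer : public pointer_holder_base
{
  private:
    CUdeviceptr m_ptr;
  public:
    example_borrowed_pointer(CUdeviceptr ptr)
      : m_ptr(ptr)
    { }
    // Satisfies the pure virtual; no ownership is taken, so nothing to clean up.
    CUdeviceptr get_pointer()
    { return m_ptr; }
};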
class device_allocation : public boost::noncopyable, public context_dependent
{
private:
  CUdeviceptr m_devptr;
  bool m_valid;
public:
  device_allocation(CUdeviceptr devptr)
    : m_devptr(devptr), m_valid(true)
  { }
  void free()
  {
    if (m_valid)
    {
      try
      {
        scoped_context_activation ca(get_context());
        mem_free(m_devptr);
      }
      CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(device_allocation);
      release_context();
      m_valid = false;
    }
    else
      throw pycuda::error("device_allocation::free", CUDA_ERROR_INVALID_HANDLE);
  }
  ~device_allocation()
  {
    if (m_valid)
      free();
  }
  operator CUdeviceptr() const
  { return m_devptr; }
py::object as_buffer(size_t size, size_t offset)
{
return py::object(
py::handle<>(
#if PY_VERSION_HEX >= 0x03030000
PyMemoryView_FromMemory((char *) (m_devptr + offset), size,
PyBUF_READ | PyBUF_WRITE)
#else /* Py2 */
PyBuffer_FromReadWriteMemory((void *) (m_devptr + offset), size)
#endif
));
}
};
inline Py_ssize_t mem_alloc_pitch(
std::auto_ptr<device_allocation> &da,
unsigned int width, unsigned int height, unsigned int access_size)
{
CUdeviceptr devptr;
pycuda_size_t pitch;
CUDAPP_CALL_GUARDED(cuMemAllocPitch, (&devptr, &pitch, width, height, access_size));
da = std::auto_ptr<device_allocation>(new device_allocation(devptr));
return pitch;
}
py::tuple mem_get_address_range(CUdeviceptr ptr)
{
CUdeviceptr base;
pycuda_size_t size;
CUDAPP_CALL_GUARDED(cuMemGetAddressRange, (&base, &size, ptr));
return py::make_tuple(base, size);
}
void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, unsigned int len)
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoA, (ary.handle(), index, src, len)); }
void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, unsigned int len)
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoD, (dst, ary.handle(), index, len)); }
void memcpy_atoa(
array const &dst, unsigned int dst_index,
array const &src, unsigned int src_index,
unsigned int len)
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoA, (dst.handle(), dst_index, src.handle(), src_index, len)); }
// {{{ ipc_mem_handle
#if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000
class ipc_mem_handle : public boost::noncopyable, public context_dependent
{
private:
bool m_valid;
protected:
CUdeviceptr m_devptr;
public:
ipc_mem_handle(py::object obj, CUipcMem_flags flags=CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS)
: m_valid(true)
{
if (!PyByteArray_Check(obj.ptr()))
throw pycuda::error("event_from_ipc_handle", CUDA_ERROR_INVALID_VALUE,
"argument is not a bytes array");
CUipcMemHandle handle;
if (PyByteArray_GET_SIZE(obj.ptr()) != sizeof(handle))
throw pycuda::error("event_from_ipc_handle", CUDA_ERROR_INVALID_VALUE,
"handle has the wrong size");
memcpy(&handle, PyByteArray_AS_STRING(obj.ptr()), sizeof(handle));
CUDAPP_CALL_GUARDED(cuIpcOpenMemHandle, (&m_devptr, handle, flags));
}
void close()
{
if (m_valid)
{
try
{
scoped_context_activation ca(get_context());
CUDAPP_CALL_GUARDED_CLEANUP(cuIpcCloseMemHandle, (m_devptr));
}
CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(ipc_mem_handle);
release_context();
m_valid = false;
}
else
throw pycuda::error("ipc_mem_handle::close", CUDA_ERROR_INVALID_HANDLE);
}
~ipc_mem_handle()
{
if (m_valid)
close();
}
operator CUdeviceptr() const
{ return m_devptr; }
};
inline
py::object mem_get_ipc_handle(CUdeviceptr devptr)
{
CUipcMemHandle handle;
CUDAPP_CALL_GUARDED(cuIpcGetMemHandle, (&handle, devptr));
return py::object(py::handle<>(PyByteArray_FromStringAndSize(
reinterpret_cast<const char *>(&handle),
sizeof(handle))));
}
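// Usage note: the exporting process turns an allocation's device pointer into
// transportable handle bytes via mem_get_ipc_handle(); the importing process
// feeds those bytes (received over any ordinary IPC channel) to the
// ipc_mem_handle constructor above to map the same memory into its own
// context.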
#endif
// }}}
#define MEMCPY_SETTERS \
void set_src_host(py::object buf_py) \
{ \
srcMemoryType = CU_MEMORYTYPE_HOST; \
py_buffer_wrapper buf_wrapper; \
buf_wrapper.get(buf_py.ptr(), PyBUF_STRIDED_RO); \
srcHost = buf_wrapper.m_buf.buf; \
} \
\
void set_src_array(array const &ary) \
{ \
srcMemoryType = CU_MEMORYTYPE_ARRAY; \
srcArray = ary.handle(); \
} \
\
void set_src_device(CUdeviceptr devptr) \
{ \
srcMemoryType = CU_MEMORYTYPE_DEVICE; \
srcDevice = devptr; \
} \
\
void set_dst_host(py::object buf_py) \
{ \
dstMemoryType = CU_MEMORYTYPE_HOST; \
py_buffer_wrapper buf_wrapper; \
buf_wrapper.get(buf_py.ptr(), PyBUF_STRIDED); \
dstHost = buf_wrapper.m_buf.buf; \
} \
\
void set_dst_array(array const &ary) \
{ \
dstMemoryType = CU_MEMORYTYPE_ARRAY; \
dstArray = ary.handle(); \
} \
\
void set_dst_device(CUdeviceptr devptr) \
{ \
dstMemoryType = CU_MEMORYTYPE_DEVICE; \
dstDevice = devptr; \
}
#if CUDAPP_CUDA_VERSION >= 4000
#define MEMCPY_SETTERS_UNIFIED \
void set_src_unified(py::object buf_py) \
{ \
srcMemoryType = CU_MEMORYTYPE_UNIFIED; \
py_buffer_wrapper buf_wrapper; \
buf_wrapper.get(buf_py.ptr(), PyBUF_ANY_CONTIGUOUS); \
srcHost = buf_wrapper.m_buf.buf; \
} \
\
void set_dst_unified(py::object buf_py) \
{ \
dstMemoryType = CU_MEMORYTYPE_UNIFIED; \
py_buffer_wrapper buf_wrapper; \
buf_wrapper.get(buf_py.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE); \
dstHost = buf_wrapper.m_buf.buf; \
}
#else
#define MEMCPY_SETTERS_UNIFIED /* empty */
#endif
struct memcpy_2d : public CUDA_MEMCPY2D
{
memcpy_2d()
{
srcXInBytes = 0;
srcY = 0;
dstXInBytes = 0;
dstY = 0;
}
MEMCPY_SETTERS;
MEMCPY_SETTERS_UNIFIED;
void execute(bool aligned=false) const
{
if (aligned)
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2D, (this)); }
else
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DUnaligned, (this)); }
}
void execute_async(const stream &s) const
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DAsync, (this, s.handle())); }
};
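// Illustrative sketch (not part of the original source): copying a
// rectangular region between two pitched device buffers through memcpy_2d.
// The pointers and pitches are assumed to come from earlier
// cuMemAllocPitch-style allocations; the function name is made up.
inline void example_copy_rect(CUdeviceptr src, size_t src_pitch,
    CUdeviceptr dst, size_t dst_pitch, size_t width_bytes, size_t height)
{
  memcpy_2d cpy;
  cpy.set_src_device(src);
  cpy.srcPitch = src_pitch;
  cpy.set_dst_device(dst);
  cpy.dstPitch = dst_pitch;
  cpy.WidthInBytes = width_bytes;  // bytes per copied row
  cpy.Height = height;             // number of rows
  cpy.execute();                   // synchronous cuMemcpy2DUnaligned by default
}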
#if CUDAPP_CUDA_VERSION >= 2000
struct memcpy_3d : public CUDA_MEMCPY3D
{
memcpy_3d()
{
reserved0 = 0;
reserved1 = 0;
srcXInBytes = 0;
srcY = 0;
srcZ = 0;
srcLOD = 0;
dstXInBytes = 0;
dstY = 0;
dstZ = 0;
dstLOD = 0;
}
MEMCPY_SETTERS;
MEMCPY_SETTERS_UNIFIED;
void execute() const
{
CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3D, (this));
}
void execute_async(const stream &s) const
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DAsync, (this, s.handle())); }
};
#endif
#if CUDAPP_CUDA_VERSION >= 4000
struct memcpy_3d_peer : public CUDA_MEMCPY3D_PEER
{
memcpy_3d_peer()
{
srcXInBytes = 0;
srcY = 0;
srcZ = 0;
srcLOD = 0;
dstXInBytes = 0;
dstY = 0;
dstZ = 0;
dstLOD = 0;
}
MEMCPY_SETTERS;
MEMCPY_SETTERS_UNIFIED;
void set_src_context(context const &ctx)
{
srcContext = ctx.handle();
}
void set_dst_context(context const &ctx)
{
dstContext = ctx.handle();
}
void execute() const
{
CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeer, (this));
}
void execute_async(const stream &s) const
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeerAsync, (this, s.handle())); }
};
#endif
inline void *mem_host_alloc(size_t size, unsigned flags=0)
{
void *m_data;
#if CUDAPP_CUDA_VERSION >= 2020
CUDAPP_CALL_GUARDED(cuMemHostAlloc, (&m_data, size, flags));
#else
if (flags != 0)
throw pycuda::error("mem_host_alloc", CUDA_ERROR_INVALID_VALUE,
"nonzero flags in mem_host_alloc not allowed in CUDA 2.1 and older");
CUDAPP_CALL_GUARDED(cuMemAllocHost, (&m_data, size));
#endif
return m_data;
}
inline void mem_host_free(void *ptr)
{
CUDAPP_CALL_GUARDED_CLEANUP(cuMemFreeHost, (ptr));
}
#if CUDAPP_CUDA_VERSION >= 6000
inline CUdeviceptr mem_managed_alloc(size_t size, unsigned flags=0)
{
CUdeviceptr m_data;
CUDAPP_CALL_GUARDED(cuMemAllocManaged, (&m_data, size, flags));
return m_data;
}
#endif
#if CUDAPP_CUDA_VERSION >= 4000
inline void *mem_host_register(void *ptr, size_t bytes, unsigned int flags=0)
{
CUDAPP_CALL_GUARDED(cuMemHostRegister, (ptr, bytes, flags));
return ptr;
}
inline void mem_host_unregister(void *ptr)
{
CUDAPP_CALL_GUARDED_CLEANUP(cuMemHostUnregister, (ptr));
}
#endif
inline void *aligned_malloc(size_t size, size_t alignment, void **original_pointer)
{
// alignment must be a power of two.
if ((alignment & (alignment - 1)) != 0)
throw pycuda::error("aligned_malloc", CUDA_ERROR_INVALID_VALUE,
"alignment must be a power of two");
if (alignment == 0)
throw pycuda::error("aligned_malloc", CUDA_ERROR_INVALID_VALUE,
"alignment must non-zero");
void *p = malloc(size + (alignment - 1));
if (!p)
throw pycuda::error("aligned_malloc", CUDA_ERROR_OUT_OF_MEMORY,
"aligned malloc failed");
*original_pointer = p;
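// Round p up to the next multiple of 'alignment': adding alignment-1 and
// masking with -alignment (i.e. ~(alignment-1)) clears the low bits.
// Example: alignment 256, p = 0x1234 -> 0x1234 + 0xff = 0x1333 -> & ~0xff = 0x1300.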
p = (void *)((((ptrdiff_t)(p)) + (alignment-1)) & -alignment);
return p;
}
struct host_pointer : public boost::noncopyable, public context_dependent
{
  bool m_valid;
  void *m_data;
  host_pointer()
    : m_valid(false)
  { }
host_pointer(void *ptr)
: m_valid(true), m_data(ptr)
{ }
virtual ~host_pointer()
{ }
void *data()
{ return m_data; }
#if CUDAPP_CUDA_VERSION >= 2020
CUdeviceptr get_device_pointer()
{
CUdeviceptr result;
CUDAPP_CALL_GUARDED(cuMemHostGetDevicePointer, (&result, m_data, 0));
return result;
}
#endif
};
struct pagelocked_host_allocation : public host_pointer
{
public:
pagelocked_host_allocation(size_t bytesize, unsigned flags=0)
: host_pointer(mem_host_alloc(bytesize, flags))
{ }
/* Don't try to be clever and coalesce these in the base class.
* Won't work: Destructors may not call virtual functions.
*/
~pagelocked_host_allocation()
{
if (m_valid)
free();
}
void free()
{
  if (m_valid)
  {
    try
    {
      scoped_context_activation ca(get_context());
      mem_host_free(m_data);
    }
    CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(pagelocked_host_allocation);
    release_context();
    m_valid = false;
  }
  else
    throw pycuda::error("pagelocked_host_allocation::free", CUDA_ERROR_INVALID_HANDLE);
}
#if CUDAPP_CUDA_VERSION >= 3020
unsigned int get_flags()
{
unsigned int flags;
CUDAPP_CALL_GUARDED(cuMemHostGetFlags, (&flags, m_data));
return flags;
}
#endif
};
struct aligned_host_allocation : public host_pointer
{
void *m_original_pointer;
public:
aligned_host_allocation(size_t size, size_t alignment)
: host_pointer(aligned_malloc(size, alignment, &m_original_pointer))
{ }
/* Don't try to be clever and coalesce these in the base class.
* Won't work: Destructors may not call virtual functions.
*/
~aligned_host_allocation()
{
if (m_valid)
free();
}
void free()
{
if (m_valid)
{