Newer
Older
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
class module;
class texture_reference : public boost::noncopyable
{
private:
CUtexref m_texref;
bool m_managed;
// life support for array and module
boost::shared_ptr<array> m_array;
boost::shared_ptr<module> m_module;
public:
// Creates a new texture reference via cuTexRefCreate and marks it as managed,
// which makes the destructor destroy it again.
texture_reference()
  : m_managed(true)
{
  CUDAPP_CALL_GUARDED(cuTexRefCreate, (&m_texref));
}
// Wraps an existing texture reference handle.
//   tr      - handle to wrap
//   managed - whether the destructor should call cuTexRefDestroy on it
texture_reference(CUtexref tr, bool managed)
  : m_texref(tr),
    m_managed(managed)
{
}
// Destroys the wrapped texture reference, but only when this object was
// flagged as managing it; unmanaged handles are left untouched.
~texture_reference()
{
  if (m_managed)
  {
    CUDAPP_CALL_GUARDED_CLEANUP(cuTexRefDestroy, (m_texref));
  }
}
// Keeps a shared reference to `mod` in m_module ("life support" for the
// module, per the member comment).
void set_module(boost::shared_ptr<module> mod)
{
  m_module = mod;
}
// Returns the raw driver handle wrapped by this object.
CUtexref handle() const
{
  return m_texref;
}
// Binds `ary` to this texture reference (with CU_TRSA_OVERRIDE_FORMAT) and,
// once the driver call has succeeded, keeps a shared reference to the array.
void set_array(boost::shared_ptr<array> ary)
{
  CUDAPP_CALL_GUARDED(
      cuTexRefSetArray,
      (m_texref, ary->handle(), CU_TRSA_OVERRIDE_FORMAT));

  // Only retain the array after a successful bind.
  m_array = ary;
}
// Binds a linear device memory range to this texture reference.
//   dptr         - start of the device memory range
//   bytes        - size of the range in bytes
//   allow_offset - when false, a non-zero byte offset reported by the driver
//                  is treated as an error
// Returns the byte offset reported by cuTexRefSetAddress.
// Throws pycuda::error (CUDA_ERROR_INVALID_VALUE) if an offset resulted and
// allow_offset is false.
pycuda_size_t set_address(CUdeviceptr dptr, unsigned int bytes, bool allow_offset=false)
{
  pycuda_size_t byte_offset;
  CUDAPP_CALL_GUARDED(cuTexRefSetAddress, (&byte_offset,
        m_texref, dptr, bytes));

  if (!allow_offset && byte_offset != 0)
    throw pycuda::error("texture_reference::set_address", CUDA_ERROR_INVALID_VALUE,
        "texture binding resulted in offset, but allow_offset was false");

  // The texture is now bound to linear memory, so drop any retained array.
  m_array.reset();
  return byte_offset;
}
Andreas Klöckner
committed
#if CUDAPP_CUDA_VERSION >= 2020
const CUDA_ARRAY_DESCRIPTOR &descr, unsigned int pitch)
{
CUDAPP_CALL_GUARDED(cuTexRefSetAddress2D, (m_texref, &descr, dptr, pitch));
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
// Forwards the element format and packed component count to cuTexRefSetFormat.
void set_format(CUarray_format fmt, int num_packed_components)
{
  CUDAPP_CALL_GUARDED(
      cuTexRefSetFormat,
      (m_texref, fmt, num_packed_components));
}
// Sets the addressing mode `am` for dimension `dim` via cuTexRefSetAddressMode.
void set_address_mode(int dim, CUaddress_mode am)
{
  CUDAPP_CALL_GUARDED(
      cuTexRefSetAddressMode,
      (m_texref, dim, am));
}
// Sets the filter mode via cuTexRefSetFilterMode.
void set_filter_mode(CUfilter_mode fm)
{
  CUDAPP_CALL_GUARDED(
      cuTexRefSetFilterMode,
      (m_texref, fm));
}
// Passes `flags` unchanged to cuTexRefSetFlags.
void set_flags(unsigned int flags)
{
  CUDAPP_CALL_GUARDED(
      cuTexRefSetFlags,
      (m_texref, flags));
}
// Queries the device address reported by cuTexRefGetAddress.
CUdeviceptr get_address()
{
  CUdeviceptr bound_address;
  CUDAPP_CALL_GUARDED(
      cuTexRefGetAddress,
      (&bound_address, m_texref));
  return bound_address;
}
// Queries the array handle via cuTexRefGetArray and returns a newly
// heap-allocated `array` wrapper; the caller receives the raw pointer.
// NOTE(review): the `false` constructor argument presumably marks the wrapper
// as not owning the handle -- confirm against the array class.
array *get_array()
{
  CUarray bound_array;
  CUDAPP_CALL_GUARDED(
      cuTexRefGetArray,
      (&bound_array, m_texref));
  return new array(bound_array, false);
}
// Returns the addressing mode for dimension `dim` as reported by
// cuTexRefGetAddressMode.
CUaddress_mode get_address_mode(int dim)
{
  CUaddress_mode mode;
  CUDAPP_CALL_GUARDED(
      cuTexRefGetAddressMode,
      (&mode, m_texref, dim));
  return mode;
}
// Returns the filter mode as reported by cuTexRefGetFilterMode.
CUfilter_mode get_filter_mode()
{
  CUfilter_mode mode;
  CUDAPP_CALL_GUARDED(
      cuTexRefGetFilterMode,
      (&mode, m_texref));
  return mode;
}
Andreas Klöckner
committed
#if CUDAPP_CUDA_VERSION >= 2000
// Returns a Python tuple (format, channel count) obtained from
// cuTexRefGetFormat.
py::tuple get_format()
{
  CUarray_format element_format;
  int channel_count;
  CUDAPP_CALL_GUARDED(
      cuTexRefGetFormat,
      (&element_format, &channel_count, m_texref));
  return py::make_tuple(element_format, channel_count);
}
#endif
// Returns the flag word reported by cuTexRefGetFlags.
unsigned int get_flags()
{
  unsigned int flag_bits;
  CUDAPP_CALL_GUARDED(
      cuTexRefGetFlags,
      (&flag_bits, m_texref));
  return flag_bits;
}
};
Andreas Klöckner
committed
#if CUDAPP_CUDA_VERSION >= 3010
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
class module;
// Thin wrapper around a CUsurfref handle. It also holds shared references to
// the bound array and to the module so that they stay alive while this object
// exists. No destructor is defined, so the handle itself is never destroyed
// here.
class surface_reference : public boost::noncopyable
{
  private:
    CUsurfref m_surfref;

    // life support for array and module
    boost::shared_ptr<array> m_array;
    boost::shared_ptr<module> m_module;

  public:
    // Wraps the given surface reference handle.
    surface_reference(CUsurfref sr)
      : m_surfref(sr)
    {
    }

    // Keeps a shared reference to the module.
    void set_module(boost::shared_ptr<module> mod)
    {
      m_module = mod;
    }

    // Returns the raw driver handle.
    CUsurfref handle() const
    {
      return m_surfref;
    }

    // Binds `ary` via cuSurfRefSetArray and, on success, retains it.
    void set_array(boost::shared_ptr<array> ary, unsigned int flags)
    {
      CUDAPP_CALL_GUARDED(
          cuSurfRefSetArray,
          (m_surfref, ary->handle(), flags));
      m_array = ary;
    }

    // Returns a newly heap-allocated wrapper around the array handle reported
    // by cuSurfRefGetArray; the caller receives the raw pointer.
    array *get_array()
    {
      CUarray bound_array;
      CUDAPP_CALL_GUARDED(
          cuSurfRefGetArray,
          (&bound_array, m_surfref));
      return new array(bound_array, false);
    }
};
#endif
class function;
// Wraps a loaded CUDA module handle and unloads it on destruction.
class module : public boost::noncopyable, public context_dependent
{
  private:
    CUmodule m_module;

  public:
    // Wraps an already-loaded module handle. The handle must be stored here:
    // the destructor, handle() and get_global() all read m_module.
    module(CUmodule mod)
      : m_module(mod)
    { }

    // Unloads the module inside a scoped_context_activation for
    // get_context(); failures are handled by the cleanup macros.
    ~module()
    {
      try
      {
        scoped_context_activation ca(get_context());
        CUDAPP_CALL_GUARDED_CLEANUP(cuModuleUnload, (m_module));
      }
      CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(module);
    }

    // Returns the raw driver handle.
    CUmodule handle() const
    { return m_module; }

    // Defined out of line, after class `function` is complete.
    function get_function(const char *name);

    // Looks up the global symbol `name` and returns a Python tuple
    // (device pointer, size in bytes) from cuModuleGetGlobal.
    py::tuple get_global(const char *name)
    {
      CUdeviceptr devptr;
      pycuda_size_t bytes;
      CUDAPP_CALL_GUARDED(cuModuleGetGlobal, (&devptr, &bytes, m_module, name));
      return py::make_tuple(devptr, bytes);
    }
};
// Loads a module from `filename` via cuModuleLoad and returns a newly
// heap-allocated wrapper; the caller receives the raw pointer.
module *module_from_file(const char *filename)
{
  CUmodule loaded;
  CUDAPP_CALL_GUARDED(
      cuModuleLoad,
      (&loaded, filename));
  return new module(loaded);
}
// Looks up the texture reference `name` in `mod` and returns a new,
// non-managed wrapper (its destructor will not destroy the handle) that keeps
// `mod` alive through a shared reference.
texture_reference *module_get_texref(
    boost::shared_ptr<module> mod, const char *name)
{
  CUtexref texref_handle;
  CUDAPP_CALL_GUARDED(
      cuModuleGetTexRef,
      (&texref_handle, mod->handle(), name));

  // auto_ptr guards the allocation in case set_module throws.
  std::auto_ptr<texture_reference> wrapper(
      new texture_reference(texref_handle, false));
  wrapper->set_module(mod);

  return wrapper.release();
}
Andreas Klöckner
committed
#if CUDAPP_CUDA_VERSION >= 3010
// Looks up the surface reference `name` in `mod` and returns a new wrapper
// that keeps `mod` alive through a shared reference.
inline
surface_reference *module_get_surfref(
    boost::shared_ptr<module> mod, const char *name)
{
  CUsurfref surfref_handle;
  CUDAPP_CALL_GUARDED(
      cuModuleGetSurfRef,
      (&surfref_handle, mod->handle(), name));

  // auto_ptr guards the allocation in case set_module throws.
  std::auto_ptr<surface_reference> wrapper(
      new surface_reference(surfref_handle));
  wrapper->set_module(mod);

  return wrapper.release();
}
#endif
class function
{
private:
CUfunction m_function;
// Wraps a kernel handle together with its symbol name; the name is passed to
// the *_WITH_TRACE_INFO call macros by the other members.
function(CUfunction func, std::string const &sym)
  : m_function(func),
    m_symbol(sym)
{
}
void set_block_shape(int x, int y, int z)
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuFuncSetBlockShape, (m_function, x, y, z), m_symbol);
void set_shared_size(unsigned int bytes)
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuFuncSetSharedSize, (m_function, bytes), m_symbol);
void param_set_size(unsigned int bytes)
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuParamSetSize, (m_function, bytes), m_symbol);
void param_set(int offset, unsigned int value)
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuParamSeti, (m_function, offset, value), m_symbol);
void param_set(int offset, float value)
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuParamSetf, (m_function, offset, value), m_symbol);
void param_setv(int offset, void *buf, size_t len)
// maybe the unsigned int will change, it does not seem right
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuParamSetv, (m_function, offset, buf, (unsigned int) len), m_symbol);
}
// Registers the texture reference `tr` with this function via
// cuParamSetTexRef, using CU_PARAM_TR_DEFAULT as the texture unit.
void param_set_texref(const texture_reference &tr)
{
  CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
      cuParamSetTexRef,
      (m_function, CU_PARAM_TR_DEFAULT, tr.handle()),
      m_symbol);
}
void launch()
CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
void launch_grid(int grid_width, int grid_height)
CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
cuLaunchGrid, (m_function, grid_width, grid_height), m_symbol);
void launch_grid_async(int grid_width, int grid_height, const stream &s)
CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
cuLaunchGridAsync, (m_function, grid_width, grid_height, s.handle()),
Andreas Klöckner
committed
#if CUDAPP_CUDA_VERSION >= 2020
// Returns the integer value of function attribute `attr` as reported by
// cuFuncGetAttribute.
int get_attribute(CUfunction_attribute attr) const
{
  int result;
  CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
      cuFuncGetAttribute, (&result, attr, m_function), m_symbol);
  return result;
}
#endif
Andreas Klöckner
committed
#if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA)
// Forwards the cache configuration `fc` to cuFuncSetCacheConfig.
void set_cache_config(CUfunc_cache fc)
{
  CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
      cuFuncSetCacheConfig,
      (m_function, fc),
      m_symbol);
}
#endif
#if CUDAPP_CUDA_VERSION >= 4000
// Launches the kernel through cuLaunchKernel.
//   grid_dim_py      - tuple of up to 3 grid dimensions; missing axes are 1
//   block_dim_py     - tuple of up to 3 block dimensions; missing axes are 1
//   parameter_buffer - object readable through PyObject_AsReadBuffer; its
//                      bytes are passed as the packed kernel argument buffer
//   shared_mem_bytes - shared memory size forwarded to the launch
//   stream_py        - consumed by PYCUDA_PARSE_STREAM_PY (presumably defines
//                      s_handle -- confirm against the macro definition)
// Throws pycuda::error if a dimension tuple has more than 3 entries, and
// py::error_already_set if the parameter buffer cannot be read.
void launch_kernel(py::tuple grid_dim_py, py::tuple block_dim_py,
    py::object parameter_buffer,
    unsigned shared_mem_bytes, py::object stream_py)
{
  const unsigned axis_count = 3;
  unsigned grid_dim[axis_count];
  unsigned block_dim[axis_count];

  // Unspecified trailing axes default to extent 1.
  for (unsigned i = 0; i < axis_count; ++i)
  {
    grid_dim[i] = 1;
    block_dim[i] = 1;
  }

  pycuda_size_t gd_length = py::len(grid_dim_py);
  if (gd_length > axis_count)
    throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
        "too many grid dimensions in kernel launch");

  for (unsigned i = 0; i < gd_length; ++i)
    grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);

  pycuda_size_t bd_length = py::len(block_dim_py);
  if (bd_length > axis_count)
    throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
        "too many block dimensions in kernel launch");

  for (unsigned i = 0; i < bd_length; ++i)
    block_dim[i] = py::extract<unsigned>(block_dim_py[i]);

  PYCUDA_PARSE_STREAM_PY;

  const void *par_buf;
  PYCUDA_BUFFER_SIZE_T py_par_len;
  if (PyObject_AsReadBuffer(parameter_buffer.ptr(), &par_buf, &py_par_len))
    throw py::error_already_set();
  // The driver reads the size through a size_t pointer, so copy it over.
  size_t par_len = py_par_len;

  // Kernel arguments are passed as one packed buffer through the `extra`
  // launch configuration instead of a per-argument pointer array.
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast<void *>(par_buf),
    CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len,
    CU_LAUNCH_PARAM_END
  };

  CUDAPP_CALL_GUARDED(
      cuLaunchKernel, (m_function,
        grid_dim[0], grid_dim[1], grid_dim[2],
        block_dim[0], block_dim[1], block_dim[2],
        shared_mem_bytes, s_handle, 0, config
        ));
}
#endif
function module::get_function(const char *name)
{
CUfunction func;
CUDAPP_CALL_GUARDED(cuModuleGetFunction, (&func, m_module, name));
CUDAPP_CALL_GUARDED(cuMemGetInfo, (&free, &total));
return py::make_tuple(free, total);
}
// Allocates `bytes` bytes of device memory via cuMemAlloc and returns the
// resulting device pointer.
CUdeviceptr mem_alloc(size_t bytes)
{
  CUdeviceptr allocation;
  CUDAPP_CALL_GUARDED(
      cuMemAlloc,
      (&allocation, bytes));
  return allocation;
}
CUDAPP_CALL_GUARDED_CLEANUP(cuMemFree, (devptr));
// A class the user can override to make device_allocation-
// workalikes.
// Abstract base for objects that can hand out a device pointer. Subclasses
// implement get_pointer(); the conversion operator forwards to it.
class pointer_holder_base
{
  public:
    virtual ~pointer_holder_base()
    {
    }

    // Returns the device pointer held by the concrete subclass.
    virtual CUdeviceptr get_pointer() = 0;

    // Implicit conversion, implemented in terms of get_pointer().
    operator CUdeviceptr()
    {
      return get_pointer();
    }
};
class device_allocation : public boost::noncopyable, public context_dependent
{
private:
CUdeviceptr m_devptr;
public:
device_allocation(CUdeviceptr devptr)
try
{
scoped_context_activation ca(get_context());
mem_free(m_devptr);
}
CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(device_allocation);
release_context();
throw pycuda::error("device_allocation::free", CUDA_ERROR_INVALID_HANDLE);
~device_allocation()
{
free();
{ return m_devptr; }
};
// Allocates pitched device memory via cuMemAllocPitch.
//   da          - receives a new device_allocation wrapping the memory
//   width       - row width passed to the driver
//   height      - number of rows
//   access_size - element size passed to the driver
// Returns the pitch reported by the driver.
inline Py_ssize_t mem_alloc_pitch(
    std::auto_ptr<device_allocation> &da,
    unsigned int width, unsigned int height, unsigned int access_size)
{
  CUdeviceptr devptr;
  pycuda_size_t pitch;
  CUDAPP_CALL_GUARDED(cuMemAllocPitch, (&devptr, &pitch, width, height, access_size));
  da = std::auto_ptr<device_allocation>(new device_allocation(devptr));
  return pitch;
}
// Returns a Python tuple (base address, size) for the allocation containing
// `ptr`, as reported by cuMemGetAddressRange.
py::tuple mem_get_address_range(CUdeviceptr ptr)
{
  CUdeviceptr base;
  pycuda_size_t size;
  CUDAPP_CALL_GUARDED(cuMemGetAddressRange, (&base, &size, ptr));
  return py::make_tuple(base, size);
}
// Copies `len` bytes from device memory `src` into array `ary` at `index`
// via cuMemcpyDtoA.
void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, unsigned int len)
{
  CUDAPP_CALL_GUARDED_THREADED(
      cuMemcpyDtoA,
      (ary.handle(), index, src, len));
}
// Copies `len` bytes from array `ary` at `index` into device memory `dst`
// via cuMemcpyAtoD.
void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, unsigned int len)
{
  CUDAPP_CALL_GUARDED_THREADED(
      cuMemcpyAtoD,
      (dst, ary.handle(), index, len));
}
// Copies `len` bytes between two arrays via cuMemcpyAtoA, reading from `src`
// at `src_index` and writing to `dst` at `dst_index`.
void memcpy_atoa(
    array const &dst, unsigned int dst_index,
    array const &src, unsigned int src_index,
    unsigned int len)
{
  CUDAPP_CALL_GUARDED_THREADED(
      cuMemcpyAtoA,
      (dst.handle(), dst_index, src.handle(), src_index, len));
}
// {{{ ipc_mem_handle
#if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000
// Opens a device memory region exported by another process through a CUDA IPC
// memory handle and closes it again on close() or destruction.
class ipc_mem_handle : public boost::noncopyable, public context_dependent
{
  private:
    bool m_valid;

  protected:
    CUdeviceptr m_devptr;

  public:
    // obj   - Python bytearray holding exactly sizeof(CUipcMemHandle) bytes
    // flags - forwarded to cuIpcOpenMemHandle
    // Throws pycuda::error (CUDA_ERROR_INVALID_VALUE) if `obj` is not a
    // bytearray or has the wrong length.
    ipc_mem_handle(py::object obj, CUipcMem_flags flags=CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS)
      : m_valid(true)
    {
      if (!PyByteArray_Check(obj.ptr()))
        throw pycuda::error("ipc_mem_handle", CUDA_ERROR_INVALID_VALUE,
            "argument is not a bytes array");
      CUipcMemHandle handle;
      if (PyByteArray_GET_SIZE(obj.ptr()) != sizeof(handle))
        throw pycuda::error("ipc_mem_handle", CUDA_ERROR_INVALID_VALUE,
            "handle has the wrong size");
      memcpy(&handle, PyByteArray_AS_STRING(obj.ptr()), sizeof(handle));

      CUDAPP_CALL_GUARDED(cuIpcOpenMemHandle, (&m_devptr, handle, flags));
    }

    // Closes the handle. Throws pycuda::error (CUDA_ERROR_INVALID_HANDLE) if
    // it has already been closed.
    void close()
    {
      if (m_valid)
      {
        try
        {
          scoped_context_activation ca(get_context());
          CUDAPP_CALL_GUARDED_CLEANUP(cuIpcCloseMemHandle, (m_devptr));
          // NOTE(review): this also frees the pointer after closing the IPC
          // mapping; confirm that freeing an imported mapping is intended.
          mem_free(m_devptr);
        }
        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(ipc_mem_handle);

        release_context();
        m_valid = false;
      }
      else
        throw pycuda::error("ipc_mem_handle::close", CUDA_ERROR_INVALID_HANDLE);
    }

    // Closes the handle if close() has not been called yet.
    ~ipc_mem_handle()
    {
      if (m_valid)
        close();
    }

    // Gives access to the mapped device pointer.
    operator CUdeviceptr() const
    { return m_devptr; }
};
// Obtains the IPC memory handle for `devptr` via cuIpcGetMemHandle and returns
// its raw bytes as a Python bytearray.
inline
py::object mem_get_ipc_handle(CUdeviceptr devptr)
{
  CUipcMemHandle ipc_handle;
  CUDAPP_CALL_GUARDED(
      cuIpcGetMemHandle,
      (&ipc_handle, devptr));

  const char *raw_bytes = reinterpret_cast<const char *>(&ipc_handle);
  return py::object(py::handle<>(
        PyByteArray_FromStringAndSize(raw_bytes, sizeof(ipc_handle))));
}
#endif
// }}}
/* MEMCPY_SETTERS expands to six member functions that fill in the source and
 * destination description of a copy-descriptor struct. It is meant to be
 * pasted into a struct that has srcMemoryType/srcHost/srcArray/srcDevice and
 * dstMemoryType/dstHost/dstArray/dstDevice members.
 *
 *   set_src_host / set_dst_host     - take a Python object, obtain its read
 *                                     (src) or write (dst) buffer pointer and
 *                                     store it; the buffer length is fetched
 *                                     but not used. Throw
 *                                     py::error_already_set on failure. Note
 *                                     that the memory type is assigned before
 *                                     the buffer lookup.
 *   set_src_array / set_dst_array   - store the handle of an `array`.
 *   set_src_device / set_dst_device - store a device pointer.
 */
#define MEMCPY_SETTERS \
void set_src_host(py::object buf_py) \
{ \
srcMemoryType = CU_MEMORYTYPE_HOST; \
PYCUDA_BUFFER_SIZE_T len; \
if (PyObject_AsReadBuffer(buf_py.ptr(), &srcHost, &len)) \
throw py::error_already_set(); \
} \
\
void set_src_array(array const &ary) \
{ \
srcMemoryType = CU_MEMORYTYPE_ARRAY; \
srcArray = ary.handle(); \
} \
\
void set_src_device(CUdeviceptr devptr) \
{ \
srcMemoryType = CU_MEMORYTYPE_DEVICE; \
srcDevice = devptr; \
} \
\
void set_dst_host(py::object buf_py) \
{ \
dstMemoryType = CU_MEMORYTYPE_HOST; \
PYCUDA_BUFFER_SIZE_T len; \
if (PyObject_AsWriteBuffer(buf_py.ptr(), &dstHost, &len)) \
throw py::error_already_set(); \
} \
\
void set_dst_array(array const &ary) \
{ \
dstMemoryType = CU_MEMORYTYPE_ARRAY; \
dstArray = ary.handle(); \
} \
\
void set_dst_device(CUdeviceptr devptr) \
{ \
dstMemoryType = CU_MEMORYTYPE_DEVICE; \
dstDevice = devptr; \
}
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
/* MEMCPY_SETTERS_UNIFIED adds set_src_unified / set_dst_unified to a
 * copy-descriptor struct when CUDAPP_CUDA_VERSION >= 4000. Both take a Python
 * object, set the memory type to CU_MEMORYTYPE_UNIFIED and store the object's
 * read (src) or write (dst) buffer pointer in srcHost / dstHost; they throw
 * py::error_already_set if the buffer cannot be obtained. For older CUDA
 * versions the macro expands to nothing, so structs can use it
 * unconditionally.
 */
#if CUDAPP_CUDA_VERSION >= 4000
#define MEMCPY_SETTERS_UNIFIED \
void set_src_unified(py::object buf_py) \
{ \
srcMemoryType = CU_MEMORYTYPE_UNIFIED; \
PYCUDA_BUFFER_SIZE_T len; \
if (PyObject_AsReadBuffer(buf_py.ptr(), &srcHost, &len)) \
throw py::error_already_set(); \
} \
\
void set_dst_unified(py::object buf_py) \
{ \
dstMemoryType = CU_MEMORYTYPE_UNIFIED; \
PYCUDA_BUFFER_SIZE_T len; \
if (PyObject_AsWriteBuffer(buf_py.ptr(), &dstHost, &len)) \
throw py::error_already_set(); \
}
#else
#define MEMCPY_SETTERS_UNIFIED /* empty */
#endif
// CUDA_MEMCPY2D descriptor with convenience setters and execute methods.
// The constructor zeroes the source and destination x/y offsets; every other
// field is left for the caller to fill in.
struct memcpy_2d : public CUDA_MEMCPY2D
{
  memcpy_2d()
  {
    srcXInBytes = 0;
    srcY = 0;
    dstXInBytes = 0;
    dstY = 0;
  }

  MEMCPY_SETTERS;
  MEMCPY_SETTERS_UNIFIED;

  // Runs the copy: cuMemcpy2D when `aligned` is true, otherwise
  // cuMemcpy2DUnaligned. Exactly one of the two is issued.
  void execute(bool aligned=false) const
  {
    if (aligned)
    { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2D, (this)); }
    else
    { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DUnaligned, (this)); }
  }

  // Issues the copy in stream `s` via cuMemcpy2DAsync.
  void execute_async(const stream &s) const
  { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DAsync, (this, s.handle())); }
};
Andreas Klöckner
committed
#if CUDAPP_CUDA_VERSION >= 2000
// CUDA_MEMCPY3D descriptor with convenience setters and execute methods.
// The constructor zeroes the reserved fields, the offsets and the LOD values;
// every other field is left for the caller to fill in.
struct memcpy_3d : public CUDA_MEMCPY3D
{
  memcpy_3d()
  {
    reserved0 = 0;
    reserved1 = 0;

    // source position
    srcXInBytes = 0;
    srcY = 0;
    srcZ = 0;
    srcLOD = 0;

    // destination position
    dstXInBytes = 0;
    dstY = 0;
    dstZ = 0;
    dstLOD = 0;
  }

  MEMCPY_SETTERS;
  MEMCPY_SETTERS_UNIFIED;

  // Runs the copy via cuMemcpy3D.
  void execute() const
  {
    CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3D, (this));
  }

  // Issues the copy in stream `s` via cuMemcpy3DAsync.
  void execute_async(const stream &s) const
  {
    CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DAsync, (this, s.handle()));
  }
};
#endif
#if CUDAPP_CUDA_VERSION >= 4000
struct memcpy_3d_peer : public CUDA_MEMCPY3D_PEER
{
memcpy_3d_peer()
{
srcXInBytes = 0;
srcY = 0;
srcZ = 0;
srcLOD = 0;
dstXInBytes = 0;
dstY = 0;
dstZ = 0;
dstLOD = 0;
}
MEMCPY_SETTERS;
MEMCPY_SETTERS_UNIFIED;
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
void set_src_context(context const &ctx)
{
srcContext = ctx.handle();
}
void set_dst_context(context const &ctx)
{
dstContext = ctx.handle();
}
void execute() const
{
CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeer, (this));
}
void execute_async(const stream &s) const
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeerAsync, (this, s.handle())); }
};
#endif
inline void *mem_host_alloc(size_t size, unsigned flags=0)
{
void *m_data;
Andreas Klöckner
committed
#if CUDAPP_CUDA_VERSION >= 2020
CUDAPP_CALL_GUARDED(cuMemHostAlloc, (&m_data, size, flags));
#else
if (flags != 0)
throw pycuda::error("mem_host_alloc", CUDA_ERROR_INVALID_VALUE,
"nonzero flags in mem_host_alloc not allowed in CUDA 2.1 and older");
CUDAPP_CALL_GUARDED(cuMemAllocHost, (&m_data, size));
return m_data;
}
// Releases host memory via cuMemFreeHost, using the cleanup variant of the
// call macro.
inline void mem_host_free(void *ptr)
{
  CUDAPP_CALL_GUARDED_CLEANUP(
      cuMemFreeHost,
      (ptr));
}
#if CUDAPP_CUDA_VERSION >= 4000
// Registers the existing host range [ptr, ptr + bytes) with the driver via
// cuMemHostRegister and returns `ptr` unchanged, so the call can be used
// inside an initializer list.
inline void *mem_host_register(void *ptr, size_t bytes, unsigned int flags=0)
{
  CUDAPP_CALL_GUARDED(
      cuMemHostRegister,
      (ptr, bytes, flags));
  return ptr;
}
// Unregisters host memory via cuMemHostUnregister, using the cleanup variant
// of the call macro.
inline void mem_host_unregister(void *ptr)
{
  CUDAPP_CALL_GUARDED_CLEANUP(
      cuMemHostUnregister,
      (ptr));
}
#endif
// Allocates `size` bytes with malloc such that the returned pointer is a
// multiple of `alignment`.
//   size             - number of usable bytes requested
//   alignment        - required alignment; must be a non-zero power of two
//   original_pointer - receives the pointer actually returned by malloc; this
//                      is the one that must later be passed to free()
// Returns the aligned pointer inside the malloc'ed region.
// Throws pycuda::error on invalid alignment or allocation failure.
inline void *aligned_malloc(size_t size, size_t alignment, void **original_pointer)
{
  // alignment must be a power of two.
  if ((alignment & (alignment - 1)) != 0)
    throw pycuda::error("aligned_malloc", CUDA_ERROR_INVALID_VALUE,
        "alignment must be a power of two");

  if (alignment == 0)
    throw pycuda::error("aligned_malloc", CUDA_ERROR_INVALID_VALUE,
        "alignment must non-zero");

  // Over-allocate so an aligned address always fits. If the padded size wraps
  // around, the request cannot be satisfied; report it like a failed malloc
  // instead of handing back an undersized buffer.
  const size_t padded_size = size + (alignment - 1);
  if (padded_size < size)
    throw pycuda::error("aligned_malloc", CUDA_ERROR_OUT_OF_MEMORY,
        "aligned malloc failed");

  void *p = malloc(padded_size);
  if (!p)
    throw pycuda::error("aligned_malloc", CUDA_ERROR_OUT_OF_MEMORY,
        "aligned malloc failed");

  *original_pointer = p;

  // Round up to the next multiple of alignment (-alignment is the mask with
  // the low bits cleared, since alignment is a power of two).
  p = (void *)((((ptrdiff_t)(p)) + (alignment-1)) & -alignment);
  return p;
}
// Base for host-memory wrappers: stores a host pointer plus a validity flag
// that the derived classes use to decide whether memory still has to be
// released.
struct host_pointer : public boost::noncopyable, public context_dependent
{
  // NOTE(review): these two members are read and written by the derived
  // allocation structs; they are left at the struct's default (public) access
  // here -- confirm whether `protected` was intended.
  bool m_valid;
  void *m_data;

  // Creates an invalid holder that owns nothing.
  host_pointer()
    : m_valid(false), m_data(0)
  { }

  // Wraps `ptr` and marks the holder as valid.
  host_pointer(void *ptr)
    : m_valid(true), m_data(ptr)
  { }

  virtual ~host_pointer()
  { }

  // Returns the stored host pointer.
  void *data()
  { return m_data; }

#if CUDAPP_CUDA_VERSION >= 2020
  // Returns the device pointer for this host memory as reported by
  // cuMemHostGetDevicePointer (flags argument 0).
  CUdeviceptr get_device_pointer()
  {
    CUdeviceptr result;
    CUDAPP_CALL_GUARDED(cuMemHostGetDevicePointer, (&result, m_data, 0));
    return result;
  }
#endif
};
struct pagelocked_host_allocation : public host_pointer
{
public:
pagelocked_host_allocation(size_t bytesize, unsigned flags=0)
: host_pointer(mem_host_alloc(bytesize, flags))
Andreas Klöckner
committed
/* Don't try to be clever and coalesce these in the base class.
* Won't work: Destructors may not call virtual functions.
*/
~pagelocked_host_allocation()
{
if (m_valid)
free();
}
try
{
scoped_context_activation ca(get_context());
CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(pagelocked_host_allocation);
release_context();
m_valid = false;
throw pycuda::error("pagelocked_host_allocation::free", CUDA_ERROR_INVALID_HANDLE);
Andreas Klöckner
committed
#if CUDAPP_CUDA_VERSION >= 3020
unsigned int get_flags()
{
unsigned int flags;
CUDAPP_CALL_GUARDED(cuMemHostGetFlags, (&flags, m_data));
return flags;
}
#endif
};
// Host memory obtained from aligned_malloc. The aligned pointer is exposed
// through the base class, while the pointer originally returned by malloc is
// remembered so it can be passed to ::free.
struct aligned_host_allocation : public host_pointer
{
    void *m_original_pointer;

  public:
    // Allocates `size` bytes aligned to `alignment`; aligned_malloc writes the
    // unaligned malloc result into m_original_pointer.
    aligned_host_allocation(size_t size, size_t alignment)
      : host_pointer(aligned_malloc(size, alignment, &m_original_pointer))
    { }

    /* Don't try to be clever and coalesce these in the base class.
     * Won't work: Destructors may not call virtual functions.
     */
    ~aligned_host_allocation()
    {
      if (m_valid)
        free();
    }

    // Releases the memory. Throws pycuda::error (CUDA_ERROR_INVALID_HANDLE)
    // if it has already been freed.
    void free()
    {
      if (m_valid)
      {
        // Free the original malloc result, not the aligned pointer.
        ::free(m_original_pointer);
        m_valid = false;
      }
      else
        throw pycuda::error("aligned_host_allocation::free", CUDA_ERROR_INVALID_HANDLE);
    }
};
#if CUDAPP_CUDA_VERSION >= 4000
// Existing host memory registered with the driver via mem_host_register and
// unregistered again on free() or destruction. Also holds a Python object
// (`base`), by default an empty py::object.
struct registered_host_memory : public host_pointer
{
  private:
    py::object m_base;

  public:
    // Registers the range [p, p + bytes) with the given flags and stores
    // `base` for the lifetime of this object.
    registered_host_memory(void *p, size_t bytes, unsigned int flags=0,
        py::object base=py::object())
      : host_pointer(mem_host_register(p, bytes, flags)), m_base(base)
    {
    }

    /* Don't try to be clever and coalesce these in the base class.
     * Won't work: Destructors may not call virtual functions.
     */
    ~registered_host_memory()
    {
      if (m_valid)
        free();
    }

    // Unregisters the memory. Throws pycuda::error
    // (CUDA_ERROR_INVALID_HANDLE) if it has already been freed.
    void free()
    {
      if (m_valid)
      {
        try
        {
          scoped_context_activation ca(get_context());
          mem_host_unregister(m_data);
        }
        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(registered_host_memory);

        release_context();
        m_valid = false;
      }
      else
        throw pycuda::error("registered_host_memory::free", CUDA_ERROR_INVALID_HANDLE);
    }

    // Returns the Python object passed as `base` at construction.
    py::object base() const
    {
      return m_base;
    }
};
#endif
class event : public boost::noncopyable, public context_dependent
{
private:
CUevent m_event;
public:
event(unsigned int flags=0)
{ CUDAPP_CALL_GUARDED(cuEventCreate, (&m_event, flags)); }
try
{
scoped_context_activation ca(get_context());
CUDAPP_CALL_GUARDED_CLEANUP(cuEventDestroy, (m_event));
}
CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(event);
event *record(py::object stream_py)
PYCUDA_PARSE_STREAM_PY;
CUDAPP_CALL_GUARDED(cuEventRecord, (m_event, s_handle));
CUevent handle() const
{ return m_event; }
{
CUDAPP_CALL_GUARDED_THREADED(cuEventSynchronize, (m_event));
bool query() const
CUDAPP_PRINT_CALL_TRACE("cuEventQuery");
CUresult result = cuEventQuery(m_event);
switch (result)
{