class module;

// {{{ texture reference

class texture_reference : public boost::noncopyable
{
  private:
    CUtexref m_texref;
    bool m_managed;

    // life support for array and module
    boost::shared_ptr<array> m_array;
    boost::shared_ptr<module> m_module;

  public:
    texture_reference()
      : m_managed(true)
    { CUDAPP_CALL_GUARDED(cuTexRefCreate, (&m_texref)); }

    texture_reference(CUtexref tr, bool managed)
      : m_texref(tr), m_managed(managed)
    { }

    ~texture_reference()
    {
      if (m_managed)
      {
        CUDAPP_CALL_GUARDED_CLEANUP(cuTexRefDestroy, (m_texref));
      }
    }

    void set_module(boost::shared_ptr<module> mod)
    { m_module = mod; }

    CUtexref handle() const
    { return m_texref; }

    void set_array(boost::shared_ptr<array> ary)
    {
      CUDAPP_CALL_GUARDED(cuTexRefSetArray, (m_texref,
          ary->handle(), CU_TRSA_OVERRIDE_FORMAT));
      m_array = ary;
    }
    void set_format(CUarray_format fmt, int num_packed_components)
    { CUDAPP_CALL_GUARDED(cuTexRefSetFormat, (m_texref, fmt, num_packed_components)); }
void set_address_mode(int dim, CUaddress_mode am)
{ CUDAPP_CALL_GUARDED(cuTexRefSetAddressMode, (m_texref, dim, am)); }
void set_filter_mode(CUfilter_mode fm)
{ CUDAPP_CALL_GUARDED(cuTexRefSetFilterMode, (m_texref, fm)); }
void set_flags(unsigned int flags)
{ CUDAPP_CALL_GUARDED(cuTexRefSetFlags, (m_texref, flags)); }
CUdeviceptr get_address()
{
CUdeviceptr result;
CUDAPP_CALL_GUARDED(cuTexRefGetAddress, (&result, m_texref));
return result;
}
array *get_array()
{
CUarray result;
CUDAPP_CALL_GUARDED(cuTexRefGetArray, (&result, m_texref));
return new array(result, false);
}
CUaddress_mode get_address_mode(int dim)
{
CUaddress_mode result;
CUDAPP_CALL_GUARDED(cuTexRefGetAddressMode, (&result, m_texref, dim));
return result;
}
CUfilter_mode get_filter_mode()
{
CUfilter_mode result;
CUDAPP_CALL_GUARDED(cuTexRefGetFilterMode, (&result, m_texref));
return result;
}
#if CUDAPP_CUDA_VERSION >= 2000
py::tuple get_format()
{
CUarray_format fmt;
int num_channels;
CUDAPP_CALL_GUARDED(cuTexRefGetFormat, (&fmt, &num_channels, m_texref));
return py::make_tuple(fmt, num_channels);
}
#endif
unsigned int get_flags()
{
unsigned int result;
CUDAPP_CALL_GUARDED(cuTexRefGetFlags, (&result, m_texref));
return result;
}
};
#if CUDAPP_CUDA_VERSION >= 3010
class module;
class surface_reference : public boost::noncopyable
{
private:
CUsurfref m_surfref;
// life support for array and module
boost::shared_ptr<array> m_array;
boost::shared_ptr<module> m_module;
public:
surface_reference(CUsurfref sr)
: m_surfref(sr)
{ }
void set_module(boost::shared_ptr<module> mod)
{ m_module = mod; }
CUsurfref handle() const
{ return m_surfref; }
void set_array(boost::shared_ptr<array> ary, unsigned int flags)
{
CUDAPP_CALL_GUARDED(cuSurfRefSetArray, (m_surfref, ary->handle(), flags));
m_array = ary;
}
array *get_array()
{
CUarray result;
CUDAPP_CALL_GUARDED(cuSurfRefGetArray, (&result, m_surfref));
return new array(result, false);
}
};
#endif
class function;
class module : public boost::noncopyable, public context_dependent
{
private:
CUmodule m_module;
public:
      module(CUmodule mod)
        : m_module(mod)
      { }
~module()
{
try
{
scoped_context_activation ca(get_context());
CUDAPP_CALL_GUARDED_CLEANUP(cuModuleUnload, (m_module));
}
        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(module);
      }
CUmodule handle() const
{ return m_module; }
function get_function(const char *name);
py::tuple get_global(const char *name)
{
CUdeviceptr devptr;
        size_t bytes;
        CUDAPP_CALL_GUARDED(cuModuleGetGlobal, (&devptr, &bytes, m_module, name));
return py::make_tuple(devptr, bytes);
}
};
inline
module *module_from_file(const char *filename)
{
CUmodule mod;
CUDAPP_CALL_GUARDED(cuModuleLoad, (&mod, filename));
return new module(mod);
}
inline
texture_reference *module_get_texref(
    boost::shared_ptr<module> mod, const char *name)
{
CUtexref tr;
CUDAPP_CALL_GUARDED(cuModuleGetTexRef, (&tr, mod->handle(), name));
std::auto_ptr<texture_reference> result(
new texture_reference(tr, false));
result->set_module(mod);
return result.release();
}
#if CUDAPP_CUDA_VERSION >= 3010
inline
surface_reference *module_get_surfref(
boost::shared_ptr<module> mod, const char *name)
{
CUsurfref sr;
CUDAPP_CALL_GUARDED(cuModuleGetSurfRef, (&sr, mod->handle(), name));
std::auto_ptr<surface_reference> result(
new surface_reference(sr));
result->set_module(mod);
return result.release();
}
#endif
class function
{
  private:
    CUfunction m_function;
    std::string m_symbol;

  public:
    function(CUfunction func, std::string const &sym)
      : m_function(func), m_symbol(sym)
    { }
    void set_block_shape(int x, int y, int z)
    {
      CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
          cuFuncSetBlockShape, (m_function, x, y, z), m_symbol);
    }

    void set_shared_size(unsigned int bytes)
    {
      CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
          cuFuncSetSharedSize, (m_function, bytes), m_symbol);
    }

    void param_set_size(unsigned int bytes)
    {
      CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
          cuParamSetSize, (m_function, bytes), m_symbol);
    }

    void param_set(int offset, unsigned int value)
    {
      CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
          cuParamSeti, (m_function, offset, value), m_symbol);
    }

    void param_set(int offset, float value)
    {
      CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
          cuParamSetf, (m_function, offset, value), m_symbol);
    }

    void param_setv(int offset, void *buf, size_t len)
    {
      // the cast to unsigned int may need revisiting; it does not seem right
      CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
          cuParamSetv, (m_function, offset, buf, (unsigned int) len), m_symbol);
}
void param_set_texref(const texture_reference &tr)
{
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(cuParamSetTexRef, (m_function,
CU_PARAM_TR_DEFAULT, tr.handle()), m_symbol);
}
    void launch()
    {
      CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
          cuLaunch, (m_function), m_symbol);
    }

    void launch_grid(int grid_width, int grid_height)
    {
      CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
          cuLaunchGrid, (m_function, grid_width, grid_height), m_symbol);
    }

    void launch_grid_async(int grid_width, int grid_height, const stream &s)
    {
      CUDAPP_CALL_GUARDED_THREADED_WITH_TRACE_INFO(
          cuLaunchGridAsync, (m_function, grid_width, grid_height, s.handle()),
          m_symbol);
    }
#if CUDAPP_CUDA_VERSION >= 2020
    int get_attribute(CUfunction_attribute attr) const
    {
      int result;
      CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
          cuFuncGetAttribute, (&result, attr, m_function), m_symbol);
      return result;
    }
#endif
#if CUDAPP_CUDA_VERSION >= 3000 && defined(CUDAPP_POST_30_BETA)
void set_cache_config(CUfunc_cache fc)
{
CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
cuFuncSetCacheConfig, (m_function, fc), m_symbol);
}
#endif
};

inline
function module::get_function(const char *name)
{
  CUfunction func;
  CUDAPP_CALL_GUARDED(cuModuleGetFunction, (&func, m_module, name));
  return function(func, name);
}

inline
py::tuple mem_get_info()
{
  size_t free, total;
  CUDAPP_CALL_GUARDED(cuMemGetInfo, (&free, &total));
  return py::make_tuple(free, total);
}
CUdeviceptr mem_alloc(size_t bytes)
{
CUdeviceptr devptr;
CUDAPP_CALL_GUARDED(cuMemAlloc, (&devptr, bytes));
return devptr;
}
inline
void mem_free(CUdeviceptr devptr)
{
  CUDAPP_CALL_GUARDED_CLEANUP(cuMemFree, (devptr));
}
// A class the user can override to make device_allocation workalikes.
class pointer_holder_base
{
public:
virtual ~pointer_holder_base() { }
virtual CUdeviceptr get_pointer() = 0;
operator CUdeviceptr()
{ return get_pointer(); }
};
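// Illustrative sketch (not part of the original source): a minimal
// pointer_holder_base subclass that wraps a raw, externally owned device
// pointer. The name `borrowed_pointer` is hypothetical.
//
//   class borrowed_pointer : public pointer_holder_base
//   {
//     private:
//       CUdeviceptr m_devptr;
//
//     public:
//       borrowed_pointer(CUdeviceptr devptr)
//         : m_devptr(devptr)
//       { }
//
//       virtual CUdeviceptr get_pointer()
//       { return m_devptr; }    // no ownership: nothing to free
//   };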
class device_allocation : public boost::noncopyable, public context_dependent
{
  private:
    bool m_valid;
    CUdeviceptr m_devptr;

  public:
    device_allocation(CUdeviceptr devptr)
      : m_valid(true), m_devptr(devptr)
    { }

    void free()
    {
      if (m_valid)
      {
        try
        {
          scoped_context_activation ca(get_context());
          mem_free(m_devptr);
        }
        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(device_allocation);
        release_context();
        m_valid = false;
      }
      else
        throw pycuda::error("device_allocation::free", CUDA_ERROR_INVALID_HANDLE);
    }

    ~device_allocation()
    { if (m_valid) free(); }

    operator CUdeviceptr() const
    { return m_devptr; }
};
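// Usage sketch (illustrative only): handing a fresh allocation to a
// device_allocation so it is released automatically when the wrapper dies.
//
//   device_allocation da(mem_alloc(1024));
//   // ... use CUdeviceptr(da) in memory operations ...
//   // da.free() may be called explicitly; otherwise the destructor frees it.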
inline Py_ssize_t mem_alloc_pitch(
std::auto_ptr<device_allocation> &da,
unsigned int width, unsigned int height, unsigned int access_size)
{
  CUdeviceptr devptr;
  size_t pitch;
  CUDAPP_CALL_GUARDED(cuMemAllocPitch, (&devptr, &pitch, width, height, access_size));
  da = std::auto_ptr<device_allocation>(new device_allocation(devptr));
  return pitch;
}

inline
py::tuple mem_get_address_range(CUdeviceptr ptr)
{
  CUdeviceptr base;
  size_t size;
  CUDAPP_CALL_GUARDED(cuMemGetAddressRange, (&base, &size, ptr));
return py::make_tuple(base, size);
}
void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, unsigned int len)
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoA, (ary.handle(), index, src, len)); }
void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, unsigned int len)
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoD, (dst, ary.handle(), index, len)); }
void memcpy_atoa(
array const &dst, unsigned int dst_index,
array const &src, unsigned int src_index,
unsigned int len)
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoA, (dst.handle(), dst_index, src.handle(), src_index, len)); }
#if PY_VERSION_HEX >= 0x02050000
typedef Py_ssize_t PYCUDA_BUFFER_SIZE_T;
#else
typedef int PYCUDA_BUFFER_SIZE_T;
#endif
#define MEMCPY_SETTERS \
void set_src_host(py::object buf_py) \
{ \
srcMemoryType = CU_MEMORYTYPE_HOST; \
PYCUDA_BUFFER_SIZE_T len; \
if (PyObject_AsReadBuffer(buf_py.ptr(), &srcHost, &len)) \
throw py::error_already_set(); \
} \
\
void set_src_array(array const &ary) \
{ \
srcMemoryType = CU_MEMORYTYPE_ARRAY; \
srcArray = ary.handle(); \
} \
\
void set_src_device(CUdeviceptr devptr) \
{ \
srcMemoryType = CU_MEMORYTYPE_DEVICE; \
srcDevice = devptr; \
} \
\
void set_dst_host(py::object buf_py) \
{ \
dstMemoryType = CU_MEMORYTYPE_HOST; \
PYCUDA_BUFFER_SIZE_T len; \
if (PyObject_AsWriteBuffer(buf_py.ptr(), &dstHost, &len)) \
throw py::error_already_set(); \
} \
\
void set_dst_array(array const &ary) \
{ \
dstMemoryType = CU_MEMORYTYPE_ARRAY; \
dstArray = ary.handle(); \
} \
\
void set_dst_device(CUdeviceptr devptr) \
{ \
dstMemoryType = CU_MEMORYTYPE_DEVICE; \
dstDevice = devptr; \
}
struct memcpy_2d : public CUDA_MEMCPY2D
{
memcpy_2d()
{
srcXInBytes = 0;
srcY = 0;
dstXInBytes = 0;
dstY = 0;
}
MEMCPY_SETTERS;
void execute(bool aligned=false) const
{
      if (aligned)
      { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2D, (this)); }
      else
      { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DUnaligned, (this)); }
}
void execute_async(const stream &s) const
    { CUDAPP_CALL_GUARDED_THREADED(cuMemcpy2DAsync, (this, s.handle())); }
};
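// Usage sketch (illustrative only): copying a `w` x `h` byte region from a
// device allocation into a host buffer. `devptr`, `host_buf_py`, `w`, and `h`
// are hypothetical.
//
//   memcpy_2d cpy;
//   cpy.set_src_device(devptr);
//   cpy.srcPitch = w;             // bytes per source row
//   cpy.set_dst_host(host_buf_py);
//   cpy.dstPitch = w;             // bytes per destination row
//   cpy.WidthInBytes = w;
//   cpy.Height = h;
//   cpy.execute();                // default aligned=false: cuMemcpy2DUnaligned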
#if CUDAPP_CUDA_VERSION >= 2000
struct memcpy_3d : public CUDA_MEMCPY3D
{
memcpy_3d()
{
reserved0 = 0;
reserved1 = 0;
srcXInBytes = 0;
srcY = 0;
srcZ = 0;
srcLOD = 0;
dstXInBytes = 0;
dstY = 0;
dstZ = 0;
dstLOD = 0;
}
MEMCPY_SETTERS;
void execute() const
{
CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3D, (this));
}
void execute_async(const stream &s) const
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DAsync, (this, s.handle())); }
};
#endif
#if CUDAPP_CUDA_VERSION >= 4000
struct memcpy_3d_peer : public CUDA_MEMCPY3D_PEER
{
memcpy_3d_peer()
{
srcXInBytes = 0;
srcY = 0;
srcZ = 0;
srcLOD = 0;
dstXInBytes = 0;
dstY = 0;
dstZ = 0;
dstLOD = 0;
}
MEMCPY_SETTERS;
void set_src_context(context const &ctx)
{
srcContext = ctx.handle();
}
void set_dst_context(context const &ctx)
{
dstContext = ctx.handle();
}
void execute() const
{
CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeer, (this));
}
void execute_async(const stream &s) const
{ CUDAPP_CALL_GUARDED_THREADED(cuMemcpy3DPeerAsync, (this, s.handle())); }
};
#endif
inline void *mem_alloc_host(size_t size, unsigned flags=0)
{
void *m_data;
#if CUDAPP_CUDA_VERSION >= 2020
CUDAPP_CALL_GUARDED(cuMemHostAlloc, (&m_data, size, flags));
#else
if (flags != 0)
throw pycuda::error("mem_alloc_host", CUDA_ERROR_INVALID_VALUE,
"nonzero flags in mem_alloc_host not allowed in CUDA 2.1 and older");
  CUDAPP_CALL_GUARDED(cuMemAllocHost, (&m_data, size));
#endif
return m_data;
}
inline void mem_free_host(void *ptr)
{
CUDAPP_CALL_GUARDED_CLEANUP(cuMemFreeHost, (ptr));
}
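// Usage sketch (illustrative only): allocating page-locked host memory as a
// staging area and releasing it again.
//
//   void *buf = mem_alloc_host(1024);
//   // ... fill buf, use it for fast host<->device transfers ...
//   mem_free_host(buf);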
struct host_allocation : public boost::noncopyable, public context_dependent
{
  private:
    bool m_valid;
    void *m_data;

  public:
    host_allocation(size_t bytesize, unsigned flags=0)
      : m_valid(true), m_data(mem_alloc_host(bytesize, flags))
    { }

    ~host_allocation()
    { if (m_valid) free(); }

    void free()
    {
      if (m_valid)
      {
        try
        {
          scoped_context_activation ca(get_context());
          mem_free_host(m_data);
        }
        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(host_allocation);
        release_context();
        m_valid = false;
      }
      else
        throw pycuda::error("host_allocation::free", CUDA_ERROR_INVALID_HANDLE);
    }
void *data()
{ return m_data; }
#if CUDAPP_CUDA_VERSION >= 2020
CUdeviceptr get_device_pointer()
{
CUdeviceptr result;
CUDAPP_CALL_GUARDED(cuMemHostGetDevicePointer, (&result, m_data, 0));
return result;
}
#endif
#if CUDAPP_CUDA_VERSION >= 3020
unsigned int get_flags()
{
unsigned int flags;
CUDAPP_CALL_GUARDED(cuMemHostGetFlags, (&flags, m_data));
return flags;
}
#endif
};
class event : public boost::noncopyable, public context_dependent
{
private:
CUevent m_event;
public:
event(unsigned int flags=0)
{ CUDAPP_CALL_GUARDED(cuEventCreate, (&m_event, flags)); }
    ~event()
    {
      try
      {
        scoped_context_activation ca(get_context());
        CUDAPP_CALL_GUARDED_CLEANUP(cuEventDestroy, (m_event));
      }
      CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(event);
    }
    event *record(py::object stream_py)
    {
      CUstream s_handle;
if (stream_py.ptr() != Py_None)
{
const stream &s = py::extract<const stream &>(stream_py);
s_handle = s.handle();
}
else
s_handle = 0;
      CUDAPP_CALL_GUARDED(cuEventRecord, (m_event, s_handle));
      return this;
    }
CUevent handle() const
{ return m_event; }
    event *synchronize()
    {
      CUDAPP_CALL_GUARDED_THREADED(cuEventSynchronize, (m_event));
      return this;
    }
    bool query() const
    {
      CUDAPP_PRINT_CALL_TRACE("cuEventQuery");

      CUresult result = cuEventQuery(m_event);
      switch (result)
      {
        case CUDA_SUCCESS:
          return true;
        case CUDA_ERROR_NOT_READY:
          return false;
default:
CUDAPP_PRINT_ERROR_TRACE("cuEventQuery", result);
throw error("cuEventQuery", result);
}
}
float time_since(event const &start)
{
float result;
CUDAPP_CALL_GUARDED(cuEventElapsedTime, (&result, start.m_event, m_event));
return result;
}
float time_till(event const &end)
{
float result;
CUDAPP_CALL_GUARDED(cuEventElapsedTime, (&result, m_event, end.m_event));
return result;
}
};
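// Usage sketch (illustrative only): timing a stretch of GPU work with a pair
// of events on the default stream. `do_gpu_work` is hypothetical; a
// default-constructed py::object wraps Py_None.
//
//   event start, stop;
//   start.record(py::object());
//   do_gpu_work();
//   stop.record(py::object());
//   stop.synchronize();
//   float ms = stop.time_since(start);   // elapsed time in milliseconds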
#if CUDAPP_CUDA_VERSION >= 3020
inline void stream::wait_for_event(const event &evt)
{
CUDAPP_CALL_GUARDED(cuStreamWaitEvent, (m_stream, evt.handle(), 0));
}
#endif
#if CUDAPP_CUDA_VERSION >= 4000
inline void initialize_profiler(
    const char *config_file,
    const char *output_file,
    CUoutput_mode output_mode)
{
CUDAPP_CALL_GUARDED(cuProfilerInitialize, (config_file, output_file, output_mode));
}
inline void start_profiler()
{
CUDAPP_CALL_GUARDED(cuProfilerStart, ());
}
inline void stop_profiler()
{
  CUDAPP_CALL_GUARDED(cuProfilerStop, ());
}
#endif
// }}}
}
#endif