diff --git a/doc/cudagraph.rst b/doc/cudagraph.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f42f5134fcaafae0109aa85360e232577bbeca8d
--- /dev/null
+++ b/doc/cudagraph.rst
@@ -0,0 +1,6 @@
+CUDAGraph
+=========
+.. module:: pycuda.driver
+
+The :class:`Graph` Class
+------------------------
\ No newline at end of file
diff --git a/pycuda/cumath.py b/pycuda/cumath.py
index ab2b32c86079b05af501e0eebeabacd7e181d2e7..9926db682279918beea50909ba3910c21d2f8919 100644
--- a/pycuda/cumath.py
+++ b/pycuda/cumath.py
@@ -25,6 +25,11 @@ def _make_unary_array_func(name):
         if "stream" in kwargs:
             stream = kwargs["stream"]
 
+        if isinstance(array, (int, float, complex)):
+            # Python scalars: evaluate on the host via the matching numpy
+            # function (asin -> arcsin etc.)
+            func = getattr(np, numpy_func_names.get(name, name))
+            return func(array)
+
         if array.dtype == np.float32:
             func_name = name + "f"
         else:
@@ -57,6 +62,12 @@ def _make_unary_array_func(name):
     return f
 
 
+numpy_func_names = {
+    "asin": "arcsin",
+    "acos": "arccos",
+    "atan": "arctan",
+}
+
 fabs = _make_unary_array_func("fabs")
 ceil = _make_unary_array_func("ceil")
 floor = _make_unary_array_func("floor")
@@ -70,7 +81,7 @@ cos = _make_unary_array_func("cos")
 tan = _make_unary_array_func("tan")
 asin = _make_unary_array_func("asin")
 acos = _make_unary_array_func("acos")
 atan = _make_unary_array_func("atan")
 sinh = _make_unary_array_func("sinh")
 cosh = _make_unary_array_func("cosh")
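The scalar fallback above can be exercised without a device round trip. A minimal sketch, assuming the patched pycuda.cumath is installed; everything named here comes from the patch or numpy:

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.cumath as cumath
    import pycuda.gpuarray as gpuarray

    # Python scalars now evaluate on the host via numpy (asin -> arcsin)
    assert cumath.asin(0.5) == np.arcsin(0.5)

    # GPUArray arguments still take the usual device kernel path
    x = gpuarray.to_gpu(np.linspace(0, 1, 16).astype(np.float32))
    np.testing.assert_allclose(cumath.asin(x).get(), np.arcsin(x.get()), rtol=1e-5)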
diff --git a/pycuda/driver.py b/pycuda/driver.py
index 47d15b196bab8c3d9ca996d1ef43fed2fa9dd9c0..f257fe7ada88ece1aff8142ffb2082466ca1daf6 100644
--- a/pycuda/driver.py
+++ b/pycuda/driver.py
@@ -229,7 +229,6 @@ def _add_functionality():
             format += "P"
 
         from pycuda._pvt_struct import pack
-
         return handlers, pack(format, *arg_data)
 
     # {{{ pre-CUDA 4 call interface (stateful)
@@ -709,6 +708,79 @@ def _add_functionality():
 _add_functionality()
 
 
+def _add_cudagraph_kernel():
+    def _build_arg_buf(args):
+        handlers = []
+
+        arg_data = []
+        format = ""
+        for i, arg in enumerate(args):
+            if isinstance(arg, np.number):
+                arg_data.append(arg)
+                format += arg.dtype.char
+            elif isinstance(arg, int):
+                # plain Python ints (e.g. raw device pointers returned by
+                # Graph.add_memalloc_node) are packed pointer-sized
+                arg_data.append(arg)
+                format += "P"
+            elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
+                arg_data.append(int(arg))
+                format += "P"
+            elif isinstance(arg, ArgumentHandler):
+                handlers.append(arg)
+                arg_data.append(int(arg.get_device_alloc()))
+                format += "P"
+            elif isinstance(arg, np.ndarray):
+                if isinstance(arg.base, ManagedAllocationOrStub):
+                    arg_data.append(int(arg.base))
+                    format += "P"
+                else:
+                    arg_data.append(arg)
+                    format += "%ds" % arg.nbytes
+            elif isinstance(arg, np.void):
+                arg_data.append(_my_bytes(_memoryview(arg)))
+                format += "%ds" % arg.itemsize
+            else:
+                cai = getattr(arg, "__cuda_array_interface__", None)
+                if cai:
+                    arg_data.append(cai["data"][0])
+                    format += "P"
+                    continue
+
+                try:
+                    gpudata = np.uintp(arg.gpudata)
+                except AttributeError:
+                    raise TypeError("invalid type on parameter #%d (0-based)" % i)
+                else:
+                    # for gpuarrays
+                    arg_data.append(int(gpudata))
+                    format += "P"
+
+        from pycuda._pvt_struct import pack
+        return handlers, pack(format, *arg_data)
+
+    def cudagraph_kernel_call(graph, *args, **kwargs):
+        func = kwargs.pop("func", None)
+        grid = kwargs.pop("grid", None)
+        block = kwargs.pop("block", None)
+        dependencies = kwargs.pop("dependencies", [])
+        shared = kwargs.pop("shared", 0)
+        if kwargs:
+            raise ValueError(
+                "extra keyword arguments: %s" % (",".join(kwargs.keys()))
+            )
+
+        if block is None:
+            raise ValueError("must specify block size")
+
+        handlers, arg_buf = _build_arg_buf(args)
+
+        kernel_node = graph._add_kernel_node(func, grid, block, dependencies,
+                                             arg_buf, shared)
+        return kernel_node
+
+    Graph.add_kernel_node = cudagraph_kernel_call
+
+
+_add_cudagraph_kernel()
+
 
 # {{{ pagelocked numpy arrays
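The wrapper above gives Graph a kernel-launch-style entry point: positional arguments are packed into one parameter buffer, exactly as for a regular launch, and handed to the C++ _add_kernel_node binding. A minimal usage sketch, assuming the patch is built; the kernel and sizes are illustrative only:

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    mod = SourceModule("__global__ void twice(float *a) { a[threadIdx.x] *= 2.f; }")

    a = np.ones(16, dtype=np.float32)
    a_gpu = drv.mem_alloc(a.nbytes)
    drv.memcpy_htod(a_gpu, a)

    g = drv.Graph()
    g.add_kernel_node(a_gpu, func=mod.get_function("twice"),
                      grid=(1,), block=(16, 1, 1), dependencies=[])
    drv.GraphExec(g).launch()
    drv.Context.synchronize()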
+ """ + size = self.size + if shape is None: + shape = self.shape + else: + size = None + + if dtype is None: + dtype = self.dtype + if strides is None: + strides = self.strides + if allocator is None: + allocator = self.allocator + if gpudata is None: + gpudata = 0 + + return self.__class__(shape, dtype, allocator=allocator, + base=base, gpudata=gpudata, strides=strides) def set(self, ary, async_=False, stream=None, **kwargs): # {{{ handle 'async' deprecation @@ -417,10 +453,12 @@ class GPUArray: def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" + if not self.flags.forc or not other.flags.forc: raise RuntimeError( "only contiguous arrays may " "be used as arguments to this operation" ) + assert ((self.shape == other.shape == out.shape) or ((self.shape == ()) and other.shape == out.shape) or ((other.shape == ()) and self.shape == out.shape)) @@ -585,7 +623,10 @@ class GPUArray: if isinstance(other, GPUArray): # add another vector result = _get_broadcasted_binary_op_result(self, other) - return self._axpbyz(1, other, 1, result) + if self.shape == (): + return other._axpbyz(1, self, 1, result) + else: + return self._axpbyz(1, other, 1, result) elif np.isscalar(other): # add a scalar @@ -597,7 +638,7 @@ class GPUArray: else: return NotImplemented __radd__ = __add__ - + def __sub__(self, other): """Substract an array from an array or a scalar from an array.""" @@ -959,7 +1000,7 @@ class GPUArray: same_contiguity = (order == "C" and self.flags.c_contiguous) or ( order == "F" and self.flags.f_contiguous - ) + ) if shape == self.shape and same_contiguity: return self @@ -977,6 +1018,16 @@ class GPUArray: if size != self.size: raise ValueError("total size of new array must be unchanged") + if self.size == 0: + return self._new_with_changes( + shape=shape, base=None, + gpudata=None, + strides=( + _f_contiguous_strides(self.dtype.itemsize, shape) + if order == "F" else + _c_contiguous_strides(self.dtype.itemsize, shape) + )) + return GPUArray( shape=shape, dtype=self.dtype, @@ -1271,6 +1322,7 @@ class GPUArray: # }}} + # {{{ rich comparisons __eq__ = _make_binary_op("==") @@ -1279,6 +1331,12 @@ class GPUArray: __ge__ = _make_binary_op(">=") __lt__ = _make_binary_op("<") __gt__ = _make_binary_op(">") + __and__ = _make_binary_op("&") + __rand__ = __and__ + __or__ = _make_binary_op("|") + __ror__ = __or__ + __xor__ = _make_binary_op("^") + __rxor__ = __xor__ # }}} @@ -2038,7 +2096,7 @@ def _make_binary_minmax_func(which): func.prepared_async_call( a._grid, a._block, stream, a.gpudata, b, out.gpudata, a.size ) - else: # assuming b is a GPUArray + elif isinstance(b, GPUArray): # assuming b is a GPUArray if out is None: out = empty_like(b) func = elementwise.get_binary_minmax_kernel( @@ -2048,6 +2106,11 @@ def _make_binary_minmax_func(which): func.prepared_async_call( b._grid, b._block, stream, b.gpudata, a, out.gpudata, b.size ) + else: # a and b are scalars + if which == 'max': + out = np.fmax(a, b) + else: + out = np.fmin(a, b) return out return f diff --git a/pycuda/reduction.py b/pycuda/reduction.py index deb254aaad6b52b5d16bcdf54ebd7d41ce24f505..a1eea00224e35bdbcad78ed6ac6f961cd3c4a8a9 100644 --- a/pycuda/reduction.py +++ b/pycuda/reduction.py @@ -347,6 +347,16 @@ class ReductionKernel: args = (result,) + stage1_args +@context_dependent_memoize +def get_all_kernel(dtype_in): + from pycuda.tools import VectorArg + return ReductionKernel( + np.int8, + "true", + "a && b", + map_expr="(bool) 
(in[i])", + arguments=[VectorArg(dtype_in, "in")]) + @context_dependent_memoize def get_sum_kernel(dtype_out, dtype_in): if dtype_out is None: diff --git a/setup.py b/setup.py index 9adccd876135787d83e1b6cf5f11f3abdbb8c7a5..906bbe7555bc6851e8d3941af8fe06e023f82f43 100644 --- a/setup.py +++ b/setup.py @@ -92,6 +92,7 @@ def get_config_schema(): IncludeDir("CUDA", None), Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"), Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"), + Switch("CUDA_ENABLE_CUDAGRAPH", True, "Enable CUDAGRAPH library"), LibraryDir("CUDADRV", default_lib_dirs), Libraries("CUDADRV", ["cuda"]), LibraryDir("CUDART", default_lib_dirs), @@ -168,6 +169,10 @@ def main(): for lib_dir in conf["CUDADRV_LIB_DIR"]: conf["LDFLAGS"].extend(["-Xlinker", "-rpath", "-Xlinker", lib_dir]) + if conf["CUDA_ENABLE_CUDAGRAPH"]: + EXTRA_SOURCES.append("src/wrapper/wrap_cudagraph.cpp") + EXTRA_DEFINES["HAVE_CUDAGRAPH"] = 1 + if conf["CUDA_ENABLE_GL"]: EXTRA_SOURCES.append("src/wrapper/wrap_cudagl.cpp") EXTRA_DEFINES["HAVE_GL"] = 1 diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp index f60bb1d9cd77286b801b2bc7a65b3586e5daf7a0..804e2141f321231fe65166e59bcf0b9d817f5c91 100644 --- a/src/cpp/cuda.hpp +++ b/src/cpp/cuda.hpp @@ -1354,6 +1354,14 @@ namespace pycuda : m_function(func), m_symbol(sym) { } + CUfunction handle(){ + return m_function; + } + + intptr_t handle_int(){ + return (intptr_t) m_function; + } + void set_block_shape(int x, int y, int z) { CUDAPP_CALL_GUARDED_WITH_TRACE_INFO( @@ -2140,6 +2148,16 @@ namespace pycuda return this; } + bool operator==(const event& rhs) + { + return m_event == rhs.m_event; + } + + bool operator!=(const event& rhs) + { + return !(*this == rhs); + } + CUevent handle() const { return m_event; } diff --git a/src/cpp/cudagraph.hpp b/src/cpp/cudagraph.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c3ff57b54f30150961b9edf07b070b3c125d72fe --- /dev/null +++ b/src/cpp/cudagraph.hpp @@ -0,0 +1,890 @@ +#ifndef _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_GRAPH_HPP +#define _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_GRAPH_HPP + +#include "../cpp/cuda.hpp" + +namespace pycuda { namespace cuda_graph { + +// {{{ CUDANode +class CUDANode{ + protected: + CUgraphNode m_node; + + public: + CUDANode(CUgraphNode node) : m_node(node) { + } + + ~CUDANode() { + } + + bool operator==(const CUDANode& rhs) + { + return m_node == rhs.m_node; + } + + bool operator!=(const CUDANode& rhs) + { + return !(*this == rhs); + } + + CUgraphNode handle() const { + return m_node; + } + }; + + //}}} + + py::list list_maker(CUgraphNode* nodes, size_t length){ + py::list list_nodes; + for (int i=0; i(list_nodes[i]); + nodes[i] = node->handle(); + } + return nodes; + } + + //{{{ CUDAGraph + class CUDAGraph : public context_dependent + { + protected: + CUgraph m_graph; + bool m_managed; + + public: + CUDAGraph(CUgraph graph, bool managed) + : m_graph(graph), m_managed(managed) + {} + + CUDAGraph(unsigned int flags=0) + : m_managed(true) + { + CUDAPP_CALL_GUARDED(cuGraphCreate, (&m_graph, flags)); + } + + ~CUDAGraph() { + free(); + } + + void free(){ + if (m_managed) + { + try + { + scoped_context_activation ca(get_context()); + CUDAPP_CALL_GUARDED_CLEANUP(cuGraphDestroy,(m_graph)); + } + CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(CUDAGraph); + + m_managed = false; + release_context(); + } + } + + bool operator==(const CUDAGraph& rhs) + { + return m_graph == rhs.m_graph; + } + + bool operator!=(const CUDAGraph& rhs) + { + return !(*this == rhs); + } + + CUgraph handle() 
diff --git a/setup.py b/setup.py
index 9adccd876135787d83e1b6cf5f11f3abdbb8c7a5..906bbe7555bc6851e8d3941af8fe06e023f82f43 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,7 @@ def get_config_schema():
         IncludeDir("CUDA", None),
         Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"),
         Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"),
+        Switch("CUDA_ENABLE_CUDAGRAPH", True, "Enable CUDA Graph support"),
         LibraryDir("CUDADRV", default_lib_dirs),
         Libraries("CUDADRV", ["cuda"]),
         LibraryDir("CUDART", default_lib_dirs),
@@ -168,6 +169,10 @@ def main():
     for lib_dir in conf["CUDADRV_LIB_DIR"]:
         conf["LDFLAGS"].extend(["-Xlinker", "-rpath", "-Xlinker", lib_dir])
 
+    if conf["CUDA_ENABLE_CUDAGRAPH"]:
+        EXTRA_SOURCES.append("src/wrapper/wrap_cudagraph.cpp")
+        EXTRA_DEFINES["HAVE_CUDAGRAPH"] = 1
+
     if conf["CUDA_ENABLE_GL"]:
         EXTRA_SOURCES.append("src/wrapper/wrap_cudagl.cpp")
         EXTRA_DEFINES["HAVE_GL"] = 1
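The new switch surfaces in siteconf.py like the existing GL/CURAND toggles; a sketch of the relevant line (default shown):

    # siteconf.py
    CUDA_ENABLE_CUDAGRAPH = True  # set False to skip building wrap_cudagraph.cpp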
diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp
index f60bb1d9cd77286b801b2bc7a65b3586e5daf7a0..804e2141f321231fe65166e59bcf0b9d817f5c91 100644
--- a/src/cpp/cuda.hpp
+++ b/src/cpp/cuda.hpp
@@ -1354,6 +1354,14 @@ namespace pycuda
       : m_function(func), m_symbol(sym)
     { }
 
+    CUfunction handle()
+    {
+      return m_function;
+    }
+
+    intptr_t handle_int()
+    {
+      return (intptr_t) m_function;
+    }
+
     void set_block_shape(int x, int y, int z)
     {
       CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
@@ -2140,6 +2148,16 @@ namespace pycuda
         return this;
       }
 
+      bool operator==(const event &rhs) const
+      {
+        return m_event == rhs.m_event;
+      }
+
+      bool operator!=(const event &rhs) const
+      {
+        return !(*this == rhs);
+      }
+
       CUevent handle() const
       { return m_event; }
diff --git a/src/cpp/cudagraph.hpp b/src/cpp/cudagraph.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3ff57b54f30150961b9edf07b070b3c125d72fe
--- /dev/null
+++ b/src/cpp/cudagraph.hpp
@@ -0,0 +1,890 @@
+#ifndef _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_GRAPH_HPP
+#define _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_GRAPH_HPP
+
+#include <vector>
+
+#include "../cpp/cuda.hpp"
+
+namespace pycuda { namespace cuda_graph {
+
+  // {{{ CUDANode
+
+  class CUDANode
+  {
+    protected:
+      CUgraphNode m_node;
+
+    public:
+      CUDANode(CUgraphNode node) : m_node(node)
+      { }
+
+      ~CUDANode()
+      { }
+
+      bool operator==(const CUDANode &rhs)
+      {
+        return m_node == rhs.m_node;
+      }
+
+      bool operator!=(const CUDANode &rhs)
+      {
+        return !(*this == rhs);
+      }
+
+      CUgraphNode handle() const
+      {
+        return m_node;
+      }
+  };
+
+  // }}}
+
+  // convert between a C array of CUgraphNode handles and a py::list of
+  // CUDANode wrappers; the caller owns the array returned by list_extractor
+  inline py::list list_maker(CUgraphNode *nodes, size_t length)
+  {
+    py::list list_nodes;
+    for (size_t i = 0; i < length; i++)
+      list_nodes.append(CUDANode(nodes[i]));
+    return list_nodes;
+  }
+
+  inline CUgraphNode *list_extractor(py::list list_nodes)
+  {
+    CUgraphNode *nodes = new CUgraphNode[len(list_nodes)];
+    for (int i = 0; i < len(list_nodes); i++)
+    {
+      CUDANode *node = py::extract<CUDANode *>(list_nodes[i]);
+      nodes[i] = node->handle();
+    }
+    return nodes;
+  }
+  // {{{ CUDAGraph
+
+  class CUDAGraph : public context_dependent
+  {
+    protected:
+      CUgraph m_graph;
+      bool m_managed;
+
+    public:
+      CUDAGraph(CUgraph graph, bool managed)
+        : m_graph(graph), m_managed(managed)
+      { }
+
+      CUDAGraph(unsigned int flags=0)
+        : m_managed(true)
+      {
+        CUDAPP_CALL_GUARDED(cuGraphCreate, (&m_graph, flags));
+      }
+
+      ~CUDAGraph()
+      {
+        free();
+      }
+
+      void free()
+      {
+        if (m_managed)
+        {
+          try
+          {
+            scoped_context_activation ca(get_context());
+            CUDAPP_CALL_GUARDED_CLEANUP(cuGraphDestroy, (m_graph));
+          }
+          CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(CUDAGraph);
+
+          m_managed = false;
+          release_context();
+        }
+      }
+
+      bool operator==(const CUDAGraph &rhs)
+      {
+        return m_graph == rhs.m_graph;
+      }
+
+      bool operator!=(const CUDAGraph &rhs)
+      {
+        return !(*this == rhs);
+      }
+
+      CUgraph handle() const
+      {
+        return m_graph;
+      }
+
+      void debug_dot_print(std::string path)
+      {
+        // borrowed from https://github.com/gfokkema/pycuda/commit/57ca7e8879e2d1ce73d57232d81af883722eea85
+        CUDAPP_CALL_GUARDED(cuGraphDebugDotPrint, (m_graph, path.c_str(), 0))
+      }
+
+      CUDAGraph *clone() const
+      {
+        CUgraph clone;
+        CUDAPP_CALL_GUARDED(cuGraphClone, (&clone, m_graph));
+        return new CUDAGraph(clone, false);
+      }
+
+      CUDANode *add_empty_node(py::list list_dependencies) const
+      {
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUgraphNode empty_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddEmptyNode, (&empty_node, m_graph, dependencies, len(list_dependencies)));
+        return new CUDANode(empty_node);
+      }
+
+      py::tuple get_nodes() const
+      {
+        size_t numNodes = 0;
+        CUDAPP_CALL_GUARDED(cuGraphGetNodes, (m_graph, NULL, &numNodes));
+        if (numNodes == 0)
+          return py::tuple();
+        CUgraphNode *nodes = new CUgraphNode[numNodes];
+        CUDAPP_CALL_GUARDED(cuGraphGetNodes, (m_graph, nodes, &numNodes));
+        py::list list_nodes = list_maker(nodes, numNodes);
+        return py::tuple(list_nodes);
+      }
+
+      py::tuple get_root_nodes() const
+      {
+        size_t numRootNodes = 0;
+        CUDAPP_CALL_GUARDED(cuGraphGetRootNodes, (m_graph, NULL, &numRootNodes));
+        if (numRootNodes == 0)
+          return py::tuple();
+        CUgraphNode *root_nodes = new CUgraphNode[numRootNodes];
+        CUDAPP_CALL_GUARDED(cuGraphGetRootNodes, (m_graph, root_nodes, &numRootNodes));
+        py::list list_root_nodes = list_maker(root_nodes, numRootNodes);
+        return py::tuple(list_root_nodes);
+      }
+
+      void add_dependencies(CUDANode *user_from_node, py::list list_to_nodes)
+      {
+        CUgraphNode *to_nodes = list_extractor(list_to_nodes);
+        // cuGraphAddDependencies expects matched from/to pairs, so repeat
+        // the single "from" node once per "to" node
+        std::vector<CUgraphNode> from_nodes(len(list_to_nodes), user_from_node->handle());
+        CUDAPP_CALL_GUARDED(cuGraphAddDependencies, (m_graph, from_nodes.data(), to_nodes, len(list_to_nodes)));
+      }
+
+      void remove_dependencies(CUDANode *user_from_node, py::list list_to_nodes)
+      {
+        CUgraphNode *to_nodes = list_extractor(list_to_nodes);
+        std::vector<CUgraphNode> from_nodes(len(list_to_nodes), user_from_node->handle());
+        CUDAPP_CALL_GUARDED(cuGraphRemoveDependencies, (m_graph, from_nodes.data(), to_nodes, len(list_to_nodes)));
+      }
+
+      py::tuple get_dependencies(CUDANode *user_node)
+      {
+        size_t numDependencies = 0;
+        CUgraphNode node = user_node->handle();
+        CUDAPP_CALL_GUARDED(cuGraphNodeGetDependencies, (node, NULL, &numDependencies));
+        if (numDependencies == 0)
+          return py::tuple();
+        CUgraphNode *dependencies = new CUgraphNode[numDependencies];
+        CUDAPP_CALL_GUARDED(cuGraphNodeGetDependencies, (node, dependencies, &numDependencies));
+        py::list list_dependencies = list_maker(dependencies, numDependencies);
+        return py::tuple(list_dependencies);
+      }
+
+      py::tuple get_dependent_nodes(CUDANode *user_node)
+      {
+        size_t numDependentNodes = 0;
+        CUgraphNode node = user_node->handle();
+        CUDAPP_CALL_GUARDED(cuGraphNodeGetDependentNodes, (node, NULL, &numDependentNodes));
+        if (numDependentNodes == 0)
+          return py::tuple();
+        CUgraphNode *dependent_nodes = new CUgraphNode[numDependentNodes];
+        CUDAPP_CALL_GUARDED(cuGraphNodeGetDependentNodes, (node, dependent_nodes, &numDependentNodes));
+        py::list list_dependent_nodes = list_maker(dependent_nodes, numDependentNodes);
+        return py::tuple(list_dependent_nodes);
+      }
+      CUDANode *add_child_graph_node(CUDAGraph *user_child_graph, py::list list_dependencies)
+      {
+        CUgraph child_graph = user_child_graph->handle();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUgraphNode child_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddChildGraphNode, (&child_node, m_graph, dependencies, len(list_dependencies), child_graph));
+        return new CUDANode(child_node);
+      }
+
+      CUDAGraph *get_child_graph(CUDANode *user_child_node)
+      {
+        CUgraphNode child_node = user_child_node->handle();
+        CUgraph child_graph;
+        CUDAPP_CALL_GUARDED(cuGraphChildGraphNodeGetGraph, (child_node, &child_graph));
+        return new CUDAGraph(child_graph, false);
+      }
+
+      CUDANode *add_event_record_node(pycuda::event *user_event, const py::list list_dependencies)
+      {
+        CUevent event = user_event->handle();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUgraphNode event_record_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddEventRecordNode, (&event_record_node, m_graph, dependencies, len(list_dependencies), event));
+        return new CUDANode(event_record_node);
+      }
+
+      CUDANode *add_event_wait_node(pycuda::event *user_event, py::list list_dependencies)
+      {
+        CUevent event = user_event->handle();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUgraphNode event_wait_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddEventWaitNode, (&event_wait_node, m_graph, dependencies, len(list_dependencies), event));
+        return new CUDANode(event_wait_node);
+      }
+
+      pycuda::event *get_event_from_event_record_node(CUDANode *user_event_record_node)
+      {
+        CUevent event;
+        CUgraphNode event_record_node = user_event_record_node->handle();
+        CUDAPP_CALL_GUARDED(cuGraphEventRecordNodeGetEvent, (event_record_node, &event));
+        return new pycuda::event(event);
+      }
+
+      pycuda::event *get_event_from_event_wait_node(CUDANode *user_event_wait_node)
+      {
+        CUevent event;
+        CUgraphNode event_wait_node = user_event_wait_node->handle();
+        CUDAPP_CALL_GUARDED(cuGraphEventWaitNodeGetEvent, (event_wait_node, &event));
+        return new pycuda::event(event);
+      }
+
+      void set_record_node_event(CUDANode *user_event_record_node, pycuda::event *user_event)
+      {
+        CUgraphNode event_record_node = user_event_record_node->handle();
+        CUevent event = user_event->handle();
+        CUDAPP_CALL_GUARDED(cuGraphEventRecordNodeSetEvent, (event_record_node, event));
+      }
+
+      void set_wait_node_event(CUDANode *user_event_wait_node, pycuda::event *user_event)
+      {
+        CUgraphNode event_wait_node = user_event_wait_node->handle();
+        CUevent event = user_event->handle();
+        CUDAPP_CALL_GUARDED(cuGraphEventWaitNodeSetEvent, (event_wait_node, event));
+      }
+
+      py::tuple add_memalloc_node(py::list list_dependencies, size_t bytes)
+      {
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUDA_MEM_ALLOC_NODE_PARAMS nodeParams;
+        memset(&nodeParams, 0, sizeof(nodeParams));
+
+        nodeParams.bytesize = bytes;
+        nodeParams.poolProps.location.id = 0;
+        nodeParams.poolProps.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
+        nodeParams.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+
+        CUgraphNode memalloc_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemAllocNode, (&memalloc_node, m_graph,
+            dependencies, len(list_dependencies), &nodeParams));
+        CUdeviceptr devptr = nodeParams.dptr;
+        return py::make_tuple(CUDANode(memalloc_node), devptr);
+      }
+
+      CUDANode *add_memfree_node(py::list list_dependencies, CUdeviceptr input_ptr)
+      {
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUgraphNode memfree_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemFreeNode, (&memfree_node, m_graph,
+            dependencies, len(list_dependencies), input_ptr));
+        return new CUDANode(memfree_node);
+      }
+      CUDANode *add_kernel_node(pycuda::function *user_func,
+          py::tuple grid_dim_py, py::tuple block_dim_py,
+          py::list list_dependencies,
+          py::object parameter_buffer,
+          unsigned int sharedMemBytes)
+      {
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUfunction func = user_func->handle();
+
+        const unsigned axis_count = 3;
+        unsigned grid_dim[axis_count];
+        unsigned block_dim[axis_count];
+
+        for (unsigned i = 0; i < axis_count; ++i)
+        {
+          grid_dim[i] = 1;
+          block_dim[i] = 1;
+        }
+
+        pycuda_size_t gd_length = py::len(grid_dim_py);
+        if (gd_length > axis_count)
+          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+              "too many grid dimensions in kernel launch");
+
+        for (unsigned i = 0; i < gd_length; ++i)
+          grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);
+
+        pycuda_size_t bd_length = py::len(block_dim_py);
+        if (bd_length > axis_count)
+          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+              "too many block dimensions in kernel launch");
+
+        for (unsigned i = 0; i < bd_length; ++i)
+          block_dim[i] = py::extract<unsigned>(block_dim_py[i]);
+
+        // the packed argument buffer built on the Python side is passed
+        // through the "extra" launch-parameter mechanism
+        py_buffer_wrapper par_buf_wrapper;
+        par_buf_wrapper.get(parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+        size_t par_len = par_buf_wrapper.m_buf.len;
+
+        void *config[] = {
+          CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast<void *>(par_buf_wrapper.m_buf.buf),
+          CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len,
+          CU_LAUNCH_PARAM_END
+        };
+
+        CUDA_KERNEL_NODE_PARAMS nodeParams;
+        nodeParams.func = func;
+        nodeParams.gridDimX = grid_dim[0];
+        nodeParams.gridDimY = grid_dim[1];
+        nodeParams.gridDimZ = grid_dim[2];
+        nodeParams.blockDimX = block_dim[0];
+        nodeParams.blockDimY = block_dim[1];
+        nodeParams.blockDimZ = block_dim[2];
+        nodeParams.sharedMemBytes = sharedMemBytes;
+        nodeParams.kernelParams = 0;
+        nodeParams.extra = config;
+
+        CUgraphNode kernel_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddKernelNode, (&kernel_node, m_graph,
+            dependencies, len(list_dependencies), &nodeParams));
+        return new CUDANode(kernel_node);
+      }
+
+      // CUgraphNode add_host_node(py::list list_dependencies, CUhostFn func, void *userData, size_t numdependencies=0)
+      // {
+      //   CUDA_HOST_NODE_PARAMS nodeParams;
+      //   nodeParams.fn = func;
+      //   nodeParams.userData = userData;
+      //   CUgraphNode host_node;
+      //   CUDAPP_CALL_GUARDED(cuGraphAddHostNode, (&host_node, graph, &dependencies, numdependencies, &nodeParams));
+      //   return host_node;
+      // }
+
+      CUDANode *add_memcpy_htod_node(py::list list_dependencies,
+          py::object ctx_py,
+          py::object src, CUdeviceptr dstPtr,
+          size_t srcXInBytes, size_t dstXInBytes, size_t WidthInBytes)
+      {
+        py_buffer_wrapper buf_wrapper;
+        buf_wrapper.get(src.ptr(), PyBUF_ANY_CONTIGUOUS);
+        boost::shared_ptr<context> ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx_py);
+        context *ctx = ptr_ctx.get();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcHost = buf_wrapper.m_buf.buf;
+        nodeParams.dstDevice = dstPtr;
+        nodeParams.WidthInBytes = WidthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_HOST;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+        CUgraphNode memcpy_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemcpyNode, (&memcpy_node, m_graph, dependencies, len(list_dependencies), &nodeParams, ctx->handle()));
+        return new CUDANode(memcpy_node);
+      }
+
+      CUDANode *add_memcpy_dtoh_node(py::list list_dependencies,
+          py::object ctx_py,
+          CUdeviceptr srcPtr, py::object dst,
+          size_t srcXInBytes, size_t dstXInBytes, size_t WidthInBytes)
+      {
+        py_buffer_wrapper buf_wrapper;
+        buf_wrapper.get(dst.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
+        boost::shared_ptr<context> ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx_py);
+        context *ctx = ptr_ctx.get();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcDevice = srcPtr;
+        nodeParams.dstHost = buf_wrapper.m_buf.buf;
+        nodeParams.WidthInBytes = WidthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_HOST;
+
+        CUgraphNode memcpy_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemcpyNode, (&memcpy_node, m_graph, dependencies, len(list_dependencies), &nodeParams, ctx->handle()));
+        return new CUDANode(memcpy_node);
+      }
+
+      CUDANode *add_memcpy_dtod_node(py::list list_dependencies,
+          py::object ctx_py,
+          CUdeviceptr srcPtr, CUdeviceptr dstPtr,
+          size_t srcXInBytes, size_t dstXInBytes, size_t WidthInBytes)
+      {
+        boost::shared_ptr<context> ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx_py);
+        context *ctx = ptr_ctx.get();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcDevice = srcPtr;
+        nodeParams.dstDevice = dstPtr;
+        nodeParams.WidthInBytes = WidthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+
+        CUgraphNode memcpy_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemcpyNode, (&memcpy_node, m_graph, dependencies, len(list_dependencies), &nodeParams, ctx->handle()));
+        return new CUDANode(memcpy_node);
+      }
+
+      CUDANode *add_memcpy_atod_node(py::list list_dependencies,
+          py::object ctx_py,
+          pycuda::array *srcArray, CUdeviceptr dstPtr,
+          size_t srcXInBytes, size_t dstXInBytes, size_t WidthInBytes)
+      {
+        boost::shared_ptr<context> ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx_py);
+        context *ctx = ptr_ctx.get();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcArray = srcArray->handle();
+        nodeParams.dstDevice = dstPtr;
+        nodeParams.WidthInBytes = WidthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+
+        CUgraphNode memcpy_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemcpyNode, (&memcpy_node, m_graph, dependencies, len(list_dependencies), &nodeParams, ctx->handle()));
+        return new CUDANode(memcpy_node);
+      }
+      CUDANode *add_memcpy_atoa_node(py::list list_dependencies,
+          py::object ctx_py,
+          pycuda::array *srcArray, pycuda::array *dstArray,
+          size_t srcXInBytes, size_t dstXInBytes, size_t WidthInBytes)
+      {
+        boost::shared_ptr<context> ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx_py);
+        context *ctx = ptr_ctx.get();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcArray = srcArray->handle();
+        nodeParams.dstArray = dstArray->handle();
+        nodeParams.WidthInBytes = WidthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+
+        CUgraphNode memcpy_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemcpyNode, (&memcpy_node, m_graph, dependencies, len(list_dependencies), &nodeParams, ctx->handle()));
+        return new CUDANode(memcpy_node);
+      }
+
+      CUDANode *add_memcpy_dtoa_node(py::list list_dependencies,
+          py::object ctx_py,
+          CUdeviceptr srcPtr, pycuda::array *dstArray,
+          size_t srcXInBytes, size_t dstXInBytes, size_t WidthInBytes)
+      {
+        boost::shared_ptr<context> ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx_py);
+        context *ctx = ptr_ctx.get();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcDevice = srcPtr;
+        nodeParams.dstArray = dstArray->handle();
+        nodeParams.WidthInBytes = WidthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+
+        CUgraphNode memcpy_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemcpyNode, (&memcpy_node, m_graph, dependencies, len(list_dependencies), &nodeParams, ctx->handle()));
+        return new CUDANode(memcpy_node);
+      }
+
+      CUDANode *add_memset_node(py::list list_dependencies,
+          py::object ctx_py,
+          CUdeviceptr dstPtr,
+          unsigned int elementSize, unsigned int value,
+          size_t width, size_t height, size_t pitch)
+      {
+        boost::shared_ptr<context> ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx_py);
+        context *ctx = ptr_ctx.get();
+        CUgraphNode *dependencies = list_extractor(list_dependencies);
+        CUDA_MEMSET_NODE_PARAMS nodeParams;
+        nodeParams.dst = dstPtr;
+        nodeParams.value = value;
+        nodeParams.elementSize = elementSize;
+        nodeParams.width = width;
+        nodeParams.height = height;
+        nodeParams.pitch = pitch;
+        CUgraphNode memset_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemsetNode, (&memset_node, m_graph, dependencies, len(list_dependencies), &nodeParams, ctx->handle()));
+        return new CUDANode(memset_node);
+      }
+
+      // void set_host_node_params(CUgraphNode host_node, CUhostFn func, void *userData)
+      // {
+      //   CUDA_HOST_NODE_PARAMS nodeParams;
+      //   nodeParams.fn = func;
+      //   nodeParams.userData = userData;
+      //   CUDAPP_CALL_GUARDED(cuGraphHostNodeSetParams, (host_node, &nodeParams));
+      // }
+
+      void set_kernel_node_params(CUDANode *user_kernel_node, pycuda::function *user_func,
+          py::tuple grid_dim_py, py::tuple block_dim_py,
+          py::list list_kernel_params,
+          unsigned sharedMemBytes)
+      {
+        CUgraphNode kernel_node = user_kernel_node->handle();
+        CUfunction func = user_func->handle();
+
+        const unsigned axis_count = 3;
+        unsigned grid_dim[axis_count];
+        unsigned block_dim[axis_count];
+
+        for (unsigned i = 0; i < axis_count; ++i)
+        {
+          grid_dim[i] = 1;
+          block_dim[i] = 1;
+        }
+
+        pycuda_size_t gd_length = py::len(grid_dim_py);
+        if (gd_length > axis_count)
+          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+              "too many grid dimensions in kernel launch");
+
+        for (unsigned i = 0; i < gd_length; ++i)
+          grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);
+
+        pycuda_size_t bd_length = py::len(block_dim_py);
+        if (bd_length > axis_count)
+          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+              "too many block dimensions in kernel launch");
+
+        for (unsigned i = 0; i < bd_length; ++i)
+          block_dim[i] = py::extract<unsigned>(block_dim_py[i]);
+
+        void **kernelParams = new void *[len(list_kernel_params)];
+        CUdeviceptr *param_ptrs = new CUdeviceptr[len(list_kernel_params)];
+        for (unsigned i = 0; i < len(list_kernel_params); i++)
+        {
+          pycuda::device_allocation *temp_dev = py::extract<pycuda::device_allocation *>(list_kernel_params[i]);
+          // keep each device pointer value alive past the loop;
+          // kernelParams stores the address of that value
+          param_ptrs[i] = (CUdeviceptr) *temp_dev;
+          kernelParams[i] = (void *) &param_ptrs[i];
+        }
+
+        CUDA_KERNEL_NODE_PARAMS nodeParams;
+        nodeParams.func = func;
+        nodeParams.gridDimX = grid_dim[0];
+        nodeParams.gridDimY = grid_dim[1];
+        nodeParams.gridDimZ = grid_dim[2];
+        nodeParams.blockDimX = block_dim[0];
+        nodeParams.blockDimY = block_dim[1];
+        nodeParams.blockDimZ = block_dim[2];
+        nodeParams.sharedMemBytes = sharedMemBytes;
+        nodeParams.kernelParams = kernelParams;
+        nodeParams.extra = (void **) NULL;
+
+        CUDAPP_CALL_GUARDED(cuGraphKernelNodeSetParams, (kernel_node, &nodeParams));
+      }
+      void set_memcpy_htod_node_params(CUDANode *user_memcpy_node, py::object src, CUdeviceptr dstPtr,
+          size_t srcXInBytes, size_t dstXInBytes, size_t WidthInBytes)
+      {
+        CUgraphNode memcpy_node = user_memcpy_node->handle();
+        py_buffer_wrapper buf_wrapper;
+        buf_wrapper.get(src.ptr(), PyBUF_ANY_CONTIGUOUS);
+
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcHost = buf_wrapper.m_buf.buf;
+        nodeParams.dstDevice = dstPtr;
+        nodeParams.WidthInBytes = WidthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_HOST;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+        CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeSetParams, (memcpy_node, &nodeParams));
+      }
+
+      void set_memcpy_dtoh_node_params(CUDANode *user_memcpy_node, CUdeviceptr srcPtr, py::object dst,
+          size_t srcXInBytes, size_t dstXInBytes, size_t WidthInBytes)
+      {
+        CUgraphNode memcpy_node = user_memcpy_node->handle();
+        py_buffer_wrapper buf_wrapper;
+        buf_wrapper.get(dst.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
+
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcDevice = srcPtr;
+        nodeParams.dstHost = buf_wrapper.m_buf.buf;
+        nodeParams.WidthInBytes = WidthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_HOST;
+        CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeSetParams, (memcpy_node, &nodeParams));
+      }
+
+      void set_memcpy_dtod_node_params(CUDANode *user_memcpy_node, CUdeviceptr srcPtr, CUdeviceptr dstPtr,
+          size_t srcXInBytes, size_t dstXInBytes, size_t WidthInBytes)
+      {
+        CUgraphNode memcpy_node = user_memcpy_node->handle();
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcDevice = srcPtr;
+        nodeParams.dstDevice = dstPtr;
+        nodeParams.WidthInBytes = WidthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+        CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeSetParams, (memcpy_node, &nodeParams));
+      }
+
+      void set_memset_node_params(CUDANode *user_memset_node, CUdeviceptr dstPtr,
+          unsigned int value, unsigned int elementSize,
+          size_t width, size_t height, size_t pitch)
+      {
+        CUgraphNode memset_node = user_memset_node->handle();
+        CUDA_MEMSET_NODE_PARAMS nodeParams;
+        nodeParams.dst = dstPtr;
+        nodeParams.elementSize = elementSize;
+        nodeParams.width = width;
+        nodeParams.height = height;
+        nodeParams.pitch = pitch;
+        nodeParams.value = value;
+        CUDAPP_CALL_GUARDED(cuGraphMemsetNodeSetParams, (memset_node, &nodeParams));
+      }
+
+      // CUDA_HOST_NODE_PARAMS get_host_node_params(CUgraphNode host_node)
+      // {
+      //   CUDA_HOST_NODE_PARAMS nodeParams;
+      //   CUDAPP_CALL_GUARDED(cuGraphHostNodeGetParams, (host_node, &nodeParams));
+      //   return nodeParams;
+      // }
+
+      py::tuple get_kernel_node_params(CUDANode *user_kernel_node)
+      {
+        CUgraphNode kernel_node = user_kernel_node->handle();
+        CUDA_KERNEL_NODE_PARAMS nodeParams;
+
+        CUDAPP_CALL_GUARDED(cuGraphKernelNodeGetParams, (kernel_node, &nodeParams));
+
+        pycuda::function *func = new pycuda::function(nodeParams.func, "function");
+        py::tuple grid_dim_py = py::make_tuple(nodeParams.gridDimX,
+            nodeParams.gridDimY,
+            nodeParams.gridDimZ);
+        py::tuple block_dim_py = py::make_tuple(nodeParams.blockDimX,
+            nodeParams.blockDimY,
+            nodeParams.blockDimZ);
+        unsigned sharedMemBytes = nodeParams.sharedMemBytes;
+        void **kernelParams = nodeParams.kernelParams;
+        py::list list_kernel_arguments;
+        // NOTE: the driver API does not report the number of kernel
+        // parameters; sizeof on the pointer only recovers a single entry
+        size_t size_kernelParams = sizeof(kernelParams) / sizeof(kernelParams[0]);
+        for (unsigned i = 0; i < size_kernelParams; i++)
+        {
+          CUdeviceptr temp_ptr = *(CUdeviceptr *) kernelParams[i];
+          device_allocation *pre_to_python_devicealloc = new device_allocation(temp_ptr);
+          list_kernel_arguments.append(boost::ref(*pre_to_python_devicealloc));
+        }
+
+        return py::make_tuple(func, grid_dim_py, block_dim_py, list_kernel_arguments);
+      }
+
+      py::tuple get_memset_node_params(CUDANode *user_memset_node)
+      {
+        CUgraphNode memset_node = user_memset_node->handle();
+        CUDA_MEMSET_NODE_PARAMS nodeParams;
+
+        CUDAPP_CALL_GUARDED(cuGraphMemsetNodeGetParams, (memset_node, &nodeParams));
+
+        return py::make_tuple(nodeParams.dst, nodeParams.value, nodeParams.elementSize, nodeParams.width, nodeParams.height, nodeParams.pitch);
+      }
+
+      py::tuple get_memcpy_node_params(CUDANode *user_memcpy_node)
+      {
+        CUgraphNode memcpy_node = user_memcpy_node->handle();
+        CUDA_MEMCPY3D nodeParams;
+
+        CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeGetParams, (memcpy_node, &nodeParams));
+
+        return py::make_tuple(nodeParams.srcDevice, (CUdeviceptr) nodeParams.dstHost, nodeParams.srcXInBytes, nodeParams.dstXInBytes, nodeParams.WidthInBytes);
+      }
+  };
+
+  // }}}
+  // {{{ CUDAGraphExec
+
+  class CUDAGraphExec
+  {
+    protected:
+      CUgraphExec m_exec;
+      bool m_managed;
+
+    public:
+      CUDAGraphExec(CUgraphExec exec, bool managed)
+        : m_exec(exec), m_managed(managed)
+      { }
+
+      CUDAGraphExec(CUDAGraph *user_graph)
+        : m_managed(true)
+      {
+        CUgraph graph = user_graph->handle();
+        CUDAPP_CALL_GUARDED(cuGraphInstantiate, (&m_exec, graph, NULL, NULL, 0));
+      }
+
+      ~CUDAGraphExec()
+      {
+        if (m_managed)
+          cuGraphExecDestroy(m_exec);
+      }
+
+      CUgraphExec handle()
+      {
+        return m_exec;
+      }
+
+      void launch(py::object stream_py)
+      {
+        // borrowed from https://github.com/inducer/pycuda/pull/343/commits/57ca7e8879e2d1ce73d57232d81af883722eea85
+        PYCUDA_PARSE_STREAM_PY;
+        CUDAPP_CALL_GUARDED(cuGraphLaunch, (m_exec, s_handle));
+      }
+
+      void upload(py::object stream_py)
+      {
+        PYCUDA_PARSE_STREAM_PY;
+        CUDAPP_CALL_GUARDED(cuGraphUpload, (m_exec, s_handle));
+      }
+
+      void set_record_node_event(CUDANode *user_event_record_node, pycuda::event *user_event)
+      {
+        CUgraphNode event_record_node = user_event_record_node->handle();
+        CUevent event = user_event->handle();
+        CUDAPP_CALL_GUARDED(cuGraphExecEventRecordNodeSetEvent, (m_exec, event_record_node, event));
+      }
+
+      void set_wait_node_event(CUDANode *user_event_wait_node, pycuda::event *user_event)
+      {
+        CUgraphNode event_wait_node = user_event_wait_node->handle();
+        CUevent event = user_event->handle();
+        CUDAPP_CALL_GUARDED(cuGraphExecEventWaitNodeSetEvent, (m_exec, event_wait_node, event));
+      }
+
+      // void set_host_node_params(CUgraphNode host_node, CUhostFn func, void *userData)
+      // {
+      //   CUDA_HOST_NODE_PARAMS nodeParams;
+      //   nodeParams.fn = func;
+      //   nodeParams.userData = userData;
+      //   CUDAPP_CALL_GUARDED(cuGraphExecHostNodeSetParams, (graph_exec, host_node, &nodeParams));
+      // }
+
+      void set_kernel_node_params(CUDANode *user_kernel_node, pycuda::function *user_func,
+          py::tuple grid_dim_py, py::tuple block_dim_py)
+      {
+        CUgraphNode kernel_node = user_kernel_node->handle();
+        CUfunction func = user_func->handle();
+        const unsigned axis_count = 3;
+        unsigned grid_dim[axis_count];
+        unsigned block_dim[axis_count];
+
+        for (unsigned i = 0; i < axis_count; ++i)
+        {
+          grid_dim[i] = 1;
+          block_dim[i] = 1;
+        }
+
+        pycuda_size_t gd_length = py::len(grid_dim_py);
+        if (gd_length > axis_count)
+          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+              "too many grid dimensions in kernel launch");
+
+        for (unsigned i = 0; i < gd_length; ++i)
+          grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);
+
+        pycuda_size_t bd_length = py::len(block_dim_py);
+        if (bd_length > axis_count)
+          throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+              "too many block dimensions in kernel launch");
+
+        for (unsigned i = 0; i < bd_length; ++i)
+          block_dim[i] = py::extract<unsigned>(block_dim_py[i]);
+
+        CUDA_KERNEL_NODE_PARAMS nodeParams;
+        nodeParams.func = func;
+        nodeParams.gridDimX = grid_dim[0];
+        nodeParams.gridDimY = grid_dim[1];
+        nodeParams.gridDimZ = grid_dim[2];
+        nodeParams.blockDimX = block_dim[0];
+        nodeParams.blockDimY = block_dim[1];
+        nodeParams.blockDimZ = block_dim[2];
+        nodeParams.sharedMemBytes = 0;
+        nodeParams.kernelParams = NULL;
+        nodeParams.extra = (void **) NULL;
+
+        CUDAPP_CALL_GUARDED(cuGraphExecKernelNodeSetParams, (m_exec, kernel_node, &nodeParams));
+      }
+
+      void set_memcpy_node_params(CUDANode *user_memcpy_node, py::object ctx_py,
+          CUdeviceptr srcPtr, CUdeviceptr dstPtr,
+          size_t srcXInBytes, size_t dstXInBytes, size_t widthInBytes)
+      {
+        CUgraphNode memcpy_node = user_memcpy_node->handle();
+        boost::shared_ptr<context> ctx = py::extract<boost::shared_ptr<context> >(ctx_py);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.dstDevice = dstPtr;
+        nodeParams.srcDevice = srcPtr;
+        nodeParams.WidthInBytes = widthInBytes;
+        nodeParams.srcXInBytes = srcXInBytes;
+        nodeParams.dstXInBytes = dstXInBytes;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+        CUDAPP_CALL_GUARDED(cuGraphExecMemcpyNodeSetParams, (m_exec, memcpy_node, &nodeParams, ctx->handle()));
+      }
+
+      void set_memset_node_params(CUDANode *user_memset_node, py::object ctx_py,
+          CUdeviceptr dstPtr, unsigned int value, unsigned int elementSize,
+          size_t width, size_t height, size_t pitch)
+      {
+        CUgraphNode memset_node = user_memset_node->handle();
+        boost::shared_ptr<context> ctx = py::extract<boost::shared_ptr<context> >(ctx_py);
+        CUDA_MEMSET_NODE_PARAMS nodeParams;
+        nodeParams.dst = dstPtr;
+        nodeParams.elementSize = elementSize;
+        nodeParams.width = width;
+        nodeParams.height = height;
+        nodeParams.pitch = pitch;
+        nodeParams.value = value;
+        CUDAPP_CALL_GUARDED(cuGraphExecMemsetNodeSetParams, (m_exec, memset_node, &nodeParams, ctx->handle()));
+      }
+
+      void update_child_graph(CUDANode *user_child_node, CUDAGraph *user_child_graph)
+      {
+        CUgraphNode child_node = user_child_node->handle();
+        CUgraph child_graph = user_child_graph->handle();
+        CUDAPP_CALL_GUARDED(cuGraphExecChildGraphNodeSetParams, (m_exec, child_node, child_graph));
+      }
+  };
+
+  // }}}
+
+  // {{{ stream capture
+  // borrowed from https://github.com/inducer/pycuda/pull/343/files
+
+  inline void begin_capture(py::object stream_py, CUstreamCaptureMode mode = CU_STREAM_CAPTURE_MODE_GLOBAL)
+  {
+    PYCUDA_PARSE_STREAM_PY;
+    CUDAPP_CALL_GUARDED(cuStreamBeginCapture, (s_handle, mode));
+  }
+
+  inline CUDAGraph *end_capture(py::object stream_py)
+  {
+    CUgraph graph;
+    PYCUDA_PARSE_STREAM_PY;
+    CUDAPP_CALL_GUARDED(cuStreamEndCapture, (s_handle, &graph))
+    return new CUDAGraph(graph, false);
+  }
+
+  // }}}
+
+}}
+
+#endif
+
+// vim: foldmethod=marker
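The classes above surface in Python as pycuda.driver.Graph, Node and GraphExec (see wrap_cudagraph.cpp below). A hedged end-to-end sketch of manual graph construction; all names come from this patch:

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as drv

    ctx = drv.Context.get_current()
    g = drv.Graph()

    a = np.arange(4, dtype=np.float32)
    out = np.empty_like(a)
    a_gpu = drv.mem_alloc(a.nbytes)

    h2d = g.add_memcpy_htod_node([], ctx, a, a_gpu, 0, 0, a.nbytes)
    g.add_memcpy_dtoh_node([h2d], ctx, a_gpu, out, 0, 0, a.nbytes)

    drv.GraphExec(g).launch()
    drv.Context.synchronize()
    np.testing.assert_array_equal(out, a)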
diff --git a/src/wrapper/tools.hpp b/src/wrapper/tools.hpp
index 98a7d8c3714d4069a3db8c5449937869654fa45d..a2861a64bade7f48287cd00cab232331d2407ade 100644
--- a/src/wrapper/tools.hpp
+++ b/src/wrapper/tools.hpp
@@ -4,7 +4,7 @@
 
 
 
-#include <cuda.hpp>
+#include "../cpp/cuda.hpp"
 #include
 #include
 #include
@@ -46,7 +46,7 @@ namespace pycuda
         return pycuda::mem_alloc(bytes);
       }
       catch (pycuda::error &e)
-      {
+      {
        if (e.code() != CUDA_ERROR_OUT_OF_MEMORY)
          throw;
       }
diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index 01f09189f23e37ab37b73d95075c2ae641889a30..04b9e2acd20f8ff2c07c3610159049799ba28c5e 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -82,7 +82,7 @@ namespace
   class mem_host_register_flags { };
   class mem_peer_register_flags { };
   class array3d_flags { };
-
+  class CUDAGraph { };
 
   // {{{ "python-aware" wrappers
@@ -654,6 +654,7 @@ namespace
 void pycuda_expose_tools();
 void pycuda_expose_gl();
 void pycuda_expose_curand();
+void pycuda_expose_cuda_graph();
 
 
 
@@ -1203,6 +1204,14 @@ BOOST_PYTHON_MODULE(_driver)
   // }}}
 
   // {{{ stream
+  // borrowed from https://github.com/inducer/pycuda/pull/343/files
+#if CUDAPP_CUDA_VERSION >= 10000
+  py::enum_<CUstreamCaptureMode>("capture_mode")
+    .value("GLOBAL", CU_STREAM_CAPTURE_MODE_GLOBAL)
+    .value("THREAD_LOCAL", CU_STREAM_CAPTURE_MODE_THREAD_LOCAL)
+    .value("RELAXED", CU_STREAM_CAPTURE_MODE_RELAXED)
+    ;
+#endif
   {
     typedef stream cl;
     py::class_<cl, boost::noncopyable, shared_ptr<cl> >
@@ -1277,7 +1286,7 @@ BOOST_PYTHON_MODULE(_driver)
       .def("_param_setf", (void (cl::*)(int, float )) &cl::param_set)
       .def("_param_setv", function_param_setv)
       .DEF_SIMPLE_METHOD(param_set_texref)
-
+      .add_property("handle_int", &cl::handle_int)
       .def("_launch", &cl::launch)
       .def("_launch_grid", &cl::launch_grid,
@@ -1598,6 +1607,8 @@ BOOST_PYTHON_MODULE(_driver)
       ("Event", py::init<py::optional<unsigned int> >(py::arg("flags")))
       .def("record", &cl::record,
           py::arg("stream")=py::object(), py::return_self<>())
+      .def("__eq__", &cl::operator==)
+      .def("__ne__", &cl::operator!=)
       .def("synchronize", &cl::synchronize, py::return_self<>())
       .DEF_SIMPLE_METHOD(query)
       .DEF_SIMPLE_METHOD(time_since)
@@ -1717,6 +1728,10 @@ BOOST_PYTHON_MODULE(_driver)
 #ifdef HAVE_CURAND
   pycuda_expose_curand();
 #endif
+#ifdef HAVE_CUDAGRAPH
+  pycuda_expose_cuda_graph();
+#endif
+
 }
 
 // vim: foldmethod=marker
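The Event __eq__/__ne__ bindings exist mainly so that events retrieved back out of graph nodes compare equal to the originals; a short sketch using only names from the patch:

    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as drv

    g = drv.Graph()
    ev = drv.Event()
    node = g.add_event_record_node(ev, [])
    assert g.get_event_from_event_record_node(node) == ev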
diff --git a/src/wrapper/wrap_cudagraph.cpp b/src/wrapper/wrap_cudagraph.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..eedc451f863497c3a5f0b828ff956d035a51ee4a
--- /dev/null
+++ b/src/wrapper/wrap_cudagraph.cpp
@@ -0,0 +1,108 @@
+#ifdef _WIN32
+  #include <windows.h>
+#endif
+
+#include "../cpp/cuda.hpp"
+#include "../cpp/cudagraph.hpp"
+
+#include "tools.hpp"
+#include "wrap_helpers.hpp"
+
+using namespace pycuda;
+using namespace pycuda::cuda_graph;
+using boost::shared_ptr;
+
+
+void pycuda_expose_cuda_graph()
+{
+  using py::args;
+
+  // {{{ CUDANode
+  {
+    typedef CUDANode cl;
+    py::class_<cl>("Node", py::no_init)
+      .def("__eq__", &cl::operator==)
+      .def("__ne__", &cl::operator!=)
+      ;
+  }
+  // }}}
+
+  // {{{ CUDAGraph
+  {
+    typedef CUDAGraph cl;
+    py::class_<cl>("Graph", py::init<py::optional<unsigned int> >(py::arg("flags")))
+      .def("__eq__", &cl::operator==)
+      .def("__ne__", &cl::operator!=)
+      .DEF_SIMPLE_METHOD(debug_dot_print)
+      .def("clone", &cl::clone, py::return_value_policy<py::manage_new_object>())
+      .def("add_empty_node", &cl::add_empty_node, py::return_value_policy<py::manage_new_object>(), py::arg("list_dependencies")=py::list())
+      .def("get_nodes", &cl::get_nodes)
+      .def("get_root_nodes", &cl::get_root_nodes)
+      .def("add_dependencies", &cl::add_dependencies, py::arg("user_from_node"), py::arg("list_to_nodes"))
+      .def("remove_dependencies", &cl::remove_dependencies, py::arg("user_from_node"), py::arg("list_to_nodes"))
+      .def("get_dependencies", &cl::get_dependencies, py::arg("user_node"))
+      .def("get_dependent_nodes", &cl::get_dependent_nodes, py::arg("user_node"))
+      .def("add_child_graph_node", &cl::add_child_graph_node, py::return_value_policy<py::manage_new_object>(), py::arg("user_child_graph"), py::arg("list_dependencies")=py::list())
+      .def("get_child_graph", &cl::get_child_graph, py::return_value_policy<py::manage_new_object>(), py::arg("user_child_node"))
+      .def("add_event_record_node", &cl::add_event_record_node, py::return_value_policy<py::manage_new_object>(), py::arg("user_event"), py::arg("list_dependencies")=py::list())
+      .def("add_event_wait_node", &cl::add_event_wait_node, py::return_value_policy<py::manage_new_object>(), py::arg("user_event"), py::arg("list_dependencies")=py::list())
+      .def("get_event_from_event_record_node", &cl::get_event_from_event_record_node, py::return_value_policy<py::manage_new_object>(), py::arg("user_event_record_node"))
+      .def("get_event_from_event_wait_node", &cl::get_event_from_event_wait_node, py::return_value_policy<py::manage_new_object>(), py::arg("user_event_wait_node"))
+      .def("set_record_node_event", &cl::set_record_node_event, py::arg("user_event_record_node"), py::arg("user_event"))
+      .def("set_wait_node_event", &cl::set_wait_node_event, py::arg("user_event_wait_node"), py::arg("user_event"))
+      .def("add_memalloc_node", &cl::add_memalloc_node)
+      .def("add_memfree_node", &cl::add_memfree_node, py::return_value_policy<py::manage_new_object>())
+      .def("_add_kernel_node", &cl::add_kernel_node, py::return_value_policy<py::manage_new_object>())
+      .def("add_memcpy_htod_node", &cl::add_memcpy_htod_node, py::return_value_policy<py::manage_new_object>())
+      .def("add_memcpy_dtoh_node", &cl::add_memcpy_dtoh_node, py::return_value_policy<py::manage_new_object>())
+      .def("add_memcpy_dtod_node", &cl::add_memcpy_dtod_node, py::return_value_policy<py::manage_new_object>())
+      .def("add_memcpy_atod_node", &cl::add_memcpy_atod_node, py::return_value_policy<py::manage_new_object>())
+      .def("add_memcpy_atoa_node", &cl::add_memcpy_atoa_node, py::return_value_policy<py::manage_new_object>())
+      .def("add_memcpy_dtoa_node", &cl::add_memcpy_dtoa_node, py::return_value_policy<py::manage_new_object>())
+      .def("add_memset_node", &cl::add_memset_node, py::return_value_policy<py::manage_new_object>())
+      .def("set_kernel_node_params", &cl::set_kernel_node_params)
+      .def("set_memcpy_htod_node_params", &cl::set_memcpy_htod_node_params)
+      .def("set_memcpy_dtoh_node_params", &cl::set_memcpy_dtoh_node_params)
+      .def("set_memcpy_dtod_node_params", &cl::set_memcpy_dtod_node_params)
+      .def("set_memset_node_params", &cl::set_memset_node_params)
+      .def("get_kernel_node_params", &cl::get_kernel_node_params)
+      .def("get_memcpy_node_params", &cl::get_memcpy_node_params)
+      .def("get_memset_node_params", &cl::get_memset_node_params)
+      ;
+  }
+  // }}}
+
+  // {{{ CUDAGraphExec
+  {
+    typedef CUDAGraphExec cl;
+    py::class_<cl, boost::noncopyable>("GraphExec", py::init<CUDAGraph *>())
+      .def("launch", &cl::launch, (py::arg("stream_py")=py::object()))
+      .def("upload", &cl::upload, (py::arg("stream_py")=py::object()))
+      .def("set_record_node_event", &cl::set_record_node_event)
+      .def("set_wait_node_event", &cl::set_wait_node_event)
+      .def("set_kernel_node_params", &cl::set_kernel_node_params)
+      .def("set_memcpy_node_params", &cl::set_memcpy_node_params)
+      .def("set_memset_node_params", &cl::set_memset_node_params)
+      .def("update_child_graph", &cl::update_child_graph)
+      ;
+  }
+  // }}}
+
+  // {{{ stream capture
+
+  // the capture_mode enum is registered once, in wrap_cudadrv.cpp
+
+  py::def("begin_capture", begin_capture,
+      (py::arg("stream_py"),
+       py::arg("capture_mode") = CU_STREAM_CAPTURE_MODE_GLOBAL));
+  py::def("end_capture", end_capture, py::return_value_policy<py::manage_new_object>());
+
+  // }}}
+
+}
+
+// vim: foldmethod=marker
\ No newline at end of file
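begin_capture/end_capture wrap cuStreamBeginCapture/cuStreamEndCapture: work submitted to the stream in between is recorded into a Graph instead of running. A minimal sketch, assuming the patch is built:

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as drv

    a = np.zeros(16, dtype=np.float32)
    a_gpu = drv.mem_alloc_like(a)

    stream = drv.Stream()
    drv.begin_capture(stream)
    drv.memcpy_htod_async(a_gpu, a, stream)
    drv.memcpy_dtoh_async(a, a_gpu, stream)
    graph = drv.end_capture(stream)

    drv.GraphExec(graph).launch(stream)
    stream.synchronize()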
.def("update_child_graph",&cl::update_child_graph); + } + // }}} + + // {{{ stream capture + + #if CUDAPP_CUDA_VERSION >= 10000 + py::enum_("capture_mode") + .value("GLOBAL", CU_STREAM_CAPTURE_MODE_GLOBAL) + .value("THREAD_LOCAL", CU_STREAM_CAPTURE_MODE_THREAD_LOCAL) + .value("RELAXED", CU_STREAM_CAPTURE_MODE_RELAXED) + ; + #endif + + py::def("begin_capture", begin_capture, + py::arg("capture_mode") = CU_STREAM_CAPTURE_MODE_GLOBAL); + py::def("end_capture", end_capture, py::return_value_policy()); + // }}} + + } + +// vim: foldmethod=marker \ No newline at end of file diff --git a/test/test_cudagraph.py b/test/test_cudagraph.py new file mode 100644 index 0000000000000000000000000000000000000000..53a6132b782df71cc5fc89cb6f7976c6df74997b --- /dev/null +++ b/test/test_cudagraph.py @@ -0,0 +1,236 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + +import numpy as np +import pytest # noqa + +import pycuda +import pycuda.autoinit # noqa: F401 +import pycuda.driver as drv +from pycuda.compiler import SourceModule + +ctx = drv.Context.get_current() +dev = ctx.get_device() +post_volta_devices = ['NVIDIA TITAN V'] +post_volta_flag = dev.name() in post_volta_devices + + +class TestCUDAGraph: + @pytest.mark.parametrize("post_volta_flag", [post_volta_flag]) + def test_graph_features(self, post_volta_flag): + + g = drv.Graph() + empty_node = g.add_empty_node([]) + assert (empty_node == g.get_nodes()[0]) + + g_exec = drv.GraphExec(g) + g_exec.launch() + + clone = g.clone() + child_node = g.add_child_graph_node(clone, []) + assert (len(clone.get_nodes()) == len(g.get_child_graph(child_node).get_nodes())) + g.add_dependencies(child_node, [empty_node]) + assert ((empty_node, ) == g.get_dependent_nodes(child_node)) + g.remove_dependencies(child_node, [empty_node]) + g.add_dependencies(empty_node, [child_node]) + assert ((empty_node, ) == g.get_dependencies(child_node)) + + event = drv.Event() + event2 = drv.Event() + event_record_node = g.add_event_record_node(event, [empty_node]) + assert ((empty_node, ) == g.get_dependencies(event_record_node)) + assert ((child_node, event_record_node) == g.get_dependent_nodes(empty_node)) + assert ((empty_node, ) == g.get_root_nodes()) + g.set_record_node_event(event_record_node, event2) + assert (event2 == g.get_event_from_event_record_node(event_record_node)) + event_wait_node = g.add_event_wait_node(event, [empty_node]) + g.set_wait_node_event(event_wait_node, event2) + assert (event2 == g.get_event_from_event_wait_node(event_wait_node)) + + @pytest.mark.parametrize("post_volta_flag", [post_volta_flag]) + def test_kernel_node(self, post_volta_flag): + + g = drv.Graph() + mod = SourceModule(""" + __global__ void doublify(float *a) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] *= 2; + } + """) + + doublify = mod.get_function("doublify") + + a = np.random.randn(4, 4).astype(np.float32) + a_doubled = np.empty_like(a) + if post_volta_flag: + mem_alloc_node, a_gpu = g.add_memalloc_node([], a.nbytes) + else: + a_gpu = drv.mem_alloc(a.nbytes) + + ctx = pycuda.tools.make_default_context() + memcpy_htod_node = g.add_memcpy_htod_node([mem_alloc_node] if post_volta_flag else [], ctx, a, a_gpu, 0, 0, a.nbytes) + kernel_node = g.add_kernel_node(a_gpu, func=doublify, grid=(1, ), block=(4, 4, 1), dependencies=[memcpy_htod_node], shared=0) + memcpy_dtoh_node = g.add_memcpy_dtoh_node([kernel_node], ctx, a_gpu, a_doubled, 0, 0, a.nbytes) + if post_volta_flag: + g.add_memfree_node([memcpy_dtoh_node], a_gpu) + g_exec = 
+    @pytest.mark.parametrize("post_volta_flag", [post_volta_flag])
+    def test_kernel_node(self, post_volta_flag):
+
+        g = drv.Graph()
+        mod = SourceModule("""
+        __global__ void doublify(float *a)
+        {
+          int idx = threadIdx.x + threadIdx.y*4;
+          a[idx] *= 2;
+        }
+        """)
+
+        doublify = mod.get_function("doublify")
+
+        a = np.random.randn(4, 4).astype(np.float32)
+        a_doubled = np.empty_like(a)
+        if post_volta_flag:
+            mem_alloc_node, a_gpu = g.add_memalloc_node([], a.nbytes)
+        else:
+            a_gpu = drv.mem_alloc(a.nbytes)
+
+        ctx = pycuda.tools.make_default_context()
+        memcpy_htod_node = g.add_memcpy_htod_node([mem_alloc_node] if post_volta_flag else [], ctx, a, a_gpu, 0, 0, a.nbytes)
+        kernel_node = g.add_kernel_node(a_gpu, func=doublify, grid=(1, ), block=(4, 4, 1), dependencies=[memcpy_htod_node], shared=0)
+        memcpy_dtoh_node = g.add_memcpy_dtoh_node([kernel_node], ctx, a_gpu, a_doubled, 0, 0, a.nbytes)
+        if post_volta_flag:
+            g.add_memfree_node([memcpy_dtoh_node], a_gpu)
+        g_exec = drv.GraphExec(g)
+        g_exec.launch()
+        drv.Context.synchronize()
+
+        np.testing.assert_allclose(a_doubled, a * 2, rtol=1e-5)
+
+        b = np.random.randn(4, 4).astype(np.float32)
+        b_gpu = drv.mem_alloc(b.nbytes)
+        g.set_kernel_node_params(kernel_node, doublify, (1, ), (4, 4, 1), [b_gpu], 0)
+        assert (g.get_kernel_node_params(kernel_node)[1] == (1, 1, 1))
+        assert (g.get_kernel_node_params(kernel_node)[2] == (4, 4, 1))
+
+        ctx.pop()
+
+    @pytest.mark.parametrize("post_volta_flag", [post_volta_flag])
+    def test_memcpy_node(self, post_volta_flag):
+
+        g = drv.Graph()
+        a = np.random.randn(32, 8).astype(np.float32)
+        b = np.empty([32, 8], dtype=np.float32)
+        if post_volta_flag:
+            mem_alloc_node_a_gpu, a_gpu = g.add_memalloc_node([], a.nbytes)
+            mem_alloc_node_b_gpu, b_gpu = g.add_memalloc_node([], b.nbytes)
+        else:
+            a_gpu = drv.mem_alloc(a.nbytes)
+            b_gpu = drv.mem_alloc(b.nbytes)
+
+        ctx = pycuda.tools.make_default_context()
+
+        node_htod = g.add_memcpy_htod_node([mem_alloc_node_a_gpu] if post_volta_flag else [], ctx, a, a_gpu, 0, 0, a.nbytes)
+        node_dtod = g.add_memcpy_dtod_node([mem_alloc_node_b_gpu, node_htod] if post_volta_flag else [node_htod], ctx, a_gpu, b_gpu, 0, 0, a.nbytes)
+        if post_volta_flag:
+            g.add_memfree_node([node_dtod], a_gpu)
+        node_dtoh = g.add_memcpy_dtoh_node([node_dtod], ctx, b_gpu, b, 0, 0, a.nbytes)
+        if post_volta_flag:
+            g.add_memfree_node([node_dtoh], b_gpu)
+        g.set_memcpy_dtoh_node_params(node_dtoh, b_gpu, b, 0, 0, a.nbytes)
+        assert (g.get_memcpy_node_params(node_dtoh)[2] == 0)
+
+        g_exec = drv.GraphExec(g)
+        g_exec.launch()
+        drv.Context.synchronize()
+
+        np.testing.assert_allclose(a, b, rtol=1e-5)
+
+        a_array = drv.np_to_array(a, "C")
+        a_array_gpu = drv.mem_alloc(a.nbytes)
+        b_array = drv.np_to_array(b, "C")
+        b_array_gpu = drv.mem_alloc(b.nbytes)
+        node_atod = g.add_memcpy_atod_node([node_dtoh], ctx, a_array, a_array_gpu, 0, 0, a.nbytes)
+        node_atoa = g.add_memcpy_atoa_node([node_atod], ctx, a_array, b_array, 0, 0, a.nbytes)
+        g.add_memcpy_dtoa_node([node_atoa], ctx, a_array_gpu, b_array, 0, 0, a.nbytes)
+        ctx.pop()
+
+    @pytest.mark.parametrize("post_volta_flag", [post_volta_flag])
+    def test_memset_node(self, post_volta_flag):
+        g = drv.Graph()
+
+        a = np.random.randn(32, 8).astype(np.float32)
+        if post_volta_flag:
+            mem_alloc_node_a_gpu, a_gpu = g.add_memalloc_node([], a.nbytes)
+        else:
+            a_gpu = drv.mem_alloc(a.nbytes)
+        ctx = pycuda.tools.make_default_context()
+
+        memset_node = g.add_memset_node([mem_alloc_node_a_gpu] if post_volta_flag else [], ctx, a_gpu, 1, 1, 16, 1, 1)
+        if post_volta_flag:
+            g.add_memfree_node([memset_node], a_gpu)
+        b = np.random.randn(32, 8).astype(np.float32)
+        if post_volta_flag:
+            mem_alloc_node_b_gpu, b_gpu = g.add_memalloc_node([], b.nbytes)
+        else:
+            b_gpu = drv.mem_alloc(b.nbytes)
+        g.set_memset_node_params(memset_node, b_gpu, 1, 1, 16, 1, 1)
+        assert (g.get_memset_node_params(memset_node)[1] == 1)
+
+        g_exec = drv.GraphExec(g)
+        g_exec.launch()
+        ctx.pop()
+    @pytest.mark.parametrize("post_volta_flag", [post_volta_flag])
+    def test_stream_capture(self, post_volta_flag):
+        # borrowed from https://github.com/inducer/pycuda/pull/343/files#diff-8a50e6f7173b100e6323e9433913e982831b7f440eb6ca48cd8c339ad6c24383R1226
+        # Sample source code from the Tutorial Introduction in the documentation.
+
+        mod = SourceModule("""
+        __global__ void plus(float *a)
+        {
+          int idx = threadIdx.x + threadIdx.y*4;
+          a[idx] += 1;
+        }
+        __global__ void times(float *a, float *b)
+        {
+          int idx = threadIdx.x + threadIdx.y*4;
+          a[idx] *= b[idx];
+        }
+        """)
+        func_plus = mod.get_function("plus")
+        func_times = mod.get_function("times")
+
+        a = np.zeros((4, 4)).astype(np.float32)
+        a_gpu = drv.mem_alloc_like(a)
+        b = np.zeros((4, 4)).astype(np.float32)
+        b_gpu = drv.mem_alloc_like(b)
+        result = np.zeros_like(b)
+
+        # begin graph capture, pull stream_2 into it as a dependency
+        # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cross-stream-dependencies
+        stream_1 = drv.Stream()
+        stream_2 = drv.Stream()
+        drv.begin_capture(stream_1)
+        event_init = drv.Event()
+        event_a = drv.Event()
+        event_b = drv.Event()
+
+        event_init.record(stream_1)
+        stream_2.wait_for_event(event_init)
+
+        drv.memcpy_htod_async(a_gpu, a, stream_1)
+        func_plus(a_gpu, block=(4, 4, 1), stream=stream_1)
+        event_a.record(stream_1)
+
+        drv.memcpy_htod_async(b_gpu, b, stream_2)
+        func_plus(b_gpu, block=(4, 4, 1), stream=stream_2)
+        event_b.record(stream_2)
+
+        stream_1.wait_for_event(event_a)
+        stream_1.wait_for_event(event_b)
+        func_times(a_gpu, b_gpu, block=(4, 4, 1), stream=stream_1)
+        drv.memcpy_dtoh_async(result, a_gpu, stream_1)
+
+        graph = drv.end_capture(stream_1)
+        instance = drv.GraphExec(graph)
+
+        # using a separate graph stream to launch; this is not strictly necessary
+        stream_graph = drv.Stream()
+        instance.launch(stream_graph)
+        stream_graph.synchronize()
+
+        # manual mode
+        g = drv.Graph()
+        if post_volta_flag:
+            mem_alloc_node_a_gpu, a_gpu_m = g.add_memalloc_node([], a.nbytes)
+            mem_alloc_node_b_gpu, b_gpu_m = g.add_memalloc_node([], b.nbytes)
+        else:
+            a_gpu_m = drv.mem_alloc(a.nbytes)
+            b_gpu_m = drv.mem_alloc(b.nbytes)
+        result_m = np.zeros_like(b)
+        ctx = pycuda.tools.make_default_context()
+        memcpy_htod_node_a_gpu = g.add_memcpy_htod_node([mem_alloc_node_a_gpu] if post_volta_flag else [], ctx, a, a_gpu_m, 0, 0, a.nbytes)
+        memcpy_htod_node_b_gpu = g.add_memcpy_htod_node([mem_alloc_node_b_gpu] if post_volta_flag else [], ctx, b, b_gpu_m, 0, 0, b.nbytes)
+        kernel_node_a_gpu_m = g.add_kernel_node(a_gpu_m, func=func_plus, grid=(1,), block=(4, 4, 1), dependencies=[memcpy_htod_node_a_gpu], shared=0)
+        kernel_node_b_gpu_m = g.add_kernel_node(b_gpu_m, func=func_plus, grid=(1,), block=(4, 4, 1), dependencies=[memcpy_htod_node_b_gpu], shared=0)
+        kernel_node = g.add_kernel_node(a_gpu_m, b_gpu_m, func=func_times, grid=(1,), block=(4, 4, 1), dependencies=[kernel_node_a_gpu_m, kernel_node_b_gpu_m], shared=0)
+        g.add_memcpy_dtoh_node([kernel_node], ctx, a_gpu_m, result_m, 0, 0, b.nbytes)
+
+        g_exec = drv.GraphExec(g)
+        g_exec.launch()
+        drv.Context.synchronize()
+        ctx.pop()
+
+        np.testing.assert_allclose(result, result_m, rtol=1e-5)
diff --git a/test/test_driver.py b/test/test_driver.py
index 9deae3be71856c762d96395d285bcedbc2b174b2..a7a77dc696e0c3b2339f2e88210a37166bde5278 100644
--- a/test/test_driver.py
+++ b/test/test_driver.py
@@ -8,7 +8,8 @@ import numpy.linalg as la
 from pycuda.tools import mark_cuda_test, dtype_to_ctype
 import pytest  # noqa
 
-
+import pycuda
+import pycuda.autoinit  # noqa: F401
 import pycuda.gpuarray as gpuarray
 import pycuda.driver as drv
 from pycuda.compiler import SourceModule
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 7091fee9481c52f4109ea1085a0d2cb0fa9765dc..f73abe5a58434892eba077fef0ffd446b74cd3c6 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -419,7 +419,6 @@ class TestGPUArray:
     def test_array_ne(self):
         """Test whether array contents are != the other array's
         contents"""
-
         a = np.array([5, 10]).astype(np.float32)
         a_gpu = gpuarray.to_gpu(a)
         b = np.array([2, 10]).astype(np.float32)
@@ -428,6 +427,21 @@ class TestGPUArray:
         assert result[0]
         assert not result[1]
 
+    def test_array_and(self):
+        """Test whether the ``&`` operator combines array contents with the
+        other array's contents."""
+        a = np.array([True, True])
+        a_gpu = gpuarray.to_gpu(a)
+        b = np.array([False, True])
+        b_gpu = gpuarray.to_gpu(b)
+        result = (a_gpu & b_gpu).get()
+        assert not result[0]
+        assert result[1]
+        result2 = (False & a_gpu).get()
+        assert not result2[0]
+        assert not result2[1]
+
     def test_nan_arithmetic(self):
         def make_nan_contaminated_vector(size):
             shape = (size,)
@@ -602,7 +616,7 @@ class TestGPUArray:
         any_array = np.any(array)
         any_array_gpu = array_gpu.any().get()
 
-        np.testing.assert_array_equal(any_array_gpu, any_array)
+        np.testing.assert_allclose(any_array_gpu, any_array, rtol=1e-4)
         assert any_array_gpu.dtype == any_array.dtype
 
     @pytest.mark.parametrize("dtype", [np.int32, np.bool, np.float32, np.float64])
@@ -809,6 +823,15 @@ class TestGPUArray:
         assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0
         assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0
 
+        a = np.array(True)
+        b = np.array(False)
+        a_gpu = gpuarray.to_gpu(a)
+        b_gpu = gpuarray.to_gpu(b)
+
+        result = np.minimum(a, b)
+        result_ref = gpuarray.minimum(a_gpu, b_gpu).get()
+        np.testing.assert_allclose(result, result_ref, rtol=1e-6)
+
     def test_take_put(self):
         for n in [5, 17, 333]:
             one_field_size = 8