diff --git a/doc/driver.rst b/doc/driver.rst
index 6e6d65f520d8abf07e151d1f333a658c3648209e..d0a9a759901d98ffe43d167c6ea741ae3125cc1d 100644
--- a/doc/driver.rst
+++ b/doc/driver.rst
@@ -684,6 +684,10 @@ Devices and Contexts
 
       Return the total amount of memory on the device in bytes.
 
+   .. method:: trim_graph_memory()
+
+      Frees unused memory that was cached on the device for use with graphs.
+
    .. method:: get_attribute(attr)
 
       Return the (numeric) value of the attribute *attr*, which may be one of the
@@ -705,6 +709,10 @@
 
       Also make the newly-created context the current context.
 
+   .. method:: get_current()
+
+      Returns the currently active :class:`Context`.
+
    .. method:: retain_primary_context()
 
       Return the :class:`Context` obtained by retaining the device's
@@ -895,6 +903,511 @@ Concurrency and Streams
 
    .. versionadded: 2011.2
 
+CUDAGraphs
+----------
+
+CUDA 10.0 and above
+
+Launching a simple kernel using CUDAGraphs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. literalinclude:: ../examples/cudagraph_kernel.py
+
+.. class:: CUDANode
+
+   An object representing a node on a :class:`Graph`.
+
+   Wraps `cuGraphNode `_
+
+   .. method:: __eq__()
+
+.. class:: KernelNodeParams(args, func=None, grid=(), block=(), shared_mem_bytes=0)
+
+   An object storing kernel node launch parameters.
+
+   .. attribute:: args
+
+      A :class:`list` containing positional C arguments to the kernel.
+      Refer to :meth:`Graph.add_kernel_node` for more details.
+
+   .. attribute:: func
+
+      A :class:`Function` object specifying the kernel function.
+
+   .. attribute:: grid
+
+      A :class:`tuple` of up to three integer entries specifying the number
+      of thread blocks to launch, as a multi-dimensional grid.
+
+   .. attribute:: block
+
+      A :class:`tuple` of up to three integer entries specifying the
+      dimensions of each thread block.
+
+   .. attribute:: shared_mem_bytes
+
+      An :class:`int` specifying the size of shared memory in bytes.
+
+.. class:: Graph(flags=0)
+
+   A CUDA graph is a data dependency graph meant to
+   serve as an alternative to :class:`Stream`.
+
+   .. attribute:: flags
+
+      An :class:`int` specifying the graph creation flags.
+
+   Wraps `cuGraph `_
+
+   .. method:: free()
+
+      Releases the graph and its device memory now instead of when
+      this object becomes unreachable. Any further use of the
+      object is an error and will lead to undefined behavior.
+
+   .. method:: debug_dot_print(path)
+
+      Writes a DOT file describing the graph structure to the specified path.
+
+      :arg path: String specifying the path for saving the DOT file.
+
+   .. method:: clone()
+
+      Returns a copy of the original graph. All parameters are copied
+      into the cloned graph. The original graph may be modified after
+      this call without affecting the clone.
+
+   .. method:: get_nodes()
+
+      Returns the graph's nodes as a :class:`tuple`.
+
+   .. method:: get_root_nodes()
+
+      Returns the graph's root nodes as a :class:`tuple`.
+
+   .. method:: add_dependencies(from_nodes, to_nodes)
+
+      Adds dependency edges between nodes in *from_nodes*
+      and *to_nodes* at corresponding indices.
+
+      :arg from_nodes: A :class:`list` of :class:`CUDANode` objects specifying dependent nodes.
+
+      :arg to_nodes: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+      .. note::
+
+         *from_nodes* and *to_nodes* must have the same length.
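+
+      For instance, edges can be wired up explicitly between existing nodes
+      (a minimal sketch, using :meth:`add_empty_node`, which is documented
+      below)::
+
+         import pycuda.autoinit  # noqa: F401
+         import pycuda.driver as drv
+
+         g = drv.Graph()
+         a = g.add_empty_node([])
+         b = g.add_empty_node([])
+         # b may only run once a has completed
+         g.add_dependencies([a], [b])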
+
+   .. method:: remove_dependencies(from_nodes, to_nodes)
+
+      Removes dependency edges between nodes in *from_nodes*
+      and *to_nodes* at corresponding indices.
+
+      :arg from_nodes: A :class:`list` of :class:`CUDANode` objects specifying dependent nodes.
+
+      :arg to_nodes: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+      .. note::
+
+         *from_nodes* and *to_nodes* must have the same length.
+
+   .. method:: get_dependencies(user_node)
+
+      Returns a node's dependencies as a :class:`tuple`.
+
+      :arg user_node: A :class:`CUDANode` object whose dependencies are queried.
+
+   .. method:: get_dependent_nodes(user_node)
+
+      Returns a node's dependent nodes as a :class:`tuple`.
+
+      :arg user_node: A :class:`CUDANode` object whose dependent nodes are queried.
+
+   .. method:: add_empty_node(dependencies=[])
+
+      Returns and adds a :class:`CUDANode` object specifying an
+      empty node to the graph.
+
+      Will be placed at the root of the graph if
+      dependencies are not specified.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+   .. method:: add_child_graph_node(child_graph, dependencies=[])
+
+      Returns and adds a :class:`CUDANode` specifying a
+      child graph node to the graph.
+
+      Will be placed at the root of the graph if
+      dependencies are not specified.
+
+      :arg child_graph: A :class:`Graph` object specifying the child graph to be embedded.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+   .. method:: get_child_graph(child_graph_node)
+
+      Returns a :class:`Graph` object specifying the
+      embedded graph of a child graph node.
+
+      :arg child_graph_node: A :class:`CUDANode` object returned by :meth:`Graph.add_child_graph_node`.
+
+   .. method:: add_event_record_node(event, dependencies=[])
+
+      Returns and adds a :class:`CUDANode` object specifying an
+      event record node to the graph.
+
+      Will be placed at the root of the graph if dependencies
+      are not specified.
+
+      :arg event: An :class:`Event` object specifying the record event.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+   .. method:: get_event_from_event_record_node(event_record_node)
+
+      Returns an :class:`Event` object specifying the event
+      associated with an event record node.
+
+      :arg event_record_node: A :class:`CUDANode` object returned by :meth:`Graph.add_event_record_node`.
+
+   .. method:: set_record_node_event(event_record_node, event)
+
+      Sets an event record node's event.
+
+      :arg event_record_node: A :class:`CUDANode` object returned by :meth:`Graph.add_event_record_node`.
+
+      :arg event: An :class:`Event` object specifying the record event.
+
+   .. method:: add_event_wait_node(event, dependencies=[])
+
+      Returns and adds a :class:`CUDANode` object
+      specifying an event wait node to the graph.
+
+      Will be placed at the root of the graph if dependencies
+      are not specified.
+
+      :arg event: An :class:`Event` object specifying the wait event.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+   .. method:: get_event_from_event_wait_node(event_wait_node)
+
+      Returns an :class:`Event` object specifying the
+      event associated with an event wait node.
+
+      :arg event_wait_node: A :class:`CUDANode` object returned by :meth:`Graph.add_event_wait_node`.
+
+   .. method:: set_wait_node_event(event_wait_node, event)
+
+      Sets an event wait node's event.
+
+      :arg event_wait_node: A :class:`CUDANode` object returned by :meth:`Graph.add_event_wait_node`.
+
+      :arg event: An :class:`Event` object specifying the wait event.
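+
+   For instance, an event can be recorded at one point in a graph and waited
+   on at another (a minimal sketch)::
+
+      import pycuda.autoinit  # noqa: F401
+      import pycuda.driver as drv
+
+      g = drv.Graph()
+      ev = drv.Event()
+      record = g.add_event_record_node(ev, [])
+      # nodes depending on this wait node only run once ev has been recorded
+      wait = g.add_event_wait_node(ev, [record])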
+
+   .. method:: add_mem_alloc_node(size, dependencies=[])
+
+      Adds a memory allocation node to the graph and returns a :class:`tuple`
+      consisting of the new :class:`CUDANode` and a :class:`DeviceAllocation`.
+
+      Will be placed at the root of the graph if dependencies
+      are not specified.
+
+      Requires a post-Volta architecture.
+
+      :arg size: An :class:`int` specifying the size in bytes of the allocation.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+   .. method:: add_memfree_node(input_ptr, dependencies=[])
+
+      Returns and adds a :class:`CUDANode` object specifying a
+      memory free node to the graph.
+
+      Can only free an address returned by :meth:`Graph.add_mem_alloc_node`.
+
+      Will be placed at the root of the graph if dependencies
+      are not specified.
+
+      :arg input_ptr: A :class:`DeviceAllocation` specifying the address to be freed.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+   .. method:: add_kernel_node(*args, func, block, grid=(1, ), dependencies=[], shared_mem_bytes=0)
+
+      Returns and adds a :class:`CUDANode` object specifying a
+      kernel node to the graph.
+
+      Will be placed at the root of the graph if dependencies
+      are not specified.
+
+      :arg args: *arg1* through *argn* are the positional C arguments to the kernel.
+         See :meth:`Function.__call__` for more argument details.
+
+      :arg func: A :class:`Function` object specifying the kernel function.
+
+      :arg block: A :class:`tuple` of up to three integer entries specifying the
+         dimensions of each thread block.
+
+      :arg grid: A :class:`tuple` of up to three integer entries specifying the number
+         of thread blocks to launch, as a multi-dimensional grid.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+      :arg shared_mem_bytes: An :class:`int` specifying the size of shared memory in bytes.
+
+   .. method:: set_kernel_node_params(*args, kernel_node, func=None, block=(), grid=(), shared_mem_bytes=0)
+
+      Sets a kernel node's parameters. Refer to :meth:`add_kernel_node` for argument specifications.
+
+   .. method:: get_kernel_node_params(kernel_node)
+
+      Returns kernel node params as a :class:`tuple` consisting of
+      a :class:`Function` object specifying the kernel function,
+      a :class:`tuple` object specifying the grid configuration
+      and a :class:`tuple` object specifying the block configuration.
+
+      :arg kernel_node: A :class:`CUDANode` object returned by :meth:`Graph.add_kernel_node`.
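+
+   For instance, a kernel node's arguments can be replaced after the node has
+   been created (a minimal sketch, assuming *doublify* and *a_gpu* are set up
+   as in the example at the top of this section and *b_gpu* is another
+   device allocation)::
+
+      node = g.add_kernel_node(a_gpu, func=doublify, block=(4, 4, 1))
+      # re-point the node at a different buffer; block and grid are kept
+      g.set_kernel_node_params(b_gpu, kernel_node=node)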
+
+   .. method:: add_memcpy_htod_node(dest, src, size, dependencies=[], ctx=None, src_offset=0, dest_offset=0)
+
+      Returns and adds a :class:`CUDANode` object specifying a
+      memcpy_htod node to the graph.
+
+      Will be placed at the root of the graph if dependencies
+      are not specified.
+
+      :arg dest: A :class:`DeviceAllocation` specifying the destination address.
+
+      :arg src: A :class:`numpy.ndarray` object specifying the source address.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+      :arg size: An :class:`int` specifying the size of the copy in bytes.
+
+      :arg ctx: A :class:`Context` specifying the device context.
+         Calls :meth:`Device.get_current` if *ctx* is None.
+
+      :arg src_offset: An :class:`int` specifying the source offset in bytes.
+
+      :arg dest_offset: An :class:`int` specifying the destination offset in bytes.
+
+   .. method:: set_memcpy_htod_node_params(memcpy_node, dest, src, size, src_offset=0, dest_offset=0)
+
+      Sets the parameters of *memcpy_node*. Refer to :meth:`Graph.add_memcpy_htod_node` for details.
+
+   .. method:: get_memcpy_htod_node_params(memcpy_node)
+
+      Returns memcpy_htod params as a :class:`tuple` consisting of
+      a :class:`DeviceAllocation` object specifying the destination address,
+      a :class:`numpy.ndarray` object specifying the source address,
+      an :class:`int` specifying the size of the copy in bytes,
+      an :class:`int` specifying the source offset in bytes and
+      an :class:`int` specifying the destination offset in bytes.
+
+      :arg memcpy_node: A :class:`CUDANode` object returned by :meth:`Graph.add_memcpy_htod_node`.
+
+   .. method:: add_memcpy_dtod_node(dest, src, size, dependencies=[], ctx=None, src_offset=0, dest_offset=0)
+
+      Returns and adds a :class:`CUDANode` object specifying a
+      memcpy_dtod node to the graph.
+
+      Will be placed at the root of the graph if dependencies
+      are not specified.
+
+      :arg dest: A :class:`DeviceAllocation` specifying the destination address.
+
+      :arg src: A :class:`DeviceAllocation` specifying the source address.
+
+      :arg size: An :class:`int` specifying the size of the copy in bytes.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+      :arg ctx: A :class:`Context` specifying the device context.
+         Calls :meth:`Device.get_current` if *ctx* is None.
+
+      :arg src_offset: An :class:`int` specifying the source offset in bytes.
+
+      :arg dest_offset: An :class:`int` specifying the destination offset in bytes.
+
+   .. method:: set_memcpy_dtod_node_params(memcpy_node, dest, src, size, src_offset=0, dest_offset=0)
+
+      Sets the parameters of *memcpy_node*. Refer to :meth:`Graph.add_memcpy_dtod_node` for details.
+
+   .. method:: get_memcpy_dtod_node_params(memcpy_node)
+
+      Returns memcpy_dtod params as a :class:`tuple` consisting of
+      a :class:`DeviceAllocation` object specifying the destination address,
+      a :class:`DeviceAllocation` object specifying the source address,
+      an :class:`int` specifying the size of the copy in bytes,
+      an :class:`int` specifying the source offset in bytes and
+      an :class:`int` specifying the destination offset in bytes.
+
+      :arg memcpy_node: A :class:`CUDANode` object returned by :meth:`Graph.add_memcpy_dtod_node`.
+
+   .. method:: add_memcpy_dtoh_node(dest, src, size, dependencies=[], ctx=None, src_offset=0, dest_offset=0)
+
+      Returns and adds a :class:`CUDANode` object specifying a
+      memcpy_dtoh node to the graph.
+
+      Will be placed at the root of the graph if dependencies
+      are not specified.
+
+      :arg dest: A :class:`numpy.ndarray` object specifying the destination address.
+
+      :arg src: A :class:`DeviceAllocation` specifying the source address.
+
+      :arg size: An :class:`int` specifying the size of the copy in bytes.
+
+      :arg dependencies: A :class:`list` of :class:`CUDANode` objects specifying dependency nodes.
+
+      :arg ctx: A :class:`Context` specifying the device context.
+         Calls :meth:`Device.get_current` if *ctx* is None.
+
+      :arg src_offset: An :class:`int` specifying the source offset in bytes.
+
+      :arg dest_offset: An :class:`int` specifying the destination offset in bytes.
+
+   .. method:: set_memcpy_dtoh_node_params(memcpy_node, dest, src, size, src_offset=0, dest_offset=0)
+
+      Sets the parameters of *memcpy_node*. Refer to :meth:`Graph.add_memcpy_dtoh_node` for details.
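+
+   For instance, a host-to-device copy, a device-to-device copy and a
+   device-to-host copy can be chained (a minimal sketch, assuming *a* is a
+   :class:`numpy.ndarray` and *a_gpu*, *b_gpu* are device allocations of
+   *a.nbytes* bytes)::
+
+      h2d = g.add_memcpy_htod_node(a_gpu, a, a.nbytes)
+      d2d = g.add_memcpy_dtod_node(b_gpu, a_gpu, a.nbytes, [h2d])
+      d2h = g.add_memcpy_dtoh_node(a, b_gpu, a.nbytes, [d2d])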
+
+   .. method:: get_memcpy_dtoh_node_params(memcpy_node)
+
+      Returns memcpy_dtoh params as a :class:`tuple` consisting of
+      a :class:`numpy.ndarray` object specifying the destination address,
+      a :class:`DeviceAllocation` object specifying the source address,
+      an :class:`int` specifying the size of the copy in bytes,
+      an :class:`int` specifying the source offset in bytes and
+      an :class:`int` specifying the destination offset in bytes.
+
+      :arg memcpy_node: A :class:`CUDANode` object returned by :meth:`Graph.add_memcpy_dtoh_node`.
+
+   .. method:: add_memset_node(ptr, value, width, dependencies=[], height=1, pitch=1, ctx=None)
+
+      Returns and adds a :class:`CUDANode` object specifying a
+      memset node to the graph.
+
+      Will be placed at the root of the graph if dependencies
+      are not specified.
+
+      :arg ptr: A :class:`DeviceAllocation` specifying the memset address.
+
+      :arg value: An :class:`int` specifying the value to be set.
+
+      :arg width: An :class:`int` specifying the width of the row in elements.
+
+      :arg height: An :class:`int` specifying the number of rows.
+
+      :arg pitch: An :class:`int` specifying the pitch of the device pointer.
+
+      :arg ctx: A :class:`Context` specifying the device context.
+         Calls :meth:`Device.get_current` if *ctx* is None.
+
+   .. method:: set_memset_node_params(memset_node, ptr, value, width=1, height=1, pitch=1)
+
+      Sets the parameters of *memset_node*. Refer to :meth:`Graph.add_memset_node` for details.
+
+   .. method:: get_memset_node_params(memset_node)
+
+      Returns memset params as a :class:`tuple` consisting of
+      a :class:`DeviceAllocation` object specifying the destination address,
+      an :class:`int` specifying the value to be set,
+      an :class:`int` specifying the element size in bytes,
+      an :class:`int` specifying the width of the row in elements,
+      an :class:`int` specifying the number of rows and
+      an :class:`int` specifying the pitch of the device pointer.
+
+      :arg memset_node: A :class:`CUDANode` object returned by :meth:`Graph.add_memset_node`.
+
+   .. method:: get_exec_graph()
+
+      Instantiates and returns a :class:`GraphExec` object for this graph.
+
+.. class:: GraphExec(graph)
+
+   An executable graph to be launched on a stream.
+
+   Wraps `cuGraphExec `_
+
+   .. method:: launch(stream_py=None)
+
+      Launches the executable graph on a stream.
+
+      :arg stream_py: A :class:`Stream` object specifying the device stream.
+         Will use the default stream if *stream_py* is None.
+
+   .. method:: upload(stream_py=None)
+
+      Uploads the executable graph to a stream
+      without executing it.
+
+      :arg stream_py: A :class:`Stream` object specifying the device stream.
+         Will use the default stream if *stream_py* is None.
+
+   .. method:: set_kernel_node_params(*args, kernel_node, func=None, block=(), grid=(), shared_mem_bytes=0)
+
+      Sets the parameters of *kernel_node*. Similar to :meth:`Graph.set_kernel_node_params`.
+
+   .. method:: batched_set_kernel_node_arguments(kernel_dict)
+
+      Sets parameters for multiple kernel nodes at once.
+
+      :arg kernel_dict: A :class:`dict` mapping :class:`CUDANode` kernel nodes
+         to :class:`KernelNodeParams` values.
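+
+      For instance, several kernel nodes can be re-pointed in a single call
+      (a minimal sketch, assuming *node_a* and *node_b* were returned by
+      :meth:`Graph.add_kernel_node`, *g_exec* by :meth:`Graph.get_exec_graph`,
+      and *a_gpu*, *b_gpu* are device allocations)::
+
+         import pycuda.driver as drv
+
+         g_exec.batched_set_kernel_node_arguments({
+             node_a: drv.KernelNodeParams(args=[a_gpu]),
+             node_b: drv.KernelNodeParams(args=[b_gpu]),
+         })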
+
+   .. method:: set_memcpy_htod_node_params(memcpy_node, dest, src, size, ctx=None, src_offset=0, dest_offset=0)
+
+      Sets the parameters of *memcpy_node*. Refer to :meth:`Graph.add_memcpy_htod_node` for details.
+
+   .. method:: set_memcpy_dtod_node_params(memcpy_node, dest=0, src=0, size=0, src_offset=0, dest_offset=0, ctx=None)
+
+      Sets the parameters of *memcpy_node*. Arguments left at their defaults
+      keep the node's current values. Refer to :meth:`Graph.add_memcpy_dtod_node` for details.
+
+   .. method:: set_memcpy_dtoh_node_params(memcpy_node, dest, src, size, ctx=None, src_offset=0, dest_offset=0)
+
+      Sets the parameters of *memcpy_node*. Refer to :meth:`Graph.add_memcpy_dtoh_node` for details.
+
+   .. method:: set_memset_node_params(memset_node, ptr, value, width, height=1, pitch=1, ctx=None)
+
+      Sets the parameters of *memset_node*. Refer to :meth:`Graph.add_memset_node` for details.
+
+   .. method:: set_record_node_event(event_record_node, event)
+
+      Sets an event record node's event. Similar to :meth:`Graph.set_record_node_event`.
+
+   .. method:: set_wait_node_event(event_wait_node, event)
+
+      Sets an event wait node's event. Similar to :meth:`Graph.set_wait_node_event`.
+
+   .. method:: update_child_graph_node(child_node, child_graph)
+
+      Updates the graph embedded in a child graph node of this executable graph.
+
+      :arg child_node: A :class:`CUDANode` object returned by :meth:`Graph.add_child_graph_node`.
+
+      :arg child_graph: A :class:`Graph` object specifying the child graph to be embedded.
+
+.. function:: begin_capture(stream_py)
+
+   Begins graph stream capture on a stream.
+
+   When a stream is in capture mode, all operations pushed into the stream
+   will not be executed, but will instead be captured into a graph.
+
+   :arg stream_py: A :class:`Stream` object specifying the stream to capture on.
+
+.. function:: end_capture(stream_py)
+
+   Ends stream capture and returns a :class:`Graph` object.
+
+   :arg stream_py: A :class:`Stream` object specifying the stream being captured.
+
+
+Creating graph using Stream Capture
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. literalinclude:: ../examples/cudagraph_streamcapture.py
+
 Memory
 ------
diff --git a/examples/cudagraph_kernel.py b/examples/cudagraph_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..49df66a1f773431f7417f38d6952a12604926103
--- /dev/null
+++ b/examples/cudagraph_kernel.py
@@ -0,0 +1,27 @@
+import numpy as np
+import pycuda.autoinit  # noqa: F401
+import pycuda.driver as drv
+from pycuda.compiler import SourceModule
+
+g = drv.Graph()
+mod = SourceModule("""
+    __global__ void doublify(float *a)
+    {
+      int idx = threadIdx.x + threadIdx.y*4;
+      a[idx] *= 2;
+    }
+    """)
+
+doublify = mod.get_function("doublify")
+
+a = np.random.randn(4, 4).astype(np.float32)
+a_doubled = np.empty_like(a)
+a_gpu = drv.mem_alloc(a.nbytes)
+
+memcpy_htod_node = g.add_memcpy_htod_node(a_gpu, a, a.nbytes)
+kernel_node = g.add_kernel_node(a_gpu, func=doublify, block=(4, 4, 1), dependencies=[memcpy_htod_node])
+memcpy_dtoh_node = g.add_memcpy_dtoh_node(a_doubled, a_gpu, a.nbytes, [kernel_node])
+g_exec = drv.GraphExec(g)
+g_exec.launch()
+
+np.testing.assert_allclose(a_doubled, a * 2, rtol=1e-5)
diff --git a/examples/cudagraph_streamcapture.py b/examples/cudagraph_streamcapture.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bebf1b404d16489799ac860de565d16097af96f
--- /dev/null
+++ b/examples/cudagraph_streamcapture.py
@@ -0,0 +1,60 @@
+# borrowed from https://github.com/inducer/pycuda/pull/343/files#diff-8a50e6f7173b100e6323e9433913e982831b7f440eb6ca48cd8c339ad6c24383R1226
+import numpy as np
+import pycuda.autoinit  # noqa: F401
+import pycuda.driver as drv
+from pycuda.compiler import SourceModule
+
+mod = SourceModule("""
+    __global__ void plus(float *a)
+    {
+      int idx = threadIdx.x + threadIdx.y*4;
+      a[idx] += 1;
+    }
+    __global__ void times(float *a, float *b)
+    {
+      int idx = threadIdx.x + threadIdx.y*4;
+      a[idx] *= b[idx];
+    }
+    """)
+
+func_plus = mod.get_function("plus")
+func_times = mod.get_function("times")
+
+a = np.zeros((4, 4)).astype(np.float32)
+a_gpu = drv.mem_alloc_like(a)
+b = np.zeros((4, 4)).astype(np.float32)
+b_gpu = drv.mem_alloc_like(b)
+result = np.zeros_like(b)
+
+# begin graph capture, pull stream_2 into it as a dependency
+# https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cross-stream-dependencies
+stream_1 = drv.Stream()
+stream_2 = drv.Stream()
+drv.begin_capture(stream_1)
+event_init = drv.Event()
+event_a = drv.Event()
+event_b = drv.Event()
+
+event_init.record(stream_1)
+stream_2.wait_for_event(event_init)
+
+drv.memcpy_htod_async(a_gpu, a, stream_1)
+func_plus(a_gpu, block=(4, 4, 1), stream=stream_1)
+event_a.record(stream_1)
+
+drv.memcpy_htod_async(b_gpu, b, stream_2)
+func_plus(b_gpu, block=(4, 4, 1), stream=stream_2)
+event_b.record(stream_2)
+
+stream_1.wait_for_event(event_a)
+stream_1.wait_for_event(event_b)
+func_times(a_gpu, b_gpu, block=(4, 4, 1), stream=stream_1)
+drv.memcpy_dtoh_async(result, a_gpu, stream_1)
+
+graph = drv.end_capture(stream_1)
+instance = drv.GraphExec(graph)
+
+# using a separate graph stream to launch; this is not strictly necessary
+stream_graph = drv.Stream()
+instance.launch(stream_graph)
+
+np.testing.assert_allclose(result, (a + 1)*(b + 1), rtol=1e-5)
diff --git a/pycuda/driver.py b/pycuda/driver.py
index 4bce8347a486d853ab989b3181b8e77053ae0fd7..066e58a9e9f2f029deb8a11cb8992f8f9385f52d 100644
--- a/pycuda/driver.py
+++ b/pycuda/driver.py
@@ -5,6 +5,8 @@ Copyright 2021 NVIDIA Corporation
 import os
 
 import numpy as np
+from typing import Dict, Tuple, List, Optional
+from dataclasses import dataclass
 
 # {{{ add cuda lib dir to Python DLL path
 
@@ -160,6 +162,62 @@
 class InOut(In, Out):
     pass
 
 
+@dataclass(frozen=True, eq=True)
+class KernelNodeParams:
+    args: List[int]
+    func: Optional[Function] = None
+    grid: Tuple[int, ...] = ()
+    block: Tuple[int, ...] = ()
+    shared_mem_bytes: int = 0
+
+
+def _build_arg_buf(args):
+    handlers = []
+
+    arg_data = []
+    format = ""
+    for i, arg in enumerate(args):
+        if isinstance(arg, np.number):
+            arg_data.append(arg)
+            format += arg.dtype.char
+        elif isinstance(arg, int):
+            arg_data.append(arg)
+            format += "P"
+        elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
+            arg_data.append(int(arg))
+            format += "P"
+        elif isinstance(arg, ArgumentHandler):
+            handlers.append(arg)
+            arg_data.append(int(arg.get_device_alloc()))
+            format += "P"
+        elif isinstance(arg, np.ndarray):
+            if isinstance(arg.base, ManagedAllocationOrStub):
+                arg_data.append(int(arg.base))
+                format += "P"
+            else:
+                arg_data.append(arg)
+                format += "%ds" % arg.nbytes
+        elif isinstance(arg, np.void):
+            arg_data.append(_my_bytes(_memoryview(arg)))
+            format += "%ds" % arg.itemsize
+        else:
+            cai = getattr(arg, "__cuda_array_interface__", None)
+            if cai:
+                arg_data.append(cai["data"][0])
+                format += "P"
+                continue
+
+            try:
+                gpudata = np.uintp(arg.gpudata)
+            except AttributeError:
+                raise TypeError("invalid type on parameter #%d (0-based)" % i)
+            else:
+                # for gpuarrays
+                arg_data.append(int(gpudata))
+                format += "P"
+
+    from pycuda._pvt_struct import pack
+
+    return handlers, pack(format, *arg_data)
+
+
 def _add_functionality():
     def device_get_attributes(dev):
         result = {}
@@ -187,52 +245,6 @@
     def device___getattr__(dev, name):
         return dev.get_attribute(getattr(device_attribute, name.upper()))
 
-    def _build_arg_buf(args):
-        handlers = []
-
-        arg_data = []
-        format = ""
-        for i, arg in enumerate(args):
-            if isinstance(arg, np.number):
-                arg_data.append(arg)
-                format += arg.dtype.char
-            elif isinstance(arg, (DeviceAllocation, PooledDeviceAllocation)):
-                arg_data.append(int(arg))
-                format += "P"
-            elif isinstance(arg, ArgumentHandler):
-                handlers.append(arg)
-                arg_data.append(int(arg.get_device_alloc()))
-                format += "P"
-            elif isinstance(arg, np.ndarray):
-                if isinstance(arg.base, ManagedAllocationOrStub):
-                    arg_data.append(int(arg.base))
-                    format += "P"
-                else:
-                    arg_data.append(arg)
-                    format += "%ds" % arg.nbytes
-            elif isinstance(arg, np.void):
-                arg_data.append(_my_bytes(_memoryview(arg)))
-                format += "%ds" % arg.itemsize
-            else:
-                cai = getattr(arg, "__cuda_array_interface__", None)
-                if cai:
-                    arg_data.append(cai["data"][0])
-                    format += "P"
-                    continue
-
-                try:
-                    gpudata = np.uintp(arg.gpudata)
-                except AttributeError:
-                    raise TypeError("invalid type on parameter #%d (0-based)" % i)
-                else:
-                    # for gpuarrays
-                    arg_data.append(int(gpudata))
-                    format += "P"
-
-        from pycuda._pvt_struct import pack
-
-        return handlers, pack(format, *arg_data)
-
     # {{{ pre-CUDA 4 call interface (stateful)
 
     def function_param_set_pre_v4(func, *args):
@@ -711,6 +723,67 @@
 _add_functionality()
 
 
+def monkeypatch_cudagraph():
+
+    def cudagraph_add_kernel_node_call(graph, *args, func, block, grid=(1, ),
+            dependencies=[], shared_mem_bytes=0):
+        if func is None:
+            raise ValueError("must specify func")
+        if block is None:
+            raise ValueError("must specify block size")
+
+        _, arg_buf = _build_arg_buf(args)
+        kernel_node = graph._add_kernel_node(
+                func=func, block=block, grid=grid, arg_buf=arg_buf,
+                dependencies=dependencies, shared_mem_bytes=shared_mem_bytes)
+        return kernel_node
+
+    def cudagraph_set_kernel_node_call(graph, *args, kernel_node, func=None,
+            block=(), grid=(), shared_mem_bytes=0):
+        _, arg_buf = _build_arg_buf(args)
+        return graph._set_kernel_node_params(
+                kernel_node=kernel_node, func=func, block=block, grid=grid,
+                arg_buf=arg_buf, shared_mem_bytes=shared_mem_bytes)
+
+    def cudagraph_mem_alloc_call(graph, size, dependencies=[]):
+        if size is None:
+            raise ValueError("must specify size")
+        node_buffer = graph._add_mem_alloc_node(size, dependencies)
+        return node_buffer.get_node(), node_buffer.get_device_allocation()
+
+    def exec_graph_batched_set_kernel_node_arguments(
+            exec_graph: GraphExec,
+            kernel_dict: "Dict[CUDANode, KernelNodeParams]"):
+        arg_buf_list = []
+        kernel_node_list = []
+        func_list = []
+        block_list = []
+        grid_list = []
+        shared_mem_bytes_list = []
+        # note: for each optional field, either every node must supply a
+        # value or none may, since the lists below are matched up by index
+        for kernel_node, kernel_params in kernel_dict.items():
+            kernel_node_list.append(kernel_node)
+            _, arg_buf = _build_arg_buf(kernel_params.args)
+            arg_buf_list.append(arg_buf)
+            if kernel_params.func:
+                func_list.append(kernel_params.func)
+            if kernel_params.block:
+                block_list.append(kernel_params.block)
+            if kernel_params.grid:
+                grid_list.append(kernel_params.grid)
+            if kernel_params.shared_mem_bytes:
+                shared_mem_bytes_list.append(kernel_params.shared_mem_bytes)
+        return exec_graph._batched_set_kernel_node_arguments(
+                kernel_node_list=kernel_node_list, func_list=func_list,
+                block_list=block_list, grid_list=grid_list,
+                arg_buf_list=arg_buf_list,
+                shared_mem_bytes_list=shared_mem_bytes_list)
+
+    def exec_graph_set_kernel_node_call(exec_graph, *args, kernel_node,
+            func=None, block=(), grid=(), shared_mem_bytes=0):
+        _, arg_buf = _build_arg_buf(args)
+        return exec_graph._set_kernel_node_params(
+                kernel_node=kernel_node, func=func, block=block, grid=grid,
+                arg_buf=arg_buf, shared_mem_bytes=shared_mem_bytes)
+
+    def exec_graph_set_memcpy_dtd_node_call(exec_graph, memcpy_node, dest=0,
+            src=0, size=0, src_offset=0, dest_offset=0, ctx=None):
+        return exec_graph._set_memcpy_dtod_node_params(
+                memcpy_node, dest, src, size, src_offset, dest_offset, ctx)
+
+    Graph.add_kernel_node = cudagraph_add_kernel_node_call
+    Graph.set_kernel_node_params = cudagraph_set_kernel_node_call
+    Graph.add_mem_alloc_node = cudagraph_mem_alloc_call
+    GraphExec.batched_set_kernel_node_arguments = exec_graph_batched_set_kernel_node_arguments
+    GraphExec.set_kernel_node_params = exec_graph_set_kernel_node_call
+    GraphExec.set_memcpy_dtod_node_params = exec_graph_set_memcpy_dtd_node_call
+
+
+monkeypatch_cudagraph()
+
 # {{{ pagelocked numpy arrays
diff --git a/setup.py b/setup.py
index b655b20921d8847af54e8366c1b1856735c16f12..993b2c58bf36e181dee84d805267de436a2e4e08 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,7 @@ def get_config_schema():
         IncludeDir("CUDA", None),
         Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"),
         Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"),
+        Switch("CUDA_ENABLE_CUDAGRAPH", True, "Enable CUDAGRAPH library"),
         LibraryDir("CUDADRV", default_lib_dirs),
         Libraries("CUDADRV", ["cuda"]),
         LibraryDir("CUDART", default_lib_dirs),
@@ -168,6 +169,10 @@ def main():
     for lib_dir in conf["CUDADRV_LIB_DIR"]:
         conf["LDFLAGS"].extend(["-Xlinker", "-rpath", "-Xlinker", lib_dir])
 
+    if conf["CUDA_ENABLE_CUDAGRAPH"]:
+        EXTRA_SOURCES.append("src/wrapper/wrap_cudagraph.cpp")
+        EXTRA_DEFINES["HAVE_CUDAGRAPH"] = 1
+
     if conf["CUDA_ENABLE_GL"]:
         EXTRA_SOURCES.append("src/wrapper/wrap_cudagl.cpp")
         EXTRA_DEFINES["HAVE_GL"] = 1
diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp
index 18079ab485e24be4f55a388fcd62365d695ac623..ccf9e52318066c9f0e09d91f4de3ca0f8733deeb 100644
--- a/src/cpp/cuda.hpp
+++ b/src/cpp/cuda.hpp
@@ -32,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -467,6 +469,10 @@ namespace pycuda
       return result;
     }
 
+    void trim_graph_memory()
+    {
+      CUDAPP_CALL_GUARDED(cuDeviceGraphMemTrim, (m_device));
+    }
+
     bool operator==(const device &other) const
     {
       return m_device == other.m_device;
@@ -1345,6 +1352,35 @@ namespace pycuda
 
   // }}}
 
+  inline
+  void preprocess_grid_block(py::tuple block_dim_py, unsigned int block_dim[],
+      py::tuple grid_dim_py, unsigned int grid_dim[],
+      const unsigned axis_count)
+  {
+    for (unsigned i = 0; i < axis_count; ++i)
+    {
+      grid_dim[i] = 1;
+      block_dim[i] = 1;
+    }
+
+    pycuda_size_t gd_length = py::len(grid_dim_py);
+    if (gd_length > axis_count)
+      throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+          "too many grid dimensions in kernel launch");
+
+    for (unsigned i = 0; i < gd_length; ++i)
+      grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);
+
+    pycuda_size_t bd_length = py::len(block_dim_py);
+    if (bd_length > axis_count)
+      throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
+          "too many block dimensions in kernel launch");
+
+    for (unsigned i = 0; i < bd_length; ++i)
+      block_dim[i] = py::extract<unsigned>(block_dim_py[i]);
+  }
+
   // {{{ function
 
   class function
@@ -1357,6 +1393,14 @@ namespace pycuda
       : m_function(func), m_symbol(sym)
     { }
 
+    CUfunction handle()
+    {
+      return m_function;
+    }
+
+    intptr_t handle_int()
+    {
+      return (intptr_t) m_function;
+    }
+
     void set_block_shape(int x, int y, int z)
     {
       CUDAPP_CALL_GUARDED_WITH_TRACE_INFO(
@@ -1441,34 +1485,13 @@
 
 #if CUDAPP_CUDA_VERSION >= 4000
     void launch_kernel(py::tuple grid_dim_py, py::tuple block_dim_py,
-        py::object parameter_buffer,
-        unsigned shared_mem_bytes, py::object stream_py)
-    {
+        py::object parameter_buffer, unsigned shared_mem_bytes, py::object stream_py)
+    {
       const unsigned axis_count = 3;
-      unsigned grid_dim[axis_count];
       unsigned block_dim[axis_count];
+      unsigned grid_dim[axis_count];
 
-      for (unsigned i = 0; i < axis_count; ++i)
-      {
-        grid_dim[i] = 1;
-        block_dim[i] = 1;
-      }
-
-      pycuda_size_t gd_length = py::len(grid_dim_py);
-      if (gd_length > axis_count)
-        throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
-            "too many grid dimensions in kernel launch");
-
-      for (unsigned i = 0; i < gd_length; ++i)
-        grid_dim[i] = py::extract<unsigned>(grid_dim_py[i]);
-
-      pycuda_size_t bd_length = py::len(block_dim_py);
-      if (bd_length > axis_count)
-        throw pycuda::error("function::launch_kernel", CUDA_ERROR_INVALID_HANDLE,
-            "too many block dimensions in kernel launch");
-
-      for (unsigned i = 0; i < bd_length; ++i)
-        block_dim[i] = py::extract<unsigned>(block_dim_py[i]);
+      preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count);
 
       PYCUDA_PARSE_STREAM_PY;
@@ -1564,19 +1587,19 @@
   class device_allocation : public boost::noncopyable, public context_dependent
   {
     private:
-      bool m_valid;
+      bool m_managed;
 
     protected:
       CUdeviceptr m_devptr;
 
     public:
       device_allocation(CUdeviceptr devptr)
-        : m_valid(true), m_devptr(devptr)
+        : m_managed(true), m_devptr(devptr)
       { }
 
       void free()
       {
-        if (m_valid)
+        if (m_managed)
         {
           try
@@ -1586,21 +1609,27 @@
           CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(device_allocation);
 
           release_context();
-          m_valid = false;
+          m_managed = false;
         }
-        else
-          throw pycuda::error("device_allocation::free", CUDA_ERROR_INVALID_HANDLE);
       }
 
+      // give up ownership, e.g. when a graph memfree node takes over freeing
+      void set_m_managed()
+      {
+        m_managed = false;
+      }
+
       ~device_allocation()
       {
-        if (m_valid)
+        if (m_managed)
           free();
       }
 
       operator CUdeviceptr() const
       { return m_devptr; }
 
+      CUdeviceptr handle() const
+      { return m_devptr; }
+
       py::object as_buffer(size_t size, size_t offset)
       {
         return py::object(
@@ -2116,16 +2145,23 @@
   {
     private:
       CUevent m_event;
+      bool m_managed;
 
     public:
       event(unsigned int flags=0)
+        : m_managed(true)
       { CUDAPP_CALL_GUARDED(cuEventCreate, (&m_event, flags)); }
 
-      event(CUevent evt)
-        : m_event(evt)
+      event(CUevent evt, bool managed)
+        : m_event(evt), m_managed(managed)
       { }
 
-      ~event()
+      ~event()
+      {
+        free();
+      }
+
+      void free()
+      {
+        if (m_managed)
       {
         try
         {
           scoped_context_activation ca(get_context());
           CUDAPP_CALL_GUARDED_CLEANUP(cuEventDestroy, (m_event));
         }
         CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(event);
+        m_managed = false;
+        release_context();
       }
+      }
 
       event *record(py::object stream_py)
       {
@@ -2143,6 +2182,16 @@
         return this;
       }
 
+      bool operator==(const event& rhs)
+      {
+        return m_event == rhs.m_event;
+      }
+
+      bool operator!=(const event& rhs)
+      {
+        return !(*this == rhs);
+      }
+
       CUevent handle() const
       { return m_event; }
@@ -2204,7 +2253,7 @@
 
 #if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000
   inline
-  event *event_from_ipc_handle(py::object obj)
+  event *event_from_ipc_handle(py::object obj, bool managed)
   {
     if (!PyByteArray_Check(obj.ptr()))
       throw pycuda::error("event_from_ipc_handle", CUDA_ERROR_INVALID_VALUE,
@@ -2218,7 +2267,7 @@
     CUevent evt;
     CUDAPP_CALL_GUARDED(cuIpcOpenEventHandle, (&evt, handle));
 
-    return new event(evt);
+    return new event(evt, managed);
   }
 #endif
 
diff --git a/src/cpp/cudagraph.hpp b/src/cpp/cudagraph.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..284b2c9cfdb6c3ea7addc3213fe75f5ef67e152e
--- /dev/null
+++ b/src/cpp/cudagraph.hpp
@@ -0,0 +1,1018 @@
+#ifndef _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_GRAPH_HPP
+#define _AFJDFJSDFSD_PYCUDA_HEADER_SEEN_CUDA_GRAPH_HPP
+
+#include 
+
+namespace pycuda { namespace cuda_graph {
+
+
+///{{{ KernelNodeParams
+
+  class KernelNodeParams
+  {
+    public:
+      py::list m_arg_list;
+      pycuda::function m_user_func;
+      py::tuple m_grid_tuple_py;
+      py::tuple m_block_tuple_py;
+      py::object m_parameter_buffer;
+      unsigned int m_shared_mem_bytes;
+
+      KernelNodeParams(pycuda::function user_func, py::tuple grid_tuple_py,
+          py::tuple block_tuple_py, py::object parameter_buffer,
+          unsigned int shared_mem_bytes)
+        : m_user_func(user_func), m_grid_tuple_py(grid_tuple_py),
+        m_block_tuple_py(block_tuple_py),
+        m_parameter_buffer(parameter_buffer),
+        m_shared_mem_bytes(shared_mem_bytes)
+      { }
+
+      ~KernelNodeParams() { }
+  };
+
+///}}}
+
+///{{{ MemcpyDtoDNodeParams
+
+  class MemcpyDtoDNodeParams
+  {
+    public:
+      CUdeviceptr m_src;
+      CUdeviceptr m_dest;
+      size_t m_size;
+      size_t m_src_offset;
+      size_t m_dest_offset;
+      CUcontext m_ctx;
+
+      MemcpyDtoDNodeParams(CUdeviceptr src, CUdeviceptr dest,
+          size_t size, size_t src_offset, size_t dest_offset, CUcontext ctx)
+        : m_src(src), m_dest(dest), m_size(size),
+        m_src_offset(src_offset), m_dest_offset(dest_offset), m_ctx(ctx)
+      { }
+
+      ~MemcpyDtoDNodeParams() { }
+  };
+
+///}}}
+
+
+  // CUDAGraph fwd declaration
+  class CUDAGraph;
+
+  // {{{ CUDANode
+
+  class CUDANode
+  {
+    protected:
+      CUgraphNode m_node;
+      KernelNodeParams* m_kernel_params;
+      MemcpyDtoDNodeParams* m_memcpy_dtod_params;
+
+    public:
+      CUDANode(CUgraphNode node)
+        // initialize the cached parameter pointers so that the getters do
+        // not read uninitialized memory for nodes that never set them
+        : m_node(node), m_kernel_params(NULL), m_memcpy_dtod_params(NULL)
+      { }
+
+      ~CUDANode() { }
+
+      void set_kernel_params(pycuda::function user_func, py::tuple grid_tuple_py,
+          py::tuple block_tuple_py, py::object parameter_buffer,
+          unsigned int shared_mem_bytes)
+      {
+        m_kernel_params = new KernelNodeParams(user_func, grid_tuple_py,
+            block_tuple_py, parameter_buffer, shared_mem_bytes);
+      }
+
+      KernelNodeParams* get_kernel_params()
+      {
+        return m_kernel_params;
+      }
+
+      void set_memcpy_dtod_params(CUdeviceptr src, CUdeviceptr dest, size_t size,
+          size_t src_offset, size_t dest_offset, CUcontext ctx)
+      {
+        m_memcpy_dtod_params = new MemcpyDtoDNodeParams(src, dest, size,
+            src_offset, dest_offset, ctx);
+      }
+
+      MemcpyDtoDNodeParams* get_memcpy_dtod_params()
+      {
+        return m_memcpy_dtod_params;
+      }
+
+      bool operator==(const CUDANode& rhs)
+      {
+        return m_node == rhs.m_node;
+      }
+
+      bool operator!=(const CUDANode& rhs)
+      {
+        return !(*this == rhs);
+      }
+
+      CUgraphNode handle() const
+      {
+        return m_node;
+      }
+  };
+
+  //}}}
+
+  template <class T>
+  inline T* begin_ptr(std::vector<T>& v)
+  {
+    return v.empty() ? NULL : &v[0];
+  }
+
+  py::list array_of_nodes_to_list(CUgraphNode* nodes, size_t length)
+  {
+    py::list list_nodes;
+    for (int i = 0; i < length; i++)
+      list_nodes.append(CUDANode(nodes[i]));
+    return list_nodes;
+  }
+
+  std::vector<CUgraphNode> list_to_vector_of_nodes(py::list list_nodes)
+  {
+    std::vector<CUgraphNode> v;
+    for (int i = 0; i < len(list_nodes); i++)
+    {
+      CUDANode* node = boost::python::extract<CUDANode*>(list_nodes[i]);
+      v.push_back(node->handle());
+    }
+    return v;
+  }
+
+  struct node_buffer
+  {
+    protected:
+      CUDANode* m_node;
+      device_allocation* m_da;
+
+    public:
+      node_buffer(CUDANode* node, device_allocation* da)
+        : m_node(node), m_da(da)
+      { }
+
+      CUDANode* get_node()
+      {
+        return m_node;
+      }
+
+      device_allocation* get_device_allocation()
+      {
+        return m_da;
+      }
+  };
+
+  ///{{{ CUDAGraphExec
+
+  class CUDAGraphExec : public context_dependent
+  {
+    protected:
+      CUgraphExec m_exec;
+
+    public:
+      CUDAGraphExec(CUgraphExec exec)
+        : m_exec(exec)
+      { }
+
+      ~CUDAGraphExec()
+      {
+        free();
+      }
+
+      void free()
+      {
+        try
+        {
+          scoped_context_activation ca(get_context());
+          CUDAPP_CALL_GUARDED_CLEANUP(cuGraphExecDestroy, (m_exec));
+        }
+        CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(CUDAGraphExec);
+
+        release_context();
+      }
+
+      CUgraphExec handle()
+      {
+        return m_exec;
+      }
+
+      void launch(py::object stream_py)
+      { // borrowed from https://github.com/inducer/pycuda/pull/343/commits/57ca7e8879e2d1ce73d57232d81af883722eea85
+        PYCUDA_PARSE_STREAM_PY;
+        CUDAPP_CALL_GUARDED(cuGraphLaunch, (m_exec, s_handle));
+      }
+
+      void upload(py::object stream_py)
+      {
+        PYCUDA_PARSE_STREAM_PY;
+        CUDAPP_CALL_GUARDED(cuGraphUpload, (m_exec, s_handle));
+      }
+
+      void batched_set_kernel_node_arguments(py::list kernel_node_list,
+          py::list py_user_func_list,
+          py::list block_dim_py_list,
+          py::list grid_dim_py_list,
+          py::list parameter_buffer_list,
+          py::list shared_mem_list)
+      {
+        for (int i = 0; i < len(kernel_node_list); i++)
+        {
+          CUDANode* kernel_node = boost::python::extract<CUDANode*>(kernel_node_list[i]);
+          py::object py_user_func;
+          py::tuple block_dim_py;
+          py::tuple grid_dim_py;
+          py::object parameter_buffer;
+          unsigned int shared_mem_bytes = 0;
+
+          if (len(py_user_func_list) != 0)
+            py_user_func = boost::python::extract<py::object>(py_user_func_list[i]);
+          if (len(block_dim_py_list) != 0)
+            block_dim_py = boost::python::extract<py::tuple>(block_dim_py_list[i]);
+          if (len(grid_dim_py_list) != 0)
+            grid_dim_py = boost::python::extract<py::tuple>(grid_dim_py_list[i]);
+          if (len(parameter_buffer_list) != 0)
+            parameter_buffer = boost::python::extract<py::object>(parameter_buffer_list[i]);
+          if (len(shared_mem_list) != 0)
+            shared_mem_bytes = boost::python::extract<unsigned int>(shared_mem_list[i]);
+          set_kernel_node_params(kernel_node, py_user_func, block_dim_py,
+              grid_dim_py, parameter_buffer, shared_mem_bytes);
+        }
+      }
+
+      void set_kernel_node_params(CUDANode* kernel_node,
+          py::object py_user_func,
+          py::tuple block_dim_py, py::tuple grid_dim_py,
+          py::object parameter_buffer,
+          unsigned int shared_mem_bytes)
+      {
+        CUDA_KERNEL_NODE_PARAMS nodeParams;
+        const unsigned axis_count = 3;
+        unsigned grid_dim[axis_count];
+        unsigned block_dim[axis_count];
+
+        if (py::len(grid_dim_py) == 0)
+          grid_dim_py = kernel_node->get_kernel_params()->m_grid_tuple_py;
+        if (py::len(block_dim_py) == 0)
+          block_dim_py = kernel_node->get_kernel_params()->m_block_tuple_py;
+
+        pycuda::preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count);
+
+        if (py_user_func.ptr() != Py_None)
+        {
+          pycuda::function* user_func = boost::python::extract<pycuda::function*>(py_user_func);
+          nodeParams.func = user_func->handle();
+        }
+        else
+          nodeParams.func = kernel_node->get_kernel_params()->m_user_func.handle();
+
+        py_buffer_wrapper par_buf_wrapper;
+        if (parameter_buffer.ptr() != Py_None)
+          par_buf_wrapper.get(parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+        else
+          par_buf_wrapper.get(kernel_node->get_kernel_params()->m_parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+        size_t par_len = par_buf_wrapper.m_buf.len;
+
+        void *config[] = {
+          CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast<void *>(par_buf_wrapper.m_buf.buf),
+          CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len,
+          CU_LAUNCH_PARAM_END
+        };
+
+        if (shared_mem_bytes != 0)
+          nodeParams.sharedMemBytes = shared_mem_bytes;
+        else
+          nodeParams.sharedMemBytes = kernel_node->get_kernel_params()->m_shared_mem_bytes;
+        nodeParams.gridDimX = grid_dim[0];
+        nodeParams.gridDimY = grid_dim[1];
+        nodeParams.gridDimZ = grid_dim[2];
+        nodeParams.blockDimX = block_dim[0];
+        nodeParams.blockDimY = block_dim[1];
+        nodeParams.blockDimZ = block_dim[2];
+        nodeParams.kernelParams = 0;
+        nodeParams.extra = config;
+        CUDAPP_CALL_GUARDED(cuGraphExecKernelNodeSetParams, (m_exec, kernel_node->handle(), &nodeParams));
+      }
+
+      void set_memcpy_htod_node_params(CUDANode* memcpy_node,
+          CUdeviceptr dest, py::object src,
+          size_t size, py::object ctx,
+          size_t src_offset, size_t dest_offset)
+      {
+        py_buffer_wrapper buf_wrapper;
+        // the source is only read from, so do not demand a writable buffer
+        buf_wrapper.get(src.ptr(), PyBUF_ANY_CONTIGUOUS);
+        boost::shared_ptr<context> ptr_ctx;
+        if (ctx.ptr() != Py_None)
+          ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx);
+        else
+          ptr_ctx = pycuda::context::current_context();
+        context* Ctx = ptr_ctx.get();
+        // zero-initialize and describe a 1D copy: the copied extent is
+        // WidthInBytes x Height x Depth, so Height and Depth must be 1
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.dstDevice = dest;
+        nodeParams.srcHost = buf_wrapper.m_buf.buf;
+        nodeParams.WidthInBytes = size;
+        nodeParams.srcXInBytes = src_offset;
+        nodeParams.dstXInBytes = dest_offset;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_HOST;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+        CUDAPP_CALL_GUARDED(cuGraphExecMemcpyNodeSetParams, (m_exec, memcpy_node->handle(), &nodeParams, Ctx->handle()));
+      }
+
+      void set_memcpy_dtod_node_params(CUDANode* memcpy_node,
+          CUdeviceptr dest, CUdeviceptr src,
+          size_t size, size_t src_offset, size_t dest_offset,
+          py::object ctx)
+      {
+        boost::shared_ptr<context> ptr_ctx;
+        if (ctx.ptr() != Py_None)
+          ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx);
+        else
+          ptr_ctx = pycuda::context::current_context();
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        // arguments left at 0 keep the values stored when the node was added
+        if (dest != 0)
+          nodeParams.dstDevice = dest;
+        else
+          nodeParams.dstDevice = memcpy_node->get_memcpy_dtod_params()->m_dest;
+
+        if (src != 0)
+          nodeParams.srcDevice = src;
+        else
+          nodeParams.srcDevice = memcpy_node->get_memcpy_dtod_params()->m_src;
+
+        if (size != 0)
+          nodeParams.WidthInBytes = size;
+        else
+          nodeParams.WidthInBytes = memcpy_node->get_memcpy_dtod_params()->m_size;
+
+        if (src_offset != 0)
+          nodeParams.srcXInBytes = src_offset;
+        else
+          nodeParams.srcXInBytes = memcpy_node->get_memcpy_dtod_params()->m_src_offset;
+
+        if (dest_offset != 0)
+          nodeParams.dstXInBytes = dest_offset;
+        else
+          nodeParams.dstXInBytes = memcpy_node->get_memcpy_dtod_params()->m_dest_offset;
+
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+        CUDAPP_CALL_GUARDED(cuGraphExecMemcpyNodeSetParams, (m_exec, memcpy_node->handle(), &nodeParams, ptr_ctx.get()->handle()));
+      }
+
+      void set_memcpy_dtoh_node_params(CUDANode* memcpy_node,
+          py::object dest, CUdeviceptr src,
+          size_t size, py::object ctx,
+          size_t src_offset, size_t dest_offset)
+      {
+        py_buffer_wrapper buf_wrapper;
+        buf_wrapper.get(dest.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
+        boost::shared_ptr<context> ptr_ctx;
+        if (ctx.ptr() != Py_None)
+          ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx);
+        else
+          ptr_ctx = pycuda::context::current_context();
+        context* Ctx = ptr_ctx.get();
+        // zero-initialize and describe a 1D copy (width x 1 x 1)
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.dstHost = buf_wrapper.m_buf.buf;
+        nodeParams.srcDevice = src;
+        nodeParams.WidthInBytes = size;
+        nodeParams.srcXInBytes = src_offset;
+        nodeParams.dstXInBytes = dest_offset;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_HOST;
+        CUDAPP_CALL_GUARDED(cuGraphExecMemcpyNodeSetParams, (m_exec, memcpy_node->handle(), &nodeParams, Ctx->handle()));
+      }
+
+      void set_memset_node_params(CUDANode* memset_node,
+          CUdeviceptr ptr, unsigned int value,
+          size_t element_width, size_t element_height, size_t pitch,
+          py::object ctx)
+      {
+        boost::shared_ptr<context> ptr_ctx;
+        if (ctx.ptr() != Py_None)
+          ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx);
+        else
+          ptr_ctx = pycuda::context::current_context();
+        context* Ctx = ptr_ctx.get();
+        CUDA_MEMSET_NODE_PARAMS nodeParams;
+        nodeParams.dst = ptr;
+        nodeParams.elementSize = sizeof(value);
+        nodeParams.width = element_width;
+        nodeParams.height = element_height;
+        nodeParams.pitch = pitch;
+        nodeParams.value = value;
+        CUDAPP_CALL_GUARDED(cuGraphExecMemsetNodeSetParams, (m_exec, memset_node->handle(), &nodeParams, Ctx->handle()));
+      }
+
+      // void update_child_graph_node(CUDANode* child_node, CUDAGraph* child_graph){
+      //   CUDAPP_CALL_GUARDED(cuGraphExecChildGraphNodeSetParams,(m_exec, child_node->handle(), child_graph->handle()));
+      // }
+  };
+
+  ///}}}
+
+  //{{{ CUDAGraph
+
+  class CUDAGraph : public context_dependent
+  {
+    protected:
+      CUgraph m_graph;
+      py::list m_event_list;
+      bool m_managed;
+
+    public:
+      CUDAGraph(CUgraph graph, bool managed)
+        : m_graph(graph), m_managed(managed)
+      { }
+
+      CUDAGraph(unsigned int flags=0)
+        : m_managed(true)
+      {
+        CUDAPP_CALL_GUARDED(cuGraphCreate, (&m_graph, flags));
+      }
+
+      ~CUDAGraph()
+      {
+        if (m_managed)
+          free();
+      }
+
+      void free()
+      {
+        if (m_managed)
+        {
+          try
+          {
+            scoped_context_activation ca(get_context());
+            CUDAPP_CALL_GUARDED_CLEANUP(cuGraphDestroy, (m_graph));
+          }
+          CUDAPP_CATCH_CLEANUP_ON_DEAD_CONTEXT(CUDAGraph);
+
+          m_managed = false;
+          release_context();
+        }
+      }
+
+      bool operator==(const CUDAGraph& rhs)
+      {
+        return m_graph == rhs.m_graph;
+      }
+
+      bool operator!=(const CUDAGraph& rhs)
+      {
+        return !(*this == rhs);
+      }
+
+      CUgraph handle() const
+      {
+        return m_graph;
+      }
+
+      void debug_dot_print(std::string path)
+      { // borrowed from https://github.com/gfokkema/pycuda/commit/57ca7e8879e2d1ce73d57232d81af883722eea85
+        CUDAPP_CALL_GUARDED(cuGraphDebugDotPrint, (m_graph, path.c_str(), 1<<1))
+      }
+
+      CUDAGraph *clone() const
+      {
+        CUgraph clone;
+        CUDAPP_CALL_GUARDED(cuGraphClone, (&clone, m_graph));
+        return new CUDAGraph(clone, true);
+      }
+
+      CUDANode *add_empty_node(py::list dependencies) const
+      {
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUgraphNode empty_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddEmptyNode, (&empty_node, m_graph, begin_ptr(v), len(dependencies)));
+        return new CUDANode(empty_node);
+      }
+
+      py::tuple get_nodes() const
+      {
+        size_t numNodes = 0;
+        CUDAPP_CALL_GUARDED(cuGraphGetNodes, (m_graph, NULL, &numNodes));
+        if (numNodes == 0)
+          return py::tuple();
+        CUgraphNode* nodes = new CUgraphNode[numNodes];
+        CUDAPP_CALL_GUARDED(cuGraphGetNodes, (m_graph, nodes, &numNodes));
+        py::list list_nodes = array_of_nodes_to_list(nodes, numNodes);
+        delete[] nodes;
+        return py::tuple(list_nodes);
+      }
+
+      py::tuple get_root_nodes() const
+      {
+        size_t numRootNodes = 0;
+        CUDAPP_CALL_GUARDED(cuGraphGetRootNodes, (m_graph, NULL, &numRootNodes));
+        if (numRootNodes == 0)
+          return py::tuple();
+        CUgraphNode* root_nodes = new CUgraphNode[numRootNodes];
+        CUDAPP_CALL_GUARDED(cuGraphGetRootNodes, (m_graph, root_nodes, &numRootNodes));
+        py::list list_root_nodes = array_of_nodes_to_list(root_nodes, numRootNodes);
+        delete[] root_nodes;
+        return py::tuple(list_root_nodes);
+      }
+
+      void add_dependencies(py::list from_nodes, py::list to_nodes)
+      {
+        if (len(from_nodes) != len(to_nodes))
+          throw pycuda::error("Length mismatch", CUDA_ERROR_INVALID_VALUE,
+              "from_nodes and to_nodes must have same list length");
+        std::vector<CUgraphNode> to_v = list_to_vector_of_nodes(to_nodes);
+        std::vector<CUgraphNode> from_v = list_to_vector_of_nodes(from_nodes);
+        CUDAPP_CALL_GUARDED(cuGraphAddDependencies, (m_graph, begin_ptr(from_v), begin_ptr(to_v), len(to_nodes)));
+      }
+
+      void remove_dependencies(py::list from_nodes, py::list to_nodes)
+      {
+        if (len(from_nodes) != len(to_nodes))
+          throw pycuda::error("Length mismatch", CUDA_ERROR_INVALID_VALUE,
+              "from_nodes and to_nodes must have same list length");
+        std::vector<CUgraphNode> to_v = list_to_vector_of_nodes(to_nodes);
+        std::vector<CUgraphNode> from_v = list_to_vector_of_nodes(from_nodes);
+        // check the result like the other wrappers instead of ignoring it
+        CUDAPP_CALL_GUARDED(cuGraphRemoveDependencies, (m_graph, begin_ptr(from_v), begin_ptr(to_v), len(to_nodes)));
+      }
+
+      py::tuple get_dependencies(CUDANode* user_node)
+      {
+        size_t numDependencies = 0;
+        CUgraphNode node = user_node->handle();
+        CUDAPP_CALL_GUARDED(cuGraphNodeGetDependencies, (node, NULL, &numDependencies));
+        if (numDependencies == 0)
+          return py::tuple();
+        CUgraphNode* deps = new CUgraphNode[numDependencies];
+        CUDAPP_CALL_GUARDED(cuGraphNodeGetDependencies, (node, deps, &numDependencies));
+        py::list dependencies = array_of_nodes_to_list(deps, numDependencies);
+        delete[] deps;
+        return py::tuple(dependencies);
+      }
+
+      py::tuple get_dependent_nodes(CUDANode* user_node)
+      {
+        size_t numDependentnodes = 0;
+        CUgraphNode node = user_node->handle();
+        CUDAPP_CALL_GUARDED(cuGraphNodeGetDependentNodes, (node, NULL, &numDependentnodes));
+        if (numDependentnodes == 0)
+          return py::tuple();
+        CUgraphNode* dep_nodes = new CUgraphNode[numDependentnodes];
+        CUDAPP_CALL_GUARDED(cuGraphNodeGetDependentNodes, (node, dep_nodes, &numDependentnodes));
+        py::list dependent_nodes = array_of_nodes_to_list(dep_nodes, numDependentnodes);
+        // use array delete to match the array new above
+        delete[] dep_nodes;
+        return py::tuple(dependent_nodes);
+      }
+
+      CUDANode *add_child_graph_node(CUDAGraph* child_graph, py::list dependencies)
+      {
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUgraphNode child_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddChildGraphNode, (&child_node, m_graph, begin_ptr(v), len(dependencies), child_graph->handle()));
+        return new CUDANode(child_node);
+      }
+
+      CUDAGraph *get_child_graph(CUDANode* child_graph_node)
+      {
+        CUgraph child_graph;
+        CUDAPP_CALL_GUARDED(cuGraphChildGraphNodeGetGraph, (child_graph_node->handle(), &child_graph));
+        return new CUDAGraph(child_graph, false);
+      }
+
+      CUDANode *add_event_record_node(pycuda::event* event, const py::list dependencies)
+      {
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUgraphNode event_record_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddEventRecordNode, (&event_record_node, m_graph, begin_ptr(v), len(dependencies), event->handle()));
+        // making sure that the event object cannot be accidentally deleted by the user
+        m_event_list.append(boost::ref(event));
+        return new CUDANode(event_record_node);
+      }
+
+      pycuda::event *get_event_from_event_record_node(CUDANode* event_record_node)
+      {
+        CUevent event;
+        CUDAPP_CALL_GUARDED(cuGraphEventRecordNodeGetEvent, (event_record_node->handle(), &event));
+        return new pycuda::event(event, false);
+      }
+
+      CUDANode *add_event_wait_node(pycuda::event* event, py::list dependencies)
+      {
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUgraphNode event_wait_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddEventWaitNode, (&event_wait_node, m_graph, begin_ptr(v), len(dependencies), event->handle()));
+        // making sure that the event object cannot be accidentally deleted by the user
+        m_event_list.append(boost::ref(event));
+        return new CUDANode(event_wait_node);
+      }
+
+      pycuda::event *get_event_from_event_wait_node(CUDANode* event_wait_node)
+      {
+        CUevent event;
+        CUDAPP_CALL_GUARDED(cuGraphEventWaitNodeGetEvent, (event_wait_node->handle(), &event));
+        return new pycuda::event(event, false);
+      }
+
+      node_buffer* add_mem_alloc_node(size_t size, py::list dependencies)
+      {
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUDA_MEM_ALLOC_NODE_PARAMS nodeParams;
+        memset(&nodeParams, 0, sizeof(nodeParams));
+
+        nodeParams.bytesize = size;
+        nodeParams.poolProps.location.id = 0;
+        nodeParams.poolProps.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
+        nodeParams.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+
+        CUgraphNode memalloc_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemAllocNode, (&memalloc_node, m_graph,
+              begin_ptr(v), len(dependencies), &nodeParams));
+        // making sure that the device allocation object cannot be accidentally deleted by the user
+        return new node_buffer(new CUDANode(memalloc_node), new device_allocation(nodeParams.dptr));
+      }
+
+      CUDANode* add_memfree_node(device_allocation* da, py::list dependencies)
+      {
+        // the graph's memfree node takes over ownership of the allocation
+        da->set_m_managed();
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUgraphNode memfree_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemFreeNode, (&memfree_node, m_graph,
+              begin_ptr(v), len(dependencies), da->handle()));
+        return new CUDANode(memfree_node);
+      }
+
+      CUDANode* add_kernel_node(pycuda::function &user_func,
+          py::tuple block_dim_py, py::tuple grid_dim_py,
+          py::object parameter_buffer,
+          py::list dependencies,
+          unsigned int shared_mem_bytes)
+      {
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUDA_KERNEL_NODE_PARAMS nodeParams;
+        const unsigned axis_count = 3;
+        unsigned grid_dim[axis_count];
+        unsigned block_dim[axis_count];
+
+        pycuda::preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count);
+
+        py_buffer_wrapper par_buf_wrapper;
+        par_buf_wrapper.get(parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+        size_t par_len = par_buf_wrapper.m_buf.len;
+
+        void *config[] = {
+          CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast<void *>(par_buf_wrapper.m_buf.buf),
+          CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len,
+          CU_LAUNCH_PARAM_END
+        };
+
+        nodeParams.func = user_func.handle();
+        nodeParams.gridDimX = grid_dim[0];
+        nodeParams.gridDimY = grid_dim[1];
+        nodeParams.gridDimZ = grid_dim[2];
+        nodeParams.blockDimX = block_dim[0];
+        nodeParams.blockDimY = block_dim[1];
+        nodeParams.blockDimZ = block_dim[2];
+        nodeParams.kernelParams = 0;
+        nodeParams.extra = config;
+        nodeParams.sharedMemBytes = shared_mem_bytes;
+
+        CUgraphNode kernel_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddKernelNode, (&kernel_node, m_graph,
+              begin_ptr(v), len(dependencies), &nodeParams));
+        CUDANode* user_kernel_node = new CUDANode(kernel_node);
+        user_kernel_node->set_kernel_params(user_func, grid_dim_py, block_dim_py, parameter_buffer, shared_mem_bytes);
+        return user_kernel_node;
+      }
+
+      CUDANode *add_memcpy_htod_node(CUdeviceptr dest, py::object src,
+          size_t size, py::list dependencies,
+          py::object ctx, size_t src_offset, size_t dest_offset)
+      {
+        py_buffer_wrapper buf_wrapper;
+        // the source is only read from, so do not demand a writable buffer
+        buf_wrapper.get(src.ptr(), PyBUF_ANY_CONTIGUOUS);
+        boost::shared_ptr<context> ptr_ctx;
+        if (ctx.ptr() != Py_None)
+          ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx);
+        else
+          ptr_ctx = pycuda::context::current_context();
+        context* Ctx = ptr_ctx.get();
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcHost = buf_wrapper.m_buf.buf;
+        nodeParams.dstDevice = dest;
+        nodeParams.WidthInBytes = size;
+        nodeParams.srcXInBytes = src_offset;
+        nodeParams.dstXInBytes = dest_offset;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_HOST;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+        CUgraphNode memcpy_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemcpyNode, (&memcpy_node, m_graph, begin_ptr(v), len(dependencies), &nodeParams, Ctx->handle()));
+        return new CUDANode(memcpy_node);
+      }
+
+      CUDANode *add_memcpy_dtoh_node(py::object dest, CUdeviceptr src,
+          size_t size, py::list dependencies,
+          py::object ctx, size_t src_offset, size_t dest_offset)
+      {
+        py_buffer_wrapper buf_wrapper;
+        buf_wrapper.get(dest.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
+        boost::shared_ptr<context> ptr_ctx;
+        if (ctx.ptr() != Py_None)
+          ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx);
+        else
+          ptr_ctx = pycuda::context::current_context();
+        context* Ctx = ptr_ctx.get();
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcDevice = src;
+        nodeParams.dstHost = buf_wrapper.m_buf.buf;
+        nodeParams.WidthInBytes = size;
+        nodeParams.srcXInBytes = src_offset;
+        nodeParams.dstXInBytes = dest_offset;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_HOST;
+
+        CUgraphNode memcpy_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemcpyNode, (&memcpy_node, m_graph, begin_ptr(v), len(dependencies), &nodeParams, Ctx->handle()));
+        return new CUDANode(memcpy_node);
+      }
+
+      CUDANode *add_memcpy_dtod_node(CUdeviceptr dest, CUdeviceptr src,
+          size_t size, py::list dependencies,
+          py::object ctx, size_t src_offset, size_t dest_offset)
+      {
+        boost::shared_ptr<context> ptr_ctx;
+        if (ctx.ptr() != Py_None)
+          ptr_ctx = py::extract<boost::shared_ptr<context> >(ctx);
+        else
+          ptr_ctx = pycuda::context::current_context();
+        std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+        CUDA_MEMCPY3D nodeParams = {0};
+        nodeParams.Depth = 1;
+        nodeParams.Height = 1;
+        nodeParams.srcDevice = src;
+        nodeParams.dstDevice = dest;
+        nodeParams.WidthInBytes = size;
+        nodeParams.srcXInBytes = src_offset;
+        nodeParams.dstXInBytes = dest_offset;
+        nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+        nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+
+        CUgraphNode memcpy_node;
+        CUDAPP_CALL_GUARDED(cuGraphAddMemcpyNode, (&memcpy_node, m_graph, begin_ptr(v), len(dependencies), &nodeParams, ptr_ctx.get()->handle()));
+        CUDANode* user_memcpy_node = new CUDANode(memcpy_node);
+        user_memcpy_node->set_memcpy_dtod_params(src, dest, size, src_offset, dest_offset, ptr_ctx.get()->handle());
+        return user_memcpy_node;
+      }
+      }
+      context* Ctx = ptr_ctx.get();
+      std::vector<CUgraphNode> v = list_to_vector_of_nodes(dependencies);
+      CUDA_MEMSET_NODE_PARAMS nodeParams = {0};
+      nodeParams.dst = userPtr;
+      nodeParams.value = value;
+      nodeParams.elementSize = sizeof(value);
+      nodeParams.width = element_width;
+      nodeParams.height = element_height;
+      nodeParams.pitch = pitch;
+      CUgraphNode memset_node;
+      CUDAPP_CALL_GUARDED(cuGraphAddMemsetNode,(&memset_node, m_graph, begin_ptr(v), len(dependencies), &nodeParams, Ctx->handle()));
+      return new CUDANode(memset_node);
+    }
+
+    void set_kernel_node_params(CUDANode* kernel_node,
+        py::object py_user_func,
+        py::tuple block_dim_py, py::tuple grid_dim_py,
+        py::object parameter_buffer,
+        unsigned int shared_mem_bytes){
+
+      CUDA_KERNEL_NODE_PARAMS nodeParams;
+      const unsigned axis_count = 3;
+      unsigned grid_dim[axis_count];
+      unsigned block_dim[axis_count];
+
+      // For every argument that was not supplied, fall back to the
+      // parameters stored on the node when it was created.
+      if (py::len(grid_dim_py) == 0){
+        grid_dim_py = kernel_node->get_kernel_params()->m_grid_tuple_py;
+      }
+      if (py::len(block_dim_py) == 0){
+        block_dim_py = kernel_node->get_kernel_params()->m_block_tuple_py;
+      }
+
+      pycuda::preprocess_grid_block(block_dim_py, block_dim, grid_dim_py, grid_dim, axis_count);
+
+      if (py_user_func.ptr() != Py_None){
+        pycuda::function* user_func = boost::python::extract<pycuda::function*>(py_user_func);
+        nodeParams.func = user_func->handle();
+      }
+      else{
+        nodeParams.func = kernel_node->get_kernel_params()->m_user_func.handle();
+      }
+
+      py_buffer_wrapper par_buf_wrapper;
+      if (parameter_buffer.ptr() != Py_None){
+        par_buf_wrapper.get(parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+      }
+      else {
+        par_buf_wrapper.get(kernel_node->get_kernel_params()->m_parameter_buffer.ptr(), PyBUF_ANY_CONTIGUOUS);
+      }
+      size_t par_len = par_buf_wrapper.m_buf.len;
+
+      void *config[] = {
+        CU_LAUNCH_PARAM_BUFFER_POINTER, const_cast<void *>(par_buf_wrapper.m_buf.buf),
+        CU_LAUNCH_PARAM_BUFFER_SIZE, &par_len,
+        CU_LAUNCH_PARAM_END
+      };
+
+      if (shared_mem_bytes != 0){
+        nodeParams.sharedMemBytes = shared_mem_bytes;
+      }
+      else{
+        nodeParams.sharedMemBytes = kernel_node->get_kernel_params()->m_shared_mem_bytes;
+      }
+      nodeParams.gridDimX = grid_dim[0];
+      nodeParams.gridDimY = grid_dim[1];
+      nodeParams.gridDimZ = grid_dim[2];
+      nodeParams.blockDimX = block_dim[0];
+      nodeParams.blockDimY = block_dim[1];
+      nodeParams.blockDimZ = block_dim[2];
+      nodeParams.kernelParams = 0;
+      nodeParams.extra = config;
+      CUDAPP_CALL_GUARDED(cuGraphKernelNodeSetParams,(kernel_node->handle(), &nodeParams));
+    }
+
+    void set_memcpy_htod_node_params(CUDANode* memcpy_node, CUdeviceptr dest, py::object src,
+        size_t size, size_t src_offset, size_t dest_offset){
+
+      CUgraphNode Memcpy_node = memcpy_node->handle();
+      py_buffer_wrapper buf_wrapper;
+      // As in add_memcpy_htod_node, the source is only read.
+      buf_wrapper.get(src.ptr(), PyBUF_ANY_CONTIGUOUS);
+
+      CUDA_MEMCPY3D nodeParams = {0};
+      nodeParams.Depth = 1;
+      nodeParams.Height = 1;
+      nodeParams.srcHost = buf_wrapper.m_buf.buf;
+      nodeParams.dstDevice = dest;
+      nodeParams.WidthInBytes = size;
+      nodeParams.srcXInBytes = src_offset;
+      nodeParams.dstXInBytes = dest_offset;
+      nodeParams.srcMemoryType = CU_MEMORYTYPE_HOST;
+      nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+      CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeSetParams,(Memcpy_node, &nodeParams));
+    }
+
+    void set_memcpy_dtoh_node_params(CUDANode* memcpy_node, py::object dest, CUdeviceptr src,
+        size_t size, size_t src_offset, size_t dest_offset){
+
+      CUgraphNode Memcpy_node = memcpy_node->handle();
+      py_buffer_wrapper buf_wrapper;
+      buf_wrapper.get(dest.ptr(), PyBUF_ANY_CONTIGUOUS | PyBUF_WRITABLE);
+
+      CUDA_MEMCPY3D nodeParams = {0};
+      nodeParams.Depth = 1;
+      nodeParams.Height = 1;
+      nodeParams.srcDevice = src;
+      nodeParams.dstHost = buf_wrapper.m_buf.buf;
+      nodeParams.WidthInBytes = size;
+      nodeParams.srcXInBytes = src_offset;
+      nodeParams.dstXInBytes = dest_offset;
+      nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+      nodeParams.dstMemoryType = CU_MEMORYTYPE_HOST;
+      CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeSetParams,(Memcpy_node, &nodeParams));
+    }
+
+    void set_memcpy_dtod_node_params(CUDANode* memcpy_node, CUdeviceptr dest, CUdeviceptr src,
+        size_t size, size_t src_offset, size_t dest_offset){
+
+      CUgraphNode Memcpy_node = memcpy_node->handle();
+      CUDA_MEMCPY3D nodeParams = {0};
+      nodeParams.Depth = 1;
+      nodeParams.Height = 1;
+      nodeParams.srcDevice = src;
+      nodeParams.dstDevice = dest;
+      nodeParams.WidthInBytes = size;
+      nodeParams.srcXInBytes = src_offset;
+      nodeParams.dstXInBytes = dest_offset;
+      nodeParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+      nodeParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+      CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeSetParams,(Memcpy_node, &nodeParams));
+    }
+
+
+    void set_memset_node_params(CUDANode* memset_node, CUdeviceptr Ptr,
+        unsigned int value,
+        size_t width, size_t height, size_t pitch){
+
+      CUgraphNode Memset_node = memset_node->handle();
+      CUDA_MEMSET_NODE_PARAMS nodeParams;
+      nodeParams.dst = Ptr;
+      nodeParams.elementSize = sizeof(value);
+      nodeParams.width = width;
+      nodeParams.height = height;
+      nodeParams.pitch = pitch;
+      nodeParams.value = value;
+      CUDAPP_CALL_GUARDED(cuGraphMemsetNodeSetParams,(Memset_node, &nodeParams));
+    }
+
+    py::tuple get_kernel_node_params(CUDANode* kernel_node){
+      CUgraphNode Kernel_node = kernel_node->handle();
+      CUDA_KERNEL_NODE_PARAMS nodeParams;
+
+      CUDAPP_CALL_GUARDED(cuGraphKernelNodeGetParams,(Kernel_node, &nodeParams));
+
+      pycuda::function* func = new pycuda::function(nodeParams.func, "function");
+      py::tuple grid_dim_py = py::make_tuple(nodeParams.gridDimX,
+          nodeParams.gridDimY,
+          nodeParams.gridDimZ);
+      py::tuple block_dim_py = py::make_tuple(nodeParams.blockDimX,
+          nodeParams.blockDimY,
+          nodeParams.blockDimZ);
+      return py::make_tuple(func, grid_dim_py, block_dim_py);
+    }
+
+    py::tuple get_memset_node_params(CUDANode* memset_node){
+      CUgraphNode Memset_node = memset_node->handle();
+      CUDA_MEMSET_NODE_PARAMS nodeParams;
+
+      CUDAPP_CALL_GUARDED(cuGraphMemsetNodeGetParams,(Memset_node, &nodeParams));
+
+      return py::make_tuple(nodeParams.dst, nodeParams.value, nodeParams.elementSize, nodeParams.width, nodeParams.height, nodeParams.pitch);
+    }
+
+    py::tuple get_memcpy_htod_node_params(CUDANode* memcpy_node){
+      CUgraphNode Memcpy_node = memcpy_node->handle();
+      CUDA_MEMCPY3D nodeParams;
+
+      CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeGetParams,(Memcpy_node, &nodeParams));
+
+      return py::make_tuple(nodeParams.dstDevice, (CUdeviceptr)nodeParams.srcHost, nodeParams.WidthInBytes, nodeParams.srcXInBytes, nodeParams.dstXInBytes);
+    }
+
+    py::tuple get_memcpy_dtoh_node_params(CUDANode* memcpy_node){
+      CUgraphNode Memcpy_node = memcpy_node->handle();
+      CUDA_MEMCPY3D nodeParams;
+
+      CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeGetParams,(Memcpy_node, &nodeParams));
+
+      return py::make_tuple((CUdeviceptr)nodeParams.dstHost, nodeParams.srcDevice, nodeParams.WidthInBytes, nodeParams.srcXInBytes, nodeParams.dstXInBytes);
+    }
+
+    py::tuple get_memcpy_dtod_node_params(CUDANode* memcpy_node){
+      CUgraphNode Memcpy_node = memcpy_node->handle();
+      CUDA_MEMCPY3D nodeParams;
+
+      CUDAPP_CALL_GUARDED(cuGraphMemcpyNodeGetParams,(Memcpy_node, &nodeParams));
+
+      return py::make_tuple(nodeParams.dstDevice, nodeParams.srcDevice, nodeParams.WidthInBytes, nodeParams.srcXInBytes, nodeParams.dstXInBytes);
+    }
+
+    CUDAGraphExec *get_exec_graph()
+    {
+      CUgraphExec exec;
+      // Flag 1 is CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH: memory from
+      // alloc nodes is freed automatically before the graph is relaunched.
+      CUDAPP_CALL_GUARDED(cuGraphInstantiateWithFlags, (&exec, m_graph, 1))
+      return new CUDAGraphExec(exec);
+    }
+  };
+
+
+//{{{ stream capture
+//borrowed from https://github.com/inducer/pycuda/pull/343/files
+inline void begin_capture(py::object stream_py, CUstreamCaptureMode mode = CU_STREAM_CAPTURE_MODE_GLOBAL)
+{
+  PYCUDA_PARSE_STREAM_PY;
+  CUDAPP_CALL_GUARDED(cuStreamBeginCapture, (s_handle, mode));
+}
+
+inline CUDAGraph *end_capture(py::object stream_py)
+{
+  CUgraph graph;
+  PYCUDA_PARSE_STREAM_PY;
+  CUDAPP_CALL_GUARDED(cuStreamEndCapture, (s_handle, &graph))
+  return new CUDAGraph(graph, false);
+}
+//}}}
+
+}}
+
+#endif
+
+// vim: foldmethod=marker

diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index 58581f548076a40bbfb255399723760c70b2de8d..ca8e8b664c7c3f00d23ba3677694b3e12531b243 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -82,7 +82,7 @@ namespace
   class mem_host_register_flags { };
   class mem_peer_register_flags { };
   class array3d_flags { };
-
+  class CUDAGraph {};
   // {{{ "python-aware" wrappers
@@ -654,6 +654,7 @@ namespace
 void pycuda_expose_tools();
 void pycuda_expose_gl();
 void pycuda_expose_curand();
+void pycuda_expose_cuda_graph();
@@ -1155,6 +1156,7 @@ BOOST_PYTHON_MODULE(_driver)
 #endif
     .DEF_SIMPLE_METHOD(compute_capability)
     .DEF_SIMPLE_METHOD(total_memory)
+    .DEF_SIMPLE_METHOD(trim_graph_memory)
     .def("get_attribute", device_get_attribute)
     .def(py::self == py::self)
     .def(py::self != py::self)
@@ -1230,6 +1232,14 @@ BOOST_PYTHON_MODULE(_driver)
   // }}}

   // {{{ stream
+  //borrowed from https://github.com/inducer/pycuda/pull/343/files
+#if CUDAPP_CUDA_VERSION >= 10000
+  py::enum_<CUstreamCaptureMode>("capture_mode")
+    .value("GLOBAL", CU_STREAM_CAPTURE_MODE_GLOBAL)
+    .value("THREAD_LOCAL", CU_STREAM_CAPTURE_MODE_THREAD_LOCAL)
+    .value("RELAXED", CU_STREAM_CAPTURE_MODE_RELAXED)
+    ;
+#endif
   {
     typedef stream cl;
     py::class_<cl, boost::noncopyable, shared_ptr<cl> >
@@ -1304,7 +1314,7 @@ BOOST_PYTHON_MODULE(_driver)
     .def("_param_setf", (void (cl::*)(int, float )) &cl::param_set)
     .def("_param_setv", function_param_setv)
     .DEF_SIMPLE_METHOD(param_set_texref)
-
+    .add_property("handle_int", &cl::handle_int)
     .def("_launch", &cl::launch)
     .def("_launch_grid", &cl::launch_grid,
@@ -1625,13 +1635,15 @@ BOOST_PYTHON_MODULE(_driver)
       ("Event", py::init<py::optional<unsigned int> >(py::arg("flags")))
     .def("record", &cl::record,
         py::arg("stream")=py::object(), py::return_self<>())
+    .def("__eq__", &cl::operator==)
+    .def("__ne__", &cl::operator!=)
     .def("synchronize", &cl::synchronize, py::return_self<>())
     .DEF_SIMPLE_METHOD(query)
     .DEF_SIMPLE_METHOD(time_since)
     .DEF_SIMPLE_METHOD(time_till)
 #if CUDAPP_CUDA_VERSION >= 4010 && PY_VERSION_HEX >= 0x02060000
     .DEF_SIMPLE_METHOD(ipc_handle)
-    .def("from_ipc_handle", event_from_ipc_handle,
+    .def("from_ipc_handle", event_from_ipc_handle, py::arg("managed")=true,
         py::return_value_policy<py::manage_new_object>())
     .staticmethod("from_ipc_handle")
 #endif
@@ -1744,6 +1756,10 @@ BOOST_PYTHON_MODULE(_driver)
 #ifdef HAVE_CURAND
   pycuda_expose_curand();
 #endif
+#if CUDAPP_CUDA_VERSION >= 10000
+pycuda_expose_cuda_graph();
+#endif
+
 }

 // vim: foldmethod=marker
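Taken together, the driver-module additions above expose stream capture to Python. Here is a minimal, illustrative sketch of the intended usage, not part of the patch; it assumes a context from `pycuda.autoinit` and the usual re-export of `_driver` names through `pycuda.driver`:

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as drv

    a = np.ones(16, dtype=np.float32)
    a_gpu = drv.mem_alloc(a.nbytes)

    s = drv.Stream()
    drv.begin_capture(s, drv.capture_mode.GLOBAL)  # work on s is recorded, not run
    drv.memcpy_htod_async(a_gpu, a, s)             # becomes a graph node
    graph = drv.end_capture(s)                     # returns a Graph

    g_exec = graph.get_exec_graph()                # instantiate once,
    g_exec.launch(s)                               # then launch as often as needed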
diff --git a/src/wrapper/wrap_cudagraph.cpp b/src/wrapper/wrap_cudagraph.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..11b279db66a983b528f35cef95278c5ec231f4be
--- /dev/null
+++ b/src/wrapper/wrap_cudagraph.cpp
@@ -0,0 +1,111 @@
+#ifdef _WIN32
+  #include <windows.h>
+#endif
+
+#include <cuda.hpp>
+#include
+
+#include "tools.hpp"
+#include "wrap_helpers.hpp"
+
+using namespace pycuda;
+using namespace pycuda::cuda_graph;
+using boost::shared_ptr;
+
+
+void pycuda_expose_cuda_graph()
+{
+  using py::args;
+
+  // {{{ CUDANode
+  {
+    typedef CUDANode cl;
+    py::class_<cl, boost::noncopyable>
+      ("Node", py::no_init)
+      .def("__eq__", &cl::operator==)
+      .def("__ne__", &cl::operator!=);
+  }
+  // }}}
+
+  // {{{ node_buffer
+  {
+    typedef node_buffer cl;
+    py::class_<cl, boost::noncopyable>
+      ("NodeBuffer", py::no_init)
+      .def("get_node", &cl::get_node, py::return_value_policy<py::manage_new_object>())
+      .def("get_device_allocation", &cl::get_device_allocation, py::return_value_policy<py::manage_new_object>())
+      ;
+  }
+  // }}}
+
+  // {{{ CUDAGraph
+  {
+    typedef CUDAGraph cl;
+    py::class_<cl, boost::noncopyable>
+      ("Graph", py::init<py::optional<unsigned int> >(py::arg("flags")))
+      .def("__eq__", &cl::operator==)
+      .def("__ne__", &cl::operator!=)
+      .def("debug_dot_print", &cl::debug_dot_print)
+      .def("clone", &cl::clone, py::return_value_policy<py::manage_new_object>())
+      .def("add_empty_node", &cl::add_empty_node, py::return_value_policy<py::manage_new_object>(), py::arg("dependencies")=py::list())
+      .def("get_nodes", &cl::get_nodes)
+      .def("get_root_nodes", &cl::get_root_nodes)
+      .def("add_dependencies", &cl::add_dependencies, (py::arg("from_nodes"), py::arg("to_nodes")))
+      .def("remove_dependencies", &cl::remove_dependencies, (py::arg("from_nodes"), py::arg("to_nodes")))
+      .def("get_dependencies", &cl::get_dependencies, py::arg("user_node"))
+      .def("get_dependent_nodes", &cl::get_dependent_nodes, py::arg("user_node"))
+      .def("add_child_graph_node", &cl::add_child_graph_node, py::return_value_policy<py::manage_new_object>(), (py::arg("child_graph"), py::arg("dependencies")=py::list()))
+      .def("get_child_graph", &cl::get_child_graph, py::return_value_policy<py::manage_new_object>(), py::arg("child_graph_node"))
+      .def("add_event_record_node", &cl::add_event_record_node, py::return_value_policy<py::manage_new_object>(), (py::arg("event"), py::arg("dependencies")=py::list()))
+      .def("get_event_from_event_record_node", &cl::get_event_from_event_record_node, py::return_value_policy<py::manage_new_object>(), py::arg("event_record_node"))
+      .def("add_event_wait_node", &cl::add_event_wait_node, py::return_value_policy<py::manage_new_object>(), (py::arg("event"), py::arg("dependencies")=py::list()))
+      .def("get_event_from_event_wait_node", &cl::get_event_from_event_wait_node, py::return_value_policy<py::manage_new_object>(), py::arg("event_wait_node"))
+      .def("_add_mem_alloc_node", &cl::add_mem_alloc_node, py::return_value_policy<py::manage_new_object>(), (py::arg("size"), py::arg("dependencies")=py::list()))
+      .def("add_memfree_node", &cl::add_memfree_node, py::return_value_policy<py::manage_new_object>(), (py::arg("da"), py::arg("dependencies")=py::list()))
+      .def("_add_kernel_node", &cl::add_kernel_node, py::return_value_policy<py::manage_new_object, py::with_custodian_and_ward_postcall<1, 2> >(), (py::arg("func"), py::arg("block"), py::arg("grid"), py::arg("arg_buf"), py::arg("dependencies"), py::arg("shared_mem_bytes")))
+      .def("add_memcpy_htod_node", &cl::add_memcpy_htod_node, py::return_value_policy<py::manage_new_object>(), (py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("dependencies")=py::list(), py::arg("ctx")=py::object(), py::arg("src_offset")=0, py::arg("dest_offset")=0))
+      .def("add_memcpy_dtoh_node", &cl::add_memcpy_dtoh_node, py::return_value_policy<py::manage_new_object>(), (py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("dependencies")=py::list(), py::arg("ctx")=py::object(), py::arg("src_offset")=0, py::arg("dest_offset")=0))
+      .def("add_memcpy_dtod_node", &cl::add_memcpy_dtod_node, py::return_value_policy<py::manage_new_object>(), (py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("dependencies")=py::list(), py::arg("ctx")=py::object(), py::arg("src_offset")=0, py::arg("dest_offset")=0))
+      .def("add_memset_node", &cl::add_memset_node, py::return_value_policy<py::manage_new_object>(), (py::arg("ptr"), py::arg("value"), py::arg("width"), py::arg("dependencies")=py::list(), py::arg("height")=1, py::arg("pitch")=1, py::arg("ctx")=py::object()))
+      .def("_set_kernel_node_params", &cl::set_kernel_node_params, (py::arg("kernel_node"), py::arg("func"), py::arg("block"), py::arg("grid"), py::arg("arg_buf"), py::arg("shared_mem_bytes")))
+      .def("set_memcpy_htod_node_params", &cl::set_memcpy_htod_node_params, (py::arg("memcpy_node"), py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("src_offset")=0, py::arg("dest_offset")=0))
+      .def("set_memcpy_dtoh_node_params", &cl::set_memcpy_dtoh_node_params, (py::arg("memcpy_node"), py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("src_offset")=0, py::arg("dest_offset")=0))
+      .def("set_memcpy_dtod_node_params", &cl::set_memcpy_dtod_node_params, (py::arg("memcpy_node"), py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("src_offset")=0, py::arg("dest_offset")=0))
+      .def("set_memset_node_params", &cl::set_memset_node_params, (py::arg("memset_node"), py::arg("ptr"), py::arg("value"), py::arg("width"), py::arg("height")=1, py::arg("pitch")=1))
+      .def("get_kernel_node_params", &cl::get_kernel_node_params, py::arg("kernel_node"))
+      .def("get_memcpy_htod_node_params", &cl::get_memcpy_htod_node_params, py::arg("memcpy_node"))
+      .def("get_memcpy_dtod_node_params", &cl::get_memcpy_dtod_node_params, py::arg("memcpy_node"))
+      .def("get_memcpy_dtoh_node_params", &cl::get_memcpy_dtoh_node_params, py::arg("memcpy_node"))
+      .def("get_memset_node_params", &cl::get_memset_node_params, py::arg("memset_node"))
+      .def("get_exec_graph", &cl::get_exec_graph, py::return_value_policy<py::manage_new_object, py::with_custodian_and_ward_postcall<0, 1> >());
+  }
+  // }}}
+
+  // {{{ CUDAGraphExec
+  {
+    typedef CUDAGraphExec cl;
+    py::class_<cl, boost::noncopyable>
+      ("GraphExec", py::no_init)
+      .def("launch", &cl::launch, py::arg("stream_py")=py::object())
+      .def("upload", &cl::upload, py::arg("stream_py")=py::object())
+      .def("_set_kernel_node_params", &cl::set_kernel_node_params, (py::arg("kernel_node"), py::arg("func"), py::arg("block"), py::arg("grid"), py::arg("arg_buf"), py::arg("shared_mem_bytes")))
+      .def("_batched_set_kernel_node_arguments", &cl::batched_set_kernel_node_arguments, (py::arg("kernel_node_list"), py::arg("func_list"), py::arg("block_list"), py::arg("grid_list"), py::arg("arg_buf_list"), py::arg("shared_mem_bytes_list")))
+      .def("set_memcpy_htod_node_params", &cl::set_memcpy_htod_node_params, (py::arg("memcpy_node"), py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("ctx")=py::object(), py::arg("src_offset")=0, py::arg("dest_offset")=0))
+      .def("_set_memcpy_dtod_node_params", &cl::set_memcpy_dtod_node_params, (py::arg("memcpy_node"), py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("src_offset"), py::arg("dest_offset"), py::arg("ctx")=py::object()))
+      .def("set_memcpy_dtoh_node_params", &cl::set_memcpy_dtoh_node_params, (py::arg("memcpy_node"), py::arg("dest"), py::arg("src"), py::arg("size"), py::arg("ctx")=py::object(), py::arg("src_offset")=0, py::arg("dest_offset")=0))
+      .def("set_memset_node_params", &cl::set_memset_node_params, (py::arg("memset_node"), py::arg("ptr"), py::arg("value"), py::arg("width"), py::arg("height")=1, py::arg("pitch")=1, py::arg("ctx")=py::object()));
+      //.def("update_child_graph_node", &cl::update_child_graph_node);
+  }
+  // }}}
+
+  // {{{ stream capture
+
+  py::def("begin_capture", begin_capture,
+      py::arg("capture_mode") = CU_STREAM_CAPTURE_MODE_GLOBAL);
+  py::def("end_capture", end_capture,
+      py::return_value_policy<py::manage_new_object>());
+
+  // }}}
+
+}
+
+// vim: foldmethod=marker
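Before the test suite, a small usage sketch of the manual graph-construction surface bound above; the names follow the bindings (`add_empty_node`, `get_root_nodes`, `debug_dot_print`), the DOT file can be rendered with graphviz, and the snippet is illustrative only, not part of the patch:

    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as drv

    g = drv.Graph()
    root = g.add_empty_node()            # no dependencies -> a root node
    leaf = g.add_empty_node([root])      # ordered after root
    assert g.get_root_nodes() == (root,)
    assert g.get_dependencies(leaf) == (root,)

    g.debug_dot_print("graph.dot")       # render with: dot -Tpng graph.dot
    g.get_exec_graph().launch()          # empty nodes make this a no-op launch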
diff --git a/test/test_cudagraph.py b/test/test_cudagraph.py
new file mode 100644
index 0000000000000000000000000000000000000000..6be60c923a2f193788ff53473ebdf457524986dc
--- /dev/null
+++ b/test/test_cudagraph.py
@@ -0,0 +1,203 @@
+__copyright__ = """
+Copyright 2008-2021 Andreas Kloeckner
+Copyright 2021 NVIDIA Corporation
+"""
+
+import numpy as np
+import pytest  # noqa
+
+import pycuda
+import pycuda.autoinit  # noqa: F401
+import pycuda.driver as drv
+from pycuda.compiler import SourceModule
+
+ctx = drv.Context.get_current()
+dev = ctx.get_device()
+# Graph memory (alloc/free) nodes are only exercised on devices known to
+# support them.
+post_volta_devices = ["NVIDIA TITAN V"]
+post_volta_flag = dev.name() in post_volta_devices
+
+
+class TestCUDAGraph:
+    @pytest.mark.parametrize("post_volta_flag", [post_volta_flag])
+    def test_graph_features(self, post_volta_flag):
+
+        g = drv.Graph()
+        assert (() == g.get_nodes())
+        empty_node = g.add_empty_node()
+        assert ((empty_node,) == g.get_nodes())
+
+        clone = g.clone()
+        child_node = g.add_child_graph_node(clone, [])
+        assert (len(clone.get_nodes()) == len(g.get_child_graph(child_node).get_nodes()))
+        g.add_dependencies([child_node], [empty_node])
+        assert ((empty_node, ) == g.get_dependent_nodes(child_node))
+        g.remove_dependencies([child_node], [empty_node])
+        g.add_dependencies([empty_node], [child_node])
+        assert ((empty_node, ) == g.get_dependencies(child_node))
+
+        event = drv.Event()
+        event_record_node = g.add_event_record_node(event, [empty_node])
+        assert ((empty_node, ) == g.get_dependencies(event_record_node))
+        assert ((child_node, event_record_node) == g.get_dependent_nodes(empty_node))
+        assert ((empty_node, ) == g.get_root_nodes())
+
+        g_exec = g.get_exec_graph()
+        g_exec.launch()
+
+    @pytest.mark.parametrize("post_volta_flag", [post_volta_flag])
+    def test_kernel_node(self, post_volta_flag):
+        g = drv.Graph()
+        mod = SourceModule("""
+        __global__ void doublify(float *a)
+        {
+          int idx = threadIdx.x + threadIdx.y*4;
+          a[idx] *= 2;
+        }
+        """)
+
+        doublify = mod.get_function("doublify")
+
+        a = np.random.randn(4, 4).astype(np.float32)
+        a_doubled = np.empty_like(a)
+        if post_volta_flag:
+            mem_alloc_node, a_gpu = g.add_memalloc_node(a.nbytes, [])
+        else:
+            a_gpu = drv.mem_alloc(a.nbytes)
+
+        memcpy_htod_node = g.add_memcpy_htod_node(a_gpu, a, a.nbytes, [mem_alloc_node] if post_volta_flag else [])
+        kernel_node = g.add_kernel_node(a_gpu, func=doublify, block=(4, 4, 1), dependencies=[memcpy_htod_node])
+        memcpy_dtoh_node = g.add_memcpy_dtoh_node(dest=a_doubled, src=a_gpu, size=a.nbytes, dependencies=[kernel_node])
+        if post_volta_flag:
+            g.add_memfree_node(a_gpu, [memcpy_dtoh_node])
+        g_exec = g.get_exec_graph()
+        g_exec.set_kernel_node_params(a_gpu, kernel_node=kernel_node)
+        g_exec.launch()
+        np.testing.assert_allclose(a_doubled, a * 2, rtol=1e-5)
+
+    @pytest.mark.parametrize("post_volta_flag", [post_volta_flag])
+    def test_memcpy_node(self, post_volta_flag):
+
+        g = drv.Graph()
+        a = np.random.randn(32, 8).astype(np.float32)
+        b = np.empty([32, 8], dtype=np.float32)
+        if post_volta_flag:
+            mem_alloc_node_a_gpu, a_gpu = g.add_memalloc_node(a.nbytes, [])
+            mem_alloc_node_b_gpu, b_gpu = g.add_memalloc_node(b.nbytes, [])
+        else:
+            a_gpu = drv.mem_alloc(a.nbytes)
+            b_gpu = drv.mem_alloc(b.nbytes)
+
+        node_htod = g.add_memcpy_htod_node(a_gpu, a, a.nbytes, [mem_alloc_node_a_gpu] if post_volta_flag else [])
+        g.set_memcpy_htod_node_params(node_htod, a_gpu, a, a.nbytes)
+        assert (g.get_memcpy_htod_node_params(node_htod)[2] == a.nbytes)
+
+        node_dtod = g.add_memcpy_dtod_node(a_gpu, a_gpu, a.nbytes, [node_htod, mem_alloc_node_b_gpu] if post_volta_flag else [node_htod])
+        assert (g.get_memcpy_dtod_node_params(node_dtod)[2] == a.nbytes)
+
+        if post_volta_flag:
+            g.add_memfree_node(a_gpu, [node_dtod])
+        node_dtoh = g.add_memcpy_dtoh_node(b, b_gpu, b.nbytes, [node_dtod])
+        g.set_memcpy_dtoh_node_params(node_dtoh, b, b_gpu, b.nbytes)
+        assert (g.get_memcpy_dtoh_node_params(node_dtoh)[2] == a.nbytes)
+        if post_volta_flag:
+            g.add_memfree_node(b_gpu, [node_dtoh])
+        g_exec = g.get_exec_graph()
+        g_exec.set_memcpy_dtod_node_params(memcpy_node=node_dtod, dest=b_gpu, src=a_gpu)
+        g_exec.launch()
+        np.testing.assert_allclose(a, b, rtol=1e-5)
+
+    @pytest.mark.parametrize("post_volta_flag", [post_volta_flag])
+    def test_memset_node(self, post_volta_flag):
+        g = drv.Graph()
+        a = np.empty(10, "int32")
+        a_new = np.empty_like(a)
+        if post_volta_flag:
+            mem_alloc_node_a_gpu, a_gpu = g.add_memalloc_node(a.nbytes, [])
+        else:
+            a_gpu = drv.mem_alloc(a.nbytes)
+        memset_node = g.add_memset_node(dependencies=[mem_alloc_node_a_gpu] if post_volta_flag else [], ptr=a_gpu, value=5, width=10)
+        g.add_memcpy_dtoh_node(dependencies=[memset_node], src=a_gpu, dest=a_new, size=a.nbytes)
+        g_exec = g.get_exec_graph()
+        g_exec.launch()
+        np.testing.assert_allclose(5 * np.ones_like(a).astype("int32"), a_new)
+
+    @pytest.mark.parametrize("post_volta_flag", [post_volta_flag])
+    def test_stream_capture(self, post_volta_flag):
+        # borrowed from https://github.com/inducer/pycuda/pull/343/files#diff-8a50e6f7173b100e6323e9433913e982831b7f440eb6ca48cd8c339ad6c24383R1226
+        # Sample source code from the Tutorial Introduction in the documentation.
+
+        mod = SourceModule("""
+        __global__ void plus(float *a)
+        {
+          int idx = threadIdx.x + threadIdx.y*4;
+          a[idx] += 1;
+        }
+        __global__ void times(float *a, float *b)
+        {
+          int idx = threadIdx.x + threadIdx.y*4;
+          a[idx] *= b[idx];
+        }
+        """)
+        func_plus = mod.get_function("plus")
+        func_times = mod.get_function("times")
+
+        a = np.zeros((4, 4)).astype(np.float32)
+        a_gpu = drv.mem_alloc_like(a)
+        b = np.zeros((4, 4)).astype(np.float32)
+        b_gpu = drv.mem_alloc_like(b)
+        result = np.zeros_like(b)
+
+        # begin graph capture, pull stream_2 into it as a dependency
+        # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cross-stream-dependencies
+        stream_1 = drv.Stream()
+        stream_2 = drv.Stream()
+        drv.begin_capture(stream_1)
+        event_init = drv.Event()
+        event_a = drv.Event()
+        event_b = drv.Event()
+
+        event_init.record(stream_1)
+        stream_2.wait_for_event(event_init)
+
+        drv.memcpy_htod_async(a_gpu, a, stream_1)
+        func_plus(a_gpu, block=(4, 4, 1), stream=stream_1)
+        event_a.record(stream_1)
+
+        drv.memcpy_htod_async(b_gpu, b, stream_2)
+        func_plus(b_gpu, block=(4, 4, 1), stream=stream_2)
+        event_b.record(stream_2)
+
+        stream_1.wait_for_event(event_a)
+        stream_1.wait_for_event(event_b)
+        func_times(a_gpu, b_gpu, block=(4, 4, 1), stream=stream_1)
+        drv.memcpy_dtoh_async(result, a_gpu, stream_1)
+
+        graph = drv.end_capture(stream_1)
+        instance = graph.get_exec_graph()
+
+        # using a separate graph stream to launch, this is not strictly necessary
+        stream_graph = drv.Stream()
+        instance.launch(stream_graph)
+
+        # manual mode
+        g = drv.Graph()
+        if post_volta_flag:
+            mem_alloc_node_a_gpu, a_gpu_m = g.add_memalloc_node(a.nbytes)
+            mem_alloc_node_b_gpu, b_gpu_m = g.add_memalloc_node(b.nbytes)
+        else:
+            a_gpu_m = drv.mem_alloc(a.nbytes)
+            b_gpu_m = drv.mem_alloc(b.nbytes)
+
+        result_m = np.zeros_like(b)
+
+        memcpy_htod_node_a_gpu = g.add_memcpy_htod_node(a_gpu_m, a, a.nbytes, [mem_alloc_node_a_gpu] if post_volta_flag else [])
+        memcpy_htod_node_b_gpu = g.add_memcpy_htod_node(b_gpu_m, b, b.nbytes, [mem_alloc_node_b_gpu] if post_volta_flag else [])
+        kernel_node_a_gpu_m = g.add_kernel_node(a_gpu_m, func=func_plus, block=(4, 4, 1), dependencies=[memcpy_htod_node_a_gpu])
+        kernel_node_b_gpu_m = g.add_kernel_node(b_gpu_m, func=func_plus, block=(4, 4, 1), dependencies=[memcpy_htod_node_b_gpu])
+        kernel_node = g.add_kernel_node(a_gpu_m, b_gpu_m, func=func_times, block=(4, 4, 1), dependencies=[kernel_node_a_gpu_m, kernel_node_b_gpu_m])
+        g.add_memcpy_dtoh_node(result_m, a_gpu_m, a.nbytes, [kernel_node])
+
+        g_exec = g.get_exec_graph()
+        g_exec.launch()
+
+        np.testing.assert_allclose(result, result_m, rtol=1e-5)
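One corner the tests above stop short of is the event round trip. The sketch below is illustrative only and not part of the test suite; it relies on the `Event.__eq__` binding added in `wrap_cudadrv.cpp` to make the equality checks meaningful:

    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as drv

    g = drv.Graph()
    ev = drv.Event()
    rec = g.add_event_record_node(ev)
    wait = g.add_event_wait_node(ev, [rec])

    # each node hands back an Event comparing equal to the one passed in
    assert g.get_event_from_event_record_node(rec) == ev
    assert g.get_event_from_event_wait_node(wait) == ev

    g.get_exec_graph().launch()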