diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 11a78ad2b542cb9b9c69e32b2e821bec42904903..75fc264ffa0a49a4b50a6e501be10b25a6859894 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,9 +1,29 @@ +Python 3 Titan V: + script: + - py_version=3 + - export PYOPENCL_TEST=nvi:titan + - export EXTRA_INSTALL="pyopencl mpi4py jax[cpu]" + - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh + - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" + - ". ./build-and-test-py-project.sh" + tags: + - python3 + - nvidia-titan-v + except: + - tags + artifacts: + reports: + junit: test/pytest.xml + Python 3 POCL: script: - export PY_EXE=python3 - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pyopencl mpi4py jax[cpu]" + - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh + - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - ". ./build-and-test-py-project.sh" tags: - python3 diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml index f5a171e9651e39bf475b13df5f03f10667cfec0f..ecf95ba35d90af785bbeb00430dfa181cf73b985 100644 --- a/.pylintrc-local.yml +++ b/.pylintrc-local.yml @@ -6,6 +6,7 @@ - ply - pygments.lexers - pygments.formatters + - pycuda # https://github.com/PyCQA/pylint/issues/7623 - arg: disable diff --git a/config b/config new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/pytato/__init__.py b/pytato/__init__.py index a040b61564a284cdd54b892860730edde75b93cc..e80c21a304006688caf8364170b0218b5c12ba65 100644 --- a/pytato/__init__.py +++ b/pytato/__init__.py @@ -87,6 +87,7 @@ from pytato.target.loopy.codegen import generate_loopy from pytato.target import Target from pytato.target.loopy import LoopyPyOpenCLTarget from pytato.target.python.jax import generate_jax +from pytato.target.pycuda.cudagraph import generate_cudagraph from pytato.visualization import (get_dot_graph, show_dot_graph, get_ascii_graph, show_ascii_graph, get_dot_graph_from_partition) @@ -127,7 +128,7 @@ __all__ = ( "matmul", "roll", "transpose", "stack", "reshape", "expand_dims", "concatenate", - "generate_loopy", "generate_jax", + "generate_loopy", "generate_jax", "generate_cudagraph", "Target", "LoopyPyOpenCLTarget", diff --git a/pytato/target/pycuda/__init__.py b/pytato/target/pycuda/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5b58223c2d8d447ffe0f5bd108a21833160262a0 --- /dev/null +++ b/pytato/target/pycuda/__init__.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +__copyright__ = """Copyright (C) 2022 Mit Kotak""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +__doc__ = """ +.. currentmodule:: pytato + +.. autofunction:: generate_cudagraph + +.. currentmodule:: pytato.target.python + +.. autoclass:: PythonTarget +.. autoclass:: BoundPythonProgram +.. autoclass:: CUDAGraphTarget +.. autoclass:: BoundCUDAGraphProgram +""" + +from dataclasses import dataclass +from abc import ABC, abstractmethod +from functools import cached_property +from typing import Any, Mapping, FrozenSet, Callable, Dict, Set + +from pytato.target import Target, BoundProgram + + +# {{{ abstract types + +class PythonTarget(Target, ABC): + """ + A target that generates code for a python program, typically by invoking + some :mod:`numpy`-like for the array operations. + """ + + @abstractmethod + def bind_program(self, + program: str, + entrypoint: str, + expected_arguments: FrozenSet[str], + bound_arguments: Mapping[str, Any]) -> BoundProgram: + """ + :arg program: The python code containing the compiled routine. + :arg entrypoint: Name of the entrypoint + """ + pass + + +@dataclass(repr=False, eq=False) +class BoundCUDAGraphProgram(BoundProgram): + """ + A wrapper for executing python programs with bound arguments. + + .. automethod:: __call__ + .. automethod:: copy + .. automethod:: with_transformed_program + """ + expected_arguments: FrozenSet[str] + entrypoint: str + + @cached_property + def _compiled_function(self) -> Callable[..., Any]: + variables_after_execution: Dict[str, Any] = { + "_MODULE_SOURCE_CODE": self.program, # helps pudb + "cuda_allocator": self.cuda_allocator, + "cuda_dev": self.cuda_dev + } + exec(self.program, variables_after_execution) + assert callable(variables_after_execution[self.entrypoint]) + return variables_after_execution[ # type: ignore[no-any-return] + self.entrypoint] + + @cached_property + def _bound_argment_names(self) -> Set[str]: + return set(self.bound_arguments.keys()) + + @cached_property + def _processed_bound_args(self) -> Mapping[str, Any]: + import pycuda.autoinit + import pycuda.gpuarray as gpuarray + import numpy as np + + processed_bound_args = {} + for name, arg in zip(self.bound_arguments.keys(), + self.bound_arguments.values()): + if isinstance(arg, np.ndarray): + processed_bound_args[name] = gpuarray.to_gpu(arg) + elif isinstance(arg, gpuarray.GPUArray): + processed_bound_args[name] = arg + else: + raise ValueError("Array format not supported") + return processed_bound_args + + def __call__(self, cuda_allocator=None, *args: Any, **kwargs: Any) -> Any: + if bool(cuda_allocator): + self.cuda_allocator = cuda_allocator + else: + import pycuda + self.cuda_allocator = pycuda.tools.DeviceMemoryPool().allocate + import pycuda.driver as drv + self.cuda_dev = drv.Context.get_device() + if args: + raise ValueError(f"'{type(self).__call__}' does not take positional" + " arguments.") + + if set(kwargs.keys()) & self._bound_argment_names: + raise ValueError("Got arguments that were previously bound: " + f"'{set(kwargs.keys()) & set(self.bound_arguments.keys())}'.") + updated_kwargs = dict(self._processed_bound_args) + updated_kwargs.update(kwargs) + updated_kwargs = {kw: arg + for kw, arg in updated_kwargs.items() + if kw in self.expected_arguments} + return self._compiled_function(**updated_kwargs) + + def copy(self, **kwargs: Any) -> BoundCUDAGraphProgram: + from dataclasses import replace + return replace(self, **kwargs) + + def with_transformed_program(self, *args: Any, **kwargs: Any + ) -> BoundCUDAGraphProgram: + raise ValueError("Cannot transform python program.") + +# }}} + + +# {{{ cudagraph-numpy target + + +class CUDAGraphTarget(PythonTarget): + """ + A target that generates code for a python program by offloading array + operations to :mod:`cudagraph.cudagraph`. + """ + + @property + def numpy_like_module_name(self) -> str: + return "pycuda.driver" + + @property + def numpy_like_module_name_shorthand(self) -> str: + return "_pt_drv" + + def bind_program(self, + program: str, + entrypoint: str, + expected_arguments: FrozenSet[str], + bound_arguments: Mapping[str, Any]) -> BoundCUDAGraphProgram: + return BoundCUDAGraphProgram(target=self, program=program, + entrypoint=entrypoint, + expected_arguments=expected_arguments, + bound_arguments=bound_arguments) + +# }}} + +# vim: foldmethod=marker diff --git a/pytato/target/pycuda/cudagraph.py b/pytato/target/pycuda/cudagraph.py new file mode 100644 index 0000000000000000000000000000000000000000..373862195ff6c32053a82253f51b8f277aadf8bd --- /dev/null +++ b/pytato/target/pycuda/cudagraph.py @@ -0,0 +1,57 @@ +__copyright__ = """ +Copyright (C) 2022 Mit Kotak +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from typing import Union, Optional, Mapping + +from pytato.array import Array, DictOfNamedArrays +from pytato.target.pycuda import CUDAGraphTarget, BoundCUDAGraphProgram +from pytato.target.pycuda.numpy_like import generate_cudagraph_numpy_like + +__doc__ = """ +.. autofunction:: generate_cudagraph +""" + + +def generate_cudagraph(expr: Union[Array, Mapping[str, Array], DictOfNamedArrays], + *, + target: Optional[CUDAGraphTarget] = None, + function_name: str = "_pt_kernel", + show_code: bool = False, + colorize_show_code: bool = True, + dot_graph_path: Optional[str] = "") -> BoundCUDAGraphProgram: + """ + Returns a :class:`pytato.target.python.BoundCUDAGraphProgram` for the array + expressions in *expr*. + :arg function: Name of the entrypoint function in the generated code. + :arg show_code: If *True*, the generated code is printed to ``stdout``. + """ + if target is None: + target = CUDAGraphTarget() + assert isinstance(target, CUDAGraphTarget) + return generate_cudagraph_numpy_like(expr, + target=target, + function_name=function_name, + show_code=show_code, + colorize_show_code=colorize_show_code, + dot_graph_path=dot_graph_path) diff --git a/pytato/target/pycuda/levels.py b/pytato/target/pycuda/levels.py new file mode 100644 index 0000000000000000000000000000000000000000..1057f729c55fff6cb1ae352169ed851d0b1bf566 --- /dev/null +++ b/pytato/target/pycuda/levels.py @@ -0,0 +1,101 @@ +import ast +import sys +import os +import numpy as np +import loopy as lp + +from typing import (Callable, Union, Optional, Mapping, Dict, TypeVar, + List, Set, Tuple, Any) + +import pymbolic.primitives as prim +from pytools import (UniqueNameGenerator, memoize_method) +from pytools.graph import reverse_graph +from pytato.transform import CachedMapper, ArrayOrNames +from pytato.array import (IndexLambda, DataWrapper, + Placeholder, SizeParam, + Array, DictOfNamedArrays, + DataInterface) +from pytato.scalar_expr import (IdentityMapper, Reduce, + WalkMapper as ScalarWalkMapper) +from pytato.target.pycuda import (CUDAGraphTarget, + BoundCUDAGraphProgram) +from pyrsistent import pmap +from loopy.symbolic import Reduction as LoopyReduction +from pytato.target.loopy.codegen import PYTATO_REDUCTION_TO_LOOPY_REDUCTION +from dataclasses import dataclass + +T = TypeVar("T") + + +class StringMapper(CachedMapper[ArrayOrNames]): + + def __init__(self): + super().__init__() + from bidict import bidict + self.string_map = bidict() + self.dep_map = {} + self.vng = UniqueNameGenerator() + + def map_dict_of_named_arrays(self, expr: DictOfNamedArrays): + from pytato.codegen import normalize_outputs as normalize_outputs + from pytato.target.loopy.codegen import preprocess as preprocess_loopy + result = normalize_outputs(expr) + preproc_result = preprocess_loopy(result, target=lp.CudaTarget()) + for _, subexpr in list(preproc_result.outputs._data.items()): + if not isinstance(subexpr, Placeholder): + expression = self.rec(subexpr) + self.string_map[subexpr] = expression + + + def map_placeholder(self, expr: Placeholder): + placeholder = self.vng("_pt_N") + return placeholder + + def map_index_lambda(self, expr: IndexLambda): + operation = self.vng("_pt_N") + self.string_map[expr] = self.vng("_pt_N") + for _, bnd in list(expr.bindings.items()): + if not isinstance(bnd, Placeholder): + index_lambda = self.rec(bnd) + self.string_map[bnd] = index_lambda + if not index_lambda in self.dep_map: + self.dep_map[index_lambda] = set() + self.dep_map[index_lambda].add(operation) + return operation + + def topological_sort(self): + graph_predecessor = reverse_graph(self.dep_map) + graph_successor = reverse_graph(graph_predecessor) + levels = [] + levels_ops = [] + while (len(graph_predecessor)) > 0: + level_nodes = [] + level_nodes_ops = [] + for key,values in list(graph_predecessor.items()): + if len(values) == 0: + level_nodes.append(key) + if key in self.string_map.inverse: + level_nodes_ops.append(self.string_map.inverse[key]) + nodes_to_remove = graph_successor[key] + for node in nodes_to_remove: + if key in graph_predecessor[node]: + tmp_set = set(graph_predecessor[node]) + tmp_set.remove(key) + graph_predecessor[node] = frozenset(tmp_set) + del graph_predecessor[key] + levels.append(len(level_nodes)) + levels_ops.append(level_nodes_ops) + return levels, levels_ops + +def weight_calculator(levels_ops): + levels_weights = np.zeros(shape=(len(levels_ops))) + for level_i,level_op in enumerate(levels_ops): + level_bytes = 0 + for node in level_op: + assert isinstance(node, IndexLambda) + level_bytes += node.size * np.dtype(node.dtype).itemsize + for key,bnd in node.bindings.items(): + level_bytes += bnd.size * np.dtype(bnd.dtype).itemsize + levels_weights[level_i] = level_bytes + return levels_weights/sum(levels_weights) + diff --git a/pytato/target/pycuda/numpy_like.py b/pytato/target/pycuda/numpy_like.py new file mode 100644 index 0000000000000000000000000000000000000000..79ef85c5dafb8bca5578d32b708ed7f160b03b05 --- /dev/null +++ b/pytato/target/pycuda/numpy_like.py @@ -0,0 +1,823 @@ +__copyright__ = """ +Copyright (C) 2022 Mit Kotak +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import ast +import sys +import os +import numpy as np +import loopy as lp + +from typing import (Callable, Union, Optional, Mapping, Dict, TypeVar, + List, Set, Tuple, Any) + +import pymbolic.primitives as prim +from pytools import (UniqueNameGenerator, memoize_method) +from pytato.transform import CachedMapper, ArrayOrNames +from pytato.array import (IndexLambda, DataWrapper, + Placeholder, SizeParam, + Array, DictOfNamedArrays, + DataInterface) +from pytato.scalar_expr import (IdentityMapper, Reduce, + WalkMapper as ScalarWalkMapper) +from pytato.target.pycuda import (CUDAGraphTarget, + BoundCUDAGraphProgram) +from pyrsistent import pmap +from loopy.symbolic import Reduction as LoopyReduction +from pytato.target.loopy.codegen import PYTATO_REDUCTION_TO_LOOPY_REDUCTION +from dataclasses import dataclass + +T = TypeVar("T") + + +def _can_colorize_output() -> bool: + try: + import pygments # noqa: F401 + return True + except ImportError: + return False + + +def _get_default_colorize_code() -> bool: + return ((not sys.stdout.isatty()) + # https://no-color.org/ + and "NO_COLOR" not in os.environ) + + +def fill_tuple(dim_tuple: Tuple[int, ...]) -> Tuple[int, ...]: + while len(dim_tuple) < 3: + dim_tuple += (1,) + return dim_tuple + + +def _splay(n: int) -> Tuple[Tuple[int, ...], Tuple[int, ...]]: + # heavily modified from cublas + min_threads = 32 + max_threads = 128 + max_blocks = ( + 4 + * 8 + * 64 + ) + + if n < min_threads: + block_count = 1 + threads_per_block = min_threads + elif n < (max_blocks * min_threads): + block_count = (n + min_threads - 1) // min_threads + threads_per_block = min_threads + elif n < (max_blocks * max_threads): + block_count = max_blocks + grp = (n + min_threads - 1) // min_threads + threads_per_block = ((grp + max_blocks - 1) // max_blocks) * min_threads + else: + block_count = max_blocks + threads_per_block = max_threads + + # print "n:%d bc:%d tpb:%d" % (n, block_count, threads_per_block) + return (block_count, 1), (threads_per_block, 1, 1) + + +class _RednVarToBoundRecorder(ScalarWalkMapper): + def __init__(self) -> None: + self.redn_var_to_bounds: Mapping[ + Any, Tuple[int, Union[int, Any, Array]] + ] = {} + super().__init__() + + def map_reduce(self, expr: Reduce) -> None: + for redn_var, (lbound, ubound) in expr.bounds.items(): + assert redn_var not in self.redn_var_to_bounds + self.redn_var_to_bounds[redn_var] = (lbound, ubound) + + +def _get_dim_to_bounds( + expression, + shape + ) -> Mapping[str, Tuple[int, Union[int, Any, Array]]]: + redn_var_to_bound_recorder = _RednVarToBoundRecorder() + redn_var_to_bound_recorder(expression) + return {**redn_var_to_bound_recorder.redn_var_to_bounds, + **{f"_{i}": (0, dim) for i, dim in enumerate(shape)}} + + +def _get_knl_name(expr: IndexLambda) -> str: + from pytato.raising import (index_lambda_to_high_level_op, + BinaryOp, WhereOp, + BroadcastOp, + C99CallOp, + FullOp) + from pytato.diagnostic import (UnknownIndexLambdaExpr) + try: + hlo = index_lambda_to_high_level_op(expr) + if isinstance(hlo, BinaryOp): + knl_string = str(hlo.binary_op)[13:] + \ + "_x1_" + str(hlo.x1.size) + \ + "_x2_" + str(hlo.x2.size) + elif isinstance(hlo, C99CallOp): + from pytato.target.python.numpy_like import _c99_callop_numpy_name + knl_string = str(_c99_callop_numpy_name(hlo)) + elif isinstance(hlo, WhereOp): + knl_string = "where" + elif isinstance(hlo, FullOp): + knl_string = "fill_" + str(int(hlo.fill_value)) + elif isinstance(hlo, BroadcastOp): + knl_string = "broadcast" + else: + knl_string = str(type(hlo)).split(".")[-1][:-2] + except UnknownIndexLambdaExpr: + if isinstance(expr.expr, Reduce): + knl_string = str(type(expr.expr.op)).split(".")[-1][:-2] + elif isinstance(expr.expr, prim.Subscript): + knl_string = "subscript" + elif isinstance(expr.expr, prim.Product): + knl_string = "product" + elif isinstance(expr.expr, prim.If): + knl_string = "if" + else: + knl_string = str(type(expr)).split(".")[-1][:-2] + except Exception: + knl_string = str(type(expr)).split(".")[-1][:-2] + return "knl_" + knl_string.lower() + + +@dataclass(frozen=True, eq=True) +class LoopyArg: + expr: prim.Expression + shape: Tuple[Any, ...] + dtype: np.dtype[Any] + + @property + def size(self): + from pytools import product + return product(self.shape) + + @property + def ndim(self): + return len(self.shape) + + +class CgenMapperAccumulator: + def __init__(self, + buffer_name: str, + dep_nodes: Optional[Tuple[str, ...]] = tuple(), + buffers: Optional[Tuple[str, ...]] = tuple(), + is_allocated: Optional[bool] = False): + + self.buffer_name = buffer_name + self.dep_nodes = dep_nodes + self.is_allocated = is_allocated + self.buffers = buffers + + +class BufferAccumulator: + def __init__(self, + array_name: str, + mapped_array_name: str, + stmt: ast.stmt): + self.array_name = array_name + self.mapped_array_name = mapped_array_name + self.stmt = stmt + + +class NumpyCodegenMapper(CachedMapper[ArrayOrNames]): + """ + .. note:: + - This mapper stores mutable state for building the program. The same + mapper instance must be re-used with care. + """ + def __init__(self, + graph_name: str, + buffer_accumulator: str, + node_accumulator: str, + numpy_backend: str, + vng: Callable[[str], str], + bound_arguments: Dict[str, DataInterface], + extra_preambles: List[ast.stmt], + dot_graph_path: Optional[str] = ""): + super().__init__() + self.graph_name = graph_name + self.buffer_accumulator = buffer_accumulator + self.node_accumulator = node_accumulator + self.numpy_backend = numpy_backend + self.vng = vng + self.bound_arguments = bound_arguments + self.extra_preambles = extra_preambles + self.dot_graph_path = dot_graph_path + self.lines_graph_exec: List[ast.stmt] = [] + self.lines_pt_kernel: List[ast.stmt] = [] + self.lines_output_allocation: List[ast.stmt] = [] + self.allocated_buffers_to_users: Mapping[str, Set[str]] = {} + self.input_args: Mapping[str, Placeholder] = {} + self.kernel_to_kernel_args: Mapping[str, List[str]] = {} + self.temp_input_arrays_to_output_stmt_mapper = {} + self.dict_line: List[ast.stmt] = [] + + @memoize_method + def _add_source_module_line(self, knl_name, lp_arg_acc, bindings) -> Tuple[Any, + Tuple[int, ...], + Tuple[int, ...], + str]: + pt2lp_mapper = PytatoExprToLoopyExprMapper() + lp_expr = pt2lp_mapper(lp_arg_acc.expr) + dim_to_bounds = _get_dim_to_bounds(lp_arg_acc.expr, lp_arg_acc.shape) + import loopy as lp + from pymbolic import var + all_dims = ", ".join(list(dim_to_bounds)) + bounds = " and ".join([f"{lbound} <= {dim} < {ubound}" + for dim, (lbound, ubound) + in list(dim_to_bounds.items())]) + out_var = var("out")[tuple(var(f"_{i}") for i in range(lp_arg_acc.ndim))] + knl = lp.make_kernel( + #FIXME : Need to remove the if condition for null domains + name=knl_name, + domains="{ [%s]: %s }" % (all_dims, bounds) if bool( + dim_to_bounds) else "{:}", + instructions=[lp.Assignment( + out_var, + lp_expr, + within_inames=frozenset( + {f"_{i}" for i in range(lp_arg_acc.ndim)} + ))], + kernel_data=[ + lp.GlobalArg("out", shape=lp_arg_acc.shape, dtype=lp_arg_acc.dtype, + offset=0), + *[lp.GlobalArg(name, shape=bnd[0], dtype=bnd[1], + offset=0) + for name, bnd in list(sorted(bindings.items()))]], + target=lp.PyCudaTarget(), + lang_version=(2018, 2)) + + grid, block = _splay(lp_arg_acc.size * lp_arg_acc.dtype.itemsize) + knl = lp.join_inames(knl, [f"_{i}" for i in range(lp_arg_acc.ndim)], "iout") + knl = lp.split_iname(knl, "iout", np.prod( + grid) * np.prod(block), outer_iname="ibatch") + knl = lp.split_iname(knl, "iout_inner", np.prod( + block), inner_iname="ithread", outer_iname="ithreadblock") + knl = lp.split_iname( + knl, "ithread", block[0], + inner_iname="ithread_x", outer_iname="ithread_y") + knl = lp.split_iname(knl, "ithreadblock", + grid[1], + outer_iname="iblock_x", inner_iname="iblock_y") + knl = lp.tag_inames( + knl, { + "iblock_x": "g.0", + "iblock_y": "g.1", + "ithread_x": "l.0", + "ithread_y": "l.1" + }) + grid, block = knl.default_entrypoint.get_grid_size_upper_bounds_as_exprs( + knl.callables_table) + grid, block = fill_tuple(grid), fill_tuple(block) + knl_string = lp.generate_code_v2(knl).device_code() + mod = self.vng("_pt_mod") + source_mod_line = ast.Assign(targets=[ast.Name(mod)], + value=ast.Call(ast.Name("_pt_SourceModule"), + args=[ast.Constant(knl_string)], + keywords=[])) + self.extra_preambles.append(source_mod_line) + return mod, grid, block + + def map_index_lambda(self, expr: IndexLambda) -> CgenMapperAccumulator: + array_name = self.vng("_pt_array") + array_malloc_node_name = self.vng("_pt_memalloc") + mod = self.vng("_pt_mod") + kernel_call_node_name = self.vng("_pt_kernel") + if expr.size == 0: + return CgenMapperAccumulator(buffer_name=array_name) + + def _get_dependencies( + array_name: str, + kernel_call_node_name: str + ) -> Tuple[Tuple[str, ...], Mapping[str, ast.Name]]: + dependencies = () + buffer_accumulator_lines = [] + buffers_kernel = [array_name] + buffers = [array_name] + buffer_accumulator_lines.append( + ast.Assign( + targets=[ast.Subscript(ast.Name(self.buffer_accumulator), + ast.Constant(array_name))], + value=ast.Name(array_name) + ) + ) + for _, bnd in list(expr.bindings.items()): + rec_bnd = self.rec(bnd) + if rec_bnd.is_allocated: + self.allocated_buffers_to_users[rec_bnd.buffer_name].add( + kernel_call_node_name) + buffer_accumulator_lines.append( + ast.Assign( + targets=[ + ast.Subscript(ast.Name(self.buffer_accumulator), + ast.Constant(rec_bnd.buffer_name))], + value=ast.Name(rec_bnd.buffer_name) + ) + ) + buffers_kernel.append(rec_bnd.buffer_name) + else: + buffers_kernel.append("139712027164672") + buffers.append(rec_bnd.buffer_name) + dependencies += rec_bnd.dep_nodes + + self.kernel_to_kernel_args[kernel_call_node_name] = buffers + return (tuple(set(dependencies)), + buffers, buffers_kernel, + buffer_accumulator_lines) + + output_tuple = _get_dependencies(array_name, kernel_call_node_name) + dependencies = output_tuple[0] + buffers = output_tuple[1] + buffers_kernel = output_tuple[2] + buffer_accumulator_lines = output_tuple[3] + array_memalloc_line = ast.Assign(targets=[ast.Tuple( + elts=[ast.Name(array_malloc_node_name), + ast.Name(array_name)])], + value=ast.Call( + ast.Attribute( + ast.Name(self.graph_name), + "add_memalloc_node"), + args=[], + keywords=[ + ast.keyword( + "size", + ast.Constant( + expr.size + * + expr.dtype.itemsize + )), + ast.keyword( + "dependencies", + ast.List( + elts=[ + ast.Name(dep) + for + dep in + dependencies])) + ])) + self.lines_graph_exec.append(array_memalloc_line) + self.allocated_buffers_to_users[array_name] = set() + self.allocated_buffers_to_users[array_name].add(kernel_call_node_name) + knl_name = _get_knl_name(expr) + lp_arg_acc = LoopyArg(expr.expr, + expr.shape, + expr.dtype) + + import immutables + mod, grid, block = self._add_source_module_line(knl_name, + lp_arg_acc, + immutables.Map( + {name: ( + bnd.shape, bnd.dtype + ) + for name, bnd + in expr.bindings.items() + })) + kernel_call_line = ast.Assign(targets=[ + ast.Name(kernel_call_node_name)], + value=ast.Call(ast.Attribute(ast.Name(self.graph_name), + "add_kernel_node"), + args=[ + ast.Name(buffer_name) + for buffer_name in buffers_kernel + ], + keywords=[ast.keyword( + arg="func", + value=ast.Call( + ast.Attribute( + ast.Name(mod), + "get_function"), + args=[ + ast.Constant(knl_name) + ], + keywords=[])), + ast.keyword( + arg="block", + value=ast.Tuple( + elts=[ + ast.Constant(d) + for d in block + ])), + ast.keyword( + arg="grid", + value=ast.Tuple( + elts=[ + ast.Constant(d) + for d in grid + ])), + ast.keyword( + arg="dependencies", + value=ast.List( + elts=[ + ast.Name( + array_malloc_node_name + )] + [ + ast.Name( + dep + ) + for dep in dependencies + ]))])) + self.lines_graph_exec.append(kernel_call_line) + self.lines_graph_exec += buffer_accumulator_lines + self.lines_graph_exec.append( + ast.Assign( + targets=[ast.Subscript(ast.Name(self.node_accumulator), + ast.Constant(kernel_call_node_name))], + value=ast.Name(kernel_call_node_name) + ) + ) + return CgenMapperAccumulator( + buffer_name=array_name, + dep_nodes=(kernel_call_node_name,), + buffers=buffers, + is_allocated=True) + + def map_placeholder(self, expr: Placeholder) -> CgenMapperAccumulator: + self.input_args[expr.name] = expr + input_buffer_name = self.vng("_pt_buffer") + self.temp_input_arrays_to_output_stmt_mapper[ + input_buffer_name] = ast.Attribute(ast.Name(expr.name), "gpudata") + return CgenMapperAccumulator(buffer_name=input_buffer_name) + + def map_size_param(self, expr: SizeParam) -> str: + # would demand a more complicated BoundProgram implementation. + raise NotImplementedError("SizeParams not yet supported in numpy-targets.") + + def map_dict_of_named_arrays(self, + expr: DictOfNamedArrays) -> CgenMapperAccumulator: + dict_name = self.vng("_pt_tmp") + keys = [] + values = [] + buffer_to_result_mapper = {} + for name, subexpr in list(expr._data.items()): + keys.append(ast.Constant(name)) + rec = self.rec(subexpr) + result_array_name = self.vng("_pt_result") + if bool(rec.buffers): + if rec.buffer_name in buffer_to_result_mapper: + values.append(ast.Name(buffer_to_result_mapper[rec.buffer_name])) + else: + buffer_to_result_mapper[rec.buffer_name] = result_array_name + self.lines_output_allocation.append( + ast.Assign( + targets=[ast.Name(result_array_name)], + value=ast.Call( + ast.Attribute(ast.Name("_pt_gpuarray"), "GPUArray"), + args=[ast.Tuple(elts=[ + ast.Constant(dim) for dim in subexpr.shape + ])], + keywords=[ + ast.keyword( + arg="dtype", + value=ast.Constant(subexpr.dtype.name)), + ast.keyword( + arg="allocator", + value=ast.Name("allocator")), + ast.keyword( + arg="dev", + value=ast.Name("dev"))]))) + values.append(ast.Name(result_array_name)) + self.temp_input_arrays_to_output_stmt_mapper[ + rec.buffers[0]] = ast.Attribute( + ast.Name(result_array_name), "gpudata") + else: + if subexpr.size != 0: + values.append(ast.Name(subexpr.name)) + else: + self.lines_output_allocation.append( + ast.Assign( + targets=[ast.Name(result_array_name)], + value=ast.Call( + ast.Attribute( + ast.Name("_pt_gpuarray"), + "GPUArray" + ), + args=[ + ast.Tuple( + elts=[ + ast.Constant(dim) + for dim in subexpr.shape + ])], + keywords=[ + ast.keyword( + arg="dtype", + value=ast.Constant(subexpr.dtype.name)), + ast.keyword( + arg="allocator", + value=ast.Name("allocator"))]))) + values.append(ast.Name(result_array_name)) + for array_name, dependencies in self.allocated_buffers_to_users.items(): + self.lines_graph_exec.append( + ast.Expr( + ast.Call( + ast.Attribute(ast.Name("_pt_g"), + "add_memfree_node"), + args=[ + ast.Name(array_name), + ast.List( + elts=[ast.Name(dep) for dep in dependencies] + )], + keywords=[]))) + if bool(self.dot_graph_path): + self.lines_graph_exec.append( + ast.Expr( + ast.Call( + ast.Attribute( + ast.Name(self.graph_name), + "show_dot_graph"), + args=[ast.Constant(self.dot_graph_path)], + keywords=[]))) + + self.dict_line = [ast.Assign(targets=[ast.Name(dict_name)], + value=ast.Dict(keys=keys, values=values))] + + return CgenMapperAccumulator(dict_name, ()) + + +class PytatoExprToLoopyExprMapper(IdentityMapper): + + @memoize_method + def call(self, expr): + return super().call(expr) + + def map_reduce(self, expr: Reduce) -> LoopyReduction: + try: + loopy_redn = PYTATO_REDUCTION_TO_LOOPY_REDUCTION[type(expr.op)] + except KeyError: + raise NotImplementedError(expr.op) + return LoopyReduction(loopy_redn, + tuple(list(expr.bounds)), + self.rec(expr.inner_expr)) + + def map_call(self, expr: prim.Call) -> prim.Call: + if isinstance(expr.function, prim.Variable) and ( + expr.function.name.startswith("pytato.c99.")): + name_in_loopy = expr.function.name[11:] + return prim.Call(prim.Variable(name_in_loopy), + self.rec(expr.parameters)) + + return super().map_call(expr) + + def map_subscript(self, expr: prim.Subscript) -> prim.Subscript: + aggregate = self.rec(expr.aggregate) + index = self.rec(expr.index) + if aggregate is expr.aggregate and index is expr.index: + if not bool(expr.index): + return aggregate + else: + return expr + return type(expr)(aggregate, index) + + +def generate_cudagraph_numpy_like(expr: Union[ + Array, + Mapping[str, Array], + DictOfNamedArrays], + target: CUDAGraphTarget, + function_name: str, + show_code: bool, + colorize_show_code: Optional[bool] = None, + dot_graph_path: Optional[str] = "" + ) -> BoundCUDAGraphProgram: + + extra_preambles = [ast.ImportFrom(module="pycuda.driver", + names=[ast.alias( + "KernelNodeParams", + asname="_pt_KernelNodeParams")], + level=0), + ast.ImportFrom(module="pycuda.compiler", + names=[ast.alias( + "SourceModule", + asname="_pt_SourceModule")], + level=0), + ast.ImportFrom(module="pycuda", + names=[ast.alias( + "gpuarray", + asname="_pt_gpuarray")], + level=0), + ast.ImportFrom(module="functools", + names=[ast.alias( + "cache", + asname="")], + level=0)] + import collections + from pytato.transform import InputGatherer + + if ((not isinstance(expr, DictOfNamedArrays)) + and isinstance(expr, collections.abc.Mapping)): + from pytato.array import make_dict_of_named_arrays + expr = make_dict_of_named_arrays(dict(expr)) + + assert isinstance(expr, (Array, DictOfNamedArrays)) + + var_name_gen = UniqueNameGenerator() + + var_name_gen.add_names({input_expr.name + for input_expr in InputGatherer()(expr) + if isinstance(input_expr, + (Placeholder, SizeParam, DataWrapper)) + if input_expr.name is not None}) + + if isinstance(expr, DictOfNamedArrays): + var_name_gen.add_names(expr) + + var_name_gen.add_names({target.numpy_like_module_name_shorthand, + "np", + function_name}) + dict_flag = False + if isinstance(expr, DictOfNamedArrays): + dict_flag = True + from pytato.codegen import normalize_outputs as normalize_outputs + from pytato.target.loopy.codegen import preprocess as preprocess_loopy + result = normalize_outputs(expr) + preproc_result = preprocess_loopy(result, target=lp.CudaTarget()) + graph_name = var_name_gen("_pt_g") + buffer_accumulator = var_name_gen("_pt_buffer_acc") + node_accumulator = var_name_gen("_pt_node_acc") + cgen_mapper = NumpyCodegenMapper( + graph_name=graph_name, + buffer_accumulator=buffer_accumulator, + node_accumulator=node_accumulator, + numpy_backend=target.numpy_like_module_name_shorthand, + vng=var_name_gen, + bound_arguments=preproc_result.bound_arguments, + extra_preambles=extra_preambles, + dot_graph_path=dot_graph_path) + # type-ignore-reason: https://github.com/inducer/pytato/issues/236 + result_var = cgen_mapper(preproc_result.outputs) + lines_graph_exec: List[Any] = [] + lines_graph_exec.append( + ast.Module( + body=[ + ast.Assign(targets=[ + ast.Name(id=graph_name, ctx=ast.Store()) + ], + value=ast.Call( + func=ast.Attribute( + ast.Name( + cgen_mapper.numpy_backend, ctx=ast.Load()), + "Graph"), + args=[], + keywords=[])) + ], type_ignores=[])) + lines_graph_exec.append( + ast.Module( + body=[ + ast.Assign(targets=[ + ast.Name(id=buffer_accumulator, ctx=ast.Store()) + ], + value=ast.Dict(keys=[], values=[])) + ], type_ignores=[] + ) + ) + lines_graph_exec.append( + ast.Module( + body=[ + ast.Assign(targets=[ + ast.Name(id=node_accumulator, ctx=ast.Store()) + ], + value=ast.Dict(keys=[], values=[])) + ], type_ignores=[] + ) + ) + lines_graph_exec += cgen_mapper.lines_graph_exec + lines_graph_exec.append(ast.Return( + ast.Tuple(elts=[ + ast.Call( + ast.Attribute(ast.Name(graph_name), + "get_exec_graph"), + args=[], + keywords=[]), + ast.Name(graph_name), + ast.Name(node_accumulator), + ast.Name(buffer_accumulator)]))) + + lines_pt_kernel: List[Any] = [] + lines_pt_kernel += cgen_mapper.lines_output_allocation + g_exec = cgen_mapper.vng("_pt_exec_g") + lines_pt_kernel.append( + ast.Assign(targets=[ast.Tuple( + elts=[ast.Name(g_exec), + ast.Name(graph_name), + ast.Name(node_accumulator), + ast.Name(buffer_accumulator) + ])], + value=ast.Call(ast.Name("exec_graph_builder"), + args=[], + keywords=[] + ))) + keys = [] + values = [] + for kernel_node, kernel_args in cgen_mapper.kernel_to_kernel_args.items(): + keys.append( + ast.Subscript( + ast.Name(node_accumulator), + ast.Constant(kernel_node))) + values.append( + ast.Call(func=ast.Name("_pt_drv.KernelNodeParams"), + args=[], + keywords=[ast.keyword("args", + ast.List(elts=[cgen_mapper.temp_input_arrays_to_output_stmt_mapper[array_name] if array_name in cgen_mapper.temp_input_arrays_to_output_stmt_mapper else ast.Subscript(ast.Name(buffer_accumulator), ast.Constant(array_name)) for array_name in kernel_args]))])) # noqa : 501 + lines_pt_kernel.append( + ast.Expr( + ast.Call( + ast.Attribute(ast.Name(g_exec), "batched_set_kernel_node_arguments"), + args=[ast.Dict( + keys=keys, + values=values)], + keywords=[]))) + lines_pt_kernel.append(ast.Expr( + ast.Call(ast.Attribute( + ast.Name(g_exec), "launch"), + args=[], + keywords=[]))) + if bool(cgen_mapper.dict_line): + lines_pt_kernel += cgen_mapper.dict_line + if dict_flag: + lines_pt_kernel.append(ast.Return(ast.Name(result_var.buffer_name))) + else: + lines_pt_kernel.append( + ast.Return( + ast.Call( + ast.Attribute( + ast.Subscript( + ast.Name(result_var.buffer_name), + ast.Constant("_pt_out")), "get"), args=[], keywords=[]))) + module = ast.Module( + body=[ast.Import(names=[ast.alias(name=target.numpy_like_module_name, + asname=( + target + .numpy_like_module_name_shorthand + ))]), + ast.Import(names=[ast.alias(name="numpy", asname="np")]), + *tuple(extra_preambles), + ast.FunctionDef( + name="exec_graph_builder", + posonlyargs=[], + args=ast.arguments( + args=[], + posonlyargs=[], + kwonlyargs=[], + kw_defaults=[], + defaults=[]), + body=lines_graph_exec, + decorator_list=[ast.Name("cache")]), + ast.FunctionDef( + name=function_name, + posonlyargs=[], + args=ast.arguments( + args=[ast.arg(arg="allocator"), ast.arg("dev")], + posonlyargs=[], + kwonlyargs=[ast.arg(arg=name) + for name in list(cgen_mapper.input_args.keys())], + kw_defaults=[ + None for _ in list(cgen_mapper.input_args.keys()) + ], + defaults=[ast.Name("cuda_allocator"), ast.Name("cuda_dev")]), + body=lines_pt_kernel, + decorator_list=[]) + ], + type_ignores=[]) + + program = ast.unparse(ast.fix_missing_locations(module)) + + if show_code: + if colorize_show_code is None: + colorize_show_code = _get_default_colorize_code() + assert isinstance(colorize_show_code, bool) + + if _can_colorize_output() and colorize_show_code: + from pygments import highlight + from pygments.lexers import PythonLexer + from pygments.formatters import TerminalTrueColorFormatter + print(highlight(program, + formatter=TerminalTrueColorFormatter(), + lexer=PythonLexer())) + else: + print(program) + return target.bind_program( + program, + function_name, + expected_arguments=frozenset(list(cgen_mapper.input_args.keys())), + bound_arguments=pmap(cgen_mapper.bound_arguments)) diff --git a/pytato/transform/__init__.py b/pytato/transform/__init__.py index 5583e359edd31f666cde0b6c4ac246f3dbfe7b66..2e4255143c15c9ba1394c3dfb1361cf3b8ffb661 100644 --- a/pytato/transform/__init__.py +++ b/pytato/transform/__init__.py @@ -210,6 +210,10 @@ class CopyMapper(CachedMapper[ArrayOrNames]): This does not copy the data of a :class:`pytato.array.DataWrapper`. """ + # type-ignore reason: incompatible type with Mapper.rec + def __call__(self, expr: T) -> T: # type: ignore[override] + return self.rec(expr) # type: ignore[no-any-return] + def rec_idx_or_size_tuple(self, situp: Tuple[IndexOrShapeExpr, ...] ) -> Tuple[IndexOrShapeExpr, ...]: return tuple(self.rec(s) if isinstance(s, Array) else s for s in situp) @@ -1707,6 +1711,33 @@ class EdgeCachedMapper(CachedMapper[ArrayOrNames]): # }}} +# {{{ BranchMorpher + +class BranchMorpher(CopyMapper): + """ + A mapper that replaces equal segments of graphs with identical objects. + """ + def __init__(self) -> None: + super().__init__() + self.result_cache: Dict[ArrayOrNames, ArrayOrNames] = {} + + def cache_key(self, expr: CachedMapperT) -> Any: + return (id(expr), expr) + + # type-ignore reason: incompatible with Mapper.rec + def rec(self, expr: T) -> T: # type: ignore[override] + rec_expr = super().rec(expr) + try: + # type-ignored because 'result_cache' maps to ArrayOrNames + return self.result_cache[rec_expr] # type: ignore[return-value] + except KeyError: + self.result_cache[rec_expr] = rec_expr + # type-ignored because of super-class' relaxed types + return rec_expr # type: ignore[no-any-return] + +# }}} + + # {{{ deduplicate_data_wrappers def _get_data_dedup_cache_key(ary: DataInterface) -> Hashable: @@ -1796,8 +1827,9 @@ def deduplicate_data_wrappers(array_or_names: ArrayOrNames) -> ArrayOrNames: len(data_wrapper_cache), data_wrappers_encountered - len(data_wrapper_cache)) - return array_or_names + return BranchMorpher()(array_or_names) # }}} + # vim: foldmethod=marker diff --git a/requirements.txt b/requirements.txt index a9cf2c76ed9eea4972066223ea89d2c20a4859c2..85249a4c142b84c8852528ae86ccd5b088f15959 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ git+https://github.com/inducer/pytools.git#egg=pytools >= 2021.1 git+https://github.com/inducer/pymbolic.git#egg=pymbolic git+https://github.com/inducer/genpy.git#egg=genpy -git+https://github.com/inducer/loopy.git#egg=loopy +git+https://gitlab.tiker.net/kaushikcfd/pycuda.git@cudagraph#egg=pycuda +git+https://gitlab.tiker.net/inducer/loopy.git@pycuda_tgt#egg=loopy asciidag diff --git a/setup.cfg b/setup.cfg index 22434cfe6fd5dddfc73d94f8fcf3f022513f6c8e..e1c1828441428fd134788eebb4a20b288d4a48b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,3 +39,7 @@ ignore_missing_imports = True [mypy-pygments.*] ignore_missing_imports = True + +[my-pycuda.*] +ignore_missing_imports = True + diff --git a/test/test_cudagraph.py b/test/test_cudagraph.py new file mode 100644 index 0000000000000000000000000000000000000000..83e8c03093a4ed459f9f2e428fedfac27796f0b1 --- /dev/null +++ b/test/test_cudagraph.py @@ -0,0 +1,138 @@ +__copyright__ = """Copyright (C) 2022 Mit Kotak""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import pytest +import numpy as np +import pytato as pt +import pycuda.driver as drv +import pycuda.autoinit # noqa: F401 +ctx = drv.Context.get_current() +dev = ctx.get_device() +post_volta_devices = ["NVIDIA TITAN V"] +post_volta_flag = dev.name() in post_volta_devices + + +@pytest.mark.parametrize("spec,argshapes", ([("im,mj,km->ijk", + [(3, 3)]*3), + + ("ik,kj->ij", # A @ B + [(4, 3), (3, 5)]), + + ("ij,ij->ij", # A * B + [(4, 4)]*2), + + ("ij,ji->ij", # A * B.T + [(4, 4)]*2), + + ("ij,kj->ik", # inner(A, B) + [(4, 4)]*2), + + ("ij,j->j", # A @ x + [(4, 4), (4,)]), + + ("ij->ij", # identity + [(10, 4)]), + + ("ij->ji", # transpose + [(10, 4)]), + + ("ii->i", # diag + [(5, 5)]), + + (" ij -> ", # np.sum + [(10, 4)]), + ("dij,ej,ej,dej->ei", # diff: curvimesh + [(2, 10, 10), (100, 10), + (100, 10), (2, 100, 10)]), + + ("dij,ej,ej,dej->ei", # diff: simplex + [(2, 10, 10), (100, 1), + (100, 1), (2, 100, 10)]), + + ("ij,ij->ij", # broadcasting + [(1, 3), (3, 1)]), + ])) +def test_einsum(spec, argshapes): + if not post_volta_flag: + pytest.skip("CUDAGraph only works for post volta GPUs") + np_operands = [np.random.rand(*argshape) for argshape in argshapes] + pt_operands = [pt.make_data_wrapper(x_in) for x_in in np_operands] + + np_out = np.einsum(spec, *np_operands) + pt_expr = pt.einsum(spec, *pt_operands) + + pt_out = pt.generate_cudagraph(pt_expr, show_code=True)() + assert np_out.shape == pt_out.shape + np.testing.assert_allclose(np_out, pt_out) + + +def test_random_dag_against_numpy(): + if not post_volta_flag: + pytest.skip("CUDAGraph only works for post volta GPUs") + from testlib import RandomDAGContext, make_random_dag + axis_len = 5 + # from warnings import filterwarnings, catch_warnings + # with catch_warnings(): + # We'd like to know if Numpy divides by zero. + # filterwarnings("error") + + for i in range(50): + print(i) # progress indicator for somewhat slow test + seed = 120 + i + rdagc_pt = RandomDAGContext(np.random.default_rng(seed=seed), + axis_len=axis_len, use_numpy=False) + rdagc_np = RandomDAGContext(np.random.default_rng(seed=seed), + axis_len=axis_len, use_numpy=True) + + ref_result = make_random_dag(rdagc_np) + dag = make_random_dag(rdagc_pt) + from pytato.transform import materialize_with_mpms + dict_named_arys = pt.DictOfNamedArrays({"result": dag}) + dict_named_arys = materialize_with_mpms(dict_named_arys) + if 0: + pt.show_dot_graph(dict_named_arys) + pt_result = pt.generate_cudagraph(dict_named_arys, show_code=True)() + + assert np.allclose(pt_result["result"].get(), ref_result) + + +def test_placeholders_in_cudagraph(): + if not post_volta_flag: + pytest.skip("CUDAGraph only works for post volta GPUs") + import pycuda.gpuarray as gpuarray + from numpy.random import default_rng + rng = default_rng() + + img_data = pt.make_placeholder("img", (256, 256, 3), dtype=np.float32) + scl_data = pt.make_placeholder("scl", (3), dtype=np.float32) + + img = rng.random(size=(256, 256, 3), dtype=np.float32) + scl = rng.random(size=(3), dtype=np.float32) + + img_in = gpuarray.to_gpu(img) + scl_in = gpuarray.to_gpu(scl) + + prg = pt.generate_cudagraph(img_data * scl_data, show_code=True) + pt_out = prg(img=img_in, scl=scl_in) + np_out = img * scl + + np.testing.assert_allclose(pt_out, np_out, rtol=1e-6)