From 956b4cbdc27b74a7d550bd52e100f8dce79410a2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 11 Oct 2020 15:46:40 -0500 Subject: [PATCH 1/3] Engage flake8, add to CI --- .github/workflows/ci.yml | 23 + .gitlab-ci.yml | 9 + pycuda/_cluda.py | 4 - pycuda/_mymako.py | 23 +- pycuda/autoinit.py | 8 +- pycuda/characterize.py | 15 +- pycuda/compiler.py | 319 ++++--- pycuda/cumath.py | 99 ++- pycuda/curandom.py | 633 +++++++++----- pycuda/debug.py | 17 +- pycuda/driver.py | 326 +++++--- pycuda/elementwise.py | 587 +++++++------ pycuda/gl/__init__.py | 2 +- pycuda/gl/autoinit.py | 4 +- pycuda/gpuarray.py | 832 ++++++++++++------- pycuda/reduction.py | 253 ++++-- pycuda/scan.py | 151 ++-- pycuda/sparse/__init__.py | 8 +- pycuda/sparse/cg.py | 109 +-- pycuda/sparse/coordinate.py | 90 +- pycuda/sparse/inner.py | 22 +- pycuda/sparse/operator.py | 13 +- pycuda/sparse/packeted.py | 192 +++-- pycuda/sparse/pkt_build.py | 52 +- pycuda/tools.py | 165 ++-- setup.cfg | 8 +- test/test_cumath.py | 123 ++- test/test_driver.py | 610 ++++++++------ test/test_gpuarray.py | 451 +++++----- test/undistributed/elwise-perf.py | 18 +- test/undistributed/measure_gpuarray_speed.py | 40 +- test/undistributed/reduction-perf.py | 34 +- 32 files changed, 3217 insertions(+), 2023 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..a6295018 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,23 @@ +name: CI +on: + push: + branches: + - master + pull_request: + schedule: + - cron: '17 3 * * 0' + +jobs: + flake8: + name: Flake8 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - + uses: actions/setup-python@v1 + with: + python-version: '3.x' + - name: "Main Script" + run: | + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" test/*.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 710ad647..fc5d3216 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -53,3 +53,12 @@ Documentation: - python3 # needs CUDA headers to compile - nvidia-titan-x + +Flake8: + script: + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test/*.py + tags: + - python3 + except: + - tags diff --git a/pycuda/_cluda.py b/pycuda/_cluda.py index 8cddd4d6..fa5eda2a 100644 --- a/pycuda/_cluda.py +++ b/pycuda/_cluda.py @@ -24,7 +24,3 @@ CLUDA_PREAMBLE = """ #define GDIM_1 gridDim.y #define GDIM_2 gridDim.z """ - - - - diff --git a/pycuda/_mymako.py b/pycuda/_mymako.py index cfd37360..f2a98c91 100644 --- a/pycuda/_mymako.py +++ b/pycuda/_mymako.py @@ -1,15 +1,18 @@ from __future__ import absolute_import + try: - import mako.template + import mako.template # noqa: F401 except ImportError: raise ImportError( - "Some of PyCUDA's facilities require the Mako templating engine.\n" - "You or a piece of software you have used has tried to call such a\n" - "part of PyCUDA, but there was a problem importing Mako.\n\n" - "You may install mako now by typing one of:\n" - "- easy_install Mako\n" - "- pip install Mako\n" - "- aptitude install python-mako\n" - "\nor whatever else is appropriate for your system.") + "Some of PyCUDA's facilities require the Mako templating engine.\n" + "You or a piece of software you have used has tried to call such a\n" + "part of PyCUDA, but there was a problem importing Mako.\n\n" + "You may install mako now by typing one of:\n" + "- easy_install Mako\n" + "- pip install Mako\n" + "- aptitude install python-mako\n" + "\nor whatever else is appropriate for your system." + ) + -from mako import * +from mako import * # noqa: F403, F401 diff --git a/pycuda/autoinit.py b/pycuda/autoinit.py index f52e1e38..00f39827 100644 --- a/pycuda/autoinit.py +++ b/pycuda/autoinit.py @@ -1,21 +1,25 @@ from __future__ import absolute_import import pycuda.driver as cuda +import atexit # Initialize CUDA cuda.init() -from pycuda.tools import make_default_context +from pycuda.tools import make_default_context # noqa: E402 + global context context = make_default_context() device = context.get_device() + def _finish_up(): global context context.pop() context = None from pycuda.tools import clear_context_caches + clear_context_caches() -import atexit + atexit.register(_finish_up) diff --git a/pycuda/characterize.py b/pycuda/characterize.py index 2206e588..2badde2e 100644 --- a/pycuda/characterize.py +++ b/pycuda/characterize.py @@ -7,7 +7,8 @@ import numpy as np def platform_bits(): import sys - if sys.maxsize > 2**32: + + if sys.maxsize > 2 ** 32: return 64 else: return 32 @@ -15,27 +16,35 @@ def platform_bits(): def has_stack(): from pycuda.driver import Context + return Context.get_device().compute_capability() >= (2, 0) def has_double_support(): from pycuda.driver import Context + return Context.get_device().compute_capability() >= (1, 3) @context_dependent_memoize def sizeof(type_name, preamble=""): from pycuda.compiler import SourceModule - mod = SourceModule(""" + + mod = SourceModule( + """ %s extern "C" __global__ void write_size(size_t *output) { *output = sizeof(%s); } - """ % (preamble, type_name), no_extern_c=True) + """ + % (preamble, type_name), + no_extern_c=True, + ) import pycuda.gpuarray as gpuarray + output = gpuarray.empty((), dtype=np.uintp) mod.get_function("write_size")(output, block=(1, 1, 1), grid=(1, 1)) diff --git a/pycuda/compiler.py b/pycuda/compiler.py index b73793b6..9bbd05d0 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -1,7 +1,10 @@ from __future__ import absolute_import from __future__ import print_function from pytools import memoize + # don't import pycuda.driver here--you'll create an import loop +import os + import sys from tempfile import mkstemp from os import unlink @@ -17,6 +20,7 @@ def get_nvcc_version(nvcc): if result != 0 or not stdout: from warnings import warn + warn("NVCC version could not be determined.") stdout = "nvcc unknown version" @@ -26,40 +30,48 @@ def get_nvcc_version(nvcc): def _new_md5(): try: import hashlib + return hashlib.md5() except ImportError: # for Python << 2.5 import md5 + return md5.new() def preprocess_source(source, options, nvcc): - handle, source_path = mkstemp(suffix='.cu') + handle, source_path = mkstemp(suffix=".cu") - outf = open(source_path, 'w') + outf = open(source_path, "w") outf.write(source) outf.close() os.close(handle) - cmdline = [nvcc, '--preprocess'] + options + [source_path] - if 'win32' in sys.platform: - cmdline.extend(['--compiler-options', '-EP']) + cmdline = [nvcc, "--preprocess"] + options + [source_path] + if "win32" in sys.platform: + cmdline.extend(["--compiler-options", "-EP"]) else: - cmdline.extend(['--compiler-options', '-P']) + cmdline.extend(["--compiler-options", "-P"]) result, stdout, stderr = call_capture_output(cmdline, error_on_nonzero=False) if result != 0: from pycuda.driver import CompileError - raise CompileError("nvcc preprocessing of %s failed" % source_path, - cmdline, stderr=stderr) + + raise CompileError( + "nvcc preprocessing of %s failed" % source_path, cmdline, stderr=stderr + ) # sanity check - if len(stdout) < 0.5*len(source): + if len(stdout) < 0.5 * len(source): from pycuda.driver import CompileError - raise CompileError("nvcc preprocessing of %s failed with ridiculously " - "small code output - likely unsupported compiler." % source_path, - cmdline, stderr=stderr.decode("utf-8", "replace")) + + raise CompileError( + "nvcc preprocessing of %s failed with ridiculously " + "small code output - likely unsupported compiler." % source_path, + cmdline, + stderr=stderr.decode("utf-8", "replace"), + ) unlink(source_path) @@ -74,7 +86,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): if cache_dir: checksum = _new_md5() - if '#include' in source: + if "#include" in source: checksum.update(preprocess_source(source, options, nvcc).encode("utf-8")) else: checksum.update(source.encode("utf-8")) @@ -83,6 +95,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): checksum.update(option.encode("utf-8")) checksum.update(get_nvcc_version(nvcc).encode("utf-8")) from pycuda.characterize import platform_bits + checksum.update(str(platform_bits()).encode("utf-8")) cache_file = checksum.hexdigest() @@ -99,6 +112,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): pass from tempfile import mkdtemp + file_dir = mkdtemp() file_root = "kernel" @@ -116,8 +130,9 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): print("*** compiler output in %s" % file_dir) cmdline = [nvcc, "--" + target] + options + [cu_file_name] - result, stdout, stderr = call_capture_output(cmdline, - cwd=file_dir, error_on_nonzero=False) + result, stdout, stderr = call_capture_output( + cmdline, cwd=file_dir, error_on_nonzero=False + ) try: result_f = open(join(file_dir, file_root + "." + target), "rb") @@ -129,22 +144,35 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): if result != 0 or (no_output and (stdout or stderr)): if result == 0: from warnings import warn - warn("PyCUDA: nvcc exited with status 0, but appears to have " - "encountered an error") + + warn( + "PyCUDA: nvcc exited with status 0, but appears to have " + "encountered an error" + ) from pycuda.driver import CompileError - raise CompileError("nvcc compilation of %s failed" % cu_file_path, - cmdline, stdout=stdout.decode("utf-8", "replace"), - stderr=stderr.decode("utf-8", "replace")) + + raise CompileError( + "nvcc compilation of %s failed" % cu_file_path, + cmdline, + stdout=stdout.decode("utf-8", "replace"), + stderr=stderr.decode("utf-8", "replace"), + ) if stdout or stderr: - lcase_err_text = (stdout+stderr).decode("utf-8", "replace").lower() + lcase_err_text = (stdout + stderr).decode("utf-8", "replace").lower() from warnings import warn + if "demoted" in lcase_err_text or "demoting" in lcase_err_text: - warn("nvcc said it demoted types in source code it " + warn( + "nvcc said it demoted types in source code it " "compiled--this is likely not what you want.", - stacklevel=4) - warn("The CUDA compiler succeeded, but said the following:\n" - + (stdout+stderr).decode("utf-8", "replace"), stacklevel=4) + stacklevel=4, + ) + warn( + "The CUDA compiler succeeded, but said the following:\n" + + (stdout + stderr).decode("utf-8", "replace"), + stacklevel=4, + ) result_data = result_f.read() result_f.close() @@ -156,6 +184,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): if not keep: from os import listdir, unlink, rmdir + for name in listdir(file_dir): unlink(join(file_dir, name)) rmdir(file_dir) @@ -169,6 +198,7 @@ def _get_per_user_string(): except ImportError: checksum = _new_md5() from os import environ + checksum.update(environ["USERNAME"].encode("utf-8")) return checksum.hexdigest() else: @@ -177,19 +207,29 @@ def _get_per_user_string(): def _find_pycuda_include_path(): from pkg_resources import Requirement, resource_filename + return resource_filename(Requirement.parse("pycuda"), "pycuda/cuda") -import os DEFAULT_NVCC_FLAGS = [ - _flag.strip() for _flag in - os.environ.get("PYCUDA_DEFAULT_NVCC_FLAGS", "").split() - if _flag.strip()] - - -def compile(source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], target="cubin"): + _flag.strip() + for _flag in os.environ.get("PYCUDA_DEFAULT_NVCC_FLAGS", "").split() + if _flag.strip() +] + + +def compile( + source, + nvcc="nvcc", + options=None, + keep=False, + no_extern_c=False, + arch=None, + code=None, + cache_dir=None, + include_dirs=[], + target="cubin", +): assert target in ["cubin", "ptx", "fatbin"] @@ -202,13 +242,16 @@ def compile(source, nvcc="nvcc", options=None, keep=False, options = options[:] if arch is None: from pycuda.driver import Error + try: from pycuda.driver import Context + arch = "sm_%d%d" % Context.get_device().compute_capability() except Error: pass from pycuda.driver import CUDA_DEBUGGING + if CUDA_DEBUGGING: cache_dir = False keep = True @@ -222,14 +265,18 @@ def compile(source, nvcc="nvcc", options=None, keep=False, if cache_dir is None: import appdirs - cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"), - "compiler-cache-v1") + + cache_dir = os.path.join( + appdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1" + ) from os import makedirs + try: makedirs(cache_dir) except OSError as e: from errno import EEXIST + if e.errno != EEXIST: raise @@ -239,17 +286,17 @@ def compile(source, nvcc="nvcc", options=None, keep=False, if code is not None: options.extend(["-code", code]) - if 'darwin' in sys.platform and sys.maxsize == 9223372036854775807: - options.append('-m64') - elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807: - options.append('-m64') - elif 'win32' in sys.platform and sys.maxsize == 2147483647: - options.append('-m32') + if "darwin" in sys.platform and sys.maxsize == 9223372036854775807: + options.append("-m64") + elif "win32" in sys.platform and sys.maxsize == 9223372036854775807: + options.append("-m64") + elif "win32" in sys.platform and sys.maxsize == 2147483647: + options.append("-m32") include_dirs = include_dirs + [_find_pycuda_include_path()] for i in include_dirs: - options.append("-I"+i) + options.append("-I" + i) return compile_plain(source, options, keep, nvcc, cache_dir, target) @@ -260,11 +307,15 @@ class CudaModule(object): return try: from pycuda.driver import Context + capability = Context.get_device().compute_capability() if tuple(map(int, tuple(arch.split("_")[1]))) > capability: from warnings import warn - warn("trying to compile for a compute capability " - "higher than selected GPU") + + warn( + "trying to compile for a compute capability " + "higher than selected GPU" + ) except Exception: pass @@ -277,20 +328,41 @@ class CudaModule(object): def get_function(self, name): return self.module.get_function(name) + class SourceModule(CudaModule): - ''' + """ Creates a Module from a single .cu source object linked against the static CUDA runtime. - ''' - def __init__(self, source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[]): + """ + + def __init__( + self, + source, + nvcc="nvcc", + options=None, + keep=False, + no_extern_c=False, + arch=None, + code=None, + cache_dir=None, + include_dirs=[], + ): self._check_arch(arch) - cubin = compile(source, nvcc, options, keep, no_extern_c, - arch, code, cache_dir, include_dirs) + cubin = compile( + source, + nvcc, + options, + keep, + no_extern_c, + arch, + code, + cache_dir, + include_dirs, + ) from pycuda.driver import module_from_buffer + self.module = module_from_buffer(cubin) self._bind_module() @@ -318,21 +390,36 @@ def _find_nvcc_on_path(): class DynamicModule(CudaModule): - ''' + """ Creates a Module from multiple .cu source, library file and/or data objects linked against the static or dynamic CUDA runtime. - ''' - def __init__(self, nvcc='nvcc', link_options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], message_handler=None, log_verbose=False, - cuda_libdir=None): + """ + + def __init__( + self, + nvcc="nvcc", + link_options=None, + keep=False, + no_extern_c=False, + arch=None, + code=None, + cache_dir=None, + include_dirs=[], + message_handler=None, + log_verbose=False, + cuda_libdir=None, + ): from pycuda.driver import Context + compute_capability = Context.get_device().compute_capability() - if compute_capability < (3,5): - raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' % - (compute_capability[0], compute_capability[1])) + if compute_capability < (3, 5): + raise Exception( + "Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!" + % (compute_capability[0], compute_capability[1]) + ) else: from pycuda.driver import Linker + self.linker = Linker(message_handler, link_options, log_verbose) self._check_arch(arch) self.nvcc = nvcc @@ -347,7 +434,7 @@ class DynamicModule(CudaModule): self.module = None def _locate_cuda_libdir(self): - ''' + """ Locate the "standard" CUDA SDK library directory in the local file system. Supports 64-Bit Windows, Linux and Mac OS X. In case the caller supplied cuda_libdir in the constructor @@ -371,52 +458,68 @@ class DynamicModule(CudaModule): https://cmake.org/cmake/help/v3.0/module/FindCUDA.html https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA.cmake - Verify all Linux code paths somehow - ''' + """ from os.path import isfile, join from platform import system as platform_system + system = platform_system() libdir, libptn = None, None - if system == 'Windows': + if system == "Windows": if self.cuda_libdir is not None: libdir = self.cuda_libdir - elif 'CUDA_PATH' in os.environ and isfile(join(os.environ['CUDA_PATH'], 'lib\\x64\\cudadevrt.lib')): - libdir = join(os.environ['CUDA_PATH'], 'lib\\x64') - libptn = '%s.lib' - elif system in ['Linux', 'Darwin']: + elif "CUDA_PATH" in os.environ and isfile( + join(os.environ["CUDA_PATH"], "lib\\x64\\cudadevrt.lib") + ): + libdir = join(os.environ["CUDA_PATH"], "lib\\x64") + libptn = "%s.lib" + elif system in ["Linux", "Darwin"]: if self.cuda_libdir is not None: libdir = self.cuda_libdir - elif 'CUDA_ROOT' in os.environ and isfile(join(os.environ['CUDA_ROOT'], 'lib64/libcudadevrt.a')): - libdir = join(os.environ['CUDA_ROOT'], 'lib64') - elif 'LD_LIBRARY_PATH' in os.environ: - for ld_path in os.environ['LD_LIBRARY_PATH'].split(':'): - if isfile(join(ld_path, 'libcudadevrt.a')): + elif "CUDA_ROOT" in os.environ and isfile( + join(os.environ["CUDA_ROOT"], "lib64/libcudadevrt.a") + ): + libdir = join(os.environ["CUDA_ROOT"], "lib64") + elif "LD_LIBRARY_PATH" in os.environ: + for ld_path in os.environ["LD_LIBRARY_PATH"].split(":"): + if isfile(join(ld_path, "libcudadevrt.a")): libdir = ld_path break - if libdir is None and isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): - libdir = '/usr/lib/x86_64-linux-gnu' + if libdir is None and isfile("/usr/lib/x86_64-linux-gnu/libcudadevrt.a"): + libdir = "/usr/lib/x86_64-linux-gnu" if libdir is None: nvcc_path = _find_nvcc_on_path() if nvcc_path is not None: libdir = join(os.path.dirname(nvcc_path), "..", "lib64") - libptn = 'lib%s.a' + libptn = "lib%s.a" if libdir is None: - raise RuntimeError('Unable to locate the CUDA SDK installation ' - 'directory, set CUDA library path manually') + raise RuntimeError( + "Unable to locate the CUDA SDK installation " + "directory, set CUDA library path manually" + ) return libdir, libptn - def add_source(self, source, nvcc_options=None, name='kernel.ptx'): - ptx = compile(source, nvcc=self.nvcc, options=nvcc_options, - keep=self.keep, no_extern_c=self.no_extern_c, arch=self.arch, - code=self.code, cache_dir=self.cache_dir, - include_dirs=self.include_dirs, target="ptx") + def add_source(self, source, nvcc_options=None, name="kernel.ptx"): + ptx = compile( + source, + nvcc=self.nvcc, + options=nvcc_options, + keep=self.keep, + no_extern_c=self.no_extern_c, + arch=self.arch, + code=self.code, + cache_dir=self.cache_dir, + include_dirs=self.include_dirs, + target="ptx", + ) from pycuda.driver import jit_input_type + self.linker.add_data(ptx, jit_input_type.PTX, name) return self - def add_data(self, data, input_type, name='unknown'): + def add_data(self, data, input_type, name="unknown"): self.linker.add_data(data, input_type, name) return self @@ -428,10 +531,12 @@ class DynamicModule(CudaModule): if self.libdir is None: self.libdir, self.libptn = self._locate_cuda_libdir() from os.path import isfile, join + libpath = join(self.libdir, self.libptn % libname) if not isfile(libpath): raise OSError('CUDA SDK library file "%s" not found' % libpath) from pycuda.driver import jit_input_type + self.linker.add_file(libpath, jit_input_type.LIBRARY) return self @@ -443,28 +548,46 @@ class DynamicModule(CudaModule): class DynamicSourceModule(DynamicModule): - ''' + """ Creates a Module from a single .cu source object linked against the dynamic CUDA runtime. - compiler generates PTX relocatable device code (rdc) from source that can be linked with other relocatable device code - source is linked against the CUDA device runtime library cudadevrt - library cudadevrt is statically linked into the generated Module - ''' - def __init__(self, source, nvcc="nvcc", options=None, keep=False, - no_extern_c=False, arch=None, code=None, cache_dir=None, - include_dirs=[], cuda_libdir=None): - super(DynamicSourceModule, self).__init__(nvcc=nvcc, - link_options=None, keep=keep, no_extern_c=no_extern_c, - arch=arch, code=code, cache_dir=cache_dir, - include_dirs=include_dirs, cuda_libdir=cuda_libdir) + """ + + def __init__( + self, + source, + nvcc="nvcc", + options=None, + keep=False, + no_extern_c=False, + arch=None, + code=None, + cache_dir=None, + include_dirs=[], + cuda_libdir=None, + ): + super(DynamicSourceModule, self).__init__( + nvcc=nvcc, + link_options=None, + keep=keep, + no_extern_c=no_extern_c, + arch=arch, + code=code, + cache_dir=cache_dir, + include_dirs=include_dirs, + cuda_libdir=cuda_libdir, + ) if options is None: options = DEFAULT_NVCC_FLAGS options = options[:] - if '-rdc=true' not in options: - options.append('-rdc=true') - if '-lcudadevrt' not in options: - options.append('-lcudadevrt') + if "-rdc=true" not in options: + options.append("-rdc=true") + if "-lcudadevrt" not in options: + options.append("-lcudadevrt") self.add_source(source, nvcc_options=options) - self.add_stdlib('cudadevrt') + self.add_stdlib("cudadevrt") self.link() diff --git a/pycuda/cumath.py b/pycuda/cumath.py index dbae5bd6..42a510e6 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -10,7 +10,9 @@ def _make_unary_array_func(name): def f(array, stream_or_out=None, **kwargs): if stream_or_out is not None: - warnings.warn("please use 'out' or 'stream' keyword arguments", DeprecationWarning) + warnings.warn( + "please use 'out' or 'stream' keyword arguments", DeprecationWarning + ) if isinstance(stream_or_out, Stream): stream = stream_or_out out = None @@ -19,10 +21,10 @@ def _make_unary_array_func(name): out = stream_or_out out, stream = None, None - if 'out' in kwargs: - out = kwargs['out'] - if 'stream' in kwargs: - stream = kwargs['stream'] + if "out" in kwargs: + out = kwargs["out"] + if "stream" in kwargs: + stream = kwargs["stream"] if array.dtype == np.float32: func_name = name + "f" @@ -30,8 +32,9 @@ def _make_unary_array_func(name): func_name = name if not array.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if out is None: out = array._new_like_me() @@ -41,12 +44,20 @@ def _make_unary_array_func(name): assert out.shape == array.shape func = elementwise.get_unary_func_kernel(func_name, array.dtype) - func.prepared_async_call(array._grid, array._block, stream, - array.gpudata, out.gpudata, array.mem_size) + func.prepared_async_call( + array._grid, + array._block, + stream, + array.gpudata, + out.gpudata, + array.mem_size, + ) return out + return f + fabs = _make_unary_array_func("fabs") ceil = _make_unary_array_func("ceil") floor = _make_unary_array_func("floor") @@ -66,70 +77,104 @@ sinh = _make_unary_array_func("sinh") cosh = _make_unary_array_func("cosh") tanh = _make_unary_array_func("tanh") + def fmod(arg, mod, stream=None): """Return the floating point remainder of the division `arg/mod`, for each element in `arg` and `mod`.""" result = gpuarray.GPUArray(arg.shape, arg.dtype) if not arg.flags.forc or not mod.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) func = elementwise.get_fmod_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, mod.gpudata, result.gpudata, arg.mem_size) + func.prepared_async_call( + arg._grid, + arg._block, + stream, + arg.gpudata, + mod.gpudata, + result.gpudata, + arg.mem_size, + ) return result + def frexp(arg, stream=None): """Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. """ if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) sig = gpuarray.GPUArray(arg.shape, arg.dtype) expt = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_frexp_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) + func.prepared_async_call( + arg._grid, + arg._block, + stream, + arg.gpudata, + sig.gpudata, + expt.gpudata, + arg.mem_size, + ) return sig, expt + def ldexp(significand, exponent, stream=None): """Return a new array of floating point values composed from the entries of `significand` and `exponent`, paired together as `result = significand * 2**exponent`. """ if not significand.flags.forc or not exponent.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) result = gpuarray.GPUArray(significand.shape, significand.dtype) func = elementwise.get_ldexp_kernel() - func.prepared_async_call(significand._grid, significand._block, stream, - significand.gpudata, exponent.gpudata, result.gpudata, - significand.mem_size) + func.prepared_async_call( + significand._grid, + significand._block, + stream, + significand.gpudata, + exponent.gpudata, + result.gpudata, + significand.mem_size, + ) return result + def modf(arg, stream=None): """Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. """ if not arg.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) intpart = gpuarray.GPUArray(arg.shape, arg.dtype) fracpart = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_modf_kernel() - func.prepared_async_call(arg._grid, arg._block, stream, - arg.gpudata, intpart.gpudata, fracpart.gpudata, - arg.mem_size) + func.prepared_async_call( + arg._grid, + arg._block, + stream, + arg.gpudata, + intpart.gpudata, + fracpart.gpudata, + arg.mem_size, + ) return fracpart, intpart diff --git a/pycuda/curandom.py b/pycuda/curandom.py index a5d3d38d..1d6f930b 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -9,7 +9,6 @@ from pytools import memoize_method import six - # {{{ MD5-based random number generation md5_code = """ @@ -186,8 +185,6 @@ md5_code = """ """ - - def rand(shape, dtype=np.float32, stream=None): from pycuda.gpuarray import GPUArray from pycuda.elementwise import get_elwise_kernel @@ -197,7 +194,8 @@ def rand(shape, dtype=np.float32, stream=None): if dtype == np.float32: func = get_elwise_kernel( "float *dest, unsigned int seed", - md5_code + """ + md5_code + + """ #define POW_2_M32 (1/4294967296.0f) dest[i] = a*POW_2_M32; if ((i += total_threads) < n) @@ -207,11 +205,13 @@ def rand(shape, dtype=np.float32, stream=None): if ((i += total_threads) < n) dest[i] = d*POW_2_M32; """, - "md5_rng_float") + "md5_rng_float", + ) elif dtype == np.float64: func = get_elwise_kernel( "double *dest, unsigned int seed", - md5_code + """ + md5_code + + """ #define POW_2_M32 (1/4294967296.0) #define POW_2_M64 (1/18446744073709551616.) @@ -222,11 +222,13 @@ def rand(shape, dtype=np.float32, stream=None): dest[i] = c*POW_2_M32 + d*POW_2_M64; } """, - "md5_rng_float") + "md5_rng_float", + ) elif dtype in [np.int32, np.uint32]: func = get_elwise_kernel( "unsigned int *dest, unsigned int seed", - md5_code + """ + md5_code + + """ dest[i] = a; if ((i += total_threads) < n) dest[i] = b; @@ -235,24 +237,35 @@ def rand(shape, dtype=np.float32, stream=None): if ((i += total_threads) < n) dest[i] = d; """, - "md5_rng_int") + "md5_rng_int", + ) else: - raise NotImplementedError; + raise NotImplementedError - func.prepared_async_call(result._grid, result._block, stream, - result.gpudata, np.random.randint(2**31-1), result.size) + func.prepared_async_call( + result._grid, + result._block, + stream, + result.gpudata, + np.random.randint(2 ** 31 - 1), + result.size, + ) return result + # }}} # {{{ CURAND wrapper try: - import pycuda._driver as _curand # used to be separate module + import pycuda._driver as _curand # used to be separate module except ImportError: + def get_curand_version(): return None + + else: get_curand_version = _curand.get_curand_version @@ -355,6 +368,7 @@ __global__ void skip_ahead_array(%(state_type)s *s, const int n, const unsigned } """ + class _RandomNumberGeneratorBase(object): """ Class surrounding CURAND kernels from CUDA 3.2. @@ -371,34 +385,41 @@ class _RandomNumberGeneratorBase(object): ("normal_double", "double", "_normal_double"), ("normal_float2", "float2", "_normal2"), ("normal_double2", "double2", "_normal2_double"), - ] + ] gen_log_info = [ ("normal_log_float", "float", "float", "_normal"), ("normal_log_double", "double", "double", "_normal_double"), ("normal_log_float2", "float", "float2", "_normal2"), ("normal_log_double2", "double", "double2", "_normal2_double"), - ] + ] gen_poisson_info = [ ("poisson_int", "unsigned int", ""), - ] + ] gen_poisson_inplace_info = [ ("poisson_inplace_float", "float", ""), ("poisson_inplace_double", "double", ""), ("poisson_inplace_int", "unsigned int", ""), - ] - - def __init__(self, state_type, vector_type, generator_bits, - additional_source, scramble_type=None): + ] + + def __init__( + self, + state_type, + vector_type, + generator_bits, + additional_source, + scramble_type=None, + ): if get_curand_version() < (3, 2, 0): raise EnvironmentError("Need at least CUDA 3.2") dev = drv.Context.get_device() self.block_count = dev.get_attribute( - pycuda.driver.device_attribute.MULTIPROCESSOR_COUNT) + pycuda.driver.device_attribute.MULTIPROCESSOR_COUNT + ) from pycuda.characterize import has_double_support @@ -411,67 +432,99 @@ class _RandomNumberGeneratorBase(object): return result my_generators = [ - (name, out_type, suffix) - for name, out_type, suffix in self.gen_info - if do_generate(out_type)] + (name, out_type, suffix) + for name, out_type, suffix in self.gen_info + if do_generate(out_type) + ] if get_curand_version() >= (4, 0, 0): my_log_generators = [ - (name, in_type, out_type, suffix) - for name, in_type, out_type, suffix in self.gen_log_info - if do_generate(out_type)] + (name, in_type, out_type, suffix) + for name, in_type, out_type, suffix in self.gen_log_info + if do_generate(out_type) + ] if get_curand_version() >= (5, 0, 0): my_poisson_generators = [ - (name, out_type, suffix) - for name, out_type, suffix in self.gen_poisson_info - if do_generate(out_type)] + (name, out_type, suffix) + for name, out_type, suffix in self.gen_poisson_info + if do_generate(out_type) + ] my_poisson_inplace_generators = [ - (name, inout_type, suffix) - for name, inout_type, suffix in self.gen_poisson_inplace_info - if do_generate(inout_type)] + (name, inout_type, suffix) + for name, inout_type, suffix in self.gen_poisson_inplace_info + if do_generate(inout_type) + ] generator_sources = [ - gen_template % { - "name": name, "out_type": out_type, "suffix": suffix, - "state_type": state_type, } - for name, out_type, suffix in my_generators] - + gen_template + % { + "name": name, + "out_type": out_type, + "suffix": suffix, + "state_type": state_type, + } + for name, out_type, suffix in my_generators + ] + if get_curand_version() >= (4, 0, 0): - generator_sources.extend([ - gen_log_template % { - "name": name, "in_type": in_type, "out_type": out_type, - "suffix": suffix, "state_type": state_type, } - for name, in_type, out_type, suffix in my_log_generators]) + generator_sources.extend( + [ + gen_log_template + % { + "name": name, + "in_type": in_type, + "out_type": out_type, + "suffix": suffix, + "state_type": state_type, + } + for name, in_type, out_type, suffix in my_log_generators + ] + ) if get_curand_version() >= (5, 0, 0): - generator_sources.extend([ - gen_poisson_template % { - "name": name, "out_type": out_type, "suffix": suffix, - "state_type": state_type, } - for name, out_type, suffix in my_poisson_generators]) - generator_sources.extend([ - gen_poisson_inplace_template % { - "name": name, "inout_type": inout_type, "suffix": suffix, - "state_type": state_type, } - for name, inout_type, suffix in my_poisson_inplace_generators]) + generator_sources.extend( + [ + gen_poisson_template + % { + "name": name, + "out_type": out_type, + "suffix": suffix, + "state_type": state_type, + } + for name, out_type, suffix in my_poisson_generators + ] + ) + generator_sources.extend( + [ + gen_poisson_inplace_template + % { + "name": name, + "inout_type": inout_type, + "suffix": suffix, + "state_type": state_type, + } + for name, inout_type, suffix in my_poisson_inplace_generators + ] + ) source = (random_source + additional_source) % { "state_type": state_type, "vector_type": vector_type, "scramble_type": scramble_type, - "generators": "\n".join(generator_sources)} + "generators": "\n".join(generator_sources), + } # store in instance to let subclass constructors get to it. self.module = module = pycuda.compiler.SourceModule(source, no_extern_c=True) self.generators = {} - for name, out_type, suffix in my_generators: + for name, out_type, suffix in my_generators: gen_func = module.get_function(name) gen_func.prepare("PPi") self.generators[name] = gen_func if get_curand_version() >= (4, 0, 0): - for name, in_type, out_type, suffix in my_log_generators: + for name, in_type, out_type, suffix in my_log_generators: gen_func = module.get_function(name) if in_type == "float": gen_func.prepare("PPffi") @@ -479,11 +532,11 @@ class _RandomNumberGeneratorBase(object): gen_func.prepare("PPddi") self.generators[name] = gen_func if get_curand_version() >= (5, 0, 0): - for name, out_type, suffix in my_poisson_generators: + for name, out_type, suffix in my_poisson_generators: gen_func = module.get_function(name) gen_func.prepare("PPdi") self.generators[name] = gen_func - for name, inout_type, suffix in my_poisson_inplace_generators: + for name, inout_type, suffix in my_poisson_inplace_generators: gen_func = module.get_function(name) gen_func.prepare("PPi") self.generators[name] = gen_func @@ -504,24 +557,26 @@ class _RandomNumberGeneratorBase(object): self.skip_ahead_array.prepare("PiP") def _kernels(self): - return ( - list(six.itervalues(self.generators)) - + [self.skip_ahead, self.skip_ahead_array]) + return list(six.itervalues(self.generators)) + [ + self.skip_ahead, + self.skip_ahead_array, + ] @property @memoize_method def generators_per_block(self): - return min(kernel.max_threads_per_block - for kernel in self._kernels()) + return min(kernel.max_threads_per_block for kernel in self._kernels()) @property def state(self): if self._state is None: from pycuda.characterize import sizeof + data_type_size = sizeof(self.state_type, "#include ") self._state = drv.mem_alloc( - self.block_count * self.generators_per_block * data_type_size) + self.block_count * self.generators_per_block * data_type_size + ) return self._state @@ -538,8 +593,13 @@ class _RandomNumberGeneratorBase(object): raise NotImplementedError func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, data.gpudata, data.size) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + data.size, + ) def fill_normal(self, data, stream=None): if data.dtype == np.float32: @@ -557,8 +617,13 @@ class _RandomNumberGeneratorBase(object): func = self.generators[func_name] func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, data.gpudata, int(data_size)) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + int(data_size), + ) def gen_uniform(self, shape, dtype, stream=None): result = array.empty(shape, dtype) @@ -571,6 +636,7 @@ class _RandomNumberGeneratorBase(object): return result if get_curand_version() >= (4, 0, 0): + def fill_log_normal(self, data, mean, stddev, stream=None): if data.dtype == np.float32: func_name = "normal_log_float" @@ -587,8 +653,15 @@ class _RandomNumberGeneratorBase(object): func = self.generators[func_name] func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, data.gpudata, mean, stddev, int(data_size)) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + mean, + stddev, + int(data_size), + ) def gen_log_normal(self, shape, dtype, mean, stddev, stream=None): result = array.empty(shape, dtype) @@ -596,6 +669,7 @@ class _RandomNumberGeneratorBase(object): return result if get_curand_version() >= (5, 0, 0): + def fill_poisson(self, data, lambda_value=None, stream=None): if lambda_value is None: if data.dtype == np.float32: @@ -616,12 +690,23 @@ class _RandomNumberGeneratorBase(object): if lambda_value is None: func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, data.gpudata, data.size) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + data.size, + ) else: func.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, data.gpudata, lambda_value, data.size) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + data.gpudata, + lambda_value, + data.size, + ) def gen_poisson(self, shape, dtype, lambda_value, stream=None): result = array.empty(shape, dtype) @@ -630,44 +715,68 @@ class _RandomNumberGeneratorBase(object): def call_skip_ahead(self, i, stream=None): self.skip_ahead.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block, i) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block, + i, + ) def call_skip_ahead_array(self, i, stream=None): self.skip_ahead_array.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block, i.gpudata) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block, + i.gpudata, + ) + # }}} # {{{ XORWOW RNG + class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): - def __init__(self, seed_getter, offset, state_type, vector_type, - generator_bits, additional_source, scramble_type=None): + def __init__( + self, + seed_getter, + offset, + state_type, + vector_type, + generator_bits, + additional_source, + scramble_type=None, + ): super(_PseudoRandomNumberGeneratorBase, self).__init__( - state_type, vector_type, generator_bits, additional_source) + state_type, vector_type, generator_bits, additional_source + ) generator_count = self.generators_per_block * self.block_count if seed_getter is None: seed = array.to_gpu( - np.asarray( - np.random.randint( - 0, (1 << 31) - 1, generator_count), - dtype=np.int32)) + np.asarray( + np.random.randint(0, (1 << 31) - 1, generator_count), dtype=np.int32 + ) + ) else: seed = seed_getter(generator_count) - if not (isinstance(seed, pycuda.gpuarray.GPUArray) - and seed.dtype == np.int32 - and seed.size == generator_count): + if not ( + isinstance(seed, pycuda.gpuarray.GPUArray) + and seed.dtype == np.int32 + and seed.size == generator_count + ): raise TypeError("seed must be GPUArray of integers of right length") p = self.module.get_function("prepare") p.prepare("PiPi") from pycuda.characterize import has_stack + has_stack = has_stack() if has_stack: @@ -675,11 +784,16 @@ class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): try: if has_stack: - drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k + drv.Context.set_limit(drv.limit.STACK_SIZE, 1 << 14) # 16k try: p.prepared_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), self.state, - generator_count, seed.gpudata, offset) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + self.state, + generator_count, + seed.gpudata, + offset, + ) except drv.LaunchError: raise ValueError("Initialisation failed. Decrease number of threads.") @@ -694,36 +808,55 @@ class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): self.skip_ahead_array.prepare("PiP") self.skip_ahead_sequence = self.module.get_function("skip_ahead_sequence") self.skip_ahead_sequence.prepare("PiQ") - self.skip_ahead_sequence_array = self.module.get_function("skip_ahead_sequence_array") + self.skip_ahead_sequence_array = self.module.get_function( + "skip_ahead_sequence_array" + ) self.skip_ahead_sequence_array.prepare("PiP") def call_skip_ahead_sequence(self, i, stream=None): self.skip_ahead_sequence.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block * self.block_count, i) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block * self.block_count, + i, + ) def call_skip_ahead_sequence_array(self, i, stream=None): self.skip_ahead_sequence_array.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block * self.block_count, i.gpudata) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block * self.block_count, + i.gpudata, + ) def _kernels(self): - return (_RandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("prepare")] - + [self.module.get_function("skip_ahead_sequence"), - self.module.get_function("skip_ahead_sequence_array")]) + return ( + _RandomNumberGeneratorBase._kernels(self) + + [self.module.get_function("prepare")] + + [ + self.module.get_function("skip_ahead_sequence"), + self.module.get_function("skip_ahead_sequence_array"), + ] + ) -def seed_getter_uniform(N): - result = pycuda.gpuarray.empty([N], np.int32) +def seed_getter_uniform(n): + result = pycuda.gpuarray.empty([n], np.int32) import random - value = random.randint(0, 2**31-1) + + value = random.randint(0, 2 ** 31 - 1) return result.fill(value) -def seed_getter_unique(N): - result = np.random.randint(0, 2**31-1, N).astype(np.int32) + +def seed_getter_unique(n): + result = np.random.randint(0, 2 ** 31 - 1, n).astype(np.int32) return pycuda.gpuarray.to_gpu(result) + xorwow_random_source = """ extern "C" { __global__ void prepare(%(state_type)s *s, const int n, @@ -755,6 +888,7 @@ __global__ void skip_ahead_sequence_array(%(state_type)s *s, const int n, const """ if get_curand_version() >= (3, 2, 0): + class XORWOWRandomNumberGenerator(_PseudoRandomNumberGeneratorBase): has_box_muller = True @@ -765,9 +899,16 @@ if get_curand_version() >= (3, 2, 0): """ super(XORWOWRandomNumberGenerator, self).__init__( - seed_getter, offset, - 'curandStateXORWOW', 'unsigned int', 32, xorwow_random_source+ - xorwow_skip_ahead_sequence_source+random_skip_ahead64_source) + seed_getter, + offset, + "curandStateXORWOW", + "unsigned int", + 32, + xorwow_random_source + + xorwow_skip_ahead_sequence_source + + random_skip_ahead64_source, + ) + # }}} @@ -818,6 +959,7 @@ __global__ void skip_ahead_subsequence_array(%(state_type)s *s, const int n, con """ if get_curand_version() >= (4, 1, 0): + class MRG32k3aRandomNumberGenerator(_PseudoRandomNumberGeneratorBase): has_box_muller = True @@ -828,40 +970,65 @@ if get_curand_version() >= (4, 1, 0): """ super(MRG32k3aRandomNumberGenerator, self).__init__( - seed_getter, offset, - 'curandStateMRG32k3a', 'unsigned int', 32, mrg32k3a_random_source+ - mrg32k3a_skip_ahead_sequence_source+random_skip_ahead64_source) + seed_getter, + offset, + "curandStateMRG32k3a", + "unsigned int", + 32, + mrg32k3a_random_source + + mrg32k3a_skip_ahead_sequence_source + + random_skip_ahead64_source, + ) def _prepare_skipahead(self): super(MRG32k3aRandomNumberGenerator, self)._prepare_skipahead() - self.skip_ahead_subsequence = self.module.get_function("skip_ahead_subsequence") + self.skip_ahead_subsequence = self.module.get_function( + "skip_ahead_subsequence" + ) self.skip_ahead_subsequence.prepare("PiQ") - self.skip_ahead_subsequence_array = self.module.get_function("skip_ahead_subsequence_array") + self.skip_ahead_subsequence_array = self.module.get_function( + "skip_ahead_subsequence_array" + ) self.skip_ahead_subsequence_array.prepare("PiP") def call_skip_ahead_subsequence(self, i, stream=None): self.skip_ahead_subsequence.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block * self.block_count, i) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block * self.block_count, + i, + ) def call_skip_ahead_subsequence_array(self, i, stream=None): self.skip_ahead_subsequence_array.prepared_async_call( - (self.block_count, 1), (self.generators_per_block, 1, 1), stream, - self.state, self.generators_per_block * self.block_count, i.gpudata) + (self.block_count, 1), + (self.generators_per_block, 1, 1), + stream, + self.state, + self.generators_per_block * self.block_count, + i.gpudata, + ) def _kernels(self): - return (_PseudoRandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("skip_ahead_subsequence"), - self.module.get_function("skip_ahead_subsequence_array")]) + return _PseudoRandomNumberGeneratorBase._kernels(self) + [ + self.module.get_function("skip_ahead_subsequence"), + self.module.get_function("skip_ahead_subsequence_array"), + ] + # }}} # {{{ Sobol RNG + def generate_direction_vectors(count, direction=None): if get_curand_version() >= (4, 0, 0): - if direction == direction_vector_set.VECTOR_64 or \ - direction == direction_vector_set.SCRAMBLED_VECTOR_64: + if ( + direction == direction_vector_set.VECTOR_64 + or direction == direction_vector_set.SCRAMBLED_VECTOR_64 + ): result = np.empty((count, 64), dtype=np.uint64) else: result = np.empty((count, 32), dtype=np.uint32) @@ -870,17 +1037,20 @@ def generate_direction_vectors(count, direction=None): _get_direction_vectors(direction, result, count) return pycuda.gpuarray.to_gpu(result) + if get_curand_version() >= (4, 0, 0): + def generate_scramble_constants32(count): - result = np.empty((count, ), dtype=np.uint32) + result = np.empty((count,), dtype=np.uint32) _get_scramble_constants32(result, count) return pycuda.gpuarray.to_gpu(result) def generate_scramble_constants64(count): - result = np.empty((count, ), dtype=np.uint64) + result = np.empty((count,), dtype=np.uint64) _get_scramble_constants64(result, count) return pycuda.gpuarray.to_gpu(result) + sobol_random_source = """ extern "C" { __global__ void prepare(%(state_type)s *s, const int n, @@ -893,6 +1063,7 @@ __global__ void prepare(%(state_type)s *s, const int n, } """ + class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 3.2. @@ -902,25 +1073,40 @@ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): has_box_muller = False - def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size, - dir_vector_set, offset, state_type, vector_type, generator_bits, - sobol_random_source): - super(_SobolRandomNumberGeneratorBase, self).__init__(state_type, - vector_type, generator_bits, sobol_random_source) + def __init__( + self, + dir_vector, + dir_vector_dtype, + dir_vector_size, + dir_vector_set, + offset, + state_type, + vector_type, + generator_bits, + sobol_random_source, + ): + super(_SobolRandomNumberGeneratorBase, self).__init__( + state_type, vector_type, generator_bits, sobol_random_source + ) if dir_vector is None: dir_vector = generate_direction_vectors( - self.block_count * self.generators_per_block, dir_vector_set) - - if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray) - and dir_vector.dtype == dir_vector_dtype - and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)): + self.block_count * self.generators_per_block, dir_vector_set + ) + + if not ( + isinstance(dir_vector, pycuda.gpuarray.GPUArray) + and dir_vector.dtype == dir_vector_dtype + and dir_vector.shape + == (self.block_count * self.generators_per_block, dir_vector_size) + ): raise TypeError("seed must be GPUArray of integers of right length") p = self.module.get_function("prepare") p.prepare("PiPi") from pycuda.characterize import has_stack + has_stack = has_stack() if has_stack: @@ -928,11 +1114,16 @@ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): try: if has_stack: - drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k + drv.Context.set_limit(drv.limit.STACK_SIZE, 1 << 14) # 16k try: - p.prepared_call((self.block_count, 1), (self.generators_per_block, 1, 1), - self.state, self.block_count * self.generators_per_block, - dir_vector.gpudata, offset) + p.prepared_call( + (self.block_count, 1), + (self.generators_per_block, 1, 1), + self.state, + self.block_count * self.generators_per_block, + dir_vector.gpudata, + offset, + ) except drv.LaunchError: raise ValueError("Initialisation failed. Decrease number of threads.") @@ -941,8 +1132,10 @@ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): - return (_RandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("prepare")]) + return _RandomNumberGeneratorBase._kernels(self) + [ + self.module.get_function("prepare") + ] + scrambledsobol_random_source = """ extern "C" { @@ -956,6 +1149,7 @@ __global__ void prepare( %(state_type)s *s, const int n, } """ + class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 4.0. @@ -965,36 +1159,55 @@ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): has_box_muller = False - def __init__(self, dir_vector, dir_vector_dtype, dir_vector_size, - dir_vector_set, scramble_vector, scramble_vector_function, - offset, state_type, vector_type, generator_bits, scramble_type, - sobol_random_source): - super(_ScrambledSobolRandomNumberGeneratorBase, self).__init__(state_type, - vector_type, generator_bits, sobol_random_source, scramble_type) + def __init__( + self, + dir_vector, + dir_vector_dtype, + dir_vector_size, + dir_vector_set, + scramble_vector, + scramble_vector_function, + offset, + state_type, + vector_type, + generator_bits, + scramble_type, + sobol_random_source, + ): + super(_ScrambledSobolRandomNumberGeneratorBase, self).__init__( + state_type, vector_type, generator_bits, sobol_random_source, scramble_type + ) if dir_vector is None: dir_vector = generate_direction_vectors( - self.block_count * self.generators_per_block, - dir_vector_set) + self.block_count * self.generators_per_block, dir_vector_set + ) if scramble_vector is None: scramble_vector = scramble_vector_function( - self.block_count * self.generators_per_block) - - if not (isinstance(dir_vector, pycuda.gpuarray.GPUArray) - and dir_vector.dtype == dir_vector_dtype - and dir_vector.shape == (self.block_count * self.generators_per_block, dir_vector_size)): + self.block_count * self.generators_per_block + ) + + if not ( + isinstance(dir_vector, pycuda.gpuarray.GPUArray) + and dir_vector.dtype == dir_vector_dtype + and dir_vector.shape + == (self.block_count * self.generators_per_block, dir_vector_size) + ): raise TypeError("seed must be GPUArray of integers of right length") - if not (isinstance(scramble_vector, pycuda.gpuarray.GPUArray) - and scramble_vector.dtype == dir_vector_dtype - and scramble_vector.shape == (self.block_count * self.generators_per_block, )): + if not ( + isinstance(scramble_vector, pycuda.gpuarray.GPUArray) + and scramble_vector.dtype == dir_vector_dtype + and scramble_vector.shape == (self.block_count * self.generators_per_block,) + ): raise TypeError("scramble must be GPUArray of integers of right length") p = self.module.get_function("prepare") p.prepare("PiPPi") from pycuda.characterize import has_stack + has_stack = has_stack() if has_stack: @@ -1002,11 +1215,17 @@ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): try: if has_stack: - drv.Context.set_limit(drv.limit.STACK_SIZE, 1<<14) # 16k + drv.Context.set_limit(drv.limit.STACK_SIZE, 1 << 14) # 16k try: - p.prepared_call((self.block_count, 1), (self.generators_per_block, 1, 1), - self.state, self.block_count * self.generators_per_block, - dir_vector.gpudata, scramble_vector.gpudata, offset) + p.prepared_call( + (self.block_count, 1), + (self.generators_per_block, 1, 1), + self.state, + self.block_count * self.generators_per_block, + dir_vector.gpudata, + scramble_vector.gpudata, + offset, + ) except drv.LaunchError: raise ValueError("Initialisation failed. Decrease number of threads.") @@ -1015,10 +1234,13 @@ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): - return (_RandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("prepare")]) + return _RandomNumberGeneratorBase._kernels(self) + [ + self.module.get_function("prepare") + ] + if get_curand_version() >= (3, 2, 0): + class Sobol32RandomNumberGenerator(_SobolRandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 3.2. @@ -1027,14 +1249,24 @@ if get_curand_version() >= (3, 2, 0): """ def __init__(self, dir_vector=None, offset=0): - super(Sobol32RandomNumberGenerator, self).__init__(dir_vector, - np.uint32, 32, direction_vector_set.VECTOR_32, offset, - 'curandStateSobol32', 'curandDirectionVectors32_t', 32, - sobol_random_source+random_skip_ahead32_source) + super(Sobol32RandomNumberGenerator, self).__init__( + dir_vector, + np.uint32, + 32, + direction_vector_set.VECTOR_32, + offset, + "curandStateSobol32", + "curandDirectionVectors32_t", + 32, + sobol_random_source + random_skip_ahead32_source, + ) if get_curand_version() >= (4, 0, 0): - class ScrambledSobol32RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase): + + class ScrambledSobol32RandomNumberGenerator( + _ScrambledSobolRandomNumberGeneratorBase + ): """ Class surrounding CURAND kernels from CUDA 4.0. It allows for generating quasi-random numbers with uniform @@ -1042,14 +1274,24 @@ if get_curand_version() >= (4, 0, 0): """ def __init__(self, dir_vector=None, scramble_vector=None, offset=0): - super(ScrambledSobol32RandomNumberGenerator, self).__init__(dir_vector, - np.uint32, 32, direction_vector_set.SCRAMBLED_VECTOR_32, - scramble_vector, generate_scramble_constants32, offset, - 'curandStateScrambledSobol32', 'curandDirectionVectors32_t', - 32, 'unsigned int', - scrambledsobol_random_source+random_skip_ahead32_source) + super(ScrambledSobol32RandomNumberGenerator, self).__init__( + dir_vector, + np.uint32, + 32, + direction_vector_set.SCRAMBLED_VECTOR_32, + scramble_vector, + generate_scramble_constants32, + offset, + "curandStateScrambledSobol32", + "curandDirectionVectors32_t", + 32, + "unsigned int", + scrambledsobol_random_source + random_skip_ahead32_source, + ) + if get_curand_version() >= (4, 0, 0): + class Sobol64RandomNumberGenerator(_SobolRandomNumberGeneratorBase): """ Class surrounding CURAND kernels from CUDA 4.0. @@ -1058,13 +1300,24 @@ if get_curand_version() >= (4, 0, 0): """ def __init__(self, dir_vector=None, offset=0): - super(Sobol64RandomNumberGenerator, self).__init__(dir_vector, - np.uint64, 64, direction_vector_set.VECTOR_64, offset, - 'curandStateSobol64', 'curandDirectionVectors64_t', 64, - sobol_random_source+random_skip_ahead64_source) + super(Sobol64RandomNumberGenerator, self).__init__( + dir_vector, + np.uint64, + 64, + direction_vector_set.VECTOR_64, + offset, + "curandStateSobol64", + "curandDirectionVectors64_t", + 64, + sobol_random_source + random_skip_ahead64_source, + ) + if get_curand_version() >= (4, 0, 0): - class ScrambledSobol64RandomNumberGenerator(_ScrambledSobolRandomNumberGeneratorBase): + + class ScrambledSobol64RandomNumberGenerator( + _ScrambledSobolRandomNumberGeneratorBase + ): """ Class surrounding CURAND kernels from CUDA 4.0. It allows for generating quasi-random numbers with uniform @@ -1072,19 +1325,25 @@ if get_curand_version() >= (4, 0, 0): """ def __init__(self, dir_vector=None, scramble_vector=None, offset=0): - super(ScrambledSobol64RandomNumberGenerator, self).__init__(dir_vector, - np.uint64, 64, direction_vector_set.SCRAMBLED_VECTOR_64, - scramble_vector, generate_scramble_constants64, offset, - 'curandStateScrambledSobol64', 'curandDirectionVectors64_t', - 64, 'unsigned long long', - scrambledsobol_random_source+random_skip_ahead64_source) + super(ScrambledSobol64RandomNumberGenerator, self).__init__( + dir_vector, + np.uint64, + 64, + direction_vector_set.SCRAMBLED_VECTOR_64, + scramble_vector, + generate_scramble_constants64, + offset, + "curandStateScrambledSobol64", + "curandDirectionVectors64_t", + 64, + "unsigned long long", + scrambledsobol_random_source + random_skip_ahead64_source, + ) -# }}} # }}} - - +# }}} # vim: foldmethod=marker diff --git a/pycuda/debug.py b/pycuda/debug.py index b2a90b9b..fded1d1e 100644 --- a/pycuda/debug.py +++ b/pycuda/debug.py @@ -1,13 +1,14 @@ from __future__ import absolute_import from __future__ import print_function import pycuda.driver -pycuda.driver.set_debugging() import sys - from optparse import OptionParser -parser = OptionParser( - usage="usage: %prog [options] SCRIPT-TO-RUN [SCRIPT-ARGUMENTS]") +from os.path import exists + +pycuda.driver.set_debugging() + +parser = OptionParser(usage="usage: %prog [options] SCRIPT-TO-RUN [SCRIPT-ARGUMENTS]") parser.disable_interspersed_args() options, args = parser.parse_args() @@ -16,12 +17,12 @@ if len(args) < 1: parser.print_help() sys.exit(2) -mainpyfile = args[0] -from os.path import exists +mainpyfile = args[0] + if not exists(mainpyfile): - print('Error:', mainpyfile, 'does not exist') + print("Error:", mainpyfile, "does not exist") sys.exit(1) sys.argv = args -exec(compile(open(mainpyfile).read(), mainpyfile, 'exec')) +exec(compile(open(mainpyfile).read(), mainpyfile, "exec")) diff --git a/pycuda/driver.py b/pycuda/driver.py index fd042a75..c625d81b 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -10,6 +10,7 @@ import numpy as np # {{{ add cuda lib dir to Python DLL path + def _search_on_path(filenames): """Find file on system path.""" # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224 @@ -32,7 +33,7 @@ def _add_cuda_libdir_to_dll_path(): cuda_path = os.environ.get("CUDA_PATH") if cuda_path is not None: - os.add_dll_directory(join(cuda_path, 'bin')) + os.add_dll_directory(join(cuda_path, "bin")) return nvcc_path = _search_on_path(["nvcc.exe"]) @@ -40,10 +41,13 @@ def _add_cuda_libdir_to_dll_path(): os.add_dll_directory(dirname(nvcc_path)) from warnings import warn - warn("Unable to discover CUDA installation directory " - "while attempting to add it to Python's DLL path. " - "Either set the 'CUDA_PATH' environment variable " - "or ensure that 'nvcc.exe' is on the path.") + + warn( + "Unable to discover CUDA installation directory " + "while attempting to add it to Python's DLL path. " + "Either set the 'CUDA_PATH' environment variable " + "or ensure that 'nvcc.exe' is on the path." + ) try: @@ -63,9 +67,12 @@ try: except ImportError as e: if "_v2" in str(e): from warnings import warn - warn("Failed to import the CUDA driver interface, with an error " - "message indicating that the version of your CUDA header " - "does not match the version of your CUDA driver.") + + warn( + "Failed to import the CUDA driver interface, with an error " + "message indicating that the version of your CUDA header " + "does not match the version of your CUDA driver." + ) raise @@ -127,7 +134,10 @@ class ArgumentHandler(object): try: self.dev_alloc = mem_alloc_like(self.array) except AttributeError: - raise TypeError("could not determine array length of '%s': unsupported array type or not an array" % type(self.array)) + raise TypeError( + "could not determine array length of '%s': unsupported array type or not an array" + % type(self.array) + ) return self.dev_alloc def pre_call(self, stream): @@ -155,7 +165,6 @@ class InOut(In, Out): def _add_functionality(): - def device_get_attributes(dev): result = {} @@ -169,8 +178,11 @@ def _add_functionality(): att_value = dev.get_attribute(att_id) except LogicError as e: from warnings import warn - warn("CUDA driver raised '%s' when querying '%s' on '%s'" - % (e, att_name, dev)) + + warn( + "CUDA driver raised '%s' when querying '%s' on '%s'" + % (e, att_name, dev) + ) else: result[att_id] = att_value @@ -216,6 +228,7 @@ def _add_functionality(): format += "P" from pycuda._pvt_struct import pack + return handlers, pack(format, *arg_data) # {{{ pre-CUDA 4 call interface (stateful) @@ -240,8 +253,8 @@ def _add_functionality(): if kwargs: raise ValueError( - "extra keyword arguments: %s" - % (",".join(six.iterkeys(kwargs)))) + "extra keyword arguments: %s" % (",".join(six.iterkeys(kwargs))) + ) if block is None: raise ValueError("must specify block size") @@ -257,22 +270,23 @@ def _add_functionality(): for texref in texrefs: func.param_set_texref(texref) - post_handlers = [handler - for handler in handlers - if hasattr(handler, "post_call")] + post_handlers = [ + handler for handler in handlers if hasattr(handler, "post_call") + ] if stream is None: if time_kernel: Context.synchronize() from time import time + start_time = time() func._launch_grid(*grid) if post_handlers or time_kernel: Context.synchronize() if time_kernel: - run_time = time()-start_time + run_time = time() - start_time for handler in post_handlers: handler.post_call(stream) @@ -280,25 +294,32 @@ def _add_functionality(): if time_kernel: return run_time else: - assert not time_kernel, \ - "Can't time the kernel on an asynchronous invocation" + assert ( + not time_kernel + ), "Can't time the kernel on an asynchronous invocation" func._launch_grid_async(grid[0], grid[1], stream) if post_handlers: for handler in post_handlers: handler.post_call(stream) - def function_prepare_pre_v4(func, arg_types, block=None, - shared=None, texrefs=[]): + def function_prepare_pre_v4(func, arg_types, block=None, shared=None, texrefs=[]): from warnings import warn + if block is not None: - warn("setting the block size in Function.prepare is deprecated", - DeprecationWarning, stacklevel=2) + warn( + "setting the block size in Function.prepare is deprecated", + DeprecationWarning, + stacklevel=2, + ) func._set_block_shape(*block) if shared is not None: - warn("setting the shared memory size in Function.prepare is deprecated", - DeprecationWarning, stacklevel=2) + warn( + "setting the shared memory size in Function.prepare is deprecated", + DeprecationWarning, + stacklevel=2, + ) func._set_shared_size(shared) func.texrefs = texrefs @@ -306,8 +327,11 @@ def _add_functionality(): func.arg_format = "" for i, arg_type in enumerate(arg_types): - if (isinstance(arg_type, type) - and np is not None and np.number in arg_type.__mro__): + if ( + isinstance(arg_type, type) + and np is not None + and np.number in arg_type.__mro__ + ): func.arg_format += np.dtype(arg_type).char elif isinstance(arg_type, str): func.arg_format += arg_type @@ -315,6 +339,7 @@ def _add_functionality(): func.arg_format += np.dtype(np.uintp).char from pycuda._pvt_struct import calcsize + func._param_set_size(calcsize(func.arg_format)) return func @@ -324,8 +349,13 @@ def _add_functionality(): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_call is deprecated as of " - "version 2011.1.", DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_call is deprecated as of " + "version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (block,) + args shared_size = kwargs.pop("shared_size", None) @@ -333,10 +363,12 @@ def _add_functionality(): func._set_shared_size(shared_size) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + ) from pycuda._pvt_struct import pack + func._param_setv(0, pack(func.arg_format, *args)) for texref in func.texrefs: @@ -349,9 +381,13 @@ def _add_functionality(): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_timed_call is " - "deprecated as of version 2011.1.", - DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_timed_call is " + "deprecated as of version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (block,) + args shared_size = kwargs.pop("shared_size", None) @@ -359,10 +395,12 @@ def _add_functionality(): func._set_shared_size(shared_size) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + ) from pycuda._pvt_struct import pack + func._param_setv(0, pack(func.arg_format, *args)) for texref in func.texrefs: @@ -377,19 +415,22 @@ def _add_functionality(): def get_call_time(): end.synchronize() - return end.time_since(start)*1e-3 + return end.time_since(start) * 1e-3 return get_call_time - def function_prepared_async_call_pre_v4(func, grid, block, stream, - *args, **kwargs): + def function_prepared_async_call_pre_v4(func, grid, block, stream, *args, **kwargs): if isinstance(block, tuple): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_async_call is " - "deprecated as of version 2011.1.", - DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_async_call is " + "deprecated as of version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (stream,) + args stream = block @@ -398,10 +439,12 @@ def _add_functionality(): func._set_shared_size(shared_size) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + ) from pycuda._pvt_struct import pack + func._param_setv(0, pack(func.arg_format, *args)) for texref in func.texrefs: @@ -427,8 +470,8 @@ def _add_functionality(): if kwargs: raise ValueError( - "extra keyword arguments: %s" - % (",".join(six.iterkeys(kwargs)))) + "extra keyword arguments: %s" % (",".join(six.iterkeys(kwargs))) + ) if block is None: raise ValueError("must specify block size") @@ -442,15 +485,16 @@ def _add_functionality(): for texref in texrefs: func.param_set_texref(texref) - post_handlers = [handler - for handler in handlers - if hasattr(handler, "post_call")] + post_handlers = [ + handler for handler in handlers if hasattr(handler, "post_call") + ] if stream is None: if time_kernel: Context.synchronize() from time import time + start_time = time() func._launch_kernel(grid, block, arg_buf, shared, None) @@ -459,7 +503,7 @@ def _add_functionality(): Context.synchronize() if time_kernel: - run_time = time()-start_time + run_time = time() - start_time for handler in post_handlers: handler.post_call(stream) @@ -467,8 +511,9 @@ def _add_functionality(): if time_kernel: return run_time else: - assert not time_kernel, \ - "Can't time the kernel on an asynchronous invocation" + assert ( + not time_kernel + ), "Can't time the kernel on an asynchronous invocation" func._launch_kernel(grid, block, arg_buf, shared, stream) if post_handlers: @@ -481,8 +526,7 @@ def _add_functionality(): func.arg_format = "" for i, arg_type in enumerate(arg_types): - if (isinstance(arg_type, type) - and np.number in arg_type.__mro__): + if isinstance(arg_type, type) and np.number in arg_type.__mro__: func.arg_format += np.dtype(arg_type).char elif isinstance(arg_type, np.dtype): if arg_type.char == "V": @@ -501,17 +545,24 @@ def _add_functionality(): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_call is deprecated as of " - "version 2011.1.", DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_call is deprecated as of " + "version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (block,) + args shared_size = kwargs.pop("shared_size", 0) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + ) from pycuda._pvt_struct import pack + arg_buf = pack(func.arg_format, *args) for texref in func.texrefs: @@ -522,10 +573,12 @@ def _add_functionality(): def function_prepared_timed_call(func, grid, block, *args, **kwargs): shared_size = kwargs.pop("shared_size", 0) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + ) from pycuda._pvt_struct import pack + arg_buf = pack(func.arg_format, *args) for texref in func.texrefs: @@ -540,7 +593,7 @@ def _add_functionality(): def get_call_time(): end.synchronize() - return end.time_since(start)*1e-3 + return end.time_since(start) * 1e-3 return get_call_time @@ -549,19 +602,25 @@ def _add_functionality(): func._set_block_shape(*block) else: from warnings import warn - warn("Not passing the block size to prepared_async_call is " - "deprecated as of version 2011.1.", - DeprecationWarning, stacklevel=2) + + warn( + "Not passing the block size to prepared_async_call is " + "deprecated as of version 2011.1.", + DeprecationWarning, + stacklevel=2, + ) args = (stream,) + args stream = block shared_size = kwargs.pop("shared_size", 0) if kwargs: - raise TypeError("unknown keyword arguments: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + ) from pycuda._pvt_struct import pack + arg_buf = pack(func.arg_format, *args) for texref in func.texrefs: @@ -587,9 +646,13 @@ def _add_functionality(): def mark_func_method_deprecated(func): def new_func(*args, **kwargs): from warnings import warn - warn("'%s' has been deprecated in version 2011.1. Please use " - "the stateless launch interface instead." % func.__name__[1:], - DeprecationWarning, stacklevel=2) + + warn( + "'%s' has been deprecated in version 2011.1. Please use " + "the stateless launch interface instead." % func.__name__[1:], + DeprecationWarning, + stacklevel=2, + ) return func(*args, **kwargs) try: @@ -599,7 +662,7 @@ def _add_functionality(): else: try: update_wrapper(new_func, func) - except: + except Exception: # User won't see true signature. Oh well. pass @@ -622,12 +685,23 @@ def _add_functionality(): Function.prepared_timed_call = function_prepared_timed_call_pre_v4 Function.prepared_async_call = function_prepared_async_call_pre_v4 - for meth_name in ["set_block_shape", "set_shared_size", - "param_set_size", "param_set", "param_seti", "param_setf", - "param_setv", - "launch", "launch_grid", "launch_grid_async"]: - setattr(Function, meth_name, mark_func_method_deprecated( - getattr(Function, "_"+meth_name))) + for meth_name in [ + "set_block_shape", + "set_shared_size", + "param_set_size", + "param_set", + "param_seti", + "param_setf", + "param_setv", + "launch", + "launch_grid", + "launch_grid_async", + ]: + setattr( + Function, + meth_name, + mark_func_method_deprecated(getattr(Function, "_" + meth_name)), + ) Function.__getattr__ = function___getattr__ @@ -637,6 +711,7 @@ _add_functionality() # {{{ pagelocked numpy arrays + def pagelocked_zeros(shape, dtype, order="C", mem_flags=0): result = pagelocked_empty(shape, dtype, order, mem_flags) result.fill(0) @@ -659,11 +734,13 @@ def pagelocked_zeros_like(array, mem_flags=0): result.fill(0) return result + # }}} # {{{ aligned numpy arrays + def aligned_zeros(shape, dtype, order="C", alignment=4096): result = aligned_empty(shape, dtype, order, alignment) result.fill(0) @@ -686,11 +763,13 @@ def aligned_zeros_like(array, alignment=4096): result.fill(0) return result + # }}} # {{{ managed numpy arrays (CUDA Unified Memory) + def managed_zeros(shape, dtype, order="C", mem_flags=0): result = managed_empty(shape, dtype, order, mem_flags) result.fill(0) @@ -713,6 +792,7 @@ def managed_zeros_like(array, mem_flags=0): result.fill(0) return result + # }}} @@ -722,6 +802,7 @@ def mem_alloc_like(ary): # {{{ array handling + def dtype_to_array_format(dtype): if dtype == np.uint8: return array_format.UNSIGNED_INT8 @@ -738,9 +819,7 @@ def dtype_to_array_format(dtype): elif dtype == np.float32: return array_format.FLOAT else: - raise TypeError( - "cannot convert dtype '%s' to array format" - % dtype) + raise TypeError("cannot convert dtype '%s' to array format" % dtype) def matrix_to_array(matrix, order, allow_double_hack=False): @@ -771,22 +850,24 @@ def matrix_to_array(matrix, order, allow_double_hack=False): copy = Memcpy2D() copy.set_src_host(matrix) copy.set_dst_array(ary) - copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = \ - matrix.strides[stride] + copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = matrix.strides[stride] copy.height = h copy(aligned=True) return ary -def np_to_array(nparray, order, allowSurfaceBind=False): - case = order in ["C","F"] + +def np_to_array(nparray, order, allowSurfaceBind=False): # noqa: N803 + case = order in ["C", "F"] if not case: raise LogicError("order must be either F or C") dimension = len(nparray.shape) if dimension == 2: - if order == "C": stride = 0 - if order == "F": stride = -1 + if order == "C": + stride = 0 + if order == "F": + stride = -1 h, w = nparray.shape d = 1 if allowSurfaceBind: @@ -799,35 +880,46 @@ def np_to_array(nparray, order, allowSurfaceBind=False): descrArr.width = w descrArr.height = h elif dimension == 3: - if order == "C": stride = 1 - if order == "F": stride = 1 + if order == "C": + stride = 1 + if order == "F": + stride = 1 d, h, w = nparray.shape descrArr = ArrayDescriptor3D() descrArr.width = w descrArr.height = h descrArr.depth = d else: - raise LogicError("CUDArrays dimensions 2 or 3 supported in CUDA at the moment ... ") + raise LogicError( + "CUDArrays dimensions 2 or 3 supported in CUDA at the moment ... " + ) if nparray.dtype == np.complex64: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int2 (hi=re,lo=im) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int2 (hi=re,lo=im) structure descrArr.num_channels = 2 elif nparray.dtype == np.float64: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int2 (hi,lo) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int2 (hi,lo) structure descrArr.num_channels = 2 elif nparray.dtype == np.complex128: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure descrArr.num_channels = 4 else: descrArr.format = dtype_to_array_format(nparray.dtype) descrArr.num_channels = 1 if allowSurfaceBind: - if dimension==2: descrArr.flags |= array3d_flags.ARRAY3D_LAYERED + if dimension == 2: + descrArr.flags |= array3d_flags.ARRAY3D_LAYERED descrArr.flags |= array3d_flags.SURFACE_LDST cudaArray = Array(descrArr) - if allowSurfaceBind or dimension==3: + if allowSurfaceBind or dimension == 3: copy3D = Memcpy3D() copy3D.set_src_host(nparray) copy3D.set_dst_array(cudaArray) @@ -845,15 +937,18 @@ def np_to_array(nparray, order, allowSurfaceBind=False): copy2D(aligned=True) return cudaArray -def gpuarray_to_array(gpuarray, order, allowSurfaceBind=False): - case = order in ["C","F"] + +def gpuarray_to_array(gpuarray, order, allowSurfaceBind=False): # noqa: N803 + case = order in ["C", "F"] if not case: raise LogicError("order must be either F or C") dimension = len(gpuarray.shape) if dimension == 2: - if order == "C": stride = 0 - if order == "F": stride = -1 + if order == "C": + stride = 0 + if order == "F": + stride = -1 h, w = gpuarray.shape d = 1 if allowSurfaceBind: @@ -866,35 +961,46 @@ def gpuarray_to_array(gpuarray, order, allowSurfaceBind=False): descrArr.width = int(w) descrArr.height = int(h) elif dimension == 3: - if order == "C": stride = 1 - if order == "F": stride = 1 + if order == "C": + stride = 1 + if order == "F": + stride = 1 d, h, w = gpuarray.shape descrArr = ArrayDescriptor3D() descrArr.width = int(w) descrArr.height = int(h) descrArr.depth = int(d) else: - raise LogicError("CUDArray dimensions 2 and 3 supported in CUDA at the moment ... ") + raise LogicError( + "CUDArray dimensions 2 and 3 supported in CUDA at the moment ... " + ) if gpuarray.dtype == np.complex64: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int2 (hi=re,lo=im) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int2 (hi=re,lo=im) structure descrArr.num_channels = 2 elif gpuarray.dtype == np.float64: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int2 (hi,lo) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int2 (hi,lo) structure descrArr.num_channels = 2 elif gpuarray.dtype == np.complex128: - descrArr.format = array_format.SIGNED_INT32 # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure + descrArr.format = ( + array_format.SIGNED_INT32 + ) # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure descrArr.num_channels = 4 else: descrArr.format = dtype_to_array_format(gpuarray.dtype) descrArr.num_channels = 1 if allowSurfaceBind: - if dimension==2: descrArr.flags |= array3d_flags.ARRAY3D_LAYERED + if dimension == 2: + descrArr.flags |= array3d_flags.ARRAY3D_LAYERED descrArr.flags |= array3d_flags.SURFACE_LDST cudaArray = Array(descrArr) - if allowSurfaceBind or dimension==3: + if allowSurfaceBind or dimension == 3: copy3D = Memcpy3D() copy3D.set_src_device(gpuarray.ptr) copy3D.set_dst_array(cudaArray) @@ -912,6 +1018,7 @@ def gpuarray_to_array(gpuarray, order, allowSurfaceBind=False): copy2D(aligned=True) return cudaArray + def make_multichannel_2d_array(ndarray, order): """Channel count has to be the first dimension of the C{ndarray}.""" @@ -936,8 +1043,7 @@ def make_multichannel_2d_array(ndarray, order): copy = Memcpy2D() copy.set_src_host(ndarray) copy.set_dst_array(ary) - copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = \ - ndarray.strides[stride] + copy.width_in_bytes = copy.src_pitch = copy.dst_pitch = ndarray.strides[stride] copy.height = h copy(aligned=True) @@ -950,6 +1056,7 @@ def bind_array_to_texref(ary, texref): texref.set_address_mode(1, address_mode.CLAMP) texref.set_filter_mode(filter_mode.POINT) + # }}} @@ -959,8 +1066,10 @@ def matrix_to_texref(matrix, texref, order): # {{{ device copies + def to_device(bf_obj): import sys + if sys.version_info >= (2, 7): bf = memoryview(bf_obj).tobytes() else: @@ -981,6 +1090,7 @@ def from_device_like(devptr, other_ary): memcpy_dtoh(result, devptr) return result + # }}} # vim: fdm=marker diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index ab451ade..d560e5dd 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -38,11 +38,20 @@ from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg from pytools import memoize_method -def get_elwise_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): +def get_elwise_module( + arguments, + operation, + name="kernel", + keep=False, + options=None, + preamble="", + loop_prep="", + after_loop="", +): from pycuda.compiler import SourceModule - return SourceModule(""" + + return SourceModule( + """ #include %(preamble)s @@ -65,22 +74,35 @@ def get_elwise_module(arguments, operation, %(after_loop)s; } - """ % { + """ + % { "arguments": ", ".join(arg.declarator() for arg in arguments), "operation": operation, "name": name, "preamble": preamble, "loop_prep": loop_prep, "after_loop": after_loop, - }, - options=options, keep=keep, no_extern_c=True) - - -def get_elwise_range_module(arguments, operation, - name="kernel", keep=False, options=None, - preamble="", loop_prep="", after_loop=""): + }, + options=options, + keep=keep, + no_extern_c=True, + ) + + +def get_elwise_range_module( + arguments, + operation, + name="kernel", + keep=False, + options=None, + preamble="", + loop_prep="", + after_loop="", +): from pycuda.compiler import SourceModule - return SourceModule(""" + + return SourceModule( + """ #include %(preamble)s @@ -114,29 +136,43 @@ def get_elwise_range_module(arguments, operation, %(after_loop)s; } - """ % { + """ + % { "arguments": ", ".join(arg.declarator() for arg in arguments), "operation": operation, "name": name, "preamble": preamble, "loop_prep": loop_prep, "after_loop": after_loop, - }, - options=options, keep=keep, no_extern_c=True) - - -def get_elwise_kernel_and_types(arguments, operation, - name="kernel", keep=False, options=None, use_range=False, **kwargs): + }, + options=options, + keep=keep, + no_extern_c=True, + ) + + +def get_elwise_kernel_and_types( + arguments, + operation, + name="kernel", + keep=False, + options=None, + use_range=False, + **kwargs +): if isinstance(arguments, str): from pycuda.tools import parse_c_arg + arguments = [parse_c_arg(arg) for arg in arguments.split(",")] if use_range: - arguments.extend([ - ScalarArg(np.intp, "start"), - ScalarArg(np.intp, "stop"), - ScalarArg(np.intp, "step"), - ]) + arguments.extend( + [ + ScalarArg(np.intp, "start"), + ScalarArg(np.intp, "stop"), + ScalarArg(np.intp, "step"), + ] + ) else: arguments.append(ScalarArg(np.uintp, "n")) @@ -145,8 +181,7 @@ def get_elwise_kernel_and_types(arguments, operation, else: module_builder = get_elwise_module - mod = module_builder(arguments, operation, name, - keep, options, **kwargs) + mod = module_builder(arguments, operation, name, keep, options, **kwargs) func = mod.get_function(name) func.prepare("".join(arg.struct_char for arg in arguments)) @@ -154,24 +189,34 @@ def get_elwise_kernel_and_types(arguments, operation, return mod, func, arguments -def get_elwise_kernel(arguments, operation, - name="kernel", keep=False, options=None, **kwargs): +def get_elwise_kernel( + arguments, operation, name="kernel", keep=False, options=None, **kwargs +): """Return a L{pycuda.driver.Function} that performs the same scalar operation on one or several vectors. """ mod, func, arguments = get_elwise_kernel_and_types( - arguments, operation, name, keep, options, **kwargs) + arguments, operation, name, keep, options, **kwargs + ) return func class ElementwiseKernel: - def __init__(self, arguments, operation, - name="kernel", keep=False, options=None, **kwargs): + def __init__( + self, arguments, operation, name="kernel", keep=False, options=None, **kwargs + ): self.gen_kwargs = kwargs.copy() - self.gen_kwargs.update(dict(keep=keep, options=options, name=name, - operation=operation, arguments=arguments)) + self.gen_kwargs.update( + dict( + keep=keep, + options=options, + name=name, + operation=operation, + arguments=arguments, + ) + ) def get_texref(self, name, use_range=False): mod, knl, arguments = self.generate_stride_kernel_and_types(use_range=use_range) @@ -179,13 +224,14 @@ class ElementwiseKernel: @memoize_method def generate_stride_kernel_and_types(self, use_range): - mod, knl, arguments = get_elwise_kernel_and_types(use_range=use_range, - **self.gen_kwargs) + mod, knl, arguments = get_elwise_kernel_and_types( + use_range=use_range, **self.gen_kwargs + ) - assert [i for i, arg in enumerate(arguments) - if isinstance(arg, VectorArg)], \ - "ElementwiseKernel can only be used with functions that " \ - "have at least one vector argument" + assert [i for i, arg in enumerate(arguments) if isinstance(arg, VectorArg)], ( + "ElementwiseKernel can only be used with functions that " + "have at least one vector argument" + ) return mod, knl, arguments @@ -197,18 +243,22 @@ class ElementwiseKernel: stream = kwargs.pop("stream", None) if kwargs: - raise TypeError("invalid keyword arguments specified: " - + ", ".join(six.iterkeys(kwargs))) + raise TypeError( + "invalid keyword arguments specified: " + + ", ".join(six.iterkeys(kwargs)) + ) invocation_args = [] mod, func, arguments = self.generate_stride_kernel_and_types( - range_ is not None or slice_ is not None) + range_ is not None or slice_ is not None + ) for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): if not arg.flags.forc: - raise RuntimeError("elementwise kernel cannot " - "deal with non-contiguous arrays") + raise RuntimeError( + "elementwise kernel cannot " "deal with non-contiguous arrays" + ) vectors.append(arg) invocation_args.append(arg.gpudata) @@ -219,8 +269,9 @@ class ElementwiseKernel: if slice_ is not None: if range_ is not None: - raise TypeError("may not specify both range and slice " - "keyword arguments") + raise TypeError( + "may not specify both range and slice " "keyword arguments" + ) range_ = slice(*slice_.indices(repr_vec.size)) @@ -233,7 +284,8 @@ class ElementwiseKernel: invocation_args.append(range_.step) from pycuda.gpuarray import splay - grid, block = splay(abs(range_.stop - range_.start)//range_.step) + + grid, block = splay(abs(range_.stop - range_.start) // range_.step) else: block = repr_vec._block grid = repr_vec._grid @@ -245,124 +297,133 @@ class ElementwiseKernel: @context_dependent_memoize def get_take_kernel(dtype, idx_dtype, vec_count=1): ctx = { - "idx_tp": dtype_to_ctype(idx_dtype), - "tp": dtype_to_ctype(dtype), - "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), - } - - args = [VectorArg(idx_dtype, "idx")] + [ - VectorArg(dtype, "dest"+str(i))for i in range(vec_count)] + [ - ScalarArg(np.intp, "n") - ] + "idx_tp": dtype_to_ctype(idx_dtype), + "tp": dtype_to_ctype(dtype), + "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), + } + + args = ( + [VectorArg(idx_dtype, "idx")] + + [VectorArg(dtype, "dest" + str(i)) for i in range(vec_count)] + + [ScalarArg(np.intp, "n")] + ) preamble = "#include \n\n" + "\n".join( "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) - for i in range(vec_count)) - body = ( - ("%(idx_tp)s src_idx = idx[i];\n" % ctx) - + "\n".join( - "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) - for i in range(vec_count))) + for i in range(vec_count) + ) + body = ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join( + "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) + for i in range(vec_count) + ) mod = get_elwise_module(args, body, "take", preamble=preamble) func = mod.get_function("take") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] - func.prepare("P"+(vec_count*"P")+np.dtype(np.uintp).char, texrefs=tex_src) + func.prepare("P" + (vec_count * "P") + np.dtype(np.uintp).char, texrefs=tex_src) return func, tex_src @context_dependent_memoize def get_take_put_kernel(dtype, idx_dtype, with_offsets, vec_count=1): ctx = { - "idx_tp": dtype_to_ctype(idx_dtype), - "tp": dtype_to_ctype(dtype), - "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), - } + "idx_tp": dtype_to_ctype(idx_dtype), + "tp": dtype_to_ctype(dtype), + "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), + } - args = [ - VectorArg(idx_dtype, "gmem_dest_idx"), - VectorArg(idx_dtype, "gmem_src_idx"), - ] + [ - VectorArg(dtype, "dest%d" % i) - for i in range(vec_count) - ] + [ - ScalarArg(idx_dtype, "offset%d" % i) - for i in range(vec_count) if with_offsets - ] + [ScalarArg(np.intp, "n")] + args = ( + [ + VectorArg(idx_dtype, "gmem_dest_idx"), + VectorArg(idx_dtype, "gmem_src_idx"), + ] + + [VectorArg(dtype, "dest%d" % i) for i in range(vec_count)] + + [ + ScalarArg(idx_dtype, "offset%d" % i) + for i in range(vec_count) + if with_offsets + ] + + [ScalarArg(np.intp, "n")] + ) preamble = "#include \n\n" + "\n".join( "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) - for i in range(vec_count)) + for i in range(vec_count) + ) if with_offsets: + def get_copy_insn(i): - return ("dest%d[dest_idx] = " - "fp_tex1Dfetch(tex_src%d, src_idx+offset%d);" - % (i, i, i)) + return ( + "dest%d[dest_idx] = " + "fp_tex1Dfetch(tex_src%d, src_idx+offset%d);" % (i, i, i) + ) + else: + def get_copy_insn(i): - return ("dest%d[dest_idx] = " - "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)) + return "dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) - body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n" - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx) - + "\n".join(get_copy_insn(i) for i in range(vec_count))) + body = ( + "%(idx_tp)s src_idx = gmem_src_idx[i];\n" + "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + ) + "\n".join(get_copy_insn(i) for i in range(vec_count)) mod = get_elwise_module(args, body, "take_put", preamble=preamble) func = mod.get_function("take_put") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] func.prepare( - "PP"+(vec_count*"P") - + (bool(with_offsets)*vec_count*idx_dtype.char) - + np.dtype(np.uintp).char, - texrefs=tex_src) + "PP" + + (vec_count * "P") + + (bool(with_offsets) * vec_count * idx_dtype.char) + + np.dtype(np.uintp).char, + texrefs=tex_src, + ) return func, tex_src @context_dependent_memoize def get_put_kernel(dtype, idx_dtype, vec_count=1): ctx = { - "idx_tp": dtype_to_ctype(idx_dtype), - "tp": dtype_to_ctype(dtype), - } + "idx_tp": dtype_to_ctype(idx_dtype), + "tp": dtype_to_ctype(dtype), + } - args = [ + args = ( + [ VectorArg(idx_dtype, "gmem_dest_idx"), - ] + [ - VectorArg(dtype, "dest%d" % i) - for i in range(vec_count) - ] + [ - VectorArg(dtype, "src%d" % i) - for i in range(vec_count) - ] + [ScalarArg(np.intp, "n")] + ] + + [VectorArg(dtype, "dest%d" % i) for i in range(vec_count)] + + [VectorArg(dtype, "src%d" % i) for i in range(vec_count)] + + [ScalarArg(np.intp, "n")] + ) - body = ( - "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx - + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i) - for i in range(vec_count))) + body = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + "\n".join( + "dest%d[dest_idx] = src%d[i];" % (i, i) for i in range(vec_count) + ) func = get_elwise_module(args, body, "put").get_function("put") - func.prepare("P"+(2*vec_count*"P")+np.dtype(np.uintp).char) + func.prepare("P" + (2 * vec_count * "P") + np.dtype(np.uintp).char) return func @context_dependent_memoize def get_copy_kernel(dtype_dest, dtype_src): return get_elwise_kernel( - "%(tp_dest)s *dest, %(tp_src)s *src" % { - "tp_dest": dtype_to_ctype(dtype_dest), - "tp_src": dtype_to_ctype(dtype_src), - }, - "dest[i] = src[i]", - "copy") + "%(tp_dest)s *dest, %(tp_src)s *src" + % { + "tp_dest": dtype_to_ctype(dtype_dest), + "tp_src": dtype_to_ctype(dtype_src), + }, + "dest[i] = src[i]", + "copy", + ) @context_dependent_memoize -def get_linear_combination_kernel(summand_descriptors, - dtype_z): +def get_linear_combination_kernel(summand_descriptors, dtype_z): from pycuda.tools import dtype_to_ctype - from pycuda.elementwise import \ - VectorArg, ScalarArg, get_elwise_module + from pycuda.elementwise import VectorArg, ScalarArg, get_elwise_module args = [] preamble = ["#include \n\n"] @@ -370,17 +431,20 @@ def get_linear_combination_kernel(summand_descriptors, summands = [] tex_names = [] - for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \ - enumerate(summand_descriptors): + for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in enumerate( + summand_descriptors + ): if is_gpu_scalar: preamble.append( - "texture <%s, 1, cudaReadModeElementType> tex_a%d;" - % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i)) + "texture <%s, 1, cudaReadModeElementType> tex_a%d;" + % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i) + ) args.append(VectorArg(vector_dtype, "x%d" % i)) tex_names.append("tex_a%d" % i) loop_prep.append( - "%s a%d = fp_tex1Dfetch(tex_a%d, 0)" - % (dtype_to_ctype(scalar_dtype), i, i)) + "%s a%d = fp_tex1Dfetch(tex_a%d, 0)" + % (dtype_to_ctype(scalar_dtype), i, i) + ) else: args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i)) @@ -390,16 +454,17 @@ def get_linear_combination_kernel(summand_descriptors, args.append(VectorArg(dtype_z, "z")) args.append(ScalarArg(np.uintp, "n")) - mod = get_elwise_module(args, - "z[i] = " + " + ".join(summands), - "linear_combination", - preamble="\n".join(preamble), - loop_prep=";\n".join(loop_prep)) + mod = get_elwise_module( + args, + "z[i] = " + " + ".join(summands), + "linear_combination", + preamble="\n".join(preamble), + loop_prep=";\n".join(loop_prep), + ) func = mod.get_function("linear_combination") tex_src = [mod.get_texref(tn) for tn in tex_names] - func.prepare("".join(arg.struct_char for arg in args), - texrefs=tex_src) + func.prepare("".join(arg.struct_char for arg in args), texrefs=tex_src) return func, tex_src @@ -407,71 +472,80 @@ def get_linear_combination_kernel(summand_descriptors, @context_dependent_memoize def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): return get_elwise_kernel( - "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = a*x[i] + b*y[i]", - "axpbyz") + "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = a*x[i] + b*y[i]", + "axpbyz", + ) @context_dependent_memoize def get_axpbz_kernel(dtype_x, dtype_z): return get_elwise_kernel( - "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_z": dtype_to_ctype(dtype_z) - }, - "z[i] = a * x[i] + b", - "axpb") + "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" + % {"tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z)}, + "z[i] = a * x[i] + b", + "axpb", + ) @context_dependent_memoize def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = x[i] %s y[i]" % operator, - "multiply") + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = x[i] %s y[i]" % operator, + "multiply", + ) @context_dependent_memoize def get_rdivide_elwise_kernel(dtype_x, dtype_z): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = y / x[i]", - "divide_r") + "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = y / x[i]", + "divide_r", + ) @context_dependent_memoize def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y[i])" % func, - func+"_kernel") + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s(x[i], y[i])" % func, + func + "_kernel", + ) @context_dependent_memoize def get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y)" % func, - func+"_kernel") + "%(tp_x)s *x, %(tp_y)s y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s(x[i], y)" % func, + func + "_kernel", + ) def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): @@ -479,8 +553,9 @@ def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): func = func + "f" from pytools import any + if any(dt.kind == "f" for dt in [dtype_x, dtype_y, dtype_z]): - func = "f"+func + func = "f" + func if use_scalar: return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z) @@ -491,63 +566,75 @@ def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar): @context_dependent_memoize def get_fill_kernel(dtype): return get_elwise_kernel( - "%(tp)s a, %(tp)s *z" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = a", - "fill") + "%(tp)s a, %(tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = a", + "fill", + ) @context_dependent_memoize def get_reverse_kernel(dtype): return get_elwise_kernel( - "%(tp)s *y, %(tp)s *z" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = y[n-1-i]", - "reverse") + "%(tp)s *y, %(tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = y[n-1-i]", + "reverse", + ) @context_dependent_memoize def get_real_kernel(dtype, real_dtype): return get_elwise_kernel( - "%(tp)s *y, %(real_tp)s *z" % { - "tp": dtype_to_ctype(dtype), - "real_tp": dtype_to_ctype(real_dtype), - }, - "z[i] = real(y[i])", - "real") + "%(tp)s *y, %(real_tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + "real_tp": dtype_to_ctype(real_dtype), + }, + "z[i] = real(y[i])", + "real", + ) @context_dependent_memoize def get_imag_kernel(dtype, real_dtype): return get_elwise_kernel( - "%(tp)s *y, %(real_tp)s *z" % { - "tp": dtype_to_ctype(dtype), - "real_tp": dtype_to_ctype(real_dtype), - }, - "z[i] = imag(y[i])", - "imag") + "%(tp)s *y, %(real_tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + "real_tp": dtype_to_ctype(real_dtype), + }, + "z[i] = imag(y[i])", + "imag", + ) @context_dependent_memoize def get_conj_kernel(dtype): return get_elwise_kernel( - "%(tp)s *y, %(tp)s *z" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = pycuda::conj(y[i])", - "conj") + "%(tp)s *y, %(tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = pycuda::conj(y[i])", + "conj", + ) @context_dependent_memoize def get_arange_kernel(dtype): return get_elwise_kernel( - "%(tp)s *z, %(tp)s start, %(tp)s step" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = start + i*step", - "arange") + "%(tp)s *z, %(tp)s start, %(tp)s step" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = start + i*step", + "arange", + ) @context_dependent_memoize @@ -558,11 +645,13 @@ def get_pow_kernel(dtype): func = "pow" return get_elwise_kernel( - "%(tp)s value, %(tp)s *y, %(tp)s *z" % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = %s(y[i], value)" % func, - "pow_method") + "%(tp)s value, %(tp)s *y, %(tp)s *z" + % { + "tp": dtype_to_ctype(dtype), + }, + "z[i] = %s(y[i], value)" % func, + "pow_method", + ) @context_dependent_memoize @@ -573,49 +662,53 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): func = "powf" return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y[i])" % func, - "pow_method") + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s(x[i], y[i])" % func, + "pow_method", + ) @context_dependent_memoize def get_fmod_kernel(): return get_elwise_kernel( - "float *arg, float *mod, float *z", - "z[i] = fmod(arg[i], mod[i])", - "fmod_kernel") + "float *arg, float *mod, float *z", "z[i] = fmod(arg[i], mod[i])", "fmod_kernel" + ) @context_dependent_memoize def get_modf_kernel(): return get_elwise_kernel( - "float *x, float *intpart ,float *fracpart", - "fracpart[i] = modf(x[i], &intpart[i])", - "modf_kernel") + "float *x, float *intpart ,float *fracpart", + "fracpart[i] = modf(x[i], &intpart[i])", + "modf_kernel", + ) @context_dependent_memoize def get_frexp_kernel(): return get_elwise_kernel( - "float *x, float *significand, float *exponent", - """ + "float *x, float *significand, float *exponent", + """ int expt = 0; significand[i] = frexp(x[i], &expt); exponent[i] = expt; """, - "frexp_kernel") + "frexp_kernel", + ) @context_dependent_memoize def get_ldexp_kernel(): return get_elwise_kernel( - "float *sig, float *expt, float *z", - "z[i] = ldexp(sig[i], int(expt[i]))", - "ldexp_kernel") + "float *sig, float *expt, float *z", + "z[i] = ldexp(sig[i], int(expt[i]))", + "ldexp_kernel", + ) @context_dependent_memoize @@ -624,33 +717,39 @@ def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): out_dtype = in_dtype return get_elwise_kernel( - "%(tp_in)s *y, %(tp_out)s *z" % { - "tp_in": dtype_to_ctype(in_dtype), - "tp_out": dtype_to_ctype(out_dtype), - }, - "z[i] = %s(y[i])" % func_name, - "%s_kernel" % func_name) + "%(tp_in)s *y, %(tp_out)s *z" + % { + "tp_in": dtype_to_ctype(in_dtype), + "tp_out": dtype_to_ctype(out_dtype), + }, + "z[i] = %s(y[i])" % func_name, + "%s_kernel" % func_name, + ) @context_dependent_memoize def get_if_positive_kernel(crit_dtype, dtype): - return get_elwise_kernel([ + return get_elwise_kernel( + [ VectorArg(crit_dtype, "crit"), VectorArg(dtype, "then_"), VectorArg(dtype, "else_"), VectorArg(dtype, "result"), - ], - "result[i] = crit[i] > 0 ? then_[i] : else_[i]", - "if_positive") + ], + "result[i] = crit[i] > 0 ? then_[i] : else_[i]", + "if_positive", + ) @context_dependent_memoize def get_scalar_op_kernel(dtype_x, dtype_y, operator): return get_elwise_kernel( - "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_a": dtype_to_ctype(dtype_x), - }, - "y[i] = x[i] %s a" % operator, - "scalarop_kernel") + "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_a": dtype_to_ctype(dtype_x), + }, + "y[i] = x[i] %s a" % operator, + "scalarop_kernel", + ) diff --git a/pycuda/gl/__init__.py b/pycuda/gl/__init__.py index 56c0e642..3120d2de 100644 --- a/pycuda/gl/__init__.py +++ b/pycuda/gl/__init__.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import pycuda._driver as _drv -if not _drv.have_gl_ext(): +if not _drv.have_gl_ext(): raise ImportError("PyCUDA was compiled without GL extension support") init = _drv.gl_init diff --git a/pycuda/gl/autoinit.py b/pycuda/gl/autoinit.py index 13f6717d..fc8c9a66 100644 --- a/pycuda/gl/autoinit.py +++ b/pycuda/gl/autoinit.py @@ -1,13 +1,13 @@ from __future__ import absolute_import import pycuda.driver as cuda import pycuda.gl as cudagl +import atexit cuda.init() assert cuda.Device.count() >= 1 -from pycuda.tools import make_default_context +from pycuda.tools import make_default_context # noqa: E402 context = make_default_context(lambda dev: cudagl.make_context(dev)) device = context.get_device() -import atexit atexit.register(context.pop) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 12e4e039..7bfc48c9 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -5,16 +5,19 @@ import pycuda.elementwise as elementwise from pytools import memoize, memoize_method import pycuda.driver as drv from pycuda.compyte.array import ( - as_strided as _as_strided, - f_contiguous_strides as _f_contiguous_strides, - c_contiguous_strides as _c_contiguous_strides, - ArrayFlags as _ArrayFlags, - get_common_dtype as _get_common_dtype_base) + as_strided as _as_strided, + f_contiguous_strides as _f_contiguous_strides, + c_contiguous_strides as _c_contiguous_strides, + ArrayFlags as _ArrayFlags, + get_common_dtype as _get_common_dtype_base, +) from pycuda.characterize import has_double_support import six from six.moves import range, zip, reduce import numbers +import six.moves.copyreg + def _get_common_dtype(obj1, obj2): return _get_common_dtype_base(obj1, obj2, has_double_support()) @@ -22,12 +25,14 @@ def _get_common_dtype(obj1, obj2): # {{{ vector types + class vec: # noqa pass def _create_vector_types(): from pycuda.characterize import platform_bits + if platform_bits() == 32: long_dtype = np.int32 ulong_dtype = np.uint32 @@ -40,35 +45,40 @@ def _create_vector_types(): from pycuda.tools import get_or_register_dtype for base_name, base_type, counts in [ - ('char', np.int8, [1, 2, 3, 4]), - ('uchar', np.uint8, [1, 2, 3, 4]), - ('short', np.int16, [1, 2, 3, 4]), - ('ushort', np.uint16, [1, 2, 3, 4]), - ('int', np.int32, [1, 2, 3, 4]), - ('uint', np.uint32, [1, 2, 3, 4]), - ('long', long_dtype, [1, 2, 3, 4]), - ('ulong', ulong_dtype, [1, 2, 3, 4]), - ('longlong', np.int64, [1, 2]), - ('ulonglong', np.uint64, [1, 2]), - ('float', np.float32, [1, 2, 3, 4]), - ('double', np.float64, [1, 2]), - ]: + ("char", np.int8, [1, 2, 3, 4]), + ("uchar", np.uint8, [1, 2, 3, 4]), + ("short", np.int16, [1, 2, 3, 4]), + ("ushort", np.uint16, [1, 2, 3, 4]), + ("int", np.int32, [1, 2, 3, 4]), + ("uint", np.uint32, [1, 2, 3, 4]), + ("long", long_dtype, [1, 2, 3, 4]), + ("ulong", ulong_dtype, [1, 2, 3, 4]), + ("longlong", np.int64, [1, 2]), + ("ulonglong", np.uint64, [1, 2]), + ("float", np.float32, [1, 2, 3, 4]), + ("double", np.float64, [1, 2]), + ]: for count in counts: name = "%s%d" % (base_name, count) - dtype = np.dtype([ - (field_names[i], base_type) - for i in range(count)]) + dtype = np.dtype([(field_names[i], base_type) for i in range(count)]) get_or_register_dtype(name, dtype) setattr(vec, name, dtype) my_field_names = ",".join(field_names[:count]) - setattr(vec, "make_"+name, - staticmethod(eval( + setattr( + vec, + "make_" + name, + staticmethod( + eval( "lambda %s: array((%s), dtype=my_dtype)" % (my_field_names, my_field_names), - dict(array=np.array, my_dtype=dtype)))) + dict(array=np.array, my_dtype=dtype), + ) + ), + ) + _create_vector_types() @@ -77,16 +87,21 @@ _create_vector_types() # {{{ helper functionality + @memoize def _splay_backend(n, dev): # heavily modified from cublas from pycuda.tools import DeviceData + devdata = DeviceData(dev) min_threads = devdata.warp_size max_threads = 128 - max_blocks = 4 * devdata.thread_blocks_per_mp \ - * dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT) + max_blocks = ( + 4 + * devdata.thread_blocks_per_mp + * dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT) + ) if n < min_threads: block_count = 1 @@ -111,40 +126,56 @@ def splay(n, dev=None): dev = drv.Context.get_device() return _splay_backend(n, dev) + # }}} # {{{ main GPUArray class + def _make_binary_op(operator): def func(self, other): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if isinstance(other, GPUArray): assert self.shape == other.shape if not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) result = self._new_like_me() func = elementwise.get_binary_op_kernel( - self.dtype, other.dtype, result.dtype, - operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + self.dtype, other.dtype, result.dtype, operator + ) + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + other.gpudata, + result.gpudata, + self.mem_size, + ) return result else: # scalar operator result = self._new_like_me() - func = elementwise.get_scalar_op_kernel( - self.dtype, result.dtype, operator) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other, result.gpudata, - self.mem_size) + func = elementwise.get_scalar_op_kernel(self.dtype, result.dtype, operator) + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + other, + result.gpudata, + self.mem_size, + ) return result return func @@ -159,8 +190,16 @@ class GPUArray(object): __array_priority__ = 100 - def __init__(self, shape, dtype, allocator=drv.mem_alloc, - base=None, gpudata=None, strides=None, order="C"): + def __init__( + self, + shape, + dtype, + allocator=drv.mem_alloc, + base=None, + gpudata=None, + strides=None, + order="C", + ): dtype = np.dtype(dtype) try: @@ -184,11 +223,9 @@ class GPUArray(object): if strides is None: if order == "F": - strides = _f_contiguous_strides( - dtype.itemsize, shape) + strides = _f_contiguous_strides(dtype.itemsize, shape) elif order == "C": - strides = _c_contiguous_strides( - dtype.itemsize, shape) + strides = _c_contiguous_strides(dtype.itemsize, shape) else: raise ValueError("invalid order: %s" % order) else: @@ -240,8 +277,7 @@ class GPUArray(object): async_ = False if kwargs: - raise TypeError("extra keyword arguments specified: %s" - % ", ".join(kwargs)) + raise TypeError("extra keyword arguments specified: %s" % ", ".join(kwargs)) # }}} @@ -249,8 +285,8 @@ class GPUArray(object): raise ValueError("ary and self must be the same size") if ary.shape != self.shape: from warnings import warn - warn("Setting array from one with different shape.", - stacklevel=2) + + warn("Setting array from one with different shape.", stacklevel=2) ary = ary.reshape(self.shape) if ary.dtype != self.dtype: @@ -275,8 +311,7 @@ class GPUArray(object): async_ = False if kwargs: - raise TypeError("extra keyword arguments specified: %s" - % ", ".join(kwargs)) + raise TypeError("extra keyword arguments specified: %s" % ", ".join(kwargs)) # }}} @@ -293,9 +328,13 @@ class GPUArray(object): raise ValueError("self and ary must be the same size") if self.shape != ary.shape: from warnings import warn - warn("get() between arrays of different shape is deprecated " - "and will be removed in PyCUDA 2017.x", - DeprecationWarning, stacklevel=2) + + warn( + "get() between arrays of different shape is deprecated " + "and will be removed in PyCUDA 2017.x", + DeprecationWarning, + stacklevel=2, + ) ary = ary.reshape(self.shape) if self.dtype != ary.dtype: @@ -332,19 +371,37 @@ class GPUArray(object): where `other` is a vector..""" assert self.shape == other.shape if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) if add_timer is not None: - add_timer(3*self.size, func.prepared_timed_call(self._grid, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size)) + add_timer( + 3 * self.size, + func.prepared_timed_call( + self._grid, + selffac, + self.gpudata, + otherfac, + other.gpudata, + out.gpudata, + self.mem_size, + ), + ) else: - func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, otherfac, other.gpudata, - out.gpudata, self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + stream, + selffac, + self.gpudata, + otherfac, + other.gpudata, + out.gpudata, + self.mem_size, + ) return out @@ -352,43 +409,64 @@ class GPUArray(object): """Compute ``out = selffac * self + other``, where `other` is a scalar.""" if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) func = elementwise.get_axpbz_kernel(self.dtype, out.dtype) - func.prepared_async_call(self._grid, self._block, stream, - selffac, self.gpudata, - other, out.gpudata, self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + stream, + selffac, + self.gpudata, + other, + out.gpudata, + self.mem_size, + ) return out def _elwise_multiply(self, other, out, stream=None): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") - - func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "*") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) + + func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "*") + func.prepared_async_call( + self._grid, + self._block, + stream, + self.gpudata, + other.gpudata, + out.gpudata, + self.mem_size, + ) return out def _rdiv_scalar(self, other, out, stream=None): """Divides an array by a scalar:: - y = n / self + y = n / self """ if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) func = elementwise.get_rdivide_elwise_kernel(self.dtype, out.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other, - out.gpudata, self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + stream, + self.gpudata, + other, + out.gpudata, + self.mem_size, + ) return out @@ -396,16 +474,22 @@ class GPUArray(object): """Divides an array by another array.""" if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) assert self.shape == other.shape - func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, - out.dtype, "/") - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, other.gpudata, - out.gpudata, self.mem_size) + func = elementwise.get_binary_op_kernel(self.dtype, other.dtype, out.dtype, "/") + func.prepared_async_call( + self._grid, + self._block, + stream, + self.gpudata, + other.gpudata, + out.gpudata, + self.mem_size, + ) return out @@ -416,13 +500,13 @@ class GPUArray(object): if dtype == self.dtype: strides = self.strides - return self.__class__(self.shape, dtype, - allocator=self.allocator, strides=strides, order=order) + return self.__class__( + self.shape, dtype, allocator=self.allocator, strides=strides, order=order + ) # operators --------------------------------------------------------------- def mul_add(self, selffac, other, otherfac, add_timer=None, stream=None): - """Return `selffac * self + otherfac*other`. - """ + """Return `selffac * self + otherfac*other`.""" result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbyz(selffac, other, otherfac, result, add_timer) @@ -460,7 +544,7 @@ class GPUArray(object): def __rsub__(self, other): """Substracts an array by a scalar or an array:: - x = n - self + x = n - self """ # other must be a scalar result = self._new_like_me(_get_common_dtype(self, other)) @@ -503,7 +587,7 @@ class GPUArray(object): def __div__(self, other): """Divides an array by an array or a scalar:: - x = self / n + x = self / n """ if isinstance(other, GPUArray): result = self._new_like_me(_get_common_dtype(self, other)) @@ -514,14 +598,14 @@ class GPUArray(object): else: # create a new array for the result result = self._new_like_me(_get_common_dtype(self, other)) - return self._axpbz(1/other, 0, result) + return self._axpbz(1 / other, 0, result) __truediv__ = __div__ def __rdiv__(self, other): """Divides an array by a scalar or an array:: - x = n / self + x = n / self """ # create a new array for the result result = self._new_like_me(_get_common_dtype(self, other)) @@ -532,7 +616,7 @@ class GPUArray(object): def __idiv__(self, other): """Divides an array by an array or a scalar:: - x /= n + x /= n """ if isinstance(other, GPUArray): return self._div(other, self) @@ -540,33 +624,43 @@ class GPUArray(object): if other == 1: return self else: - return self._axpbz(1/other, 0, self) + return self._axpbz(1 / other, 0, self) __itruediv__ = __idiv__ def fill(self, value, stream=None): """fills the array with the specified value""" func = elementwise.get_fill_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - value, self.gpudata, self.mem_size) + func.prepared_async_call( + self._grid, self._block, stream, value, self.gpudata, self.mem_size + ) return self def bind_to_texref(self, texref, allow_offset=False): - return texref.set_address(self.gpudata, self.nbytes, - allow_offset=allow_offset) / self.dtype.itemsize - - def bind_to_texref_ext(self, texref, channels=1, allow_double_hack=False, - allow_complex_hack=False, allow_offset=False): + return ( + texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset) + / self.dtype.itemsize + ) + + def bind_to_texref_ext( + self, + texref, + channels=1, + allow_double_hack=False, + allow_complex_hack=False, + allow_offset=False, + ): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if self.dtype == np.float64 and allow_double_hack: if channels != 1: raise ValueError( - "'fake' double precision textures can " - "only have one channel") + "'fake' double precision textures can " "only have one channel" + ) channels = 2 fmt = drv.array_format.SIGNED_INT32 @@ -574,8 +668,8 @@ class GPUArray(object): elif self.dtype == np.complex64 and allow_complex_hack: if channels != 1: raise ValueError( - "'fake' complex64 textures can " - "only have one channel") + "'fake' complex64 textures can " "only have one channel" + ) channels = 2 fmt = drv.array_format.UNSIGNED_INT32 @@ -583,8 +677,8 @@ class GPUArray(object): elif self.dtype == np.complex128 and allow_complex_hack: if channels != 1: raise ValueError( - "'fake' complex128 textures can " - "only have one channel") + "'fake' complex128 textures can " "only have one channel" + ) channels = 4 fmt = drv.array_format.SIGNED_INT32 @@ -593,14 +687,15 @@ class GPUArray(object): fmt = drv.dtype_to_array_format(self.dtype) read_as_int = np.integer in self.dtype.type.__mro__ - offset = texref.set_address(self.gpudata, self.nbytes, - allow_offset=allow_offset) + offset = texref.set_address( + self.gpudata, self.nbytes, allow_offset=allow_offset + ) texref.set_format(fmt, channels) if read_as_int: texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER) - return offset/self.dtype.itemsize + return offset / self.dtype.itemsize def __len__(self): """Return the size of the leading dimension of self.""" @@ -625,16 +720,17 @@ class GPUArray(object): if issubclass(self.dtype.type, np.complexfloating): from pytools import match_precision + out_dtype = match_precision(np.dtype(np.float64), self.dtype) result = self._new_like_me(out_dtype) else: out_dtype = self.dtype - func = elementwise.get_unary_func_kernel(fname, self.dtype, - out_dtype=out_dtype) + func = elementwise.get_unary_func_kernel(fname, self.dtype, out_dtype=out_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, self.mem_size) + func.prepared_async_call( + self._grid, self._block, None, self.gpudata, result.gpudata, self.mem_size + ) return result @@ -646,8 +742,10 @@ class GPUArray(object): if isinstance(other, GPUArray): if not self.flags.forc or not other.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) assert self.shape == other.shape @@ -657,50 +755,64 @@ class GPUArray(object): result = self func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype) - - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, other.gpudata, result.gpudata, - self.mem_size) + self.dtype, other.dtype, result.dtype + ) + + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + other.gpudata, + result.gpudata, + self.mem_size, + ) return result else: if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) if new: result = self._new_like_me() else: result = self func = elementwise.get_pow_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, None, - other, self.gpudata, result.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + None, + other, + self.gpudata, + result.gpudata, + self.mem_size, + ) return result def __pow__(self, other): """pow function:: - example: - array = pow(array) - array = pow(array,4) - array = pow(array,array) + example: + array = pow(array) + array = pow(array,4) + array = pow(array,array) """ - return self._pow(other,new=True) + return self._pow(other, new=True) def __ipow__(self, other): """ipow function:: - example: - array **= 4 - array **= array + example: + array **= 4 + array **= array """ - return self._pow(other,new=False) - + return self._pow(other, new=False) def reverse(self, stream=None): """Return this array in reversed order. The array is treated @@ -708,22 +820,24 @@ class GPUArray(object): """ if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) result = self._new_like_me() func = elementwise.get_reverse_kernel(self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - self.gpudata, result.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, self._block, stream, self.gpudata, result.gpudata, self.mem_size + ) return result def astype(self, dtype, stream=None): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if dtype == self.dtype: return self.copy() @@ -731,9 +845,9 @@ class GPUArray(object): result = self._new_like_me(dtype=dtype) func = elementwise.get_copy_kernel(dtype, self.dtype) - func.prepared_async_call(self._grid, self._block, stream, - result.gpudata, self.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, self._block, stream, result.gpudata, self.gpudata, self.mem_size + ) return result @@ -745,14 +859,16 @@ class GPUArray(object): # TODO: add more error-checking, perhaps if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " "be used as arguments to this operation" + ) if isinstance(shape[0], tuple) or isinstance(shape[0], list): shape = tuple(shape[0]) - same_contiguity = ((order == "C" and self.flags.c_contiguous) or - (order == "F" and self.flags.f_contiguous)) + same_contiguity = (order == "C" and self.flags.c_contiguous) or ( + order == "F" and self.flags.f_contiguous + ) if shape == self.shape and same_contiguity: return self @@ -771,12 +887,13 @@ class GPUArray(object): raise ValueError("total size of new array must be unchanged") return GPUArray( - shape=shape, - dtype=self.dtype, - allocator=self.allocator, - base=self, - gpudata=int(self.gpudata), - order=order) + shape=shape, + dtype=self.dtype, + allocator=self.allocator, + base=self, + gpudata=int(self.gpudata), + order=order, + ) def ravel(self): return self.reshape(self.size) @@ -789,29 +906,33 @@ class GPUArray(object): itemsize = np.dtype(dtype).itemsize from pytools import argmin2 + min_stride_axis = argmin2( - (axis, abs(stride)) - for axis, stride in enumerate(self.strides)) + (axis, abs(stride)) for axis, stride in enumerate(self.strides) + ) if self.shape[min_stride_axis] * old_itemsize % itemsize != 0: raise ValueError("new type not compatible with array") new_shape = ( - self.shape[:min_stride_axis] - + (self.shape[min_stride_axis] * old_itemsize // itemsize,) - + self.shape[min_stride_axis+1:]) + self.shape[:min_stride_axis] + + (self.shape[min_stride_axis] * old_itemsize // itemsize,) + + self.shape[min_stride_axis + 1:] + ) new_strides = ( - self.strides[:min_stride_axis] - + (self.strides[min_stride_axis] * itemsize // old_itemsize,) - + self.strides[min_stride_axis+1:]) + self.strides[:min_stride_axis] + + (self.strides[min_stride_axis] * itemsize // old_itemsize,) + + self.strides[min_stride_axis + 1:] + ) return GPUArray( - shape=new_shape, - dtype=dtype, - allocator=self.allocator, - strides=new_strides, - base=self, - gpudata=int(self.gpudata)) + shape=new_shape, + dtype=dtype, + allocator=self.allocator, + strides=new_strides, + base=self, + gpudata=int(self.gpudata), + ) def squeeze(self): """ @@ -819,8 +940,9 @@ class GPUArray(object): length 1 removed. """ new_shape = tuple([dim for dim in self.shape if dim > 1]) - new_strides = tuple([self.strides[i] - for i, dim in enumerate(self.shape) if dim > 1]) + new_strides = tuple( + [self.strides[i] for i, dim in enumerate(self.shape) if dim > 1] + ) return GPUArray( shape=new_shape, @@ -828,7 +950,8 @@ class GPUArray(object): allocator=self.allocator, strides=new_strides, base=self, - gpudata=int(self.gpudata)) + gpudata=int(self.gpudata), + ) def transpose(self, axes=None): """Permute the dimensions of an array. @@ -843,17 +966,19 @@ class GPUArray(object): """ if axes is None: - axes = range(self.ndim-1, -1, -1) + axes = range(self.ndim - 1, -1, -1) if len(axes) != len(self.shape): raise ValueError("axes don't match array") new_shape = [self.shape[axes[i]] for i in range(len(axes))] new_strides = [self.strides[axes[i]] for i in range(len(axes))] - return GPUArray(shape=tuple(new_shape), - dtype=self.dtype, - allocator=self.allocator, - base=self.base or self, - gpudata=self.gpudata, - strides=tuple(new_strides)) + return GPUArray( + shape=tuple(new_shape), + dtype=self.dtype, + allocator=self.allocator, + base=self.base or self, + gpudata=self.gpudata, + strides=tuple(new_strides), + ) @property def T(self): # noqa @@ -886,14 +1011,13 @@ class GPUArray(object): raise IndexError("too many axes in index") if isinstance(index_entry, slice): - start, stop, idx_stride = index_entry.indices( - self.shape[array_axis]) + start, stop, idx_stride = index_entry.indices(self.shape[array_axis]) array_stride = self.strides[array_axis] - new_shape.append((abs(stop-start)-1)//abs(idx_stride)+1) - new_strides.append(idx_stride*array_stride) - new_offset += array_stride*start + new_shape.append((abs(stop - start) - 1) // abs(idx_stride) + 1) + new_strides.append(idx_stride * array_stride) + new_offset += array_stride * start index_axis += 1 array_axis += 1 @@ -904,10 +1028,9 @@ class GPUArray(object): index_entry += array_shape if not (0 <= index_entry < array_shape): - raise IndexError( - "subindex in axis %d out of range" % index_axis) + raise IndexError("subindex in axis %d out of range" % index_axis) - new_offset += self.strides[array_axis]*index_entry + new_offset += self.strides[array_axis] * index_entry index_axis += 1 array_axis += 1 @@ -925,8 +1048,7 @@ class GPUArray(object): array_axis += 1 if seen_ellipsis: - raise IndexError( - "more than one ellipsis not allowed in index") + raise IndexError("more than one ellipsis not allowed in index") seen_ellipsis = True elif index_entry is np.newaxis: @@ -944,12 +1066,13 @@ class GPUArray(object): array_axis += 1 return GPUArray( - shape=tuple(new_shape), - dtype=self.dtype, - allocator=self.allocator, - base=self, - gpudata=int(self.gpudata)+new_offset, - strides=tuple(new_strides)) + shape=tuple(new_shape), + dtype=self.dtype, + allocator=self.allocator, + base=self, + gpudata=int(self.gpudata) + new_offset, + strides=tuple(new_strides), + ) def __setitem__(self, index, value): _memcpy_discontig(self[index], value) @@ -963,6 +1086,7 @@ class GPUArray(object): dtype = self.dtype if issubclass(dtype.type, np.complexfloating): from pytools import match_precision + real_dtype = match_precision(np.dtype(np.float64), dtype) if self.flags.f_contiguous: order = "F" @@ -971,9 +1095,14 @@ class GPUArray(object): result = self._new_like_me(dtype=real_dtype, order=order) func = elementwise.get_real_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + result.gpudata, + self.mem_size, + ) return result else: @@ -984,10 +1113,13 @@ class GPUArray(object): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) from pytools import match_precision + real_dtype = match_precision(np.dtype(np.float64), dtype) if self.flags.f_contiguous: order = "F" @@ -996,9 +1128,14 @@ class GPUArray(object): result = self._new_like_me(dtype=real_dtype, order=order) func = elementwise.get_imag_kernel(dtype, real_dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + result.gpudata, + self.mem_size, + ) return result else: @@ -1008,8 +1145,10 @@ class GPUArray(object): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): if not self.flags.forc: - raise RuntimeError("only contiguous arrays may " - "be used as arguments to this operation") + raise RuntimeError( + "only contiguous arrays may " + "be used as arguments to this operation" + ) if self.flags.f_contiguous: order = "F" @@ -1018,9 +1157,14 @@ class GPUArray(object): result = self._new_like_me(order=order) func = elementwise.get_conj_kernel(dtype) - func.prepared_async_call(self._grid, self._block, None, - self.gpudata, result.gpudata, - self.mem_size) + func.prepared_async_call( + self._grid, + self._block, + None, + self.gpudata, + result.gpudata, + self.mem_size, + ) return result else: @@ -1039,11 +1183,13 @@ class GPUArray(object): # }}} + # }}} # {{{ creation helpers + def to_gpu(ary, allocator=drv.mem_alloc): """converts a numpy array to a GPUArray""" result = GPUArray(ary.shape, ary.dtype, allocator, strides=_compact_strides(ary)) @@ -1089,7 +1235,7 @@ def _array_like_helper(other_ary, dtype, order): # scale strides by itemsize when dtype is not the same itemsize = other_ary.nbytes // other_ary.size itemsize_ratio = np.dtype(dtype).itemsize / itemsize - strides = [int(s*itemsize_ratio) for s in strides] + strides = [int(s * itemsize_ratio) for s in strides] elif order not in ["C", "F"]: raise ValueError("Unsupported order: %r" % order) if dtype is None: @@ -1100,16 +1246,16 @@ def _array_like_helper(other_ary, dtype, order): def empty_like(other_ary, dtype=None, order="K"): dtype, order, strides = _array_like_helper(other_ary, dtype, order) result = GPUArray( - other_ary.shape, dtype, other_ary.allocator, order=order, - strides=strides) + other_ary.shape, dtype, other_ary.allocator, order=order, strides=strides + ) return result def zeros_like(other_ary, dtype=None, order="K"): dtype, order, strides = _array_like_helper(other_ary, dtype, order) result = GPUArray( - other_ary.shape, dtype, other_ary.allocator, order=order, - strides=strides) + other_ary.shape, dtype, other_ary.allocator, order=order, strides=strides + ) zero = np.zeros((), result.dtype) result.fill(zero) return result @@ -1118,8 +1264,8 @@ def zeros_like(other_ary, dtype=None, order="K"): def ones_like(other_ary, dtype=None, order="K"): dtype, order, strides = _array_like_helper(other_ary, dtype, order) result = GPUArray( - other_ary.shape, dtype, other_ary.allocator, order=order, - strides=strides) + other_ary.shape, dtype, other_ary.allocator, order=order, strides=strides + ) one = np.ones((), result.dtype) result.fill(one) return result @@ -1197,30 +1343,42 @@ def arange(*args, **kwargs): if not explicit_dtype and dtype != np.float32: from warnings import warn - warn("behavior change: arange guessed dtype other than float32. " - "suggest specifying explicit dtype.") + + warn( + "behavior change: arange guessed dtype other than float32. " + "suggest specifying explicit dtype." + ) from math import ceil - size = int(ceil((stop-start)/step)) + + size = int(ceil((stop - start) / step)) result = GPUArray((size,), dtype) func = elementwise.get_arange_kernel(dtype) - func.prepared_async_call(result._grid, result._block, kwargs.get("stream"), - result.gpudata, start, step, size) + func.prepared_async_call( + result._grid, + result._block, + kwargs.get("stream"), + result.gpudata, + start, + step, + size, + ) return result + # }}} def _compact_strides(a): # Compute strides to have same order as self, but packed info = sorted( - (a.strides[axis], a.shape[axis], axis) - for axis in range(len(a.shape))) + (a.strides[axis], a.shape[axis], axis) for axis in range(len(a.shape)) + ) - strides = [None]*len(a.shape) + strides = [None] * len(a.shape) stride = a.dtype.itemsize for _, dim, axis in info: strides[axis] = stride @@ -1257,17 +1415,21 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): # put src in Fortran order (which should put dst in Fortran order too) # and remove singleton axes src_info = sorted( - (src.strides[axis], axis) - for axis in range(len(src.shape)) if src.shape[axis] > 1) + (src.strides[axis], axis) + for axis in range(len(src.shape)) + if src.shape[axis] > 1 + ) axes = [axis for _, axis in src_info] shape = [src.shape[axis] for axis in axes] src_strides = [src.strides[axis] for axis in axes] dst_strides = [dst.strides[axis] for axis in axes] # copy functions require contiguity in minor axis, so add new axis if needed - if (len(shape) == 0 - or src_strides[0] != src.dtype.itemsize - or dst_strides[0] != dst.dtype.itemsize): + if ( + len(shape) == 0 + or src_strides[0] != src.dtype.itemsize + or dst_strides[0] != dst.dtype.itemsize + ): shape[0:0] = [1] src_strides[0:0] = [0] dst_strides[0:0] = [0] @@ -1277,11 +1439,13 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): # and check that dst is in same order as src i = 1 while i < len(shape): - if dst_strides[i] < dst_strides[i-1]: + if dst_strides[i] < dst_strides[i - 1]: raise ValueError("src and dst must have same order") - if (src_strides[i-1] * shape[i-1] == src_strides[i] and - dst_strides[i-1] * shape[i-1] == dst_strides[i]): - shape[i-1:i+1] = [shape[i-1] * shape[i]] + if ( + src_strides[i - 1] * shape[i - 1] == src_strides[i] + and dst_strides[i - 1] * shape[i - 1] == dst_strides[i] + ): + shape[i - 1:i + 1] = [shape[i - 1] * shape[i]] del src_strides[i] del dst_strides[i] del axes[i] @@ -1293,7 +1457,8 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): if isinstance(dst, GPUArray): if async_: drv.memcpy_dtod_async( - dst.gpudata, src.gpudata, src.nbytes, stream=stream) + dst.gpudata, src.gpudata, src.nbytes, stream=stream + ) else: drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes) else: @@ -1301,8 +1466,7 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): # having no gaps, but the axes could be transposed # so that the order is neither Fortran or C. # So, we attempt to get a contiguous view of dst. - dst = _as_strided( - dst, shape=(dst.size,), strides=(dst.dtype.itemsize,)) + dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,)) if async_: drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream) else: @@ -1321,8 +1485,8 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): copy = drv.Memcpy3D() else: raise ValueError( - "more than 2 discontiguous axes not supported %s" - % (tuple(sorted(axes)),)) + "more than 2 discontiguous axes not supported %s" % (tuple(sorted(axes)),) + ) if isinstance(src, GPUArray): copy.set_src_device(src.gpudata) @@ -1334,7 +1498,7 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): else: copy.set_dst_host(dst) - copy.width_in_bytes = src.dtype.itemsize*shape[0] + copy.width_in_bytes = src.dtype.itemsize * shape[0] copy.src_pitch = src_strides[1] copy.dst_pitch = dst_strides[1] @@ -1348,13 +1512,11 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): else: # len(shape) == 3 if src_strides[2] % src_strides[1] != 0: - raise RuntimeError( - "src's major stride must be a multiple of middle stride") + raise RuntimeError("src's major stride must be a multiple of middle stride") copy.src_height = src_strides[2] // src_strides[1] if dst_strides[2] % dst_strides[1] != 0: - raise RuntimeError( - "dst's major stride must be a multiple of middle stride") + raise RuntimeError("dst's major stride must be a multiple of middle stride") copy.dst_height = dst_strides[2] // dst_strides[1] copy.depth = shape[2] @@ -1366,16 +1528,14 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): # {{{ pickle support -import six.moves.copyreg -six.moves.copyreg.pickle(GPUArray, - lambda data: (to_gpu, (data.get(),)), - to_gpu) +six.moves.copyreg.pickle(GPUArray, lambda data: (to_gpu, (data.get(),)), to_gpu) # }}} # {{{ take/put + def take(a, indices, out=None, stream=None): if out is None: out = GPUArray(indices.shape, a.dtype, a.allocator) @@ -1385,8 +1545,9 @@ def take(a, indices, out=None, stream=None): func, tex_src = elementwise.get_take_kernel(a.dtype, indices.dtype) a.bind_to_texref_ext(tex_src[0], allow_double_hack=True, allow_complex_hack=True) - func.prepared_async_call(out._grid, out._block, stream, - indices.gpudata, out.gpudata, indices.size) + func.prepared_async_call( + out._grid, out._block, stream, indices.gpudata, out.gpudata, indices.size + ) return out @@ -1398,14 +1559,14 @@ def multi_take(arrays, indices, out=None, stream=None): assert len(indices.shape) == 1 from pytools import single_valued + a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].dtype vec_count = len(arrays) if out is None: - out = [GPUArray(indices.shape, a_dtype, a_allocator) - for i in range(vec_count)] + out = [GPUArray(indices.shape, a_dtype, a_allocator) for i in range(vec_count)] else: if len(out) != len(arrays): raise ValueError("out and arrays must have the same length") @@ -1413,42 +1574,51 @@ def multi_take(arrays, indices, out=None, stream=None): chunk_size = _builtin_min(vec_count, 20) def make_func_for_chunk_size(chunk_size): - return elementwise.get_take_kernel(a_dtype, indices.dtype, - vec_count=chunk_size) + return elementwise.get_take_kernel(a_dtype, indices.dtype, vec_count=chunk_size) func, tex_src = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): - chunk_slice = slice(start_i, start_i+chunk_size) + chunk_slice = slice(start_i, start_i + chunk_size) if start_i + chunk_size > vec_count: - func, tex_src = make_func_for_chunk_size(vec_count-start_i) + func, tex_src = make_func_for_chunk_size(vec_count - start_i) for i, a in enumerate(arrays[chunk_slice]): a.bind_to_texref_ext(tex_src[i], allow_double_hack=True) - func.prepared_async_call(indices._grid, indices._block, stream, - indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + [indices.size])) + func.prepared_async_call( + indices._grid, + indices._block, + stream, + indices.gpudata, + *([o.gpudata for o in out[chunk_slice]] + [indices.size]) + ) return out -def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, - out=None, stream=None, src_offsets=None): +def multi_take_put( + arrays, + dest_indices, + src_indices, + dest_shape=None, + out=None, + stream=None, + src_offsets=None, +): if not len(arrays): return [] from pytools import single_valued + a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].allocator vec_count = len(arrays) if out is None: - out = [GPUArray(dest_shape, a_dtype, a_allocator) - for i in range(vec_count)] + out = [GPUArray(dest_shape, a_dtype, a_allocator) for i in range(vec_count)] else: if a_dtype != single_valued(o.dtype for o in out): raise TypeError("arrays and out must have the same dtype") @@ -1477,26 +1647,35 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, def make_func_for_chunk_size(chunk_size): return elementwise.get_take_put_kernel( - a_dtype, src_indices.dtype, - with_offsets=src_offsets is not None, - vec_count=chunk_size) + a_dtype, + src_indices.dtype, + with_offsets=src_offsets is not None, + vec_count=chunk_size, + ) func, tex_src = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): - chunk_slice = slice(start_i, start_i+chunk_size) + chunk_slice = slice(start_i, start_i + chunk_size) if start_i + chunk_size > vec_count: - func, tex_src = make_func_for_chunk_size(vec_count-start_i) + func, tex_src = make_func_for_chunk_size(vec_count - start_i) for src_tr, a in zip(tex_src, arrays[chunk_slice]): a.bind_to_texref_ext(src_tr, allow_double_hack=True) - func.prepared_async_call(src_indices._grid, src_indices._block, stream, - dest_indices.gpudata, src_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + src_offsets_list[chunk_slice] - + [src_indices.size])) + func.prepared_async_call( + src_indices._grid, + src_indices._block, + stream, + dest_indices.gpudata, + src_indices.gpudata, + *( + [o.gpudata for o in out[chunk_slice]] + + src_offsets_list[chunk_slice] + + [src_indices.size] + ) + ) return out @@ -1506,14 +1685,14 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None): return [] from pytools import single_valued + a_dtype = single_valued(a.dtype for a in arrays) a_allocator = arrays[0].allocator vec_count = len(arrays) if out is None: - out = [GPUArray(dest_shape, a_dtype, a_allocator) - for i in range(vec_count)] + out = [GPUArray(dest_shape, a_dtype, a_allocator) for i in range(vec_count)] else: if a_dtype != single_valued(o.dtype for o in out): raise TypeError("arrays and out must have the same dtype") @@ -1527,29 +1706,38 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None): def make_func_for_chunk_size(chunk_size): return elementwise.get_put_kernel( - a_dtype, dest_indices.dtype, vec_count=chunk_size) + a_dtype, dest_indices.dtype, vec_count=chunk_size + ) func = make_func_for_chunk_size(chunk_size) for start_i in range(0, len(arrays), chunk_size): - chunk_slice = slice(start_i, start_i+chunk_size) + chunk_slice = slice(start_i, start_i + chunk_size) if start_i + chunk_size > vec_count: - func = make_func_for_chunk_size(vec_count-start_i) - - func.prepared_async_call(dest_indices._grid, dest_indices._block, stream, - dest_indices.gpudata, - *([o.gpudata for o in out[chunk_slice]] - + [i.gpudata for i in arrays[chunk_slice]] - + [dest_indices.size])) + func = make_func_for_chunk_size(vec_count - start_i) + + func.prepared_async_call( + dest_indices._grid, + dest_indices._block, + stream, + dest_indices.gpudata, + *( + [o.gpudata for o in out[chunk_slice]] + + [i.gpudata for i in arrays[chunk_slice]] + + [dest_indices.size] + ) + ) return out + # }}} # {{{ shape manipulation + def transpose(a, axes=None): """Permute the dimensions of an array. @@ -1573,11 +1761,13 @@ def reshape(a, *shape, **kwargs): return a.reshape(*shape, **kwargs) + # }}} # {{{ conditionals + def if_positive(criterion, then_, else_, out=None, stream=None): if not (criterion.shape == then_.shape == else_.shape): raise ValueError("shapes do not match") @@ -1585,15 +1775,21 @@ def if_positive(criterion, then_, else_, out=None, stream=None): if not (then_.dtype == else_.dtype): raise ValueError("dtypes do not match") - func = elementwise.get_if_positive_kernel( - criterion.dtype, then_.dtype) + func = elementwise.get_if_positive_kernel(criterion.dtype, then_.dtype) if out is None: out = empty_like(then_) - func.prepared_async_call(criterion._grid, criterion._block, stream, - criterion.gpudata, then_.gpudata, else_.gpudata, out.gpudata, - criterion.size) + func.prepared_async_call( + criterion._grid, + criterion._block, + stream, + criterion.gpudata, + then_.gpudata, + else_.gpudata, + out.gpudata, + criterion.size, + ) return out @@ -1603,27 +1799,34 @@ def _make_binary_minmax_func(which): if isinstance(a, GPUArray) and isinstance(b, GPUArray): if out is None: out = empty_like(a) - func = elementwise.get_binary_minmax_kernel(which, - a.dtype, b.dtype, out.dtype, use_scalar=False) + func = elementwise.get_binary_minmax_kernel( + which, a.dtype, b.dtype, out.dtype, use_scalar=False + ) - func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b.gpudata, out.gpudata, a.size) + func.prepared_async_call( + a._grid, a._block, stream, a.gpudata, b.gpudata, out.gpudata, a.size + ) elif isinstance(a, GPUArray): if out is None: out = empty_like(a) - func = elementwise.get_binary_minmax_kernel(which, - a.dtype, a.dtype, out.dtype, use_scalar=True) - func.prepared_async_call(a._grid, a._block, stream, - a.gpudata, b, out.gpudata, a.size) + func = elementwise.get_binary_minmax_kernel( + which, a.dtype, a.dtype, out.dtype, use_scalar=True + ) + func.prepared_async_call( + a._grid, a._block, stream, a.gpudata, b, out.gpudata, a.size + ) else: # assuming b is a GPUArray if out is None: out = empty_like(b) - func = elementwise.get_binary_minmax_kernel(which, - b.dtype, b.dtype, out.dtype, use_scalar=True) + func = elementwise.get_binary_minmax_kernel( + which, b.dtype, b.dtype, out.dtype, use_scalar=True + ) # NOTE: we switch the order of a and b here! - func.prepared_async_call(b._grid, b._block, stream, - b.gpudata, a, out.gpudata, b.size) + func.prepared_async_call( + b._grid, b._block, stream, b.gpudata, a, out.gpudata, b.size + ) return out + return f @@ -1635,20 +1838,24 @@ maximum = _make_binary_minmax_func("max") # {{{ reductions + def sum(a, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_sum_kernel + krnl = get_sum_kernel(dtype, a.dtype) return krnl(a, stream=stream, allocator=allocator) def subset_sum(subset, a, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_subset_sum_kernel + krnl = get_subset_sum_kernel(dtype, subset.dtype, a.dtype) return krnl(subset, a, stream=stream) def dot(a, b, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_dot_kernel + if dtype is None: dtype = _get_common_dtype(a, b) krnl = get_dot_kernel(dtype, a.dtype, b.dtype) @@ -1657,6 +1864,7 @@ def dot(a, b, dtype=None, stream=None, allocator=None): def subset_dot(subset, a, b, dtype=None, stream=None, allocator=None): from pycuda.reduction import get_subset_dot_kernel + krnl = get_subset_dot_kernel(dtype, subset.dtype, a.dtype, b.dtype) return krnl(subset, a, b, stream=stream, allocator=allocator) @@ -1664,11 +1872,13 @@ def subset_dot(subset, a, b, dtype=None, stream=None, allocator=None): def _make_minmax_kernel(what): def f(a, stream=None): from pycuda.reduction import get_minmax_kernel + krnl = get_minmax_kernel(what, a.dtype) - return krnl(a, stream=stream) + return krnl(a, stream=stream) return f + _builtin_min = min _builtin_max = max min = _make_minmax_kernel("min") @@ -1678,11 +1888,13 @@ max = _make_minmax_kernel("max") def _make_subset_minmax_kernel(what): def f(subset, a, stream=None): from pycuda.reduction import get_subset_minmax_kernel + krnl = get_subset_minmax_kernel(what, a.dtype, subset.dtype) - return krnl(subset, a, stream=stream) + return krnl(subset, a, stream=stream) return f + subset_min = _make_subset_minmax_kernel("min") subset_max = _make_subset_minmax_kernel("max") diff --git a/pycuda/reduction.py b/pycuda/reduction.py index 939a006a..3ae1ce36 100644 --- a/pycuda/reduction.py +++ b/pycuda/reduction.py @@ -65,11 +65,21 @@ from pycuda.tools import dtype_to_ctype import numpy as np -def get_reduction_module(out_type, block_size, - neutral, reduce_expr, map_expr, arguments, - name="reduce_kernel", keep=False, options=None, preamble=""): +def get_reduction_module( + out_type, + block_size, + neutral, + reduce_expr, + map_expr, + arguments, + name="reduce_kernel", + keep=False, + options=None, + preamble="", +): from pycuda.compiler import SourceModule + src = """ #include @@ -138,23 +148,31 @@ def get_reduction_module(out_type, block_size, if (tid == 0) out[blockIdx.x] = sdata[0]; } """ % { - "out_type": out_type, - "arguments": arguments, - "block_size": block_size, - "neutral": neutral, - "reduce_expr": reduce_expr, - "map_expr": map_expr, - "name": name, - "preamble": preamble - } + "out_type": out_type, + "arguments": arguments, + "block_size": block_size, + "neutral": neutral, + "reduce_expr": reduce_expr, + "map_expr": map_expr, + "name": name, + "preamble": preamble, + } return SourceModule(src, options=options, keep=keep, no_extern_c=True) - - -def get_reduction_kernel_and_types(stage, out_type, block_size, - neutral, reduce_expr, map_expr=None, arguments=None, - name="reduce_kernel", keep=False, options=None, preamble=""): +def get_reduction_kernel_and_types( + stage, + out_type, + block_size, + neutral, + reduce_expr, + map_expr=None, + arguments=None, + name="reduce_kernel", + keep=False, + options=None, + preamble="", +): if stage == 1: if map_expr is None: @@ -173,11 +191,21 @@ def get_reduction_kernel_and_types(stage, out_type, block_size, else: assert False - mod = get_reduction_module(out_type, block_size, - neutral, reduce_expr, map_expr, arguments, - name, keep, options, preamble) + mod = get_reduction_module( + out_type, + block_size, + neutral, + reduce_expr, + map_expr, + arguments, + name, + keep, + options, + preamble, + ) from pycuda.tools import get_arg_type + func = mod.get_function(name) arg_types = [get_arg_type(arg) for arg in arguments.split(",")] func.prepare("P%sII" % "".join(arg_types)) @@ -185,35 +213,58 @@ def get_reduction_kernel_and_types(stage, out_type, block_size, return func, arg_types - - class ReductionKernel: - def __init__(self, dtype_out, - neutral, reduce_expr, map_expr=None, arguments=None, - name="reduce_kernel", keep=False, options=None, preamble=""): + def __init__( + self, + dtype_out, + neutral, + reduce_expr, + map_expr=None, + arguments=None, + name="reduce_kernel", + keep=False, + options=None, + preamble="", + ): self.dtype_out = np.dtype(dtype_out) self.block_size = 512 s1_func, self.stage1_arg_types = get_reduction_kernel_and_types( - 1, dtype_to_ctype(dtype_out), self.block_size, - neutral, reduce_expr, map_expr, - arguments, name=name+"_stage1", keep=keep, options=options, - preamble=preamble) + 1, + dtype_to_ctype(dtype_out), + self.block_size, + neutral, + reduce_expr, + map_expr, + arguments, + name=name + "_stage1", + keep=keep, + options=options, + preamble=preamble, + ) self.stage1_func = s1_func.prepared_async_call # stage 2 has only one input and no map expression s2_func, self.stage2_arg_types = get_reduction_kernel_and_types( - 2, dtype_to_ctype(dtype_out), self.block_size, - neutral, reduce_expr, arguments=arguments, - name=name+"_stage2", keep=keep, options=options, - preamble=preamble) + 2, + dtype_to_ctype(dtype_out), + self.block_size, + neutral, + reduce_expr, + arguments=arguments, + name=name + "_stage2", + keep=keep, + options=options, + preamble=preamble, + ) self.stage2_func = s2_func.prepared_async_call - assert [i for i, arg_tp in enumerate(self.stage1_arg_types) if arg_tp == "P"], \ - "ReductionKernel can only be used with functions that have at least one " \ - "vector argument" + assert [i for i, arg_tp in enumerate(self.stage1_arg_types) if arg_tp == "P"], ( + "ReductionKernel can only be used with functions that have at least one " + "vector argument" + ) def __call__(self, *args, **kwargs): MAX_BLOCK_COUNT = 1024 @@ -244,8 +295,9 @@ class ReductionKernel: for arg, arg_tp in zip(args, arg_types): if arg_tp == "P": if not arg.flags.forc: - raise RuntimeError("ReductionKernel cannot " - "deal with non-contiguous arrays") + raise RuntimeError( + "ReductionKernel cannot " "deal with non-contiguous arrays" + ) vectors.append(arg) invocation_args.append(arg.gpudata) @@ -259,13 +311,13 @@ class ReductionKernel: if allocator is None: allocator = repr_vec.allocator - if sz <= self.block_size*SMALL_SEQ_COUNT*MAX_BLOCK_COUNT: - total_block_size = SMALL_SEQ_COUNT*self.block_size + if sz <= self.block_size * SMALL_SEQ_COUNT * MAX_BLOCK_COUNT: + total_block_size = SMALL_SEQ_COUNT * self.block_size block_count = (sz + total_block_size - 1) // total_block_size seq_count = SMALL_SEQ_COUNT else: block_count = MAX_BLOCK_COUNT - macroblock_size = block_count*self.block_size + macroblock_size = block_count * self.block_size seq_count = (sz + macroblock_size - 1) // macroblock_size if block_count == 1 and out is not None: @@ -279,12 +331,16 @@ class ReductionKernel: else: result = empty((block_count,), self.dtype_out, allocator=allocator) - kwargs = dict(shared_size=self.block_size*self.dtype_out.itemsize) + kwargs = dict(shared_size=self.block_size * self.dtype_out.itemsize) # print block_count, seq_count, self.block_size, sz - f((block_count, 1), (self.block_size, 1, 1), stream, - *([result.gpudata]+invocation_args+[seq_count, sz]), - **kwargs) + f( + (block_count, 1), + (self.block_size, 1, 1), + stream, + *([result.gpudata] + invocation_args + [seq_count, sz]), + **kwargs + ) if block_count == 1: return result @@ -294,17 +350,17 @@ class ReductionKernel: args = (result,) + stage1_args - - @context_dependent_memoize def get_sum_kernel(dtype_out, dtype_in): if dtype_out is None: dtype_out = dtype_in - return ReductionKernel(dtype_out, "0", "a+b", - arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}) - - + return ReductionKernel( + dtype_out, + "0", + "a+b", + arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}, + ) @context_dependent_memoize @@ -312,27 +368,33 @@ def get_subset_sum_kernel(dtype_out, dtype_subset, dtype_in): if dtype_out is None: dtype_out = dtype_in - return ReductionKernel(dtype_out, "0", "a+b", - map_expr="in[lookup_tbl[i]]", - arguments="const %(tp_lut)s *lookup_tbl, const %(tp)s *in" - % { - "tp": dtype_to_ctype(dtype_in), - "tp_lut": dtype_to_ctype(dtype_subset), - }) - - + return ReductionKernel( + dtype_out, + "0", + "a+b", + map_expr="in[lookup_tbl[i]]", + arguments="const %(tp_lut)s *lookup_tbl, const %(tp)s *in" + % { + "tp": dtype_to_ctype(dtype_in), + "tp_lut": dtype_to_ctype(dtype_subset), + }, + ) @context_dependent_memoize def get_dot_kernel(dtype_out, dtype_a, dtype_b): - return ReductionKernel(dtype_out, neutral="0", - reduce_expr="a+b", map_expr="a[i]*b[i]", - arguments="const %(tp_a)s *a, const %(tp_b)s *b" % { - "tp_a": dtype_to_ctype(dtype_a), - "tp_b": dtype_to_ctype(dtype_b), - }, keep=True) - - + return ReductionKernel( + dtype_out, + neutral="0", + reduce_expr="a+b", + map_expr="a[i]*b[i]", + arguments="const %(tp_a)s *a, const %(tp_b)s *b" + % { + "tp_a": dtype_to_ctype(dtype_a), + "tp_b": dtype_to_ctype(dtype_b), + }, + keep=True, + ) @context_dependent_memoize @@ -350,16 +412,19 @@ def get_subset_dot_kernel(dtype_out, dtype_subset, dtype_a=None, dtype_b=None): dtype_a = dtype_out # important: lookup_tbl must be first--it controls the length - return ReductionKernel(dtype_out, neutral="0", - reduce_expr="a+b", map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]", - arguments="const %(tp_lut)s *lookup_tbl, " - "const %(tp_a)s *a, const %(tp_b)s *b" % { + return ReductionKernel( + dtype_out, + neutral="0", + reduce_expr="a+b", + map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]", + arguments="const %(tp_lut)s *lookup_tbl, " + "const %(tp_a)s *a, const %(tp_b)s *b" + % { "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), "tp_lut": dtype_to_ctype(dtype_subset), - }) - - + }, + ) def get_minmax_neutral(what, dtype): @@ -380,8 +445,6 @@ def get_minmax_neutral(what, dtype): raise ValueError("what is not min or max.") - - @context_dependent_memoize def get_minmax_kernel(what, dtype): if dtype == np.float64: @@ -393,14 +456,16 @@ def get_minmax_kernel(what, dtype): else: raise TypeError("unsupported dtype specified") - return ReductionKernel(dtype, - neutral=get_minmax_neutral(what, dtype), - reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, - arguments="const %(tp)s *in" % { - "tp": dtype_to_ctype(dtype), - }, preamble="#define MY_INFINITY (1./0)") - - + return ReductionKernel( + dtype, + neutral=get_minmax_neutral(what, dtype), + reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, + arguments="const %(tp)s *in" + % { + "tp": dtype_to_ctype(dtype), + }, + preamble="#define MY_INFINITY (1./0)", + ) @context_dependent_memoize @@ -414,12 +479,16 @@ def get_subset_minmax_kernel(what, dtype, dtype_subset): else: raise TypeError("unsupported dtype specified") - return ReductionKernel(dtype, - neutral=get_minmax_neutral(what, dtype), - reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, - map_expr="in[lookup_tbl[i]]", - arguments="const %(tp_lut)s *lookup_tbl, " - "const %(tp)s *in" % { + return ReductionKernel( + dtype, + neutral=get_minmax_neutral(what, dtype), + reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, + map_expr="in[lookup_tbl[i]]", + arguments="const %(tp_lut)s *lookup_tbl, " + "const %(tp)s *in" + % { "tp": dtype_to_ctype(dtype), "tp_lut": dtype_to_ctype(dtype_subset), - }, preamble="#define MY_INFINITY (1./0)") + }, + preamble="#define MY_INFINITY (1./0)", + ) diff --git a/pycuda/scan.py b/pycuda/scan.py index e48217b7..68c91f56 100644 --- a/pycuda/scan.py +++ b/pycuda/scan.py @@ -10,7 +10,6 @@ Copyright 2008-2011 NVIDIA Corporation """ - __license__ = """ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,8 +27,6 @@ Derived from code within the Thrust project, https://github.com/thrust/thrust/ """ - - import numpy as np import pycuda.driver as driver @@ -40,9 +37,9 @@ import pycuda._mymako as mako from pycuda._cluda import CLUDA_PREAMBLE - - -SHARED_PREAMBLE = CLUDA_PREAMBLE + """ +SHARED_PREAMBLE = ( + CLUDA_PREAMBLE + + """ #define WG_SIZE ${wg_size} #define SCAN_EXPR(a, b) ${scan_expr} @@ -50,11 +47,12 @@ ${preamble} typedef ${scan_type} scan_type; """ +) - - -SCAN_INTERVALS_SOURCE = mako.template.Template(SHARED_PREAMBLE + """//CL// +SCAN_INTERVALS_SOURCE = mako.template.Template( + SHARED_PREAMBLE + + """//CL// #define K ${wg_seq_batches} <%def name="make_group_scan(name, with_bounds_check)"> @@ -237,12 +235,13 @@ void ${name_prefix}_scan_intervals( group_results[GID_0] = output[interval_end - 1]; } } -""") - - +""" +) -INCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """//CL// +INCLUSIVE_UPDATE_SOURCE = mako.template.Template( + SHARED_PREAMBLE + + """//CL// KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${name_prefix}_final_update( @@ -275,12 +274,13 @@ void ${name_prefix}_final_update( } } } -""") - - +""" +) -EXCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """//CL// +EXCLUSIVE_UPDATE_SOURCE = mako.template.Template( + SHARED_PREAMBLE + + """//CL// KERNEL REQD_WG_SIZE(WG_SIZE, 1, 1) void ${name_prefix}_final_update( @@ -337,15 +337,21 @@ void ${name_prefix}_final_update( local_barrier(); } } -""") - - +""" +) class _ScanKernelBase(object): - def __init__(self, dtype, - scan_expr, neutral=None, - name_prefix="scan", options=None, preamble="", devices=None): + def __init__( + self, + dtype, + scan_expr, + neutral=None, + name_prefix="scan", + options=None, + preamble="", + devices=None, + ): if isinstance(self, ExclusiveScanKernel) and neutral is None: raise ValueError("neutral element is required for exclusive scan") @@ -363,30 +369,37 @@ class _ScanKernelBase(object): name_prefix=name_prefix, scan_type=dtype_to_ctype(dtype), scan_expr=scan_expr, - neutral=neutral) - - scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render( - wg_size=self.scan_wg_size, - wg_seq_batches=self.scan_wg_seq_batches, - **kw_values)) + neutral=neutral, + ) + + scan_intervals_src = str( + SCAN_INTERVALS_SOURCE.render( + wg_size=self.scan_wg_size, + wg_seq_batches=self.scan_wg_seq_batches, + **kw_values + ) + ) scan_intervals_prg = SourceModule( - scan_intervals_src, options=options, no_extern_c=True) + scan_intervals_src, options=options, no_extern_c=True + ) self.scan_intervals_knl = scan_intervals_prg.get_function( - name_prefix+"_scan_intervals") + name_prefix + "_scan_intervals" + ) self.scan_intervals_knl.prepare("PIIPP") - final_update_src = str(self.final_update_tp.render( - wg_size=self.update_wg_size, - **kw_values)) + final_update_src = str( + self.final_update_tp.render(wg_size=self.update_wg_size, **kw_values) + ) final_update_prg = SourceModule( - final_update_src, options=options, no_extern_c=True) + final_update_src, options=options, no_extern_c=True + ) self.final_update_knl = final_update_prg.get_function( - name_prefix+"_final_update") + name_prefix + "_final_update" + ) self.final_update_knl.prepare("PIIP") - def __call__(self, input_ary, output_ary=None, allocator=None, - stream=None): + def __call__(self, input_ary, output_ary=None, allocator=None, stream=None): allocator = allocator or input_ary.allocator if output_ary is None: @@ -399,56 +412,68 @@ class _ScanKernelBase(object): raise ValueError("input and output must have the same shape") if not input_ary.flags.forc: - raise RuntimeError("ScanKernel cannot " - "deal with non-contiguous arrays") + raise RuntimeError("ScanKernel cannot " "deal with non-contiguous arrays") - n, = input_ary.shape + (n,) = input_ary.shape if not n: return output_ary - unit_size = self.scan_wg_size * self.scan_wg_seq_batches + unit_size = self.scan_wg_size * self.scan_wg_seq_batches dev = driver.Context.get_device() - max_groups = 3*dev.get_attribute( - driver.device_attribute.MULTIPROCESSOR_COUNT) + max_groups = 3 * dev.get_attribute(driver.device_attribute.MULTIPROCESSOR_COUNT) from pytools import uniform_interval_splitting - interval_size, num_groups = uniform_interval_splitting( - n, unit_size, max_groups); - block_results = allocator(self.dtype.itemsize*num_groups) + interval_size, num_groups = uniform_interval_splitting(n, unit_size, max_groups) + + block_results = allocator(self.dtype.itemsize * num_groups) dummy_results = allocator(self.dtype.itemsize) # first level scan of interval (one interval per block) self.scan_intervals_knl.prepared_async_call( - (num_groups, 1), (self.scan_wg_size, 1, 1), stream, - input_ary.gpudata, - n, interval_size, - output_ary.gpudata, - block_results) + (num_groups, 1), + (self.scan_wg_size, 1, 1), + stream, + input_ary.gpudata, + n, + interval_size, + output_ary.gpudata, + block_results, + ) # second level inclusive scan of per-block results self.scan_intervals_knl.prepared_async_call( - (1,1), (self.scan_wg_size, 1, 1), stream, - block_results, - num_groups, interval_size, - block_results, - dummy_results) + (1, 1), + (self.scan_wg_size, 1, 1), + stream, + block_results, + num_groups, + interval_size, + block_results, + dummy_results, + ) # update intervals with result of second level scan self.final_update_knl.prepared_async_call( - (num_groups, 1,), (self.update_wg_size, 1, 1), stream, - output_ary.gpudata, - n, interval_size, - block_results) + ( + num_groups, + 1, + ), + (self.update_wg_size, 1, 1), + stream, + output_ary.gpudata, + n, + interval_size, + block_results, + ) return output_ary - - class InclusiveScanKernel(_ScanKernelBase): final_update_tp = INCLUSIVE_UPDATE_SOURCE + class ExclusiveScanKernel(_ScanKernelBase): final_update_tp = EXCLUSIVE_UPDATE_SOURCE diff --git a/pycuda/sparse/__init__.py b/pycuda/sparse/__init__.py index 3bb8a67f..0949981b 100644 --- a/pycuda/sparse/__init__.py +++ b/pycuda/sparse/__init__.py @@ -1,4 +1,8 @@ from __future__ import absolute_import from warnings import warn -warn("pycuda.sparse is deprecated. and will be removed in 2015.x", - DeprecationWarning, stacklevel=2) + +warn( + "pycuda.sparse is deprecated. and will be removed in 2015.x", + DeprecationWarning, + stacklevel=2, +) diff --git a/pycuda/sparse/cg.py b/pycuda/sparse/cg.py index c583af01..42148ece 100644 --- a/pycuda/sparse/cg.py +++ b/pycuda/sparse/cg.py @@ -2,22 +2,21 @@ from __future__ import division from __future__ import absolute_import from pycuda.sparse.inner import AsyncInnerProduct from pytools import memoize_method +import pycuda.driver as drv import pycuda.gpuarray as gpuarray import numpy as np - - class ConvergenceError(RuntimeError): pass - class CGStateContainer: def __init__(self, operator, precon=None, pagelocked_allocator=None): if precon is None: from pycuda.sparse.operator import IdentityOperator + precon = IdentityOperator(operator.dtype, operator.shape[0]) self.operator = operator @@ -28,23 +27,21 @@ class CGStateContainer: @memoize_method def make_lc2_kernel(self, dtype, a_is_gpu, b_is_gpu): from pycuda.elementwise import get_linear_combination_kernel - return get_linear_combination_kernel(( - (a_is_gpu, dtype, dtype), - (b_is_gpu, dtype, dtype) - ), dtype) + + return get_linear_combination_kernel( + ((a_is_gpu, dtype, dtype), (b_is_gpu, dtype, dtype)), dtype + ) def lc2(self, a, x, b, y, out=None): if out is None: - out = gpuarray.empty(x.shape, dtype=x.dtype, - allocator=x.allocator) + out = gpuarray.empty(x.shape, dtype=x.dtype, allocator=x.allocator) assert x.dtype == y.dtype == out.dtype a_is_gpu = isinstance(a, gpuarray.GPUArray) b_is_gpu = isinstance(b, gpuarray.GPUArray) assert x.shape == y.shape == out.shape - kernel, texrefs = self.make_lc2_kernel( - x.dtype, a_is_gpu, b_is_gpu) + kernel, texrefs = self.make_lc2_kernel(x.dtype, a_is_gpu, b_is_gpu) texrefs = texrefs[:] @@ -76,25 +73,29 @@ class CGStateContainer: def guarded_div_kernel(self, dtype_x, dtype_y, dtype_z): from pycuda.elementwise import get_elwise_kernel from pycuda.tools import dtype_to_ctype + return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = y[i] == 0 ? 0 : (x[i] / y[i])", - "divide") + "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = y[i] == 0 ? 0 : (x[i] / y[i])", + "divide", + ) def guarded_div(self, a, b): from pycuda.gpuarray import _get_common_dtype + result = a._new_like_me(_get_common_dtype(a, b)) assert a.shape == b.shape func = self.guarded_div_kernel(a.dtype, b.dtype, result.dtype) - func.prepared_async_call(a._grid, a._block, None, - a.gpudata, b.gpudata, - result.gpudata, a.mem_size) + func.prepared_async_call( + a._grid, a._block, None, a.gpudata, b.gpudata, result.gpudata, a.mem_size + ) return result @@ -110,8 +111,7 @@ class CGStateContainer: self.d = self.precon(self.residual) # grows at the end - delta = AsyncInnerProduct(self.residual, self.d, - self.pagelocked_allocator) + delta = AsyncInnerProduct(self.residual, self.d, self.pagelocked_allocator) self.real_delta_queue = [delta] self.delta = delta.gpu_result @@ -128,15 +128,13 @@ class CGStateContainer: self.lc2(1, self.x, alpha, self.d, out=self.x) if compute_real_residual: - self.residual = self.lc2( - 1, self.rhs, -1, self.operator(self.x)) + self.residual = self.lc2(1, self.rhs, -1, self.operator(self.x)) else: self.lc2(1, self.residual, -alpha, q, out=self.residual) s = self.precon(self.residual) delta_old = self.delta - delta = AsyncInnerProduct(self.residual, s, - self.pagelocked_allocator) + delta = AsyncInnerProduct(self.residual, s, self.pagelocked_allocator) self.delta = delta.gpu_result beta = self.guarded_div(self.delta, delta_old) @@ -149,18 +147,15 @@ class CGStateContainer: check_interval = 20 if max_iterations is None: - max_iterations = max( - 3*check_interval+1, 10 * self.operator.shape[0]) + max_iterations = max(3 * check_interval + 1, 10 * self.operator.shape[0]) real_resid_interval = min(self.operator.shape[0], 50) iterations = 0 delta_0 = None while iterations < max_iterations: - compute_real_residual = \ - iterations % real_resid_interval == 0 + compute_real_residual = iterations % real_resid_interval == 0 - self.one_iteration( - compute_real_residual=compute_real_residual) + self.one_iteration(compute_real_residual=compute_real_residual) if debug_callback is not None: if compute_real_residual: @@ -168,8 +163,9 @@ class CGStateContainer: else: what = "it" - debug_callback(what, iterations, self.x, - self.residual, self.d, self.delta) + debug_callback( + what, iterations, self.x, self.residual, self.d, self.delta + ) # do often enough to allow AsyncInnerProduct # to progress through (polled) event chain @@ -185,11 +181,16 @@ class CGStateContainer: while i < len(rdq): delta = rdq[i].get_host_result() if delta is not None: - if abs(delta) < tol*tol * abs(delta_0): + if abs(delta) < tol * tol * abs(delta_0): if debug_callback is not None: - debug_callback("end", iterations, - self.x, self.residual, - self.d, self.delta) + debug_callback( + "end", + iterations, + self.x, + self.residual, + self.d, + self.delta, + ) return self.x rdq.pop(i) else: @@ -200,26 +201,33 @@ class CGStateContainer: raise ConvergenceError("cg failed to converge") - - -def solve_pkt_with_cg(pkt_spmv, b, precon=None, x=None, tol=1e-7, max_iterations=None, - debug=False, pagelocked_allocator=None): +def solve_pkt_with_cg( + pkt_spmv, + b, + precon=None, + x=None, + tol=1e-7, + max_iterations=None, + debug=False, + pagelocked_allocator=None, +): if x is None: - x = gpuarray.zeros(pkt_spmv.shape[0], dtype=pkt_spmv.dtype, - allocator=b.allocator) + x = gpuarray.zeros( + pkt_spmv.shape[0], dtype=pkt_spmv.dtype, allocator=b.allocator + ) else: x = pkt_spmv.permute(x) if pagelocked_allocator is None: pagelocked_allocator = drv.pagelocked_empty - cg = CGStateContainer(pkt_spmv, precon, - pagelocked_allocator=pagelocked_allocator) + cg = CGStateContainer(pkt_spmv, precon, pagelocked_allocator=pagelocked_allocator) cg.reset(pkt_spmv.permute(b), x) it_count = [0] res_count = [0] + def debug_callback(what, it_number, x, resid, d, delta): if what == "it": it_count[0] += 1 @@ -227,11 +235,6 @@ def solve_pkt_with_cg(pkt_spmv, b, precon=None, x=None, tol=1e-7, max_iterations res_count[0] += 1 it_count[0] += 1 - result = cg.run(max_iterations, tol, - debug_callback=debug_callback) + result = cg.run(max_iterations, tol, debug_callback=debug_callback) return pkt_spmv.unpermute(result), it_count[0], res_count[0] - - - - diff --git a/pycuda/sparse/coordinate.py b/pycuda/sparse/coordinate.py index bf068dd8..8299387f 100644 --- a/pycuda/sparse/coordinate.py +++ b/pycuda/sparse/coordinate.py @@ -7,8 +7,6 @@ from pycuda.compiler import SourceModule import numpy as np - - COO_FLAT_KERNEL_TEMPLATE = """ #include @@ -122,7 +120,6 @@ spmv_coo_flat_kernel(const index_type num_nonzeros, """ - COO_SERIAL_KERNEL_TEMPLATE = """ typedef %(value_type)s value_type; typedef %(index_type)s index_type; @@ -141,8 +138,6 @@ spmv_coo_serial_kernel(const index_type num_nonzeros, """ - - class CoordinateSpMV: def __init__(self, mat, dtype): self.dtype = np.dtype(dtype) @@ -152,6 +147,7 @@ class CoordinateSpMV: self.block_size = 128 from scipy.sparse import coo_matrix + coo_mat = coo_matrix(mat, dtype=self.dtype) self.row_gpu = gpuarray.to_gpu(coo_mat.row.astype(self.index_dtype)) @@ -160,43 +156,49 @@ class CoordinateSpMV: self.nnz = coo_mat.nnz from pycuda.tools import DeviceData + dev = drv.Context.get_device() devdata = DeviceData() - max_threads = (devdata.warps_per_mp*devdata.warp_size* - dev.multiprocessor_count) - max_blocks = 4*max_threads // self.block_size + max_threads = ( + devdata.warps_per_mp * devdata.warp_size * dev.multiprocessor_count + ) + max_blocks = 4 * max_threads // self.block_size warps_per_block = self.block_size // dev.warp_size if self.nnz: + def divide_into(x, y): - return (x+y-1)//y + return (x + y - 1) // y - num_units = self.nnz // dev.warp_size - num_warps = min(num_units, warps_per_block * max_blocks) + num_units = self.nnz // dev.warp_size + num_warps = min(num_units, warps_per_block * max_blocks) self.num_blocks = divide_into(num_warps, warps_per_block) - num_iters = divide_into(num_units, num_warps) + num_iters = divide_into(num_units, num_warps) self.interval_size = dev.warp_size * num_iters self.tail = num_units * dev.warp_size - @memoize_method def get_flat_kernel(self): from pycuda.tools import dtype_to_ctype mod = SourceModule( - COO_FLAT_KERNEL_TEMPLATE % { - "value_type": dtype_to_ctype(self.dtype), - "tex_value_type": dtype_to_ctype( - self.dtype, with_fp_tex_hack=True), - "index_type": dtype_to_ctype(self.index_dtype), - "block_size": self.block_size, - "warp_size": drv.Context.get_device().warp_size, - }) + COO_FLAT_KERNEL_TEMPLATE + % { + "value_type": dtype_to_ctype(self.dtype), + "tex_value_type": dtype_to_ctype(self.dtype, with_fp_tex_hack=True), + "index_type": dtype_to_ctype(self.index_dtype), + "block_size": self.block_size, + "warp_size": drv.Context.get_device().warp_size, + } + ) func = mod.get_function("spmv_coo_flat_kernel") x_texref = mod.get_texref("tex_x") - func.prepare(self.index_dtype.char*2 + "PPPP", - (self.block_size, 1, 1), texrefs=[x_texref]) + func.prepare( + self.index_dtype.char * 2 + "PPPP", + (self.block_size, 1, 1), + texrefs=[x_texref], + ) return func, x_texref @memoize_method @@ -204,37 +206,43 @@ class CoordinateSpMV: from pycuda.tools import dtype_to_ctype mod = SourceModule( - COO_SERIAL_KERNEL_TEMPLATE % { - "value_type": dtype_to_ctype(self.dtype), - "index_type": dtype_to_ctype(self.index_dtype), - }) + COO_SERIAL_KERNEL_TEMPLATE + % { + "value_type": dtype_to_ctype(self.dtype), + "index_type": dtype_to_ctype(self.index_dtype), + } + ) func = mod.get_function("spmv_coo_serial_kernel") func.prepare(self.index_dtype.char + "PPPPP", (1, 1, 1)) return func def __call__(self, x, y=None): if y is None: - y = gpuarray.zeros(self.shape[0], dtype=self.dtype, - allocator=x.allocator) + y = gpuarray.zeros(self.shape[0], dtype=self.dtype, allocator=x.allocator) if self.nnz == 0: return y flat_func, x_texref = self.get_flat_kernel() x.bind_to_texref_ext(x_texref, allow_double_hack=True) - flat_func.prepared_call((self.num_blocks, 1), - self.tail, self.interval_size, - self.row_gpu.gpudata, - self.col_gpu.gpudata, - self.data_gpu.gpudata, - y.gpudata) + flat_func.prepared_call( + (self.num_blocks, 1), + self.tail, + self.interval_size, + self.row_gpu.gpudata, + self.col_gpu.gpudata, + self.data_gpu.gpudata, + y.gpudata, + ) self.get_serial_kernel().prepared_call( - (1, 1), - self.nnz - self.tail, - self.row_gpu[self.tail:].gpudata, - self.col_gpu[self.tail:].gpudata, - self.data_gpu[self.tail:].gpudata, - x.gpudata, y.gpudata) + (1, 1), + self.nnz - self.tail, + self.row_gpu[self.tail:].gpudata, + self.col_gpu[self.tail:].gpudata, + self.data_gpu[self.tail:].gpudata, + x.gpudata, + y.gpudata, + ) return y diff --git a/pycuda/sparse/inner.py b/pycuda/sparse/inner.py index c68405d6..7e55e307 100644 --- a/pycuda/sparse/inner.py +++ b/pycuda/sparse/inner.py @@ -3,14 +3,12 @@ from __future__ import absolute_import import pycuda.driver as drv import pycuda.gpuarray as gpuarray - +import atexit STREAM_POOL = [] - - def get_stream(): if STREAM_POOL: return STREAM_POOL.pop() @@ -18,9 +16,6 @@ def get_stream(): return drv.Stream() - - - class AsyncInnerProduct: def __init__(self, a, b, pagelocked_allocator): self.gpu_result = gpuarray.dot(a, b) @@ -36,11 +31,11 @@ class AsyncInnerProduct: self.gpu_finished = True self.copy_stream = get_stream() self.host_dest = self.pagelocked_allocator( - self.gpu_result.shape, self.gpu_result.dtype, - self.copy_stream) - drv.memcpy_dtoh_async(self.host_dest, - self.gpu_result.gpudata, - self.copy_stream) + self.gpu_result.shape, self.gpu_result.dtype, self.copy_stream + ) + drv.memcpy_dtoh_async( + self.host_dest, self.gpu_result.gpudata, self.copy_stream + ) self.copy_finished_evt = drv.Event() self.copy_finished_evt.record() else: @@ -49,11 +44,8 @@ class AsyncInnerProduct: return self.host_dest - - def _at_exit(): STREAM_POOL[:] = [] -import atexit -atexit.register(_at_exit) +atexit.register(_at_exit) diff --git a/pycuda/sparse/operator.py b/pycuda/sparse/operator.py index 529f7875..bf8231c2 100644 --- a/pycuda/sparse/operator.py +++ b/pycuda/sparse/operator.py @@ -7,11 +7,6 @@ class OperatorBase(object): def shape(self): raise NotImplementedError - def __neg__(self): - return NegOperator(self) - - - class IdentityOperator(OperatorBase): def __init__(self, dtype, n): @@ -30,8 +25,6 @@ class IdentityOperator(OperatorBase): return operand - - class DiagonalPreconditioner(OperatorBase): def __init__(self, diagonal): self.diagonal = diagonal @@ -46,8 +39,4 @@ class DiagonalPreconditioner(OperatorBase): return n, n def __call__(self, operand): - return self.diagonal*operand - - - - + return self.diagonal * operand diff --git a/pycuda/sparse/packeted.py b/pycuda/sparse/packeted.py index 973cb42c..518c089c 100644 --- a/pycuda/sparse/packeted.py +++ b/pycuda/sparse/packeted.py @@ -2,15 +2,12 @@ from __future__ import division from __future__ import absolute_import from __future__ import print_function from pytools import memoize_method -import pycuda.driver as drv import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule import numpy as np from six.moves import range - - PKT_KERNEL_TEMPLATE = """ typedef %(index_type)s index_type; typedef %(value_type)s value_type; @@ -87,11 +84,10 @@ spmv_pkt_kernel(const index_type *row_ptr, """ - - class PacketedSpMV: def __init__(self, mat, is_symmetric, dtype): from pycuda.tools import DeviceData + devdata = DeviceData() # all row indices in the data structure generation code are @@ -105,17 +101,19 @@ class PacketedSpMV: if h != w: raise ValueError("only square matrices are supported") - self.rows_per_packet = (devdata.shared_memory - 100) \ - // (2*self.dtype.itemsize) + self.rows_per_packet = (devdata.shared_memory - 100) // ( + 2 * self.dtype.itemsize + ) - self.block_count = \ - (h + self.rows_per_packet - 1) // self.rows_per_packet + self.block_count = (h + self.rows_per_packet - 1) // self.rows_per_packet # get metis partition ------------------------------------------------- from scipy.sparse import csr_matrix + csr_mat = csr_matrix(mat, dtype=self.dtype) from pymetis import part_graph + if not is_symmetric: # make sure adjacency graph is undirected adj_mat = csr_mat + csr_mat.T @@ -123,8 +121,9 @@ class PacketedSpMV: adj_mat = csr_mat while True: - cut_count, dof_to_packet_nr = part_graph(int(self.block_count), - xadj=adj_mat.indptr, adjncy=adj_mat.indices) + cut_count, dof_to_packet_nr = part_graph( + int(self.block_count), xadj=adj_mat.indptr, adjncy=adj_mat.indices + ) # build packet_nr_to_dofs packet_nr_to_dofs = {} @@ -136,8 +135,9 @@ class PacketedSpMV: dof_packet.append(i) - packet_nr_to_dofs = [packet_nr_to_dofs.get(i) - for i in range(len(packet_nr_to_dofs))] + packet_nr_to_dofs = [ + packet_nr_to_dofs.get(i) for i in range(len(packet_nr_to_dofs)) + ] too_big = False for packet_dofs in packet_nr_to_dofs: @@ -147,10 +147,13 @@ class PacketedSpMV: if too_big: old_block_count = self.block_count - self.block_count = int(2+1.05*self.block_count) - print(("Metis produced a big block at block count " - "%d--retrying with %d" - % (old_block_count, self.block_count))) + self.block_count = int(2 + 1.05 * self.block_count) + print( + ( + "Metis produced a big block at block count " + "%d--retrying with %d" % (old_block_count, self.block_count) + ) + ) continue break @@ -158,51 +161,55 @@ class PacketedSpMV: assert len(packet_nr_to_dofs) == self.block_count # permutations, base rows --------------------------------------------- - new2old_fetch_indices, \ - old2new_fetch_indices, \ - packet_base_rows = self.find_simple_index_stuff( - packet_nr_to_dofs) + ( + new2old_fetch_indices, + old2new_fetch_indices, + packet_base_rows, + ) = self.find_simple_index_stuff(packet_nr_to_dofs) # find local row cost and remaining_coo ------------------------------- - local_row_costs, remaining_coo = \ - self.find_local_row_costs_and_remaining_coo( - csr_mat, dof_to_packet_nr, old2new_fetch_indices) + local_row_costs, remaining_coo = self.find_local_row_costs_and_remaining_coo( + csr_mat, dof_to_packet_nr, old2new_fetch_indices + ) local_nnz = np.sum(local_row_costs) assert remaining_coo.nnz == csr_mat.nnz - local_nnz # find thread assignment for each block ------------------------------- - thread_count = len(packet_nr_to_dofs)*self.threads_per_packet + thread_count = len(packet_nr_to_dofs) * self.threads_per_packet thread_assignments, thread_costs = self.find_thread_assignment( - packet_nr_to_dofs, local_row_costs, thread_count) + packet_nr_to_dofs, local_row_costs, thread_count + ) max_thread_costs = np.max(thread_costs) # build data structure ------------------------------------------------ from .pkt_build import build_pkt_data_structure - build_pkt_data_structure(self, packet_nr_to_dofs, max_thread_costs, - old2new_fetch_indices, csr_mat, thread_count, thread_assignments, - local_row_costs) + + build_pkt_data_structure( + self, + packet_nr_to_dofs, + max_thread_costs, + old2new_fetch_indices, + csr_mat, + thread_count, + thread_assignments, + local_row_costs, + ) self.packet_base_rows = gpuarray.to_gpu(packet_base_rows) - self.new2old_fetch_indices = gpuarray.to_gpu( - new2old_fetch_indices) - self.old2new_fetch_indices = gpuarray.to_gpu( - old2new_fetch_indices) + self.new2old_fetch_indices = gpuarray.to_gpu(new2old_fetch_indices) + self.old2new_fetch_indices = gpuarray.to_gpu(old2new_fetch_indices) from .coordinate import CoordinateSpMV - self.remaining_coo_gpu = CoordinateSpMV( - remaining_coo, dtype) + + self.remaining_coo_gpu = CoordinateSpMV(remaining_coo, dtype) def find_simple_index_stuff(self, packet_nr_to_dofs): - new2old_fetch_indices = np.zeros( - self.shape[0], dtype=self.index_dtype) - old2new_fetch_indices = np.zeros( - self.shape[0], dtype=self.index_dtype) + new2old_fetch_indices = np.zeros(self.shape[0], dtype=self.index_dtype) + old2new_fetch_indices = np.zeros(self.shape[0], dtype=self.index_dtype) - packet_base_rows = np.zeros( - self.block_count+1, - dtype=self.index_dtype) + packet_base_rows = np.zeros(self.block_count + 1, dtype=self.index_dtype) row_start = 0 for packet_nr, packet in enumerate(packet_nr_to_dofs): @@ -210,22 +217,22 @@ class PacketedSpMV: row_end = row_start + len(packet) pkt_indices = np.array(packet, dtype=self.index_dtype) - new2old_fetch_indices[row_start:row_end] = \ - pkt_indices - old2new_fetch_indices[pkt_indices] = \ - np.arange(row_start, row_end, dtype=self.index_dtype) + new2old_fetch_indices[row_start:row_end] = pkt_indices + old2new_fetch_indices[pkt_indices] = np.arange( + row_start, row_end, dtype=self.index_dtype + ) row_start += len(packet) packet_base_rows[self.block_count] = row_start - return (new2old_fetch_indices, old2new_fetch_indices, - packet_base_rows) + return (new2old_fetch_indices, old2new_fetch_indices, packet_base_rows) - def find_local_row_costs_and_remaining_coo(self, csr_mat, dof_to_packet_nr, - old2new_fetch_indices): + def find_local_row_costs_and_remaining_coo( + self, csr_mat, dof_to_packet_nr, old2new_fetch_indices + ): h, w = self.shape - local_row_costs = [0]*h + local_row_costs = [0] * h rem_coo_values = [] rem_coo_i = [] rem_coo_j = [] @@ -235,7 +242,7 @@ class PacketedSpMV: data = csr_mat.data for i in range(h): - for idx in range(iptr[i], iptr[i+1]): + for idx in range(iptr[i], iptr[i + 1]): j = indices[idx] if dof_to_packet_nr[i] == dof_to_packet_nr[j]: @@ -246,29 +253,29 @@ class PacketedSpMV: rem_coo_j.append(old2new_fetch_indices[j]) from scipy.sparse import coo_matrix + remaining_coo = coo_matrix( - (rem_coo_values, (rem_coo_i, rem_coo_j)), self.shape, - dtype=self.dtype) + (rem_coo_values, (rem_coo_i, rem_coo_j)), self.shape, dtype=self.dtype + ) return local_row_costs, remaining_coo - def find_thread_assignment(self, packet_nr_to_dofs, local_row_cost, - thread_count): + def find_thread_assignment(self, packet_nr_to_dofs, local_row_cost, thread_count): thread_assignments = [[] for i in range(thread_count)] thread_costs = np.zeros(thread_count) for packet_nr, packet_dofs in enumerate(packet_nr_to_dofs): row_costs_and_numbers = sorted( - [(local_row_cost[i], i) for i in packet_dofs], - reverse=True) + [(local_row_cost[i], i) for i in packet_dofs], reverse=True + ) - base_thread_nr = packet_nr*self.threads_per_packet + base_thread_nr = packet_nr * self.threads_per_packet thread_offset = 0 # zigzag assignment step = 1 for row_cost, row_number in row_costs_and_numbers: - ti = base_thread_nr+thread_offset + ti = base_thread_nr + thread_offset thread_assignments[ti].append(row_number) thread_costs[ti] += row_cost @@ -281,16 +288,29 @@ class PacketedSpMV: return thread_assignments, thread_costs - def build_gpu_data_structure(self, packet_nr_to_dofs, max_thread_costs, - old2new_fetch_indices, csr_mat, thread_count, thread_assignments, - local_row_costs): + def build_gpu_data_structure( + self, + packet_nr_to_dofs, + max_thread_costs, + old2new_fetch_indices, + csr_mat, + thread_count, + thread_assignments, + local_row_costs, + ): # these arrays will likely be too long, but that's ok from .pkt_build import build_pkt_structure - build_pkt_structure(self, packet_nr_to_dofs, thread_assignments, - thread_starts, thread_ends, index_array, data_array) - + build_pkt_structure( + self, + packet_nr_to_dofs, + thread_assignments, + # thread_starts, + # thread_ends, + # index_array, + # data_array, + ) # copy data to the gpu ------------------------------------------------ @@ -300,13 +320,16 @@ class PacketedSpMV: from pycuda.tools import dtype_to_ctype mod = SourceModule( - PKT_KERNEL_TEMPLATE % { - "value_type": dtype_to_ctype(self.dtype), - "index_type": dtype_to_ctype(self.index_dtype), - "packed_index_type": dtype_to_ctype(self.packed_index_dtype), - "threads_per_packet": self.threads_per_packet, - "rows_per_packet": self.rows_per_packet, - }, no_extern_c=True) + PKT_KERNEL_TEMPLATE + % { + "value_type": dtype_to_ctype(self.dtype), + "index_type": dtype_to_ctype(self.index_dtype), + "packed_index_type": dtype_to_ctype(self.packed_index_dtype), + "threads_per_packet": self.threads_per_packet, + "rows_per_packet": self.rows_per_packet, + }, + no_extern_c=True, + ) func = mod.get_function("spmv_pkt_kernel") func.prepare("PPPPPPP") return func @@ -319,21 +342,20 @@ class PacketedSpMV: def __call__(self, x, y=None): if y is None: - y = gpuarray.zeros(self.shape[0], dtype=self.dtype, - allocator=x.allocator) + y = gpuarray.zeros(self.shape[0], dtype=self.dtype, allocator=x.allocator) self.get_kernel().prepared_call( - (self.block_count, 1), - (self.threads_per_packet, 1, 1), - self.packet_base_rows.gpudata, - self.thread_starts.gpudata, - self.thread_ends.gpudata, - self.index_array.gpudata, - self.data_array.gpudata, - x.gpudata, - y.gpudata) + (self.block_count, 1), + (self.threads_per_packet, 1, 1), + self.packet_base_rows.gpudata, + self.thread_starts.gpudata, + self.thread_ends.gpudata, + self.index_array.gpudata, + self.data_array.gpudata, + x.gpudata, + y.gpudata, + ) self.remaining_coo_gpu(x, y) return y - diff --git a/pycuda/sparse/pkt_build.py b/pycuda/sparse/pkt_build.py index 278ca8ed..cde771ad 100644 --- a/pycuda/sparse/pkt_build.py +++ b/pycuda/sparse/pkt_build.py @@ -4,41 +4,44 @@ import pycuda.gpuarray as gpuarray from six.moves import range - - -def build_pkt_data_structure(spmv, packet_nr_to_dofs, max_thread_costs, - old2new_fetch_indices, csr_mat, thread_count, thread_assignments, - local_row_costs): +def build_pkt_data_structure( + spmv, + packet_nr_to_dofs, + max_thread_costs, + old2new_fetch_indices, + csr_mat, + thread_count, + thread_assignments, + local_row_costs, +): packet_start = 0 base_dof_nr = 0 max_thread_costs = int(max_thread_costs) index_array = np.zeros( - max_thread_costs*thread_count, dtype=spmv.packed_index_dtype) - data_array = np.zeros( - max_thread_costs*thread_count, dtype=spmv.dtype) - thread_starts = np.zeros( - thread_count, dtype=spmv.index_dtype) - thread_ends = np.zeros( - thread_count, dtype=spmv.index_dtype) + max_thread_costs * thread_count, dtype=spmv.packed_index_dtype + ) + data_array = np.zeros(max_thread_costs * thread_count, dtype=spmv.dtype) + thread_starts = np.zeros(thread_count, dtype=spmv.index_dtype) + thread_ends = np.zeros(thread_count, dtype=spmv.index_dtype) for packet_nr, packet_dofs in enumerate(packet_nr_to_dofs): - base_thread_nr = packet_nr*spmv.threads_per_packet + base_thread_nr = packet_nr * spmv.threads_per_packet max_packet_items = 0 for thread_offset in range(spmv.threads_per_packet): - thread_write_idx = packet_start+thread_offset - thread_start = packet_start+thread_offset - thread_starts[base_thread_nr+thread_offset] = thread_write_idx + thread_write_idx = packet_start + thread_offset + thread_start = packet_start + thread_offset + thread_starts[base_thread_nr + thread_offset] = thread_write_idx - for row_nr in thread_assignments[base_thread_nr+thread_offset]: + for row_nr in thread_assignments[base_thread_nr + thread_offset]: perm_row_nr = old2new_fetch_indices[row_nr] rel_row_nr = perm_row_nr - base_dof_nr assert 0 <= rel_row_nr < len(packet_dofs) row_entries = 0 - for idx in range(csr_mat.indptr[row_nr], csr_mat.indptr[row_nr+1]): + for idx in range(csr_mat.indptr[row_nr], csr_mat.indptr[row_nr + 1]): col_nr = csr_mat.indices[idx] perm_col_nr = old2new_fetch_indices[col_nr] @@ -52,14 +55,13 @@ def build_pkt_data_structure(spmv, packet_nr_to_dofs, max_thread_costs, assert row_entries == local_row_costs[row_nr] - thread_ends[base_thread_nr+thread_offset] = thread_write_idx + thread_ends[base_thread_nr + thread_offset] = thread_write_idx - thread_items = (thread_write_idx - thread_start)//spmv.threads_per_packet - max_packet_items = max( - max_packet_items, thread_items) + thread_items = (thread_write_idx - thread_start) // spmv.threads_per_packet + max_packet_items = max(max_packet_items, thread_items) base_dof_nr += len(packet_dofs) - packet_start += max_packet_items*spmv.threads_per_packet + packet_start += max_packet_items * spmv.threads_per_packet spmv.thread_starts = gpuarray.to_gpu(thread_starts) spmv.thread_ends = gpuarray.to_gpu(thread_ends) @@ -67,12 +69,10 @@ def build_pkt_data_structure(spmv, packet_nr_to_dofs, max_thread_costs, spmv.data_array = gpuarray.to_gpu(data_array) - - try: import pyximport except ImportError: pass else: pyximport.install() - from pycuda.sparse.pkt_build_cython import build_pkt_data_structure + from pycuda.sparse.pkt_build_cython import build_pkt_data_structure # noqa: F811, F401 diff --git a/pycuda/tools.py b/pycuda/tools.py index c243b911..4b7f6b5b 100644 --- a/pycuda/tools.py +++ b/pycuda/tools.py @@ -37,15 +37,18 @@ import pycuda._driver as _drv import numpy as np +from pycuda.compyte.dtypes import ( # noqa: F401 + register_dtype, + get_or_register_dtype, + _fill_dtype_registry, + dtype_to_ctype as base_dtype_to_ctype, +) + bitlog2 = _drv.bitlog2 DeviceMemoryPool = _drv.DeviceMemoryPool PageLockedMemoryPool = _drv.PageLockedMemoryPool PageLockedAllocator = _drv.PageLockedAllocator -from pycuda.compyte.dtypes import ( - register_dtype, get_or_register_dtype, _fill_dtype_registry, - dtype_to_ctype as base_dtype_to_ctype) - _fill_dtype_registry(respect_windows=True) get_or_register_dtype("pycuda::complex", np.complex64) get_or_register_dtype("pycuda::complex", np.complex128) @@ -53,6 +56,7 @@ get_or_register_dtype("pycuda::complex", np.complex128) # {{{ debug memory pool + class DebugMemoryPool(DeviceMemoryPool): def __init__(self, interactive=True, logfile=None): DeviceMemoryPool.__init__(self) @@ -61,19 +65,23 @@ class DebugMemoryPool(DeviceMemoryPool): if logfile is None: import sys + logfile = sys.stdout self.logfile = logfile from weakref import WeakKeyDictionary + self.blocks = WeakKeyDictionary() if interactive: from pytools.diskdict import DiskDict + self.stacktrace_mnemonics = DiskDict("pycuda-stacktrace-mnemonics") def allocate(self, size): from traceback import extract_stack + stack = tuple(frm[2] for frm in extract_stack()) description = self.describe(stack, size) @@ -82,21 +90,27 @@ class DebugMemoryPool(DeviceMemoryPool): histogram[bsize, descr] = histogram.get((bsize, descr), 0) + 1 from pytools import common_prefix + cpfx = common_prefix(descr for bsize, descr in histogram) print( - "\n Allocation of size %d occurring " - "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):" - "\n at: %s" % ( - (size, self.last_free) + cuda.mem_get_info() - + (self.held_blocks, self.active_blocks, - description)), - file=self.logfile) + "\n Allocation of size %d occurring " + "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):" + "\n at: %s" + % ( + (size, self.last_free) + + cuda.mem_get_info() + + (self.held_blocks, self.active_blocks, description) + ), + file=self.logfile, + ) hist_items = sorted(list(six.iteritems(histogram))) for (bsize, descr), count in hist_items: - print(" %s (%d bytes): %dx" % (descr[len(cpfx):], bsize, count), - file=self.logfile) + print( + " %s (%d bytes): %dx" % (descr[len(cpfx):], bsize, count), + file=self.logfile, + ) if self.interactive: input(" [Enter]") @@ -116,33 +130,43 @@ class DebugMemoryPool(DeviceMemoryPool): print(size, stack) while True: mnemonic = input("Enter mnemonic or [Enter] for more info:") - if mnemonic == '': + if mnemonic == "": from traceback import print_stack + print_stack() else: break self.stacktrace_mnemonics[stack, size] = mnemonic return mnemonic + # }}} # {{{ default device/context + def get_default_device(default=0): from warnings import warn - warn("get_default_device() is deprecated; " - "use make_default_context() instead", DeprecationWarning) + + warn( + "get_default_device() is deprecated; " "use make_default_context() instead", + DeprecationWarning, + ) from pycuda.driver import Device import os + dev = os.environ.get("CUDA_DEVICE") if dev is None: try: - dev = (open(os.path.join(os.path.expanduser("~"), ".cuda_device")) - .read().strip()) - except: + dev = ( + open(os.path.join(os.path.expanduser("~"), ".cuda_device")) + .read() + .strip() + ) + except Exception: pass if dev is None: @@ -151,24 +175,28 @@ def get_default_device(default=0): try: dev = int(dev) except TypeError: - raise TypeError("CUDA device number (CUDA_DEVICE or ~/.cuda-device) " - "must be an integer") + raise TypeError( + "CUDA device number (CUDA_DEVICE or ~/.cuda-device) " "must be an integer" + ) return Device(dev) def make_default_context(ctx_maker=None): if ctx_maker is None: + def ctx_maker(dev): return dev.make_context() ndevices = cuda.Device.count() if ndevices == 0: - raise RuntimeError("No CUDA enabled device found. " - "Please check your installation.") + raise RuntimeError( + "No CUDA enabled device found. " "Please check your installation." + ) # Is CUDA_DEVICE set? import os + devn = os.environ.get("CUDA_DEVICE") # Is $HOME/.cuda_device set ? @@ -176,9 +204,8 @@ def make_default_context(ctx_maker=None): try: homedir = os.environ.get("HOME") assert homedir is not None - devn = (open(os.path.join(homedir, ".cuda_device")) - .read().strip()) - except: + devn = open(os.path.join(homedir, ".cuda_device")).read().strip() + except Exception: pass # If either CUDA_DEVICE or $HOME/.cuda_device is set, try to use it @@ -186,8 +213,10 @@ def make_default_context(ctx_maker=None): try: devn = int(devn) except TypeError: - raise TypeError("CUDA device number (CUDA_DEVICE or ~/.cuda_device)" - " must be an integer") + raise TypeError( + "CUDA device number (CUDA_DEVICE or ~/.cuda_device)" + " must be an integer" + ) dev = cuda.Device(devn) return ctx_maker(dev) @@ -201,14 +230,18 @@ def make_default_context(ctx_maker=None): except cuda.Error: pass - raise RuntimeError("make_default_context() wasn't able to create a context " - "on any of the %d detected devices" % ndevices) + raise RuntimeError( + "make_default_context() wasn't able to create a context " + "on any of the %d detected devices" % ndevices + ) + # }}} # {{{ rounding helpers + def _exact_div(dividend, divisor): quot, rem = divmod(dividend, divisor) assert rem == 0 @@ -220,7 +253,8 @@ def _int_ceiling(value, multiple_of=1): # Mimicks the Excel "floor" function (for code stolen from occupancy calculator) from math import ceil - return int(ceil(value/multiple_of))*multiple_of + + return int(ceil(value / multiple_of)) * multiple_of def _int_floor(value, multiple_of=1): @@ -228,13 +262,16 @@ def _int_floor(value, multiple_of=1): # Mimicks the Excel "floor" function (for code stolen from occupancy calculator) from math import floor - return int(floor(value/multiple_of))*multiple_of + + return int(floor(value / multiple_of)) * multiple_of + # }}} # {{{ device data + class DeviceData: def __init__(self, dev=None): import pycuda.driver as drv @@ -242,8 +279,7 @@ class DeviceData: if dev is None: dev = cuda.Context.get_device() - self.max_threads = dev.get_attribute( - drv.device_attribute.MAX_THREADS_PER_BLOCK) + self.max_threads = dev.get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK) self.warp_size = dev.get_attribute(drv.device_attribute.WARP_SIZE) if dev.compute_capability() >= (3, 0): @@ -256,10 +292,10 @@ class DeviceData: self.warps_per_mp = 24 self.thread_blocks_per_mp = 8 - self.registers = dev.get_attribute( - drv.device_attribute.MAX_REGISTERS_PER_BLOCK) + self.registers = dev.get_attribute(drv.device_attribute.MAX_REGISTERS_PER_BLOCK) self.shared_memory = dev.get_attribute( - drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK) + drv.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK + ) if dev.compute_capability() >= (2, 0): self.smem_alloc_granularity = 128 @@ -277,8 +313,7 @@ class DeviceData: return _int_ceiling(bytes, self.align_bytes(word_size)) def align_dtype(self, elements, dtype_size): - return _int_ceiling(elements, - self.align_words(dtype_size)) + return _int_ceiling(elements, self.align_words(dtype_size)) def align_words(self, word_size): return _exact_div(self.align_bytes(word_size), word_size) @@ -298,31 +333,35 @@ class DeviceData: @staticmethod def make_valid_tex_channel_count(size): - valid_sizes = [1,2,4] + valid_sizes = [1, 2, 4] for vs in valid_sizes: if size <= vs: return vs raise ValueError("could not enlarge argument to valid channel count") + # }}} # {{{ occupancy + class OccupancyRecord: def __init__(self, devdata, threads, shared_mem=0, registers=0): if threads > devdata.max_threads: raise ValueError("too many threads") # copied literally from occupancy calculator - alloc_warps = _int_ceiling(threads/devdata.warp_size) + alloc_warps = _int_ceiling(threads / devdata.warp_size) alloc_smem = _int_ceiling(shared_mem, devdata.smem_alloc_granularity) if devdata.register_allocation_unit == "warp": - alloc_regs = alloc_warps*32*registers + alloc_regs = alloc_warps * 32 * registers elif devdata.register_allocation_unit == "block": - alloc_regs = _int_ceiling(alloc_warps*2, 4)*16*registers + alloc_regs = _int_ceiling(alloc_warps * 2, 4) * 16 * registers else: - raise ValueError("Improper register allocation unit:"+devdata.register_allocation_unit) + raise ValueError( + "Improper register allocation unit:" + devdata.register_allocation_unit + ) if alloc_regs > devdata.registers: raise ValueError("too many registers") @@ -330,33 +369,37 @@ class OccupancyRecord: if alloc_smem > devdata.shared_memory: raise ValueError("too much smem") - self.tb_per_mp_limits = [(devdata.thread_blocks_per_mp, "device"), - (_int_floor(devdata.warps_per_mp/alloc_warps), "warps") - ] + self.tb_per_mp_limits = [ + (devdata.thread_blocks_per_mp, "device"), + (_int_floor(devdata.warps_per_mp / alloc_warps), "warps"), + ] if registers > 0: - self.tb_per_mp_limits.append((_int_floor(devdata.registers/alloc_regs), "regs")) + self.tb_per_mp_limits.append( + (_int_floor(devdata.registers / alloc_regs), "regs") + ) if shared_mem > 0: - self.tb_per_mp_limits.append((_int_floor(devdata.shared_memory/alloc_smem), "smem")) + self.tb_per_mp_limits.append( + (_int_floor(devdata.shared_memory / alloc_smem), "smem") + ) self.tb_per_mp, self.limited_by = min(self.tb_per_mp_limits) self.warps_per_mp = self.tb_per_mp * alloc_warps self.occupancy = self.warps_per_mp / devdata.warps_per_mp + # }}} # {{{ C types <-> dtypes + class Argument: def __init__(self, dtype, name): self.dtype = np.dtype(dtype) self.name = name def __repr__(self): - return "%s(%r, %s)" % ( - self.__class__.__name__, - self.name, - self.dtype) + return "%s(%r, %s)" % (self.__class__.__name__, self.name, self.dtype) def dtype_to_ctype(dtype, with_fp_tex_hack=False): @@ -383,6 +426,7 @@ class VectorArg(Argument): struct_char = "P" + class ScalarArg(Argument): def declarator(self): return "%s %s" % (dtype_to_ctype(self.dtype), self.name) @@ -396,15 +440,16 @@ class ScalarArg(Argument): return result - - def parse_c_arg(c_arg): from pycuda.compyte.dtypes import parse_c_arg_backend + return parse_c_arg_backend(c_arg, ScalarArg, VectorArg) + def get_arg_type(c_arg): return parse_c_arg(c_arg).struct_char + # }}} # {{{ context-dep memoization @@ -412,8 +457,6 @@ def get_arg_type(c_arg): context_dependent_memoized_functions = [] - - @decorator def context_dependent_memoize(func, *args): try: @@ -435,7 +478,6 @@ def context_dependent_memoize(func, *args): return result - def clear_context_caches(): for func in context_dependent_memoized_functions: try: @@ -445,13 +487,16 @@ def clear_context_caches(): else: ctx_dict.clear() + # }}} # {{{ py.test interaction + def mark_cuda_test(inner_f): def f(*args, **kwargs): import pycuda.driver + # appears to be idempotent, i.e. no harm in calling it more than once pycuda.driver.init() @@ -465,9 +510,11 @@ def mark_cuda_test(inner_f): ctx.pop() from pycuda.tools import clear_context_caches + clear_context_caches() from gc import collect + collect() try: @@ -477,8 +524,8 @@ def mark_cuda_test(inner_f): return mark_test.cuda(f) -# }}} +# }}} # vim: foldmethod=marker diff --git a/setup.cfg b/setup.cfg index b4f38af6..29183507 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,9 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,W503 +ignore = E126,E127,E128,E123,E226,E241,E242,W503,N806,F405,E501 +# FIXME: fix in future: N806, F405, E501 max-line-length=85 +exclude=pycuda/compyte/ndarray,pycuda/compyte/array.py + +inline-quotes = " +docstring-quotes = """ +multiline-quotes = """ diff --git a/test/test_cumath.py b/test/test_cumath.py index 35fcbfa3..a8f1f15c 100644 --- a/test/test_cumath.py +++ b/test/test_cumath.py @@ -6,18 +6,9 @@ from pycuda.tools import mark_cuda_test from six.moves import range -def have_pycuda(): - try: - import pycuda # noqa - return True - except: - return False - - -if have_pycuda(): - import pycuda.gpuarray as gpuarray - import pycuda.driver as drv # noqa - import pycuda.cumath as cumath +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv # noqa +import pycuda.cumath as cumath sizes = [10, 128, 1024, 1 << 10, 1 << 13] @@ -26,10 +17,10 @@ complex_dtypes = [np.complex64, np.complex128] numpy_func_names = { - "asin": "arcsin", - "acos": "arccos", - "atan": "arctan", - } + "asin": "arcsin", + "acos": "arccos", + "atan": "arctan", +} def make_unary_function_test(name, a=0, b=1, threshold=0, complex=False): @@ -44,71 +35,67 @@ def make_unary_function_test(name, a=0, b=1, threshold=0, complex=False): for s in sizes: for dtype in _dtypes: np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) if complex: - A += (np.random.random(s)*(b-a) + a)*1j + A += (np.random.random(s) * (b - a) + a) * 1j args = gpuarray.to_gpu(A) gpu_results = gpu_func(args).get() cpu_results = cpu_func(A) max_err = np.max(np.abs(cpu_results - gpu_results)) - assert (max_err <= threshold).all(), \ - (max_err, name, dtype) + assert (max_err <= threshold).all(), (max_err, name, dtype) gpu_results2 = gpuarray.empty_like(args) gr2 = gpu_func(args, out=gpu_results2) assert gpu_results2 is gr2 gr2 = gr2.get() max_err = np.max(np.abs(cpu_results - gr2)) - assert (max_err <= threshold).all(), \ - (max_err, name, dtype) + assert (max_err <= threshold).all(), (max_err, name, dtype) return mark_cuda_test(test) -if have_pycuda(): - test_ceil = make_unary_function_test("ceil", -10, 10) - test_floor = make_unary_function_test("ceil", -10, 10) - test_fabs = make_unary_function_test("fabs", -10, 10) - test_exp = make_unary_function_test("exp", -3, 3, 1e-5) - test_exp_c = make_unary_function_test("exp", -3, 3, 1e-5, complex=True) - test_log = make_unary_function_test("log", 1e-5, 1, 5e-7) - test_log10 = make_unary_function_test("log10", 1e-5, 1, 3e-7) - test_sqrt = make_unary_function_test("sqrt", 1e-5, 1, 2e-7) - - test_sin = make_unary_function_test("sin", -10, 10, 1e-7) - test_sin_c = make_unary_function_test("sin", -3, 3, 2.1e-6, complex=True) - test_cos = make_unary_function_test("cos", -10, 10, 1e-7) - test_cos_c = make_unary_function_test("cos", -3, 3, 2.1e-6, complex=True) - test_asin = make_unary_function_test("asin", -0.9, 0.9, 5e-7) - #test_sin_c = make_unary_function_test("sin", -0.9, 0.9, 2e-6, complex=True) - test_acos = make_unary_function_test("acos", -0.9, 0.9, 5e-7) - #test_acos_c = make_unary_function_test("acos", -0.9, 0.9, 2e-6, complex=True) - test_tan = make_unary_function_test("tan", - -math.pi/2 + 0.1, math.pi/2 - 0.1, 1e-5) - test_tan_c = make_unary_function_test("tan", - -math.pi/2 + 0.1, math.pi/2 - 0.1, 3e-5, complex=True) - test_atan = make_unary_function_test("atan", -10, 10, 2e-7) - - test_sinh = make_unary_function_test("sinh", -3, 3, 2e-6) - test_sinh_c = make_unary_function_test("sinh", -3, 3, 3e-6, complex=True) - test_cosh = make_unary_function_test("cosh", -3, 3, 2e-6) - test_cosh_c = make_unary_function_test("cosh", -3, 3, 3e-6, complex=True) - test_tanh = make_unary_function_test("tanh", -3, 3, 2e-6) - test_tanh_c = make_unary_function_test("tanh", - -math.pi/2 + 0.1, math.pi/2 - 0.1, 3e-5, complex=True) +test_ceil = make_unary_function_test("ceil", -10, 10) +test_floor = make_unary_function_test("ceil", -10, 10) +test_fabs = make_unary_function_test("fabs", -10, 10) +test_exp = make_unary_function_test("exp", -3, 3, 1e-5) +test_exp_c = make_unary_function_test("exp", -3, 3, 1e-5, complex=True) +test_log = make_unary_function_test("log", 1e-5, 1, 5e-7) +test_log10 = make_unary_function_test("log10", 1e-5, 1, 3e-7) +test_sqrt = make_unary_function_test("sqrt", 1e-5, 1, 2e-7) + +test_sin = make_unary_function_test("sin", -10, 10, 1e-7) +test_sin_c = make_unary_function_test("sin", -3, 3, 2.1e-6, complex=True) +test_cos = make_unary_function_test("cos", -10, 10, 1e-7) +test_cos_c = make_unary_function_test("cos", -3, 3, 2.1e-6, complex=True) +test_asin = make_unary_function_test("asin", -0.9, 0.9, 5e-7) +# test_sin_c = make_unary_function_test("sin", -0.9, 0.9, 2e-6, complex=True) +test_acos = make_unary_function_test("acos", -0.9, 0.9, 5e-7) +# test_acos_c = make_unary_function_test("acos", -0.9, 0.9, 2e-6, complex=True) +test_tan = make_unary_function_test("tan", -math.pi / 2 + 0.1, math.pi / 2 - 0.1, 1e-5) +test_tan_c = make_unary_function_test( + "tan", -math.pi / 2 + 0.1, math.pi / 2 - 0.1, 3e-5, complex=True +) +test_atan = make_unary_function_test("atan", -10, 10, 2e-7) + +test_sinh = make_unary_function_test("sinh", -3, 3, 2e-6) +test_sinh_c = make_unary_function_test("sinh", -3, 3, 3e-6, complex=True) +test_cosh = make_unary_function_test("cosh", -3, 3, 2e-6) +test_cosh_c = make_unary_function_test("cosh", -3, 3, 3e-6, complex=True) +test_tanh = make_unary_function_test("tanh", -3, 3, 2e-6) +test_tanh_c = make_unary_function_test( + "tanh", -math.pi / 2 + 0.1, math.pi / 2 - 0.1, 3e-5, complex=True +) class TestMath: - disabled = not have_pycuda() - @mark_cuda_test def test_fmod(self): """tests if the fmod function works""" for s in sizes: - a = gpuarray.arange(s, dtype=np.float32)/10 - a2 = gpuarray.arange(s, dtype=np.float32)/45.2 + 0.1 + a = gpuarray.arange(s, dtype=np.float32) / 10 + a2 = gpuarray.arange(s, dtype=np.float32) / 45.2 + 0.1 b = cumath.fmod(a, a2) a = a.get() @@ -123,7 +110,7 @@ class TestMath: """tests if the ldexp function works""" for s in sizes: a = gpuarray.arange(s, dtype=np.float32) - a2 = gpuarray.arange(s, dtype=np.float32)*1e-3 + a2 = gpuarray.arange(s, dtype=np.float32) * 1e-3 b = cumath.ldexp(a, a2) a = a.get() @@ -137,7 +124,7 @@ class TestMath: def test_modf(self): """tests if the modf function works""" for s in sizes: - a = gpuarray.arange(s, dtype=np.float32)/10 + a = gpuarray.arange(s, dtype=np.float32) / 10 fracpart, intpart = cumath.modf(a) a = a.get() @@ -154,7 +141,7 @@ class TestMath: def test_frexp(self): """tests if the frexp function works""" for s in sizes: - a = gpuarray.arange(s, dtype=np.float32)/10 + a = gpuarray.arange(s, dtype=np.float32) / 10 significands, exponents = cumath.frexp(a) a = a.get() @@ -178,12 +165,12 @@ class TestMath: for s in sizes: for dtype in dtypes: np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) if complex: - A = A + (np.random.random(s)*(b-a) + a)*1j + A = A + (np.random.random(s) * (b - a) + a) * 1j np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) args = gpuarray.to_gpu(A) # 'out' kw @@ -203,7 +190,7 @@ class TestMath: # 'stream' kw mystream = Stream() np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) args = gpuarray.to_gpu(A) gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, stream=mystream).get() @@ -214,7 +201,7 @@ class TestMath: # 'stream' position mystream = Stream() np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) args = gpuarray.to_gpu(A) gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, mystream).get() @@ -225,7 +212,7 @@ class TestMath: # 'out' and 'stream' kw mystream = Stream() np.random.seed(1) - A = (np.random.random(s)*(b-a) + a).astype(dtype) + A = (np.random.random(s) * (b - a) + a).astype(dtype) args = gpuarray.to_gpu(A) gpu_results = gpuarray.empty_like(args) gpu_results = gpu_func(args, stream=mystream, out=gpu_results).get() @@ -239,8 +226,10 @@ if __name__ == "__main__": import pycuda.autoinit # noqa import sys + if len(sys.argv) > 1: - exec (sys.argv[1]) + exec(sys.argv[1]) else: from pytest import main + main([__file__]) diff --git a/test/test_driver.py b/test/test_driver.py index 9ff010e6..35a52e1d 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -6,38 +6,29 @@ import pytest # noqa from six.moves import range -def have_pycuda(): - try: - import pycuda # noqa - return True - except Exception: - return False - - -if have_pycuda(): - import pycuda.gpuarray as gpuarray - import pycuda.driver as drv - from pycuda.compiler import SourceModule +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv +from pycuda.compiler import SourceModule class TestDriver: - disabled = not have_pycuda() - @mark_cuda_test def test_memory(self): z = np.random.randn(400).astype(np.float32) new_z = drv.from_device_like(drv.to_device(z), z) - assert la.norm(new_z-z) == 0 + assert la.norm(new_z - z) == 0 @mark_cuda_test def test_simple_kernel(self): - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x; dest[i] = a[i] * b[i]; } - """) + """ + ) multiply_them = mod.get_function("multiply_them") @@ -45,20 +36,20 @@ class TestDriver: b = np.random.randn(400).astype(np.float32) dest = np.zeros_like(a) - multiply_them( - drv.Out(dest), drv.In(a), drv.In(b), - block=(400, 1, 1)) - assert la.norm(dest-a*b) == 0 + multiply_them(drv.Out(dest), drv.In(a), drv.In(b), block=(400, 1, 1)) + assert la.norm(dest - a * b) == 0 @mark_cuda_test def test_simple_kernel_2(self): - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x; dest[i] = a[i] * b[i]; } - """) + """ + ) multiply_them = mod.get_function("multiply_them") @@ -68,29 +59,29 @@ class TestDriver: b_gpu = drv.to_device(b) dest = np.zeros_like(a) - multiply_them( - drv.Out(dest), a_gpu, b_gpu, - block=(400, 1, 1)) - assert la.norm(dest-a*b) == 0 + multiply_them(drv.Out(dest), a_gpu, b_gpu, block=(400, 1, 1)) + assert la.norm(dest - a * b) == 0 drv.Context.synchronize() # now try with offsets dest = np.zeros_like(a) multiply_them( - drv.Out(dest), np.intp(a_gpu)+a.itemsize, b_gpu, - block=(399, 1, 1)) + drv.Out(dest), np.intp(a_gpu) + a.itemsize, b_gpu, block=(399, 1, 1) + ) - assert la.norm((dest[:-1]-a[1:]*b[:-1])) == 0 + assert la.norm((dest[:-1] - a[1:] * b[:-1])) == 0 @mark_cuda_test def test_vector_types(self): - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void set_them(float3 *dest, float3 x) { const int i = threadIdx.x; dest[i] = x; } - """) + """ + ) set_them = mod.get_function("set_them") a = gpuarray.vec.make_float3(1, 2, 3) @@ -105,13 +96,15 @@ class TestDriver: # and data copying is asynchronous. Observe how this necessitates the # use of page-locked memory. - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x*blockDim.y + threadIdx.y; dest[i] = a[i] * b[i]; } - """) + """ + ) multiply_them = mod.get_function("multiply_them") @@ -130,28 +123,27 @@ class TestDriver: strm.synchronize() dest = drv.pagelocked_empty_like(a) - multiply_them( - drv.Out(dest), a_gpu, b_gpu, - block=shape+(1,), stream=strm) + multiply_them(drv.Out(dest), a_gpu, b_gpu, block=shape + (1,), stream=strm) strm.synchronize() drv.memcpy_dtoh_async(a, a_gpu, strm) drv.memcpy_dtoh_async(b, b_gpu, strm) strm.synchronize() - assert la.norm(dest-a*b) == 0 + assert la.norm(dest - a * b) == 0 @mark_cuda_test def test_gpuarray(self): a = np.arange(200000, dtype=np.float32) b = a + 17 import pycuda.gpuarray as gpuarray + a_g = gpuarray.to_gpu(a) b_g = gpuarray.to_gpu(b) - diff = (a_g-3*b_g+(-a_g)).get() - (a-3*b+(-a)) + diff = (a_g - 3 * b_g + (-a_g)).get() - (a - 3 * b + (-a)) assert la.norm(diff) == 0 - diff = ((a_g*b_g).get()-a*b) + diff = (a_g * b_g).get() - a * b assert la.norm(diff) == 0 @mark_cuda_test @@ -162,14 +154,15 @@ class TestDriver: shape = (10,) a = blas.ones(shape, dtype=np.float32) - b = 33*blas.ones(shape, dtype=np.float32) - assert ((-a+b).from_gpu() == 32).all() + b = 33 * blas.ones(shape, dtype=np.float32) + assert ((-a + b).from_gpu() == 32).all() self.test_streamed_kernel() @mark_cuda_test def test_2d_texture(self): - mod = SourceModule(""" + mod = SourceModule( + """ texture mtx_tex; __global__ void copy_texture(float *dest) @@ -179,7 +172,8 @@ class TestDriver: int w = blockDim.y; dest[row*w+col] = tex2D(mtx_tex, row, col); } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") @@ -189,16 +183,13 @@ class TestDriver: drv.matrix_to_texref(a, mtx_tex, order="F") dest = np.zeros(shape, dtype=np.float32) - copy_texture( - drv.Out(dest), - block=shape+(1,), - texrefs=[mtx_tex] - ) - assert la.norm(dest-a) == 0 + copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex]) + assert la.norm(dest - a) == 0 @mark_cuda_test def test_multiple_2d_textures(self): - mod = SourceModule(""" + mod = SourceModule( + """ texture mtx_tex; texture mtx2_tex; @@ -212,28 +203,27 @@ class TestDriver: + tex2D(mtx2_tex, row, col); } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx2_tex = mod.get_texref("mtx2_tex") - shape = (3,4) + shape = (3, 4) a = np.random.randn(*shape).astype(np.float32) b = np.random.randn(*shape).astype(np.float32) drv.matrix_to_texref(a, mtx_tex, order="F") drv.matrix_to_texref(b, mtx2_tex, order="F") dest = np.zeros(shape, dtype=np.float32) - copy_texture(drv.Out(dest), - block=shape+(1,), - texrefs=[mtx_tex, mtx2_tex] - ) - assert la.norm(dest-a-b) < 1e-6 + copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex, mtx2_tex]) + assert la.norm(dest - a - b) < 1e-6 @mark_cuda_test def test_multichannel_2d_texture(self): - mod = SourceModule(""" + mod = SourceModule( + """ #define CHANNELS 4 texture mtx_tex; @@ -248,7 +238,8 @@ class TestDriver: dest[(row*w+col)*CHANNELS + 2] = texval.z; dest[(row*w+col)*CHANNELS + 3] = texval.w; } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") @@ -256,25 +247,21 @@ class TestDriver: shape = (5, 6) channels = 4 a = np.asarray( - np.random.randn(*((channels,)+shape)), - dtype=np.float32, order="F") - drv.bind_array_to_texref( - drv.make_multichannel_2d_array(a, order="F"), mtx_tex) + np.random.randn(*((channels,) + shape)), dtype=np.float32, order="F" + ) + drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"), mtx_tex) - dest = np.zeros(shape+(channels,), dtype=np.float32) - copy_texture( - drv.Out(dest), - block=shape+(1,), - texrefs=[mtx_tex] - ) + dest = np.zeros(shape + (channels,), dtype=np.float32) + copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex]) reshaped_a = a.transpose(1, 2, 0) - #print reshaped_a - #print dest - assert la.norm(dest-reshaped_a) == 0 + # print reshaped_a + # print dest + assert la.norm(dest - reshaped_a) == 0 @mark_cuda_test def test_multichannel_linear_texture(self): - mod = SourceModule(""" + mod = SourceModule( + """ #define CHANNELS 4 texture mtx_tex; @@ -287,43 +274,45 @@ class TestDriver: dest[i*CHANNELS + 2] = texval.z; dest[i*CHANNELS + 3] = texval.w; } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") shape = (16, 16) channels = 4 - a = np.random.randn(*(shape+(channels,))).astype(np.float32) + a = np.random.randn(*(shape + (channels,))).astype(np.float32) a_gpu = drv.to_device(a) mtx_tex.set_address(a_gpu, a.nbytes) mtx_tex.set_format(drv.array_format.FLOAT, 4) - dest = np.zeros(shape+(channels,), dtype=np.float32) - copy_texture(drv.Out(dest), - block=shape+(1,), - texrefs=[mtx_tex] - ) - #print a - #print dest - assert la.norm(dest-a) == 0 + dest = np.zeros(shape + (channels,), dtype=np.float32) + copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex]) + # print a + # print dest + assert la.norm(dest - a) == 0 @mark_cuda_test def test_2d_fp_textures(self): orden = "F" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) - if prec == np.complex64: fpName_str = 'fp_tex_cfloat' - elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' - elif prec == np.float64: fpName_str = 'fp_tex_double' - else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints)[:] - A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden) - - myKern = ''' + if prec == np.complex64: + fpName_str = "fp_tex_cfloat" + elif prec == np.complex128: + fpName_str = "fp_tex_cdouble" + elif prec == np.float64: + fpName_str = "fp_tex_double" + else: + fpName_str = prec_str + A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints)[:] + A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) + + myKern = """ #include texture mtx_tex; @@ -334,40 +323,50 @@ class TestDriver: dest[row + col*blockDim.x*gridDim.x] = fp_tex2D(mtx_tex, col, row); } - ''' - myKern = myKern.replace('fpName',fpName_str) - myKern = myKern.replace('cuPres',prec_str) + """ + myKern = myKern.replace("fpName", fpName_str) + myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") - cuBlock = (16,16,1) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,1) - cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) - copy_texture.prepare('P',texrefs=[mtx_tex]) - cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=False) + cuBlock = (16, 16, 1) + if cuBlock[0] > npoints: + cuBlock = (npoints, npoints, 1) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + 1, + ) + copy_texture.prepare("P", texrefs=[mtx_tex]) + cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False) mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata) - assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test - def test_2d_fp_texturesLayered(self): + def test_2d_fp_textures_layered(self): orden = "F" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) - if prec == np.complex64: fpName_str = 'fp_tex_cfloat' - elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' - elif prec == np.float64: fpName_str = 'fp_tex_double' - else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints)[:] - A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden) - - myKern = ''' + if prec == np.complex64: + fpName_str = "fp_tex_cfloat" + elif prec == np.complex128: + fpName_str = "fp_tex_cdouble" + elif prec == np.float64: + fpName_str = "fp_tex_double" + else: + fpName_str = prec_str + A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints)[:] + A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) + + myKern = """ #include texture mtx_tex; @@ -378,22 +377,28 @@ class TestDriver: dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1); } - ''' - myKern = myKern.replace('fpName',fpName_str) - myKern = myKern.replace('cuPres',prec_str) + """ + myKern = myKern.replace("fpName", fpName_str) + myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") - cuBlock = (16,16,1) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,1) - cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) - copy_texture.prepare('P',texrefs=[mtx_tex]) - cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=True) + cuBlock = (16, 16, 1) + if cuBlock[0] > npoints: + cuBlock = (npoints, npoints, 1) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + 1, + ) + copy_texture.prepare("P", texrefs=[mtx_tex]) + cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=True) mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata) - assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test @@ -401,17 +406,21 @@ class TestDriver: orden = "C" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) - if prec == np.complex64: fpName_str = 'fp_tex_cfloat' - elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' - elif prec == np.float64: fpName_str = 'fp_tex_double' - else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints,npoints)[:] - A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden) - - myKern = ''' + if prec == np.complex64: + fpName_str = "fp_tex_cfloat" + elif prec == np.complex128: + fpName_str = "fp_tex_cdouble" + elif prec == np.float64: + fpName_str = "fp_tex_double" + else: + fpName_str = prec_str + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] + A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) + + myKern = """ #include texture mtx_tex; @@ -422,22 +431,28 @@ class TestDriver: int slice = blockIdx.z*blockDim.z + threadIdx.z; dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row); } - ''' - myKern = myKern.replace('fpName',fpName_str) - myKern = myKern.replace('cuPres',prec_str) + """ + myKern = myKern.replace("fpName", fpName_str) + myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") - cuBlock = (8,8,8) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,npoints) - cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),npoints//cuBlock[2]+1*(npoints % cuBlock[1] != 0 )) - copy_texture.prepare('P',texrefs=[mtx_tex]) - cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=False) + cuBlock = (8, 8, 8) + if cuBlock[0] > npoints: + cuBlock = (npoints, npoints, npoints) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0), + ) + copy_texture.prepare("P", texrefs=[mtx_tex]) + cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False) mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata) - assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test @@ -445,30 +460,30 @@ class TestDriver: orden = "C" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: - fpName_str = 'fp_tex_cfloat' - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:].real = np.random.rand(npoints,npoints,npoints)[:] - A_cpu[:].imag = np.random.rand(npoints,npoints,npoints)[:] + fpName_str = "fp_tex_cfloat" + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:].real = np.random.rand(npoints, npoints, npoints)[:] + A_cpu[:].imag = np.random.rand(npoints, npoints, npoints)[:] elif prec == np.complex128: - fpName_str = 'fp_tex_cdouble' - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:].real = np.random.rand(npoints,npoints,npoints)[:] - A_cpu[:].imag = np.random.rand(npoints,npoints,npoints)[:] + fpName_str = "fp_tex_cdouble" + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:].real = np.random.rand(npoints, npoints, npoints)[:] + A_cpu[:].imag = np.random.rand(npoints, npoints, npoints)[:] elif prec == np.float64: - fpName_str = 'fp_tex_double' - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints,npoints)[:] + fpName_str = "fp_tex_double" + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints,npoints)[:]*100. + A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] * 100.0 - A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized + A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized - myKernRW = ''' + myKernRW = """ #include surface mtx_tex; @@ -488,25 +503,35 @@ class TestDriver: dest[tid] = aux; } } - ''' - myKernRW = myKernRW.replace('fpName',fpName_str) - myKernRW = myKernRW.replace('cuPres',prec_str) + """ + myKernRW = myKernRW.replace("fpName", fpName_str) + myKernRW = myKernRW.replace("cuPres", prec_str) modW = SourceModule(myKernRW) copy_texture = modW.get_function("copy_texture") mtx_tex = modW.get_surfref("mtx_tex") - cuBlock = (8,8,8) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,npoints) - cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),npoints//cuBlock[2]+1*(npoints % cuBlock[1] != 0 )) - copy_texture.prepare('Pi')#,texrefs=[mtx_tex]) - A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros - cudaArray = drv.gpuarray_to_array(A_gpu2,orden,allowSurfaceBind=True) - A_cpu = A_gpu.get() # To remember original array + cuBlock = (8, 8, 8) + if cuBlock[0] > npoints: + cuBlock = (npoints, npoints, npoints) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0), + ) + copy_texture.prepare("Pi") # ,texrefs=[mtx_tex]) + A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros + cudaArray = drv.gpuarray_to_array(A_gpu2, orden, allowSurfaceBind=True) + A_cpu = A_gpu.get() # To remember original array mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(0)) # Write random array - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(1)) # Read, but transposed - assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call( + cuGrid, cuBlock, A_gpu.gpudata, np.int32(0) + ) # Write random array + copy_texture.prepared_call( + cuGrid, cuBlock, A_gpu.gpudata, np.int32(1) + ) # Read, but transposed + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test @@ -514,17 +539,21 @@ class TestDriver: orden = "C" npoints = 32 - for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: + for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) - if prec == np.complex64: fpName_str = 'fp_tex_cfloat' - elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' - elif prec == np.float64: fpName_str = 'fp_tex_double' - else: fpName_str = prec_str - A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) - A_cpu[:] = np.random.rand(npoints,npoints)[:] - A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized - - myKernRW = ''' + if prec == np.complex64: + fpName_str = "fp_tex_cfloat" + elif prec == np.complex128: + fpName_str = "fp_tex_cdouble" + elif prec == np.float64: + fpName_str = "fp_tex_double" + else: + fpName_str = prec_str + A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec) + A_cpu[:] = np.random.rand(npoints, npoints)[:] + A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized + + myKernRW = """ #include surface mtx_tex; @@ -544,31 +573,42 @@ class TestDriver: dest[tid] = aux; } } - ''' - myKernRW = myKernRW.replace('fpName',fpName_str) - myKernRW = myKernRW.replace('cuPres',prec_str) + """ + myKernRW = myKernRW.replace("fpName", fpName_str) + myKernRW = myKernRW.replace("cuPres", prec_str) modW = SourceModule(myKernRW) copy_texture = modW.get_function("copy_texture") mtx_tex = modW.get_surfref("mtx_tex") - cuBlock = (8,8,1) - if cuBlock[0]>npoints: - cuBlock = (npoints,npoints,1) - cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) - copy_texture.prepare('Pi')#,texrefs=[mtx_tex]) - A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros - cudaArray = drv.gpuarray_to_array(A_gpu2,orden,allowSurfaceBind=True) - A_cpu = A_gpu.get() # To remember original array + cuBlock = (8, 8, 1) + if cuBlock[0] > npoints: + cuBlock = (npoints, npoints, 1) + cuGrid = ( + npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), + npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), + 1, + ) + copy_texture.prepare("Pi") # ,texrefs=[mtx_tex]) + A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros + cudaArray = drv.gpuarray_to_array(A_gpu2, orden, allowSurfaceBind=True) + A_cpu = A_gpu.get() # To remember original array mtx_tex.set_array(cudaArray) - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(0)) # Write random array - copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(1)) # Read, but transposed - assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) + copy_texture.prepared_call( + cuGrid, cuBlock, A_gpu.gpudata, np.int32(0) + ) # Write random array + copy_texture.prepared_call( + cuGrid, cuBlock, A_gpu.gpudata, np.int32(1) + ) # Read, but transposed + assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( + 0, dtype=prec + ) A_gpu.gpudata.free() @mark_cuda_test def test_large_smem(self): n = 4000 - mod = SourceModule(""" + mod = SourceModule( + """ #include __global__ void kernel(int *d_data) @@ -577,36 +617,47 @@ class TestDriver: sdata[threadIdx.x] = threadIdx.x; d_data[threadIdx.x] = sdata[threadIdx.x]; } - """ % n) + """ + % n + ) kernel = mod.get_function("kernel") import pycuda.gpuarray as gpuarray + arg = gpuarray.zeros((n,), dtype=np.float32) - kernel(arg, block=(1,1,1,), ) + kernel( + arg, + block=( + 1, + 1, + 1, + ), + ) @mark_cuda_test def test_bitlog(self): from pycuda.tools import bitlog2 + assert bitlog2(17) == 4 - assert bitlog2(0xaffe) == 15 - assert bitlog2(0x3affe) == 17 - assert bitlog2(0xcc3affe) == 27 + assert bitlog2(0xAFFE) == 15 + assert bitlog2(0x3AFFE) == 17 + assert bitlog2(0xCC3AFFE) == 27 @mark_cuda_test def test_mempool_2(self): - from pycuda.tools import DeviceMemoryPool as DMP + from pycuda.tools import DeviceMemoryPool from random import randrange for i in range(2000): - s = randrange(1<<31) >> randrange(32) - bin_nr = DMP.bin_number(s) - asize = DMP.alloc_size(bin_nr) + s = randrange(1 << 31) >> randrange(32) + bin_nr = DeviceMemoryPool.bin_number(s) + asize = DeviceMemoryPool.alloc_size(bin_nr) assert asize >= s, s - assert DMP.bin_number(asize) == bin_nr, s - assert asize < asize*(1+1/8) + assert DeviceMemoryPool.bin_number(asize) == bin_nr, s + assert asize < asize * (1 + 1 / 8) @mark_cuda_test def test_mempool(self): @@ -614,13 +665,12 @@ class TestDriver: from pycuda.tools import DeviceMemoryPool pool = DeviceMemoryPool() - maxlen = 10 queue = [] free, total = drv.mem_get_info() e0 = bitlog2(free) - for e in range(e0-6, e0-4): + for e in range(e0 - 6, e0 - 4): for i in range(100): queue.append(pool.allocate(1 << e)) if len(queue) > 10: @@ -669,9 +719,7 @@ class TestDriver: d = 8 shape = (w, h, d) - a = np.asarray( - np.random.randn(*shape), - dtype=np.float32, order="F") + a = np.asarray(np.random.randn(*shape), dtype=np.float32, order="F") descr = drv.ArrayDescriptor3D() descr.width = w @@ -692,7 +740,8 @@ class TestDriver: copy() - mod = SourceModule(""" + mod = SourceModule( + """ texture mtx_tex; __global__ void copy_texture(float *dest) @@ -706,7 +755,8 @@ class TestDriver: dest[i] = tex3D(mtx_tex, x, y, z); //dest[i] = x; } - """) + """ + ) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") @@ -715,47 +765,49 @@ class TestDriver: dest = np.zeros(shape, dtype=np.float32, order="F") copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex]) - assert la.norm(dest-a) == 0 + assert la.norm(dest - a) == 0 @mark_cuda_test def test_prepared_invocation(self): - a = np.random.randn(4,4).astype(np.float32) + a = np.random.randn(4, 4).astype(np.float32) a_gpu = drv.mem_alloc(a.size * a.dtype.itemsize) drv.memcpy_htod(a_gpu, a) - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y*blockDim.x; a[idx] *= 2; } - """) + """ + ) func = mod.get_function("doublify") func.prepare("P") - func.prepared_call((1, 1), (4,4,1), a_gpu, shared_size=20) + func.prepared_call((1, 1), (4, 4, 1), a_gpu, shared_size=20) a_doubled = np.empty_like(a) drv.memcpy_dtoh(a_doubled, a_gpu) - print (a) - print (a_doubled) - assert la.norm(a_doubled-2*a) == 0 + print(a) + print(a_doubled) + assert la.norm(a_doubled - 2 * a) == 0 # now with offsets func.prepare("P") a_quadrupled = np.empty_like(a) - func.prepared_call((1, 1), (15,1,1), int(a_gpu)+a.dtype.itemsize) + func.prepared_call((1, 1), (15, 1, 1), int(a_gpu) + a.dtype.itemsize) drv.memcpy_dtoh(a_quadrupled, a_gpu) - assert la.norm(a_quadrupled[1:]-4*a[1:]) == 0 + assert la.norm(a_quadrupled[1:] - 4 * a[1:]) == 0 @mark_cuda_test def test_prepared_with_vector(self): - cuda_source = r''' + cuda_source = r""" __global__ void cuda_function(float3 input) { float3 result = make_float3(input.x, input.y, input.z); } - ''' + """ mod = SourceModule(cuda_source, cache_dir=False, keep=False) @@ -763,8 +815,9 @@ class TestDriver: arg_types = [gpuarray.vec.float3] kernel.prepare(arg_types) - kernel.prepared_call((1, 1, 1), (1, 1, 1), - gpuarray.vec.make_float3(0.0, 1.0, 2.0)) + kernel.prepared_call( + (1, 1, 1), (1, 1, 1), gpuarray.vec.make_float3(0.0, 1.0, 2.0) + ) @mark_cuda_test def test_fp_textures(self): @@ -772,10 +825,9 @@ class TestDriver: return for tp in [np.float32, np.float64]: - from pycuda.tools import dtype_to_ctype - tp_cstr = dtype_to_ctype(tp) - mod = SourceModule(""" + mod = SourceModule( + """ #include texture my_tex; @@ -785,49 +837,55 @@ class TestDriver: int i = threadIdx.x; dest[i] = fp_tex1Dfetch(my_tex, i); } - """ % {"tp": tp_cstr}) + """ + % {"tp": tp_cstr} + ) copy_texture = mod.get_function("copy_texture") my_tex = mod.get_texref("my_tex") - import pycuda.gpuarray as gpuarray - shape = (384,) a = np.random.randn(*shape).astype(tp) a_gpu = gpuarray.to_gpu(a) a_gpu.bind_to_texref_ext(my_tex, allow_double_hack=True) dest = np.zeros(shape, dtype=tp) - copy_texture(drv.Out(dest), - block=shape+(1,1,), - texrefs=[my_tex]) + copy_texture( + drv.Out(dest), + block=shape + + ( + 1, + 1, + ), + texrefs=[my_tex], + ) - assert la.norm(dest-a) == 0 + assert la.norm(dest - a) == 0 @mark_cuda_test def test_constant_memory(self): # contributed by Andrew Wagner - module = SourceModule(""" + module = SourceModule( + """ __constant__ float const_array[32]; __global__ void copy_constant_into_global(float* global_result_array) { global_result_array[threadIdx.x] = const_array[threadIdx.x]; } - """) + """ + ) copy_constant_into_global = module.get_function("copy_constant_into_global") - const_array, _ = module.get_global('const_array') + const_array, _ = module.get_global("const_array") - host_array = np.random.randint(0,255,(32,)).astype(np.float32) + host_array = np.random.randint(0, 255, (32,)).astype(np.float32) global_result_array = drv.mem_alloc_like(host_array) drv.memcpy_htod(const_array, host_array) - copy_constant_into_global( - global_result_array, - grid=(1, 1), block=(32, 1, 1)) + copy_constant_into_global(global_result_array, grid=(1, 1), block=(32, 1, 1)) host_result_array = np.zeros_like(host_array) drv.memcpy_dtoh(host_result_array, global_result_array) @@ -838,14 +896,17 @@ class TestDriver: def test_register_host_memory(self): if drv.get_version() < (4,): from py.test import skip + skip("register_host_memory only exists on CUDA 4.0 and later") import sys + if sys.platform == "darwin": from py.test import skip + skip("register_host_memory is not supported on OS X") - a = drv.aligned_empty((2**20,), np.float64) + a = drv.aligned_empty((2 ** 20,), np.float64) a_pin = drv.register_host_memory(a) gpu_ary = drv.mem_alloc_like(a) @@ -860,6 +921,7 @@ class TestDriver: if drv.Context.get_device().compute_capability() < (3, 5): from pytest import skip + skip("need compute capability 3.5 or higher for dynamic parallelism") cuda_string = """ @@ -909,11 +971,20 @@ class TestDriver: drv.memcpy_htod(b_gpu, b) from pycuda.compiler import DynamicSourceModule + mod = DynamicSourceModule(cuda_string, keep=True) func = mod.get_function("math") - func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu, - block=(100, 1, 1), grid=(1, 1, 1)) + func( + a_gpu, + b_gpu, + c_gpu, + d_gpu, + e_gpu, + f_gpu, + block=(100, 1, 1), + grid=(1, 1, 1), + ) drv.memcpy_dtoh(c, c_gpu) drv.memcpy_dtoh(d, d_gpu) @@ -927,41 +998,46 @@ class TestDriver: c = np.empty_like(a) d = np.empty_like(a) e = np.empty_like(a) - f = np.array(a, dtype='d') + f = np.array(a, dtype="d") math(a, b, c, d, e, f) @mark_cuda_test def test_jit_link_module(self): from pycuda.compiler import DEFAULT_NVCC_FLAGS + if drv.Context.get_device().compute_capability() < (3, 5): from pytest import skip + skip("need compute capability 3.5 or higher for dynamic parallelism") - test_outer_cu = '''#include + test_outer_cu = """#include __global__ void test_kernel() { extern __global__ void test_kernel_inner(); printf("Hello outer world!\\n"); test_kernel_inner<<<2, 1>>>(); - }''' + }""" - test_inner_cu = '''#include + test_inner_cu = """#include __global__ void test_kernel_inner() { printf(" Hello inner world!\\n"); - }''' + }""" from pycuda.compiler import DynamicModule + mod = DynamicModule() mod.add_source( - test_outer_cu, nvcc_options=( - ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)) + test_outer_cu, + nvcc_options=(["-rdc=true", "-lcudadevrt"] + DEFAULT_NVCC_FLAGS), + ) mod.add_source( - test_inner_cu, nvcc_options=( - ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)) - mod.add_stdlib('cudadevrt') + test_inner_cu, + nvcc_options=(["-rdc=true", "-lcudadevrt"] + DEFAULT_NVCC_FLAGS), + ) + mod.add_stdlib("cudadevrt") mod.link() - test_kernel = mod.get_function('test_kernel') + test_kernel = mod.get_function("test_kernel") test_kernel(grid=(2, 1), block=(1, 1, 1)) @@ -978,8 +1054,10 @@ if __name__ == "__main__": import pycuda.autoinit # noqa import sys + if len(sys.argv) > 1: exec(sys.argv[1]) else: from pytest import main + main([__file__]) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index b8f2d43e..7fbc08da 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -9,31 +9,21 @@ from pycuda.characterize import has_double_support from six.moves import range -def have_pycuda(): - try: - import pycuda # noqa - return True - except: - return False - -if have_pycuda(): - import pycuda.gpuarray as gpuarray - import pycuda.driver as drv - from pycuda.compiler import SourceModule +import pycuda.gpuarray as gpuarray +import pycuda.driver as drv +from pycuda.compiler import SourceModule class TestGPUArray: - disabled = not have_pycuda() - @mark_cuda_test def test_pow_array(self): a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) result = pow(a_gpu, a_gpu).get() - assert (np.abs(a**a - result) < 1e-3).all() + assert (np.abs(a ** a - result) < 1e-3).all() - result = (a_gpu**a_gpu).get() + result = (a_gpu ** a_gpu).get() assert (np.abs(pow(a, a) - result) < 1e-3).all() a_gpu **= a_gpu @@ -46,11 +36,11 @@ class TestGPUArray: a_gpu = gpuarray.to_gpu(a) result = pow(a_gpu, 2).get() - assert (np.abs(a**2 - result) < 1e-3).all() + assert (np.abs(a ** 2 - result) < 1e-3).all() a_gpu **= 2 a_gpu = a_gpu.get() - assert (np.abs(a**2 - a_gpu) < 1e-3).all() + assert (np.abs(a ** 2 - a_gpu) < 1e-3).all() @mark_cuda_test def test_numpy_integer_shape(self): @@ -90,10 +80,7 @@ class TestGPUArray: """Test the muliplication of an array with a scalar. """ for sz in [10, 50000]: - for dtype, scalars in [ - (np.float32, [2]), - (np.complex64, [2, 2j]) - ]: + for dtype, scalars in [(np.float32, [2]), (np.complex64, [2, 2j])]: for scalar in scalars: a = np.arange(sz).astype(dtype) a_gpu = gpuarray.to_gpu(a) @@ -106,10 +93,10 @@ class TestGPUArray: a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) - two_a = 2*a_gpu + two_a = 2 * a_gpu assert isinstance(two_a, gpuarray.GPUArray) - two_a = np.float32(2)*a_gpu + two_a = np.float32(2) * a_gpu assert isinstance(two_a, gpuarray.GPUArray) @mark_cuda_test @@ -121,9 +108,9 @@ class TestGPUArray: a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(a) - a_squared = (b_gpu*a_gpu).get() + a_squared = (b_gpu * a_gpu).get() - assert (a*a == a_squared).all() + assert (a * a == a_squared).all() @mark_cuda_test def test_addition_array(self): @@ -131,9 +118,9 @@ class TestGPUArray: a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) - a_added = (a_gpu+a_gpu).get() + a_added = (a_gpu + a_gpu).get() - assert (a+a == a_added).all() + assert (a + a == a_added).all() @mark_cuda_test def test_iaddition_array(self): @@ -144,7 +131,7 @@ class TestGPUArray: a_gpu += a_gpu a_added = a_gpu.get() - assert (a+a == a_added).all() + assert (a + a == a_added).all() @mark_cuda_test def test_addition_scalar(self): @@ -152,9 +139,9 @@ class TestGPUArray: a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) - a_added = (7+a_gpu).get() + a_added = (7 + a_gpu).get() - assert (7+a == a_added).all() + assert (7 + a == a_added).all() @mark_cuda_test def test_iaddition_scalar(self): @@ -165,39 +152,39 @@ class TestGPUArray: a_gpu += 7 a_added = a_gpu.get() - assert (7+a == a_added).all() + assert (7 + a == a_added).all() @mark_cuda_test def test_substract_array(self): """Test the substraction of two arrays.""" - #test data + # test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) b = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) - result = (a_gpu-b_gpu).get() - assert (a-b == result).all() + result = (a_gpu - b_gpu).get() + assert (a - b == result).all() - result = (b_gpu-a_gpu).get() - assert (b-a == result).all() + result = (b_gpu - a_gpu).get() + assert (b - a == result).all() @mark_cuda_test def test_substract_scalar(self): """Test the substraction of an array and a scalar.""" - #test data + # test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) - #convert a to a gpu object + # convert a to a gpu object a_gpu = gpuarray.to_gpu(a) - result = (a_gpu-7).get() - assert (a-7 == result).all() + result = (a_gpu - 7).get() + assert (a - 7 == result).all() - result = (7-a_gpu).get() - assert (7-a == result).all() + result = (7 - a_gpu).get() + assert (7 - a == result).all() @mark_cuda_test def test_divide_scalar(self): @@ -206,28 +193,28 @@ class TestGPUArray: a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) - result = (a_gpu/2).get() - assert (a/2 == result).all() + result = (a_gpu / 2).get() + assert (a / 2 == result).all() - result = (2/a_gpu).get() - assert (2/a == result).all() + result = (2 / a_gpu).get() + assert (2 / a == result).all() @mark_cuda_test def test_divide_array(self): """Test the division of an array and a scalar. """ - #test data + # test data a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) - a_divide = (a_gpu/b_gpu).get() - assert (np.abs(a/b - a_divide) < 1e-3).all() + a_divide = (a_gpu / b_gpu).get() + assert (np.abs(a / b - a_divide) < 1e-3).all() - a_divide = (b_gpu/a_gpu).get() - assert (np.abs(b/a - a_divide) < 1e-3).all() + a_divide = (b_gpu / a_gpu).get() + assert (np.abs(b / a - a_divide) < 1e-3).all() @mark_cuda_test def test_random(self): @@ -247,29 +234,39 @@ class TestGPUArray: @mark_cuda_test def test_curand_wrappers(self): from pycuda.curandom import get_curand_version + if get_curand_version() is None: from pytest import skip + skip("curand not installed") generator_types = [] if get_curand_version() >= (3, 2, 0): from pycuda.curandom import ( - XORWOWRandomNumberGenerator, - Sobol32RandomNumberGenerator) - generator_types.extend([ - XORWOWRandomNumberGenerator, - Sobol32RandomNumberGenerator]) + XORWOWRandomNumberGenerator, + Sobol32RandomNumberGenerator, + ) + + generator_types.extend( + [XORWOWRandomNumberGenerator, Sobol32RandomNumberGenerator] + ) if get_curand_version() >= (4, 0, 0): from pycuda.curandom import ( + ScrambledSobol32RandomNumberGenerator, + Sobol64RandomNumberGenerator, + ScrambledSobol64RandomNumberGenerator, + ) + + generator_types.extend( + [ ScrambledSobol32RandomNumberGenerator, Sobol64RandomNumberGenerator, - ScrambledSobol64RandomNumberGenerator) - generator_types.extend([ - ScrambledSobol32RandomNumberGenerator, - Sobol64RandomNumberGenerator, - ScrambledSobol64RandomNumberGenerator]) + ScrambledSobol64RandomNumberGenerator, + ] + ) if get_curand_version() >= (4, 1, 0): from pycuda.curandom import MRG32k3aRandomNumberGenerator + generator_types.extend([MRG32k3aRandomNumberGenerator]) if has_double_support(): @@ -303,7 +300,7 @@ class TestGPUArray: v = 10 a.fill(v) gen.fill_poisson(a) - tmp = (a.get() == (v-1)).sum() / a.size + tmp = (a.get() == (v - 1)).sum() / a.size # noqa: F841 # Commented out for CI on the off chance it'd fail # # Check Poisson statistics (need 1e6 values) # # Compare with scipy.stats.poisson.pmf(v - 1, v) @@ -394,11 +391,12 @@ class TestGPUArray: def make_nan_contaminated_vector(size): shape = (size,) a = np.random.randn(*shape).astype(np.float32) - #for i in range(0, shape[0], 3): - #a[i] = float('nan') + # for i in range(0, shape[0], 3): + # a[i] = float('nan') from random import randrange - for i in range(size//10): - a[randrange(0, size)] = float('nan') + + for i in range(size // 10): + a[randrange(0, size)] = float("nan") return a size = 1 << 20 @@ -408,8 +406,8 @@ class TestGPUArray: b = make_nan_contaminated_vector(size) b_gpu = gpuarray.to_gpu(b) - ab = a*b - ab_gpu = (a_gpu*b_gpu).get() + ab = a * b + ab_gpu = (a_gpu * b_gpu).get() assert (np.isnan(ab) == np.isnan(ab_gpu)).all() @@ -421,30 +419,32 @@ class TestGPUArray: b_gpu = curand((50,)) from pycuda.elementwise import ElementwiseKernel + lin_comb = ElementwiseKernel( - "float a, float *x, float b, float *y, float *z", - "z[i] = a*x[i] + b*y[i]", - "linear_combination") + "float a, float *x, float b, float *y, float *z", + "z[i] = a*x[i] + b*y[i]", + "linear_combination", + ) c_gpu = gpuarray.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) - assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 + assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5 @mark_cuda_test def test_ranged_elwise_kernel(self): from pycuda.elementwise import ElementwiseKernel - set_to_seven = ElementwiseKernel( - "float *z", - "z[i] = 7", - "set_to_seven") - for i, slc in enumerate([ + set_to_seven = ElementwiseKernel("float *z", "z[i] = 7", "set_to_seven") + + for i, slc in enumerate( + [ slice(5, 20000), slice(5, 20000, 17), slice(3000, 5, -1), slice(1000, -1), - ]): + ] + ): a_gpu = gpuarray.zeros((50000,), dtype=np.float32) a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype) @@ -480,11 +480,12 @@ class TestGPUArray: b = a_cpu.get() for i in range(0, 10): - assert a[len(a)-1-i] == b[i] + assert a[len(a) - 1 - i] == b[i] @mark_cuda_test def test_sum(self): from pycuda.curandom import rand as curand + a_gpu = curand((200000,)) a = a_gpu.get() @@ -492,7 +493,7 @@ class TestGPUArray: sum_a_gpu = gpuarray.sum(a_gpu).get() - assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4 + assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4 @mark_cuda_test def test_minmax(self): @@ -550,37 +551,55 @@ class TestGPUArray: @mark_cuda_test def test_dot(self): from pycuda.curandom import rand as curand - for l in [2, 3, 4, 5, 6, 7, 31, 32, 33, 127, 128, 129, - 255, 256, 257, 16384 - 993, - 20000]: - a_gpu = curand((l,)) + + for sz in [ + 2, + 3, + 4, + 5, + 6, + 7, + 31, + 32, + 33, + 127, + 128, + 129, + 255, + 256, + 257, + 16384 - 993, + 20000, + ]: + a_gpu = curand((sz,)) a = a_gpu.get() - b_gpu = curand((l,)) + b_gpu = curand((sz,)) b = b_gpu.get() dot_ab = np.dot(a, b) dot_ab_gpu = gpuarray.dot(a_gpu, b_gpu).get() - assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4 + assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4 @mark_cuda_test def test_slice(self): from pycuda.curandom import rand as curand - l = 20000 - a_gpu = curand((l,)) + sz = 20000 + a_gpu = curand((sz,)) a = a_gpu.get() from random import randrange + for i in range(200): - start = randrange(l) - end = randrange(start, l) + start = randrange(sz) + end = randrange(start, sz) a_gpu_slice = a_gpu[start:end] a_slice = a[start:end] - assert la.norm(a_gpu_slice.get()-a_slice) == 0 + assert la.norm(a_gpu_slice.get() - a_slice) == 0 @mark_cuda_test def test_2d_slice_c(self): @@ -592,6 +611,7 @@ class TestGPUArray: a = a_gpu.get() from random import randrange + for i in range(200): start = randrange(n) end = randrange(start, n) @@ -599,7 +619,7 @@ class TestGPUArray: a_gpu_slice = a_gpu[start:end] a_slice = a[start:end] - assert la.norm(a_gpu_slice.get()-a_slice) == 0 + assert la.norm(a_gpu_slice.get() - a_slice) == 0 @mark_cuda_test def test_2d_slice_f(self): @@ -609,12 +629,13 @@ class TestGPUArray: n = 1000 m = 300 a_gpu = curand((n, m)) - a_gpu_f = gpuarray.GPUArray((m, n), np.float32, - gpudata=a_gpu.gpudata, - order="F") + a_gpu_f = gpuarray.GPUArray( + (m, n), np.float32, gpudata=a_gpu.gpudata, order="F" + ) a = a_gpu_f.get() from random import randrange + for i in range(200): start = randrange(n) end = randrange(start, n) @@ -622,15 +643,15 @@ class TestGPUArray: a_gpu_slice = a_gpu_f[:, start:end] a_slice = a[:, start:end] - assert la.norm(a_gpu_slice.get()-a_slice) == 0 + assert la.norm(a_gpu_slice.get() - a_slice) == 0 @mark_cuda_test def test_if_positive(self): from pycuda.curandom import rand as curand - l = 20 - a_gpu = curand((l,)) - b_gpu = curand((l,)) + sz = 20 + a_gpu = curand((sz,)) + b_gpu = curand((sz,)) a = a_gpu.get() b = b_gpu.get() @@ -639,7 +660,7 @@ class TestGPUArray: max_a_b_gpu = gpuarray.maximum(a_gpu, b_gpu) min_a_b_gpu = gpuarray.minimum(a_gpu, b_gpu) - print (max_a_b_gpu) + print(max_a_b_gpu) print((np.maximum(a, b))) assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0 @@ -649,18 +670,21 @@ class TestGPUArray: def test_take_put(self): for n in [5, 17, 333]: one_field_size = 8 - buf_gpu = gpuarray.zeros(n*one_field_size, dtype=np.float32) - dest_indices = gpuarray.to_gpu(np.array( - [0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32)) + buf_gpu = gpuarray.zeros(n * one_field_size, dtype=np.float32) + dest_indices = gpuarray.to_gpu( + np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32) + ) read_map = gpuarray.to_gpu( - np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32)) + np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32) + ) gpuarray.multi_take_put( - arrays=[buf_gpu for i in range(n)], - dest_indices=dest_indices, - src_indices=read_map, - src_offsets=[i*one_field_size for i in range(n)], - dest_shape=(96,)) + arrays=[buf_gpu for i in range(n)], + dest_indices=dest_indices, + src_indices=read_map, + src_offsets=[i * one_field_size for i in range(n)], + dest_shape=(96,), + ) drv.Context.synchronize() @@ -685,7 +709,7 @@ class TestGPUArray: a2 = a_gpu.astype(np.float32).get() assert a2.dtype == np.float32 - assert la.norm(a - a2)/la.norm(a) < 1e-7 + assert la.norm(a - a2) / la.norm(a) < 1e-7 @mark_cuda_test def test_complex_bits(self): @@ -700,10 +724,12 @@ class TestGPUArray: for tp in dtypes: dtype = np.dtype(tp) from pytools import match_precision + real_dtype = match_precision(np.dtype(np.float64), dtype) - z = (curand((n,), real_dtype).astype(dtype) - + 1j*curand((n,), real_dtype).astype(dtype)) + z = curand((n,), real_dtype).astype(dtype) + 1j * curand( + (n,), real_dtype + ).astype(dtype) assert la.norm(z.get().real - z.real.get()) == 0 assert la.norm(z.get().imag - z.imag.get()) == 0 @@ -712,57 +738,61 @@ class TestGPUArray: # verify contiguity is preserved for order in ["C", "F"]: # test both zero and non-zero value code paths - z_real = gpuarray.zeros(z.shape, dtype=real_dtype, - order=order) + z_real = gpuarray.zeros(z.shape, dtype=real_dtype, order=order) z2 = z.reshape(z.shape, order=order) for zdata in [z_real, z2]: if order == "C": - assert zdata.flags.c_contiguous == True - assert zdata.real.flags.c_contiguous == True - assert zdata.imag.flags.c_contiguous == True - assert zdata.conj().flags.c_contiguous == True + assert zdata.flags.c_contiguous + assert zdata.real.flags.c_contiguous + assert zdata.imag.flags.c_contiguous + assert zdata.conj().flags.c_contiguous elif order == "F": - assert zdata.flags.f_contiguous == True - assert zdata.real.flags.f_contiguous == True - assert zdata.imag.flags.f_contiguous == True - assert zdata.conj().flags.f_contiguous == True - + assert zdata.flags.f_contiguous + assert zdata.real.flags.f_contiguous + assert zdata.imag.flags.f_contiguous + assert zdata.conj().flags.f_contiguous @mark_cuda_test def test_pass_slice_to_kernel(self): - mod = SourceModule(""" + mod = SourceModule( + """ __global__ void twice(float *a) { const int i = threadIdx.x + blockIdx.x * blockDim.x; a[i] *= 2; } - """) + """ + ) multiply_them = mod.get_function("twice") - a = np.ones(256**2, np.float32) + a = np.ones(256 ** 2, np.float32) a_gpu = gpuarray.to_gpu(a) multiply_them(a_gpu[256:-256], block=(256, 1, 1), grid=(254, 1)) a = a_gpu.get() assert (a[255:257] == np.array([1, 2], np.float32)).all() - assert (a[255*256-1:255*256+1] == np.array([2, 1], np.float32)).all() + assert (a[255 * 256 - 1: 255 * 256 + 1] == np.array([2, 1], np.float32)).all() @mark_cuda_test def test_scan(self): from pycuda.scan import ExclusiveScanKernel, InclusiveScanKernel + for cls in [ExclusiveScanKernel, InclusiveScanKernel]: scan_kern = cls(np.int32, "a+b", "0") for n in [ - 10, 2**10-5, 2**10, - 2**20-2**18, - 2**20-2**18+5, - 2**10+5, - 2**20+5, - 2**20, 2**24 - ]: + 10, + 2 ** 10 - 5, + 2 ** 10, + 2 ** 20 - 2 ** 18, + 2 ** 20 - 2 ** 18 + 5, + 2 ** 10 + 5, + 2 ** 20 + 5, + 2 ** 20, + 2 ** 24, + ]: host_data = np.random.randint(0, 10, n).astype(np.int32) gpu_data = gpuarray.to_gpu(host_data) @@ -817,11 +847,11 @@ class TestGPUArray: assert throws_exception # with order specified - a_gpu = a_gpu.reshape((4, 32), order='C') + a_gpu = a_gpu.reshape((4, 32), order="C") assert a_gpu.flags.c_contiguous - a_gpu = a_gpu.reshape(4, 32, order='F') + a_gpu = a_gpu.reshape(4, 32, order="F") assert a_gpu.flags.f_contiguous - a_gpu = a_gpu.reshape((4, 32), order='F') + a_gpu = a_gpu.reshape((4, 32), order="F") assert a_gpu.flags.f_contiguous # default is C-contiguous a_gpu = a_gpu.reshape((4, 32)) @@ -851,26 +881,26 @@ class TestGPUArray: a_gpu = gpuarray.to_gpu(a_cpu) # Slice with length 1 on dimensions 0 and 1 - a_gpu_slice = a_gpu[0:1,1:2,:,:] - assert a_gpu_slice.shape == (1,1,shape[2],shape[3]) + a_gpu_slice = a_gpu[0:1, 1:2, :, :] + assert a_gpu_slice.shape == (1, 1, shape[2], shape[3]) assert a_gpu_slice.flags.c_contiguous # Squeeze it and obtain contiguity - a_gpu_squeezed_slice = a_gpu[0:1,1:2,:,:].squeeze() - assert a_gpu_squeezed_slice.shape == (shape[2],shape[3]) + a_gpu_squeezed_slice = a_gpu[0:1, 1:2, :, :].squeeze() + assert a_gpu_squeezed_slice.shape == (shape[2], shape[3]) assert a_gpu_squeezed_slice.flags.c_contiguous # Check that we get the original values out assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel()) # Slice with length 1 on dimensions 2 - a_gpu_slice = a_gpu[:,:,2:3,:] - assert a_gpu_slice.shape == (shape[0],shape[1],1,shape[3]) + a_gpu_slice = a_gpu[:, :, 2:3, :] + assert a_gpu_slice.shape == (shape[0], shape[1], 1, shape[3]) assert not a_gpu_slice.flags.c_contiguous # Squeeze it, but no contiguity here - a_gpu_squeezed_slice = a_gpu[:,:,2:3,:].squeeze() - assert a_gpu_squeezed_slice.shape == (shape[0],shape[1],shape[3]) + a_gpu_squeezed_slice = a_gpu[:, :, 2:3, :].squeeze() + assert a_gpu_squeezed_slice.shape == (shape[0], shape[1], shape[3]) assert not a_gpu_squeezed_slice.flags.c_contiguous # Check that we get the original values out @@ -921,22 +951,29 @@ class TestGPUArray: mmc_dtype = np.dtype([("cur_min", np.float32), ("cur_max", np.float32)]) from pycuda.curandom import rand as curand + a_gpu = curand((20000,), dtype=np.float32) a = a_gpu.get() from pycuda.tools import register_dtype + register_dtype(mmc_dtype, "minmax_collector") from pycuda.reduction import ReductionKernel - red = ReductionKernel(mmc_dtype, - neutral="minmax_collector(10000, -10000)", - # FIXME: needs infinity literal in real use, ok here - reduce_expr="agg_mmc(a, b)", map_expr="minmax_collector(x[i], x[i])", - arguments="float *x", preamble=preamble) + + red = ReductionKernel( + mmc_dtype, + neutral="minmax_collector(10000, -10000)", + # FIXME: needs infinity literal in real use, ok here + reduce_expr="agg_mmc(a, b)", + map_expr="minmax_collector(x[i], x[i])", + arguments="float *x", + preamble=preamble, + ) minmax = red(a_gpu).get() - #print minmax["cur_min"], minmax["cur_max"] - #print np.min(a), np.max(a) + # print minmax["cur_min"], minmax["cur_max"] + # print np.min(a), np.max(a) assert minmax["cur_min"] == np.min(a) assert minmax["cur_max"] == np.max(a) @@ -944,13 +981,15 @@ class TestGPUArray: @mark_cuda_test def test_reduce_out(self): from pycuda.curandom import rand as curand + a_gpu = curand((10, 200), dtype=np.float32) a = a_gpu.get() from pycuda.reduction import ReductionKernel - red = ReductionKernel(np.float32, neutral=0, - reduce_expr="max(a,b)", - arguments="float *in") + + red = ReductionKernel( + np.float32, neutral=0, reduce_expr="max(a,b)", arguments="float *in" + ) max_gpu = gpuarray.empty(10, dtype=np.float32) for i in range(10): red(a_gpu[i], out=max_gpu[i]) @@ -961,22 +1000,24 @@ class TestGPUArray: def test_sum_allocator(self): # FIXME from pytest import skip + skip("https://github.com/inducer/pycuda/issues/163") # crashes with terminate called after throwing an instance of 'pycuda::error' # what(): explicit_context_dependent failed: invalid device context - no currently active context? import pycuda.tools + pool = pycuda.tools.DeviceMemoryPool() - rng = np.random.randint(low=512,high=1024) + rng = np.random.randint(low=512, high=1024) - a = gpuarray.arange(rng,dtype=np.int32) + a = gpuarray.arange(rng, dtype=np.int32) b = gpuarray.sum(a) c = gpuarray.sum(a, allocator=pool.allocate) # Test that we get the correct results - assert b.get() == rng*(rng-1)//2 - assert c.get() == rng*(rng-1)//2 + assert b.get() == rng * (rng - 1) // 2 + assert c.get() == rng * (rng - 1) // 2 # Test that result arrays were allocated with the appropriate allocator assert b.allocator == a.allocator @@ -986,13 +1027,15 @@ class TestGPUArray: def test_dot_allocator(self): # FIXME from pytest import skip + skip("https://github.com/inducer/pycuda/issues/163") import pycuda.tools + pool = pycuda.tools.DeviceMemoryPool() - a_cpu = np.random.randint(low=512,high=1024,size=1024) - b_cpu = np.random.randint(low=512,high=1024,size=1024) + a_cpu = np.random.randint(low=512, high=1024, size=1024) + b_cpu = np.random.randint(low=512, high=1024, size=1024) # Compute the result on the CPU dot_cpu_1 = np.dot(a_cpu, b_cpu) @@ -1012,7 +1055,6 @@ class TestGPUArray: assert dot_gpu_1.allocator == a_gpu.allocator assert dot_gpu_2.allocator == pool.allocate - @mark_cuda_test def test_view_and_strides(self): from pycuda.curandom import rand as curand @@ -1051,8 +1093,8 @@ class TestGPUArray: def test_minimum_maximum_scalar(self): from pycuda.curandom import rand as curand - l = 20 - a_gpu = curand((l,)) + sz = 20 + a_gpu = curand((sz,)) a = a_gpu.get() import pycuda.gpuarray as gpuarray @@ -1065,25 +1107,23 @@ class TestGPUArray: @mark_cuda_test def test_transpose(self): - import pycuda.gpuarray as gpuarray from pycuda.curandom import rand as curand - a_gpu = curand((10,20,30)) + a_gpu = curand((10, 20, 30)) a = a_gpu.get() - #assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0))) # not contiguous + # assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0))) # not contiguous assert np.allclose(a_gpu.T.get(), a.T) @mark_cuda_test def test_newaxis(self): - import pycuda.gpuarray as gpuarray from pycuda.curandom import rand as curand - a_gpu = curand((10,20,30)) + a_gpu = curand((10, 20, 30)) a = a_gpu.get() - b_gpu = a_gpu[:,np.newaxis] - b = a[:,np.newaxis] + b_gpu = a_gpu[:, np.newaxis] + b = a[:, np.newaxis] assert b_gpu.shape == b.shape assert b_gpu.strides == b.strides @@ -1091,43 +1131,55 @@ class TestGPUArray: @mark_cuda_test def test_copy(self): from pycuda.curandom import rand as curand - a_gpu = curand((3,3)) - - for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step]) - - a_gpu = curand((3,1)) - for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step]) - a_gpu = curand((3,3,3)) - for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step,start:stop:step].get(), a_gpu.get()[start:stop:step,start:stop:step]) - - a_gpu = curand((3,3,3)).transpose((1,2,0)) - a = a_gpu.get() - for start, stop, step in [(0,3,1), (1,2,1), (0,3,2), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step,:,start:stop:step].get(), a_gpu.get()[start:stop:step,:,start:stop:step]) + a_gpu = curand((3, 3)) + + for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step] + ) + + a_gpu = curand((3, 1)) + for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step].get(), a_gpu.get()[start:stop:step] + ) + + a_gpu = curand((3, 3, 3)) + for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step, start:stop:step].get(), + a_gpu.get()[start:stop:step, start:stop:step], + ) + + a_gpu = curand((3, 3, 3)).transpose((1, 2, 0)) + for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step, :, start:stop:step].get(), + a_gpu.get()[start:stop:step, :, start:stop:step], + ) # 4-d should work as long as only 2 axes are discontiguous - a_gpu = curand((3,3,3,3)) - a = a_gpu.get() - for start, stop, step in [(0,3,1), (1,2,1), (0,3,3)]: - assert np.allclose(a_gpu[start:stop:step,:,start:stop:step].get(), a_gpu.get()[start:stop:step,:,start:stop:step]) + a_gpu = curand((3, 3, 3, 3)) + for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 3)]: + assert np.allclose( + a_gpu[start:stop:step, :, start:stop:step].get(), + a_gpu.get()[start:stop:step, :, start:stop:step], + ) @mark_cuda_test def test_get_set(self): import pycuda.gpuarray as gpuarray - a = np.random.normal(0., 1., (4,4)) + a = np.random.normal(0.0, 1.0, (4, 4)) a_gpu = gpuarray.to_gpu(a) assert np.allclose(a_gpu.get(), a) - assert np.allclose(a_gpu[1:3,1:3].get(), a[1:3,1:3]) + assert np.allclose(a_gpu[1:3, 1:3].get(), a[1:3, 1:3]) - a = np.random.normal(0., 1., (4,4,4)).transpose((1,2,0)) + a = np.random.normal(0.0, 1.0, (4, 4, 4)).transpose((1, 2, 0)) a_gpu = gpuarray.to_gpu(a) assert np.allclose(a_gpu.get(), a) - assert np.allclose(a_gpu[1:3,1:3,1:3].get(), a[1:3,1:3,1:3]) + assert np.allclose(a_gpu[1:3, 1:3, 1:3].get(), a[1:3, 1:3, 1:3]) @mark_cuda_test def test_zeros_like_etc(self): @@ -1135,11 +1187,9 @@ class TestGPUArray: a = np.random.randn(*shape).astype(np.float32) z = gpuarray.to_gpu(a) zf = gpuarray.to_gpu(np.asfortranarray(a)) - a_noncontig = np.arange(3*4*5).reshape(3, 4, 5).swapaxes(1, 2) + a_noncontig = np.arange(3 * 4 * 5).reshape(3, 4, 5).swapaxes(1, 2) z_noncontig = gpuarray.to_gpu(a_noncontig) - for func in [gpuarray.empty_like, - gpuarray.zeros_like, - gpuarray.ones_like]: + for func in [gpuarray.empty_like, gpuarray.zeros_like, gpuarray.ones_like]: for arr in [z, zf, z_noncontig]: contig = arr.flags.c_contiguous or arr.flags.f_contiguous @@ -1184,7 +1234,8 @@ if __name__ == "__main__": import pycuda.autoinit # noqa if len(sys.argv) > 1: - exec (sys.argv[1]) + exec(sys.argv[1]) else: from pytest import main + main([__file__]) diff --git a/test/undistributed/elwise-perf.py b/test/undistributed/elwise-perf.py index 7d609e24..dc2c2821 100644 --- a/test/undistributed/elwise-perf.py +++ b/test/undistributed/elwise-perf.py @@ -1,5 +1,6 @@ from __future__ import absolute_import from __future__ import print_function + #! /usr/bin/env python import pycuda.driver as drv import pycuda.autoinit @@ -8,10 +9,9 @@ import numpy.linalg as la from six.moves import range - - def main(): from pytools import Table + tbl = Table() tbl.add_row(("size [MiB]", "time [s]", "mem.bw [GB/s]")) @@ -19,7 +19,7 @@ def main(): # they're floats, i.e. 4 bytes each for power in range(10, 28): - size = 1< Date: Sun, 11 Oct 2020 15:48:24 -0500 Subject: [PATCH 2/3] Fix coding conventions in setup.py --- setup.py | 303 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 162 insertions(+), 141 deletions(-) diff --git a/setup.py b/setup.py index febe2f86..2a7acbe3 100644 --- a/setup.py +++ b/setup.py @@ -22,9 +22,17 @@ def search_on_path(filenames): def get_config_schema(): - from aksetup_helper import (ConfigSchema, Option, - IncludeDir, LibraryDir, Libraries, BoostLibraries, - Switch, StringListOption, make_boost_base_options) + from aksetup_helper import ( + ConfigSchema, + Option, + IncludeDir, + LibraryDir, + Libraries, + BoostLibraries, + Switch, + StringListOption, + make_boost_base_options, + ) nvcc_path = search_on_path(["nvcc", "nvcc.exe"]) if nvcc_path is None: @@ -41,69 +49,79 @@ def get_config_schema(): lib64 = "lib64" import sys + if sys.platform.startswith("win"): # https://github.com/inducer/pycuda/issues/113 lib64 = "lib/x64" - cxxflags_default.extend(['/EHsc']) - ldflags_default.extend(['/FORCE']) - elif 'darwin' in sys.platform: + cxxflags_default.extend(["/EHsc"]) + ldflags_default.extend(["/FORCE"]) + elif "darwin" in sys.platform: import glob - root_candidates = glob.glob('/Developer/NVIDIA/CUDA-*') + + root_candidates = glob.glob("/Developer/NVIDIA/CUDA-*") if root_candidates: cuda_root_default = root_candidates[-1] lib64 = "lib" default_lib_dirs = [ "${CUDA_ROOT}/lib", - "${CUDA_ROOT}/"+lib64, + "${CUDA_ROOT}/" + lib64, # https://github.com/inducer/pycuda/issues/98 "${CUDA_ROOT}/lib/stubs", "${CUDA_ROOT}/%s/stubs" % lib64, + ] + + if "darwin" in sys.platform: + default_lib_dirs.append("/usr/local/cuda/lib") + + return ConfigSchema( + make_boost_base_options() + + [ + Switch("USE_SHIPPED_BOOST", True, "Use included Boost library"), + BoostLibraries("python"), + BoostLibraries("thread"), + Switch("CUDA_TRACE", False, "Enable CUDA API tracing"), + Option( + "CUDA_ROOT", default=cuda_root_default, help="Path to the CUDA toolkit" + ), + Option( + "CUDA_PRETEND_VERSION", + help="Assumed CUDA version, in the form 3010 for 3.1.", + ), + IncludeDir("CUDA", None), + Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"), + Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"), + LibraryDir("CUDADRV", default_lib_dirs), + Libraries("CUDADRV", ["cuda"]), + LibraryDir("CUDART", default_lib_dirs), + Libraries("CUDART", ["cudart"]), + LibraryDir("CURAND", default_lib_dirs), + Libraries("CURAND", ["curand"]), + StringListOption( + "CXXFLAGS", + cxxflags_default, + help="Any extra C++ compiler options to include", + ), + StringListOption( + "LDFLAGS", ldflags_default, help="Any extra linker options to include" + ), ] - - if 'darwin' in sys.platform: - default_lib_dirs.append( - "/usr/local/cuda/lib") - - return ConfigSchema(make_boost_base_options() + [ - Switch("USE_SHIPPED_BOOST", True, "Use included Boost library"), - - BoostLibraries("python"), - BoostLibraries("thread"), - - Switch("CUDA_TRACE", False, "Enable CUDA API tracing"), - Option("CUDA_ROOT", default=cuda_root_default, - help="Path to the CUDA toolkit"), - Option("CUDA_PRETEND_VERSION", - help="Assumed CUDA version, in the form 3010 for 3.1."), - IncludeDir("CUDA", None), - - Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"), - Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"), - - LibraryDir("CUDADRV", default_lib_dirs), - Libraries("CUDADRV", ["cuda"]), - - LibraryDir("CUDART", default_lib_dirs), - Libraries("CUDART", ["cudart"]), - - LibraryDir("CURAND", default_lib_dirs), - Libraries("CURAND", ["curand"]), - - StringListOption("CXXFLAGS", cxxflags_default, - help="Any extra C++ compiler options to include"), - StringListOption("LDFLAGS", ldflags_default, - help="Any extra linker options to include"), - ]) + ) def main(): import sys - from aksetup_helper import (hack_distutils, get_config, setup, - ExtensionUsingNumpy, set_up_shipped_boost_if_requested, - check_git_submodules, NumpyBuildExtCommand) + from aksetup_helper import ( + hack_distutils, + get_config, + setup, + ExtensionUsingNumpy, + set_up_shipped_boost_if_requested, + check_git_submodules, + NumpyBuildExtCommand, + ) check_git_submodules() @@ -116,8 +134,11 @@ def main(): EXTRA_DEFINES["PYGPU_PYCUDA"] = "1" LIBRARY_DIRS = conf["BOOST_LIB_DIR"] + conf["CUDADRV_LIB_DIR"] - LIBRARIES = (conf["BOOST_PYTHON_LIBNAME"] + conf["BOOST_THREAD_LIBNAME"] - + conf["CUDADRV_LIBNAME"]) + LIBRARIES = ( + conf["BOOST_PYTHON_LIBNAME"] + + conf["BOOST_THREAD_LIBNAME"] + + conf["CUDADRV_LIBNAME"] + ) if not conf["CUDA_INC_DIR"] and conf["CUDA_ROOT"]: conf["CUDA_INC_DIR"] = [join(conf["CUDA_ROOT"], "include")] @@ -128,20 +149,20 @@ def main(): if conf["CUDA_PRETEND_VERSION"]: EXTRA_DEFINES["CUDAPP_PRETEND_CUDA_VERSION"] = conf["CUDA_PRETEND_VERSION"] - INCLUDE_DIRS = ['src/cpp'] + conf["BOOST_INC_DIR"] + INCLUDE_DIRS = ["src/cpp"] + conf["BOOST_INC_DIR"] if conf["CUDA_INC_DIR"]: INCLUDE_DIRS += conf["CUDA_INC_DIR"] conf["USE_CUDA"] = True - if 'darwin' in sys.platform and sys.maxsize == 2147483647: + if "darwin" in sys.platform and sys.maxsize == 2147483647: # The Python interpreter is running in 32 bit mode on OS X if "-arch" not in conf["CXXFLAGS"]: - conf["CXXFLAGS"].extend(['-arch', 'i386', '-m32']) + conf["CXXFLAGS"].extend(["-arch", "i386", "-m32"]) if "-arch" not in conf["LDFLAGS"]: - conf["LDFLAGS"].extend(['-arch', 'i386', '-m32']) + conf["LDFLAGS"].extend(["-arch", "i386", "-m32"]) - if 'darwin' in sys.platform: + if "darwin" in sys.platform: # set path to Cuda dynamic libraries, # as a safe substitute for DYLD_LIBRARY_PATH for lib_dir in conf["CUDADRV_LIB_DIR"]: @@ -153,103 +174,103 @@ def main(): if conf["CUDA_ENABLE_CURAND"]: EXTRA_DEFINES["HAVE_CURAND"] = 1 - EXTRA_SOURCES.extend([ - "src/wrapper/wrap_curand.cpp" - ]) + EXTRA_SOURCES.extend(["src/wrapper/wrap_curand.cpp"]) LIBRARIES.extend(conf["CURAND_LIBNAME"]) LIBRARY_DIRS.extend(conf["CURAND_LIB_DIR"]) ver_dic = {} - exec(compile(open("pycuda/__init__.py").read(), "pycuda/__init__.py", 'exec'), - ver_dic) + exec( + compile(open("pycuda/__init__.py").read(), "pycuda/__init__.py", "exec"), + ver_dic, + ) import sys + if sys.version_info >= (3,): pvt_struct_source = "src/wrapper/_pvt_struct_v3.cpp" else: pvt_struct_source = "src/wrapper/_pvt_struct_v2.cpp" - setup(name="pycuda", - # metadata - version=ver_dic["VERSION_TEXT"], - description="Python wrapper for Nvidia CUDA", - long_description=open("README.rst", "rt").read(), - author="Andreas Kloeckner", - author_email="inform@tiker.net", - license="MIT", - url="http://mathema.tician.de/software/pycuda", - classifiers=[ - 'Environment :: Console', - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Intended Audience :: Other Audience', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: C++', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Physics', - 'Topic :: Scientific/Engineering :: Visualization', - ], - - # build info - packages=["pycuda", "pycuda.gl", "pycuda.sparse", "pycuda.compyte"], - - setup_requires=[ - "numpy>=1.6", - ], - - python_requires="~=3.6", - install_requires=[ - "pytools>=2011.2", - "decorator>=3.2.0", - "appdirs>=1.4.0", - "mako", - ], - - test_requires=[ - "pytest>=2", - ], - - ext_package="pycuda", - ext_modules=[ - ExtensionUsingNumpy("_driver", - [ - "src/cpp/cuda.cpp", - "src/cpp/bitlog.cpp", - "src/wrapper/wrap_cudadrv.cpp", - "src/wrapper/mempool.cpp", - ]+EXTRA_SOURCES, - include_dirs=INCLUDE_DIRS, - library_dirs=LIBRARY_DIRS, - libraries=LIBRARIES, - define_macros=list(EXTRA_DEFINES.items()), - extra_compile_args=conf["CXXFLAGS"], - extra_link_args=conf["LDFLAGS"], - ), - ExtensionUsingNumpy("_pvt_struct", - [pvt_struct_source], - extra_compile_args=conf["CXXFLAGS"], - extra_link_args=conf["LDFLAGS"], - ), - ], - cmdclass={'build_ext': NumpyBuildExtCommand}, - include_package_data=True, - package_data={ - "pycuda": [ - "cuda/*.hpp", - ] - }, - - zip_safe=False) - - -if __name__ == '__main__': + setup( + name="pycuda", + # metadata + version=ver_dic["VERSION_TEXT"], + description="Python wrapper for Nvidia CUDA", + long_description=open("README.rst", "rt").read(), + author="Andreas Kloeckner", + author_email="inform@tiker.net", + license="MIT", + url="http://mathema.tician.de/software/pycuda", + classifiers=[ + "Environment :: Console", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Other Audience", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: C++", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3.3", + "Programming Language :: Python :: 3.4", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Physics", + "Topic :: Scientific/Engineering :: Visualization", + ], + # build info + packages=["pycuda", "pycuda.gl", "pycuda.sparse", "pycuda.compyte"], + setup_requires=[ + "numpy>=1.6", + ], + python_requires="~=3.6", + install_requires=[ + "pytools>=2011.2", + "decorator>=3.2.0", + "appdirs>=1.4.0", + "mako", + ], + test_requires=[ + "pytest>=2", + ], + ext_package="pycuda", + ext_modules=[ + ExtensionUsingNumpy( + "_driver", + [ + "src/cpp/cuda.cpp", + "src/cpp/bitlog.cpp", + "src/wrapper/wrap_cudadrv.cpp", + "src/wrapper/mempool.cpp", + ] + + EXTRA_SOURCES, + include_dirs=INCLUDE_DIRS, + library_dirs=LIBRARY_DIRS, + libraries=LIBRARIES, + define_macros=list(EXTRA_DEFINES.items()), + extra_compile_args=conf["CXXFLAGS"], + extra_link_args=conf["LDFLAGS"], + ), + ExtensionUsingNumpy( + "_pvt_struct", + [pvt_struct_source], + extra_compile_args=conf["CXXFLAGS"], + extra_link_args=conf["LDFLAGS"], + ), + ], + cmdclass={"build_ext": NumpyBuildExtCommand}, + include_package_data=True, + package_data={ + "pycuda": [ + "cuda/*.hpp", + ] + }, + zip_safe=False, + ) + + +if __name__ == "__main__": main() -- GitLab From f95bf36d314c60aa1ef8dc7643eee45acb87feac Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 11 Oct 2020 15:52:57 -0500 Subject: [PATCH 3/3] Run pyupgrade 3.6+ --- examples/demo.py | 2 -- examples/demo_elementwise.py | 1 - examples/demo_meta_codepy.py | 2 -- examples/demo_meta_template.py | 1 - examples/demo_struct.py | 2 -- examples/dump_properties.py | 5 +-- examples/fill_gpu_with_nans.py | 2 -- examples/from-wiki/2d_fft.py | 2 -- examples/from-wiki/2dfft.py | 2 -- examples/from-wiki/arithmetic_example.py | 2 -- examples/from-wiki/c++_function_templates.py | 2 -- examples/from-wiki/computing.py | 2 -- examples/from-wiki/convolution.py | 2 -- examples/from-wiki/demo_complex.py | 2 -- examples/from-wiki/demo_meta_cgen.py | 2 -- .../from-wiki/demo_meta_matrixmul_cheetah.py | 3 -- examples/from-wiki/distance_element_wise3d.py | 2 -- examples/from-wiki/game_of_life.py | 2 -- examples/from-wiki/gl_interop.py | 2 -- examples/from-wiki/gpu_scalar_mult.py | 2 -- examples/from-wiki/kernel_concurrency.py | 4 +-- examples/from-wiki/light_field_3d_viewer.py | 9 ++--- examples/from-wiki/mandelbrot.py | 2 -- examples/from-wiki/mandelbrot_interactive.py | 10 +++--- .../manhattan_distance_for_2D_array.py | 2 -- examples/from-wiki/matrix_transpose.py | 3 -- examples/from-wiki/matrixmul_simple.py | 2 -- examples/from-wiki/matrixmul_tiled.py | 3 -- .../measure_gpuarray_speed_random.py | 2 -- examples/from-wiki/multiple_threads.py | 2 -- examples/from-wiki/plot_random_data.py | 2 -- examples/from-wiki/rotate.py | 2 -- examples/from-wiki/select_to_list.py | 3 -- examples/from-wiki/simple_rgb2gray.py | 2 -- examples/from-wiki/simple_speed_test.py | 2 -- examples/from-wiki/sobel_filter.py | 2 -- examples/from-wiki/sparse_solve.py | 3 -- examples/from-wiki/threads_and_blocks.py | 2 -- examples/from-wiki/thrust_interop.py | 2 -- examples/from-wiki/using_printf.py | 2 -- examples/hello_gpu.py | 2 -- pycuda/_mymako.py | 2 -- pycuda/autoinit.py | 1 - pycuda/characterize.py | 3 -- pycuda/compiler.py | 9 ++--- pycuda/cumath.py | 1 - pycuda/curandom.py | 30 +++++++--------- pycuda/debug.py | 2 -- pycuda/driver.py | 34 +++++++------------ pycuda/elementwise.py | 8 +---- pycuda/gpuarray.py | 15 ++++---- pycuda/reduction.py | 9 ++--- pycuda/scan.py | 8 ++--- pycuda/sparse/packeted.py | 1 - pycuda/sparse/pkt_build.py | 1 - pycuda/tools.py | 16 +++------ test/test_cumath.py | 3 -- test/test_driver.py | 4 +-- test/test_gpuarray.py | 6 ++-- 59 files changed, 57 insertions(+), 201 deletions(-) diff --git a/examples/demo.py b/examples/demo.py index 2bd78b4e..a1c40420 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -1,6 +1,4 @@ # Sample source code from the Tutorial Introduction in the documentation. -from __future__ import print_function -from __future__ import absolute_import import pycuda.driver as cuda import pycuda.autoinit # noqa from pycuda.compiler import SourceModule diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py index 0fcbb5fe..9baadc13 100644 --- a/examples/demo_elementwise.py +++ b/examples/demo_elementwise.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import pycuda.gpuarray as gpuarray import pycuda.autoinit import numpy diff --git a/examples/demo_meta_codepy.py b/examples/demo_meta_codepy.py index efb4d4c1..ddd89f5a 100644 --- a/examples/demo_meta_codepy.py +++ b/examples/demo_meta_codepy.py @@ -1,10 +1,8 @@ -from __future__ import absolute_import import pycuda.driver as cuda import pycuda.autoinit import numpy import numpy.linalg as la from pycuda.compiler import SourceModule -from six.moves import range thread_strides = 16 block_size = 256 diff --git a/examples/demo_meta_template.py b/examples/demo_meta_template.py index bac4e021..4093434d 100644 --- a/examples/demo_meta_template.py +++ b/examples/demo_meta_template.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import pycuda.driver as cuda import pycuda.autoinit import numpy diff --git a/examples/demo_struct.py b/examples/demo_struct.py index 5591ffb7..3f230264 100644 --- a/examples/demo_struct.py +++ b/examples/demo_struct.py @@ -1,6 +1,4 @@ # prepared invocations and structures ----------------------------------------- -from __future__ import print_function -from __future__ import absolute_import import pycuda.driver as cuda import pycuda.autoinit import numpy diff --git a/examples/dump_properties.py b/examples/dump_properties.py index 65a684bb..a960619c 100644 --- a/examples/dump_properties.py +++ b/examples/dump_properties.py @@ -1,7 +1,4 @@ -from __future__ import print_function -from __future__ import absolute_import import pycuda.driver as drv -from six.moves import range @@ -18,5 +15,5 @@ for ordinal in range(drv.Device.count()): atts.sort() for att, value in atts: - print(" %s: %s" % (att, value)) + print(f" {att}: {value}") diff --git a/examples/fill_gpu_with_nans.py b/examples/fill_gpu_with_nans.py index 3a8e9497..d9306d84 100644 --- a/examples/fill_gpu_with_nans.py +++ b/examples/fill_gpu_with_nans.py @@ -1,5 +1,3 @@ -from __future__ import print_function -from __future__ import absolute_import import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as cuda diff --git a/examples/from-wiki/2d_fft.py b/examples/from-wiki/2d_fft.py index 0efb82c1..424984a4 100644 --- a/examples/from-wiki/2d_fft.py +++ b/examples/from-wiki/2d_fft.py @@ -1,5 +1,3 @@ - - #!python import numpy import scipy.misc diff --git a/examples/from-wiki/2dfft.py b/examples/from-wiki/2dfft.py index cd2143ba..67585107 100644 --- a/examples/from-wiki/2dfft.py +++ b/examples/from-wiki/2dfft.py @@ -1,5 +1,3 @@ - - #!python # Paste code for your example here. diff --git a/examples/from-wiki/arithmetic_example.py b/examples/from-wiki/arithmetic_example.py index bf0ed190..35dd9d3e 100644 --- a/examples/from-wiki/arithmetic_example.py +++ b/examples/from-wiki/arithmetic_example.py @@ -1,5 +1,3 @@ - - #!python #!python import pycuda.driver as cuda diff --git a/examples/from-wiki/c++_function_templates.py b/examples/from-wiki/c++_function_templates.py index 7a50ba34..b3ef4f81 100644 --- a/examples/from-wiki/c++_function_templates.py +++ b/examples/from-wiki/c++_function_templates.py @@ -1,5 +1,3 @@ - - #!python import pycuda.gpuarray as gpuarray import pycuda.driver as drv diff --git a/examples/from-wiki/computing.py b/examples/from-wiki/computing.py index cd2143ba..67585107 100644 --- a/examples/from-wiki/computing.py +++ b/examples/from-wiki/computing.py @@ -1,5 +1,3 @@ - - #!python # Paste code for your example here. diff --git a/examples/from-wiki/convolution.py b/examples/from-wiki/convolution.py index 9502daf3..12246a09 100644 --- a/examples/from-wiki/convolution.py +++ b/examples/from-wiki/convolution.py @@ -1,5 +1,3 @@ - - #!python ''' /* diff --git a/examples/from-wiki/demo_complex.py b/examples/from-wiki/demo_complex.py index 53263e37..031221ee 100644 --- a/examples/from-wiki/demo_complex.py +++ b/examples/from-wiki/demo_complex.py @@ -1,5 +1,3 @@ - - #!python import pycuda.driver as cuda import pycuda.gpuarray as gpuarray diff --git a/examples/from-wiki/demo_meta_cgen.py b/examples/from-wiki/demo_meta_cgen.py index f914b412..51e1d737 100644 --- a/examples/from-wiki/demo_meta_cgen.py +++ b/examples/from-wiki/demo_meta_cgen.py @@ -1,5 +1,3 @@ - - #!python import pycuda.driver as cuda import pycuda.autoinit diff --git a/examples/from-wiki/demo_meta_matrixmul_cheetah.py b/examples/from-wiki/demo_meta_matrixmul_cheetah.py index a3cf4b9c..9eb6531e 100644 --- a/examples/from-wiki/demo_meta_matrixmul_cheetah.py +++ b/examples/from-wiki/demo_meta_matrixmul_cheetah.py @@ -1,10 +1,7 @@ - - #!python #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import division """ PyCuda Optimized Matrix Multiplication diff --git a/examples/from-wiki/distance_element_wise3d.py b/examples/from-wiki/distance_element_wise3d.py index 9a5fa74f..1a513839 100644 --- a/examples/from-wiki/distance_element_wise3d.py +++ b/examples/from-wiki/distance_element_wise3d.py @@ -1,5 +1,3 @@ - - #!python import pycuda.gpuarray as gpuarray import pycuda.driver as cuda diff --git a/examples/from-wiki/game_of_life.py b/examples/from-wiki/game_of_life.py index 16958bf2..eb9427a7 100644 --- a/examples/from-wiki/game_of_life.py +++ b/examples/from-wiki/game_of_life.py @@ -1,5 +1,3 @@ - - #!python # Conway's Game of Life Accelerated with PyCUDA # Luis Villasenor diff --git a/examples/from-wiki/gl_interop.py b/examples/from-wiki/gl_interop.py index 1f7f2f81..505a7166 100644 --- a/examples/from-wiki/gl_interop.py +++ b/examples/from-wiki/gl_interop.py @@ -1,5 +1,3 @@ - - #!python # GL interoperability example, by Peter Berrington. # Draws a rotating teapot, using cuda to invert the RGB value diff --git a/examples/from-wiki/gpu_scalar_mult.py b/examples/from-wiki/gpu_scalar_mult.py index 669d9469..e3c17421 100644 --- a/examples/from-wiki/gpu_scalar_mult.py +++ b/examples/from-wiki/gpu_scalar_mult.py @@ -1,5 +1,3 @@ - - #!python import numpy import pycuda.autoinit diff --git a/examples/from-wiki/kernel_concurrency.py b/examples/from-wiki/kernel_concurrency.py index a48e642b..c561980a 100644 --- a/examples/from-wiki/kernel_concurrency.py +++ b/examples/from-wiki/kernel_concurrency.py @@ -1,5 +1,3 @@ - - #!python #! /usr/bin/env python # A simple program to illustrate kernel concurrency with PyCuda. @@ -52,7 +50,7 @@ stream, event = [], [] marker_names = ['kernel_begin', 'kernel_end'] for k in range(n): stream.append(drv.Stream()) - event.append(dict([(marker_names[l], drv.Event()) for l in range(len(marker_names))])) + event.append({marker_names[l]: drv.Event() for l in range(len(marker_names))}) # Transfer to device. for k in range(n): diff --git a/examples/from-wiki/light_field_3d_viewer.py b/examples/from-wiki/light_field_3d_viewer.py index 1129d2ef..3f22e005 100644 --- a/examples/from-wiki/light_field_3d_viewer.py +++ b/examples/from-wiki/light_field_3d_viewer.py @@ -1,5 +1,3 @@ - - #!python """ 3D display of Light Field images. @@ -18,7 +16,6 @@ Prerequisites: Author: Amit Aides. amitibo at technion . ac . il """ -from __future__ import division from enthought.traits.api import HasTraits, Range, on_trait_change from enthought.traits.ui.api import View, Item @@ -110,7 +107,7 @@ class LFapplication(HasTraits): ) def __init__(self, img_path): - super(LFapplication, self).__init__() + super().__init__() # # Load image data @@ -119,12 +116,12 @@ class LFapplication(HasTraits): lenslet_path = base_path + '-lenslet.txt' optics_path = base_path + '-optics.txt' - with open(lenslet_path, 'r') as f: + with open(lenslet_path) as f: tmp = eval(f.readline()) x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \ np.array(tmp, dtype=np.float32) - with open(optics_path, 'r') as f: + with open(optics_path) as f: for line in f: name, val = line.strip().split() try: diff --git a/examples/from-wiki/mandelbrot.py b/examples/from-wiki/mandelbrot.py index 21873dd1..1d40e337 100644 --- a/examples/from-wiki/mandelbrot.py +++ b/examples/from-wiki/mandelbrot.py @@ -1,5 +1,3 @@ - - #!python # Mandelbrot calculate using GPU, Serial numpy and faster numpy # Use to show the speed difference between CPU and GPU calculations diff --git a/examples/from-wiki/mandelbrot_interactive.py b/examples/from-wiki/mandelbrot_interactive.py index 15d9e801..e434f1be 100644 --- a/examples/from-wiki/mandelbrot_interactive.py +++ b/examples/from-wiki/mandelbrot_interactive.py @@ -1,5 +1,3 @@ - - #!python # Interactive Mandelbrot Set Accelerated using PyCUDA # Classical Iteration Method @@ -108,21 +106,21 @@ def zoom_on_square(eclick, erelease): def key_selector(event): global N,side,x0,y0,myobj,M,power,L,i_cmap,n_grid #print(' Key pressed.') - if event.key == u'up': # Increase max number of iterations + if event.key == 'up': # Increase max number of iterations L=int(L*1.2); print("Maximum number of iterations changed to %d" % L) func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) plt.draw() - if event.key == u'down': # Decrease max number of iterations + if event.key == 'down': # Decrease max number of iterations L=int(L/1.2); print("Maximum number of iterations changed to %d" % L) func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) plt.draw() - if event.key == u'right': # Increase number of pixels + if event.key == 'right': # Increase number of pixels N=int(N*1.2); n_grid=int(N/16.); N=n_block*n_grid; @@ -132,7 +130,7 @@ def key_selector(event): myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) plt.draw() - if event.key == u'left': # Decrease number of pixels + if event.key == 'left': # Decrease number of pixels N=int(N/1.2); n_grid=int(N/16.); N=n_block*n_grid; diff --git a/examples/from-wiki/manhattan_distance_for_2D_array.py b/examples/from-wiki/manhattan_distance_for_2D_array.py index e3201b8a..3d7c8371 100644 --- a/examples/from-wiki/manhattan_distance_for_2D_array.py +++ b/examples/from-wiki/manhattan_distance_for_2D_array.py @@ -1,5 +1,3 @@ - - #!python import numpy diff --git a/examples/from-wiki/matrix_transpose.py b/examples/from-wiki/matrix_transpose.py index 2075fc7c..d8ad7848 100644 --- a/examples/from-wiki/matrix_transpose.py +++ b/examples/from-wiki/matrix_transpose.py @@ -1,12 +1,9 @@ - - #!python # Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises # Transposition of a matrix # by Hendrik Riedmann -from __future__ import division, print_function import pycuda.driver as cuda import pycuda.gpuarray as gpuarray diff --git a/examples/from-wiki/matrixmul_simple.py b/examples/from-wiki/matrixmul_simple.py index 7686cbfd..5eda5648 100644 --- a/examples/from-wiki/matrixmul_simple.py +++ b/examples/from-wiki/matrixmul_simple.py @@ -1,5 +1,3 @@ - - #!python #!/usr/bin/env python # -*- coding: utf-8 -*- diff --git a/examples/from-wiki/matrixmul_tiled.py b/examples/from-wiki/matrixmul_tiled.py index 072011ea..65dd7b03 100644 --- a/examples/from-wiki/matrixmul_tiled.py +++ b/examples/from-wiki/matrixmul_tiled.py @@ -1,10 +1,7 @@ - - #!python #!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import division """ Multiples two square matrices together using multiple blocks and shared memory. diff --git a/examples/from-wiki/measure_gpuarray_speed_random.py b/examples/from-wiki/measure_gpuarray_speed_random.py index b0d87ec4..8f822af0 100644 --- a/examples/from-wiki/measure_gpuarray_speed_random.py +++ b/examples/from-wiki/measure_gpuarray_speed_random.py @@ -1,5 +1,3 @@ - - #!python #! /usr/bin/env python # DO NOT USE THIS AS A BENCHMARK. See diff --git a/examples/from-wiki/multiple_threads.py b/examples/from-wiki/multiple_threads.py index 73068170..0a5a7738 100644 --- a/examples/from-wiki/multiple_threads.py +++ b/examples/from-wiki/multiple_threads.py @@ -1,5 +1,3 @@ - - #!python # Derived from a test case by Chris Heuser # Also see FAQ about PyCUDA and threads. diff --git a/examples/from-wiki/plot_random_data.py b/examples/from-wiki/plot_random_data.py index d3fb1a10..2e0118c3 100644 --- a/examples/from-wiki/plot_random_data.py +++ b/examples/from-wiki/plot_random_data.py @@ -1,5 +1,3 @@ - - #!python # simple module to show the plotting of random data diff --git a/examples/from-wiki/rotate.py b/examples/from-wiki/rotate.py index db7307c9..a94ec5aa 100644 --- a/examples/from-wiki/rotate.py +++ b/examples/from-wiki/rotate.py @@ -1,5 +1,3 @@ - - #!python #!/usr/bin/env python -tt # encoding: utf-8 diff --git a/examples/from-wiki/select_to_list.py b/examples/from-wiki/select_to_list.py index d6237147..13c31072 100644 --- a/examples/from-wiki/select_to_list.py +++ b/examples/from-wiki/select_to_list.py @@ -1,12 +1,9 @@ - - #!python # Exercise 2 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises # Generate an array of random numbers between 0 and 1 # List the indices of those numbers that are greater than a given limit -from __future__ import division import pycuda.driver as cuda import pycuda.autoinit import pycuda.gpuarray as gpuarray diff --git a/examples/from-wiki/simple_rgb2gray.py b/examples/from-wiki/simple_rgb2gray.py index c90045eb..9c648f4a 100644 --- a/examples/from-wiki/simple_rgb2gray.py +++ b/examples/from-wiki/simple_rgb2gray.py @@ -1,5 +1,3 @@ - - #!python __author__ = 'ashwin' diff --git a/examples/from-wiki/simple_speed_test.py b/examples/from-wiki/simple_speed_test.py index 1d6588f9..fccc57c8 100644 --- a/examples/from-wiki/simple_speed_test.py +++ b/examples/from-wiki/simple_speed_test.py @@ -1,5 +1,3 @@ - - #!python # SimpleSpeedTest.py diff --git a/examples/from-wiki/sobel_filter.py b/examples/from-wiki/sobel_filter.py index 2170da90..a3dd9f5b 100644 --- a/examples/from-wiki/sobel_filter.py +++ b/examples/from-wiki/sobel_filter.py @@ -1,5 +1,3 @@ - - #!python #!/usr/bin/env python #-*- coding: utf-8 -*- diff --git a/examples/from-wiki/sparse_solve.py b/examples/from-wiki/sparse_solve.py index 8d057c32..359414bc 100644 --- a/examples/from-wiki/sparse_solve.py +++ b/examples/from-wiki/sparse_solve.py @@ -1,7 +1,4 @@ - - #!python -from __future__ import division import pycuda.autoinit import pycuda.driver as drv import pycuda.gpuarray as gpuarray diff --git a/examples/from-wiki/threads_and_blocks.py b/examples/from-wiki/threads_and_blocks.py index 3b0b2c21..caa851f3 100644 --- a/examples/from-wiki/threads_and_blocks.py +++ b/examples/from-wiki/threads_and_blocks.py @@ -1,5 +1,3 @@ - - #!python import pycuda.driver as cuda import pycuda.autoinit diff --git a/examples/from-wiki/thrust_interop.py b/examples/from-wiki/thrust_interop.py index a9868f65..f408e029 100644 --- a/examples/from-wiki/thrust_interop.py +++ b/examples/from-wiki/thrust_interop.py @@ -1,5 +1,3 @@ - - #!python import pycuda diff --git a/examples/from-wiki/using_printf.py b/examples/from-wiki/using_printf.py index 160a156c..c12cfa42 100644 --- a/examples/from-wiki/using_printf.py +++ b/examples/from-wiki/using_printf.py @@ -1,5 +1,3 @@ - - #!python import pycuda.driver as cuda import pycuda.autoinit diff --git a/examples/hello_gpu.py b/examples/hello_gpu.py index 1c8fd04e..23c51adb 100644 --- a/examples/hello_gpu.py +++ b/examples/hello_gpu.py @@ -1,5 +1,3 @@ -from __future__ import print_function -from __future__ import absolute_import import pycuda.driver as drv import pycuda.tools import pycuda.autoinit diff --git a/pycuda/_mymako.py b/pycuda/_mymako.py index f2a98c91..826dde61 100644 --- a/pycuda/_mymako.py +++ b/pycuda/_mymako.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - try: import mako.template # noqa: F401 except ImportError: diff --git a/pycuda/autoinit.py b/pycuda/autoinit.py index 00f39827..664ed1c8 100644 --- a/pycuda/autoinit.py +++ b/pycuda/autoinit.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import pycuda.driver as cuda import atexit diff --git a/pycuda/characterize.py b/pycuda/characterize.py index 2badde2e..3b46ff7a 100644 --- a/pycuda/characterize.py +++ b/pycuda/characterize.py @@ -1,6 +1,3 @@ -from __future__ import division -from __future__ import absolute_import - from pycuda.tools import context_dependent_memoize import numpy as np diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 9bbd05d0..065a6314 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from __future__ import print_function from pytools import memoize # don't import pycuda.driver here--you'll create an import loop @@ -10,7 +8,6 @@ from tempfile import mkstemp from os import unlink from pytools.prefork import call_capture_output -from six.moves import map @memoize @@ -136,7 +133,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): try: result_f = open(join(file_dir, file_root + "." + target), "rb") - except IOError: + except OSError: no_output = True else: no_output = False @@ -301,7 +298,7 @@ def compile( return compile_plain(source, options, keep, nvcc, cache_dir, target) -class CudaModule(object): +class CudaModule: def _check_arch(self, arch): if arch is None: return @@ -570,7 +567,7 @@ class DynamicSourceModule(DynamicModule): include_dirs=[], cuda_libdir=None, ): - super(DynamicSourceModule, self).__init__( + super().__init__( nvcc=nvcc, link_options=None, keep=keep, diff --git a/pycuda/cumath.py b/pycuda/cumath.py index 42a510e6..ab2b32c8 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import pycuda.gpuarray as gpuarray import pycuda.elementwise as elementwise import numpy as np diff --git a/pycuda/curandom.py b/pycuda/curandom.py index 1d6f930b..fe6f54b5 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -1,12 +1,8 @@ -from __future__ import division -from __future__ import absolute_import - import numpy as np import pycuda.compiler import pycuda.driver as drv import pycuda.gpuarray as array from pytools import memoize_method -import six # {{{ MD5-based random number generation @@ -369,7 +365,7 @@ __global__ void skip_ahead_array(%(state_type)s *s, const int n, const unsigned """ -class _RandomNumberGeneratorBase(object): +class _RandomNumberGeneratorBase: """ Class surrounding CURAND kernels from CUDA 3.2. It allows for generating random numbers with uniform @@ -413,7 +409,7 @@ class _RandomNumberGeneratorBase(object): scramble_type=None, ): if get_curand_version() < (3, 2, 0): - raise EnvironmentError("Need at least CUDA 3.2") + raise OSError("Need at least CUDA 3.2") dev = drv.Context.get_device() @@ -557,7 +553,7 @@ class _RandomNumberGeneratorBase(object): self.skip_ahead_array.prepare("PiP") def _kernels(self): - return list(six.itervalues(self.generators)) + [ + return list(self.generators.values()) + [ self.skip_ahead, self.skip_ahead_array, ] @@ -751,7 +747,7 @@ class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): scramble_type=None, ): - super(_PseudoRandomNumberGeneratorBase, self).__init__( + super().__init__( state_type, vector_type, generator_bits, additional_source ) @@ -898,7 +894,7 @@ if get_curand_version() >= (3, 2, 0): :class:`GPUArray` of seeds. """ - super(XORWOWRandomNumberGenerator, self).__init__( + super().__init__( seed_getter, offset, "curandStateXORWOW", @@ -969,7 +965,7 @@ if get_curand_version() >= (4, 1, 0): :class:`GPUArray` of seeds. """ - super(MRG32k3aRandomNumberGenerator, self).__init__( + super().__init__( seed_getter, offset, "curandStateMRG32k3a", @@ -981,7 +977,7 @@ if get_curand_version() >= (4, 1, 0): ) def _prepare_skipahead(self): - super(MRG32k3aRandomNumberGenerator, self)._prepare_skipahead() + super()._prepare_skipahead() self.skip_ahead_subsequence = self.module.get_function( "skip_ahead_subsequence" ) @@ -1085,7 +1081,7 @@ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): generator_bits, sobol_random_source, ): - super(_SobolRandomNumberGeneratorBase, self).__init__( + super().__init__( state_type, vector_type, generator_bits, sobol_random_source ) @@ -1174,7 +1170,7 @@ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): scramble_type, sobol_random_source, ): - super(_ScrambledSobolRandomNumberGeneratorBase, self).__init__( + super().__init__( state_type, vector_type, generator_bits, sobol_random_source, scramble_type ) @@ -1249,7 +1245,7 @@ if get_curand_version() >= (3, 2, 0): """ def __init__(self, dir_vector=None, offset=0): - super(Sobol32RandomNumberGenerator, self).__init__( + super().__init__( dir_vector, np.uint32, 32, @@ -1274,7 +1270,7 @@ if get_curand_version() >= (4, 0, 0): """ def __init__(self, dir_vector=None, scramble_vector=None, offset=0): - super(ScrambledSobol32RandomNumberGenerator, self).__init__( + super().__init__( dir_vector, np.uint32, 32, @@ -1300,7 +1296,7 @@ if get_curand_version() >= (4, 0, 0): """ def __init__(self, dir_vector=None, offset=0): - super(Sobol64RandomNumberGenerator, self).__init__( + super().__init__( dir_vector, np.uint64, 64, @@ -1325,7 +1321,7 @@ if get_curand_version() >= (4, 0, 0): """ def __init__(self, dir_vector=None, scramble_vector=None, offset=0): - super(ScrambledSobol64RandomNumberGenerator, self).__init__( + super().__init__( dir_vector, np.uint64, 64, diff --git a/pycuda/debug.py b/pycuda/debug.py index fded1d1e..dee5974c 100644 --- a/pycuda/debug.py +++ b/pycuda/debug.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import -from __future__ import print_function import pycuda.driver import sys diff --git a/pycuda/driver.py b/pycuda/driver.py index c625d81b..6bfd097e 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -1,10 +1,4 @@ -from __future__ import absolute_import, print_function - import os -import sys - -import six - import numpy as np @@ -76,12 +70,8 @@ except ImportError as e: raise -if sys.version_info >= (3,): - _memoryview = memoryview - _my_bytes = bytes -else: - _memoryview = buffer - _my_bytes = str +_memoryview = memoryview +_my_bytes = bytes try: @@ -90,7 +80,7 @@ except NameError: # Provide ManagedAllocationOrStub if not on CUDA 6. # This avoids having to do a version check in a high-traffic code path below. - class ManagedAllocationOrStub(object): + class ManagedAllocationOrStub: pass @@ -124,7 +114,7 @@ class CompileError(Error): return result -class ArgumentHandler(object): +class ArgumentHandler: def __init__(self, ary): self.array = ary self.dev_alloc = None @@ -253,7 +243,7 @@ def _add_functionality(): if kwargs: raise ValueError( - "extra keyword arguments: %s" % (",".join(six.iterkeys(kwargs))) + "extra keyword arguments: %s" % (",".join(kwargs.keys())) ) if block is None: @@ -364,7 +354,7 @@ def _add_functionality(): if kwargs: raise TypeError( - "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + "unknown keyword arguments: " + ", ".join(kwargs.keys()) ) from pycuda._pvt_struct import pack @@ -396,7 +386,7 @@ def _add_functionality(): if kwargs: raise TypeError( - "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + "unknown keyword arguments: " + ", ".join(kwargs.keys()) ) from pycuda._pvt_struct import pack @@ -440,7 +430,7 @@ def _add_functionality(): if kwargs: raise TypeError( - "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + "unknown keyword arguments: " + ", ".join(kwargs.keys()) ) from pycuda._pvt_struct import pack @@ -470,7 +460,7 @@ def _add_functionality(): if kwargs: raise ValueError( - "extra keyword arguments: %s" % (",".join(six.iterkeys(kwargs))) + "extra keyword arguments: %s" % (",".join(kwargs.keys())) ) if block is None: @@ -558,7 +548,7 @@ def _add_functionality(): if kwargs: raise TypeError( - "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + "unknown keyword arguments: " + ", ".join(kwargs.keys()) ) from pycuda._pvt_struct import pack @@ -574,7 +564,7 @@ def _add_functionality(): shared_size = kwargs.pop("shared_size", 0) if kwargs: raise TypeError( - "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + "unknown keyword arguments: " + ", ".join(kwargs.keys()) ) from pycuda._pvt_struct import pack @@ -616,7 +606,7 @@ def _add_functionality(): if kwargs: raise TypeError( - "unknown keyword arguments: " + ", ".join(six.iterkeys(kwargs)) + "unknown keyword arguments: " + ", ".join(kwargs.keys()) ) from pycuda._pvt_struct import pack diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index d560e5dd..e6dd5a92 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -1,11 +1,5 @@ """Elementwise functionality.""" -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import zip - __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" __license__ = """ @@ -245,7 +239,7 @@ class ElementwiseKernel: if kwargs: raise TypeError( "invalid keyword arguments specified: " - + ", ".join(six.iterkeys(kwargs)) + + ", ".join(kwargs.keys()) ) invocation_args = [] diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 7bfc48c9..97630e3b 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1,5 +1,3 @@ -from __future__ import division, absolute_import - import numpy as np import pycuda.elementwise as elementwise from pytools import memoize, memoize_method @@ -12,11 +10,10 @@ from pycuda.compyte.array import ( get_common_dtype as _get_common_dtype_base, ) from pycuda.characterize import has_double_support -import six -from six.moves import range, zip, reduce +from functools import reduce import numbers -import six.moves.copyreg +import copyreg def _get_common_dtype(obj1, obj2): @@ -181,7 +178,7 @@ def _make_binary_op(operator): return func -class GPUArray(object): +class GPUArray: """A GPUArray is used to do array-based calculation on the GPU. This is mostly supposed to be a numpy-workalike. Operators @@ -1317,7 +1314,7 @@ def arange(*args, **kwargs): raise ValueError("too many arguments") admissible_names = ["start", "stop", "step", "dtype"] - for k, v in six.iteritems(kwargs): + for k, v in kwargs.items(): if k in admissible_names: if getattr(inf, k) is None: setattr(inf, k, v) @@ -1485,7 +1482,7 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): copy = drv.Memcpy3D() else: raise ValueError( - "more than 2 discontiguous axes not supported %s" % (tuple(sorted(axes)),) + "more than 2 discontiguous axes not supported {}".format(tuple(sorted(axes))) ) if isinstance(src, GPUArray): @@ -1528,7 +1525,7 @@ def _memcpy_discontig(dst, src, async_=False, stream=None): # {{{ pickle support -six.moves.copyreg.pickle(GPUArray, lambda data: (to_gpu, (data.get(),)), to_gpu) +copyreg.pickle(GPUArray, lambda data: (to_gpu, (data.get(),)), to_gpu) # }}} diff --git a/pycuda/reduction.py b/pycuda/reduction.py index 3ae1ce36..2651353f 100644 --- a/pycuda/reduction.py +++ b/pycuda/reduction.py @@ -1,8 +1,5 @@ """Computation of reductions on vectors.""" -from __future__ import division -from __future__ import absolute_import -from six.moves import zip __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -359,7 +356,7 @@ def get_sum_kernel(dtype_out, dtype_in): dtype_out, "0", "a+b", - arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}, + arguments="const {tp} *in".format(tp=dtype_to_ctype(dtype_in)), ) @@ -459,7 +456,7 @@ def get_minmax_kernel(what, dtype): return ReductionKernel( dtype, neutral=get_minmax_neutral(what, dtype), - reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, + reduce_expr=f"{reduce_expr}", arguments="const %(tp)s *in" % { "tp": dtype_to_ctype(dtype), @@ -482,7 +479,7 @@ def get_subset_minmax_kernel(what, dtype, dtype_subset): return ReductionKernel( dtype, neutral=get_minmax_neutral(what, dtype), - reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, + reduce_expr=f"{reduce_expr}", map_expr="in[lookup_tbl[i]]", arguments="const %(tp_lut)s *lookup_tbl, " "const %(tp)s *in" diff --git a/pycuda/scan.py b/pycuda/scan.py index 68c91f56..15e670ea 100644 --- a/pycuda/scan.py +++ b/pycuda/scan.py @@ -1,9 +1,5 @@ """Scan primitive.""" -from __future__ import division -from __future__ import absolute_import -import six - __copyright__ = """ Copyright 2011 Andreas Kloeckner Copyright 2008-2011 NVIDIA Corporation @@ -341,7 +337,7 @@ void ${name_prefix}_final_update( ) -class _ScanKernelBase(object): +class _ScanKernelBase: def __init__( self, dtype, @@ -405,7 +401,7 @@ class _ScanKernelBase(object): if output_ary is None: output_ary = input_ary - if isinstance(output_ary, (str, six.text_type)) and output_ary == "new": + if isinstance(output_ary, (str, str)) and output_ary == "new": output_ary = gpuarray.empty_like(input_ary, allocator=allocator) if input_ary.shape != output_ary.shape: diff --git a/pycuda/sparse/packeted.py b/pycuda/sparse/packeted.py index 518c089c..f2819db8 100644 --- a/pycuda/sparse/packeted.py +++ b/pycuda/sparse/packeted.py @@ -5,7 +5,6 @@ from pytools import memoize_method import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule import numpy as np -from six.moves import range PKT_KERNEL_TEMPLATE = """ diff --git a/pycuda/sparse/pkt_build.py b/pycuda/sparse/pkt_build.py index cde771ad..b3591584 100644 --- a/pycuda/sparse/pkt_build.py +++ b/pycuda/sparse/pkt_build.py @@ -1,7 +1,6 @@ from __future__ import absolute_import import numpy as np import pycuda.gpuarray as gpuarray -from six.moves import range def build_pkt_data_structure( diff --git a/pycuda/tools.py b/pycuda/tools.py index 4b7f6b5b..1a2b50f4 100644 --- a/pycuda/tools.py +++ b/pycuda/tools.py @@ -1,11 +1,5 @@ """Miscallenous helper functionality.""" -from __future__ import division, print_function -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import input - __copyright__ = "Copyright (C) 2008 Andreas Kloeckner" __license__ = """ @@ -86,7 +80,7 @@ class DebugMemoryPool(DeviceMemoryPool): description = self.describe(stack, size) histogram = {} - for bsize, descr in six.itervalues(self.blocks): + for bsize, descr in self.blocks.values(): histogram[bsize, descr] = histogram.get((bsize, descr), 0) + 1 from pytools import common_prefix @@ -105,7 +99,7 @@ class DebugMemoryPool(DeviceMemoryPool): file=self.logfile, ) - hist_items = sorted(list(six.iteritems(histogram))) + hist_items = sorted(list(histogram.items())) for (bsize, descr), count in hist_items: print( " %s (%d bytes): %dx" % (descr[len(cpfx):], bsize, count), @@ -399,7 +393,7 @@ class Argument: self.name = name def __repr__(self): - return "%s(%r, %s)" % (self.__class__.__name__, self.name, self.dtype) + return f"{self.__class__.__name__}({self.name!r}, {self.dtype})" def dtype_to_ctype(dtype, with_fp_tex_hack=False): @@ -422,14 +416,14 @@ def dtype_to_ctype(dtype, with_fp_tex_hack=False): class VectorArg(Argument): def declarator(self): - return "%s *%s" % (dtype_to_ctype(self.dtype), self.name) + return "{} *{}".format(dtype_to_ctype(self.dtype), self.name) struct_char = "P" class ScalarArg(Argument): def declarator(self): - return "%s %s" % (dtype_to_ctype(self.dtype), self.name) + return "{} {}".format(dtype_to_ctype(self.dtype), self.name) @property def struct_char(self): diff --git a/test/test_cumath.py b/test/test_cumath.py index a8f1f15c..2db96d15 100644 --- a/test/test_cumath.py +++ b/test/test_cumath.py @@ -1,9 +1,6 @@ -from __future__ import division -from __future__ import absolute_import import math import numpy as np from pycuda.tools import mark_cuda_test -from six.moves import range import pycuda.gpuarray as gpuarray diff --git a/test/test_driver.py b/test/test_driver.py index 35a52e1d..0c074efe 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -1,9 +1,7 @@ -from __future__ import division, absolute_import, print_function import numpy as np import numpy.linalg as la from pycuda.tools import mark_cuda_test, dtype_to_ctype import pytest # noqa -from six.moves import range import pycuda.gpuarray as gpuarray @@ -69,7 +67,7 @@ class TestDriver: drv.Out(dest), np.intp(a_gpu) + a.itemsize, b_gpu, block=(399, 1, 1) ) - assert la.norm((dest[:-1] - a[1:] * b[:-1])) == 0 + assert la.norm(dest[:-1] - a[1:] * b[:-1]) == 0 @mark_cuda_test def test_vector_types(self): diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 7fbc08da..fb6a20fc 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -1,12 +1,10 @@ #! /usr/bin/env python -from __future__ import absolute_import, print_function import numpy as np import numpy.linalg as la import sys from pycuda.tools import mark_cuda_test from pycuda.characterize import has_double_support -from six.moves import range import pycuda.gpuarray as gpuarray @@ -661,7 +659,7 @@ class TestGPUArray: min_a_b_gpu = gpuarray.minimum(a_gpu, b_gpu) print(max_a_b_gpu) - print((np.maximum(a, b))) + print(np.maximum(a, b)) assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0 assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0 @@ -837,7 +835,7 @@ class TestGPUArray: # using -1 as unknown dimension assert a_gpu.reshape(-1, 32).shape == (4, 32) assert a_gpu.reshape((32, -1)).shape == (32, 4) - assert a_gpu.reshape(((8, -1, 4))).shape == (8, 4, 4) + assert a_gpu.reshape((8, -1, 4)).shape == (8, 4, 4) throws_exception = False try: -- GitLab