diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c38d0153e021384d840060b03cb680bcdb56f39c..32f795f9c6b3b6560c42f0b492e9364975a78b6d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,17 +8,16 @@ on: - cron: '17 3 * * 0' jobs: - flake8: - name: Flake8 + ruff: + name: Ruff runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - # matches compat target in setup.py - python-version: '3.8' + python-version: '3.x' - name: "Main Script" run: | - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" test/*.py + pipx install ruff + ruff check diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4f4d93b8c1193a4ed134af6c20afaf393d3fa64a..cc92cb031e970e6256ed1f7c7e33f67adbc7bd27 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,17 +1,3 @@ -Python 3 Titan X: - script: - - py_version=3 - - EXTRA_INSTALL="numpy mako" - - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - - ". ./build-and-test-py-project.sh" - tags: - - python3 - - nvidia-titan-x - except: - - tags - Python 3 Titan V: script: - py_version=3 @@ -26,21 +12,6 @@ Python 3 Titan V: except: - tags -Python 3 K40: - script: - - py_version=3 - - EXTRA_INSTALL="numpy mako" - - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - - ". ./build-and-test-py-project.sh" - - tags: - - python3 - - nvidia-k40 - except: - - tags - Documentation: script: | EXTRA_INSTALL="numpy mako" @@ -50,15 +21,15 @@ Documentation: build_docs --no-check maybe_upload_docs tags: - - python3 # needs CUDA headers to compile - - nvidia-titan-x + - nvidia-titan-v + - python3 -Flake8: +Ruff: script: - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - - . 
./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test/*.py + - pipx install ruff + - ruff check tags: - - python3 + - docker-runner except: - tags diff --git a/aksetup_helper.py b/aksetup_helper.py index 57e82ebd96c75c3dab86d96431f1a3b6b3a10659..32f80cdec881142ee4bca9a7a46d42677a745d70 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -1,9 +1,14 @@ +from __future__ import annotations + import os import sys + + try: from setuptools import Extension from setuptools.command.build_ext import ( # noqa: N812 - build_ext as BaseBuildExtCommand) + build_ext as BaseBuildExtCommand, + ) except ImportError: class Extension: @@ -43,8 +48,8 @@ def setup(*args, **kwargs): def get_numpy_incpath(): - from os.path import join, dirname, exists from importlib.util import find_spec + from os.path import dirname, exists, join origin = find_spec("numpy").origin if origin is None: raise RuntimeError("origin of numpy package not found") @@ -294,7 +299,7 @@ class ConfigSchema: def read_config_from_pyfile(self, filename): result = {} filevars = {} - infile = open(filename, "r") + infile = open(filename) try: contents = infile.read() finally: @@ -313,12 +318,11 @@ class ConfigSchema: filevars = {} try: - exec(compile(open(filename, "r").read(), filename, "exec"), filevars) - except IOError: + exec(compile(open(filename).read(), filename, "exec"), filevars) + except OSError: pass - if "__builtins__" in filevars: - del filevars["__builtins__"] + filevars.pop("__builtins__", None) for key, value in config.items(): if value is not None: @@ -405,7 +409,7 @@ class ConfigSchema: result = self.get_default_config_with_files() if os.access(cfile, os.R_OK): - with open(cfile, "r") as inf: + with open(cfile) as inf: py_snippet = inf.read() self.update_from_python_snippet(result, py_snippet, cfile) @@ -438,7 +442,7 @@ class ConfigSchema: for opt in self.options) -class Option(object): +class Option: def __init__(self, name, default=None, help=None): self.name = name self.default = default @@ -734,7 +738,7 @@ def substitute(substitutions, fname): string_var_re = re.compile(r"\$str\{([A-Za-z_0-9]+)\}") fname_in = fname+".in" - with open(fname_in, "r") as inf: + with open(fname_in) as inf: lines = inf.readlines() new_lines = [] @@ -769,7 +773,7 @@ def substitute(substitutions, fname): with open(fname, "w") as outf: outf.write("".join(new_lines)) - from os import stat, chmod + from os import chmod, stat infile_stat_res = stat(fname_in) chmod(fname, infile_stat_res.st_mode) @@ -778,7 +782,7 @@ def substitute(substitutions, fname): def _run_git_command(cmd): git_error = None - from subprocess import Popen, PIPE + from subprocess import PIPE, Popen stdout = None try: popen = Popen(["git"] + cmd, stdout=PIPE) @@ -909,7 +913,7 @@ def check_pybind11(): # {{{ (modified) boilerplate from https://github.com/pybind/python_example/blob/2ed5a68759cd6ff5d2e5992a91f08616ef457b5c/setup.py # noqa -class get_pybind_include(object): # noqa: N801 +class get_pybind_include: # noqa: N801 """Helper class to determine the pybind11 include path The purpose of this class is to postpone importing pybind11 diff --git a/configure.py b/configure.py index 85c9841d9f995805d8b64bb8aae976b6ecc43ba3..c3400879412b730b92243669856f53fc8c76a51b 100755 --- a/configure.py +++ b/configure.py @@ -1,4 +1,7 @@ #! 
/usr/bin/env python3 +from __future__ import annotations from aksetup_helper import configure_frontend + + configure_frontend() diff --git a/doc/conf.py b/doc/conf.py index 5796b12c456d0a1a7dd7d907374b0e2224cdd298..8dfb883be448c29a952204253ee67bdaa09a2fba 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,5 +1,8 @@ +from __future__ import annotations + from urllib.request import urlopen + _conf_url = \ "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py" with urlopen(_conf_url) as _inf: @@ -8,10 +11,10 @@ with urlopen(_conf_url) as _inf: copyright = "2008-21, Andreas Kloeckner" ver_dic = {} +with open("../pycuda/__init__.py") as initf: + init_contents = initf.read() exec( - compile( - open("../pycuda/__init__.py").read(), "../pycuda/__init__.py", "exec" - ), + compile(init_contents, "../pycuda/__init__.py", "exec"), ver_dic, ) version = ".".join(str(x) for x in ver_dic["VERSION"]) diff --git a/examples/cai_cupy_arrays.py b/examples/cai_cupy_arrays.py index 2c524ba945a5bd61c593c1081b46a2ba1ddf080f..fbc276f2527083a20436d377c4cde7efa18f2e50 100644 --- a/examples/cai_cupy_arrays.py +++ b/examples/cai_cupy_arrays.py @@ -1,11 +1,12 @@ # Copyright 2008-2021 Andreas Kloeckner # Copyright 2021 NVIDIA Corporation +from __future__ import annotations + +import cupy as cp import pycuda.autoinit # noqa from pycuda.compiler import SourceModule -import cupy as cp - # Create a CuPy array (and a copy for comparison later) cupy_a = cp.random.randn(4, 4).astype(cp.float32) diff --git a/examples/cai_numba.py b/examples/cai_numba.py index 0a94ee48f495e2a152d219ab7aaf02b8d7c87d1c..f42df68c637590f361908485c9dbe40042235d6e 100644 --- a/examples/cai_numba.py +++ b/examples/cai_numba.py @@ -1,16 +1,14 @@ # Copyright 2008-2021 Andreas Kloeckner # Copyright 2021 NVIDIA Corporation +from __future__ import annotations +import numpy from numba import cuda -import pycuda.driver as pycuda # We use autoprimaryctx instead of autoinit because Numba can only operate on a # primary context -import pycuda.autoprimaryctx # noqa import pycuda.gpuarray as gpuarray -import numpy - # Create a PyCUDA gpuarray a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) diff --git a/examples/demo.py b/examples/demo.py index a1c404209e929a2c3126efeaf21d9efc48ca7bdb..3d49c92f5a4b68958db9667778059ac96238a447 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -1,9 +1,13 @@ # Sample source code from the Tutorial Introduction in the documentation. -import pycuda.driver as cuda +from __future__ import annotations + +import numpy + import pycuda.autoinit # noqa +import pycuda.driver as cuda from pycuda.compiler import SourceModule -import numpy + a = numpy.random.randn(4, 4) a = a.astype(numpy.float32) @@ -39,6 +43,8 @@ print(a) # part 2 ---------------------------------------------------------------------- import pycuda.gpuarray as gpuarray + + a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) a_doubled = (2*a_gpu).get() diff --git a/examples/demo_cdpSimplePrint.py b/examples/demo_cdpSimplePrint.py index 1814369f1a11056585487ad2be1e5029895256a0..f0908e243db7931282857a4a4479a68a4ff4deef 100644 --- a/examples/demo_cdpSimplePrint.py +++ b/examples/demo_cdpSimplePrint.py @@ -16,13 +16,16 @@ * is strictly prohibited. 
* --------------------------------------------------------------------------- ''' +from __future__ import annotations + +import sys -import sys, os import pycuda.autoinit import pycuda.driver as cuda from pycuda.compiler import DynamicSourceModule -cdpSimplePrint_cu = ''' + +cdpSimplePrint_cu = """ #include //////////////////////////////////////////////////////////////////////////////// @@ -66,7 +69,7 @@ __global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid { // We create a unique ID per block. Thread 0 does that and shares the value with the other threads. __shared__ int s_uid; - if( threadIdx.x == 0 ) + if( threadIdx.x == 0 ) { s_uid = atomicAdd( &g_uids, 1 ); } @@ -74,7 +77,7 @@ __global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid // We print the ID of the block and information about its parent. print_info( depth, thread, s_uid, parent_uid ); - + // We launch new blocks if we haven't reached the max_depth yet. if( ++depth >= max_depth ) { @@ -82,7 +85,8 @@ __global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid } cdp_kernel<<>>( max_depth, depth, threadIdx.x, s_uid ); } -''' +""" + def main(argv): max_depth = 2 @@ -96,7 +100,7 @@ def main(argv): print("starting Simple Print (CUDA Dynamic Parallelism)") mod = DynamicSourceModule(cdpSimplePrint_cu) - cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call + cdp_kernel = mod.get_function("cdp_kernel").prepare("iiii").prepared_call print("***************************************************************************") print("The CPU launches 2 blocks of 2 threads each. On the device each thread will") @@ -104,7 +108,7 @@ def main(argv): print("until it reaches max_depth=%d\n" % max_depth) print("In total 2") num_blocks, sum = 2, 2 - for i in range(1, max_depth): + for _i in range(1, max_depth): num_blocks *= 4 print("+%d" % num_blocks) sum += num_blocks @@ -115,7 +119,8 @@ def main(argv): pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH, max_depth) print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n") - cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1) + cdp_kernel((2, 1), (2, 1, 1), max_depth, 0, 0, -1) + if __name__ == "__main__": main(sys.argv) diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py index 9baadc13a307d04d1c27b4c18acff5b50d0dbe65..8c361332baecd470dd1b42e13d31d8d1d1f5aa00 100644 --- a/examples/demo_elementwise.py +++ b/examples/demo_elementwise.py @@ -1,19 +1,24 @@ -import pycuda.gpuarray as gpuarray -import pycuda.autoinit +from __future__ import annotations + import numpy + +import pycuda.gpuarray as gpuarray from pycuda.curandom import rand as curand + a_gpu = curand((50,)) b_gpu = curand((50,)) from pycuda.elementwise import ElementwiseKernel + + lin_comb = ElementwiseKernel( "float a, float *x, float b, float *y, float *z", "z[i] = my_f(a*x[i], b*y[i])", "linear_combination", preamble=""" __device__ float my_f(float x, float y) - { + { return sin(x*y); } """) @@ -22,4 +27,6 @@ c_gpu = gpuarray.empty_like(a_gpu) lin_comb(5, a_gpu, 6, b_gpu, c_gpu) import numpy.linalg as la + + assert la.norm(c_gpu.get() - numpy.sin((5*a_gpu*6*b_gpu).get())) < 1e-5 diff --git a/examples/demo_meta_codepy.py b/examples/demo_meta_codepy.py index ddd89f5afafc891b75649854bb5bc2f72b5e00d4..e3ac5104da4d675187d12e2689d20b1dd151694d 100644 --- a/examples/demo_meta_codepy.py +++ b/examples/demo_meta_codepy.py @@ -1,9 +1,12 @@ -import pycuda.driver as cuda -import pycuda.autoinit +from __future__ import 
annotations + import numpy import numpy.linalg as la + +import pycuda.driver as cuda from pycuda.compiler import SourceModule + thread_strides = 16 block_size = 256 macroblock_count = 33 @@ -18,38 +21,46 @@ a_gpu = cuda.to_device(a) b_gpu = cuda.to_device(b) c_gpu = cuda.mem_alloc(a.nbytes) -from cgen import FunctionBody, \ - FunctionDeclaration, POD, Value, \ - Pointer, Module, Block, Initializer, Assign +from cgen import ( + POD, + Assign, + Block, + FunctionBody, + FunctionDeclaration, + Initializer, + Module, + Pointer, + Value, +) from cgen.cuda import CudaGlobal + mod = Module([ FunctionBody( CudaGlobal(FunctionDeclaration( Value("void", "add"), - arg_decls=[Pointer(POD(dtype, name)) + arg_decls=[Pointer(POD(dtype, name)) for name in ["tgt", "op1", "op2"]])), Block([ Initializer( POD(numpy.int32, "idx"), - "threadIdx.x + %d*blockIdx.x" + "threadIdx.x + %d*blockIdx.x" % (block_size*thread_strides)), ]+[ Assign( "tgt[idx+%d]" % (o*block_size), "op1[idx+%d] + op2[idx+%d]" % ( - o*block_size, + o*block_size, o*block_size)) for o in range(thread_strides)]))]) mod = SourceModule(mod) func = mod.get_function("add") -func(c_gpu, a_gpu, b_gpu, - block=(block_size,1,1), - grid=(macroblock_count,1)) +func(c_gpu, a_gpu, b_gpu, + block=(block_size, 1, 1), + grid=(macroblock_count, 1)) c = cuda.from_device_like(c_gpu, a) assert la.norm(c-(a+b)) == 0 - diff --git a/examples/demo_meta_template.py b/examples/demo_meta_template.py index 4093434d81c1e66b31d99ec62944d0352c64b09d..d8ffeb79b9bb9ff98dd766b4a8977cef3e3b67be 100644 --- a/examples/demo_meta_template.py +++ b/examples/demo_meta_template.py @@ -1,9 +1,12 @@ -import pycuda.driver as cuda -import pycuda.autoinit +from __future__ import annotations + import numpy import numpy.linalg as la + +import pycuda.driver as cuda from pycuda.compiler import SourceModule + thread_strides = 16 block_size = 32 macroblock_count = 33 @@ -20,20 +23,21 @@ c_gpu = cuda.mem_alloc(a.nbytes) from jinja2 import Template + tpl = Template(""" __global__ void add( - {{ type_name }} *tgt, - {{ type_name }} *op1, + {{ type_name }} *tgt, + {{ type_name }} *op1, {{ type_name }} *op2) { - int idx = threadIdx.x + + int idx = threadIdx.x + {{ block_size }} * {{thread_strides}} * blockIdx.x; {% for i in range(thread_strides) %} {% set offset = i*block_size %} - tgt[idx + {{ offset }}] = - op1[idx + {{ offset }}] + tgt[idx + {{ offset }}] = + op1[idx + {{ offset }}] + op2[idx + {{ offset }}]; {% endfor %} }""") @@ -46,9 +50,9 @@ mod = SourceModule(rendered_tpl) # end func = mod.get_function("add") -func(c_gpu, a_gpu, b_gpu, - block=(block_size,1,1), - grid=(macroblock_count,1)) +func(c_gpu, a_gpu, b_gpu, + block=(block_size, 1, 1), + grid=(macroblock_count, 1)) c = cuda.from_device_like(c_gpu, a) diff --git a/examples/demo_struct.py b/examples/demo_struct.py index 37f890604f903188ecd0b3142d88cdd95f1c0077..a273e0e452e430d9ad47edd746b1beb44b267521 100644 --- a/examples/demo_struct.py +++ b/examples/demo_struct.py @@ -1,12 +1,17 @@ # prepared invocations and structures ----------------------------------------- -import pycuda.driver as cuda -import pycuda.autoinit -import numpy +from __future__ import annotations + import struct + +import numpy + +import pycuda.driver as cuda from pycuda.compiler import SourceModule + class DoubleOpStruct: mem_size = 8 + numpy.uintp(0).nbytes + def __init__(self, array, struct_arr_ptr): self.data = cuda.to_device(array) self.shape, self.dtype = array.shape, array.dtype @@ -17,6 +22,7 @@ class DoubleOpStruct: def __str__(self): return 
str(cuda.from_device(self.data, self.shape, self.dtype)) + struct_arr = cuda.mem_alloc(2 * DoubleOpStruct.mem_size) do2_ptr = int(struct_arr) + DoubleOpStruct.mem_size diff --git a/examples/dump_properties.py b/examples/dump_properties.py index a960619cf706ca1a7e912636238bfedc90d57384..70c7131931c2d47a2e71da438d4847cc41aac4f9 100644 --- a/examples/dump_properties.py +++ b/examples/dump_properties.py @@ -1,5 +1,6 @@ -import pycuda.driver as drv +from __future__ import annotations +import pycuda.driver as drv drv.init() @@ -10,10 +11,9 @@ for ordinal in range(drv.Device.count()): print("Device #%d: %s" % (ordinal, dev.name())) print(" Compute Capability: %d.%d" % dev.compute_capability()) print(" Total Memory: %s KB" % (dev.total_memory()//(1024))) - atts = [(str(att), value) + atts = [(str(att), value) for att, value in list(dev.get_attributes().items())] atts.sort() - + for att, value in atts: print(f" {att}: {value}") - diff --git a/examples/fill_gpu_with_nans.py b/examples/fill_gpu_with_nans.py index ffa65c449e7ae1adb5749591a88a6707213785db..928b513390cbf1fe608d679b2a75a9a794bc9940 100644 --- a/examples/fill_gpu_with_nans.py +++ b/examples/fill_gpu_with_nans.py @@ -1,8 +1,11 @@ -import pycuda.autoinit -import pycuda.gpuarray as gpuarray -import pycuda.driver as cuda +from __future__ import annotations + import numpy +import pycuda.driver as cuda +import pycuda.gpuarray as gpuarray + + free_bytes, total_bytes = cuda.mem_get_info() exp = 10 while True: @@ -22,4 +25,3 @@ while True: ary.fill(float("nan")) print("filled %d out of %d bytes with NaNs" % (fill_floats*4, free_bytes)) - diff --git a/examples/from-wiki/2d_fft.py b/examples/from-wiki/2d_fft.py index 952d96eb24e519540ac5c2dfea0a4726019b5e28..07dd2875b5a5de04d50bc7c9c5377f9d93eea007 100644 --- a/examples/from-wiki/2d_fft.py +++ b/examples/from-wiki/2d_fft.py @@ -1,14 +1,14 @@ -#!python -import numpy -import scipy.misc -import numpy.fft as nfft +#!python +from __future__ import annotations + import multiprocessing +import numpy +import scipy.misc from pyfft.cuda import Plan -from pycuda.tools import make_default_context -import pycuda.tools as pytools -import pycuda.gpuarray as garray + import pycuda.driver as drv +import pycuda.gpuarray as garray class GPUMulti(multiprocessing.Process): @@ -20,15 +20,15 @@ class GPUMulti(multiprocessing.Process): def run(self): drv.init() - a0=numpy.zeros((p,),dtype=numpy.complex64) + a0 = numpy.zeros((p,), dtype=numpy.complex64) self.dev = drv.Device(self.number) self.ctx = self.dev.make_context() -#TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION (THIS DOES NOT HAPPEN IN MULTITHREADING) +# TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION (THIS DOES NOT HAPPEN IN MULTITHREADING) print(drv.mem_get_info()) self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64) self.gpu_b = garray.zeros_like(self.gpu_a) self.gpu_a = garray.to_gpu(self.input_cpu) - plan = Plan(a0.shape,context=self.ctx) + plan = Plan(a0.shape, context=self.ctx) plan.execute(self.gpu_a, self.gpu_b, batch=p/m) self.temp = self.gpu_b.get() self.output_cpu.put(self.temp) @@ -38,56 +38,55 @@ class GPUMulti(multiprocessing.Process): del self.gpu_b del self.ctx - print("till the end %d" %self.number) + print("till the end %d" % self.number) -p = 8192; # INPUT IMAGE SIZE (8192 * 8192) +p = 8192 # INPUT IMAGE SIZE (8192 * 8192) m = 4 # TO DIVIDE THE INPUT IMAGE INTO 4* (2048 * 8192) SIZED IMAGES (Depends on the total memory of your GPU) -trans = 2 # FOR TRANSPOSE-SPLIT (TS) ALGORITHM WHICH loops 
2 times +trans = 2 # FOR TRANSPOSE-SPLIT (TS) ALGORITHM WHICH loops 2 times -#INPUT IMAGE (GENERATE A 2d SINE WAVE PATTERN) -p_n = 8000 # No. OF PERIODS OF SINE WAVES -x=numpy.arange(0,p_n,float(p_n)/float(p)) +# INPUT IMAGE (GENERATE A 2d SINE WAVE PATTERN) +p_n = 8000 # No. OF PERIODS OF SINE WAVES +x = numpy.arange(0, p_n, float(p_n)/float(p)) a_i = 128 + 128 * numpy.sin(2*numpy.pi*x) -a2 = numpy.zeros([p,p],dtype=numpy.complex64) -a2[::]=a_i -scipy.misc.imsave("sine.bmp",numpy.absolute(a2)) #TEST THE GENERATION OF INPUT IMAGE +a2 = numpy.zeros([p, p], dtype=numpy.complex64) +a2[::] = a_i +scipy.misc.imsave("sine.bmp", numpy.absolute(a2)) # TEST THE GENERATION OF INPUT IMAGE -#INITIALISE THE VARIABLES -a2_1 = numpy.zeros([m,p*p/m],dtype = numpy.complex64) #INPUT TO THE GPU (1d ARRAY) -#VERY IMPORTANT -output_cpu = multiprocessing.Queue() #STORE RESULT IN GPU (MULTIPROCESSING DOES NOT ALLOW SHARING AND HENCE THIS IS NEEDED FOR COMMUNICATION OF DATA) +# INITIALISE THE VARIABLES +a2_1 = numpy.zeros([m, p*p/m], dtype=numpy.complex64) # INPUT TO THE GPU (1d ARRAY) +# VERY IMPORTANT +output_cpu = multiprocessing.Queue() # STORE RESULT IN GPU (MULTIPROCESSING DOES NOT ALLOW SHARING AND HENCE THIS IS NEEDED FOR COMMUNICATION OF DATA) -b2pa = numpy.zeros([p/m,p,m],dtype = numpy.complex64) #OUTPUT FROM GPU -b2_a = numpy.zeros([p,p],dtype = numpy.complex64) #RESHAPED (8192*8192) OUTPUT +b2pa = numpy.zeros([p/m, p, m], dtype=numpy.complex64) # OUTPUT FROM GPU +b2_a = numpy.zeros([p, p], dtype=numpy.complex64) # RESHAPED (8192*8192) OUTPUT -#NOW WE ARE READY TO KICK START THE GPU +# NOW WE ARE READY TO KICK START THE GPU # THE NO OF GPU'S PRESENT (CHANGE ACCORDING TO THE No.OF GPUS YOU HAVE) -num = 2 # I KNOW THIS IS A BAD PRACTISE, BUT I COUNDN'T FIND ANY OTHER WAY(INIT CANNOT BE USED HERE) +num = 2 # I KNOW THIS IS A BAD PRACTISE, BUT I COUNDN'T FIND ANY OTHER WAY(INIT CANNOT BE USED HERE) -#THE TRANSPOSE-SPLIT ALGORITHM FOR FFT -for t in range (0,trans): - for i in range (m): - a2_1[i,:] = a2[i*p/m:(i+1)*p/m,:].flatten()#DIVIDE AND RESHAPE THE INPUT IMAGE INTO 1D ARRAY +# THE TRANSPOSE-SPLIT ALGORITHM FOR FFT +for _t in range(0, trans): + for i in range(m): + a2_1[i, :] = a2[i*p/m:(i+1)*p/m, :].flatten() # DIVIDE AND RESHAPE THE INPUT IMAGE INTO 1D ARRAY - for j in range (m/num): + for j in range(m/num): gpu_multi_list = [] -#CREATE AND START THE MULTIPROCESS - for i in range (num): - gpu_multi = GPUMulti(i,a2_1[i+j*num,:],output_cpu) #FEED THE DATA INTO THE GPU +# CREATE AND START THE MULTIPROCESS + for i in range(num): + gpu_multi = GPUMulti(i, a2_1[i+j*num, :], output_cpu) # FEED THE DATA INTO THE GPU gpu_multi_list.append(gpu_multi) - gpu_multi.start()#THERE YOU GO + gpu_multi.start() # THERE YOU GO -#COLLECT THE OUTPUT FROM THE RUNNING MULTIPROCESS AND RESHAPE +# COLLECT THE OUTPUT FROM THE RUNNING MULTIPROCESS AND RESHAPE for gpu_pro in gpu_multi_list: temp_b2_1 = output_cpu.get(gpu_pro) - b2pa[:,:,gpu_pro.number+j*num] = numpy.reshape(temp_b2_1,(p/m,p)) + b2pa[:, :, gpu_pro.number+j*num] = numpy.reshape(temp_b2_1, (p/m, p)) gpu_multi.terminate() -#RESHAPE AGAIN TO (8192 * 8192) IMAGE +# RESHAPE AGAIN TO (8192 * 8192) IMAGE for i in range(m): - b2_a[i*p/m:(i+1)*p/m,:] = b2pa[:,:,i] - + b2_a[i*p/m:(i+1)*p/m, :] = b2pa[:, :, i] diff --git a/examples/from-wiki/2dfft.py b/examples/from-wiki/2dfft.py deleted file mode 100644 index 67585107019452911a3233ecaecd49d8ba7492b5..0000000000000000000000000000000000000000 --- a/examples/from-wiki/2dfft.py +++ /dev/null @@ -1,5 +0,0 @@ -#!python -# Paste 
code for your example here. - -Note: please leave the '#!python' marker in place above. The script 'examples/download-examples-from-wiki.py' in the PyCUDA distribution relies on it. - diff --git a/examples/from-wiki/arithmetic_example.py b/examples/from-wiki/arithmetic_example.py index 3bf5e7143df06da11b17c5319f159fe98d7fb6cb..c70023d0b51fd8442946b588e0eec2c27131ac5e 100644 --- a/examples/from-wiki/arithmetic_example.py +++ b/examples/from-wiki/arithmetic_example.py @@ -1,19 +1,22 @@ -#!python #!python +# !python +from __future__ import annotations + +import numpy as np + import pycuda.driver as cuda -import pycuda.autoinit -from pycuda.compiler import SourceModule import pycuda.gpuarray as gpuarray -import numpy as np +from pycuda.compiler import SourceModule + -# Converting the list into numpy array for faster access and putting it into the GPU for processing... +# Converting the list into numpy array for faster access and putting it into the GPU for processing... start = cuda.Event() end = cuda.Event() N = 222341 values = np.random.randn(N) -number_of_blocks=N/1024 +number_of_blocks = N/1024 # Calculating the (value-max)/max-min computation and storing it in a numpy array. Pre-calculating the maximum and minimum values. @@ -32,7 +35,7 @@ if (idx < N) } """, no_extern_c=1) -func = func_mod.get_function('func') +func = func_mod.get_function("func") x = np.asarray(values, np.float32) x_gpu = gpuarray.to_gpu(x) h_minval = np.float32(0) @@ -40,16 +43,13 @@ h_denom = np.int32(255) start.record() # a function to the GPU to calculate the computation in the GPU. -func(x_gpu.gpudata, np.uint32(N), np.float32(h_minval), np.uint32(h_denom), block=(1024, 1, 1), grid=(number_of_blocks+1,1,1)) -end.record() +func(x_gpu.gpudata, np.uint32(N), np.float32(h_minval), np.uint32(h_denom), block=(1024, 1, 1), grid=(number_of_blocks+1, 1, 1)) +end.record() end.synchronize() secs = start.time_till(end)*1e-3 print("SourceModule time") print("%fs" % (secs)) -print('x: ', x[N-1]) -print('Func(x): ', x_gpu.get()[N-1],'Actual: ',(values[N-1]-0)/(h_denom)) -x_colors=x_gpu.get() - - - +print("x: ", x[N-1]) +print("Func(x): ", x_gpu.get()[N-1], "Actual: ", (values[N-1]-0)/(h_denom)) +x_colors = x_gpu.get() diff --git a/examples/from-wiki/c++_function_templates.py b/examples/from-wiki/c++_function_templates.py index b25d122d057b69eb57e2556a3ac179431fd3f857..61fcfd498274d2752eafc1e9967f721868609567 100644 --- a/examples/from-wiki/c++_function_templates.py +++ b/examples/from-wiki/c++_function_templates.py @@ -1,10 +1,12 @@ -#!python -import pycuda.gpuarray as gpuarray -import pycuda.driver as drv -import pycuda.autoinit +#!python +from __future__ import annotations + import numpy as np +import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule + + func_mod = SourceModule(""" template __device__ T incr(T x) { @@ -23,7 +25,7 @@ extern "C" { } """, no_extern_c=1) -func = func_mod.get_function('func') +func = func_mod.get_function("func") N = 5 x = np.asarray(np.random.rand(N), np.float32) @@ -31,6 +33,5 @@ x_orig = x.copy() x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(N), block=(N, 1, 1)) -print('x: ', x) -print('incr(x): ', x_gpu.get()) - +print("x: ", x) +print("incr(x): ", x_gpu.get()) diff --git a/examples/from-wiki/computing.py b/examples/from-wiki/computing.py deleted file mode 100644 index 67585107019452911a3233ecaecd49d8ba7492b5..0000000000000000000000000000000000000000 --- a/examples/from-wiki/computing.py +++ /dev/null @@ -1,5 +0,0 @@ -#!python -# Paste code for your example here. 
- -Note: please leave the '#!python' marker in place above. The script 'examples/download-examples-from-wiki.py' in the PyCUDA distribution relies on it. - diff --git a/examples/from-wiki/convolution.py b/examples/from-wiki/convolution.py index 09140abd4aee017b657b0765e3036c0de8e1465e..84db43a84d1e4dc73f38497e4066fc7b9f3e032d 100644 --- a/examples/from-wiki/convolution.py +++ b/examples/from-wiki/convolution.py @@ -1,4 +1,4 @@ -#!python +#!python ''' /* * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. @@ -42,12 +42,15 @@ Ported to pycuda by Andrew Wagner , June 2009. ''' +from __future__ import annotations + +import string import numpy -import pycuda.autoinit + import pycuda.driver as cuda from pycuda.compiler import SourceModule -import string + # Pull out a bunch of stuff that was hard coded as pre-processor directives used # by both the kernel and calling code. @@ -58,7 +61,7 @@ ROW_TILE_W = 128 KERNEL_RADIUS_ALIGNED = 16 COLUMN_TILE_W = 16 COLUMN_TILE_H = 48 -template = ''' +template = """ //24-bit multiplication is faster on G80, //but we must be sure to multiply integers //only within [-8M, 8M - 1] range @@ -139,20 +142,20 @@ __global__ void convolutionRowGPU( if(writePos <= tileEndClamped){ const int smemPos = writePos - apronStart; float sum = 0; -''' -originalLoop = ''' +""" +originalLoop = """ for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) sum += data[smemPos + k] * d_Kernel_rows[KERNEL_RADIUS - k]; -''' -unrolledLoop = '' -for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1): +""" +unrolledLoop = "" +for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1): loopTemplate = string.Template( - 'sum += data[smemPos + $k] * d_Kernel_rows[KERNEL_RADIUS - $k];\n') + "sum += data[smemPos + $k] * d_Kernel_rows[KERNEL_RADIUS - $k];\n") unrolledLoop += loopTemplate.substitute(k=k) -#print unrolledLoop +# print unrolledLoop template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop -template += ''' +template += """ d_Result[rowStart + writePos] = sum; //d_Result[rowStart + writePos] = 128; } @@ -212,40 +215,41 @@ __global__ void convolutionColumnGPU( //Calculate and output the results for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y){ float sum = 0; -''' -originalLoop = ''' +""" +originalLoop = """ for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) sum += data[smemPos + IMUL(k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - k]; -''' -unrolledLoop = '' -for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1): - loopTemplate = string.Template('sum += data[smemPos + IMUL($k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - $k];\n') +""" +unrolledLoop = "" +for k in range(-KERNEL_RADIUS, KERNEL_RADIUS+1): + loopTemplate = string.Template("sum += data[smemPos + IMUL($k, COLUMN_TILE_W)] * d_Kernel_columns[KERNEL_RADIUS - $k];\n") unrolledLoop += loopTemplate.substitute(k=k) -#print unrolledLoop +# print unrolledLoop template += unrolledLoop if UNROLL_INNER_LOOP else originalLoop -template += ''' +template += """ d_Result[gmemPos] = sum; //d_Result[gmemPos] = 128; smemPos += smemStride; gmemPos += gmemStride; } } -''' +""" template = string.Template(template) -code = template.substitute(KERNEL_RADIUS = KERNEL_RADIUS, - KERNEL_W = KERNEL_W, +code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS, + KERNEL_W=KERNEL_W, COLUMN_TILE_H=COLUMN_TILE_H, COLUMN_TILE_W=COLUMN_TILE_W, ROW_TILE_W=ROW_TILE_W, KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED) module = SourceModule(code) -convolutionRowGPU = module.get_function('convolutionRowGPU') -convolutionColumnGPU = 
module.get_function('convolutionColumnGPU') -d_Kernel_rows = module.get_global('d_Kernel_rows')[0] -d_Kernel_columns = module.get_global('d_Kernel_columns')[0] +convolutionRowGPU = module.get_function("convolutionRowGPU") +convolutionColumnGPU = module.get_function("convolutionColumnGPU") +d_Kernel_rows = module.get_global("d_Kernel_rows")[0] +d_Kernel_columns = module.get_global("d_Kernel_columns")[0] + # Helper functions for computing alignment... def iDivUp(a, b): @@ -254,11 +258,13 @@ def iDivUp(a, b): b = numpy.int32(b) return (a / b + 1) if (a % b != 0) else (a / b) + def iDivDown(a, b): # Round a / b to nearest lower integer value a = numpy.int32(a) b = numpy.int32(b) - return a / b; + return a / b + def iAlignUp(a, b): # Align a to nearest higher multiple of b @@ -266,31 +272,34 @@ def iAlignUp(a, b): b = numpy.int32(b) return (a - a % b + b) if (a % b != 0) else a + def iAlignDown(a, b): # Align a to nearest lower multiple of b a = numpy.int32(a) b = numpy.int32(b) return a - a % b -def gaussian_kernel(width = KERNEL_W, sigma = 4.0): - assert width == numpy.floor(width), 'argument width should be an integer!' + +def gaussian_kernel(width=KERNEL_W, sigma=4.0): + assert width == numpy.floor(width), "argument width should be an integer!" radius = (width - 1)/2.0 - x = numpy.linspace(-radius, radius, width) + x = numpy.linspace(-radius, radius, width) x = numpy.float32(x) sigma = numpy.float32(sigma) filterx = x*x / (2 * sigma * sigma) filterx = numpy.exp(-1 * filterx) - assert filterx.sum()>0, 'something very wrong if gaussian kernel sums to zero!' + assert filterx.sum() > 0, "something very wrong if gaussian kernel sums to zero!" filterx /= filterx.sum() return filterx -def derivative_of_gaussian_kernel(width = KERNEL_W, sigma = 4): - assert width == numpy.floor(width), 'argument width should be an integer!' + +def derivative_of_gaussian_kernel(width=KERNEL_W, sigma=4): + assert width == numpy.floor(width), "argument width should be an integer!" radius = (width - 1)/2.0 - x = numpy.linspace(-radius, radius, width) + x = numpy.linspace(-radius, radius, width) x = numpy.float32(x) # The derivative of a gaussian is really just a gaussian times x, up to scale. - filterx = gaussian_kernel(width, sigma) + filterx = gaussian_kernel(width, sigma) filterx *= x # Rescale so that filter returns derivative of 1 when applied to x: scale = (x * filterx).sum() @@ -299,43 +308,45 @@ def derivative_of_gaussian_kernel(width = KERNEL_W, sigma = 4): filterx *= -1.0 return filterx + def test_derivative_of_gaussian_kernel(): width = 20 sigma = 10.0 - filterx = derivative_of_gaussian_kernel(width, sigma) + filterx = derivative_of_gaussian_kernel(width, sigma) x = 2 * numpy.arange(0, width) x = numpy.float32(x) response = (filter * x).sum() - assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!' + assert abs(response - (-2.0)) < .0001, "derivative of gaussian failed scale test!" width = 19 sigma = 10.0 - filterx = derivative_of_gaussian_kernel(width, sigma) + filterx = derivative_of_gaussian_kernel(width, sigma) x = 2 * numpy.arange(0, width) x = numpy.float32(x) response = (filterx * x).sum() - assert abs(response - (-2.0)) < .0001, 'derivative of gaussian failed scale test!' + assert abs(response - (-2.0)) < .0001, "derivative of gaussian failed scale test!" + -def convolution_cuda(sourceImage, filterx, filtery): +def convolution_cuda(sourceImage, filterx, filtery): # Perform separable convolution on sourceImage using CUDA. 
# Operates on floating point images with row-major storage. destImage = sourceImage.copy() - assert sourceImage.dtype == 'float32', 'source image must be float32' - (imageHeight, imageWidth) = sourceImage.shape - assert filterx.shape == filtery.shape == (KERNEL_W, ) , 'Kernel is compiled for a different kernel size! Try changing KERNEL_W' + assert sourceImage.dtype == "float32", "source image must be float32" + (imageHeight, imageWidth) = sourceImage.shape + assert filterx.shape == filtery.shape == (KERNEL_W, ), "Kernel is compiled for a different kernel size! Try changing KERNEL_W" filterx = numpy.float32(filterx) filtery = numpy.float32(filtery) - DATA_W = iAlignUp(imageWidth, 16); - DATA_H = imageHeight; - BYTES_PER_WORD = 4; # 4 for float32 - DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD; - KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD; + DATA_W = iAlignUp(imageWidth, 16) + DATA_H = imageHeight + BYTES_PER_WORD = 4 # 4 for float32 + DATA_W * DATA_H * BYTES_PER_WORD + KERNEL_W * BYTES_PER_WORD # Prepare device arrays destImage_gpu = cuda.mem_alloc_like(destImage) sourceImage_gpu = cuda.mem_alloc_like(sourceImage) intermediateImage_gpu = cuda.mem_alloc_like(sourceImage) cuda.memcpy_htod(sourceImage_gpu, sourceImage) - cuda.memcpy_htod(d_Kernel_rows, filterx) # The kernel goes into constant memory via a symbol defined in the kernel - cuda.memcpy_htod(d_Kernel_columns, filtery) + cuda.memcpy_htod(d_Kernel_rows, filterx) # The kernel goes into constant memory via a symbol defined in the kernel + cuda.memcpy_htod(d_Kernel_columns, filtery) # Call the kernels for convolution in each direction. blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H) blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H)) @@ -343,32 +354,33 @@ def convolution_cuda(sourceImage, filterx, filtery): threadBlockColumns = (COLUMN_TILE_W, 8, 1) DATA_H = numpy.int32(DATA_H) DATA_W = numpy.int32(DATA_W) - grid_rows = tuple([int(e) for e in blockGridRows]) - block_rows = tuple([int(e) for e in threadBlockRows]) - grid_cols = tuple([int(e) for e in blockGridColumns]) - block_cols = tuple([int(e) for e in threadBlockColumns]) - convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H, grid=grid_rows, block=block_rows) - convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H, numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]), numpy.int32(DATA_W * threadBlockColumns[1]), grid=grid_cols, block=block_cols) + grid_rows = tuple(int(e) for e in blockGridRows) + block_rows = tuple(int(e) for e in threadBlockRows) + grid_cols = tuple(int(e) for e in blockGridColumns) + block_cols = tuple(int(e) for e in threadBlockColumns) + convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H, grid=grid_rows, block=block_rows) + convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H, numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]), numpy.int32(DATA_W * threadBlockColumns[1]), grid=grid_cols, block=block_cols) # Pull the data back from the GPU. - cuda.memcpy_dtoh(destImage, destImage_gpu) + cuda.memcpy_dtoh(destImage, destImage_gpu) return destImage + def test_convolution_cuda(): # Test the convolution kernel. # Generate or load a test image - original = numpy.random.rand(768, 1024) * 255 + original = numpy.random.rand(768, 1024) * 255 original = numpy.float32(original) # You probably want to display the image using the tool of your choice here. 
filterx = gaussian_kernel() destImage = original.copy() destImage[:] = numpy.nan - destImage = convolution_cuda(original, filterx, filterx) + destImage = convolution_cuda(original, filterx, filterx) # You probably want to display the result image using the tool of your choice here. - print('Done running the convolution kernel!') + print("Done running the convolution kernel!") -if __name__ == '__main__': - test_convolution_cuda() - #test_derivative_of_gaussian_kernel() - boo = input('Pausing so you can look at results... to finish...') +if __name__ == "__main__": + test_convolution_cuda() + # test_derivative_of_gaussian_kernel() + boo = input("Pausing so you can look at results... to finish...") diff --git a/examples/from-wiki/demo_complex.py b/examples/from-wiki/demo_complex.py index a89a36b9bd0be3c2f705d14353d0b56743ab98a0..a704f4f1aae8392e4d98af4ef0ec421dbb047fda 100644 --- a/examples/from-wiki/demo_complex.py +++ b/examples/from-wiki/demo_complex.py @@ -1,9 +1,10 @@ -#!python -import pycuda.driver as cuda -import pycuda.gpuarray as gpuarray -import pycuda.autoinit +#!python +from __future__ import annotations + import numpy -from pycuda.curandom import rand as curand + +import pycuda.gpuarray as gpuarray + a = (numpy.random.randn(400) +1j*numpy.random.randn(400)).astype(numpy.complex64) @@ -14,6 +15,8 @@ a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) from pycuda.elementwise import ElementwiseKernel + + complex_mul = ElementwiseKernel( "pycuda::complex *x, pycuda::complex *y, pycuda::complex *z", "z[i] = x[i] * y[i]", @@ -24,8 +27,8 @@ c_gpu = gpuarray.empty_like(a_gpu) complex_mul(a_gpu, b_gpu, c_gpu) import numpy.linalg as la + + error = la.norm(c_gpu.get() - (a*b)) print(error) assert error < 1e-5 - - diff --git a/examples/from-wiki/demo_meta_cgen.py b/examples/from-wiki/demo_meta_cgen.py index 51e1d737da1f0d46e935464e735146aa0fb6fef6..2a44f83c6418bb75e4352b12a826964da16535cc 100644 --- a/examples/from-wiki/demo_meta_cgen.py +++ b/examples/from-wiki/demo_meta_cgen.py @@ -1,10 +1,13 @@ -#!python -import pycuda.driver as cuda -import pycuda.autoinit +#!python +from __future__ import annotations + import numpy import numpy.linalg as la + +import pycuda.driver as cuda from pycuda.compiler import SourceModule + thread_strides = 16 block_size = 256 macroblock_count = 33 @@ -19,40 +22,46 @@ a_gpu = cuda.to_device(a) b_gpu = cuda.to_device(b) c_gpu = cuda.mem_alloc(a.nbytes) -from cgen import FunctionBody, \ - FunctionDeclaration, Typedef, POD, Value, \ - Pointer, Module, Block, Initializer, Assign +from cgen import ( + POD, + Assign, + Block, + FunctionBody, + FunctionDeclaration, + Initializer, + Module, + Pointer, + Value, +) from cgen.cuda import CudaGlobal + mod = Module([ FunctionBody( CudaGlobal(FunctionDeclaration( Value("void", "add"), - arg_decls=[Pointer(POD(dtype, name)) + arg_decls=[Pointer(POD(dtype, name)) for name in ["tgt", "op1", "op2"]])), Block([ Initializer( POD(numpy.int32, "idx"), - "threadIdx.x + %d*blockIdx.x" + "threadIdx.x + %d*blockIdx.x" % (block_size*thread_strides)), ]+[ Assign( "tgt[idx+%d]" % (o*block_size), "op1[idx+%d] + op2[idx+%d]" % ( - o*block_size, + o*block_size, o*block_size)) for o in range(thread_strides)]))]) mod = SourceModule(mod) func = mod.get_function("add") -func(c_gpu, a_gpu, b_gpu, - block=(block_size,1,1), - grid=(macroblock_count,1)) +func(c_gpu, a_gpu, b_gpu, + block=(block_size, 1, 1), + grid=(macroblock_count, 1)) c = cuda.from_device_like(c_gpu, a) assert la.norm(c-(a+b)) == 0 - - - diff --git 
a/examples/from-wiki/demo_meta_matrixmul_cheetah.py b/examples/from-wiki/demo_meta_matrixmul_cheetah.py index 196edbbe3aca07f7d72f3d214feefc94bd60288f..ef80a627c3c1398b0bdeb213beef0428a767d8ec 100644 --- a/examples/from-wiki/demo_meta_matrixmul_cheetah.py +++ b/examples/from-wiki/demo_meta_matrixmul_cheetah.py @@ -1,20 +1,22 @@ -#!python -#!/usr/bin/env python +#!python +# !/usr/bin/env python # -*- coding: utf-8 -*- -""" -PyCuda Optimized Matrix Multiplication +""" +PyCuda Optimized Matrix Multiplication Template Meta-programming Example using Cheetah (modified from SciPy09 Advanced Tutorial) """ # ------------------------------------------------------------------------------ +from __future__ import annotations + import numpy as np -from pycuda import driver, compiler, gpuarray, tools from Cheetah.Template import Template -import pycuda.autoinit +from pycuda import compiler, gpuarray + # -- default parameters DEFAULT_BLOCK_SIZE = 16 @@ -24,29 +26,32 @@ DEFAULT_SPILL = False DEFAULT_PREFETCH = False from os import path + + MYPATH = path.dirname(path.abspath(__file__)) TEMPLATE_FILENAME = path.join(MYPATH, "demo_meta_matrixmul_cheetah.template.cu") + # ------------------------------------------------------------------------------ -def matrixmul_opt(mat_a, mat_b, - block_size = DEFAULT_BLOCK_SIZE, - work_size = DEFAULT_WORK_SIZE, - unroll = DEFAULT_UNROLL, - spill = DEFAULT_SPILL, - prefetch = DEFAULT_PREFETCH): - +def matrixmul_opt(mat_a, mat_b, + block_size=DEFAULT_BLOCK_SIZE, + work_size=DEFAULT_WORK_SIZE, + unroll=DEFAULT_UNROLL, + spill=DEFAULT_SPILL, + prefetch=DEFAULT_PREFETCH): + ah, aw = mat_a.shape bh, bw = mat_b.shape - + assert aw == bh # -- pad input matrices appropriately ah_padded = int(np.ceil(ah/block_size)) * block_size aw_padded = int(np.ceil(aw/block_size)) * (block_size*work_size) mat_a_padded = np.zeros((ah_padded, aw_padded), np.float32) - mat_a_padded[:ah,:aw] = mat_a + mat_a_padded[:ah, :aw] = mat_a - bh_padded = aw_padded + bh_padded = aw_padded bw_padded = int(np.ceil(bw/(block_size*work_size))) * (block_size*work_size) mat_b_padded = np.zeros((bh_padded, bw_padded), np.float32) mat_b_padded[:bh, :bw] = mat_b @@ -55,7 +60,7 @@ def matrixmul_opt(mat_a, mat_b, cw_padded = bw_padded # -- upload padded input matrices to the GPU - mat_a_gpu = gpuarray.to_gpu(mat_a_padded) + mat_a_gpu = gpuarray.to_gpu(mat_a_padded) mat_b_gpu = gpuarray.to_gpu(mat_b_padded) # -- create empty container matrix for the result (C = A * B) @@ -63,26 +68,26 @@ def matrixmul_opt(mat_a, mat_b, # -- generate and compile the code # prepare the template parameters - template_params = { - 'BLOCK_SIZE': block_size, - 'WORK_SIZE': work_size, - 'UNROLL': unroll, - 'SPILL': spill, - 'PREFETCH': prefetch, - 'A_WIDTH': aw_padded, - 'A_HEIGHT': ah_padded, - 'B_WIDTH': bw_padded, + template_params = { + "BLOCK_SIZE": block_size, + "WORK_SIZE": work_size, + "UNROLL": unroll, + "SPILL": spill, + "PREFETCH": prefetch, + "A_WIDTH": aw_padded, + "A_HEIGHT": ah_padded, + "B_WIDTH": bw_padded, } - + # run the template engine to get the code kernel_code = Template( - file = TEMPLATE_FILENAME, - searchList = [template_params], + file=TEMPLATE_FILENAME, + searchList=[template_params], ) - + # compile the code module = compiler.SourceModule(kernel_code) - + # get the kernel from the module matrixmul_func = module.get_function("matrixMul") @@ -90,30 +95,30 @@ def matrixmul_opt(mat_a, mat_b, print("number of registers used:", matrixmul_func.num_regs) # block of threads - # ATTENTION: block is (threadDim.x, threadDim.y, 
threadDim.z) + # ATTENTION: block is (threadDim.x, threadDim.y, threadDim.z) # and not (threadDim.z, threadDim.y, threadDim.x) - block = block_size, block_size, 1 - - # grid of blocks - # ATTENTION: it's (blockDim.x, blockDim.y) + block = block_size, block_size, 1 + + # grid of blocks + # ATTENTION: it's (blockDim.x, blockDim.y) # and not (blockDim.y, blockDim.x) grid = int(cw_padded/block_size/work_size), int(ch_padded/block_size) # -- call the kernel on the GPU - # Note that when we use time_kernel=True pycuda will automatically synchronize the kernel + # Note that when we use time_kernel=True pycuda will automatically synchronize the kernel # to make sure that the timing is correct. If you time the code yourself, you'll have to # synchronize the current Context. gpu_time = matrixmul_func( # -- output - mat_c_gpu, + mat_c_gpu, # -- inputs - mat_a_gpu, mat_b_gpu, + mat_a_gpu, mat_b_gpu, # -- grid of blocks - grid = grid, + grid=grid, # -- block of threads - block = block, + block=block, # -- time the kernel (approx.) - time_kernel = True, + time_kernel=True, ) # get the GPU matrix back to CPU memory @@ -122,20 +127,21 @@ def matrixmul_opt(mat_a, mat_b, return mat_c, gpu_time + # ------------------------------------------------------------------------------ -if __name__ == "__main__": +if __name__ == "__main__": # matrix sizes a_height = 1024 a_width = 1024 b_height = a_width b_width = 1024 - + # create random square matrices np.random.seed(0) mat_a = np.random.randn(a_height, a_width).astype(np.float32) mat_b = np.random.randn(b_height, b_width).astype(np.float32) - + # compute reference on the cpu to verify GPU computation mat_ref = np.dot(mat_a, mat_b) @@ -155,5 +161,3 @@ if __name__ == "__main__": gflop = mat_c.size * (a_width * 2.) / (1000**3.) gflops = gflop / gpu_time print("gflops:", gflops) - - diff --git a/examples/from-wiki/distance_element_wise3d.py b/examples/from-wiki/distance_element_wise3d.py index 240b32e9484d54848c39d6f929553b17e69c1872..64a15f74019050803e03567aae955bf3788e3b27 100644 --- a/examples/from-wiki/distance_element_wise3d.py +++ b/examples/from-wiki/distance_element_wise3d.py @@ -1,12 +1,15 @@ -#!python -import pycuda.gpuarray as gpuarray +#!python +from __future__ import annotations + +import random + +import numpy + import pycuda.driver as cuda -import pycuda.tools as tools -import pycuda.autoinit -import numpy, random, time -from pycuda.curandom import rand as curand +import pycuda.gpuarray as gpuarray from pycuda.elementwise import ElementwiseKernel as Elementwise + x = 50 y = 50 z = 2 @@ -14,6 +17,7 @@ width = 100 height = 100 depth = 100 + def main(): """ Computes a set of distances from a given point in a search space in parallel on a GPU. 
@@ -50,10 +54,9 @@ def main(): random.uniform(-width, width)), n, r) end.record() end.synchronize() - print((start.time_till(end))) + print(start.time_till(end)) print(r) -if __name__ == '__main__': - main() - +if __name__ == "__main__": + main() diff --git a/examples/from-wiki/game_of_life.py b/examples/from-wiki/game_of_life.py index 60564c3287c8fc723bc5756ad99ebd53ac384125..8663b216365bd902695b08ac2a8c29c95b483551 100644 --- a/examples/from-wiki/game_of_life.py +++ b/examples/from-wiki/game_of_life.py @@ -1,4 +1,4 @@ -#!python +#!python # Conway's Game of Life Accelerated with PyCUDA # Luis Villasenor # lvillasen@gmail.com @@ -6,27 +6,34 @@ # Licence: GPLv3 # Usage: python GameOfLife.py n n_iter # where n is the board size and n_iter the number of iterations -import pycuda.driver as cuda -import pycuda.tools -import pycuda.autoinit -import pycuda.gpuarray as gpuarray -from pycuda.compiler import SourceModule +from __future__ import annotations + import sys + +import matplotlib.pyplot as plt import numpy as np from pylab import cm as cm -import matplotlib.pyplot as plt -n=int(sys.argv[1]) -n_iter=int(sys.argv[2]) -n_block=16 -n_grid=int(n/n_block); -n=n_block*n_grid; + +import pycuda.gpuarray as gpuarray +from pycuda.compiler import SourceModule + + +n = int(sys.argv[1]) +n_iter = int(sys.argv[2]) +n_block = 16 +n_grid = int(n/n_block) +n = n_block*n_grid + + def random_init(n): - #np.random.seed(100) - M=np.zeros((n,n)).astype(np.int32) + # np.random.seed(100) + M = np.zeros((n, n)).astype(np.int32) for i in range(n): for j in range(n): - M[j,i]=np.int32(np.random.randint(2)) + M[j, i] = np.int32(np.random.randint(2)) return M + + mod = SourceModule(""" __global__ void step(int *C, int *M) { @@ -54,28 +61,27 @@ __global__ void step(int *C, int *M) } """) func = mod.get_function("step") -C=random_init(n) +C = random_init(n) M = np.empty_like(C) -C_gpu = gpuarray.to_gpu( C ) -M_gpu = gpuarray.to_gpu( M ) -for k in range(n_iter): - func(C_gpu,M_gpu,block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) +C_gpu = gpuarray.to_gpu(C) +M_gpu = gpuarray.to_gpu(M) +for _k in range(n_iter): + func(C_gpu, M_gpu, block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) C_gpu, M_gpu = M_gpu, C_gpu -print(("%d live cells after %d iterations" %(np.sum(C_gpu.get()),n_iter))) -fig = plt.figure(figsize=(12,12)) +print("%d live cells after %d iterations" % (np.sum(C_gpu.get()), n_iter)) +fig = plt.figure(figsize=(12, 12)) ax = fig.add_subplot(111) fig.suptitle("Conway's Game of Life Accelerated with PyCUDA") -ax.set_title('Number of Iterations = %d'%(n_iter)) -myobj =plt.imshow(C_gpu.get(),origin='lower',cmap='Greys', interpolation='nearest',vmin=0, vmax=1) +ax.set_title("Number of Iterations = %d" % (n_iter)) +myobj = plt.imshow(C_gpu.get(), origin="lower", cmap="Greys", interpolation="nearest", vmin=0, vmax=1) plt.pause(.01) plt.draw() -m=n_iter +m = n_iter while True: - m+=1 - func(C_gpu,M_gpu,block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) + m += 1 + func(C_gpu, M_gpu, block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) C_gpu, M_gpu = M_gpu, C_gpu myobj.set_data(C_gpu.get()) - ax.set_title('Number of Iterations = %d'%(m)) + ax.set_title("Number of Iterations = %d" % (m)) plt.pause(.01) plt.draw() - diff --git a/examples/from-wiki/gl_interop.py b/examples/from-wiki/gl_interop.py index 77249e51f7f6daca369f6d1b8d867feedc694f92..37268afec6e9fa8cd0fc7a018bc2d82a674928de 100644 --- a/examples/from-wiki/gl_interop.py +++ b/examples/from-wiki/gl_interop.py @@ -1,23 +1,27 @@ -#!python +#!python # GL 
interoperability example, by Peter Berrington. # Draws a rotating teapot, using cuda to invert the RGB value # each frame +from __future__ import annotations +import sys +import time + +import numpy from OpenGL.GL import * -from OpenGL.GLUT import * -from OpenGL.GLU import * -from OpenGL.GL.ARB.vertex_buffer_object import * from OpenGL.GL.ARB.pixel_buffer_object import * +from OpenGL.GL.ARB.vertex_buffer_object import * +from OpenGL.GLU import * +from OpenGL.GLUT import * - -import numpy, sys, time import pycuda.driver as cuda_driver import pycuda.gl as cuda_gl from pycuda.compiler import SourceModule -#this is all munged together from the CUDA SDK postprocessGL example. -initial_size = 512,512 +# this is all munged together from the CUDA SDK postprocessGL example. + +initial_size = 512, 512 current_size = initial_size animate = True enable_cuda = True @@ -26,15 +30,16 @@ time_of_last_draw = 0.0 time_of_last_titleupdate = 0.0 frames_per_second = 0.0 frame_counter = 0 -output_texture = None # pointer to offscreen render target +output_texture = None # pointer to offscreen render target (source_pbo, dest_pbo, cuda_module, invert, pycuda_source_pbo, pycuda_dest_pbo) = [None]*6 -heading,pitch,bank = [0.0]*3 +heading, pitch, bank = [0.0]*3 + -def create_PBOs(w,h): +def create_PBOs(w, h): global source_pbo, dest_pbo, pycuda_source_pbo, pycuda_dest_pbo num_texels = w*h - data = numpy.zeros((num_texels,4),numpy.uint8) + data = numpy.zeros((num_texels, 4), numpy.uint8) source_pbo = glGenBuffers(1) glBindBuffer(GL_ARRAY_BUFFER, source_pbo) glBufferData(GL_ARRAY_BUFFER, data, GL_DYNAMIC_DRAW) @@ -46,15 +51,17 @@ def create_PBOs(w,h): glBindBuffer(GL_ARRAY_BUFFER, 0) pycuda_dest_pbo = cuda_gl.BufferObject(int(dest_pbo)) + def destroy_PBOs(): global source_pbo, dest_pbo, pycuda_source_pbo, pycuda_dest_pbo for pbo in [source_pbo, dest_pbo]: glBindBuffer(GL_ARRAY_BUFFER, int(pbo)) glDeleteBuffers(1, int(pbo)) glBindBuffer(GL_ARRAY_BUFFER, 0) - source_pbo,dest_pbo,pycuda_source_pbo,pycuda_dest_pbo = [None]*4 + source_pbo, dest_pbo, pycuda_source_pbo, pycuda_dest_pbo = [None]*4 + -def create_texture(w,h): +def create_texture(w, h): global output_texture output_texture = glGenTextures(1) glBindTexture(GL_TEXTURE_2D, output_texture) @@ -67,11 +74,13 @@ def create_texture(w,h): glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, None) + def destroy_texture(): global output_texture glDeleteTextures(output_texture) output_texture = None + def init_gl(): Width, Height = current_size glClearColor(0.1, 0.1, 0.5, 1.0) @@ -82,11 +91,12 @@ def init_gl(): gluPerspective(60.0, Width/float(Height), 0.1, 10.0) glPolygonMode(GL_FRONT_AND_BACK, GL_FILL) glEnable(GL_LIGHT0) - red = ( 1.0, 0.1, 0.1, 1.0 ) - white = ( 1.0, 1.0, 1.0, 1.0 ) - glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, red ) + red = (1.0, 0.1, 0.1, 1.0) + white = (1.0, 1.0, 1.0, 1.0) + glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, red) glMaterialfv(GL_FRONT_AND_BACK, GL_SPECULAR, white) - glMaterialf( GL_FRONT_AND_BACK, GL_SHININESS, 60.0) + glMaterialf(GL_FRONT_AND_BACK, GL_SHININESS, 60.0) + def resize(Width, Height): global current_size @@ -96,32 +106,35 @@ def resize(Width, Height): glLoadIdentity() gluPerspective(60.0, Width/float(Height), 0.1, 10.0) + def do_tick(): global time_of_last_titleupdate, frame_counter, frames_per_second - if ((time.clock () * 1000.0) - time_of_last_titleupdate >= 1000.): + if ((time.clock() * 1000.0) - time_of_last_titleupdate >= 1000.): frames_per_second = frame_counter # Save The FPS frame_counter = 0 # 
Reset The FPS Counter - szTitle = "%d FPS" % (frames_per_second ) - glutSetWindowTitle ( szTitle ) - time_of_last_titleupdate = time.clock () * 1000.0 + szTitle = "%d FPS" % (frames_per_second) + glutSetWindowTitle(szTitle) + time_of_last_titleupdate = time.clock() * 1000.0 frame_counter += 1 + # The function called whenever a key is pressed. Note the use of Python tuples to pass in: (key, x, y) def keyPressed(*args): global animate, enable_cuda # If escape is pressed, kill everything. - if args[0] == '\033': - print('Closing..') + if args[0] == "\033": + print("Closing..") destroy_PBOs() destroy_texture() exit() - elif args[0] == 'a': - print('toggling animation') + elif args[0] == "a": + print("toggling animation") animate = not animate - elif args[0] == 'e': - print('toggling cuda') + elif args[0] == "e": + print("toggling cuda") enable_cuda = not enable_cuda + def idle(): global heading, pitch, bank if animate: @@ -131,6 +144,7 @@ def idle(): glutPostRedisplay() + def display(): try: render_scene() @@ -144,9 +158,10 @@ def display(): from os import _exit _exit(0) + def process(width, height): """ Use PyCuda """ - grid_dimensions = (width//16,height//16) + grid_dimensions = (width//16, height//16) source_mapping = pycuda_source_pbo.map() dest_mapping = pycuda_dest_pbo.map() @@ -160,9 +175,10 @@ def process(width, height): source_mapping.unmap() dest_mapping.unmap() + def process_image(): """ copy image and process using CUDA """ - global pycuda_source_pbo,source_pbo,current_size, dest_pbo + global pycuda_source_pbo, source_pbo, current_size, dest_pbo image_width, image_height = current_size assert source_pbo is not None @@ -174,12 +190,12 @@ def process_image(): # read data into pbo. note: use BGRA format for optimal performance glReadPixels( - 0, #start x - 0, #start y - image_width, #end x - image_height, #end y - GL_BGRA, #format - GL_UNSIGNED_BYTE, #output type + 0, # start x + 0, # start y + image_width, # end x + image_height, # end y + GL_BGRA, # format + GL_UNSIGNED_BYTE, # output type ctypes.c_void_p(0)) pycuda_source_pbo = cuda_gl.BufferObject(int(source_pbo)) @@ -195,6 +211,7 @@ def process_image(): image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, ctypes.c_void_p(0)) + def display_image(): """ render a screen sized quad """ glDisable(GL_DEPTH_TEST) @@ -205,7 +222,7 @@ def display_image(): glPushMatrix() glLoadIdentity() glOrtho(-1.0, 1.0, -1.0, 1.0, -1.0, 1.0) - glMatrixMode( GL_MODELVIEW) + glMatrixMode(GL_MODELVIEW) glLoadIdentity() glViewport(0, 0, current_size[0], current_size[1]) glBegin(GL_QUADS) @@ -226,21 +243,22 @@ def display_image(): def render_scene(): - glClear (GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)# Clear Screen And Depth Buffer + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT) # Clear Screen And Depth Buffer glMatrixMode(GL_MODELVIEW) - glLoadIdentity () # Reset The Modelview Matrix + glLoadIdentity() # Reset The Modelview Matrix glTranslatef(0.0, 0.0, -3.0) glRotatef(heading, 1.0, 0.0, 0.0) - glRotatef(pitch , 0.0, 1.0, 0.0) - glRotatef(bank , 0.0, 0.0, 1.0) - glViewport(0, 0, current_size[0],current_size[1]) + glRotatef(pitch, 0.0, 1.0, 0.0) + glRotatef(bank, 0.0, 0.0, 1.0) + glViewport(0, 0, current_size[0], current_size[1]) glEnable(GL_LIGHTING) glEnable(GL_DEPTH_TEST) glDepthFunc(GL_LESS) glutSolidTeapot(1.0) - do_tick()#just for fps display.. + do_tick() # just for fps display.. 
return True + def main(): global window, cuda_module, cuda_gl, cuda_driver, invert glutInit(sys.argv) @@ -258,9 +276,9 @@ def main(): # create texture for blitting to screen create_texture(*initial_size) - #setup pycuda gl interop - import pycuda.gl.autoinit + # setup pycuda gl interop import pycuda.gl + import pycuda.gl.autoinit cuda_gl = pycuda.gl cuda_driver = pycuda.driver @@ -287,8 +305,8 @@ def main(): glutMainLoop() + # Print message to console, and kick off the main to get it rolling. if __name__ == "__main__": print("Hit ESC key to quit, 'a' to toggle animation, and 'e' to toggle cuda") main() - diff --git a/examples/from-wiki/gpu_scalar_mult.py b/examples/from-wiki/gpu_scalar_mult.py index 5d343db159cfe7f82dea518bdb1ba4044de62452..11e133cdde98d6f6c1efe418d4ec19935fc21308 100644 --- a/examples/from-wiki/gpu_scalar_mult.py +++ b/examples/from-wiki/gpu_scalar_mult.py @@ -1,13 +1,13 @@ -#!python +#!python +from __future__ import annotations + import numpy -import pycuda.autoinit + import pycuda.driver as drv import pycuda.gpuarray as gpuarray from pycuda.tools import context_dependent_memoize - - def main(dtype): from pycuda.elementwise import get_linear_combination_kernel lc_kernel, lc_texrefs = get_linear_combination_kernel(( @@ -30,7 +30,7 @@ def main(dtype): stop = drv.Event() start.record() - for i in range(20): + for _i in range(20): a.bind_to_texref_ext(lc_texrefs[0], allow_double_hack=True) b.bind_to_texref_ext(lc_texrefs[1], allow_double_hack=True) lc_kernel.prepared_call(x._grid, x._block, @@ -42,13 +42,11 @@ def main(dtype): print(size, size_exp, stop.time_since(start)) - @context_dependent_memoize def get_lin_comb_kernel_no_tex(summand_descriptors, dtype_z): + from pycuda.elementwise import ScalarArg, VectorArg, get_elwise_module from pycuda.tools import dtype_to_ctype - from pycuda.elementwise import \ - VectorArg, ScalarArg, get_elwise_module args = [] loop_prep = [] @@ -81,14 +79,13 @@ def get_lin_comb_kernel_no_tex(summand_descriptors, return func - def main_no_tex(dtype): lc_kernel = get_lin_comb_kernel_no_tex(( (True, dtype, dtype), (True, dtype, dtype) ), dtype) - for size_exp in range(10,26): + for size_exp in range(10, 26): size = 1 << size_exp from pycuda.curandom import rand @@ -103,7 +100,7 @@ def main_no_tex(dtype): stop = drv.Event() start.record() - for i in range(20): + for _i in range(20): lc_kernel.prepared_call(x._grid, x._block, a.gpudata, x.gpudata, b.gpudata, y.gpudata, @@ -115,12 +112,9 @@ def main_no_tex(dtype): print(size, size_exp, stop.time_since(start)) - - if __name__ == "__main__": dtype = numpy.float32 main(dtype) print() main_no_tex(dtype) - diff --git a/examples/from-wiki/kernel_concurrency.py b/examples/from-wiki/kernel_concurrency.py index 8b04a19fe44e000b2683932f7bda6e5c34aad21a..958675fd70f760b633636aa207eb1faa172c95ec 100644 --- a/examples/from-wiki/kernel_concurrency.py +++ b/examples/from-wiki/kernel_concurrency.py @@ -1,17 +1,20 @@ -#!python -#! /usr/bin/env python +#!python +# ! /usr/bin/env python # A simple program to illustrate kernel concurrency with PyCuda. # Reference: Chapter 3.2.6.5 in Cuda C Programming Guide Version 3.2. # Jesse Lu, 2011-04-04 +from __future__ import annotations import numpy as np + import pycuda.autoinit import pycuda.driver as drv from pycuda.compiler import SourceModule + # # Set up test scenario. -# +# # Create a simple test kernel. mod = SourceModule(""" @@ -29,13 +32,13 @@ __global__ void my_kernel(float *d) { my_kernel = mod.get_function("my_kernel") # Create the test data on the host. 
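
# Note that gpu_scalar_mult.py above now drops `import pycuda.autoinit`, but
# that import exists purely for its side effect: it initializes the driver and
# creates a context, which the Events and gpuarrays in main() require. A
# minimal sketch of doing the same explicitly (single-GPU case assumed):

import pycuda.driver as drv

drv.init()                           # load/initialize the CUDA driver
ctx = drv.Device(0).make_context()   # create and push a context on device 0
try:
    pass  # ... main(numpy.float32) etc. would run here ...
finally:
    ctx.pop()                        # release the context on the way out
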
-N = 400 # Size of datasets. -n = 2 # Number of datasets (and concurrent operations) used. +N = 400 # Size of datasets. +n = 2 # Number of datasets (and concurrent operations) used. data, data_check, d_data = [], [], [] for k in range(n): - data.append(np.random.randn(N).astype(np.float32)) # Create random data. - data_check.append(data[k].copy()) # For checking the result afterwards. - d_data.append(drv.mem_alloc(data[k].nbytes)) # Allocate memory on device. + data.append(np.random.randn(N).astype(np.float32)) # Create random data. + data_check.append(data[k].copy()) # For checking the result afterwards. + d_data.append(drv.mem_alloc(data[k].nbytes)) # Allocate memory on device. # # Start concurrency test. @@ -47,49 +50,48 @@ ref.record() # Create the streams and events needed. stream, event = [], [] -marker_names = ['kernel_begin', 'kernel_end'] +marker_names = ["kernel_begin", "kernel_end"] for k in range(n): stream.append(drv.Stream()) event.append({marker_names[l]: drv.Event() for l in range(len(marker_names))}) # Transfer to device. for k in range(n): - drv.memcpy_htod(d_data[k], data[k]) + drv.memcpy_htod(d_data[k], data[k]) # Run kernels many times, we will only keep data from last loop iteration. for j in range(10): for k in range(n): - event[k]['kernel_begin'].record(stream[k]) - my_kernel(d_data[k], block=(N,1,1), stream=stream[k]) - for k in range(n): # Commenting out this line should break concurrency. - event[k]['kernel_end'].record(stream[k]) + event[k]["kernel_begin"].record(stream[k]) + my_kernel(d_data[k], block=(N, 1, 1), stream=stream[k]) + for k in range(n): # Commenting out this line should break concurrency. + event[k]["kernel_end"].record(stream[k]) # Transfer data back to host. for k in range(n): - drv.memcpy_dtoh(data[k], d_data[k]) + drv.memcpy_dtoh(data[k], d_data[k]) -# +# # Output results. # -print('\n=== Device attributes') +print("\n=== Device attributes") dev = pycuda.autoinit.device -print(('Name:', dev.name())) -print(('Compute capability:', dev.compute_capability())) -print(('Concurrent Kernels:', \ +print(("Name:", dev.name())) +print(("Compute capability:", dev.compute_capability())) +print(("Concurrent Kernels:", bool(dev.get_attribute(drv.device_attribute.CONCURRENT_KERNELS)))) -print('\n=== Checking answers') +print("\n=== Checking answers") for k in range(n): - print(('Dataset', k, ':',)) + print(("Dataset", k, ":",)) if (np.linalg.norm((data_check[k] * 2**(j+1)) - data[k]) == 0.0): - print('passed.') + print("passed.") else: - print('FAILED!') + print("FAILED!") -print('\n=== Timing info (for last set of kernel launches)') +print("\n=== Timing info (for last set of kernel launches)") for k in range(n): - print(('Dataset', k)) + print(("Dataset", k)) for l in range(len(marker_names)): - print((marker_names[l], ':', ref.time_till(event[k][marker_names[l]]))) - + print((marker_names[l], ":", ref.time_till(event[k][marker_names[l]]))) diff --git a/examples/from-wiki/light_field_3d_viewer.py b/examples/from-wiki/light_field_3d_viewer.py index 3f22e005a43c983631444babc97e8fbb35b0b0a3..44e235504ab3c14467863e68188ac1d5e70b6145 100644 --- a/examples/from-wiki/light_field_3d_viewer.py +++ b/examples/from-wiki/light_field_3d_viewer.py @@ -1,4 +1,4 @@ -#!python +#!python """ 3D display of Light Field images. Example images can be download from: @@ -15,25 +15,25 @@ Prerequisites: Author: Amit Aides. amitibo at technion . ac . 
il """ +from __future__ import annotations - -from enthought.traits.api import HasTraits, Range, on_trait_change -from enthought.traits.ui.api import View, Item -from enthought.chaco.api import Plot, ArrayPlotData, gray -from enthought.enable.component_editor import ComponentEditor - -import numpy as np -import Image import argparse -import os.path +import contextlib import math +import os.path -import pycuda.driver as cuda -import pycuda.compiler -import pycuda.autoinit - +import Image +import numpy as np +from enthought.chaco.api import ArrayPlotData, Plot, gray +from enthought.enable.component_editor import ComponentEditor +from enthought.traits.api import HasTraits, Range, on_trait_change +from enthought.traits.ui.api import Item, View from jinja2 import Template +import pycuda.autoinit +import pycuda.compiler +import pycuda.driver as cuda + _kernel_tpl = Template(""" {% if NCHANNELS == 3 %} @@ -64,7 +64,7 @@ __global__ void LFview_kernel( // calculate offset into destination array // unsigned int didx = (y * {{newiw}} + x) * {{NCHANNELS}}; - + // // calculate offset into source array (be aware of rotation and scaling) // @@ -99,10 +99,10 @@ def ceil(x): class LFapplication(HasTraits): traits_view = View( - Item('LF_img', editor=ComponentEditor(), show_label=False), - Item('X_angle', label='Angle in the X axis'), - Item('Y_angle', label='Angle in the Y axis'), - resizable = True, + Item("LF_img", editor=ComponentEditor(), show_label=False), + Item("X_angle", label="Angle in the X axis"), + Item("Y_angle", label="Angle in the Y axis"), + resizable=True, title="LF Image" ) @@ -113,21 +113,19 @@ class LFapplication(HasTraits): # Load image data # base_path = os.path.splitext(img_path)[0] - lenslet_path = base_path + '-lenslet.txt' - optics_path = base_path + '-optics.txt' + lenslet_path = base_path + "-lenslet.txt" + optics_path = base_path + "-optics.txt" with open(lenslet_path) as f: tmp = eval(f.readline()) - x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \ + x_offset, y_offset, right_dx, _right_dy, _down_dx, down_dy = \ np.array(tmp, dtype=np.float32) with open(optics_path) as f: for line in f: name, val = line.strip().split() - try: + with contextlib.suppress(Exception): setattr(self, name, np.float32(val)) - except: - pass max_angle = math.atan(self.pitch/2/self.flen) @@ -135,7 +133,7 @@ class LFapplication(HasTraits): # Prepare image # im_pil = Image.open(img_path) - if im_pil.mode == 'RGB': + if im_pil.mode == "RGB": self.NCHANNELS = 3 w, h = im_pil.size im = np.zeros((h, w, 4), dtype=np.float32) @@ -170,10 +168,10 @@ class LFapplication(HasTraits): NCHANNELS=self.NCHANNELS ) ) - + self.LFview_func = mod_LFview.get_function("LFview_kernel") self.texref = mod_LFview.get_texref("tex") - + # # Now generate the cuda texture # @@ -184,7 +182,7 @@ class LFapplication(HasTraits): ) else: cuda.matrix_to_texref(im, self.texref, order="C") - + # # We could set the next if we wanted to address the image # in normalized coordinates ( 0 <= coordinate < 1.) 
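
# The jinja2-templated kernel above bakes the image dimensions in as
# compile-time constants. The same technique in miniature (toy kernel, all
# names illustrative; assumes a context, e.g. via pycuda.autoinit):

from jinja2 import Template

import pycuda.autoinit  # noqa: F401
from pycuda.compiler import SourceModule

_tpl = Template("""
__global__ void scale(float *x)
{
    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < {{ n }})
        x[i] *= {{ factor }}f;
}
""")

scale = SourceModule(_tpl.render(n=1024, factor=2.0)).get_function("scale")
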
@@ -195,9 +193,9 @@ class LFapplication(HasTraits): # # Prepare the traits # - self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0)) - self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0)) - + self.add_trait("X_angle", Range(-max_angle, max_angle, 0.0)) + self.add_trait("Y_angle", Range(-max_angle, max_angle, 0.0)) + self.plotdata = ArrayPlotData(LF_img=self.sampleLF()) self.LF_img = Plot(self.plotdata) if self.NCHANNELS == 3: @@ -210,9 +208,9 @@ class LFapplication(HasTraits): # Get the output image # output = np.zeros(self.LF_dim, dtype=np.uint8) - + # - # Calculate the gridsize. This is entirely given by the size of our image. + # Calculate the gridsize. This is entirely given by the size of our image. # blocks = (16, 16, 1) gridx = ceil(self.LF_dim[1]/blocks[1]) @@ -233,22 +231,21 @@ class LFapplication(HasTraits): return output - @on_trait_change('X_angle, Y_angle') + @on_trait_change("X_angle, Y_angle") def updateImge(self): - self.plotdata.set_data('LF_img', self.sampleLF()) - - + self.plotdata.set_data("LF_img", self.sampleLF()) + + def main(img_path): """Main function""" app = LFapplication(img_path) app.configure_traits() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='View an LF image') - parser.add_argument('img_path', type=str, help='Path to LF image') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="View an LF image") + parser.add_argument("img_path", type=str, help="Path to LF image") args = parser.parse_args() main(args.img_path) - diff --git a/examples/from-wiki/mandelbrot.py b/examples/from-wiki/mandelbrot.py index 92958d2bdfe894d2da3352b747b5e1a902643602..8c214cc78353fa5621a9ea384aa08d4e2d102c51 100644 --- a/examples/from-wiki/mandelbrot.py +++ b/examples/from-wiki/mandelbrot.py @@ -1,4 +1,4 @@ -#!python +#!python # Mandelbrot calculate using GPU, Serial numpy and faster numpy # Use to show the speed difference between CPU and GPU calculations # ian@ianozsvald.com July 2010 @@ -6,31 +6,33 @@ # Based on vegaseat's TKinter/numpy example code from 2006 # http://www.daniweb.com/code/snippet216851.html# # with minor changes to move to numpy from the obsolete Numeric +from __future__ import annotations import sys -import numpy as nm - import tkinter as tk -import Image # PIL -import ImageTk # PIL + +import Image # PIL +import ImageTk # PIL +import numpy as nm import pycuda.driver as drv -import pycuda.tools -import pycuda.autoinit -from pycuda.compiler import SourceModule import pycuda.gpuarray as gpuarray + # set width and height of window, more pixels take longer to calculate w = 1000 h = 1000 from pycuda.elementwise import ElementwiseKernel + + complex_gpu = ElementwiseKernel( "pycuda::complex *z, pycuda::complex *q, int *iteration, int maxiter", "for (int n=0; n < maxiter; n++) {z[i] = (z[i]*z[i])+q[i]; if (abs(z[i]) > 2.0f) {iteration[i]=n; z[i] = pycuda::complex(); q[i] = pycuda::complex();};}", "complex5", preamble="#include ",) + def calculate_z_gpu(q, maxiter, z): output = nm.resize(nm.array(0,), q.shape) q_gpu = gpuarray.to_gpu(q.astype(nm.complex64)) @@ -58,7 +60,7 @@ def calculate_z_numpy_gpu(q, maxiter, z): # we'll add 1 to iterg after each iteration iterg = gpuarray.to_gpu(nm.array([0]*zg.size).astype(nm.int32)) - for iter in range(maxiter): + for _iter in range(maxiter): zg = zg*zg + qg # abs returns a complex (rather than a float) from the complex @@ -84,11 +86,12 @@ def calculate_z_numpy(q, maxiter, z): for iter in range(maxiter): z = z*z + q done = nm.greater(abs(z), 2.0) - q = 
nm.where(done,0+0j, q) - z = nm.where(done,0+0j, z) + q = nm.where(done, 0+0j, q) + z = nm.where(done, 0+0j, z) output = nm.where(done, iter, output) return output + def calculate_z_serial(q, maxiter, z): # calculate z using pure python with numpy arrays # this routine unrolls calculate_z_numpy as an intermediate @@ -112,9 +115,8 @@ show_instructions = False if len(sys.argv) == 1: show_instructions = True -if len(sys.argv) > 1: - if sys.argv[1] not in ['gpu', 'gpuarray', 'numpy', 'python']: - show_instructions = True +if len(sys.argv) > 1 and sys.argv[1] not in ["gpu", "gpuarray", "numpy", "python"]: + show_instructions = True if show_instructions: print("Usage: python mandelbrot.py [gpu|gpuarray|numpy|python]") @@ -125,13 +127,13 @@ if show_instructions: print(" python is a pure Python solution on the CPU with numpy arrays") sys.exit(0) -routine = {'gpuarray':calculate_z_numpy_gpu, - 'gpu':calculate_z_gpu, - 'numpy':calculate_z_numpy, - 'python':calculate_z_serial} +routine = {"gpuarray": calculate_z_numpy_gpu, + "gpu": calculate_z_gpu, + "numpy": calculate_z_numpy, + "python": calculate_z_serial} calculate_z = routine[sys.argv[1]] -##if sys.argv[1] == 'python': +# if sys.argv[1] == 'python': # import psyco # psyco.full() @@ -149,7 +151,8 @@ calculate_z = routine[sys.argv[1]] # numpy: 43.4s # python (serial): 1605.6s -class Mandelbrot(object): + +class Mandelbrot: def __init__(self): # create window self.root = tk.Tk() @@ -159,7 +162,6 @@ class Mandelbrot(object): # start event loop self.root.mainloop() - def draw(self, x1, x2, y1, y2, maxiter=300): # draw the Mandelbrot set, from numpy example xx = nm.arange(x1, x2, (x2-x1)/w*2) @@ -199,7 +201,7 @@ class Mandelbrot(object): self.label = tk.Label(self.root, image=self.image) self.label.pack() + # test the class -if __name__ == '__main__': +if __name__ == "__main__": test = Mandelbrot() - diff --git a/examples/from-wiki/mandelbrot_interactive.py b/examples/from-wiki/mandelbrot_interactive.py index 66c3a011d39e0a4ddaf991e9e5591a808613b1b9..4b81767c2bc7b882844555e4b0a45e51909340c7 100644 --- a/examples/from-wiki/mandelbrot_interactive.py +++ b/examples/from-wiki/mandelbrot_interactive.py @@ -1,4 +1,4 @@ -#!python +#!python # Interactive Mandelbrot Set Accelerated using PyCUDA # Classical Iteration Method # Luis Villasenor @@ -7,11 +7,11 @@ # Licence: GPLv3 # Usage -# Use the left buttom to draw a square to zoom into +# Use the left buttom to draw a square to zoom into # Point and click with the right buttom to magnify by a factor of 10 -# Click with the left button on the right side of the +# Click with the left button on the right side of the # image to randomly change the colormap # Click with right button on the right side of the image to set the default colormap @@ -27,30 +27,31 @@ # Type 'f' to toggle full-screen mode # Type 's' to save the image +from __future__ import annotations - -import pycuda.driver as drv -import pycuda.tools -import pycuda.autoinit -import numpy as np -from pylab import cm as cm import matplotlib.pyplot as plt -from matplotlib.widgets import RectangleSelector +import numpy as np from matplotlib.patches import Rectangle +from matplotlib.widgets import RectangleSelector +from pylab import cm as cm + +import pycuda.driver as drv from pycuda.compiler import SourceModule -global N,n_block,n_grid,x0,y0,side,L,M,power -L=400; -N=800;n_block=16;n_grid=int(N/16); -N=n_block*n_grid; -x0=-.5;y0=0. 
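
# The complex5 ElementwiseKernel in mandelbrot.py above folds the whole
# escape-time loop into one fused GPU kernel. For orientation, here is the
# ElementwiseKernel pattern at its smallest (a toy a*x+b kernel, not part of
# the Mandelbrot code; assumes a context via pycuda.autoinit):

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

axpb = ElementwiseKernel(
    "float a, float *x, float b",   # argument declaration
    "x[i] = a*x[i] + b",            # per-element operation
    "axpb")

x = gpuarray.to_gpu(numpy.ones(16, dtype=numpy.float32))
axpb(numpy.float32(3), x, numpy.float32(1))
assert (x.get() == 4).all()
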
-side=3.0 -i_cmap=49 -power=2 -fig = plt.figure(figsize=(12,12)) -fig.suptitle('Interactive Mandelbrot Set, Accelerated with PyCUDA') + + +global N, n_block, n_grid, x0, y0, side, L, M, power +L = 400 +N = 800; n_block = 16; n_grid = int(N/16) +N = n_block*n_grid +x0 = -.5; y0 = 0. +side = 3.0 +i_cmap = 49 +power = 2 +fig = plt.figure(figsize=(12, 12)) +fig.suptitle("Interactive Mandelbrot Set, Accelerated with PyCUDA") ax = fig.add_subplot(111) -cmaps=[m for m in cm.datad if not m.endswith("_r")] -N,x0,y0,side,L,power +cmaps = [m for m in cm.datad if not m.endswith("_r")] +N, x0, y0, side, L, power mod = SourceModule(""" #include @@ -78,123 +79,130 @@ __global__ void mandelbrot(double x0, double y0,double side, int L,int power,int M[threadId]=h; } """) -M = np.zeros((N,N)).astype(np.int32) +M = np.zeros((N, N)).astype(np.int32) func = mod.get_function("mandelbrot") -func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) +func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + + def zoom_on_square(eclick, erelease): - 'eclick and erelease are the press and release events' - global N,side,x0,y0,myobj,M,power - x1, y1 = min(eclick.xdata,erelease.xdata),min( eclick.ydata,erelease.ydata) - x2, y2 = max(eclick.xdata,erelease.xdata),max( eclick.ydata,erelease.ydata) - #print(" The button you used were: %s %s" % (eclick.button, erelease.button)) - #print ' Nx=%d, Ny=%d, x0=%f, y0=%f'%(x1, y1, x0,y0) - #print ' Nx=%d, Ny=%d, x0=%f, y0=%f'%(x2, y2, x0,y0) - x_1=x0+side*(x1-N/2.)/N - y_1=y0+side*(y1-N/2.)/N - x_2=x0+side*(x2-N/2.)/N - y_2=y0+side*(y2-N/2.)/N - x0=(x_2+x_1)/2. - y0=(y_2+y_1)/2. - side=side*(x2-x1+y2-y1)/N/2 # Average of the 2 rectangle sides - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) + "eclick and erelease are the press and release events" + global N, side, x0, y0, myobj, M, power + x1, y1 = min(eclick.xdata, erelease.xdata), min(eclick.ydata, erelease.ydata) + x2, y2 = max(eclick.xdata, erelease.xdata), max(eclick.ydata, erelease.ydata) + # print(" The button you used were: %s %s" % (eclick.button, erelease.button)) + # print ' Nx=%d, Ny=%d, x0=%f, y0=%f'%(x1, y1, x0,y0) + # print ' Nx=%d, Ny=%d, x0=%f, y0=%f'%(x2, y2, x0,y0) + x_1 = x0+side*(x1-N/2.)/N + y_1 = y0+side*(y1-N/2.)/N + x_2 = x0+side*(x2-N/2.)/N + y_2 = y0+side*(y2-N/2.)/N + x0 = (x_2+x_1)/2. + y0 = (y_2+y_1)/2. 
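
# The zoom arithmetic above recenters the view: pixel p of an N-pixel axis
# maps to the complex plane as center + side*(p - N/2)/N. The same transform
# as a pure-Python helper (pixel_to_plane is a hypothetical name):

def pixel_to_plane(px, py, x0, y0, side, n):
    """Map pixel (px, py) of an n x n view onto the complex plane."""
    return (x0 + side*(px - n/2.0)/n,
            y0 + side*(py - n/2.0)/n)

# The view center maps to (x0, y0) itself:
assert pixel_to_plane(400, 400, -0.5, 0.0, 3.0, 800) == (-0.5, 0.0)
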
+ side = side*(x2-x1+y2-y1)/N/2 # Average of the 2 rectangle sides + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) myobj.set_data(M) - ax.add_patch(Rectangle((1 - .1, 1 - .1), 0.2, 0.2,alpha=1, facecolor='none',fill=None, )) - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + ax.add_patch(Rectangle((1 - .1, 1 - .1), 0.2, 0.2, alpha=1, facecolor="none", fill=None, )) + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() + def key_selector(event): - global N,side,x0,y0,myobj,M,power,L,i_cmap,n_grid - #print(' Key pressed.') - if event.key == 'up': # Increase max number of iterations - L=int(L*1.2); - print(("Maximum number of iterations changed to %d" % L)) - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + global N, side, x0, y0, myobj, M, power, L, i_cmap, n_grid + # print(' Key pressed.') + if event.key == "up": # Increase max number of iterations + L = int(L*1.2) + print("Maximum number of iterations changed to %d" % L) + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.key == 'down': # Decrease max number of iterations - L=int(L/1.2); - print(("Maximum number of iterations changed to %d" % L)) - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if event.key == "down": # Decrease max number of iterations + L = int(L/1.2) + print("Maximum number of iterations changed to %d" % L) + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.key == 'right': # Increase number of pixels - N=int(N*1.2); - n_grid=int(N/16.); - N=n_block*n_grid; - M = np.zeros((N,N)).astype(np.int32) - print(("Number of pixels per dimension changed to %d" % N)) - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if event.key == "right": # Increase number of pixels + N = int(N*1.2) + n_grid = int(N/16.) 
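
# The resize branches here snap N to a multiple of the 16-wide CUDA block
# (N = n_block * int(N/16)) so the launch grid tiles the image exactly. The
# same rounding as a named helper (illustrative, not from the example):

def snap_to_block(n, block=16):
    """Round n down to a multiple of block, but never below one block."""
    return max(block, (n // block) * block)

assert snap_to_block(970) == 960   # 970 pixels -> 60 blocks of 16
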
+ N = n_block*n_grid + M = np.zeros((N, N)).astype(np.int32) + print("Number of pixels per dimension changed to %d" % N) + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.key == 'left': # Decrease number of pixels - N=int(N/1.2); - n_grid=int(N/16.); - N=n_block*n_grid; - M = np.zeros((N,N)).astype(np.int32) - print(("Number of pixels per dimension changed to %d" % N)) - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if event.key == "left": # Decrease number of pixels + N = int(N/1.2) + n_grid = int(N/16.) + N = n_block*n_grid + M = np.zeros((N, N)).astype(np.int32) + print("Number of pixels per dimension changed to %d" % N) + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.key in ['1','2','3','4','5','6','7','8','9'] : # Decrease number of pixels - power=int(event.key) - if power <10 and power >0 : - print(("Power index set to %d" % power)) - i_cmap=49 - side=3.0; x0=-.5;y0=0.;L=200; - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if event.key in ["1", "2", "3", "4", "5", "6", "7", "8", "9"]: # Decrease number of pixels + power = int(event.key) + if power < 10 and power > 0: + print("Power index set to %d" % power) + i_cmap = 49 + side = 3.0; x0 = -.5; y0 = 0.; L = 200 + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() + key_selector.RS = RectangleSelector(ax, zoom_on_square, - drawtype='box', useblit=True, + drawtype="box", useblit=True, button=[1, 3], # don't use middle button minspanx=5, minspany=5, - spancoords='pixels') + spancoords="pixels") + + # interactive=False) def zoom_on_point(event): - global N,side,x0,y0,myobj,L,M,i_cmap,power - #print(" Button pressed: %d" % (event.button)) - #print(' event.x= %f, event.y= %f '%(event.x,event.y)) - if event.button==3 and event.inaxes: # Zoom on clicked point; new side=10% of old side + global N, side, x0, y0, myobj, L, M, i_cmap, power + # print(" Button pressed: %d" % (event.button)) + # print(' event.x= %f, event.y= %f '%(event.x,event.y)) + if event.button == 3 and event.inaxes: # Zoom on clicked point; new side=10% of old side x1, y1 = event.xdata, event.ydata - x0=x0+side*(x1-N/2.)/N - y0=y0+side*(y1-N/2.)/N - side=side*.1 - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - 
myobj = plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + x0 = x0+side*(x1-N/2.)/N + y0 = y0+side*(y1-N/2.)/N + side = side*.1 + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if not event.inaxes and event.x<.3*N : # Click on left side of image to reset to full fractal - power=2; side=3.0; x0=-.5;y0=0.;i_cmap=49 - func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) - myobj = plt.imshow(M,cmap=cmaps[i_cmap],origin='lower') - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) + if not event.inaxes and event.x < .3*N: # Click on left side of image to reset to full fractal + power = 2; side = 3.0; x0 = -.5; y0 = 0.; i_cmap = 49 + func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), grid=(n_grid, n_grid, 1)) + myobj = plt.imshow(M, cmap=cmaps[i_cmap], origin="lower") + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) + plt.draw() + if event.button == 1 and not event.inaxes and event.x > .7*N: # Left click on right side of image to set a random colormap + i_cmap_current = i_cmap + i_cmap = np.random.randint(len(cmaps)) + if i_cmap == i_cmap_current: + i_cmap -= 1 + if i_cmap < 0: i_cmap = len(cmaps)-1 + # print("color=",i_cmap) + myobj = plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) + plt.draw() + if event.button == 3 and not event.inaxes and event.x > .7*N: # Right click on right side to set default mapolormap + i_cmap = 49 + myobj = plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) + ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) plt.draw() - if event.button==1 and not event.inaxes and event.x>.7*N : # Left click on right side of image to set a random colormap - i_cmap_current=i_cmap - i_cmap=np.random.randint(len(cmaps)) - if i_cmap==i_cmap_current: - i_cmap-=1 - if i_cmap< 0 : i_cmap=len(cmaps)-1 - #print("color=",i_cmap) - myobj = plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) - plt.draw() - if event.button==3 and not event.inaxes and event.x>.7*N : # Right click on right side to set default mapolormap - i_cmap=49 - myobj = plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) - ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) - plt.draw() -fig.canvas.mpl_connect('button_press_event', zoom_on_point) -fig.canvas.mpl_connect('key_press_event', key_selector) -func(np.float64(x0),np.float64(y0),np.float64(side), np.int32(L),np.int32(power),drv.Out(M),block=(n_block,n_block,1),grid=(n_grid,n_grid,1)) -ax.set_title('Side=%.2e, x=%.2e, y=%.2e, %s, L=%d'%(side,x0,y0,cmaps[i_cmap],L)) -plt.imshow(M,origin='lower',cmap=cmaps[i_cmap]) -plt.show() + +fig.canvas.mpl_connect("button_press_event", zoom_on_point) +fig.canvas.mpl_connect("key_press_event", key_selector) +func(np.float64(x0), np.float64(y0), np.float64(side), np.int32(L), np.int32(power), drv.Out(M), block=(n_block, n_block, 1), 
grid=(n_grid, n_grid, 1)) +ax.set_title("Side=%.2e, x=%.2e, y=%.2e, %s, L=%d" % (side, x0, y0, cmaps[i_cmap], L)) +plt.imshow(M, origin="lower", cmap=cmaps[i_cmap]) +plt.show() diff --git a/examples/from-wiki/manhattan_distance_for_2D_array.py b/examples/from-wiki/manhattan_distance_for_2D_array.py index f92ab1e16d80c8d22ddf80e73ea919284f5f8435..6a71d7d956e4357064d14a53f6f847dc8cc480cb 100644 --- a/examples/from-wiki/manhattan_distance_for_2D_array.py +++ b/examples/from-wiki/manhattan_distance_for_2D_array.py @@ -1,11 +1,12 @@ -#!python +#!python +from __future__ import annotations import numpy -import pycuda.autoinit -import pycuda.driver as cuda +import pycuda.driver as cuda from pycuda.compiler import SourceModule + w = 7 mod = SourceModule(""" @@ -35,24 +36,24 @@ def diffusion(res): y = numpy.float32(1) z = numpy.float32(1) - height, width = numpy.int32(len(res)), numpy.int32(len(res[0])) diff_func( cuda.InOut(res), width, - height,x,y,z, - block=(w,w,1) + height, x, y, z, + block=(w, w, 1) ) + def run(res, step): diffusion(res) print(res) -res = numpy.array([[0 \ - for _ in range(0, w)]\ - for _ in range(0, w)], dtype='int32') + +res = numpy.array([[0 + for _ in range(0, w)] + for _ in range(0, w)], dtype="int32") print(res) run(res, 0) - diff --git a/examples/from-wiki/matrix_transpose.py b/examples/from-wiki/matrix_transpose.py index bc2428d76acfb51127215ee81d7ea5c7cbb8a5fd..a2e66d2543414d078b716deedddc619ad7b885a8 100644 --- a/examples/from-wiki/matrix_transpose.py +++ b/examples/from-wiki/matrix_transpose.py @@ -1,22 +1,23 @@ -#!python +#!python # Exercise 1 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises # Transposition of a matrix # by Hendrik Riedmann +from __future__ import annotations +import numpy +import numpy.linalg as la +import pycuda.autoinit import pycuda.driver as cuda import pycuda.gpuarray as gpuarray -import pycuda.autoinit from pycuda.compiler import SourceModule - -import numpy -import numpy.linalg as la - from pycuda.tools import context_dependent_memoize + block_size = 16 + @context_dependent_memoize def _get_transpose_kernel(): mod = SourceModule(""" @@ -46,12 +47,13 @@ def _get_transpose_kernel(): // Write transposed submatrix to global memory A_t[glob_idx_a_t] = A_shared[threadIdx.x][threadIdx.y]; } - """% {"block_size": block_size}) + """ % {"block_size": block_size}) func = mod.get_function("transpose") func.prepare("PPii") from pytools import Record + class TransposeKernelInfo(Record): pass return TransposeKernelInfo(func=func, @@ -60,7 +62,6 @@ def _get_transpose_kernel(): granularity=block_size) - def _get_big_block_transpose_kernel(): mod = SourceModule(""" #define BLOCK_SIZE %(block_size)d @@ -101,12 +102,13 @@ def _get_big_block_transpose_kernel(): A_t[glob_idx_a_t + A_T_BLOCK_STRIDE + BLOCK_SIZE] = A_shared[threadIdx.x + BLOCK_SIZE][threadIdx.y + BLOCK_SIZE]; } - """% {"block_size": block_size}) + """ % {"block_size": block_size}) func = mod.get_function("transpose") func.prepare("PPii") from pytools import Record + class TransposeKernelInfo(Record): pass return TransposeKernelInfo(func=func, @@ -115,8 +117,6 @@ def _get_big_block_transpose_kernel(): granularity=2*block_size) - - def _transpose(tgt, src): krnl = _get_transpose_kernel() @@ -130,8 +130,6 @@ def _transpose(tgt, src): tgt.gpudata, src.gpudata, w, h) - - def transpose(src): w, h = src.shape @@ -140,9 +138,6 @@ def transpose(src): return result - - - def check_transpose(): from pycuda.curandom import rand @@ -163,8 +158,6 @@ def check_transpose(): assert err_norm == 0, (size, 
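
# diffusion() in manhattan_distance_for_2D_array.py above relies on
# cuda.InOut(res), which bundles the host-to-device copy, the kernel argument,
# and the device-to-host copy into a single call. The wrapper in isolation
# (toy doubling kernel; assumes a context via pycuda.autoinit):

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void double_it(float *a)
{
    a[threadIdx.x] *= 2.0f;
}
""")
double_it = mod.get_function("double_it")

a = numpy.ones(32, dtype=numpy.float32)
double_it(cuda.InOut(a), block=(32, 1, 1))   # copy in, run, copy back out
assert (a == 2).all()
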
err_norm) - - def run_benchmark(): from pycuda.curandom import rand @@ -183,7 +176,7 @@ def run_benchmark(): warmup = 2 - for i in range(warmup): + for _i in range(warmup): _transpose(target, source) count = 10 @@ -191,7 +184,7 @@ def run_benchmark(): cuda.Context.synchronize() start.record() - for i in range(count): + for _i in range(count): _transpose(target, source) stop.record() @@ -206,21 +199,17 @@ def run_benchmark(): slow_sizes = [s for s, bw in zip(sizes, bandwidths) if bw < 40e9] print(("Sizes for which bandwidth was low:", slow_sizes)) print(("Ditto, mod 64:", [s % 64 for s in slow_sizes])) - from matplotlib.pyplot import semilogx, loglog, show, savefig, clf, xlabel, ylabel - xlabel('matrix size') - ylabel('bandwidth') + from matplotlib.pyplot import clf, loglog, savefig, semilogx, xlabel, ylabel + xlabel("matrix size") + ylabel("bandwidth") semilogx(sizes, bandwidths) savefig("transpose-bw.png") clf() - xlabel('matrix size') - ylabel('time') + xlabel("matrix size") + ylabel("time") loglog(sizes, times) savefig("transpose-times.png") - - -#check_transpose() +# check_transpose() run_benchmark() - - diff --git a/examples/from-wiki/matrixmul_simple.py b/examples/from-wiki/matrixmul_simple.py index 13e286d24fbaaa8c4152703989f9307ec45dc74e..4b37362ddbeb45aea24364112c986740aab3dee8 100644 --- a/examples/from-wiki/matrixmul_simple.py +++ b/examples/from-wiki/matrixmul_simple.py @@ -1,17 +1,18 @@ -#!python -#!/usr/bin/env python +#!python +# !/usr/bin/env python # -*- coding: utf-8 -*- -""" -Multiplies two square matrices together using a *single* block of threads and +""" +Multiplies two square matrices together using a *single* block of threads and global memory only. Each thread computes one element of the resulting matrix. """ +from __future__ import annotations import numpy as np -from pycuda import driver, compiler, gpuarray, tools # -- initialize the device -import pycuda.autoinit +from pycuda import compiler, gpuarray + kernel_code_template = """ __global__ void MatrixMulKernel(float *a, float *b, float *c) @@ -24,7 +25,7 @@ __global__ void MatrixMulKernel(float *a, float *b, float *c) // that is computed by the thread float Pvalue = 0; - // Each thread loads one row of M and one column of N, + // Each thread loads one row of M and one column of N, // to produce one element of P. 
for (int k = 0; k < %(MATRIX_SIZE)s; ++k) { float Aelement = a[ty * %(MATRIX_SIZE)s + k]; @@ -52,20 +53,20 @@ b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32) # compute reference on the CPU to verify GPU computation c_cpu = np.dot(a_cpu, b_cpu) -# transfer host (CPU) memory to device (GPU) memory -a_gpu = gpuarray.to_gpu(a_cpu) +# transfer host (CPU) memory to device (GPU) memory +a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu) # create empty gpu array for the result (C = A * B) c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32) -# get the kernel code from the template +# get the kernel code from the template # by specifying the constant MATRIX_SIZE kernel_code = kernel_code_template % { - 'MATRIX_SIZE': MATRIX_SIZE + "MATRIX_SIZE": MATRIX_SIZE } -# compile the kernel code +# compile the kernel code mod = compiler.SourceModule(kernel_code) # get the kernel function from the compiled module @@ -74,11 +75,11 @@ matrixmul = mod.get_function("MatrixMulKernel") # call the kernel on the card matrixmul( # inputs - a_gpu, b_gpu, + a_gpu, b_gpu, # output - c_gpu, + c_gpu, # (only one) block of MATRIX_SIZE x MATRIX_SIZE threads - block = (MATRIX_SIZE, MATRIX_SIZE, 1), + block=(MATRIX_SIZE, MATRIX_SIZE, 1), ) # print the results @@ -99,5 +100,3 @@ print("CPU-GPU difference:") print(c_cpu - c_gpu.get()) np.allclose(c_cpu, c_gpu.get()) - - diff --git a/examples/from-wiki/matrixmul_tiled.py b/examples/from-wiki/matrixmul_tiled.py index 225075ba96d7247f106afad166a83474e8d32596..0a50ac66be97487885dd1f47278a4da730795cb4 100644 --- a/examples/from-wiki/matrixmul_tiled.py +++ b/examples/from-wiki/matrixmul_tiled.py @@ -1,29 +1,30 @@ -#!python -#!/usr/bin/env python +#!python +# !/usr/bin/env python # -*- coding: utf-8 -*- -""" -Multiples two square matrices together using multiple blocks and shared memory. +""" +Multiples two square matrices together using multiple blocks and shared memory. Each thread block is assigned a "tile" of the resulting matrix and is responsible -for generating the elements in that tile. Each thread in a block computes one element +for generating the elements in that tile. Each thread in a block computes one element of the tile. 
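
# matrixmul_simple.py above specializes its CUDA source with printf-style
# "%(NAME)s" substitution before compiling. The same mechanism in miniature
# (toy kernel; assumes a context via pycuda.autoinit):

import pycuda.autoinit  # noqa: F401
from pycuda.compiler import SourceModule

template = """
__global__ void scale(float *x)
{
    x[threadIdx.x] *= %(FACTOR)s;
}
"""
scale = SourceModule(template % {"FACTOR": "2.0f"}).get_function("scale")
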
""" +from __future__ import annotations import numpy as np from numpy import linalg as la -from pycuda import driver, compiler, gpuarray, tools # -- initialize the device -import pycuda.autoinit +from pycuda import compiler, gpuarray + kernel_code_template = """ __global__ void MatrixMulKernel(float *A, float *B, float *C) { const uint wA = %(MATRIX_SIZE)s; - const uint wB = %(MATRIX_SIZE)s; - + const uint wB = %(MATRIX_SIZE)s; + // Block index const uint bx = blockIdx.x; const uint by = blockIdx.y; @@ -51,7 +52,7 @@ __global__ void MatrixMulKernel(float *A, float *B, float *C) // compute the block sub-matrix for (int a = aBegin, b = bBegin; a <= aEnd; - a += aStep, b += bStep) + a += aStep, b += bStep) { // Shared memory for the sub-matrix of A __shared__ float As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s]; @@ -87,7 +88,7 @@ __global__ void MatrixMulKernel(float *A, float *B, float *C) # define the (square) matrix size MATRIX_SIZE = 4 -# define size of blocks and tiles sub-matrix +# define size of blocks and tiles sub-matrix # (we assume that the block size is same as tile size) TILE_SIZE = 2 BLOCK_SIZE = TILE_SIZE @@ -99,18 +100,18 @@ b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32) # compute reference on the CPU to verify GPU computation c_cpu = np.dot(a_cpu, b_cpu) -# transfer host (CPU) memory to device (GPU) memory -a_gpu = gpuarray.to_gpu(a_cpu) +# transfer host (CPU) memory to device (GPU) memory +a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu) # create empty gpu array for the result (C = A * B) c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32) -# get the kernel code from the template +# get the kernel code from the template # by specifying the constants MATRIX_SIZE and BLOCK_SIZE -kernel_code = kernel_code_template % { - 'MATRIX_SIZE': MATRIX_SIZE, - 'BLOCK_SIZE': BLOCK_SIZE, +kernel_code = kernel_code_template % { + "MATRIX_SIZE": MATRIX_SIZE, + "BLOCK_SIZE": BLOCK_SIZE, } # compile the kernel code @@ -122,13 +123,13 @@ matrixmul = mod.get_function("MatrixMulKernel") # call the kernel on the card matrixmul( # inputs - a_gpu, b_gpu, + a_gpu, b_gpu, # output - c_gpu, + c_gpu, # grid of multiple blocks - grid = (MATRIX_SIZE // TILE_SIZE, MATRIX_SIZE // TILE_SIZE), + grid=(MATRIX_SIZE // TILE_SIZE, MATRIX_SIZE // TILE_SIZE), # block of multiple threads - block = (TILE_SIZE, TILE_SIZE, 1), + block=(TILE_SIZE, TILE_SIZE, 1), ) # print the results @@ -149,5 +150,3 @@ print("CPU-GPU difference:") print(c_cpu - c_gpu.get()) print("L2 norm:", la.norm(c_cpu - c_gpu.get())) np.allclose(c_cpu, c_gpu.get()) - - diff --git a/examples/from-wiki/measure_gpuarray_speed_random.py b/examples/from-wiki/measure_gpuarray_speed_random.py index fbd793d479d4f4836a346d9aa5352b2a83684ca4..517863408455e7651d9e003c6770dd3ee3d88821 100644 --- a/examples/from-wiki/measure_gpuarray_speed_random.py +++ b/examples/from-wiki/measure_gpuarray_speed_random.py @@ -1,16 +1,15 @@ -#!python -#! /usr/bin/env python +#!python +# ! /usr/bin/env python # DO NOT USE THIS AS A BENCHMARK. 
See # http://documen.tician.de/pycuda/array.html#pycuda.curandom.rand +from __future__ import annotations -import pycuda.autoinit -import pycuda.driver as drv -import pycuda.curandom as curandom import numpy -import numpy.linalg as la -from pytools import Table +from pytools import Table +import pycuda.curandom as curandom +import pycuda.driver as drv def main(): @@ -22,72 +21,67 @@ def main(): flopsCPU = [] timesCPU = [] - for power in range(10, 25): # 24 - size = 1< 20: - count = 100 - else: - count = 1000 + count = 100 if power > 20 else 1000 - #start timer + # start timer start = drv.Event() end = drv.Event() start.record() - #cuda operation which fills the array with random numbers - for i in range(count): + # cuda operation which fills the array with random numbers + for _i in range(count): curandom.rand((size, )) - #stop timer + # stop timer end.record() end.synchronize() - #calculate used time + # calculate used time secs = start.time_till(end)*1e-3 times.append(secs/count) flops.append(size) - #cpu operations which fills teh array with random data - a = numpy.array((size,), dtype=numpy.float32) + # cpu operations which fills teh array with random data + numpy.array((size,), dtype=numpy.float32) - #start timer + # start timer start = drv.Event() end = drv.Event() start.record() - #cpu operation which fills the array with random data - for i in range(count): + # cpu operation which fills the array with random data + for _i in range(count): numpy.random.rand(size).astype(numpy.float32) - #stop timer + # stop timer end.record() end.synchronize() - #calculate used time + # calculate used time secs = start.time_till(end)*1e-3 - #add results to variable + # add results to variable timesCPU.append(secs/count) flopsCPU.append(size) + # calculate pseudo flops + flops = [f/t for f, t in zip(flops, times)] + flopsCPU = [f/t for f, t in zip(flopsCPU, timesCPU)] - #calculate pseudo flops - flops = [f/t for f, t in zip(flops,times)] - flopsCPU = [f/t for f, t in zip(flopsCPU,timesCPU)] - - #print the data out + # print the data out tbl = Table() - tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU","Size/Time CPU","GPU vs CPU speedup")) - for s, t, f,tCpu,fCpu in zip(sizes, times, flops,timesCPU,flopsCPU): - tbl.add_row((s,t,f,tCpu,fCpu,f/fCpu)) + tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU", "Size/Time CPU", "GPU vs CPU speedup")) + for s, t, f, tCpu, fCpu in zip(sizes, times, flops, timesCPU, flopsCPU): + tbl.add_row((s, t, f, tCpu, fCpu, f/fCpu)) print(tbl) if __name__ == "__main__": main() - diff --git a/examples/from-wiki/multiple_threads.py b/examples/from-wiki/multiple_threads.py index a067766d6b0829dbca83527f6339130483194fef..ae51c8aab4e7524149f8941933bff06cf470fdf2 100644 --- a/examples/from-wiki/multiple_threads.py +++ b/examples/from-wiki/multiple_threads.py @@ -1,13 +1,15 @@ -#!python +#!python # Derived from a test case by Chris Heuser # Also see FAQ about PyCUDA and threads. 
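
# The GPUThread class below gives each Python thread its own CUDA context.
# The essential per-thread lifecycle is small -- a sketch, with cuda.init()
# called once in the main thread before any worker starts:

import pycuda.driver as cuda

def worker(device_index):
    dev = cuda.Device(device_index)
    ctx = dev.make_context()   # becomes the current context in this thread
    try:
        pass  # ... allocate memory and launch kernels here ...
    finally:
        ctx.pop()              # detach the context before the thread exits
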
+from __future__ import annotations +import threading + +import numpy -import pycuda import pycuda.driver as cuda from pycuda.compiler import SourceModule -import threading -import numpy + class GPUThread(threading.Thread): def __init__(self, number, some_array): @@ -30,6 +32,7 @@ class GPUThread(threading.Thread): del self.array_gpu del self.ctx + def test_kernel(input_array_gpu): mod = SourceModule(""" __global__ void f(float * out, float * in) @@ -40,18 +43,19 @@ def test_kernel(input_array_gpu): """) func = mod.get_function("f") - output_array = numpy.zeros((1,512)) + output_array = numpy.zeros((1, 512)) output_array_gpu = cuda.mem_alloc(output_array.nbytes) func(output_array_gpu, input_array_gpu, - block=(512,1,1)) + block=(512, 1, 1)) cuda.memcpy_dtoh(output_array, output_array_gpu) return output_array + cuda.init() -some_array = numpy.ones((1,512), dtype=numpy.float32) +some_array = numpy.ones((1, 512), dtype=numpy.float32) num = cuda.Device.count() gpu_thread_list = [] @@ -59,5 +63,3 @@ for i in range(num): gpu_thread = GPUThread(i, some_array) gpu_thread.start() gpu_thread_list.append(gpu_thread) - - diff --git a/examples/from-wiki/plot_random_data.py b/examples/from-wiki/plot_random_data.py index 2e0118c3fe59e78d53ca77d85abcaf89ba1e2495..9f0ef4195b2dc1d0f2dc0703b31f3fb5509d419e 100644 --- a/examples/from-wiki/plot_random_data.py +++ b/examples/from-wiki/plot_random_data.py @@ -1,24 +1,25 @@ -#!python +#!python # simple module to show the plotting of random data +from __future__ import annotations -import pycuda.autoinit import pycuda.curandom as curandom + size = 1000 a = curandom.rand((size,)).get() from matplotlib.pylab import * + + subplot(211) plot(a) grid(True) -ylabel('plot - gpu') +ylabel("plot - gpu") subplot(212) hist(a, 100) grid(True) -ylabel('histogram - gpu') - -#and save it -savefig('plot-random-data') - +ylabel("histogram - gpu") +# and save it +savefig("plot-random-data") diff --git a/examples/from-wiki/rotate.py b/examples/from-wiki/rotate.py index 0adc26ba8f480f4571052d9693046fe26d811a64..1c89ba010d74d946fb3af500529d40b66362001b 100644 --- a/examples/from-wiki/rotate.py +++ b/examples/from-wiki/rotate.py @@ -1,23 +1,27 @@ -#!python -#!/usr/bin/env python -tt +#!python +# !/usr/bin/env python -tt # encoding: utf-8 # # Created by Holger Rapp on 2009-03-11. # HolgerRapp@gmx.net # +from __future__ import annotations + +from math import cos, pi, sin -import pycuda.driver as cuda -import pycuda.compiler -import pycuda.autoinit import numpy -from math import pi,cos,sin + +import pycuda.autoinit +import pycuda.compiler +import pycuda.driver as cuda + _rotation_kernel_source = """ texture tex; __global__ void copy_texture_kernel( - const float resize_val, - const float alpha, + const float resize_val, + const float alpha, unsigned short oldiw, unsigned short oldih, unsigned short newiw, unsigned short newih, unsigned char* data) { @@ -25,38 +29,39 @@ __global__ void copy_texture_kernel( // calculate pixel idx unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; - + // We might be outside the reachable pixels. Don't do anything if( (x >= newiw) || (y >= newih) ) return; - + // calculate offset into destination array unsigned int didx = y * newiw + x; - + // calculate offset into source array (be aware of rotation and scaling) float xmiddle = (x-newiw/2.) / resize_val; float ymiddle = (y-newih/2.) / resize_val; float sx = ( xmiddle*cos(alpha)+ymiddle*sin(alpha) + oldiw/2.) 
; float sy = ( -xmiddle*sin(alpha)+ymiddle*cos(alpha) + oldih/2.); - - if( (sx < 0) || (sx >= oldiw) || (sy < 0) || (sy >= oldih) ) { - data[didx] = 255; + + if( (sx < 0) || (sx >= oldiw) || (sy < 0) || (sy >= oldih) ) { + data[didx] = 255; return; } data[didx] = tex2D(tex, sx, sy); } """ -mod_copy_texture=pycuda.compiler.SourceModule( _rotation_kernel_source ) +mod_copy_texture = pycuda.compiler.SourceModule(_rotation_kernel_source) copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel") texref = mod_copy_texture.get_texref("tex") -def rotate_image( a, resize = 1.5, angle = 20., interpolation = "linear", blocks = (16,16,1) ): + +def rotate_image(a, resize=1.5, angle=20., interpolation="linear", blocks=(16, 16, 1)): """ Rotates the array. The new array has the new size and centers the picture in the middle. - + a - array (2-dim) resize - new_image w/old_image w angle - degrees to rotate the image @@ -67,72 +72,74 @@ def rotate_image( a, resize = 1.5, angle = 20., interpolation = "linear", blocks returns: a new array with dtype=uint8 containing the rotated image """ angle = angle/180. *pi - - # Convert this image to float. Unsigned int texture gave + + # Convert this image to float. Unsigned int texture gave # strange results for me. This conversion is slow though :( a = a.astype("float32") # Calculate the dimensions of the new image - calc_x = lambda x_y: (x_y[0]*a.shape[1]/2.*cos(angle)-x_y[1]*a.shape[0]/2.*sin(angle)) - calc_y = lambda x_y1: (x_y1[0]*a.shape[1]/2.*sin(angle)+x_y1[1]*a.shape[0]/2.*cos(angle)) + def calc_x(x_y): + return (x_y[0]*a.shape[1]/2.*cos(angle)-x_y[1]*a.shape[0]/2.*sin(angle)) - xs = [ calc_x(p) for p in [ (-1.,-1.),(1.,-1.),(1.,1.),(-1.,1.) ] ] - ys = [ calc_y(p) for p in [ (-1.,-1.),(1.,-1.),(1.,1.),(-1.,1.) ] ] + def calc_y(x_y1): + return (x_y1[0]*a.shape[1]/2.*sin(angle)+x_y1[1]*a.shape[0]/2.*cos(angle)) + + xs = [calc_x(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]] + ys = [calc_y(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]] new_image_dim = ( int(numpy.ceil(max(ys)-min(ys))*resize), int(numpy.ceil(max(xs)-min(xs))*resize), ) - + # Now generate the cuda texture cuda.matrix_to_texref(a, texref, order="C") - + # We could set the next if we wanted to address the image # in normalized coordinates ( 0 <= coordinate < 1.) # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) if interpolation == "linear": texref.set_filter_mode(cuda.filter_mode.LINEAR) - # Calculate the gridsize. This is entirely given by the size of our image. + # Calculate the gridsize. This is entirely given by the size of our image. 
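
# Heads-up on the gridsize computation just below: under Python 3 the "/" in
# gridx/gridy produces floats, and kernel grid dimensions must be ints (note
# also that the x branch tests "% blocks[0] == 1" while the y branch tests
# "== 0"). Integer ceiling division sidesteps both -- a sketch:

def ceil_div(a, b):
    """Smallest integer >= a/b, using only integer arithmetic."""
    return -(-a // b)

# e.g. grid = (ceil_div(new_image_dim[0], blocks[0]),
#              ceil_div(new_image_dim[1], blocks[1]))
assert ceil_div(770, 16) == 49
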
gridx = new_image_dim[0]/blocks[0] if \ - new_image_dim[0]%blocks[0]==1 else new_image_dim[0]/blocks[0] +1 + new_image_dim[0] % blocks[0] == 1 else new_image_dim[0]/blocks[0] +1 gridy = new_image_dim[1]/blocks[1] if \ - new_image_dim[1]%blocks[1]==0 else new_image_dim[1]/blocks[1] +1 + new_image_dim[1] % blocks[1] == 0 else new_image_dim[1]/blocks[1] +1 # Get the output image - output = numpy.zeros(new_image_dim,dtype="uint8") - + output = numpy.zeros(new_image_dim, dtype="uint8") + # Call the kernel copy_texture_func( numpy.float32(resize), numpy.float32(angle), numpy.uint16(a.shape[1]), numpy.uint16(a.shape[0]), numpy.uint16(new_image_dim[1]), numpy.uint16(new_image_dim[0]), - cuda.Out(output),texrefs=[texref],block=blocks,grid=(gridx,gridy)) - + cuda.Out(output), texrefs=[texref], block=blocks, grid=(gridx, gridy)) + return output -if __name__ == '__main__': - import Image + +if __name__ == "__main__": import sys - - def main( ): + + import Image + + def main(): if len(sys.argv) != 2: print("You should really read the source...\n\nUsage: rotate.py \n") sys.exit(-1) # Open, convert to grayscale, convert to numpy array img = Image.open(sys.argv[1]).convert("L") - i = numpy.fromstring(img.tostring(),dtype="uint8").reshape(img.size[1],img.size[0]) - + i = numpy.fromstring(img.tostring(), dtype="uint8").reshape(img.size[1], img.size[0]) + # Rotate & convert back to PIL Image irot = rotate_image(i) - rotimg = Image.fromarray(irot,mode="L") + rotimg = Image.fromarray(irot, mode="L") # Save and display rotimg.save("rotated.png") rotimg.show() - - main() - - + main() diff --git a/examples/from-wiki/select_to_list.py b/examples/from-wiki/select_to_list.py index b221cccd43ffbca7c7e75c3128d16e3fff98c781..ff3d34c54e3ee4ef6e41da4106582b31cfedfb88 100644 --- a/examples/from-wiki/select_to_list.py +++ b/examples/from-wiki/select_to_list.py @@ -1,14 +1,16 @@ -#!python +#!python # Exercise 2 from http://webapp.dam.brown.edu/wiki/SciComp/CudaExercises # Generate an array of random numbers between 0 and 1 # List the indices of those numbers that are greater than a given limit +from __future__ import annotations + +import numpy import pycuda.driver as cuda -import pycuda.autoinit import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule -import numpy + # Define block size and number of elements per thread block_size = 512 @@ -106,7 +108,7 @@ grid = (amount // multiple_block_size, 1) # Warmup warmup = 2 -for i in range(warmup): +for _i in range(warmup): func.prepared_call(grid, block, a_gpu.gpudata, selec_gpu.gpudata, limit, counter_gpu.gpudata) counter_gpu = gpuarray.zeros(1, dtype=numpy.int32) @@ -119,7 +121,7 @@ stop = cuda.Event() cuda.Context.synchronize() start.record() count = 10 -for i in range(count): +for _i in range(count): func.prepared_call(grid, block, a_gpu.gpudata, selec_gpu.gpudata, limit, counter_gpu.gpudata) counter_gpu = gpuarray.zeros(1, dtype=numpy.int32) @@ -136,7 +138,6 @@ elems_in_selec = len(numpy.nonzero(selec >= 0)) elapsed_seconds = stop.time_since(start) * 1e-3 print("mem bw:", (a.nbytes + elems_in_selec * 4) / elapsed_seconds / 1e9 * count) -filtered_set = sorted(list(item for item in selec if item != -1)) -reference_set = sorted(list(i for i, x in enumerate(a) if x >= limit)) +filtered_set = sorted(item for item in selec if item != -1) +reference_set = sorted(i for i, x in enumerate(a) if x >= limit) assert filtered_set == reference_set - diff --git a/examples/from-wiki/simple_rgb2gray.py b/examples/from-wiki/simple_rgb2gray.py index 
9d4b155723adc798a70f5bb61c40d23b11d80bc8..125808100806e77cc5e54c54568359254fff681f 100644 --- a/examples/from-wiki/simple_rgb2gray.py +++ b/examples/from-wiki/simple_rgb2gray.py @@ -1,17 +1,18 @@ -#!python +#!python +from __future__ import annotations -__author__ = 'ashwin' -import pycuda.driver as drv -import pycuda.tools -import pycuda.autoinit -from pycuda.compiler import SourceModule +__author__ = "ashwin" + +import matplotlib.pyplot as p import numpy as np import scipy.misc as scm -import matplotlib.pyplot as p -mod = SourceModule \ - ( +import pycuda.driver as drv +from pycuda.compiler import SourceModule + + +mod = SourceModule( """ #include #define INDEX(a, b) a*256+b @@ -32,21 +33,18 @@ dest[INDEX(a, b)] = (0.299*r_img[INDEX(a, b)]+0.587*g_img[INDEX(a, b)]+0.114*b_i """ ) -a = scm.imread('Lenna.png').astype(np.float32) +a = scm.imread("Lenna.png").astype(np.float32) print(a) -r_img = a[:, :, 0].reshape(65536, order='F') -g_img = a[:, :, 1].reshape(65536, order='F') -b_img = a[:, :, 2].reshape(65536, order='F') -dest=r_img +r_img = a[:, :, 0].reshape(65536, order="F") +g_img = a[:, :, 1].reshape(65536, order="F") +b_img = a[:, :, 2].reshape(65536, order="F") +dest = r_img print(dest) rgb2gray = mod.get_function("rgb2gray") -rgb2gray(drv.Out(dest), drv.In(r_img), drv.In(g_img),drv.In(b_img),block=(1024, 1, 1), grid=(64, 1, 1)) +rgb2gray(drv.Out(dest), drv.In(r_img), drv.In(g_img), drv.In(b_img), block=(1024, 1, 1), grid=(64, 1, 1)) -dest=np.reshape(dest,(256,256), order='F') +dest = np.reshape(dest, (256, 256), order="F") p.imshow(dest) p.show() - - - diff --git a/examples/from-wiki/simple_speed_test.py b/examples/from-wiki/simple_speed_test.py index aadefd2bb6ad791a9858f270a1ea148ec51976f3..6f603628d60dbf92e80d0472b113dd9335e94b55 100644 --- a/examples/from-wiki/simple_speed_test.py +++ b/examples/from-wiki/simple_speed_test.py @@ -1,4 +1,4 @@ -#!python +#!python # SimpleSpeedTest.py # Very simple speed testing code @@ -24,7 +24,7 @@ # CPU time and first three results: # 32.933660s, [ 0.005477 0.005477 0.005477] # -# +# # Using Win 7 x64, GTX 470 GPU, X5650 Xeon, # Driver v301.42, CUDA 4.2, Python 2.7 x64, # PyCuda 2012.1 gave the following results: @@ -41,17 +41,19 @@ # 8.421861s, [ 0.005477 0.005477 0.005477] # CPU time measured using : # 5.905661s, [ 0.005477 0.005477 0.005477] +from __future__ import annotations +import numpy +import pycuda.autoinit +import pycuda.cumath import pycuda.driver as drv +import pycuda.gpuarray as gpuarray import pycuda.tools -import pycuda.autoinit -import numpy from pycuda.compiler import SourceModule -import pycuda.gpuarray as gpuarray -import pycuda.cumath from pycuda.elementwise import ElementwiseKernel + blocks = 64 block_size = 128 nbr_values = blocks * block_size @@ -89,9 +91,9 @@ a = numpy.ones(nbr_values).astype(numpy.float32) # create a destination array that will receive the result dest = numpy.zeros_like(a) -start.record() # start timing -gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks,1), block=(block_size,1,1) ) -end.record() # end timing +start.record() # start timing +gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks, 1), block=(block_size, 1, 1)) +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 @@ -109,9 +111,9 @@ kernel = ElementwiseKernel( a = numpy.ones(nbr_values).astype(numpy.float32) a_gpu = gpuarray.to_gpu(a) -start.record() # start timing -kernel(a_gpu, numpy.int(n_iter)) -end.record() # end timing +start.record() # start timing 
+kernel(a_gpu, int(n_iter)) +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 @@ -129,10 +131,10 @@ kernel = ElementwiseKernel( a = numpy.ones(nbr_values).astype(numpy.float32) a_gpu = gpuarray.to_gpu(a) -start.record() # start timing -for i in range(n_iter): +start.record() # start timing +for _i in range(n_iter): kernel(a_gpu) -end.record() # end timing +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 @@ -146,10 +148,10 @@ print("%fs, %s" % (secs, str(a_gpu.get()[:3]))) a = numpy.ones(nbr_values).astype(numpy.float32) a_gpu = gpuarray.to_gpu(a) -start.record() # start timing -for i in range(n_iter): +start.record() # start timing +for _i in range(n_iter): a_gpu = pycuda.cumath.sin(a_gpu) -end.record() # end timing +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 @@ -162,16 +164,15 @@ print("%fs, %s" % (secs, str(a_gpu.get()[:3]))) # use numpy the calculate the result on the CPU for reference a = numpy.ones(nbr_values).astype(numpy.float32) -start.record() # start timing +start.record() # start timing start.synchronize() -for i in range(n_iter): +for _i in range(n_iter): a = numpy.sin(a) -end.record() # end timing +end.record() # end timing # calculate the run length end.synchronize() secs = start.time_till(end)*1e-3 print("CPU time and first three results:") print("%fs, %s" % (secs, str(a[:3]))) - diff --git a/examples/from-wiki/sobel_filter.py b/examples/from-wiki/sobel_filter.py index f5ad622ffce30ea892bfaeec98b7d6f33a37de32..b3c51cce0d42ce7731d4a44ed5b5b04cd453d471 100644 --- a/examples/from-wiki/sobel_filter.py +++ b/examples/from-wiki/sobel_filter.py @@ -1,6 +1,6 @@ -#!python -#!/usr/bin/env python -#-*- coding: utf-8 -*- +#!python +# !/usr/bin/env python +# -*- coding: utf-8 -*- # # Requires PyCuda, PyOpenGL, and Pil # MAKE SURE YOU HAVE AN UPDATED VERSION OF THESE PACKAGES!! @@ -20,19 +20,27 @@ # this software and related documentation outside the terms of the EULA # is strictly prohibited. # +from __future__ import annotations +import os +import sys +import time + +import Image +import numpy as np from OpenGL.GL import * -from OpenGL.GLUT import * -from OpenGL.GLU import * from OpenGL.GL.ARB.vertex_buffer_object import * -import numpy as np, Image -import sys, time, os +from OpenGL.GLU import * +from OpenGL.GLUT import * + +import pycuda import pycuda.driver as cuda_driver import pycuda.gl as cuda_gl -import pycuda -#import pycuda.gl.autoinit + +# import pycuda.gl.autoinit from pycuda.compiler import SourceModule + imWidth = 0 imHeight = 0 wWidth = 0 @@ -52,6 +60,7 @@ fpsLimit = 8 timer = 0.0 ver2011 = False + def copy2D_array_to_device(dst, src, type_sz, width, height): copy = cuda_driver.Memcpy2D() copy.set_src_array(src) @@ -60,6 +69,7 @@ def copy2D_array_to_device(dst, src, type_sz, width, height): copy.dst_pitch = copy.src_pitch = copy.width_in_bytes = width*type_sz copy(aligned=True) + def computeFPS(): global frameCount, fpsCount, fpsLimit, timer frameCount += 1 @@ -69,6 +79,7 @@ def computeFPS(): glutSetWindowTitle("Cuda Edge Detection: %f fps" % ifps) fpsCount = 0 + def sobelFilter(odata, iw, ih): global array, pixels, mode, scale if mode == 3: @@ -369,27 +380,28 @@ def sobelFilter(odata, iw, ih): # fixed BlockSize Launch RADIUS = 1 threads = (16, 4, 1) - BlockWidth = 80 # Do not change! 
- blocks = (iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)), - ih/threads[1]+(0!=ih%threads[1]) ) - SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f); - sharedMem = SharedPitch*(threads[1]+2*RADIUS); + BlockWidth = 80 # Do not change! + blocks = (iw/(4*BlockWidth)+(iw % (4*BlockWidth) != 0), + ih/threads[1]+(ih % threads[1] != 0)) + SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f) + sharedMem = SharedPitch*(threads[1]+2*RADIUS) iw = iw & ~3 - cuda_function(np.intp(odata), np.uint16(iw), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref],block=threads, grid=blocks, shared=sharedMem) + cuda_function(np.intp(odata), np.uint16(iw), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref], block=threads, grid=blocks, shared=sharedMem) elif mode == 2: # variable BlockSize launch RADIUS = 1 threads = (16, 4, 1) - BlockWidth = 80 # Change only with divisible by 16 values! - blocks = (iw/(4*BlockWidth)+(0!=iw%(4*BlockWidth)), - ih/threads[1]+(0!=ih%threads[1]) ) - SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f); - sharedMem = SharedPitch*(threads[1]+2*RADIUS); + BlockWidth = 80 # Change only with divisible by 16 values! + blocks = (iw/(4*BlockWidth)+(iw % (4*BlockWidth) != 0), + ih/threads[1]+(ih % threads[1] != 0)) + SharedPitch = ~0x3f & (4*(BlockWidth+2*RADIUS)+0x3f) + sharedMem = SharedPitch*(threads[1]+2*RADIUS) iw = iw & ~3 - cuda_function(np.intp(odata), np.uint16(iw), np.int16(BlockWidth), np.int16(SharedPitch), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref],block=threads, grid=blocks, shared=sharedMem) + cuda_function(np.intp(odata), np.uint16(iw), np.int16(BlockWidth), np.int16(SharedPitch), np.int16(iw), np.int16(ih), np.float32(scale), texrefs=[texref], block=threads, grid=blocks, shared=sharedMem) else: BlockWidth = 384 - cuda_function(np.intp(odata), np.uint32(iw), np.int32(iw), np.int32(ih), np.float32(scale), texrefs=[texref],block=(BlockWidth,1,1),grid=(ih,1)) + cuda_function(np.intp(odata), np.uint32(iw), np.int32(iw), np.int32(ih), np.float32(scale), texrefs=[texref], block=(BlockWidth, 1, 1), grid=(ih, 1)) + def initGL(): global wWidth, wHeight, wName @@ -397,72 +409,72 @@ def initGL(): glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA) glutInitWindowSize(wWidth, wHeight) glutCreateWindow(wName) - import pycuda.gl.autoinit + def loadImage(fn=None): global pixels, imWidth, imHeight, wWidth, wHeight try: - im = Image.open(fn) # Open the image - except IOError: + im = Image.open(fn) # Open the image + except OSError: print("Usage:", os.path.basename(sys.argv[0]), "[IMAGE=defaultimage.jpg]") print("Can't open", fn) sys.exit(1) - imWidth, imHeight = im.size # Window size is set to image size + imWidth, imHeight = im.size # Window size is set to image size wWidth, wHeight = im.size - im.draft("L", im.size) # L-flag is for Luminance - pixels = np.fromstring(im.tostring(), dtype=np.uint8) # Got the array - pixels.resize((imHeight, imWidth)) # Resize to 2d array + im.draft("L", im.size) # L-flag is for Luminance + pixels = np.fromstring(im.tostring(), dtype=np.uint8) # Got the array + pixels.resize((imHeight, imWidth)) # Resize to 2d array print("Reading image:", fn, "size:", imWidth, "x", imHeight) + def initData(fn=None): global pixels, array, pbo_buffer, cuda_pbo_resource, imWidth, imHeight, texid # Cuda array initialization - array = cuda_driver.matrix_to_array(pixels, "C") # C-style instead of Fortran-style: row-major + array = cuda_driver.matrix_to_array(pixels, "C") # C-style instead of Fortran-style: row-major - pixels.fill(0) # Resetting the 
array to 0 + pixels.fill(0) # Resetting the array to 0 - pbo_buffer = glGenBuffers(1) # generate 1 buffer reference - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # binding to this buffer - glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth*imHeight, pixels, GL_STREAM_DRAW) # Allocate the buffer - bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) # Check allocated buffer size - assert(bsize == imWidth*imHeight) - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind + pbo_buffer = glGenBuffers(1) # generate 1 buffer reference + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # binding to this buffer + glBufferData(GL_PIXEL_UNPACK_BUFFER, imWidth*imHeight, pixels, GL_STREAM_DRAW) # Allocate the buffer + bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) # Check allocated buffer size + assert (bsize == imWidth*imHeight) + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind if ver2011: cuda_pbo_resource = pycuda.gl.RegisteredBuffer(int(pbo_buffer), cuda_gl.graphics_map_flags.WRITE_DISCARD) else: - cuda_pbo_resource = cuda_gl.BufferObject(int(pbo_buffer)) # Mapping GLBuffer to cuda_resource - + cuda_pbo_resource = cuda_gl.BufferObject(int(pbo_buffer)) # Mapping GLBuffer to cuda_resource - glGenTextures(1, texid); # generate 1 texture reference - glBindTexture(GL_TEXTURE_2D, texid); # binding to this texture - glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); # Allocate the texture - glBindTexture(GL_TEXTURE_2D, 0) # Unbind + glGenTextures(1, texid) # generate 1 texture reference + glBindTexture(GL_TEXTURE_2D, texid) # binding to this texture + glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None) # Allocate the texture + glBindTexture(GL_TEXTURE_2D, 0) # Unbind - glPixelStorei(GL_UNPACK_ALIGNMENT, 1) # 1-byte row alignment - glPixelStorei(GL_PACK_ALIGNMENT, 1) # 1-byte row alignment + glPixelStorei(GL_UNPACK_ALIGNMENT, 1) # 1-byte row alignment + glPixelStorei(GL_PACK_ALIGNMENT, 1) # 1-byte row alignment def display(): global cuda_pbo_resource, pbo_buffer, texid, imWidth, imHeight, timer - timer = time.time() # Starting timer - mapping_obj = cuda_pbo_resource.map() # Map the GlBuffer + timer = time.time() # Starting timer + mapping_obj = cuda_pbo_resource.map() # Map the GlBuffer if ver2011: - data, sz = mapping_obj.device_ptr_and_size() # Got the CUDA pointer to GlBuffer + data, _sz = mapping_obj.device_ptr_and_size() # Got the CUDA pointer to GlBuffer else: data = mapping_obj.device_ptr() - sobelFilter(data, imWidth, imHeight) # Writing to "data" - mapping_obj.unmap() # Unmap the GlBuffer + sobelFilter(data, imWidth, imHeight) # Writing to "data" + mapping_obj.unmap() # Unmap the GlBuffer - glClear(GL_COLOR_BUFFER_BIT) # Clear + glClear(GL_COLOR_BUFFER_BIT) # Clear glBindTexture(GL_TEXTURE_2D, texid) glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_buffer) # Copyng from buffer to texture glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, imWidth, imHeight, GL_LUMINANCE, GL_UNSIGNED_BYTE, None) - #glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind + # glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE, imWidth, imHeight, 0, GL_LUMINANCE, GL_UNSIGNED_BYTE, None); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) # Unbind glDisable(GL_DEPTH_TEST) glEnable(GL_TEXTURE_2D) @@ -487,6 +499,7 @@ def display(): computeFPS() glutPostRedisplay() + def reshape(x, y): glViewport(0, 0, x, y) 
glMatrixMode(GL_PROJECTION) @@ -496,32 +509,35 @@ glLoadIdentity() glutPostRedisplay() + def keyboard(key, x=0, y=0): global mode, scale - if key=="q": + if key == "q": sys.exit(0) - elif key=="I" or key=="i": + elif key == "I" or key == "i": mode = 0 - elif key=="T" or key=="t": + elif key == "T" or key == "t": mode = 1 - elif key=="S" or key=="s": + elif key == "S" or key == "s": mode = 2 - elif key=="D" or key=="d": + elif key == "D" or key == "d": mode = 3 elif key == "-": scale -= 0.1 elif key == "=": scale += 0.1 + def idle(): glutPostRedisplay() + def main(argv): fn = "defaultimage.jpg" if len(argv) > 1: fn = argv[1] - loadImage(fn) # Loading the image + loadImage(fn) # Loading the image initGL() initData(fn) @@ -540,11 +556,10 @@ def main(argv): glutKeyboardFunc(keyboard) glutReshapeFunc(reshape) glutIdleFunc(idle) - glutMainLoop(); + glutMainLoop() + if __name__ == "__main__": if pycuda.VERSION[0] >= 2011: ver2011 = True main(sys.argv) - - diff --git a/examples/from-wiki/sparse_solve.py b/examples/from-wiki/sparse_solve.py index 243579e0ca21fa757b00215825835c81e9c2975f..38505249a1a613fb3513782647f03a371543680d 100644 --- a/examples/from-wiki/sparse_solve.py +++ b/examples/from-wiki/sparse_solve.py @@ -1,11 +1,11 @@ -#!python -import pycuda.autoinit -import pycuda.driver as drv -import pycuda.gpuarray as gpuarray +#!python +from __future__ import annotations + import numpy import numpy.linalg as la - +import pycuda.driver as drv +import pycuda.gpuarray as gpuarray def main_cg(): @@ -41,7 +41,7 @@ def main_cg(): from pycuda.sparse.cg import solve_pkt_with_cg print("start solve") - for i in range(4): + for _i in range(4): start = drv.Event() stop = drv.Event() start.record() @@ -64,11 +64,11 @@ def main_cg(): est_flops += csr_mat.shape[0] * it_count print("residual norm: %g" % (la.norm(csr_mat*res - rhs)/la.norm(rhs))) - print(("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, " + print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, " "%g gflops/s" % ( csr_mat.shape[0], elapsed, it_count, res_count, it_count/elapsed, - est_flops/elapsed/1e9))) + est_flops/elapsed/1e9)) # TODO: mixed precision # TODO: benchmark @@ -76,12 +76,6 @@ def main_cg(): dev_pool.stop_holding() - - - if __name__ == "__main__": print("starting...") main_cg() - - - diff --git a/examples/from-wiki/threads_and_blocks.py b/examples/from-wiki/threads_and_blocks.py index caa851f30a3424f42ad0b99da77abcb0e926d812..b94caa0b6f913d60d74ec74ab0d95c5e3b5b9334 100644 --- a/examples/from-wiki/threads_and_blocks.py +++ b/examples/from-wiki/threads_and_blocks.py @@ -1,8 +1,9 @@ -#!python -import pycuda.driver as cuda -import pycuda.autoinit +#!python +from __future__ import annotations + from pycuda.compiler import SourceModule + mod = SourceModule(""" #include <stdio.h> @@ -13,5 +14,4 @@ mod = SourceModule(""" """) func = mod.get_function("say_hi") -func(block=(4,4,1),grid=(2,2,1)) - +func(block=(4, 4, 1), grid=(2, 2, 1)) diff --git a/examples/from-wiki/thrust_interop.py b/examples/from-wiki/thrust_interop.py index 78d877a5291e415b8781fb650084ea25e57df466..1f19a0967cf0ab3b1232c17f2e3e61f838efd8f5 100644 --- a/examples/from-wiki/thrust_interop.py +++ b/examples/from-wiki/thrust_interop.py @@ -1,93 +1,94 @@ -#!python +#!python +from __future__ import annotations -import pycuda -import pycuda.autoinit -import pycuda.gpuarray as gpuarray import numpy as np - from cgen import * from codepy.bpl import BoostPythonModule from codepy.cuda import CudaModule -#Make a host_module, compiled for CPU +import
pycuda.gpuarray as gpuarray + + +# Make a host_module, compiled for CPU host_mod = BoostPythonModule() -#Make a device module, compiled with NVCC +# Make a device module, compiled with NVCC nvcc_mod = CudaModule(host_mod) -#Describe device module code -#NVCC includes +# Describe device module code +# NVCC includes nvcc_includes = [ - 'thrust/sort.h', - 'thrust/device_vector.h', - 'cuda.h', + "thrust/sort.h", + "thrust/device_vector.h", + "cuda.h", ] -#Add includes to module +# Add includes to module nvcc_mod.add_to_preamble([Include(x) for x in nvcc_includes]) -#NVCC function +# NVCC function nvcc_function = FunctionBody( - FunctionDeclaration(Value('void', 'my_sort'), - [Value('CUdeviceptr', 'input_ptr'), - Value('int', 'length')]), - Block([Statement('thrust::device_ptr<float> thrust_ptr((float*)input_ptr)'), - Statement('thrust::sort(thrust_ptr, thrust_ptr+length)')])) - -#Add declaration to nvcc_mod -#Adds declaration to host_mod as well + FunctionDeclaration(Value("void", "my_sort"), + [Value("CUdeviceptr", "input_ptr"), + Value("int", "length")]), + Block([Statement("thrust::device_ptr<float> thrust_ptr((float*)input_ptr)"), + Statement("thrust::sort(thrust_ptr, thrust_ptr+length)")])) + +# Add declaration to nvcc_mod +# Adds declaration to host_mod as well nvcc_mod.add_function(nvcc_function) host_includes = [ - 'boost/python/extract.hpp', + "boost/python/extract.hpp", ] -#Add host includes to module +# Add host includes to module host_mod.add_to_preamble([Include(x) for x in host_includes]) host_namespaces = [ - 'using namespace boost::python', + "using namespace boost::python", ] -#Add BPL using statement +# Add BPL using statement host_mod.add_to_preamble([Statement(x) for x in host_namespaces]) host_statements = [ - #Extract information from PyCUDA GPUArray - #Get length + # Extract information from PyCUDA GPUArray + # Get length 'tuple shape = extract<tuple>(gpu_array.attr("shape"))', - 'int length = extract<int>(shape[0])', - #Get data pointer + "int length = extract<int>(shape[0])", + # Get data pointer 'CUdeviceptr ptr = extract<CUdeviceptr>(gpu_array.attr("ptr"))', - #Call Thrust routine, compiled into the CudaModule - 'my_sort(ptr, length)', - #Return result - 'return gpu_array', + # Call Thrust routine, compiled into the CudaModule + "my_sort(ptr, length)", + # Return result + "return gpu_array", ] host_mod.add_function( FunctionBody( - FunctionDeclaration(Value('object', 'host_entry'), - [Value('object', 'gpu_array')]), + FunctionDeclaration(Value("object", "host_entry"), + [Value("object", "gpu_array")]), Block([Statement(x) for x in host_statements]))) -#Print out generated code, to see what we're actually compiling +# Print out generated code, to see what we're actually compiling print("---------------------- Host code ----------------------") -print((host_mod.generate())) +print(host_mod.generate()) print("--------------------- Device code ---------------------") -print((nvcc_mod.generate())) +print(nvcc_mod.generate()) print("-------------------------------------------------------") +# Compile modules +import codepy.jit +import codepy.toolchain + -#Compile modules -import codepy.jit, codepy.toolchain gcc_toolchain = codepy.toolchain.guess_toolchain() nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain() module = nvcc_mod.compile(gcc_toolchain, nvcc_toolchain, debug=True) - length = 100 a = np.array(np.random.rand(length), dtype=np.float32) print("---------------------- Unsorted -----------------------") @@ -98,4 +99,3 @@ c = module.host_entry(b) print("----------------------- Sorted
------------------------") print(c.get()) print("-------------------------------------------------------") - diff --git a/examples/from-wiki/using_printf.py b/examples/from-wiki/using_printf.py index c12cfa42717c16c193af9989995c15e9d1f78674..797c58a63fa7d8d4d8acd9515fa10aaa379d1b78 100644 --- a/examples/from-wiki/using_printf.py +++ b/examples/from-wiki/using_printf.py @@ -1,8 +1,9 @@ -#!python -import pycuda.driver as cuda -import pycuda.autoinit +#!python +from __future__ import annotations + from pycuda.compiler import SourceModule + mod = SourceModule(""" #include <stdio.h> @@ -13,5 +14,4 @@ mod = SourceModule(""" """) func = mod.get_function("say_hi") -func(block=(4,4,1)) - +func(block=(4, 4, 1)) diff --git a/examples/hello_gpu.py b/examples/hello_gpu.py index 23c51adba399ebf5b7b2ccecd2c295df17f496a4..d19fb09f4c33d9de7fd46da55ab7d3308140b81f 100644 --- a/examples/hello_gpu.py +++ b/examples/hello_gpu.py @@ -1,10 +1,11 @@ -import pycuda.driver as drv -import pycuda.tools -import pycuda.autoinit +from __future__ import annotations + import numpy -import numpy.linalg as la + +import pycuda.driver as drv from pycuda.compiler import SourceModule + mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { @@ -21,6 +22,6 @@ b = numpy.random.randn(400).astype(numpy.float32) dest = numpy.zeros_like(a) multiply_them( drv.Out(dest), drv.In(a), drv.In(b), - block=(400,1,1)) + block=(400, 1, 1)) print(dest-a*b) diff --git a/pycuda/__init__.py b/pycuda/__init__.py index 5ace743eb79ffafe2241db589eb6c1044125c671..3a00359e8688074aa6cd5f8614d3f398d509fd32 100644 --- a/pycuda/__init__.py +++ b/pycuda/__init__.py @@ -1,3 +1,6 @@ +from __future__ import annotations + + VERSION = (2025, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/pycuda/_cluda.py b/pycuda/_cluda.py index fa5eda2a94fc849eec0870b312ee94eab4851e76..ff738fa35c1b027a4231ad4246813ca68211dda2 100644 --- a/pycuda/_cluda.py +++ b/pycuda/_cluda.py @@ -1,3 +1,6 @@ +from __future__ import annotations + + CLUDA_PREAMBLE = """ #define local_barrier() __syncthreads(); diff --git a/pycuda/_mymako.py b/pycuda/_mymako.py index 826dde6198fd5c48746c312ed313091e78845633..37070b5fbdd18c9518bcb9133b16c84858915495 100644 --- a/pycuda/_mymako.py +++ b/pycuda/_mymako.py @@ -1,3 +1,6 @@ +from __future__ import annotations + + try: import mako.template # noqa: F401 except ImportError: @@ -13,4 +16,4 @@ except ImportError: ) -from mako import * # noqa: F403, F401 +from mako import * # noqa: F403 diff --git a/pycuda/autoinit.py b/pycuda/autoinit.py index 664ed1c8e1725c46c2adcb17f5f15bcfc5ffe7eb..b628b3cd057ea95b8f8af1cb2923818e7e49c27e 100644 --- a/pycuda/autoinit.py +++ b/pycuda/autoinit.py @@ -1,10 +1,15 @@ -import pycuda.driver as cuda +from __future__ import annotations + import atexit +import pycuda.driver as cuda + + # Initialize CUDA cuda.init() -from pycuda.tools import make_default_context # noqa: E402 +from pycuda.tools import make_default_context + global context context = make_default_context() diff --git a/pycuda/autoprimaryctx.py b/pycuda/autoprimaryctx.py index 537c8610261ba97c07605377297b103ffe2855ea..5a860b734f90ff38823716bdd4a9afbff5504125 100644 --- a/pycuda/autoprimaryctx.py +++ b/pycuda/autoprimaryctx.py @@ -1,10 +1,14 @@ -import pycuda.driver as cuda +from __future__ import annotations + import atexit +import pycuda.driver as cuda + + # Initialize CUDA cuda.init() -from pycuda.tools import make_default_context # noqa: E402 +from pycuda.tools import make_default_context
def _retain_primary_context(dev): diff --git a/pycuda/characterize.py b/pycuda/characterize.py index 3b46ff7aeac1a2e169968258b4e3e1a27351ced8..74e6c9c71611d9f313372f852c41843ae5789024 100644 --- a/pycuda/characterize.py +++ b/pycuda/characterize.py @@ -1,6 +1,9 @@ -from pycuda.tools import context_dependent_memoize +from __future__ import annotations + import numpy as np +from pycuda.tools import context_dependent_memoize + def platform_bits(): import sys diff --git a/pycuda/compiler.py b/pycuda/compiler.py index 6459e1573fa77bd8e2cd81c6c819936047f5b488..d4edd6537a8f5c2a7749af59755b64b5ccdc6011 100644 --- a/pycuda/compiler.py +++ b/pycuda/compiler.py @@ -1,24 +1,24 @@ -from pytools import memoize - # don't import pycuda.driver here--you'll create an import loop -import os +from __future__ import annotations +import os import sys -from tempfile import mkstemp from os import unlink +from tempfile import mkstemp +from pytools import memoize from pytools.prefork import call_capture_output @memoize def get_nvcc_version(nvcc): cmdline = [nvcc, "--version"] - result, stdout, stderr = call_capture_output(cmdline) + result, stdout, _stderr = call_capture_output(cmdline) if result != 0 or not stdout: from warnings import warn - warn("NVCC version could not be determined.") + warn("NVCC version could not be determined.", stacklevel=2) stdout = b"nvcc unknown version" return stdout.decode("utf-8", "replace") @@ -39,12 +39,12 @@ def _new_md5(): def preprocess_source(source, options, nvcc): handle, source_path = mkstemp(suffix=".cu") - outf = open(source_path, "w") - outf.write(source) - outf.close() + with open(source_path, "w") as outf: + outf.write(source) + os.close(handle) - cmdline = [nvcc, "--preprocess"] + options + [source_path] + cmdline = [nvcc, "--preprocess", *options, source_path] if "win32" in sys.platform: cmdline.extend(["--compiler-options", "-EP"]) else: @@ -102,11 +102,8 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): cache_path = join(cache_dir, cache_file + "." + target) try: - cache_file = open(cache_path, "rb") - try: + with open(cache_path, "rb") as cache_file: return cache_file.read() - finally: - cache_file.close() except Exception: pass @@ -119,9 +116,8 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): cu_file_name = file_root + ".cu" cu_file_path = join(file_dir, cu_file_name) - outf = open(cu_file_path, "w") - outf.write(str(source)) - outf.close() + with open(cu_file_path, "w") as outf: + outf.write(str(source)) if keep: options = options[:] @@ -129,13 +125,13 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): print("*** compiler output in %s" % file_dir) - cmdline = [nvcc, "--" + target] + options + [cu_file_name] + cmdline = [nvcc, "--" + target, *options, cu_file_name] result, stdout, stderr = call_capture_output( cmdline, cwd=file_dir, error_on_nonzero=False ) try: - result_f = open(join(file_dir, file_root + "." + target), "rb") + result_f = open(join(file_dir, file_root + "." 
+ target), "rb") # noqa: SIM115 except OSError: no_output = True else: @@ -147,7 +143,7 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): warn( "PyCUDA: nvcc exited with status 0, but appears to have " - "encountered an error" + "encountered an error", stacklevel=2 ) from pycuda.driver import CompileError @@ -178,12 +174,11 @@ def compile_plain(source, options, keep, nvcc, cache_dir, target="cubin"): result_f.close() if cache_dir: - outf = open(cache_path, "wb") - outf.write(result_data) - outf.close() + with open(cache_path, "wb") as outf: + outf.write(result_data) if not keep: - from os import listdir, unlink, rmdir + from os import listdir, rmdir, unlink for name in listdir(file_dir): unlink(join(file_dir, name)) @@ -230,10 +225,12 @@ def compile( arch=None, code=None, cache_dir=None, - include_dirs=[], + include_dirs=None, target="cubin", ): + if include_dirs is None: + include_dirs = [] assert target in ["cubin", "ptx", "fatbin"] if not no_extern_c: @@ -282,14 +279,15 @@ def compile( if code is not None: options.extend(["-code", code]) - if "darwin" in sys.platform and sys.maxsize == 9223372036854775807: - options.append("-m64") - elif "win32" in sys.platform and sys.maxsize == 9223372036854775807: + if ( + ("darwin" in sys.platform and sys.maxsize == 9223372036854775807) + or + ("win32" in sys.platform and sys.maxsize == 9223372036854775807)): options.append("-m64") elif "win32" in sys.platform and sys.maxsize == 2147483647: options.append("-m32") - include_dirs = include_dirs + [_find_pycuda_include_path()] + include_dirs = [*include_dirs, _find_pycuda_include_path()] for i in include_dirs: options.append("-I" + i) @@ -310,7 +308,7 @@ class CudaModule: warn( "trying to compile for a compute capability " - "higher than selected GPU" + "higher than selected GPU", stacklevel=2 ) except Exception: pass @@ -341,8 +339,10 @@ class SourceModule(CudaModule): arch=None, code=None, cache_dir=None, - include_dirs=[], + include_dirs=None, ): + if include_dirs is None: + include_dirs = [] self._check_arch(arch) cubin = compile( @@ -368,8 +368,8 @@ def _search_on_path(filenames): """Find file on system path.""" # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224 - from os.path import exists, abspath, join - from os import pathsep, environ + from os import environ, pathsep + from os.path import abspath, exists, join search_path = environ["PATH"] @@ -400,13 +400,15 @@ class DynamicModule(CudaModule): arch=None, code=None, cache_dir=None, - include_dirs=[], + include_dirs=None, message_handler=None, log_verbose=False, cuda_libdir=None, ): from pycuda.driver import Context + if include_dirs is None: + include_dirs = [] compute_capability = Context.get_device().compute_capability() if compute_capability < (3, 5): raise Exception( @@ -563,9 +565,11 @@ class DynamicSourceModule(DynamicModule): arch=None, code=None, cache_dir=None, - include_dirs=[], + include_dirs=None, cuda_libdir=None, ): + if include_dirs is None: + include_dirs = [] super().__init__( nvcc=nvcc, link_options=None, diff --git a/pycuda/cumath.py b/pycuda/cumath.py index ab2b32c86079b05af501e0eebeabacd7e181d2e7..4ac71d587bdd90b30a2659869b2a709b24f6fd9a 100644 --- a/pycuda/cumath.py +++ b/pycuda/cumath.py @@ -1,7 +1,11 @@ -import pycuda.gpuarray as gpuarray -import pycuda.elementwise as elementwise -import numpy as np +from __future__ import annotations + import warnings + +import numpy as np + +import pycuda.elementwise as elementwise +import pycuda.gpuarray as gpuarray from pycuda.driver import 
Stream @@ -10,7 +14,7 @@ def _make_unary_array_func(name): if stream_or_out is not None: warnings.warn( - "please use 'out' or 'stream' keyword arguments", DeprecationWarning + "please use 'out' or 'stream' keyword arguments", DeprecationWarning, stacklevel=2 ) if isinstance(stream_or_out, Stream): stream = stream_or_out @@ -25,10 +29,7 @@ def _make_unary_array_func(name): if "stream" in kwargs: stream = kwargs["stream"] - if array.dtype == np.float32: - func_name = name + "f" - else: - func_name = name + func_name = name + "f" if array.dtype == np.float32 else name if not array.flags.forc: raise RuntimeError( diff --git a/pycuda/curandom.py b/pycuda/curandom.py index 6dbbeede174017b0d79b7cf03ed78c088b2fd7cc..4382b888176e2a6597a7130202d36a6a83d7ab95 100644 --- a/pycuda/curandom.py +++ b/pycuda/curandom.py @@ -1,8 +1,12 @@ +from __future__ import annotations + import numpy as np + +from pytools import memoize_method + import pycuda.compiler import pycuda.driver as drv import pycuda.gpuarray as array -from pytools import memoize_method # {{{ MD5-based random number generation @@ -182,8 +186,8 @@ md5_code = """ def rand(shape, dtype=np.float32, stream=None): - from pycuda.gpuarray import GPUArray from pycuda.elementwise import get_elwise_kernel + from pycuda.gpuarray import GPUArray result = GPUArray(shape, dtype) @@ -515,12 +519,12 @@ class _RandomNumberGeneratorBase: self.module = module = pycuda.compiler.SourceModule(source, no_extern_c=True) self.generators = {} - for name, out_type, suffix in my_generators: + for name, _out_type, _suffix in my_generators: gen_func = module.get_function(name) gen_func.prepare("PPn") self.generators[name] = gen_func if get_curand_version() >= (4, 0, 0): - for name, in_type, out_type, suffix in my_log_generators: + for name, in_type, _out_type, _suffix in my_log_generators: gen_func = module.get_function(name) if in_type == "float": gen_func.prepare("PPffn") @@ -528,11 +532,11 @@ class _RandomNumberGeneratorBase: gen_func.prepare("PPddn") self.generators[name] = gen_func if get_curand_version() >= (5, 0, 0): - for name, out_type, suffix in my_poisson_generators: + for name, _out_type, _suffix in my_poisson_generators: gen_func = module.get_function(name) gen_func.prepare("PPdn") self.generators[name] = gen_func - for name, inout_type, suffix in my_poisson_inplace_generators: + for name, _inout_type, _suffix in my_poisson_inplace_generators: gen_func = module.get_function(name) gen_func.prepare("PPn") self.generators[name] = gen_func @@ -553,10 +557,7 @@ class _RandomNumberGeneratorBase: self.skip_ahead_array.prepare("PnP") def _kernels(self): - return list(self.generators.values()) + [ - self.skip_ahead, - self.skip_ahead_array, - ] + return [*list(self.generators.values()), self.skip_ahead, self.skip_ahead_array] @property @memoize_method @@ -831,12 +832,7 @@ class _PseudoRandomNumberGeneratorBase(_RandomNumberGeneratorBase): def _kernels(self): return ( - _RandomNumberGeneratorBase._kernels(self) - + [self.module.get_function("prepare")] - + [ - self.module.get_function("skip_ahead_sequence"), - self.module.get_function("skip_ahead_sequence_array"), - ] + [*_RandomNumberGeneratorBase._kernels(self), self.module.get_function("prepare"), self.module.get_function("skip_ahead_sequence"), self.module.get_function("skip_ahead_sequence_array")] ) @@ -1008,10 +1004,7 @@ if get_curand_version() >= (4, 1, 0): ) def _kernels(self): - return _PseudoRandomNumberGeneratorBase._kernels(self) + [ - self.module.get_function("skip_ahead_subsequence"), - 
self.module.get_function("skip_ahead_subsequence_array"), - ] + return [*_PseudoRandomNumberGeneratorBase._kernels(self), self.module.get_function("skip_ahead_subsequence"), self.module.get_function("skip_ahead_subsequence_array")] # }}} @@ -1128,9 +1121,7 @@ class _SobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): - return _RandomNumberGeneratorBase._kernels(self) + [ - self.module.get_function("prepare") - ] + return [*_RandomNumberGeneratorBase._kernels(self), self.module.get_function("prepare")] scrambledsobol_random_source = """ @@ -1230,9 +1221,7 @@ class _ScrambledSobolRandomNumberGeneratorBase(_RandomNumberGeneratorBase): drv.Context.set_limit(drv.limit.STACK_SIZE, prev_stack_size) def _kernels(self): - return _RandomNumberGeneratorBase._kernels(self) + [ - self.module.get_function("prepare") - ] + return [*_RandomNumberGeneratorBase._kernels(self), self.module.get_function("prepare")] if get_curand_version() >= (3, 2, 0): diff --git a/pycuda/debug.py b/pycuda/debug.py index dee5974c93efc3c5d239df2782abfe3f0aed9071..f2973b0832ac5c51b5c62e690ac263bf239665fe 100644 --- a/pycuda/debug.py +++ b/pycuda/debug.py @@ -1,9 +1,12 @@ -import pycuda.driver +from __future__ import annotations import sys from optparse import OptionParser from os.path import exists +import pycuda.driver + + pycuda.driver.set_debugging() parser = OptionParser(usage="usage: %prog [options] SCRIPT-TO-RUN [SCRIPT-ARGUMENTS]") @@ -23,4 +26,5 @@ if not exists(mainpyfile): sys.argv = args -exec(compile(open(mainpyfile).read(), mainpyfile, "exec")) +with open(mainpyfile) as mainpy: + exec(compile(mainpy.read(), mainpyfile, "exec")) diff --git a/pycuda/driver.py b/pycuda/driver.py index 4bce8347a486d853ab989b3181b8e77053ae0fd7..b5070c9ca3016186201f88beec7dcb1ae1af2cc4 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -1,9 +1,13 @@ +from __future__ import annotations + + __copyright__ = """ Copyright 2008-2021 Andreas Kloeckner Copyright 2021 NVIDIA Corporation """ import os + import numpy as np @@ -14,8 +18,8 @@ def _search_on_path(filenames): """Find file on system path.""" # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224 - from os.path import exists, abspath, join - from os import pathsep, environ + from os import environ, pathsep + from os.path import abspath, exists, join search_path = environ["PATH"] @@ -27,7 +31,7 @@ def _search_on_path(filenames): def _add_cuda_libdir_to_dll_path(): - from os.path import join, dirname + from os.path import dirname, join cuda_path = os.environ.get("CUDA_PATH") @@ -46,12 +50,12 @@ def _add_cuda_libdir_to_dll_path(): "Unable to discover CUDA installation directory " "while attempting to add it to Python's DLL path. " "Either set the 'CUDA_PATH' environment variable " - "or ensure that 'nvcc.exe' is on the path." + "or ensure that 'nvcc.exe' is on the path.", stacklevel=2 ) try: - os.add_dll_directory + os.add_dll_directory # noqa: B018 except AttributeError: # likely not on Py3.8 and Windows # https://github.com/inducer/pycuda/issues/213 @@ -71,7 +75,7 @@ except ImportError as e: warn( "Failed to import the CUDA driver interface, with an error " "message indicating that the version of your CUDA header " - "does not match the version of your CUDA driver." 
+ "does not match the version of your CUDA driver.", stacklevel=2 ) raise @@ -131,7 +135,8 @@ class ArgumentHandler: self.dev_alloc = mem_alloc_like(self.array) except AttributeError: raise TypeError( - "could not determine array length of '%s': unsupported array type or not an array" + "could not determine array length of '%s': " + "unsupported array type or not an array" % type(self.array) ) return self.dev_alloc @@ -177,7 +182,7 @@ def _add_functionality(): warn( "CUDA driver raised '%s' when querying '%s' on '%s'" - % (e, att_name, dev) + % (e, att_name, dev), stacklevel=2 ) else: result[att_id] = att_value @@ -305,9 +310,11 @@ def _add_functionality(): for handler in post_handlers: handler.post_call(stream) - def function_prepare_pre_v4(func, arg_types, block=None, shared=None, texrefs=[]): + def function_prepare_pre_v4(func, arg_types, block=None, shared=None, texrefs=None): from warnings import warn + if texrefs is None: + texrefs = [] if block is not None: warn( "setting the block size in Function.prepare is deprecated", @@ -328,7 +335,7 @@ def _add_functionality(): func.arg_format = "" - for i, arg_type in enumerate(arg_types): + for _i, arg_type in enumerate(arg_types): if ( isinstance(arg_type, type) and np is not None @@ -358,7 +365,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (block,) + args + args = (block, *args) shared_size = kwargs.pop("shared_size", None) if shared_size is not None: @@ -390,7 +397,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (block,) + args + args = (block, *args) shared_size = kwargs.pop("shared_size", None) if shared_size is not None: @@ -433,7 +440,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (stream,) + args + args = (stream, *args) stream = block shared_size = kwargs.pop("shared_size", None) @@ -522,12 +529,14 @@ def _add_functionality(): for handler in post_handlers: handler.post_call(stream) - def function_prepare(func, arg_types, texrefs=[]): + def function_prepare(func, arg_types, texrefs=None): + if texrefs is None: + texrefs = [] func.texrefs = texrefs func.arg_format = "" - for i, arg_type in enumerate(arg_types): + for _i, arg_type in enumerate(arg_types): if isinstance(arg_type, type) and np.number in arg_type.__mro__: func.arg_format += np.dtype(arg_type).char elif isinstance(arg_type, np.dtype): @@ -554,7 +563,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (block,) + args + args = (block, *args) shared_size = kwargs.pop("shared_size", 0) @@ -611,7 +620,7 @@ def _add_functionality(): DeprecationWarning, stacklevel=2, ) - args = (stream,) + args + args = (stream, *args) stream = block shared_size = kwargs.pop("shared_size", 0) @@ -657,16 +666,8 @@ def _add_functionality(): ) return func(*args, **kwargs) - try: - from functools import update_wrapper - except ImportError: - pass - else: - try: - update_wrapper(new_func, func) - except Exception: - # User won't see true signature. Oh well. 
- pass + from functools import update_wrapper + update_wrapper(new_func, func) return new_func @@ -1070,12 +1071,8 @@ def matrix_to_texref(matrix, texref, order): def to_device(bf_obj): - import sys - if sys.version_info >= (2, 7): - bf = memoryview(bf_obj).tobytes() - else: - bf = buffer(bf_obj) + bf = memoryview(bf_obj).tobytes() result = mem_alloc(len(bf)) memcpy_htod(result, bf) return result diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 1ca6706f4d922444d00a33b56cb9d352dd1192e8..cc710409c832e92b2116e6c167b0417e3836a4b5 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -1,4 +1,6 @@ """Elementwise functionality.""" +from __future__ import annotations + __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -26,12 +28,14 @@ OTHER DEALINGS IN THE SOFTWARE. """ -from pycuda.tools import context_dependent_memoize from typing import Any + import numpy as np -from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg + from pytools import memoize_method +from pycuda.tools import ScalarArg, VectorArg, context_dependent_memoize, dtype_to_ctype + def get_elwise_module( arguments, operation, @@ -170,10 +174,7 @@ def get_elwise_kernel_and_types( else: arguments.append(ScalarArg(np.uintp, "n")) - if use_range: - module_builder = get_elwise_range_module - else: - module_builder = get_elwise_module + module_builder = get_elwise_range_module if use_range else get_elwise_module mod = module_builder(arguments, operation, name, keep, options, **kwargs) @@ -189,7 +190,7 @@ def get_elwise_kernel( """Return a L{pycuda.driver.Function} that performs the same scalar operation on one or several vectors. """ - mod, func, arguments = get_elwise_kernel_and_types( + _mod, func, arguments = get_elwise_kernel_and_types( arguments, operation, name, keep, options, **kwargs ) @@ -211,7 +212,7 @@ class ElementwiseKernel: }) def get_texref(self, name, use_range=False): - mod, knl, arguments = self.generate_stride_kernel_and_types(use_range=use_range) + mod, _knl, _arguments = self.generate_stride_kernel_and_types(use_range=use_range) return mod.get_texref(name) @memoize_method @@ -241,7 +242,7 @@ class ElementwiseKernel: ) invocation_args = [] - mod, func, arguments = self.generate_stride_kernel_and_types( + _mod, func, arguments = self.generate_stride_kernel_and_types( range_ is not None or slice_ is not None ) @@ -414,8 +415,8 @@ def get_copy_kernel(dtype_dest, dtype_src): @context_dependent_memoize def get_linear_combination_kernel(summand_descriptors, dtype_z): + from pycuda.elementwise import ScalarArg, VectorArg, get_elwise_module from pycuda.tools import dtype_to_ctype - from pycuda.elementwise import VectorArg, ScalarArg, get_elwise_module args = [] preamble = ["#include <pycuda-helpers.hpp>\n\n"] @@ -461,7 +462,7 @@ def get_linear_combination_kernel(summand_descriptors, dtype_z): return func, tex_src -def _get_real_dtype(dtype: "np.dtype[Any]") -> "np.dtype[Any]": +def _get_real_dtype(dtype: np.dtype[Any]) -> np.dtype[Any]: assert dtype.kind == "c" return np.empty(0, dtype).real.dtype diff --git a/pycuda/gl/__init__.py b/pycuda/gl/__init__.py index 3120d2de74f72fc4b8eddc54ea2b82475d3e259e..bbe607ec24aa6d2eac9300ea271df20894c32b63 100644 --- a/pycuda/gl/__init__.py +++ b/pycuda/gl/__init__.py @@ -1,6 +1,8 @@ -from __future__ import absolute_import +from __future__ import annotations + import pycuda._driver as _drv + if not _drv.have_gl_ext(): raise ImportError("PyCUDA was compiled without GL extension support") diff --git a/pycuda/gl/autoinit.py b/pycuda/gl/autoinit.py index
fc8c9a668f365ff5aa2be3c89cc97413c634f87a..e4104774a812611152e3af560ab84f70b119a2dd 100644 --- a/pycuda/gl/autoinit.py +++ b/pycuda/gl/autoinit.py @@ -1,12 +1,17 @@ -from __future__ import absolute_import +from __future__ import annotations + +import atexit + import pycuda.driver as cuda import pycuda.gl as cudagl -import atexit + cuda.init() assert cuda.Device.count() >= 1 -from pycuda.tools import make_default_context # noqa: E402 +from pycuda.tools import make_default_context + + context = make_default_context(lambda dev: cudagl.make_context(dev)) device = context.get_device() diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index b2d3ac373d128eba036561606404095c29a1f254..cd5b7b233d54ab373f443395d39741d0c1718a47 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1,24 +1,29 @@ +from __future__ import annotations + + __copyright__ = """ Copyright 2008-2021 Andreas Kloeckner Copyright 2021 NVIDIA Corporation """ +import copyreg +import numbers +from functools import reduce + import numpy as np -import pycuda.elementwise as elementwise + from pytools import memoize, memoize_method + import pycuda.driver as drv +import pycuda.elementwise as elementwise +from pycuda.characterize import has_double_support from pycuda.compyte.array import ( + ArrayFlags as _ArrayFlags, as_strided as _as_strided, - f_contiguous_strides as _f_contiguous_strides, c_contiguous_strides as _c_contiguous_strides, - ArrayFlags as _ArrayFlags, + f_contiguous_strides as _f_contiguous_strides, get_common_dtype as _get_common_dtype_base, ) -from pycuda.characterize import has_double_support -from functools import reduce -import numbers - -import copyreg def _get_common_dtype(obj1, obj2): @@ -286,10 +291,7 @@ class GPUArray: def __cuda_array_interface__(self): """Returns a CUDA Array Interface dictionary describing this array's data.""" - if self.gpudata is not None: - ptr = int(self.gpudata) - else: - ptr = 0 + ptr = int(self.gpudata) if self.gpudata is not None else 0 return { "shape": self.shape, @@ -828,10 +830,7 @@ class GPUArray: with new, the user can choose between ipow or just pow """ common_dtype = _get_common_dtype(self, other) - if new: - result = self._new_like_me(common_dtype) - else: - result = self + result = self._new_like_me(common_dtype) if new else self # {{{ sanity checks @@ -960,7 +959,7 @@ class GPUArray: "only contiguous arrays may " "be used as arguments to this operation" ) - if isinstance(shape[0], tuple) or isinstance(shape[0], list): + if isinstance(shape[0], (tuple, list)): shape = tuple(shape[0]) same_contiguity = (order == "C" and self.flags.c_contiguous) or ( @@ -1012,14 +1011,14 @@ class GPUArray: raise ValueError("new type not compatible with array") new_shape = ( - self.shape[:min_stride_axis] - + (self.shape[min_stride_axis] * old_itemsize // itemsize,) - + self.shape[min_stride_axis + 1:] + (*self.shape[:min_stride_axis], + self.shape[min_stride_axis] * old_itemsize // itemsize, + *self.shape[min_stride_axis + 1:]) ) new_strides = ( - self.strides[:min_stride_axis] - + (self.strides[min_stride_axis] * itemsize // old_itemsize,) - + self.strides[min_stride_axis + 1:] + (*self.strides[:min_stride_axis], + self.strides[min_stride_axis] * itemsize // old_itemsize, + *self.strides[min_stride_axis + 1:]) ) return GPUArray( @@ -1036,9 +1035,9 @@ class GPUArray: Returns a view of the array with dimensions of length 1 removed. 
""" - new_shape = tuple([dim for dim in self.shape if dim > 1]) + new_shape = tuple(dim for dim in self.shape if dim > 1) new_strides = tuple( - [self.strides[i] for i, dim in enumerate(self.shape) if dim > 1] + self.strides[i] for i, dim in enumerate(self.shape) if dim > 1 ) return GPUArray( @@ -1188,10 +1187,7 @@ class GPUArray: from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" + order = "F" if self.flags.f_contiguous else "C" result = self._new_like_me(dtype=real_dtype, order=order) func = elementwise.get_real_kernel(dtype, real_dtype) @@ -1221,10 +1217,7 @@ class GPUArray: from pytools import match_precision real_dtype = match_precision(np.dtype(np.float64), dtype) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" + order = "F" if self.flags.f_contiguous else "C" result = self._new_like_me(dtype=real_dtype, order=order) func = elementwise.get_imag_kernel(dtype, real_dtype) @@ -1250,14 +1243,8 @@ class GPUArray: "be used as arguments to this operation" ) - if self.flags.f_contiguous: - order = "F" - else: - order = "C" - if out is None: - result = self._new_like_me(order=order) - else: - result = out + order = "F" if self.flags.f_contiguous else "C" + result = self._new_like_me(order=order) if out is None else out func = elementwise.get_conj_kernel(dtype, result.dtype) func.prepared_async_call( @@ -1459,12 +1446,12 @@ def arange(*args, **kwargs): warn( "behavior change: arange guessed dtype other than float32. " - "suggest specifying explicit dtype." + "suggest specifying explicit dtype.", stacklevel=2 ) from math import ceil - size = int(ceil((stop - start) / step)) + size = ceil((stop - start) / step) result = GPUArray((size,), dtype) @@ -1897,7 +1884,9 @@ def concatenate(arrays, axis=0, allocator=None): base_idx = 0 for ary in arrays: my_len = ary.shape[axis] - result[full_slice[:axis] + (slice(base_idx, base_idx+my_len),) + full_slice[axis+1:]] = ary + result[(*full_slice[:axis], + slice(base_idx, base_idx + my_len), + *full_slice[axis + 1:])] = ary base_idx += my_len return result @@ -1928,7 +1917,7 @@ def stack(arrays, axis=0, allocator=None): if not (0 <= axis <= input_ndim): raise ValueError("invalid axis") - result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:] + result_shape = (*input_shape[:axis], len(arrays), *input_shape[axis:]) result = empty(shape=result_shape, dtype=np.result_type(*(ary.dtype for ary in arrays)), allocator=allocator, order="C" if axis == 0 else "F") diff --git a/pycuda/reduction.py b/pycuda/reduction.py index 0ae3f094e86f59c495e004caf56b0740f9449344..ef5d0fc14b549574e147058e6957e898e41a39e3 100644 --- a/pycuda/reduction.py +++ b/pycuda/reduction.py @@ -1,4 +1,5 @@ """Computation of reductions on vectors.""" +from __future__ import annotations __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -57,10 +58,10 @@ Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through source code with only those rights set forth herein. 
""" -from pycuda.tools import context_dependent_memoize -from pycuda.tools import dtype_to_ctype import numpy as np +from pycuda.tools import context_dependent_memoize, dtype_to_ctype + def get_reduction_module( out_type, @@ -180,13 +181,10 @@ def get_reduction_kernel_and_types( map_expr = "pycuda_reduction_inp[i]" in_arg = "const %s *pycuda_reduction_inp" % out_type - if arguments: - arguments = in_arg + ", " + arguments - else: - arguments = in_arg + arguments = in_arg + ", " + arguments if arguments else in_arg else: - assert False + raise AssertionError() mod = get_reduction_module( out_type, @@ -264,8 +262,8 @@ class ReductionKernel: ) def __call__(self, *args, **kwargs): - MAX_BLOCK_COUNT = 1024 - SMALL_SEQ_COUNT = 4 + MAX_BLOCK_COUNT = 1024 # noqa: N806 + SMALL_SEQ_COUNT = 4 # noqa: N806 s1_func = self.stage1_func s2_func = self.stage2_func @@ -304,7 +302,7 @@ class ReductionKernel: repr_vec = vectors[0] sz = repr_vec.size - allocator = kwargs.get("allocator", None) + allocator = kwargs.get("allocator") if allocator is None: allocator = repr_vec.allocator @@ -335,7 +333,7 @@ class ReductionKernel: (block_count, 1), (self.block_size, 1, 1), stream, - *([result.gpudata] + invocation_args + [seq_count, sz]), + *([result.gpudata, *invocation_args, seq_count, sz]), **kwargs ) @@ -344,7 +342,7 @@ class ReductionKernel: else: f = s2_func arg_types = self.stage2_arg_types - args = (result,) + stage1_args + args = (result, *stage1_args) @context_dependent_memoize @@ -426,10 +424,7 @@ def get_subset_dot_kernel(dtype_out, dtype_subset, dtype_a=None, dtype_b=None): dtype_out = dtype_a if dtype_b is None: - if dtype_a is None: - dtype_b = dtype_out - else: - dtype_b = dtype_a + dtype_b = dtype_out if dtype_a is None else dtype_a if dtype_a is None: dtype_a = dtype_out diff --git a/pycuda/scan.py b/pycuda/scan.py index 1a1b5b937763de81cec2f1af1c1d3b4751ecc3e4..65a9b0d6c9498182bf290530c93903d838af2579 100644 --- a/pycuda/scan.py +++ b/pycuda/scan.py @@ -1,4 +1,6 @@ """Scan primitive.""" +from __future__ import annotations + __copyright__ = """ Copyright 2011 Andreas Kloeckner @@ -25,12 +27,12 @@ Derived from code within the Thrust project, https://github.com/thrust/thrust/ import numpy as np +import pycuda._mymako as mako import pycuda.driver as driver import pycuda.gpuarray as gpuarray +from pycuda._cluda import CLUDA_PREAMBLE from pycuda.compiler import SourceModule from pycuda.tools import dtype_to_ctype -import pycuda._mymako as mako -from pycuda._cluda import CLUDA_PREAMBLE SHARED_PREAMBLE = ( diff --git a/pycuda/sparse/__init__.py b/pycuda/sparse/__init__.py index 0949981b69451fbb55ca1c8ed78af7f9a37457f6..16b468914cc90bb832c005cc3920d72c5bfaf3ba 100644 --- a/pycuda/sparse/__init__.py +++ b/pycuda/sparse/__init__.py @@ -1,6 +1,8 @@ -from __future__ import absolute_import +from __future__ import annotations + from warnings import warn + warn( "pycuda.sparse is deprecated. 
and will be removed in 2015.x", DeprecationWarning, diff --git a/pycuda/sparse/cg.py b/pycuda/sparse/cg.py index 42148ecec971e0743b51c8309c85099cf9b86138..4079c1fa12218691c3a652581467a7eae4c2611a 100644 --- a/pycuda/sparse/cg.py +++ b/pycuda/sparse/cg.py @@ -1,11 +1,12 @@ -from __future__ import division -from __future__ import absolute_import -from pycuda.sparse.inner import AsyncInnerProduct +from __future__ import annotations + +import numpy as np + from pytools import memoize_method + import pycuda.driver as drv import pycuda.gpuarray as gpuarray - -import numpy as np +from pycuda.sparse.inner import AsyncInnerProduct class ConvergenceError(RuntimeError): @@ -158,10 +159,7 @@ class CGStateContainer: self.one_iteration(compute_real_residual=compute_real_residual) if debug_callback is not None: - if compute_real_residual: - what = "it+residual" - else: - what = "it" + what = "it+residual" if compute_real_residual else "it" debug_callback( what, iterations, self.x, self.residual, self.d, self.delta diff --git a/pycuda/sparse/coordinate.py b/pycuda/sparse/coordinate.py index 8299387f97f0c5f9e0a43ba8315b1e6705678387..13191a86b4f90ebc44f7c72f8160712d86663569 100644 --- a/pycuda/sparse/coordinate.py +++ b/pycuda/sparse/coordinate.py @@ -1,10 +1,12 @@ -from __future__ import division -from __future__ import absolute_import +from __future__ import annotations + +import numpy as np + from pytools import memoize_method + import pycuda.driver as drv import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule -import numpy as np COO_FLAT_KERNEL_TEMPLATE = """ diff --git a/pycuda/sparse/inner.py b/pycuda/sparse/inner.py index 7e55e3078130ecfca3c1592d2a621797eb18470b..7e61498f56ae65698765604f67feb591a9d8efed 100644 --- a/pycuda/sparse/inner.py +++ b/pycuda/sparse/inner.py @@ -1,10 +1,10 @@ -from __future__ import division -from __future__ import absolute_import -import pycuda.driver as drv -import pycuda.gpuarray as gpuarray +from __future__ import annotations import atexit +import pycuda.driver as drv +import pycuda.gpuarray as gpuarray + STREAM_POOL = [] diff --git a/pycuda/sparse/operator.py b/pycuda/sparse/operator.py index bf8231c29fabffbdc1a70035b27735d9ab7d0240..9785564179995d082ed378c572f33550e1dfd40b 100644 --- a/pycuda/sparse/operator.py +++ b/pycuda/sparse/operator.py @@ -1,4 +1,7 @@ -class OperatorBase(object): +from __future__ import annotations + + +class OperatorBase: @property def dtype(self): raise NotImplementedError diff --git a/pycuda/sparse/packeted.py b/pycuda/sparse/packeted.py index f2819db81bfc0be180c71beb68e802b4040b7c67..c06ae3226a805dce33839623477283bad5cb14dc 100644 --- a/pycuda/sparse/packeted.py +++ b/pycuda/sparse/packeted.py @@ -1,10 +1,11 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function +from __future__ import annotations + +import numpy as np + from pytools import memoize_method + import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule -import numpy as np PKT_KERNEL_TEMPLATE = """ @@ -120,7 +121,7 @@ class PacketedSpMV: adj_mat = csr_mat while True: - cut_count, dof_to_packet_nr = part_graph( + _cut_count, dof_to_packet_nr = part_graph( int(self.block_count), xadj=adj_mat.indptr, adjncy=adj_mat.indices ) @@ -148,10 +149,10 @@ class PacketedSpMV: old_block_count = self.block_count self.block_count = int(2 + 1.05 * self.block_count) print( - ( + "Metis produced a big block at block count " "%d--retrying with %d" % (old_block_count, self.block_count) - ) + ) continue 
@@ -230,7 +231,7 @@ class PacketedSpMV:
     def find_local_row_costs_and_remaining_coo(
         self, csr_mat, dof_to_packet_nr, old2new_fetch_indices
     ):
-        h, w = self.shape
+        h, _w = self.shape
         local_row_costs = [0] * h
         rem_coo_values = []
         rem_coo_i = []
diff --git a/pycuda/sparse/pkt_build.py b/pycuda/sparse/pkt_build.py
index b3591584bd6b54f492a53e3da9fdc1ebca8b0e2e..235e1a0f782fc94477e7addb06bd5a1cf5f6693d 100644
--- a/pycuda/sparse/pkt_build.py
+++ b/pycuda/sparse/pkt_build.py
@@ -1,5 +1,7 @@
-from __future__ import absolute_import
+from __future__ import annotations
+
 import numpy as np
+
 import pycuda.gpuarray as gpuarray
@@ -74,4 +76,6 @@ except ImportError:
     pass
 else:
     pyximport.install()
-    from pycuda.sparse.pkt_build_cython import build_pkt_data_structure  # noqa: F811, F401
+    from pycuda.sparse.pkt_build_cython import (
+        build_pkt_data_structure,  # noqa: F401
+    )
diff --git a/pycuda/tools.py b/pycuda/tools.py
index 3e0ccbd2cea5378294e12c9a8cee1fabcf6269c6..e37e616f3bea6ad4fe4b60b1b7d1920e4a4545a7 100644
--- a/pycuda/tools.py
+++ b/pycuda/tools.py
@@ -1,4 +1,6 @@
 """Miscallenous helper functionality."""
+from __future__ import annotations
+
 
 __copyright__ = "Copyright (C) 2008 Andreas Kloeckner"
@@ -25,18 +27,20 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.
 """
 
-import pycuda.driver as cuda
-import pycuda._driver as _drv
-import numpy as np
+import contextlib
 
+import numpy as np
+
+import pycuda._driver as _drv
+import pycuda.driver as cuda
 from pycuda.compyte.dtypes import (  # noqa: F401
-    register_dtype,
-    get_or_register_dtype,
     _fill_dtype_registry,
     dtype_to_ctype as base_dtype_to_ctype,
+    get_or_register_dtype,
+    register_dtype,
 )
+
 
 bitlog2 = _drv.bitlog2
 DeviceMemoryPool = _drv.DeviceMemoryPool
 PageLockedMemoryPool = _drv.PageLockedMemoryPool
@@ -91,9 +95,8 @@ class DebugMemoryPool(DeviceMemoryPool):
             "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):"
             "\n at: %s"
             % (
-                (size, self.last_free)
-                + cuda.mem_get_info()
-                + (self.held_blocks, self.active_blocks, description)
+                (size, self.last_free, *cuda.mem_get_info(),
+                 self.held_blocks, self.active_blocks, description)
             ),
             file=self.logfile,
         )
@@ -144,23 +147,19 @@ def get_default_device(default=0):
     warn(
         "get_default_device() is deprecated; "
         "use make_default_context() instead",
-        DeprecationWarning,
+        DeprecationWarning, stacklevel=2,
     )
-    from pycuda.driver import Device
     import os
 
+    from pycuda.driver import Device
+
     dev = os.environ.get("CUDA_DEVICE")
 
     if dev is None:
-        try:
-            dev = (
-                open(os.path.join(os.path.expanduser("~"), ".cuda_device"))
-                .read()
-                .strip()
-            )
-        except Exception:
-            pass
+        with contextlib.suppress(Exception):  # noqa: SIM117
+            with open(os.path.join(os.path.expanduser("~"), ".cuda_device")) as devrc:
+                dev = devrc.read().strip()
 
     if dev is None:
         dev = default
@@ -197,7 +196,8 @@ def make_default_context(ctx_maker=None):
         try:
             homedir = os.environ.get("HOME")
             assert homedir is not None
-            devn = open(os.path.join(homedir, ".cuda_device")).read().strip()
+            with open(os.path.join(homedir, ".cuda_device")) as devrc:
+                devn = devrc.read().strip()
         except Exception:
             pass
@@ -247,7 +247,7 @@ def _int_ceiling(value, multiple_of=1):
 
     from math import ceil
 
-    return int(ceil(value / multiple_of)) * multiple_of
+    return ceil(value / multiple_of) * multiple_of
@@ -256,7 +256,7 @@ def _int_floor(value, multiple_of=1):
 
     from math import floor
 
-    return int(floor(value / multiple_of)) * multiple_of
+    return floor(value / multiple_of) * multiple_of
 
 
 # }}}
@@ -314,9 +314,7 @@ class DeviceData:
     def align_bytes(self, word_size=4):
         if word_size == 4:
             return 64
-        elif word_size == 8:
-            return 128
-        elif word_size == 16:
+        elif word_size == 8 or word_size == 16:
            return 128
        else:
            raise ValueError("no alignment possible for fetches of size %d" % word_size)
@@ -452,10 +450,7 @@ context_dependent_memoized_functions = []
 
 def context_dependent_memoize(func):
     def wrapper(*args, **kwargs):
-        if kwargs:
-            cache_key = (args, frozenset(kwargs.items()))
-        else:
-            cache_key = (args,)
+        cache_key = (args, frozenset(kwargs.items())) if kwargs else (args,)
 
         try:
             ctx_dict = func._pycuda_ctx_dep_memoize_dic
diff --git a/pyproject.toml b/pyproject.toml
index 8d784a39029a9ae13aef61b18f064dec0e6d3aeb..fb3d887696ec40085d1bfc3fce410d8f395cf0cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,3 +6,91 @@ requires = [
     "wheel",
     "numpy>=1.24",
 ]
+
+[tool.ruff]
+preview = true
+# FIXME
+line-length = 100
+
+[tool.ruff.lint]
+extend-select = [
+    "B",   # flake8-bugbear
+    "C",   # flake8-comprehensions
+    "E",   # pycodestyle
+    "F",   # pyflakes
+    "G",   # flake8-logging-format
+    "I",   # flake8-isort
+    "N",   # pep8-naming
+    "NPY", # numpy
+    "PGH", # pygrep-hooks
+    "Q",   # flake8-quotes
+    "RUF", # ruff
+    "SIM", # flake8-simplify
+    "TC",  # flake8-type-checking
+    "UP",  # pyupgrade
+    "W",   # pycodestyle
+]
+extend-ignore = [
+    "C90",    # McCabe complexity
+    "E221",   # multiple spaces before operator
+    "E226",   # missing whitespace around arithmetic operator
+    "E402",   # module-level import not at top of file
+    "UP031",  # use f-strings instead of %
+    "UP032",  # use f-strings instead of .format
+    "NPY002", # legacy random
+    "SIM102", # single if instead of nested
+    "PGH004", # noqa requires rule
+    "B904",   # raise within except: use from
+    "RUF012", # ClassVar
+    "SIM108", # ternary instead of if-then-else
+]
+
+[tool.ruff.lint.flake8-quotes]
+docstring-quotes = "double"
+inline-quotes = "double"
+multiline-quotes = "double"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-local-folder = [
+    "pycuda",
+]
+known-first-party = [
+    "pytools",
+]
+lines-after-imports = 2
+required-imports = ["from __future__ import annotations"]
+
+[tool.ruff.lint.pep8-naming]
+extend-ignore-names = ["update_for_*"]
+
+[tool.ruff.lint.per-file-ignores]
+"test/test_*.py" = [
+    "N806", # upper case locals
+]
+"setup.py" = [
+    "N806",   # upper case locals
+    "SIM115", # context manager for files
+]
+"pycuda/sparse/coordinate.py" = [
+    "E501", # line length
+]
+"pycuda/driver.py" = [
+    "F405", # undefined symbol
+    "N806", # upper case locals
+]
+"pycuda/curandom.py" = [
+    "E501", # line length
+]
+"examples/from-wiki/*.py" = [
+    "F", "E", "N", "B"
+]
+"examples/demo_cdpSimplePrint.py" = [
+    "E501", # line length
+    "N816", # mixed case locals
+]
+"aksetup_helper.py" = [
+    # effectively unmaintained, will go away
+    "UP", "C", "E501", "B", "SIM", "RUF",
+]
+"test/undistributed/*.py" = ["B"]
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 291835073f52a47141bc239a0be603bc742d7d39..0000000000000000000000000000000000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-[flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,W503,N806,F405,E501
-# FIXME: fix in future: N806, F405, E501
-max-line-length=85
-exclude=pycuda/compyte/ndarray,pycuda/compyte/array.py
-
-inline-quotes = "
-docstring-quotes = """
-multiline-quotes = """
diff --git a/setup.py b/setup.py
index 5dd88d2920f6fcdc871b4e771ec1148835e5b281..a15ef2b8b3b1af1c13ad67ffbccaf844983d4ef7 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
+from __future__ import annotations
 
-from __future__ import absolute_import, print_function
 
 from os.path import dirname, join, normpath
 
@@ -9,8 +8,8 @@ def search_on_path(filenames):
     """Find file on system path."""
 
     # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52224
-    from os.path import exists, abspath
-    from os import pathsep, environ
+    from os import environ, pathsep
+    from os.path import abspath, exists
 
     search_path = environ["PATH"]
@@ -23,14 +22,14 @@ def search_on_path(filenames):
 
 def get_config_schema():
     from aksetup_helper import (
+        BoostLibraries,
         ConfigSchema,
-        Option,
         IncludeDir,
-        LibraryDir,
         Libraries,
-        BoostLibraries,
-        Switch,
+        LibraryDir,
+        Option,
         StringListOption,
+        Switch,
         make_boost_base_options,
     )
 
@@ -79,19 +78,15 @@ def get_config_schema():
         default_lib_dirs.append("/usr/local/cuda/lib")
 
     return ConfigSchema(
-        make_boost_base_options()
-        + [
+        [*make_boost_base_options(),
             Switch("USE_SHIPPED_BOOST", True, "Use included Boost library"),
             BoostLibraries("python"),
             BoostLibraries("thread"),
             Switch("CUDA_TRACE", False, "Enable CUDA API tracing"),
-            Option(
-                "CUDA_ROOT", default=cuda_root_default, help="Path to the CUDA toolkit"
-            ),
-            Option(
-                "CUDA_PRETEND_VERSION",
-                help="Assumed CUDA version, in the form 3010 for 3.1.",
-            ),
+            Option("CUDA_ROOT", default=cuda_root_default,
+                help="Path to the CUDA toolkit"),
+            Option("CUDA_PRETEND_VERSION",
+                help="Assumed CUDA version, in the form 3010 for 3.1."),
             IncludeDir("CUDA", None),
             Switch("CUDA_ENABLE_GL", False, "Enable CUDA GL interoperability"),
             Switch("CUDA_ENABLE_CURAND", True, "Enable CURAND library"),
@@ -101,15 +96,10 @@ def get_config_schema():
             Libraries("CUDART", ["cudart"]),
             LibraryDir("CURAND", default_lib_dirs),
             Libraries("CURAND", ["curand"]),
-            StringListOption(
-                "CXXFLAGS",
-                cxxflags_default,
-                help="Any extra C++ compiler options to include",
-            ),
-            StringListOption(
-                "LDFLAGS", ldflags_default, help="Any extra linker options to include"
-            ),
-        ]
+            StringListOption("CXXFLAGS", cxxflags_default,
+                help="Any extra C++ compiler options to include"),
+            StringListOption("LDFLAGS", ldflags_default,
+                help="Any extra linker options to include")]
     )
@@ -117,13 +107,13 @@ def main():
     import sys
 
     from aksetup_helper import (
-        hack_distutils,
-        get_config,
-        setup,
         ExtensionUsingNumpy,
-        set_up_shipped_boost_if_requested,
-        check_git_submodules,
         NumpyBuildExtCommand,
+        check_git_submodules,
+        get_config,
+        hack_distutils,
+        set_up_shipped_boost_if_requested,
+        setup,
     )
 
     check_git_submodules()
@@ -194,7 +184,7 @@ def main():
         # metadata
         version=ver_dic["VERSION_TEXT"],
         description="Python wrapper for Nvidia CUDA",
-        long_description=open("README.rst", "rt").read(),
+        long_description=open("README.rst").read(),
         author="Andreas Kloeckner",
         author_email="inform@tiker.net",
         license="MIT",
@@ -233,13 +223,11 @@ def main():
         ext_modules=[
             ExtensionUsingNumpy(
                 "_driver",
-                [
-                    "src/cpp/cuda.cpp",
+                ["src/cpp/cuda.cpp",
                     "src/cpp/bitlog.cpp",
                     "src/wrapper/wrap_cudadrv.cpp",
                     "src/wrapper/mempool.cpp",
-                ]
-                + EXTRA_SOURCES,
+                    *EXTRA_SOURCES],
                 include_dirs=INCLUDE_DIRS,
                 library_dirs=LIBRARY_DIRS,
                 libraries=LIBRARIES,
diff --git a/test/test_cumath.py b/test/test_cumath.py
index 2db96d15573d1ba6878362ae734c6b387ddf25db..4267e120a7037339500bf3f6ecc5229febbb0e03 100644
--- a/test/test_cumath.py
+++ b/test/test_cumath.py
@@ -1,11 +1,13 @@
+from __future__ import annotations
+
 import math
 
-import numpy as np
-from pycuda.tools import mark_cuda_test
+import numpy as np
 
-import pycuda.gpuarray as gpuarray
-import pycuda.driver as drv  # noqa
 import pycuda.cumath as cumath
+import pycuda.driver as drv  # noqa
+import pycuda.gpuarray as gpuarray
+from pycuda.tools import mark_cuda_test
 
 
 sizes = [10, 128, 1024, 1 << 10, 1 << 13]
@@ -24,13 +26,10 @@ def make_unary_function_test(name, a=0, b=1, threshold=0, complex=False):
     def test():
         gpu_func = getattr(cumath, name)
         cpu_func = getattr(np, numpy_func_names.get(name, name))
-        if complex:
-            _dtypes = complex_dtypes
-        else:
-            _dtypes = dtypes
+        dtypes_ = complex_dtypes if complex else dtypes
 
         for s in sizes:
-            for dtype in _dtypes:
+            for dtype in dtypes_:
                 np.random.seed(1)
                 A = (np.random.random(s) * (b - a) + a).astype(dtype)
                 if complex:
diff --git a/test/test_driver.py b/test/test_driver.py
index d167aed218b96a18cf49c12e61958eecdf547fcc..5e9befe7464aaed0d2f3a6046ee56c253c619638 100644
--- a/test/test_driver.py
+++ b/test/test_driver.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = """
 Copyright 2008-2021 Andreas Kloeckner
 Copyright 2021 NVIDIA Corporation
@@ -5,13 +8,12 @@ Copyright 2021 NVIDIA Corporation
 
 import numpy as np
 import numpy.linalg as la
-from pycuda.tools import mark_cuda_test, dtype_to_ctype
-import pytest  # noqa
-
+import pytest
 
-import pycuda.gpuarray as gpuarray
 import pycuda.driver as drv
+import pycuda.gpuarray as gpuarray
 from pycuda.compiler import SourceModule
+from pycuda.tools import dtype_to_ctype, mark_cuda_test
 
 
 class TestDriver:
@@ -126,7 +128,7 @@ class TestDriver:
         strm.synchronize()
 
         dest = drv.pagelocked_empty_like(a)
-        multiply_them(drv.Out(dest), a_gpu, b_gpu, block=shape + (1,), stream=strm)
+        multiply_them(drv.Out(dest), a_gpu, b_gpu, block=(*shape, 1), stream=strm)
         strm.synchronize()
 
         drv.memcpy_dtoh_async(a, a_gpu, strm)
@@ -206,7 +208,7 @@ class TestDriver:
         drv.matrix_to_texref(a, mtx_tex, order="F")
 
         dest = np.zeros(shape, dtype=np.float32)
-        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])
+        copy_texture(drv.Out(dest), block=(*shape, 1), texrefs=[mtx_tex])
         assert la.norm(dest - a) == 0
 
     @mark_cuda_test
@@ -243,7 +245,7 @@ class TestDriver:
         drv.matrix_to_texref(b, mtx2_tex, order="F")
 
         dest = np.zeros(shape, dtype=np.float32)
-        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex, mtx2_tex])
+        copy_texture(drv.Out(dest), block=(*shape, 1), texrefs=[mtx_tex, mtx2_tex])
 
         assert la.norm(dest - a - b) < 1e-6
 
     @mark_cuda_test
@@ -276,12 +278,12 @@ class TestDriver:
         shape = (5, 6)
         channels = 4
         a = np.asarray(
-            np.random.randn(*((channels,) + shape)), dtype=np.float32, order="F"
+            np.random.randn(*((channels, *shape))), dtype=np.float32, order="F"
         )
         drv.bind_array_to_texref(drv.make_multichannel_2d_array(a, order="F"), mtx_tex)
 
-        dest = np.zeros(shape + (channels,), dtype=np.float32)
-        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])
+        dest = np.zeros((*shape, channels), dtype=np.float32)
+        copy_texture(drv.Out(dest), block=(*shape, 1), texrefs=[mtx_tex])
         reshaped_a = a.transpose(1, 2, 0)
         # print reshaped_a
         # print dest
@@ -314,13 +316,13 @@ class TestDriver:
 
         shape = (16, 16)
         channels = 4
-        a = np.random.randn(*(shape + (channels,))).astype(np.float32)
+        a = np.random.randn(*((*shape, channels))).astype(np.float32)
         a_gpu = drv.to_device(a)
         mtx_tex.set_address(a_gpu, a.nbytes)
         mtx_tex.set_format(drv.array_format.FLOAT, 4)
 
-        dest = np.zeros(shape + (channels,), dtype=np.float32)
-        copy_texture(drv.Out(dest), block=shape + (1,), texrefs=[mtx_tex])
+        dest = np.zeros((*shape, channels), dtype=np.float32)
+        copy_texture(drv.Out(dest), block=(*shape, 1), texrefs=[mtx_tex])
         # print a
         # print dest
         assert la.norm(dest - a) == 0
@@ -470,7 +472,9 @@ class TestDriver:
             int row = blockIdx.x*blockDim.x + threadIdx.x;
             int col = blockIdx.y*blockDim.y + threadIdx.y;
             int slice = blockIdx.z*blockDim.z + threadIdx.z;
-            dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row);
+            dest[row + col*blockDim.x*gridDim.x
+                + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(
+                    mtx_tex, slice, col, row);
         }
         """
         myKern = myKern.replace("fpName", fpName_str)
@@ -537,7 +541,8 @@ class TestDriver:
             int row = blockIdx.x*blockDim.x + threadIdx.x;
             int col = blockIdx.y*blockDim.y + threadIdx.y;
             int slice = blockIdx.z*blockDim.z + threadIdx.z;
-            int tid = row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
+            int tid = row + col*blockDim.x*gridDim.x
+                + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
             if (rw==0){
                 cuPres aux = dest[tid];
                 fp_surf3Dwrite(aux, mtx_tex, row, col, slice,cudaBoundaryModeClamp);}
@@ -694,11 +699,12 @@ class TestDriver:
 
     @mark_cuda_test
     def test_mempool_2(self):
-        from pycuda.tools import DeviceMemoryPool
         from random import randrange
+
+        from pycuda.tools import DeviceMemoryPool
 
         pool = DeviceMemoryPool()
 
-        for i in range(2000):
+        for _i in range(2000):
             s = randrange(1 << 31) >> randrange(32)
             bin_nr = pool.bin_number(s)
             asize = pool.alloc_size(bin_nr)
@@ -709,17 +715,16 @@ class TestDriver:
 
     @mark_cuda_test
     def test_mempool(self):
-        from pycuda.tools import bitlog2
-        from pycuda.tools import DeviceMemoryPool
+        from pycuda.tools import DeviceMemoryPool, bitlog2
 
         pool = DeviceMemoryPool()
         queue = []
-        free, total = drv.mem_get_info()
+        free, _total = drv.mem_get_info()
         e0 = bitlog2(free)
 
         for e in range(e0 - 6, e0 - 4):
-            for i in range(100):
+            for _i in range(100):
                 queue.append(pool.allocate(1 << e))
 
                 if len(queue) > 10:
                     queue.pop(0)
@@ -906,11 +911,7 @@ class TestDriver:
         dest = np.zeros(shape, dtype=tp)
         copy_texture(
             drv.Out(dest),
-            block=shape
-            + (
-                1,
-                1,
-            ),
+            block=(*shape, 1, 1),
             texrefs=[my_tex],
         )
@@ -1084,11 +1085,11 @@ class TestDriver:
         mod = DynamicModule()
         mod.add_source(
             test_outer_cu,
-            nvcc_options=(["-rdc=true", "-lcudadevrt"] + DEFAULT_NVCC_FLAGS),
+            nvcc_options=(["-rdc=true", "-lcudadevrt", *DEFAULT_NVCC_FLAGS]),
         )
         mod.add_source(
             test_inner_cu,
-            nvcc_options=(["-rdc=true", "-lcudadevrt"] + DEFAULT_NVCC_FLAGS),
+            nvcc_options=(["-rdc=true", "-lcudadevrt", *DEFAULT_NVCC_FLAGS]),
         )
         mod.add_stdlib("cudadevrt")
         mod.link()
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 0bc37eb405e41482149303339504a8a623c4b940..6bc0556618dc9d2ff820752d7ec8510e1ab2fc2c 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -1,16 +1,18 @@
 #! /usr/bin/env python
+from __future__ import annotations
+
+import operator
+import sys
 
 import numpy as np
 import numpy.linalg as la
-import sys
-from pycuda.tools import init_cuda_context_fixture
-from pycuda.characterize import has_double_support
+import pytest
 
-import pycuda.gpuarray as gpuarray
 import pycuda.driver as drv
+import pycuda.gpuarray as gpuarray
+from pycuda.characterize import has_double_support
 from pycuda.compiler import SourceModule
-import pytest
-import operator
+from pycuda.tools import init_cuda_context_fixture
 
 
 @pytest.fixture(autouse=True)
@@ -235,7 +237,7 @@ class TestGPUArray:
 
         assert (7 + a == a_added).all()
 
-    def test_substract_array(self):
+    def test_subtract_array(self):
         """Test the subtraction of two arrays."""
         # test data
         a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
@@ -312,15 +314,12 @@ class TestGPUArray:
     def test_random(self):
         from pycuda.curandom import rand as curand
 
-        if has_double_support():
-            dtypes = [np.float32, np.float64]
-        else:
-            dtypes = [np.float32]
+        dtypes = [np.float32, np.float64] if has_double_support() else [np.float32]
 
         for dtype in dtypes:
             a = curand((10, 100), dtype=dtype).get()
-            assert (0 <= a).all()
+            assert (a >= 0).all()
             assert (a < 1).all()
 
     def test_curand_wrappers(self):
@@ -334,8 +333,8 @@ class TestGPUArray:
         generator_types = []
         if get_curand_version() >= (3, 2, 0):
             from pycuda.curandom import (
-                XORWOWRandomNumberGenerator,
                 Sobol32RandomNumberGenerator,
+                XORWOWRandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -344,8 +343,8 @@ class TestGPUArray:
         if get_curand_version() >= (4, 0, 0):
             from pycuda.curandom import (
                 ScrambledSobol32RandomNumberGenerator,
-                Sobol64RandomNumberGenerator,
                 ScrambledSobol64RandomNumberGenerator,
+                Sobol64RandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -360,10 +359,7 @@ class TestGPUArray:
 
             generator_types.extend([MRG32k3aRandomNumberGenerator])
 
-        if has_double_support():
-            dtypes = [np.float32, np.float64]
-        else:
-            dtypes = [np.float32]
+        dtypes = [np.float32, np.float64] if has_double_support() else [np.float32]
 
         for gen_type in generator_types:
             gen = gen_type()
@@ -380,13 +376,13 @@ class TestGPUArray:
 
                 x = gen.gen_uniform(10000, dtype)
                 x_host = x.get()
-                assert (-1 <= x_host).all()
+                assert (x_host >= -1).all()
                 assert (x_host <= 1).all()
 
             gen.gen_uniform(10000, np.uint32)
             if get_curand_version() >= (5, 0, 0):
                 gen.gen_poisson(10000, np.uint32, 13.0)
 
-            for dtype in dtypes + [np.uint32]:
+            for dtype in [*dtypes, np.uint32]:
                 a = gpuarray.empty(1000000, dtype=dtype)
                 v = 10
                 a.fill(v)
@@ -411,8 +407,8 @@ class TestGPUArray:
         generator_types = []
         if get_curand_version() >= (3, 2, 0):
             from pycuda.curandom import (
-                XORWOWRandomNumberGenerator,
                 Sobol32RandomNumberGenerator,
+                XORWOWRandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -421,8 +417,8 @@ class TestGPUArray:
         if get_curand_version() >= (4, 0, 0):
             from pycuda.curandom import (
                 ScrambledSobol32RandomNumberGenerator,
-                Sobol64RandomNumberGenerator,
                 ScrambledSobol64RandomNumberGenerator,
+                Sobol64RandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -457,7 +453,7 @@ class TestGPUArray:
                 x = gen.gen_uniform(2 ** 31, dtype)
                 if dtype in [np.float32, np.float64]:
                     x_host = x.get()
-                    assert (-1 <= x_host).all()
+                    assert (x_host >= -1).all()
                     assert (x_host <= 1).all()
 
                 del x
@@ -468,7 +464,7 @@ class TestGPUArray:
                 v = 10
                 a.fill(v)
                 gen.fill_poisson(a)
-                tmp = (a.get() == (v - 1)).sum() / a.size  # noqa: F841
+                tmp = (a.get() == (v - 1)).sum() / a.size
                 # Check Poisson statistics (need 1e6 values)
                 # Compare with scipy.stats.poisson.pmf(v - 1, v)
                 assert np.isclose(0.12511, tmp, atol=0.005)
@@ -488,8 +484,8 @@ class TestGPUArray:
         generator_types = []
         if get_curand_version() >= (3, 2, 0):
             from pycuda.curandom import (
-                XORWOWRandomNumberGenerator,
                 Sobol32RandomNumberGenerator,
+                XORWOWRandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -498,8 +494,8 @@ class TestGPUArray:
         if get_curand_version() >= (4, 0, 0):
             from pycuda.curandom import (
                 ScrambledSobol32RandomNumberGenerator,
-                Sobol64RandomNumberGenerator,
                 ScrambledSobol64RandomNumberGenerator,
+                Sobol64RandomNumberGenerator,
             )
 
             generator_types.extend(
@@ -538,7 +534,7 @@ class TestGPUArray:
                 x = gen.gen_uniform(s, dtype)
                 if dtype in [np.float32, np.float64]:
                     x_host = x.get()
-                    assert (-1 <= x_host).all()
+                    assert (x_host >= -1).all()
                     assert (x_host <= 1).all()
 
                 del x
@@ -550,7 +546,7 @@ class TestGPUArray:
                 v = 10
                 a.fill(v)
                 gen.fill_poisson(a)
-                tmp = (a.get() == (v - 1)).sum() / a.size  # noqa: F841
+                tmp = (a.get() == (v - 1)).sum() / a.size
                 # Check Poisson statistics (need 1e6 values)
                 # Compare with scipy.stats.poisson.pmf(v - 1, v)
                 assert np.isclose(0.12511, tmp, atol=0.005)
@@ -638,7 +634,7 @@ class TestGPUArray:
         #    a[i] = float('nan')
         from random import randrange
 
-        for i in range(size // 10):
+        for _i in range(size // 10):
             a[randrange(0, size)] = float("nan")
         return a
@@ -921,10 +917,11 @@ class TestGPUArray:
             def allocator(size):
                 nonlocal alloc_uses, pool
                 alloc_uses += 1
-                return pool.allocate(size)
+                return pool.allocate(size)  # noqa: B023
 
             alloc = None if pool is None else allocator
-            sum_a_gpu = gpuarray.subset_sum(meaningful_indices_gpu, a_gpu, allocator=alloc).get()
+            sum_a_gpu = gpuarray.subset_sum(
+                meaningful_indices_gpu, a_gpu, allocator=alloc).get()
             assert np.allclose(sum_a_gpu, sum_a)
 
             if pool is not None:
                 assert alloc_uses == 1
@@ -970,7 +967,7 @@ class TestGPUArray:
 
         from random import randrange
 
-        for i in range(200):
+        for _i in range(200):
             start = randrange(sz)
             end = randrange(start, sz)
@@ -989,7 +986,7 @@ class TestGPUArray:
 
         from random import randrange
 
-        for i in range(200):
+        for _i in range(200):
             start = randrange(n)
             end = randrange(start, n)
@@ -999,8 +996,8 @@ class TestGPUArray:
             assert la.norm(a_gpu_slice.get() - a_slice) == 0
 
     def test_2d_slice_f(self):
-        from pycuda.curandom import rand as curand
         import pycuda.gpuarray as gpuarray
+        from pycuda.curandom import rand as curand
 
         n = 1000
         m = 300
@@ -1012,7 +1009,7 @@ class TestGPUArray:
 
         from random import randrange
 
-        for i in range(200):
+        for _i in range(200):
             start = randrange(n)
             end = randrange(start, n)
diff --git a/test/undistributed/elwise-perf.py b/test/undistributed/elwise-perf.py
index 8562b99c0d5afb3296a628c3ee2d9e0d18c3514d..960c275b6c0d58bcb8794c0d062c8125ea9f75f1 100644
--- a/test/undistributed/elwise-perf.py
+++ b/test/undistributed/elwise-perf.py
@@ -1,7 +1,6 @@
-import pycuda.driver as drv
-import pycuda.autoinit
+from __future__ import annotations
+
 import numpy
-import numpy.linalg as la
 
 
 def main():
@@ -22,17 +21,14 @@ def main():
        a.fill(1)
        b.fill(2)
 
-        if power > 20:
-            count = 10
-        else:
-            count = 100
+        count = 10 if power > 20 else 100
 
        elapsed = [0]
 
        def add_timer(_, time):
            elapsed[0] += time()
 
-        for i in range(count):
+        for _i in range(count):
            a.mul_add(1, b, 2, add_timer)
 
        bytes = a.nbytes * count * 3
diff --git a/test/undistributed/measure_gpuarray_speed.py b/test/undistributed/measure_gpuarray_speed.py
index 4f6f390e621aa90cbcd9f19f7eacab44e6773c11..4e441fde9a749b5a09a02db3b2f477dcd0a2349c 100755
--- a/test/undistributed/measure_gpuarray_speed.py
+++ b/test/undistributed/measure_gpuarray_speed.py
@@ -1,8 +1,9 @@
 #! /usr/bin/env python
-import pycuda.driver as drv
-import pycuda.autoinit
+from __future__ import annotations
+
 import numpy
-import numpy.linalg as la
+
+import pycuda.driver as drv
 
 
 def main():
@@ -26,17 +27,14 @@ def main():
            b = gpuarray.zeros((size,), dtype=numpy.float32)
            b.fill(1)
 
-            if power > 20:
-                count = 100
-            else:
-                count = 1000
+            count = 100 if power > 20 else 1000
 
            # gpu -----------------------------------------------------------------
            start = drv.Event()
            end = drv.Event()
            start.record()
 
-            for i in range(count):
+            for _i in range(count):
                a + b
 
            end.record()
@@ -57,7 +55,7 @@ def main():
            from time import time
 
            start = time()
-            for i in range(count):
+            for _i in range(count):
                a_cpu + b_cpu
 
            secs = time() - start
diff --git a/test/undistributed/reduction-perf.py b/test/undistributed/reduction-perf.py
index da4a0d03c8a62e78838a8acb202dd9f452ffcf08..e9f604cbf257065cd206bab001fdc6874d6f080e 100644
--- a/test/undistributed/reduction-perf.py
+++ b/test/undistributed/reduction-perf.py
@@ -1,8 +1,9 @@
-import pycuda.autoinit
-import pycuda.gpuarray as gpuarray
-import pycuda.driver as cuda
+from __future__ import annotations
+
 import numpy
 
+import pycuda.driver as cuda
+
 
 def main():
     from pytools import Table
@@ -41,12 +42,12 @@ def main():
             return result
 
         # warm-up
-        for i in range(3):
+        for _i in range(3):
             krnl(a_gpu, b_gpu)
 
         cnt = 10
 
-        for i in range(cnt):
+        for _i in range(cnt):
             krnl(
                 a_gpu,
                 b_gpu,
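# [Editor's note -- not part of the patch, which is truncated above] A minimal,
# self-contained sketch of the cache-key convention that the
# context_dependent_memoize hunk in pycuda/tools.py earlier in this diff settles
# on: keyword arguments enter the key only when present, so purely positional
# call sites keep the cheaper key shape they had before this change. The helper
# name below is hypothetical.
def _memo_cache_key(args, kwargs):
    # frozenset makes keyword order irrelevant; the bare (args,) branch keeps
    # positional-only keys compact
    return (args, frozenset(kwargs.items())) if kwargs else (args,)


assert _memo_cache_key((1, 2), {}) == ((1, 2),)
assert _memo_cache_key((1,), {"b": 2}) == ((1,), frozenset({("b", 2)}))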