diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2f447df4f7c601672f037ffe3374d216d1ddd151..21ba2fbaec2231e264a39db1770d9aa8ffa79b4b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: - name: "Main Script" run: | curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test + . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" ./test examples pylint: name: Pylint diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index dab7af8219adb414bbd9c2a07769da100ee60482..5fb7eccdf83826e581ef9957cbb4a1b192246b25 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -247,7 +247,7 @@ Documentation: Flake8: script: - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test + - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples tags: - python3 except: diff --git a/examples/black-hole-accretion.py b/examples/black-hole-accretion.py old mode 100755 new mode 100644 index 693470b04290e5e251c9abb2694aa84b1641f915..e4cdbbbe612299e8d63aec2f46359f06cdc706ea --- a/examples/black-hole-accretion.py +++ b/examples/black-hole-accretion.py @@ -28,80 +28,29 @@ # - Pierre Lena for his passion about science and vulgarisation # If crash on OpenCL Intel implementation, add following options and force -#export PYOPENCL_COMPILER_OUTPUT=1 -#export CL_CONFIG_USE_VECTORIZER=True -#export CL_CONFIG_CPU_VECTORIZER_MODE=16 +# export PYOPENCL_COMPILER_OUTPUT=1 +# export CL_CONFIG_USE_VECTORIZER=True +# export CL_CONFIG_CPU_VECTORIZER_MODE=16 import pyopencl as cl import numpy -import time,string -from numpy.random import randint as nprnd +import time import sys import getopt from socket import gethostname + def DictionariesAPI(): - PhysicsList={'Einstein':0,'Newton':1} - return(PhysicsList) + PhysicsList = {"Einstein": 0, "Newton": 1} + return PhysicsList + # # Blank space below to simplify debugging on OpenCL code # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -BlobOpenCL= """ +BlobOpenCL = """ #define PI (float)3.14159265359e0f #define nbr 256 @@ -218,7 +167,7 @@ float spectre_cn(float rf32,float b32,float db32, MYFLOAT v=1.e0f-3.e0f/r; - qu=1.e0f/sqrt((1.e0f-3.e0f/r)*r)*(sqrt(r)-sqrt(6.e0f)+sqrt(3.e0f)/2.e0f*log((sqrt(r)+sqrt(3.e0f))/(sqrt(r)-sqrt(3.e0f))* 0.17157287525380988e0f )); + qu=1.e0f/sqrt((1.e0f-3.e0f/r)*r)*(sqrt(r)-sqrt(6.e0f)+sqrt(3.e0f)/2.e0f*log((sqrt(r)+sqrt(3.e0f))/(sqrt(r)-sqrt(3.e0f))* 0.17157287525380988e0f )); // # noqa: E501 temp_em=temp*sqrt(m)*exp(0.25e0f*log(m_point)-0.75e0f*log(r)-0.125e0f*log(v)+0.25e0f*log(fabs(qu))); @@ -340,7 +289,7 @@ __kernel void EachPixel(__global float *zImage,__global float *fImage, vp=vs; rungekutta(&ps,&us,&vs,pp,up,vp,h,m,b); rps=fabs(b/us); - ExitOnImpact = ((fmod(pp,PI)<fmod(phd,PI))&&(fmod(ps,PI)>fmod(phd,PI)))&&(rps>=ri)&&(rps<=re)?1:0; + ExitOnImpact = ((fmod(pp,PI)<fmod(phd,PI))&&(fmod(ps,PI)>fmod(phd,PI)))&&(rps>=ri)&&(rps<=re)?1:0; } while ((rps>=rs)&&(rps<=rp0)&&(ExitOnImpact==0)&&(nh<TRACKPOINTS)); @@ -827,78 +776,8 @@ __kernel void Original(__global float *zImage,__global float *fImage, """ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def KernelCodeCuda(): - BlobCUDA= """ + BlobCUDA = """ #define PI (float)3.14159265359 #define nbr 256 @@ -1019,7 +898,7 @@ __device__ float spectre_cn(float rf32,float b32,float db32, MYFLOAT v=1.-3./r; - qu=1./sqrt((1.-3./r)*r)*(sqrt(r)-sqrt(6.)+sqrt(3.)/2.*log((sqrt(r)+sqrt(3.))/(sqrt(r)-sqrt(3.))* 0.17157287525380988 )); + qu=1./sqrt((1.-3./r)*r)*(sqrt(r)-sqrt(6.)+sqrt(3.)/2.*log((sqrt(r)+sqrt(3.))/(sqrt(r)-sqrt(3.))* 0.17157287525380988 )); // # noqa: #051 temp_em=temp*sqrt(m)*exp(0.25*log(m_point)-0.75*log(r)-0.125*log(v)+0.25*log(fabs(qu))); @@ -1145,7 +1024,7 @@ __global__ void EachPixel(float *zImage,float *fImage, rungekutta(&ps,&us,&vs,pp,up,vp,h,m,b); rpp=rps; rps=fabs(b/us); - ExitOnImpact = ((fmod(pp,PI)<fmod(phd,PI))&&(fmod(ps,PI)>fmod(phd,PI)))&&(rps>ri)&&(rps<re)?1:0; + ExitOnImpact = ((fmod(pp,PI)<fmod(phd,PI))&&(fmod(ps,PI)>fmod(phd,PI)))&&(rps>ri)&&(rps<re)?1:0; } while ((rps>=rs)&&(rps<=rp0)&&(ExitOnImpact==0)); @@ -1612,7 +1491,8 @@ __global__ void Original(float *zImage,float *fImage, } """ - return(BlobCUDA) + return BlobCUDA + # def ImageOutput(sigma,prefix,Colors): # import matplotlib.pyplot as plt @@ -1625,459 +1505,602 @@ __global__ void Original(float *zImage,float *fImage, # print("Save image as %s.png file" % prefix) # print("Save Time : %f" % save_time) -def ImageOutput(sigma,prefix,Colors): + +def ImageOutput(sigma, prefix, Colors): from PIL import Image - Max=sigma.max() - Min=sigma.min() + + Max = sigma.max() + Min = sigma.min() # Normalize value as 8bits Integer - SigmaInt=(255*(sigma-Min)/(Max-Min)).astype('uint8') + SigmaInt = (255 * (sigma - Min) / (Max - Min)).astype("uint8") image = Image.fromarray(SigmaInt) image.save("%s.jpg" % prefix) - -def BlackHoleCL(zImage,fImage,InputCL): - - kernel_params = {} - - Device=InputCL['Device'] - GpuStyle=InputCL['GpuStyle'] - VariableType=InputCL['VariableType'] - Size=InputCL['Size'] - Mass=InputCL['Mass'] - InternalRadius=InputCL['InternalRadius'] - ExternalRadius=InputCL['ExternalRadius'] - Angle=InputCL['Angle'] - Method=InputCL['Method'] - TrackPoints=InputCL['TrackPoints'] - Physics=InputCL['Physics'] - NoImage=InputCL['NoImage'] - TrackSave=InputCL['TrackSave'] - - PhysicsList=DictionariesAPI() - - if InputCL['BlackBody']: + + +def BlackHoleCL(zImage, fImage, InputCL): + Device = InputCL["Device"] + Mass = InputCL["Mass"] + InternalRadius = InputCL["InternalRadius"] + ExternalRadius = InputCL["ExternalRadius"] + Angle = InputCL["Angle"] + Method = InputCL["Method"] + TrackPoints = InputCL["TrackPoints"] + Physics = InputCL["Physics"] + NoImage = InputCL["NoImage"] + TrackSave = InputCL["TrackSave"] + + PhysicsList = DictionariesAPI() + + if InputCL["BlackBody"]: # Spectrum is Black Body one - Line=0 + Line = 0 else: # Spectrum is Monochromatic Line one - Line=1 + Line = 1 - Trajectories=numpy.zeros((int(InputCL['Size']/2),InputCL['TrackPoints']),dtype=numpy.float32) - IdLast=numpy.zeros(int(InputCL['Size']/2),dtype=numpy.int32) + Trajectories = numpy.zeros( + (int(InputCL["Size"] / 2), InputCL["TrackPoints"]), dtype=numpy.float32 + ) + IdLast = numpy.zeros(int(InputCL["Size"] / 2), dtype=numpy.int32) # Je detecte un peripherique GPU dans la liste des peripheriques - Id=0 - HasXPU=False + Id = 0 + HasXPU = False for platform in cl.get_platforms(): for device in platform.get_devices(): - if Id==Device: - PF4XPU=platform.name - XPU=device - print("CPU/GPU selected: ",device.name.lstrip()) - HasXPU=True - Id+=1 - - if HasXPU==False: - print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1)) + if Id == Device: + PF4XPU = platform.name + XPU = device + print("CPU/GPU selected: ", device.name.lstrip()) + HasXPU = True + Id += 1 + + if not HasXPU: + print("No XPU #%i found in all of %i devices, sorry..." % (Device, Id - 1)) sys.exit() ctx = cl.Context([XPU]) - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) - - BuildOptions="-DPHYSICS=%i -DSETTRACKPOINTS=%i " % (PhysicsList[Physics],InputCL['TrackPoints']) - - print('My Platform is ',PF4XPU) - - if 'Intel' in PF4XPU or 'Experimental' in PF4XPU or 'Clover' in PF4XPU or 'Portable' in PF4XPU : - print('No extra options for Intel and Clover!') + queue = cl.CommandQueue( + ctx, properties=cl.command_queue_properties.PROFILING_ENABLE + ) + + BuildOptions = "-DPHYSICS=%i -DSETTRACKPOINTS=%i " % ( + PhysicsList[Physics], + InputCL["TrackPoints"], + ) + + print("My Platform is ", PF4XPU) + + if ( + "Intel" in PF4XPU + or "Experimental" in PF4XPU + or "Clover" in PF4XPU + or "Portable" in PF4XPU + ): + print("No extra options for Intel and Clover!") else: - BuildOptions = BuildOptions+" -cl-mad-enable" + BuildOptions = BuildOptions + " -cl-mad-enable" + + BlackHoleCL = cl.Program(ctx, BlobOpenCL).build(options=BuildOptions) - BlackHoleCL = cl.Program(ctx,BlobOpenCL).build(options = BuildOptions) - # Je recupere les flag possibles pour les buffers mf = cl.mem_flags - if Method=='TrajectoPixel' or Method=='TrajectoCircle': - TrajectoriesCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=Trajectories) + if Method == "TrajectoPixel" or Method == "TrajectoCircle": + TrajectoriesCL = cl.Buffer( + ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=Trajectories + ) IdLastCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=IdLast) zImageCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=zImage) fImageCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=fImage) - - start_time=time.time() - - if Method=='EachPixel': - CLLaunch=BlackHoleCL.EachPixel(queue,(zImage.shape[0],zImage.shape[1]), - None,zImageCL,fImageCL, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line)) + + start_time = time.time() + + if Method == "EachPixel": + CLLaunch = BlackHoleCL.EachPixel( + queue, + (zImage.shape[0], zImage.shape[1]), + None, + zImageCL, + fImageCL, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + ) CLLaunch.wait() - elif Method=='Original': - CLLaunch=BlackHoleCL.Original(queue,(1,), - None,zImageCL,fImageCL, - numpy.uint32(zImage.shape[0]/2), - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line)) + elif Method == "Original": + CLLaunch = BlackHoleCL.Original( + queue, + (1,), + None, + zImageCL, + fImageCL, + numpy.uint32(zImage.shape[0] / 2), + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + ) CLLaunch.wait() - elif Method=='EachCircle': - CLLaunch=BlackHoleCL.EachCircle(queue,(int(zImage.shape[0]/2),), - None,zImageCL,fImageCL, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line)) + elif Method == "EachCircle": + CLLaunch = BlackHoleCL.EachCircle( + queue, + (int(zImage.shape[0] / 2),), + None, + zImageCL, + fImageCL, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + ) CLLaunch.wait() - elif Method=='TrajectoCircle': - CLLaunch=BlackHoleCL.Trajectory(queue,(Trajectories.shape[0],), - None,TrajectoriesCL,IdLastCL, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line)) - - CLLaunch=BlackHoleCL.Circle(queue,(Trajectories.shape[0], - int(zImage.shape[0]*4)),None, - TrajectoriesCL,IdLastCL, - zImageCL,fImageCL, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line)) + elif Method == "TrajectoCircle": + CLLaunch = BlackHoleCL.Trajectory( + queue, + (Trajectories.shape[0],), + None, + TrajectoriesCL, + IdLastCL, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + ) + + CLLaunch = BlackHoleCL.Circle( + queue, + (Trajectories.shape[0], int(zImage.shape[0] * 4)), + None, + TrajectoriesCL, + IdLastCL, + zImageCL, + fImageCL, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + ) CLLaunch.wait() else: - CLLaunch=BlackHoleCL.Trajectory(queue,(Trajectories.shape[0],), - None,TrajectoriesCL,IdLastCL, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line)) - - CLLaunch=BlackHoleCL.Pixel(queue,(zImage.shape[0],zImage.shape[1]),None, - zImageCL,fImageCL,TrajectoriesCL,IdLastCL, - numpy.uint32(Trajectories.shape[0]), - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line)) + CLLaunch = BlackHoleCL.Trajectory( + queue, + (Trajectories.shape[0],), + None, + TrajectoriesCL, + IdLastCL, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + ) + + CLLaunch = BlackHoleCL.Pixel( + queue, + (zImage.shape[0], zImage.shape[1]), + None, + zImageCL, + fImageCL, + TrajectoriesCL, + IdLastCL, + numpy.uint32(Trajectories.shape[0]), + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + ) CLLaunch.wait() - - compute = time.time()-start_time - - cl.enqueue_copy(queue,zImage,zImageCL).wait() - cl.enqueue_copy(queue,fImage,fImageCL).wait() - if Method=='TrajectoPixel' or Method=='TrajectoCircle': - cl.enqueue_copy(queue,Trajectories,TrajectoriesCL).wait() - cl.enqueue_copy(queue,IdLast,IdLastCL).wait() - elapsed = time.time()-start_time + + compute = time.time() - start_time + + cl.enqueue_copy(queue, zImage, zImageCL).wait() + cl.enqueue_copy(queue, fImage, fImageCL).wait() + if Method == "TrajectoPixel" or Method == "TrajectoCircle": + cl.enqueue_copy(queue, Trajectories, TrajectoriesCL).wait() + cl.enqueue_copy(queue, IdLast, IdLastCL).wait() + elapsed = time.time() - start_time print("\nCompute Time : %f" % compute) print("Elapsed Time : %f\n" % elapsed) - zMaxPosition=numpy.where(zImage[:,:]==zImage.max()) - fMaxPosition=numpy.where(fImage[:,:]==fImage.max()) - print("Z max @(%f,%f) : %f" % ((1.*zMaxPosition[1][0]/zImage.shape[1]-0.5,1.*zMaxPosition[0][0]/zImage.shape[0]-0.5,zImage.max()))) - print("Flux max @(%f,%f) : %f" % ((1.*fMaxPosition[1][0]/fImage.shape[1]-0.5,1.*fMaxPosition[0][0]/fImage.shape[0]-0.5,fImage.max()))) + zMaxPosition = numpy.where(zImage[:, :] == zImage.max()) + fMaxPosition = numpy.where(fImage[:, :] == fImage.max()) + print( + "Z max @(%f,%f) : %f" + % ( + ( + 1.0 * zMaxPosition[1][0] / zImage.shape[1] - 0.5, + 1.0 * zMaxPosition[0][0] / zImage.shape[0] - 0.5, + zImage.max(), + ) + ) + ) + print( + "Flux max @(%f,%f) : %f" + % ( + ( + 1.0 * fMaxPosition[1][0] / fImage.shape[1] - 0.5, + 1.0 * fMaxPosition[0][0] / fImage.shape[0] - 0.5, + fImage.max(), + ) + ) + ) zImageCL.release() fImageCL.release() - if Method=='TrajectoPixel' or Method=='TrajectoCircle': + if Method == "TrajectoPixel" or Method == "TrajectoCircle": if not NoImage: - AngleStep=4*numpy.pi/TrackPoints - Angles=numpy.arange(0.,4*numpy.pi,AngleStep) - Angles.shape=(1,TrackPoints) - Hostname=gethostname() - Date=time.strftime("%Y%m%d_%H%M%S") - ImageInfo="%s_Device%i_%s_%s" % (Method,Device,Hostname,Date) - + AngleStep = 4 * numpy.pi / TrackPoints + Angles = numpy.arange(0.0, 4 * numpy.pi, AngleStep) + Angles.shape = (1, TrackPoints) + if TrackSave: # numpy.savetxt("TrouNoirTrajectories_%s.csv" % ImageInfo, # numpy.transpose(numpy.concatenate((Angles,Trajectories),axis=0)), # delimiter=' ', fmt='%.2e') - numpy.savetxt("TrouNoirTrajectories.csv", - numpy.transpose(numpy.concatenate((Angles,Trajectories),axis=0)),delimiter=' ', fmt='%.2e') - + numpy.savetxt( + "TrouNoirTrajectories.csv", + numpy.transpose(numpy.concatenate((Angles, Trajectories), + axis=0)), + delimiter=" ", + fmt="%.2e", + ) + TrajectoriesCL.release() IdLastCL.release() - - return(elapsed) - -def BlackHoleCUDA(zImage,fImage,InputCL): - - kernel_params = {} - - Device=InputCL['Device'] - GpuStyle=InputCL['GpuStyle'] - VariableType=InputCL['VariableType'] - Size=InputCL['Size'] - Mass=InputCL['Mass'] - InternalRadius=InputCL['InternalRadius'] - ExternalRadius=InputCL['ExternalRadius'] - Angle=InputCL['Angle'] - Method=InputCL['Method'] - TrackPoints=InputCL['TrackPoints'] - Physics=InputCL['Physics'] - Threads=InputCL['Threads'] - - PhysicsList=DictionariesAPI() - - if InputCL['BlackBody']: + + return elapsed + + +def BlackHoleCUDA(zImage, fImage, InputCL): + Device = InputCL["Device"] + Mass = InputCL["Mass"] + InternalRadius = InputCL["InternalRadius"] + ExternalRadius = InputCL["ExternalRadius"] + Angle = InputCL["Angle"] + Method = InputCL["Method"] + TrackPoints = InputCL["TrackPoints"] + Physics = InputCL["Physics"] + Threads = InputCL["Threads"] + + PhysicsList = DictionariesAPI() + + if InputCL["BlackBody"]: # Spectrum is Black Body one - Line=0 + Line = 0 else: # Spectrum is Monochromatic Line one - Line=1 + Line = 1 - Trajectories=numpy.zeros((int(InputCL['Size']/2),InputCL['TrackPoints']),dtype=numpy.float32) - IdLast=numpy.zeros(int(InputCL['Size']/2),dtype=numpy.int32) + Trajectories = numpy.zeros( + (int(InputCL["Size"] / 2), InputCL["TrackPoints"]), dtype=numpy.float32 + ) + IdLast = numpy.zeros(int(InputCL["Size"] / 2), dtype=numpy.int32) try: # For PyCUDA import import pycuda.driver as cuda from pycuda.compiler import SourceModule - + cuda.init() for Id in range(cuda.Device.count()): - if Id==Device: - XPU=cuda.Device(Id) + if Id == Device: + XPU = cuda.Device(Id) print("GPU selected %s" % XPU.name()) print except ImportError: print("Platform does not seem to support CUDA") - Context=XPU.make_context() + Context = XPU.make_context() try: - mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DPHYSICS=%i -DSETTRACKPOINTS=%i' % (PhysicsList[Physics],TrackPoints)]) + mod = SourceModule( + KernelCodeCuda(), + options=[ + "--compiler-options", + "-DPHYSICS=%i -DSETTRACKPOINTS=%i" + % (PhysicsList[Physics], TrackPoints), + ], + ) print("Compilation seems to be OK") - except: + except Exception: print("Compilation seems to break") - EachPixelCU=mod.get_function("EachPixel") - OriginalCU=mod.get_function("Original") - EachCircleCU=mod.get_function("EachCircle") - TrajectoryCU=mod.get_function("Trajectory") - PixelCU=mod.get_function("Pixel") - CircleCU=mod.get_function("Circle") - - TrajectoriesCU = cuda.mem_alloc(Trajectories.size*Trajectories.dtype.itemsize) + EachPixelCU = mod.get_function("EachPixel") + OriginalCU = mod.get_function("Original") + EachCircleCU = mod.get_function("EachCircle") + TrajectoryCU = mod.get_function("Trajectory") + PixelCU = mod.get_function("Pixel") + CircleCU = mod.get_function("Circle") + + TrajectoriesCU = cuda.mem_alloc(Trajectories.size * Trajectories.dtype.itemsize) cuda.memcpy_htod(TrajectoriesCU, Trajectories) - zImageCU = cuda.mem_alloc(zImage.size*zImage.dtype.itemsize) + zImageCU = cuda.mem_alloc(zImage.size * zImage.dtype.itemsize) cuda.memcpy_htod(zImageCU, zImage) - fImageCU = cuda.mem_alloc(fImage.size*fImage.dtype.itemsize) + fImageCU = cuda.mem_alloc(fImage.size * fImage.dtype.itemsize) cuda.memcpy_htod(zImageCU, fImage) - IdLastCU = cuda.mem_alloc(IdLast.size*IdLast.dtype.itemsize) + IdLastCU = cuda.mem_alloc(IdLast.size * IdLast.dtype.itemsize) cuda.memcpy_htod(IdLastCU, IdLast) - start_time=time.time() - - if Method=='EachPixel': - EachPixelCU(zImageCU,fImageCU, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line), - grid=(int(zImage.shape[0]/Threads), - int(zImage.shape[1]/Threads)), - block=(Threads,Threads,1)) - elif Method=='EachCircle': - EachCircleCU(zImageCU,fImageCU, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line), - grid=(int(zImage.shape[0]/Threads/2),1), - block=(Threads,1,1)) - elif Method=='Original': - OriginalCU(zImageCU,fImageCU, - numpy.uint32(zImage.shape[0]/2), - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line), - grid=(1,1), - block=(1,1,1)) - elif Method=='TrajectoCircle': - TrajectoryCU(TrajectoriesCU,IdLastCU, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line), - grid=(int(Trajectories.shape[0]/Threads),1), - block=(Threads,1,1)) - - CircleCU(TrajectoriesCU,IdLastCU,zImageCU,fImageCU, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line), - grid=(int(Trajectories.shape[0]/Threads), - int(zImage.shape[0]*4/Threads)), - block=(Threads,Threads,1)) + start_time = time.time() + + if Method == "EachPixel": + EachPixelCU( + zImageCU, + fImageCU, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + grid=(int(zImage.shape[0] / Threads), int(zImage.shape[1] / Threads)), + block=(Threads, Threads, 1), + ) + elif Method == "EachCircle": + EachCircleCU( + zImageCU, + fImageCU, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + grid=(int(zImage.shape[0] / Threads / 2), 1), + block=(Threads, 1, 1), + ) + elif Method == "Original": + OriginalCU( + zImageCU, + fImageCU, + numpy.uint32(zImage.shape[0] / 2), + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + grid=(1, 1), + block=(1, 1, 1), + ) + elif Method == "TrajectoCircle": + TrajectoryCU( + TrajectoriesCU, + IdLastCU, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + grid=(int(Trajectories.shape[0] / Threads), 1), + block=(Threads, 1, 1), + ) + + CircleCU( + TrajectoriesCU, + IdLastCU, + zImageCU, + fImageCU, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + grid=( + int(Trajectories.shape[0] / Threads), + int(zImage.shape[0] * 4 / Threads), + ), + block=(Threads, Threads, 1), + ) else: # Default method: TrajectoPixel - TrajectoryCU(TrajectoriesCU,IdLastCU, - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line), - grid=(int(Trajectories.shape[0]/Threads),1), - block=(Threads,1,1)) - - PixelCU(zImageCU,fImageCU,TrajectoriesCU,IdLastCU, - numpy.uint32(Trajectories.shape[0]), - numpy.float32(Mass), - numpy.float32(InternalRadius), - numpy.float32(ExternalRadius), - numpy.float32(Angle), - numpy.int32(Line), - grid=(int(zImage.shape[0]/Threads), - int(zImage.shape[1]/Threads),1), - block=(Threads,Threads,1)) + TrajectoryCU( + TrajectoriesCU, + IdLastCU, + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + grid=(int(Trajectories.shape[0] / Threads), 1), + block=(Threads, 1, 1), + ) + + PixelCU( + zImageCU, + fImageCU, + TrajectoriesCU, + IdLastCU, + numpy.uint32(Trajectories.shape[0]), + numpy.float32(Mass), + numpy.float32(InternalRadius), + numpy.float32(ExternalRadius), + numpy.float32(Angle), + numpy.int32(Line), + grid=(int(zImage.shape[0] / Threads), int(zImage.shape[1] / Threads), 1), + block=(Threads, Threads, 1), + ) Context.synchronize() - compute = time.time()-start_time + compute = time.time() - start_time - cuda.memcpy_dtoh(zImage,zImageCU) - cuda.memcpy_dtoh(fImage,fImageCU) - if Method=='TrajectoPixel' or Method=='TrajectoCircle': - cuda.memcpy_dtoh(Trajectories,TrajectoriesCU) - elapsed = time.time()-start_time + cuda.memcpy_dtoh(zImage, zImageCU) + cuda.memcpy_dtoh(fImage, fImageCU) + if Method == "TrajectoPixel" or Method == "TrajectoCircle": + cuda.memcpy_dtoh(Trajectories, TrajectoriesCU) + elapsed = time.time() - start_time print("\nCompute Time : %f" % compute) print("Elapsed Time : %f\n" % elapsed) - zMaxPosition=numpy.where(zImage[:,:]==zImage.max()) - fMaxPosition=numpy.where(fImage[:,:]==fImage.max()) - print("Z max @(%f,%f) : %f" % ((1.*zMaxPosition[1][0]/zImage.shape[1]-0.5,1.*zMaxPosition[0][0]/zImage.shape[0]-0.5,zImage.max()))) - print("Flux max @(%f,%f) : %f" % ((1.*fMaxPosition[1][0]/fImage.shape[1]-0.5,1.*fMaxPosition[0][0]/fImage.shape[0]-0.5,fImage.max()))) + zMaxPosition = numpy.where(zImage[:, :] == zImage.max()) + fMaxPosition = numpy.where(fImage[:, :] == fImage.max()) + print( + "Z max @(%f,%f) : %f" + % ( + ( + 1.0 * zMaxPosition[1][0] / zImage.shape[1] - 0.5, + 1.0 * zMaxPosition[0][0] / zImage.shape[0] - 0.5, + zImage.max(), + ) + ) + ) + print( + "Flux max @(%f,%f) : %f" + % ( + ( + 1.0 * fMaxPosition[1][0] / fImage.shape[1] - 0.5, + 1.0 * fMaxPosition[0][0] / fImage.shape[0] - 0.5, + fImage.max(), + ) + ) + ) - Context.pop() Context.detach() - if Method=='TrajectoPixel' or Method=='TrajectoCircle': + if Method == "TrajectoPixel" or Method == "TrajectoCircle": if not NoImage: - AngleStep=4*numpy.pi/TrackPoints - Angles=numpy.arange(0.,4*numpy.pi,AngleStep) - Angles.shape=(1,TrackPoints) - Hostname=gethostname() - Date=time.strftime("%Y%m%d_%H%M%S") - ImageInfo="%s_Device%i_%s_%s" % (Method,Device,Hostname,Date) - + AngleStep = 4 * numpy.pi / TrackPoints + Angles = numpy.arange(0.0, 4 * numpy.pi, AngleStep) + Angles.shape = (1, TrackPoints) + # numpy.savetxt("TrouNoirTrajectories_%s.csv" % ImageInfo, # numpy.transpose(numpy.concatenate((Angles,Trajectories),axis=0)), # delimiter=' ', fmt='%.2e') - numpy.savetxt("TrouNoirTrajectories.csv", - numpy.transpose(numpy.concatenate((Angles,Trajectories),axis=0)), - delimiter=' ', fmt='%.2e') + numpy.savetxt( + "TrouNoirTrajectories.csv", + numpy.transpose(numpy.concatenate((Angles, Trajectories), axis=0)), + delimiter=" ", + fmt="%.2e", + ) - - return(elapsed) + return elapsed -if __name__=='__main__': + +if __name__ == "__main__": # Default device: first one! - Device=0 + Device = 0 # Default implementation: OpenCL, most versatile! - GpuStyle = 'OpenCL' - Mass = 1. + GpuStyle = "OpenCL" + Mass = 1.0 # Internal Radius 3 times de Schwarzschild Radius - InternalRadius=6.*Mass + InternalRadius = 6.0 * Mass # - ExternalRadius=12. + ExternalRadius = 12.0 # # Angle with normal to disc 10 degrees - Angle = numpy.pi/180.*(90.-10.) + Angle = numpy.pi / 180.0 * (90.0 - 10.0) # Radiation of disc : BlackBody or Monochromatic BlackBody = False # Size of image - Size=1024 + Size = 1024 # Variable Type - VariableType='FP32' + VariableType = "FP32" # ? - q=-2 + q = -2 # Method of resolution - Method='TrajectoPixel' + Method = "TrajectoPixel" # Colors for output image - Colors='Greyscale' + Colors = "Greyscale" # Physics - Physics='Einstein' + Physics = "Einstein" # No output as image NoImage = False # Threads in CUDA Threads = 32 # Trackpoints of trajectories - TrackPoints=2048 + TrackPoints = 2048 # Tracksave of trajectories - TrackSave=False - - HowToUse='%s -h [Help] -b [BlackBodyEmission] -j [TrackSave] -n [NoImage] -p <Einstein/Newton> -s <SizeInPixels> -m <Mass> -i <DiscInternalRadius> -x <DiscExternalRadius> -a <AngleAboveDisc> -d <DeviceId> -c <Greyscale/Red2Yellow> -g <CUDA/OpenCL> -o <EachPixel/TrajectoCircle/TrajectoPixel/EachCircle/Original> -t <ThreadsInCuda> -v <FP32/FP64> -k <TrackPoints>' + TrackSave = False + + HowToUse = "%s -h [Help] -b [BlackBodyEmission] -j [TrackSave] -n [NoImage] -p <Einstein/Newton> -s <SizeInPixels> -m <Mass> -i <DiscInternalRadius> -x <DiscExternalRadius> -a <AngleAboveDisc> -d <DeviceId> -c <Greyscale/Red2Yellow> -g <CUDA/OpenCL> -o <EachPixel/TrajectoCircle/TrajectoPixel/EachCircle/Original> -t <ThreadsInCuda> -v <FP32/FP64> -k <TrackPoints>" # noqa: E501 try: - opts, args = getopt.getopt(sys.argv[1:],"hbnjs:m:i:x:a:d:g:v:o:t:c:p:k:",["tracksave","blackbody","noimage","camera","size=","mass=","internal=","external=","angle=","device=","gpustyle=","variabletype=","method=","threads=","colors=","physics=","trackpoints="]) + opts, args = getopt.getopt( + sys.argv[1:], + "hbnjs:m:i:x:a:d:g:v:o:t:c:p:k:", + [ + "tracksave", + "blackbody", + "noimage", + "camera", + "size=", + "mass=", + "internal=", + "external=", + "angle=", + "device=", + "gpustyle=", + "variabletype=", + "method=", + "threads=", + "colors=", + "physics=", + "trackpoints=", + ], + ) except getopt.GetoptError: print(HowToUse % sys.argv[0]) sys.exit(2) # List of Devices - Devices=[] - Alu={} - + Devices = [] + Alu = {} + for opt, arg in opts: - if opt == '-h': + if opt == "-h": print(HowToUse % sys.argv[0]) - + print("\nInformations about devices detected under OpenCL API:") # For PyOpenCL import try: - import pyopencl as cl - Id=0 + Id = 0 for platform in cl.get_platforms(): for device in platform.get_devices(): - #deviceType=cl.device_type.to_string(device.type) - deviceType="xPU" - print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())) - Id=Id+1 - - except: + # deviceType=cl.device_type.to_string(device.type) + deviceType = "xPU" + print( + "Device #%i from %s of type %s : %s" + % ( + Id, + platform.vendor.lstrip(), + deviceType, + device.name.lstrip(), + ) + ) + Id = Id + 1 + + except Exception: print("Your platform does not seem to support OpenCL") - + print("\nInformations about devices detected under CUDA API:") - # For PyCUDA import + # For PyCUDA import try: import pycuda.driver as cuda + cuda.init() for Id in range(cuda.Device.count()): - device=cuda.Device(Id) - print("Device #%i of type GPU : %s" % (Id,device.name())) + device = cuda.Device(Id) + print("Device #%i of type GPU : %s" % (Id, device.name())) print - except: + except Exception: print("Your platform does not seem to support CUDA") - + sys.exit() - + elif opt in ("-d", "--device"): -# Devices.append(int(arg)) - Device=int(arg) + # Devices.append(int(arg)) + Device = int(arg) elif opt in ("-g", "--gpustyle"): GpuStyle = arg elif opt in ("-v", "--variabletype"): @@ -2093,7 +2116,7 @@ if __name__=='__main__': elif opt in ("-e", "--external"): ExternalRadius = float(arg) elif opt in ("-a", "--angle"): - Angle = numpy.pi/180.*(90.-float(arg)) + Angle = numpy.pi / 180.0 * (90.0 - float(arg)) elif opt in ("-b", "--blackbody"): BlackBody = True elif opt in ("-j", "--tracksave"): @@ -2124,72 +2147,79 @@ if __name__=='__main__': print("Trackpoints of Trajectories : %i" % TrackPoints) print("Tracksave of Trajectories : %i" % TrackSave) - if GpuStyle=='CUDA': - print("\nSelection of CUDA device") + if GpuStyle == "CUDA": + print("\nSelection of CUDA device") try: # For PyCUDA import import pycuda.driver as cuda - + cuda.init() for Id in range(cuda.Device.count()): - device=cuda.Device(Id) - print("Device #%i of type GPU : %s" % (Id,device.name())) + device = cuda.Device(Id) + print("Device #%i of type GPU : %s" % (Id, device.name())) if Id in Devices: - Alu[Id]='GPU' - + Alu[Id] = "GPU" + except ImportError: print("Platform does not seem to support CUDA") - if GpuStyle=='OpenCL': - print("\nSelection of OpenCL device") + if GpuStyle == "OpenCL": + print("\nSelection of OpenCL device") try: # For PyOpenCL import import pyopencl as cl - Id=0 + + Id = 0 for platform in cl.get_platforms(): for device in platform.get_devices(): - #deviceType=cl.device_type.to_string(device.type) - deviceType="xPU" - print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip())) + # deviceType=cl.device_type.to_string(device.type) + deviceType = "xPU" + print( + "Device #%i from %s of type %s : %s" + % ( + Id, + platform.vendor.lstrip().rstrip(), + deviceType, + device.name.lstrip().rstrip(), + ) + ) if Id in Devices: - # Set the Alu as detected Device Type - Alu[Id]=deviceType - Id=Id+1 + # Set the Alu as detected Device Type + Alu[Id] = deviceType + Id = Id + 1 except ImportError: print("Platform does not seem to support OpenCL") - - zImage=numpy.zeros((Size,Size),dtype=numpy.float32) - fImage=numpy.zeros((Size,Size),dtype=numpy.float32) - - InputCL={} - InputCL['Device']=Device - InputCL['GpuStyle']=GpuStyle - InputCL['VariableType']=VariableType - InputCL['Size']=Size - InputCL['Mass']=Mass - InputCL['InternalRadius']=InternalRadius - InputCL['ExternalRadius']=ExternalRadius - InputCL['Angle']=Angle - InputCL['BlackBody']=BlackBody - InputCL['Method']=Method - InputCL['TrackPoints']=TrackPoints - InputCL['Physics']=Physics - InputCL['Threads']=Threads - InputCL['NoImage']=NoImage - InputCL['TrackSave']=TrackSave - - if GpuStyle=='OpenCL': - duration=BlackHoleCL(zImage,fImage,InputCL) + zImage = numpy.zeros((Size, Size), dtype=numpy.float32) + fImage = numpy.zeros((Size, Size), dtype=numpy.float32) + + InputCL = {} + InputCL["Device"] = Device + InputCL["GpuStyle"] = GpuStyle + InputCL["VariableType"] = VariableType + InputCL["Size"] = Size + InputCL["Mass"] = Mass + InputCL["InternalRadius"] = InternalRadius + InputCL["ExternalRadius"] = ExternalRadius + InputCL["Angle"] = Angle + InputCL["BlackBody"] = BlackBody + InputCL["Method"] = Method + InputCL["TrackPoints"] = TrackPoints + InputCL["Physics"] = Physics + InputCL["Threads"] = Threads + InputCL["NoImage"] = NoImage + InputCL["TrackSave"] = TrackSave + + if GpuStyle == "OpenCL": + duration = BlackHoleCL(zImage, fImage, InputCL) else: - duration=BlackHoleCUDA(zImage,fImage,InputCL) - - Hostname=gethostname() - Date=time.strftime("%Y%m%d_%H%M%S") - ImageInfo="%s_Device%i_%s_%s" % (Method,Device,Hostname,Date) - - + duration = BlackHoleCUDA(zImage, fImage, InputCL) + + Hostname = gethostname() + Date = time.strftime("%Y%m%d_%H%M%S") + ImageInfo = "%s_Device%i_%s_%s" % (Method, Device, Hostname, Date) + if not NoImage: - ImageOutput(zImage,"TrouNoirZ_%s" % ImageInfo,Colors) - ImageOutput(fImage,"TrouNoirF_%s" % ImageInfo,Colors) + ImageOutput(zImage, "TrouNoirZ_%s" % ImageInfo, Colors) + ImageOutput(fImage, "TrouNoirF_%s" % ImageInfo, Colors) diff --git a/examples/demo-struct-reduce.py b/examples/demo-struct-reduce.py index c0c26e34743687a281c534c05d9c8cb74c6587ec..c6369a6a3774d121cdbb6990c03e9d8e0aa1293b 100644 --- a/examples/demo-struct-reduce.py +++ b/examples/demo-struct-reduce.py @@ -1,6 +1,7 @@ import numpy as np import pyopencl as cl + def make_collector_dtype(device): dtype = np.dtype([ ("cur_min", np.int32), @@ -16,6 +17,7 @@ def make_collector_dtype(device): return dtype, c_decl + ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) diff --git a/examples/demo_elementwise.py b/examples/demo_elementwise.py index 3521089210a6dcde9d957443cc909049f39b61c8..ec1a8d5d89eefb808b8f6f8cf757234f0bc42a7d 100644 --- a/examples/demo_elementwise.py +++ b/examples/demo_elementwise.py @@ -18,8 +18,7 @@ b_g = cl.array.to_device(queue, b_np) lin_comb = ElementwiseKernel(ctx, "float k1, float *a_g, float k2, float *b_g, float *res_g", "res_g[i] = k1 * a_g[i] + k2 * b_g[i]", - "lin_comb" -) + "lin_comb") res_g = cl.array.empty_like(a_g) lin_comb(2, a_g, 3, b_g, res_g) diff --git a/examples/demo_elementwise_complex.py b/examples/demo_elementwise_complex.py index 4fe98ec9d0f0d514c84180e2775d84c7f808b152..5f0bfc297047e5aa1fde06d5331bc00330af9e51 100644 --- a/examples/demo_elementwise_complex.py +++ b/examples/demo_elementwise_complex.py @@ -8,11 +8,9 @@ queue = cl.CommandQueue(ctx) n = 10 a_gpu = cl_array.to_device(queue, - ( numpy.random.randn(n) + 1j*numpy.random.randn(n) - ).astype(numpy.complex64)) + (numpy.random.randn(n) + 1j*numpy.random.randn(n)).astype(numpy.complex64)) b_gpu = cl_array.to_device(queue, - ( numpy.random.randn(n) + 1j*numpy.random.randn(n) - ).astype(numpy.complex64)) + (numpy.random.randn(n) + 1j*numpy.random.randn(n)).astype(numpy.complex64)) from pyopencl.elementwise import ElementwiseKernel complex_prod = ElementwiseKernel(ctx, @@ -24,7 +22,7 @@ complex_prod = ElementwiseKernel(ctx, "complex_prod", preamble=""" #define complex_ctr(x, y) (float2)(x, y) - #define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)) + #define complex_mul(a, b) complex_ctr(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)) # noqa: E501 #define complex_div_scalar(a, b) complex_ctr((a).x / (b), (a).y / (b)) #define conj(a) complex_ctr((a).x, -(a).y) #define conj_transp(a) complex_ctr(-(a).y, (a).x) diff --git a/examples/demo_meta_codepy.py b/examples/demo_meta_codepy.py index 2ba293c5dfc3783f449b8bc6e0b060a90a4d0c3e..d27aed437128c818a2049a0feb1a930544b8a557 100644 --- a/examples/demo_meta_codepy.py +++ b/examples/demo_meta_codepy.py @@ -20,7 +20,7 @@ b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b) c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes) from cgen import FunctionBody, \ - FunctionDeclaration, Typedef, POD, Value, \ + FunctionDeclaration, POD, Value, \ Pointer, Module, Block, Initializer, Assign, Const from cgen.opencl import CLKernel, CLGlobal, \ CLRequiredWorkGroupSize @@ -53,4 +53,3 @@ c = numpy.empty_like(a) cl.enqueue_copy(queue, c, c_buf).wait() assert la.norm(c-(a+b)) == 0 - diff --git a/examples/dump-performance.py b/examples/dump-performance.py index f582cd99fcae98df7325717b4e1541dbf873bbcb..8e5eec501e47dc6a0fa144b7db133690cb7b23b9 100644 --- a/examples/dump-performance.py +++ b/examples/dump-performance.py @@ -27,10 +27,12 @@ def main(): for i in range(6, 31, 2): bs = 1 << i try: - result = "%g GB/s" % (perf.transfer_bandwidth(queue, tx_type, bs)/1e9) + result = "%g GB/s" % ( + perf.transfer_bandwidth(queue, tx_type, bs)/1e9) except Exception as e: result = "exception: %s" % e.__class__.__name__ print("bandwidth @ %d bytes: %s" % (bs, result)) + if __name__ == "__main__": main() diff --git a/examples/dump-properties.py b/examples/dump-properties.py index 07d9159827c315605286d46a4f7de494b7d7489e..3ca409b6a780bb17ecf155972fce37708a5f8795 100644 --- a/examples/dump-properties.py +++ b/examples/dump-properties.py @@ -14,7 +14,7 @@ def print_info(obj, info_cls): info = getattr(info_cls, info_name) try: info_value = obj.get_info(info) - except: + except Exception: info_value = "<error>" if (info_cls == cl.device_info and info_name == "PARTITION_TYPES_EXT" @@ -26,9 +26,10 @@ def print_info(obj, info_cls): else: try: print(f"{info_name}: {info_value}") - except: + except Exception: print("%s: <error>" % info_name) + for platform in cl.get_platforms(): print(75*"=") print(platform) @@ -55,7 +56,7 @@ for platform in cl.get_platforms(): ]: try: formats = cl.get_supported_image_formats(ctx, mf, itype) - except: + except Exception: formats = "<error>" else: def str_chd_type(chdtype): diff --git a/examples/median-filter.py b/examples/median-filter.py index 7f787500ccf82a77b5961413f86e16dbf3cfe8a9..fccb9f583ee95df3e7d6e82f27bcfdd5b4b6d8fa 100644 --- a/examples/median-filter.py +++ b/examples/median-filter.py @@ -2,8 +2,8 @@ import pyopencl as cl import numpy as np from imageio import imread, imsave -#Read in image -img = imread('noisyImage.jpg').astype(np.float32) +# Read in image +img = imread("noisyImage.jpg").astype(np.float32) print(img.shape) img = np.mean(img, axis=2) print(img.shape) @@ -14,7 +14,7 @@ queue = cl.CommandQueue(ctx) mf = cl.mem_flags # Kernel function -src = ''' +src = """ void sort(int *a, int *b, int *c) { int swap; if(*a > *b) { @@ -33,7 +33,9 @@ void sort(int *a, int *b, int *c) { *c = swap; } } -__kernel void medianFilter(__global float *img, __global float *result, __global int *width, __global int *height) +__kernel void medianFilter( + __global float *img, __global float *result, __global int *width, __global + int *height) { int w = *width; int h = *height; @@ -47,7 +49,8 @@ __kernel void medianFilter(__global float *img, __global float *result, __global } else { - int pixel00, pixel01, pixel02, pixel10, pixel11, pixel12, pixel20, pixel21, pixel22; + int pixel00, pixel01, pixel02, pixel10, pixel11, pixel12, pixel20, + pixel21, pixel22; pixel00 = img[i - 1 - w]; pixel01 = img[i- w]; pixel02 = img[i + 1 - w]; @@ -71,19 +74,23 @@ __kernel void medianFilter(__global float *img, __global float *result, __global result[i] = pixel11; } } -''' +""" -#Kernel function instantiation +# Kernel function instantiation prg = cl.Program(ctx, src).build() -#Allocate memory for variables on the device -img_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=img) +# Allocate memory for variables on the device +img_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=img) result_g = cl.Buffer(ctx, mf.WRITE_ONLY, img.nbytes) -width_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[1])) -height_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[0])) +width_g = cl.Buffer( + ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[1]) +) +height_g = cl.Buffer( + ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[0]) +) # Call Kernel. Automatically takes care of block/grid distribution -prg.medianFilter(queue, img.shape, None , img_g, result_g, width_g, height_g) +prg.medianFilter(queue, img.shape, None, img_g, result_g, width_g, height_g) result = np.empty_like(img) cl.enqueue_copy(queue, result, result_g) # Show the blurred image -imsave('medianFilter-OpenCL.jpg', result) +imsave("medianFilter-OpenCL.jpg", result) diff --git a/examples/n-body.py b/examples/n-body.py old mode 100755 new mode 100644 index 4c207b76b23401d439ab356f862a029c1a84392f..ec8a3369c693ce4b427a78ce00a2c4db852a42de --- a/examples/n-body.py +++ b/examples/n-body.py @@ -7,30 +7,31 @@ By default, rendering in OpenGL is disabled. Add -g option to activate. Part of matrix programs from: https://forge.cbp.ens-lyon.fr/svn/bench4gpu/ -CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> +CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> Thanks to Andreas Klockner for PyOpenCL: http://mathema.tician.de/software/pyopencl - + """ import getopt import sys import time import numpy as np import pyopencl as cl -import pyopencl.array as cl_array +import pyopencl.array from numpy.random import randint as nprnd -import string, sys + def DictionariesAPI(): - Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3} - Computing={'FP32':0,'FP64':1} - Interaction={'Force':0,'Potential':1} - Artevasion={'None':0,'NegExp':1,'CorRad':2} - return(Marsaglia,Computing,Interaction,Artevasion) + Marsaglia = {"CONG": 0, "SHR3": 1, "MWC": 2, "KISS": 3} + Computing = {"FP32": 0, "FP64": 1} + Interaction = {"Force": 0, "Potential": 1} + Artevasion = {"None": 0, "NegExp": 1, "CorRad": 2} + return (Marsaglia, Computing, Interaction, Artevasion) -BlobOpenCL= """ + +BlobOpenCL = """ #define TFP32 0 #define TFP64 1 @@ -114,14 +115,14 @@ MYFLOAT PairPotential(MYFLOAT4 m,MYFLOAT4 n) MYFLOAT4 Interaction(MYFLOAT4 m,MYFLOAT4 n) #if INTERACTION == TFORCE #if ARTEVASION == NEGEXP -// Force gradient of potential, set as (1-exp(-r))/r +// Force gradient of potential, set as (1-exp(-r))/r { private MYFLOAT r=MyDistance(n,m); private MYFLOAT num=1.e0f+exp(-r)*(r-1.e0f); return((n-m)*num/(MYFLOAT)(r*r*r)); } #elif ARTEVASION == CORRAD -// Force gradient of potential, (Core Radius) set as 1/sqrt(r**2+CoreRadius**2) +// Force gradient of potential, (Core Radius) set as 1/sqrt(r**2+CoreRadius**2) { private MYFLOAT r=MyDistance(n,m); private MYFLOAT den=sqrt(r*r+CoreRadius*CoreRadius); @@ -152,8 +153,8 @@ MYFLOAT4 Interaction(MYFLOAT4 m,MYFLOAT4 n) MYFLOAT AtomicPotential(__global MYFLOAT4* clDataX,int gid) { private MYFLOAT potential=(MYFLOAT)0.e0f; - private MYFLOAT4 x=clDataX[gid]; - + private MYFLOAT4 x=clDataX[gid]; + for (int i=0;i<get_global_size(0);i++) { if (gid != i) @@ -164,7 +165,7 @@ MYFLOAT AtomicPotential(__global MYFLOAT4* clDataX,int gid) return(potential); } -MYFLOAT AtomicPotentialCoM(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,int gid) +MYFLOAT AtomicPotentialCoM(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,int gid) // # noqa: E501 { return(PairPotential(clDataX[gid],clCoM[0])); } @@ -179,14 +180,14 @@ MYFLOAT8 AtomicRungeKutta(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clData a0=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); v0=(MYFLOAT4)clDataInV[gid]; x0=(MYFLOAT4)clDataInX[gid]; - int N = get_global_size(0); - + int N = get_global_size(0); + for (private int i=0;i<N;i++) { if (gid != i) a0+=Interaction(x0,clDataInX[i]); } - + a1=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); v1=a0*dt+v0; x1=v0*dt+x0; @@ -204,7 +205,7 @@ MYFLOAT8 AtomicRungeKutta(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clData if (gid != k) a2+=Interaction(x2,clDataInX[k]); } - + a3=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); v3=a2*(MYFLOAT)(dt/2.e0f)+v0; x3=v2*(MYFLOAT)(dt/2.e0f)+x0; @@ -213,7 +214,7 @@ MYFLOAT8 AtomicRungeKutta(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clData if (gid != l) a3+=Interaction(x3,clDataInX[l]); } - + a4=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); v4=a3*dt+v0; x4=v3*dt+x0; @@ -222,10 +223,10 @@ MYFLOAT8 AtomicRungeKutta(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clData if (gid != m) a4+=Interaction(x4,clDataInX[m]); } - + xf=x0+dt*(v1+(MYFLOAT)2.e0f*(v2+v3)+v4)/(MYFLOAT)6.e0f; vf=v0+dt*(a1+(MYFLOAT)2.e0f*(a2+a3)+a4)/(MYFLOAT)6.e0f; - + return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f)); } @@ -275,7 +276,7 @@ MYFLOAT8 AtomicImplicitEuler(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clD if (gid != i) a+=Interaction(x0,clDataInX[i]); } - + vf=v0+dt*a; xf=x0+dt*vf; @@ -295,21 +296,21 @@ MYFLOAT8 AtomicExplicitEuler(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clD if (gid != i) a+=Interaction(x0,clDataInX[i]); } - + vf=v0+dt*a; xf=x0+dt*v0; - + return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f)); } -__kernel void InBallSplutterPoints(__global MYFLOAT4* clDataX, +__kernel void InBallSplutterPoints(__global MYFLOAT4* clDataX, MYFLOAT diameter,uint seed_z,uint seed_w) { private int gid=get_global_id(0); private uint zmwc=seed_z+gid; private uint wmwc=seed_w+(gid+1)%2; private MYFLOAT Heat; - + for (int i=0;i<gid;i++) { Heat=MWCfp; @@ -335,19 +336,19 @@ __kernel void InBallSplutterPoints(__global MYFLOAT4* clDataX, Length=(MYFLOAT)length((MYFLOAT4)Position); } - clDataX[gid]=Position; + clDataX[gid]=Position; barrier(CLK_GLOBAL_MEM_FENCE); } -__kernel void InBoxSplutterPoints(__global MYFLOAT4* clDataX, MYFLOAT box, +__kernel void InBoxSplutterPoints(__global MYFLOAT4* clDataX, MYFLOAT box, uint seed_z,uint seed_w) { int gid=get_global_id(0); uint zmwc=seed_z+gid; uint wmwc=seed_w-gid; private MYFLOAT Heat; - + for (int i=0;i<gid;i++) { Heat=MWCfp; @@ -417,7 +418,7 @@ __kernel void RungeKutta(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,M { private int gid = get_global_id(0); private MYFLOAT8 clDataGid; - + clDataGid=AtomicRungeKutta(clDataX,clDataV,gid,h); barrier(CLK_GLOBAL_MEM_FENCE); clDataX[gid]=clDataGid.s0123; @@ -428,7 +429,7 @@ __kernel void Heun(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT { private int gid = get_global_id(0); private MYFLOAT8 clDataGid; - + clDataGid=AtomicHeun(clDataX,clDataV,gid,h); barrier(CLK_GLOBAL_MEM_FENCE); clDataX[gid]=clDataGid.s0123; @@ -439,7 +440,7 @@ __kernel void ImplicitEuler(__global MYFLOAT4* clDataX,__global MYFLOAT4* clData { private int gid = get_global_id(0); private MYFLOAT8 clDataGid; - + clDataGid=AtomicImplicitEuler(clDataX,clDataV,gid,h); barrier(CLK_GLOBAL_MEM_FENCE); clDataX[gid]=clDataGid.s0123; @@ -449,7 +450,7 @@ __kernel void ImplicitEuler(__global MYFLOAT4* clDataX,__global MYFLOAT4* clData __kernel void ExplicitEuler(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h) { private int gid = get_global_id(0); - private MYFLOAT8 clDataGid; + private MYFLOAT8 clDataGid; clDataGid=AtomicExplicitEuler(clDataX,clDataV,gid,h); barrier(CLK_GLOBAL_MEM_FENCE); @@ -469,21 +470,21 @@ __kernel void Potential(__global MYFLOAT4* clDataX,__global MYFLOAT* clPotential int gid = get_global_id(0); MYFLOAT potential=(MYFLOAT)0.e0f; - MYFLOAT4 x=clDataX[gid]; - + MYFLOAT4 x=clDataX[gid]; + for (int i=0;i<get_global_size(0);i++) { if (gid != i) potential+=PairPotential(x,clDataX[i]); } - + barrier(CLK_GLOBAL_MEM_FENCE); clPotential[gid]=potential*(MYFLOAT)5.e-1f; } __kernel void CenterOfMass(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,int Size) { - MYFLOAT4 CoM=clDataX[0]; + MYFLOAT4 CoM=clDataX[0]; for (int i=1;i<Size;i++) { @@ -497,7 +498,7 @@ __kernel void CenterOfMass(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,i __kernel void Kinetic(__global MYFLOAT4* clDataV,__global MYFLOAT* clKinetic) { int gid = get_global_id(0); - + barrier(CLK_GLOBAL_MEM_FENCE); MYFLOAT d=(MYFLOAT)length(clDataV[gid]); clKinetic[gid]=(MYFLOAT)5.e-1f*(MYFLOAT)(d*d); @@ -505,63 +506,73 @@ __kernel void Kinetic(__global MYFLOAT4* clDataV,__global MYFLOAT* clKinetic) """ -def MainOpenCL(clDataX,clDataV,Step,Method): - time_start=time.time() - if Method=="RungeKutta": - CLLaunch=MyRoutines.RungeKutta(queue,(Number,1),None,clDataX,clDataV,Step) - elif Method=="ExplicitEuler": - CLLaunch=MyRoutines.ExplicitEuler(queue,(Number,1),None,clDataX,clDataV,Step) - elif Method=="Heun": - CLLaunch=MyRoutines.Heun(queue,(Number,1),None,clDataX,clDataV,Step) + +def MainOpenCL(clDataX, clDataV, Step, Method): + time_start = time.time() + if Method == "RungeKutta": + CLLaunch = MyRoutines.RungeKutta( + queue, (Number, 1), None, clDataX, clDataV, Step + ) + elif Method == "ExplicitEuler": + CLLaunch = MyRoutines.ExplicitEuler( + queue, (Number, 1), None, clDataX, clDataV, Step + ) + elif Method == "Heun": + CLLaunch = MyRoutines.Heun(queue, (Number, 1), None, clDataX, clDataV, Step) else: - CLLaunch=MyRoutines.ImplicitEuler(queue,(Number,1),None,clDataX,clDataV,Step) + CLLaunch = MyRoutines.ImplicitEuler( + queue, (Number, 1), None, clDataX, clDataV, Step + ) CLLaunch.wait() - Elapsed=time.time()-time_start - return(Elapsed) - + Elapsed = time.time() - time_start + return Elapsed + + def display(*args): + global MyDataX, MyDataV, clDataX, clDataV, Step, Method, Number, Iterations, \ + Durations, Verbose, SpeedRendering + + gl.glClearColor(0.0, 0.0, 0.0, 0.0) + gl.glClear(gl.GL_COLOR_BUFFER_BIT) + gl.glColor3f(1.0, 1.0, 1.0) - global MyDataX,MyDataV,clDataX,clDataV,Step,Method,Number,Iterations,Durations,Verbose,SpeedRendering - - glClearColor(0.0, 0.0, 0.0, 0.0) - glClear(GL_COLOR_BUFFER_BIT) - glColor3f(1.0,1.0,1.0) - - Elapsed=MainOpenCL(clDataX,clDataV,Step,Method) + MainOpenCL(clDataX, clDataV, Step, Method) if SpeedRendering: cl.enqueue_copy(queue, MyDataV, clDataV) - MyDataV.reshape(Number,4)[:,3]=1 - glVertexPointerf(MyDataV.reshape(Number,4)) + MyDataV.reshape(Number, 4)[:, 3] = 1 + gl.glVertexPointerf(MyDataV.reshape(Number, 4)) else: cl.enqueue_copy(queue, MyDataX, clDataX) - MyDataX.reshape(Number,4)[:,3]=1 - glVertexPointerf(MyDataX.reshape(Number,4)) + MyDataX.reshape(Number, 4)[:, 3] = 1 + gl.glVertexPointerf(MyDataX.reshape(Number, 4)) if Verbose: - print("Positions for #%s iteration: %s" % (Iterations,MyDataX)) + print("Positions for #%s iteration: %s" % (Iterations, MyDataX)) else: - sys.stdout.write('.') + sys.stdout.write(".") sys.stdout.flush() - Durations=np.append(Durations,MainOpenCL(clDataX,clDataV,Step,Method)) - glEnableClientState(GL_VERTEX_ARRAY) - glDrawArrays(GL_POINTS, 0, Number) - glDisableClientState(GL_VERTEX_ARRAY) - glFlush() - Iterations+=1 - glutSwapBuffers() + Durations = np.append(Durations, MainOpenCL(clDataX, clDataV, Step, Method)) + gl.glEnableClientState(gl.GL_VERTEX_ARRAY) + gl.glDrawArrays(gl.GL_POINTS, 0, Number) + gl.glDisableClientState(gl.GL_VERTEX_ARRAY) + gl.glFlush() + Iterations += 1 + glut.glutSwapBuffers() + def halt(): pass -def keyboard(k,x,y): - global ViewRZ,SpeedRendering - LC_Z = as_8_bit( 'z' ) - UC_Z = as_8_bit( 'Z' ) - Plus = as_8_bit( '+' ) - Minus = as_8_bit( '-' ) - Switch = as_8_bit( 's' ) - Zoom=1 +def keyboard(k, x, y): + global ViewRZ, SpeedRendering + LC_Z = glut.as_8_bit("z") + UC_Z = glut.as_8_bit("Z") + Plus = glut.as_8_bit("+") + Minus = glut.as_8_bit("-") + Switch = glut.as_8_bit("s") + + Zoom = 1 if k == LC_Z: ViewRZ += 1.0 elif k == UC_Z: @@ -572,161 +583,203 @@ def keyboard(k,x,y): Zoom /= 2.0 elif k == Switch: if SpeedRendering: - SpeedRendering=False + SpeedRendering = False else: - SpeedRendering=True - elif ord(k) == 27: # Escape - glutLeaveMainLoop() - return(False) + SpeedRendering = True + elif ord(k) == 27: # Escape + glut.glutLeaveMainLoop() + return False else: return - glRotatef(ViewRZ, 0.0, 0.0, 1.0) - glScalef(Zoom,Zoom,Zoom) - glutPostRedisplay() + gl.glRotatef(ViewRZ, 0.0, 0.0, 1.0) + gl.glScalef(Zoom, Zoom, Zoom) + glut.glutPostRedisplay() + -def special(k,x,y): +def special(k, x, y): global ViewRX, ViewRY - Step=1. - if k == GLUT_KEY_UP: + Step = 1.0 + if k == glut.GLUT_KEY_UP: ViewRX += Step - elif k == GLUT_KEY_DOWN: + elif k == glut.GLUT_KEY_DOWN: ViewRX -= Step - elif k == GLUT_KEY_LEFT: + elif k == glut.GLUT_KEY_LEFT: ViewRY += Step - elif k == GLUT_KEY_RIGHT: + elif k == glut.GLUT_KEY_RIGHT: ViewRY -= Step else: return - glRotatef(ViewRX, 1.0, 0.0, 0.0) - glRotatef(ViewRY, 0.0, 1.0, 0.0) - glutPostRedisplay() + gl.glRotatef(ViewRX, 1.0, 0.0, 0.0) + gl.glRotatef(ViewRY, 0.0, 1.0, 0.0) + glut.glutPostRedisplay() + def setup_viewport(): global SizeOfBox - glMatrixMode(GL_PROJECTION) - glLoadIdentity() - glOrtho(-SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox) - glutPostRedisplay() - + gl.glMatrixMode(gl.GL_PROJECTION) + gl.glLoadIdentity() + gl.glOrtho(-SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox) + glut.glutPostRedisplay() + + def reshape(w, h): - glViewport(0, 0, w, h) + gl.glViewport(0, 0, w, h) setup_viewport() -if __name__=='__main__': - global Number,Step,clDataX,clDataV,MyDataX,MyDataV,Method,SizeOfBox,Iterations,Verbose,Durations - +if __name__ == "__main__": + + global Number, Step, clDataX, clDataV, MyDataX, MyDataV, Method, SizeOfBox, \ + Iterations, Verbose, Durations + # ValueType - ValueType='FP32' - class MyFloat(np.float32):pass + ValueType = "FP32" + + class MyFloat(np.float32): + pass + # clType8=cl_array.vec.float8 # Set defaults values - np.set_printoptions(precision=2) + np.set_printoptions(precision=2) # Id of Device : 1 is for first find ! - Device=0 + Device = 0 # Number of bodies is integer - Number=2 + Number = 2 # Number of iterations (for standalone execution) - Iterations=10 + Iterations = 10 # Size of shape - SizeOfShape=MyFloat(1.) + SizeOfShape = MyFloat(1.0) # Initial velocity of particules - Velocity=MyFloat(1.) + Velocity = MyFloat(1.0) # Step - Step=MyFloat(1./32) + Step = MyFloat(1.0 / 32) # Method of integration - Method='ImplicitEuler' + Method = "ImplicitEuler" # InitialRandom - InitialRandom=False + InitialRandom = False # RNG Marsaglia Method - RNG='MWC' + RNG = "MWC" # Viriel Distribution of stress - VirielStress=True + VirielStress = True # Verbose - Verbose=False + Verbose = False # OpenGL real time rendering - OpenGL=False + OpenGL = False # Speed rendering - SpeedRendering=False + SpeedRendering = False # Counter ArtEvasions Measures (artefact evasion) - CoArEv='None' + CoArEv = "None" # Shape to distribute - Shape='Ball' + Shape = "Ball" # Type of Interaction - InterType='Force' - - HowToUse='%s -h [Help] -r [InitialRandom] -g [OpenGL] -e [VirielStress] -o [Verbose] -p [Potential] -x <None|NegExp|CorRad> -d <DeviceId> -n <NumberOfParticules> -i <Iterations> -z <SizeOfBoxOrBall> -v <Velocity> -s <Step> -b <Ball|Box> -m <ImplicitEuler|RungeKutta|ExplicitEuler|Heun> -t <FP32|FP64>' + InterType = "Force" + + HowToUse = "%s -h [Help] -r [InitialRandom] -g [OpenGL] -e [VirielStress] -o [Verbose] -p [Potential] -x <None|NegExp|CorRad> -d <DeviceId> -n <NumberOfParticules> -i <Iterations> -z <SizeOfBoxOrBall> -v <Velocity> -s <Step> -b <Ball|Box> -m <ImplicitEuler|RungeKutta|ExplicitEuler|Heun> -t <FP32|FP64>" # noqa: E501 try: - opts, args = getopt.getopt(sys.argv[1:],"rpgehod:n:i:z:v:s:m:t:b:x:",["random","potential","coarev","opengl","viriel","verbose","device=","number=","iterations=","size=","velocity=","step=","method=","valuetype=","shape="]) + opts, args = getopt.getopt( + sys.argv[1:], + "rpgehod:n:i:z:v:s:m:t:b:x:", + [ + "random", + "potential", + "coarev", + "opengl", + "viriel", + "verbose", + "device=", + "number=", + "iterations=", + "size=", + "velocity=", + "step=", + "method=", + "valuetype=", + "shape=", + ], + ) except getopt.GetoptError: print(HowToUse % sys.argv[0]) sys.exit(2) for opt, arg in opts: - if opt == '-h': + if opt == "-h": print(HowToUse % sys.argv[0]) print("\nInformations about devices detected under OpenCL:") try: - Id=0 + Id = 0 for platform in cl.get_platforms(): for device in platform.get_devices(): # Failed now because of POCL implementation - #deviceType=cl.device_type.to_string(device.type) - deviceType="xPU" - print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())) - Id=Id+1 + # deviceType=cl.device_type.to_string(device.type) + deviceType = "xPU" + print( + "Device #%i from %s of type %s : %s" + % ( + Id, + platform.vendor.lstrip(), + deviceType, + device.name.lstrip(), + ) + ) + Id = Id + 1 sys.exit() except ImportError: print("Your platform does not seem to support OpenCL") sys.exit() elif opt in ("-t", "--valuetype"): - if arg=='FP64': - class MyFloat(np.float64): pass + if arg == "FP64": + + class MyFloat(np.float64): + pass + else: - class MyFloat(np.float32):pass + + class MyFloat(np.float32): + pass + ValueType = arg elif opt in ("-d", "--device"): - Device=int(arg) + Device = int(arg) elif opt in ("-m", "--method"): - Method=arg + Method = arg elif opt in ("-b", "--shape"): - Shape=arg - if Shape!='Ball' or Shape!='Box': - print('Wrong argument: set to Ball') + Shape = arg + if Shape != "Ball" or Shape != "Box": + print("Wrong argument: set to Ball") elif opt in ("-n", "--number"): - Number=int(arg) + Number = int(arg) elif opt in ("-i", "--iterations"): - Iterations=int(arg) + Iterations = int(arg) elif opt in ("-z", "--size"): - SizeOfShape=MyFloat(arg) + SizeOfShape = MyFloat(arg) elif opt in ("-v", "--velocity"): - Velocity=MyFloat(arg) - VirielStress=False + Velocity = MyFloat(arg) + VirielStress = False elif opt in ("-s", "--step"): - Step=MyFloat(arg) + Step = MyFloat(arg) elif opt in ("-r", "--random"): - InitialRandom=True + InitialRandom = True elif opt in ("-c", "--check"): - CheckEnergies=True + CheckEnergies = True elif opt in ("-e", "--viriel"): - VirielStress=True + VirielStress = True elif opt in ("-g", "--opengl"): - OpenGL=True + OpenGL = True elif opt in ("-p", "--potential"): - InterType='Potential' + InterType = "Potential" elif opt in ("-x", "--coarev"): - CoArEv=arg + CoArEv = arg elif opt in ("-o", "--verbose"): - Verbose=True + Verbose = True + + SizeOfShape = np.sqrt(MyFloat(SizeOfShape * Number)) + Velocity = MyFloat(Velocity) + Step = MyFloat(Step) - SizeOfShape=np.sqrt(MyFloat(SizeOfShape*Number)) - Velocity=MyFloat(Velocity) - Step=MyFloat(Step) - print("Device choosed : %s" % Device) print("Number of particules : %s" % Number) print("Size of Shape : %s" % SizeOfShape) @@ -742,49 +795,63 @@ if __name__=='__main__': print("Interaction type : %s" % InterType) print("Counter Artevasion type : %s" % CoArEv) - # Create Numpy array of CL vector with 8 FP32 - MyCoM = np.zeros(4,dtype=MyFloat) - MyDataX = np.zeros(Number*4, dtype=MyFloat) - MyDataV = np.zeros(Number*4, dtype=MyFloat) + # Create Numpy array of CL vector with 8 FP32 + MyCoM = np.zeros(4, dtype=MyFloat) + MyDataX = np.zeros(Number * 4, dtype=MyFloat) + MyDataV = np.zeros(Number * 4, dtype=MyFloat) MyPotential = np.zeros(Number, dtype=MyFloat) MyKinetic = np.zeros(Number, dtype=MyFloat) - Marsaglia,Computing,Interaction,Artevasion=DictionariesAPI() + Marsaglia, Computing, Interaction, Artevasion = DictionariesAPI() # Scan the OpenCL arrays - Id=0 - HasXPU=False + Id = 0 + HasXPU = False for platform in cl.get_platforms(): for device in platform.get_devices(): - if Id==Device: - PlatForm=platform - XPU=device - print("CPU/GPU selected: ",device.name.lstrip()) - print("Platform selected: ",platform.name) - HasXPU=True - Id+=1 - - if HasXPU==False: - print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1)) - sys.exit() + if Id == Device: + PlatForm = platform + XPU = device + print("CPU/GPU selected: ", device.name.lstrip()) + print("Platform selected: ", platform.name) + HasXPU = True + Id += 1 + + if not HasXPU: + print("No XPU #%i found in all of %i devices, sorry..." % (Device, Id - 1)) + sys.exit() # Create Context try: ctx = cl.Context([XPU]) - queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE) - except: + queue = cl.CommandQueue( + ctx, properties=cl.command_queue_properties.PROFILING_ENABLE + ) + except Exception: print("Crash during context creation") # Build all routines used for the computing - #BuildOptions="-cl-mad-enable -cl-kernel-arg-info -cl-fast-relaxed-math -cl-std=CL1.2 -DTRNG=%i -DTYPE=%i" % (Marsaglia[RNG],Computing[ValueType]) - BuildOptions="-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%i -DINTERACTION=%i -DARTEVASION=%i" % (Marsaglia[RNG],Computing[ValueType],Interaction[InterType],Artevasion[CoArEv]) - - if 'Intel' in PlatForm.name or 'Experimental' in PlatForm.name or 'Clover' in PlatForm.name or 'Portable' in PlatForm.name : - MyRoutines = cl.Program(ctx, BlobOpenCL).build(options = BuildOptions) + # BuildOptions="-cl-mad-enable -cl-kernel-arg-info -cl-fast-relaxed-math -cl-std=CL1.2 -DTRNG=%i -DTYPE=%i" % (Marsaglia[RNG],Computing[ValueType]) # noqa: E501 + BuildOptions = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%i -DINTERACTION=%i -DARTEVASION=%i" % ( # noqa: E501 + Marsaglia[RNG], + Computing[ValueType], + Interaction[InterType], + Artevasion[CoArEv], + ) + + if ( + "Intel" in PlatForm.name + or "Experimental" in PlatForm.name + or "Clover" in PlatForm.name + or "Portable" in PlatForm.name + ): + MyRoutines = cl.Program(ctx, BlobOpenCL).build(options=BuildOptions) else: - MyRoutines = cl.Program(ctx, BlobOpenCL).build(options = BuildOptions+" -cl-strict-aliasing") - + MyRoutines = cl.Program(ctx, BlobOpenCL).build( + options=BuildOptions + " -cl-strict-aliasing" + ) + mf = cl.mem_flags # Read/Write approach for buffering clDataX = cl.Buffer(ctx, mf.READ_WRITE, MyDataX.nbytes) @@ -792,130 +859,210 @@ if __name__=='__main__': clPotential = cl.Buffer(ctx, mf.READ_WRITE, MyPotential.nbytes) clKinetic = cl.Buffer(ctx, mf.READ_WRITE, MyKinetic.nbytes) clCoM = cl.Buffer(ctx, mf.READ_WRITE, MyCoM.nbytes) - + # Write/HostPointer approach for buffering # clDataX = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataX) # clDataV = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataV) - # clPotential = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyPotential) + # clPotential = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyPotential) # noqa: E501 # clKinetic = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyKinetic) # clCoM = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyCoM) - print('All particles superimposed.') + print("All particles superimposed.") # Set particles to RNG points if InitialRandom: - seed_w=np.uint32(nprnd(2**32)) - seed_z=np.uint32(nprnd(2**32)) + seed_w = np.uint32(nprnd(2 ** 32)) + seed_z = np.uint32(nprnd(2 ** 32)) else: - seed_w=np.uint32(19710211) - seed_z=np.uint32(20081010) - - if Shape=='Ball': - MyRoutines.InBallSplutterPoints(queue,(Number,1),None,clDataX,SizeOfShape,seed_w,seed_z) + seed_w = np.uint32(19710211) + seed_z = np.uint32(20081010) + + if Shape == "Ball": + MyRoutines.InBallSplutterPoints( + queue, (Number, 1), None, clDataX, SizeOfShape, seed_w, seed_z + ) else: - MyRoutines.InBoxSplutterPoints(queue,(Number,1),None,clDataX,SizeOfShape,seed_w,seed_z) + MyRoutines.InBoxSplutterPoints( + queue, (Number, 1), None, clDataX, SizeOfShape, seed_w, seed_z + ) - print('All particules distributed') + print("All particules distributed") - CLLaunch=MyRoutines.CenterOfMass(queue,(1,1),None,clDataX,clCoM,np.int32(Number)) + CLLaunch = MyRoutines.CenterOfMass( + queue, (1, 1), None, clDataX, clCoM, np.int32(Number) + ) CLLaunch.wait() - cl.enqueue_copy(queue,MyCoM,clCoM) - print('Center Of Mass estimated: (%s,%s,%s)' % (MyCoM[0],MyCoM[1],MyCoM[2])) - + cl.enqueue_copy(queue, MyCoM, clCoM) + print("Center Of Mass estimated: (%s,%s,%s)" % (MyCoM[0], MyCoM[1], MyCoM[2])) + if VirielStress: - CLLaunch=MyRoutines.SplutterStress(queue,(Number,1),None,clDataX,clDataV,clCoM,MyFloat(0.),np.uint32(110271),np.uint32(250173)) + CLLaunch = MyRoutines.SplutterStress( + queue, + (Number, 1), + None, + clDataX, + clDataV, + clCoM, + MyFloat(0.0), + np.uint32(110271), + np.uint32(250173), + ) else: - CLLaunch=MyRoutines.SplutterStress(queue,(Number,1),None,clDataX,clDataV,clCoM,Velocity,np.uint32(110271),np.uint32(250173)) + CLLaunch = MyRoutines.SplutterStress( + queue, + (Number, 1), + None, + clDataX, + clDataV, + clCoM, + Velocity, + np.uint32(110271), + np.uint32(250173), + ) CLLaunch.wait() - print('All particules stressed') + print("All particules stressed") - CLLaunch=MyRoutines.Potential(queue,(Number,1),None,clDataX,clPotential) - CLLaunch=MyRoutines.Kinetic(queue,(Number,1),None,clDataV,clKinetic) + CLLaunch = MyRoutines.Potential(queue, (Number, 1), None, clDataX, clPotential) + CLLaunch = MyRoutines.Kinetic(queue, (Number, 1), None, clDataV, clKinetic) CLLaunch.wait() - cl.enqueue_copy(queue,MyPotential,clPotential) - cl.enqueue_copy(queue,MyKinetic,clKinetic) - print('Energy estimated: Viriel=%s Potential=%s Kinetic=%s\n'% (np.sum(MyPotential)+2*np.sum(MyKinetic),np.sum(MyPotential),np.sum(MyKinetic))) + cl.enqueue_copy(queue, MyPotential, clPotential) + cl.enqueue_copy(queue, MyKinetic, clKinetic) + print( + "Energy estimated: Viriel=%s Potential=%s Kinetic=%s\n" + % ( + np.sum(MyPotential) + 2 * np.sum(MyKinetic), + np.sum(MyPotential), + np.sum(MyKinetic), + ) + ) if SpeedRendering: - SizeOfBox=max(2*MyKinetic) + SizeOfBox = max(2 * MyKinetic) else: - SizeOfBox=SizeOfShape - + SizeOfBox = SizeOfShape + if OpenGL: - print('\tTiny documentation to interact OpenGL rendering:\n') - print('\t<Left|Right> Rotate around X axis') - print('\t <Up|Down> Rotate around Y axis') - print('\t <z|Z> Rotate around Z axis') - print('\t <-|+> Unzoom/Zoom') - print('\t <s> Toggle to display Positions or Velocities') - print('\t <Esc> Quit\n') - - wall_time_start=time.time() - - Durations=np.array([],dtype=MyFloat) - print('Starting!') + print("\tTiny documentation to interact OpenGL rendering:\n") + print("\t<Left|Right> Rotate around X axis") + print("\t <Up|Down> Rotate around Y axis") + print("\t <z|Z> Rotate around Z axis") + print("\t <-|+> Unzoom/Zoom") + print("\t <s> Toggle to display Positions or Velocities") + print("\t <Esc> Quit\n") + + wall_time_start = time.time() + + Durations = np.array([], dtype=MyFloat) + print("Starting!") if OpenGL: - from OpenGL.GL import * - from OpenGL.GLUT import * - global ViewRX,ViewRY,ViewRZ - Iterations=0 - ViewRX,ViewRY,ViewRZ = 0.,0.,0. + import OpenGL.GL as gl + import OpenGL.GLUT as glut + + global ViewRX, ViewRY, ViewRZ + Iterations = 0 + ViewRX, ViewRY, ViewRZ = 0.0, 0.0, 0.0 # Launch OpenGL Loop - glutInit(sys.argv) - glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB) - glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE,GLUT_ACTION_CONTINUE_EXECUTION) - glutInitWindowSize(512,512) - glutCreateWindow(b'NBodyGL') + glut.glutInit(sys.argv) + glut.glutInitDisplayMode(glut.GLUT_DOUBLE | glut.GLUT_RGB) + glut.glutSetOption(glut.GLUT_ACTION_ON_WINDOW_CLOSE, + glut.GLUT_ACTION_CONTINUE_EXECUTION) + glut.glutInitWindowSize(512, 512) + glut.glutCreateWindow(b"NBodyGL") setup_viewport() - glutReshapeFunc(reshape) - glutDisplayFunc(display) - glutIdleFunc(display) + glut.glutReshapeFunc(reshape) + glut.glutDisplayFunc(display) + glut.glutIdleFunc(display) # glutMouseFunc(mouse) - glutSpecialFunc(special) - glutKeyboardFunc(keyboard) - glutMainLoop() + glut.glutSpecialFunc(special) + glut.glutKeyboardFunc(keyboard) + glut.glutMainLoop() else: for iteration in range(Iterations): - Elapsed=MainOpenCL(clDataX,clDataV,Step,Method) + Elapsed = MainOpenCL(clDataX, clDataV, Step, Method) if Verbose: # print("Duration of #%s iteration: %s" % (iteration,Elapsed)) cl.enqueue_copy(queue, MyDataX, clDataX) - print("Positions for #%s iteration: %s" % (iteration,MyDataX)) + print("Positions for #%s iteration: %s" % (iteration, MyDataX)) else: - sys.stdout.write('.') + sys.stdout.write(".") sys.stdout.flush() - Durations=np.append(Durations,Elapsed) + Durations = np.append(Durations, Elapsed) - print('\nEnding!') - - MyRoutines.CenterOfMass(queue,(1,1),None,clDataX,clCoM,np.int32(Number)) - CLLaunch=MyRoutines.Potential(queue,(Number,1),None,clDataX,clPotential) - CLLaunch=MyRoutines.Kinetic(queue,(Number,1),None,clDataV,clKinetic) - CLLaunch.wait() - cl.enqueue_copy(queue,MyCoM,clCoM) - cl.enqueue_copy(queue,MyPotential,clPotential) - cl.enqueue_copy(queue,MyKinetic,clKinetic) - print('\nCenter Of Mass estimated: (%s,%s,%s)' % (MyCoM[0],MyCoM[1],MyCoM[2])) - print('Energy estimated: Viriel=%s Potential=%s Kinetic=%s\n'% (np.sum(MyPotential)+2.*np.sum(MyKinetic),np.sum(MyPotential),np.sum(MyKinetic))) + print("\nEnding!") - print("Duration stats on device %s with %s iterations :\n\tMean:\t%s\n\tMedian:\t%s\n\tStddev:\t%s\n\tMin:\t%s\n\tMax:\t%s\n\n\tVariability:\t%s\n" % (Device,Iterations,np.mean(Durations),np.median(Durations),np.std(Durations),np.min(Durations),np.max(Durations),np.std(Durations)/np.median(Durations))) + MyRoutines.CenterOfMass(queue, (1, 1), None, clDataX, clCoM, np.int32(Number)) + CLLaunch = MyRoutines.Potential(queue, (Number, 1), None, clDataX, clPotential) + CLLaunch = MyRoutines.Kinetic(queue, (Number, 1), None, clDataV, clKinetic) + CLLaunch.wait() + cl.enqueue_copy(queue, MyCoM, clCoM) + cl.enqueue_copy(queue, MyPotential, clPotential) + cl.enqueue_copy(queue, MyKinetic, clKinetic) + print("\nCenter Of Mass estimated: (%s,%s,%s)" % (MyCoM[0], MyCoM[1], MyCoM[2])) + print( + "Energy estimated: Viriel=%s Potential=%s Kinetic=%s\n" + % ( + np.sum(MyPotential) + 2.0 * np.sum(MyKinetic), + np.sum(MyPotential), + np.sum(MyKinetic), + ) + ) + + print( + "Duration stats on device %s with %s iterations :\n\tMean:\t%s\n\tMedian:\t%s\n\tStddev:\t%s\n\tMin:\t%s\n\tMax:\t%s\n\n\tVariability:\t%s\n" # noqa: E501 + % ( + Device, + Iterations, + np.mean(Durations), + np.median(Durations), + np.std(Durations), + np.min(Durations), + np.max(Durations), + np.std(Durations) / np.median(Durations), + ) + ) # FPS: 1/Elapsed - FPS=np.ones(len(Durations)) - FPS/=Durations - - print("FPS stats on device %s with %s iterations :\n\tMean:\t%s\n\tMedian:\t%s\n\tStddev:\t%s\n\tMin:\t%s\n\tMax:\t%s\n" % (Device,Iterations,np.mean(FPS),np.median(FPS),np.std(FPS),np.min(FPS),np.max(FPS))) + FPS = np.ones(len(Durations)) + FPS /= Durations + + print( + "FPS stats on device %s with %s iterations :\n\tMean:\t%s\n\tMedian:\t%s\n\tStddev:\t%s\n\tMin:\t%s\n\tMax:\t%s\n" # noqa: E501 + % ( + Device, + Iterations, + np.mean(FPS), + np.median(FPS), + np.std(FPS), + np.min(FPS), + np.max(FPS), + ) + ) # Contraction of Square*Size*Hertz: Size*Size/Elapsed - Squertz=np.ones(len(Durations)) - Squertz*=Number*Number - Squertz/=Durations + Squertz = np.ones(len(Durations)) + Squertz *= Number * Number + Squertz /= Durations + + print( + "Squertz in log10 & complete stats on device %s with %s iterations :\n\tMean:\t%s\t%s\n\tMedian:\t%s\t%s\n\tStddev:\t%s\t%s\n\tMin:\t%s\t%s\n\tMax:\t%s\t%s\n" # noqa: E501 + % ( + Device, + Iterations, + np.log10(np.mean(Squertz)), + np.mean(Squertz), + np.log10(np.median(Squertz)), + np.median(Squertz), + np.log10(np.std(Squertz)), + np.std(Squertz), + np.log10(np.min(Squertz)), + np.min(Squertz), + np.log10(np.max(Squertz)), + np.max(Squertz), + ) + ) - print("Squertz in log10 & complete stats on device %s with %s iterations :\n\tMean:\t%s\t%s\n\tMedian:\t%s\t%s\n\tStddev:\t%s\t%s\n\tMin:\t%s\t%s\n\tMax:\t%s\t%s\n" % (Device,Iterations,np.log10(np.mean(Squertz)),np.mean(Squertz),np.log10(np.median(Squertz)),np.median(Squertz),np.log10(np.std(Squertz)),np.std(Squertz),np.log10(np.min(Squertz)),np.min(Squertz),np.log10(np.max(Squertz)),np.max(Squertz))) - clDataX.release() clDataV.release() clKinetic.release() clPotential.release() - diff --git a/examples/narray.py b/examples/narray.py index 924c0d69cd89754574b68939c403c92822c5aa07..2ea00fa7f7eab024dc18b314f7dadb70015509a1 100644 --- a/examples/narray.py +++ b/examples/narray.py @@ -2,7 +2,7 @@ import pyopencl as cl import numpy as np -demo_r = np.empty( (500,5), dtype=np.uint32) +demo_r = np.empty((500, 5), dtype=np.uint32) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) @@ -23,7 +23,7 @@ __kernel void demo(__global uint *demo) try: prg.build() -except: +except Exception: print("Error:") print(prg.get_build_info(ctx.devices[0], cl.program_build_info.LOG)) raise @@ -33,4 +33,3 @@ cl.enqueue_copy(queue, demo_r, demo_buf).wait() for res in demo_r: print(res) - diff --git a/examples/pi-monte-carlo.py b/examples/pi-monte-carlo.py old mode 100755 new mode 100644 index 1fc0a91e22181a40ef87ab577bec5118951601c7..dcba18f5ef1d16beaea3ed21b2c0dec38a4dce9e --- a/examples/pi-monte-carlo.py +++ b/examples/pi-monte-carlo.py @@ -9,7 +9,7 @@ # # use -h for complete set of options # -# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> +# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> # Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> # @@ -17,7 +17,7 @@ # http://mathema.tician.de/software/pycuda # Thanks to Andreas Klockner for PyOpenCL: # http://mathema.tician.de/software/pyopencl -# +# # 2013-01-01 : problems with launch timeout # http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time @@ -25,107 +25,119 @@ # Common tools import numpy -from numpy.random import randint as nprnd import sys import getopt import time import itertools from socket import gethostname + class PenStacle: """Pentacle of Statistics from data""" - Avg=0 - Med=0 - Std=0 - Min=0 - Max=0 - def __init__(self,Data): - self.Avg=numpy.average(Data) - self.Med=numpy.median(Data) - self.Std=numpy.std(Data) - self.Max=numpy.max(Data) - self.Min=numpy.min(Data) + + Avg = 0 + Med = 0 + Std = 0 + Min = 0 + Max = 0 + + def __init__(self, Data): + self.Avg = numpy.average(Data) + self.Med = numpy.median(Data) + self.Std = numpy.std(Data) + self.Max = numpy.max(Data) + self.Min = numpy.min(Data) + def display(self): - print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max)) + print("%s %s %s %s %s" % (self.Avg, self.Med, self.Std, self.Min, self.Max)) + class Experience: """Metrology for experiences""" - DeviceStyle='' - DeviceId=0 - AvgD=0 - MedD=0 - StdD=0 - MinD=0 - MaxD=0 - AvgR=0 - MedR=0 - StdR=0 - MinR=0 - MaxR=0 - def __init__(self,DeviceStyle,DeviceId,Iterations): - self.DeviceStyle=DeviceStyle - self.DeviceId=DeviceId + + DeviceStyle = "" + DeviceId = 0 + AvgD = 0 + MedD = 0 + StdD = 0 + MinD = 0 + MaxD = 0 + AvgR = 0 + MedR = 0 + StdR = 0 + MinR = 0 + MaxR = 0 + + def __init__(self, DeviceStyle, DeviceId, Iterations): + self.DeviceStyle = DeviceStyle + self.DeviceId = DeviceId self.Iterations - - def Metrology(self,Data): - Duration=PenStacle(Data) - Rate=PenStacle(Iterations/Data) + + def Metrology(self, Data): + Duration = PenStacle(Data) + Rate = PenStacle(Iterations / Data) print("Duration %s" % Duration) print("Rate %s" % Rate) - def DictionariesAPI(): - Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3} - Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3} - Test={True:1,False:0} - return(Marsaglia,Computing,Test) - + Marsaglia = {"CONG": 0, "SHR3": 1, "MWC": 2, "KISS": 3} + Computing = {"INT32": 0, "INT64": 1, "FP32": 2, "FP64": 3} + Test = {True: 1, False: 0} + return (Marsaglia, Computing, Test) + + # find prime factors of a number # Get for WWW : # http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/ def PrimeFactors(x): - factorlist=numpy.array([]).astype('uint32') - loop=2 - while loop<=x: - if x%loop==0: - x/=loop - factorlist=numpy.append(factorlist,[loop]) + factorlist = numpy.array([]).astype("uint32") + loop = 2 + while loop <= x: + if x % loop == 0: + x /= loop + factorlist = numpy.append(factorlist, [loop]) else: - loop+=1 + loop += 1 return factorlist + # Try to find the best thread number in Hybrid approach (Blocks&Threads) # output is thread number def BestThreadsNumber(jobs): - factors=PrimeFactors(jobs) - matrix=numpy.append([factors],[factors[::-1]],axis=0) - threads=1 + factors = PrimeFactors(jobs) + matrix = numpy.append([factors], [factors[::-1]], axis=0) + threads = 1 for factor in matrix.transpose().ravel(): - threads=threads*factor - if threads*threads>jobs or threads>512: + threads = threads * factor + if threads * threads > jobs or threads > 512: break - return(long(threads)) + return int(threads) + -# Predicted Amdahl Law (Reduced with s=1-p) +# Predicted Amdahl Law (Reduced with s=1-p) def AmdahlR(N, T1, p): - return (T1*(1-p+p/N)) + return T1 * (1 - p + p / N) + # Predicted Amdahl Law def Amdahl(N, T1, s, p): - return (T1*(s+p/N)) + return T1 * (s + p / N) + # Predicted Mylq Law with first order -def Mylq(N, T1,s,c,p): - return (T1*(s+p/N)+c*N) +def Mylq(N, T1, s, c, p): + return T1 * (s + p / N) + c * N + # Predicted Mylq Law with second order -def Mylq2(N, T1,s,c1,c2,p): - return (T1*(s+p/N)+c1*N+c2*N*N) +def Mylq2(N, T1, s, c1, c2, p): + return T1 * (s + p / N) + c1 * N + c2 * N * N + def KernelCodeCuda(): - KERNEL_CODE_CUDA=""" + KERNEL_CODE_CUDA = """ #define TCONG 0 #define TSHR3 1 #define TMWC 2 @@ -272,10 +284,11 @@ __global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z } """ - return(KERNEL_CODE_CUDA) + return KERNEL_CODE_CUDA + def KernelCodeOpenCL(): - KERNEL_CODE_OPENCL=""" + KERNEL_CODE_OPENCL = """ #define TCONG 0 #define TSHR3 1 #define TMWC 2 @@ -395,25 +408,28 @@ ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work) total+=inside; #endif } - + return(total); } -__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z) +__kernel void MainLoopGlobal( + __global ulong *s,ulong iterations,uint seed_w,uint seed_z) { ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0)); barrier(CLK_GLOBAL_MEM_FENCE); - s[get_global_id(0)]=total; + s[get_global_id(0)]=total; } -__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z) +__kernel void MainLoopLocal( + __global ulong *s,ulong iterations,uint seed_w,uint seed_z) { ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0)); barrier(CLK_LOCAL_MEM_FENCE); s[get_local_id(0)]=total; } -__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z) +__kernel void MainLoopHybrid( + __global ulong *s,ulong iterations,uint seed_w,uint seed_z) { ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0)); barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE); @@ -421,184 +437,221 @@ __kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint } """ - return(KERNEL_CODE_OPENCL) - + return KERNEL_CODE_OPENCL + + def MetropolisCuda(InputCU): - print("Inside ",InputCU) - - iterations=InputCU['Iterations'] - steps=InputCU['Steps'] - blocks=InputCU['Blocks'] - threads=InputCU['Threads'] - Device=InputCU['Device'] - RNG=InputCU['RNG'] - ValueType=InputCU['ValueType'] - TestType=InputCU['IfThen'] - Seeds=InputCU['Seeds'] - - Marsaglia,Computing,Test=DictionariesAPI() - + print("Inside ", InputCU) + + iterations = InputCU["Iterations"] + steps = InputCU["Steps"] + blocks = InputCU["Blocks"] + threads = InputCU["Threads"] + Device = InputCU["Device"] + RNG = InputCU["RNG"] + ValueType = InputCU["ValueType"] + Seeds = InputCU["Seeds"] + + Marsaglia, Computing, Test = DictionariesAPI() + try: # For PyCUDA import import pycuda.driver as cuda from pycuda.compiler import SourceModule - + cuda.init() for Id in range(cuda.Device.count()): - if Id==Device: - XPU=cuda.Device(Id) + if Id == Device: + XPU = cuda.Device(Id) print("GPU selected %s" % XPU.name()) print except ImportError: print("Platform does not seem to support CUDA") - circle=numpy.zeros(blocks*threads).astype(numpy.uint64) + circle = numpy.zeros(blocks * threads).astype(numpy.uint64) circleCU = cuda.InOut(circle) - #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize) - #cuda.memcpy_htod(circleCU, circle) + # circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize) + # cuda.memcpy_htod(circleCU, circle) - Context=XPU.make_context() + Context = XPU.make_context() try: - mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])]) - #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True) + mod = SourceModule( + KernelCodeCuda(), + options=[ + "--compiler-options", + "-DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG], Computing[ValueType]), + ], + ) + # mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True) # Needed to set the compiler via ccbin for CUDA9 implementation - #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True) - except: + # mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True) # noqa: E501 + except Exception: print("Compilation seems to break") - - MetropolisBlocksCU=mod.get_function("MainLoopBlocks") - MetropolisThreadsCU=mod.get_function("MainLoopThreads") - MetropolisHybridCU=mod.get_function("MainLoopHybrid") - MyDuration=numpy.zeros(steps) + MetropolisBlocksCU = mod.get_function("MainLoopBlocks") # noqa: F841 + MetropolisThreadsCU = mod.get_function("MainLoopThreads") # noqa: F841 + MetropolisHybridCU = mod.get_function("MainLoopHybrid") + + MyDuration = numpy.zeros(steps) - jobs=blocks*threads; + jobs = blocks * threads - iterationsCU=numpy.uint64(iterations/jobs) - if iterations%jobs!=0: - iterationsCU+=numpy.uint64(1) + iterationsCU = numpy.uint64(iterations / jobs) + if iterations % jobs != 0: + iterationsCU += numpy.uint64(1) for i in range(steps): - start_time=time.time() + start_time = time.time() try: - MetropolisHybridCU(circleCU, - numpy.uint64(iterationsCU), - numpy.uint32(Seeds[0]), - numpy.uint32(Seeds[1]), - grid=(blocks,1),block=(threads,1,1)) - except: + MetropolisHybridCU( + circleCU, + numpy.uint64(iterationsCU), + numpy.uint32(Seeds[0]), + numpy.uint32(Seeds[1]), + grid=(blocks, 1), + block=(threads, 1, 1), + ) + except Exception: print("Crash during CUDA call") - elapsed = time.time()-start_time - print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed)) + elapsed = time.time() - start_time + print( + "(Blocks/Threads)=(%i,%i) method done in %.2f s..." + % (blocks, threads, elapsed) + ) - MyDuration[i]=elapsed + MyDuration[i] = elapsed - OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration} + OutputCU = { + "Inside": sum(circle), + "NewIterations": numpy.uint64(iterationsCU * jobs), + "Duration": MyDuration, + } print(OutputCU) Context.pop() - + Context.detach() - return(OutputCU) + return OutputCU + def MetropolisOpenCL(InputCL): import pyopencl as cl - iterations=InputCL['Iterations'] - steps=InputCL['Steps'] - blocks=InputCL['Blocks'] - threads=InputCL['Threads'] - Device=InputCL['Device'] - RNG=InputCL['RNG'] - ValueType=InputCL['ValueType'] - TestType=InputCL['IfThen'] - Seeds=InputCL['Seeds'] + iterations = InputCL["Iterations"] + steps = InputCL["Steps"] + blocks = InputCL["Blocks"] + threads = InputCL["Threads"] + Device = InputCL["Device"] + RNG = InputCL["RNG"] + ValueType = InputCL["ValueType"] + TestType = InputCL["IfThen"] + Seeds = InputCL["Seeds"] - Marsaglia,Computing,Test=DictionariesAPI() + Marsaglia, Computing, Test = DictionariesAPI() # Initialisation des variables en les CASTant correctement - Id=0 - HasXPU=False + Id = 0 + HasXPU = False for platform in cl.get_platforms(): for device in platform.get_devices(): - if Id==Device: - XPU=device - print("CPU/GPU selected: ",device.name.lstrip()) - HasXPU=True - Id+=1 + if Id == Device: + XPU = device + print("CPU/GPU selected: ", device.name.lstrip()) + HasXPU = True + Id += 1 # print(Id) - if HasXPU==False: - print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1)) - sys.exit() + if not HasXPU: + print("No XPU #%i found in all of %i devices, sorry..." % (Device, Id - 1)) + sys.exit() # Je cree le contexte et la queue pour son execution try: ctx = cl.Context(devices=[XPU]) - queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE) - except: + queue = cl.CommandQueue( + ctx, properties=cl.command_queue_properties.PROFILING_ENABLE + ) + except Exception: print("Crash during context creation") # Je recupere les flag possibles pour les buffers mf = cl.mem_flags - - circle=numpy.zeros(blocks*threads).astype(numpy.uint64) - circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle) - - MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType])) - MyDuration=numpy.zeros(steps) - - jobs=blocks*threads; + circle = numpy.zeros(blocks * threads).astype(numpy.uint64) + circleCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=circle) + + MetropolisCL = cl.Program(ctx, KernelCodeOpenCL()).build( + options="-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" + % (Marsaglia[RNG], Computing[ValueType], Test[TestType]) + ) + + MyDuration = numpy.zeros(steps) - iterationsCL=numpy.uint64(iterations/jobs) - if iterations%jobs!=0: - iterationsCL+=1 + jobs = blocks * threads + + iterationsCL = numpy.uint64(iterations / jobs) + if iterations % jobs != 0: + iterationsCL += 1 for i in range(steps): - start_time=time.time() + start_time = time.time() if threads == 1: - CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None, - circleCL, - numpy.uint64(iterationsCL), - numpy.uint32(Seeds[0]), - numpy.uint32(Seeds[1])) + CLLaunch = MetropolisCL.MainLoopGlobal( + queue, + (blocks,), + None, + circleCL, + numpy.uint64(iterationsCL), + numpy.uint32(Seeds[0]), + numpy.uint32(Seeds[1]), + ) else: - CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,), - circleCL, - numpy.uint64(iterationsCL), - numpy.uint32(Seeds[0]), - numpy.uint32(Seeds[1])) + CLLaunch = MetropolisCL.MainLoopHybrid( + queue, + (jobs,), + (threads,), + circleCL, + numpy.uint64(iterationsCL), + numpy.uint32(Seeds[0]), + numpy.uint32(Seeds[1]), + ) CLLaunch.wait() cl.enqueue_copy(queue, circle, circleCL).wait() - elapsed = time.time()-start_time - print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed)) - + elapsed = time.time() - start_time + print( + "(Blocks/Threads)=(%i,%i) method done in %.2f s..." + % (blocks, threads, elapsed) + ) + # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start) # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle) - MyDuration[i]=elapsed + MyDuration[i] = elapsed # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32) # MyPi[i]=numpy.median(AllPi) # print MyPi[i],numpy.std(AllPi),MyDuration[i] circleCL.release() - OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration} + OutputCL = { + "Inside": sum(circle), + "NewIterations": numpy.uint64(iterationsCL * jobs), + "Duration": MyDuration, + } # print(OutputCL) - return(OutputCL) + return OutputCL -def FitAndPrint(N,D,Curves): +def FitAndPrint(N, D, Curves): from scipy.optimize import curve_fit import matplotlib.pyplot as plt @@ -606,158 +659,201 @@ def FitAndPrint(N,D,Curves): try: coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D) - D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]) - coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0] - coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0] - coeffs_Amdahl[0]=D[0] - print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])) - except: + D_Amdahl = Amdahl(N, coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2]) + coeffs_Amdahl[1] = coeffs_Amdahl[1] * coeffs_Amdahl[0] / D[0] + coeffs_Amdahl[2] = coeffs_Amdahl[2] * coeffs_Amdahl[0] / D[0] + coeffs_Amdahl[0] = D[0] + print( + "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" + % (coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2]) + ) + except Exception: print("Impossible to fit for Amdahl law : only %i elements" % len(D)) try: coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D) - D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1]) - coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0] - coeffs_AmdahlR[0]=D[0] - print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])) + # D_AmdahlR = AmdahlR(N, coeffs_AmdahlR[0], coeffs_AmdahlR[1]) + coeffs_AmdahlR[1] = coeffs_AmdahlR[1] * coeffs_AmdahlR[0] / D[0] + coeffs_AmdahlR[0] = D[0] + print( + "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" + % (coeffs_AmdahlR[0], 1 - coeffs_AmdahlR[1], coeffs_AmdahlR[1]) + ) - except: + except Exception: print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)) try: coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D) - coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0] + coeffs_Mylq[1] = coeffs_Mylq[1] * coeffs_Mylq[0] / D[0] # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0] - coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0] - coeffs_Mylq[0]=D[0] - print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0], - coeffs_Mylq[1], - coeffs_Mylq[3], - coeffs_Mylq[2])) - D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2], - coeffs_Mylq[3]) - except: + coeffs_Mylq[3] = coeffs_Mylq[3] * coeffs_Mylq[0] / D[0] + coeffs_Mylq[0] = D[0] + print( + "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" + % (coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[3], coeffs_Mylq[2]) + ) + D_Mylq = Mylq(N, coeffs_Mylq[0], coeffs_Mylq[1], coeffs_Mylq[2], + coeffs_Mylq[3]) + except Exception: print("Impossible to fit for Mylq law : only %i elements" % len(D)) try: coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D) - coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0] + coeffs_Mylq2[1] = coeffs_Mylq2[1] * coeffs_Mylq2[0] / D[0] # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0] # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0] - coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0] - coeffs_Mylq2[0]=D[0] - print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])) - - except: + coeffs_Mylq2[4] = coeffs_Mylq2[4] * coeffs_Mylq2[0] / D[0] + coeffs_Mylq2[0] = D[0] + print( + "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" + % ( + coeffs_Mylq2[0], + coeffs_Mylq2[1], + coeffs_Mylq2[4], + coeffs_Mylq2[2], + coeffs_Mylq2[3], + ) + ) + + except Exception: print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)) if Curves: plt.xlabel("Number of Threads/work Items") plt.ylabel("Total Elapsed Time") - Experience,=plt.plot(N,D,'ro') + (Experience,) = plt.plot(N, D, "ro") try: - pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl") - pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq") - except: + (pAmdahl,) = plt.plot(N, D_Amdahl, label="Loi de Amdahl") + (pMylq,) = plt.plot(N, D_Mylq, label="Loi de Mylq") + except Exception: print("Fit curves seem not to be available") plt.legend() plt.show() -if __name__=='__main__': - + +if __name__ == "__main__": + # Set defaults values - + # GPU style can be Cuda (Nvidia implementation) or OpenCL - GpuStyle='OpenCL' + GpuStyle = "OpenCL" # Iterations is integer - Iterations=1000000000 + Iterations = 1000000000 # BlocksBlocks in first number of Blocks to explore - BlocksBegin=1024 + BlocksBegin = 1024 # BlocksEnd is last number of Blocks to explore - BlocksEnd=1024 + BlocksEnd = 1024 # BlocksStep is the step of Blocks to explore - BlocksStep=1 + BlocksStep = 1 # ThreadsBlocks in first number of Blocks to explore - ThreadsBegin=1 + ThreadsBegin = 1 # ThreadsEnd is last number of Blocks to explore - ThreadsEnd=1 + ThreadsEnd = 1 # ThreadsStep is the step of Blocks to explore - ThreadsStep=1 + ThreadsStep = 1 # Redo is the times to redo the test to improve metrology - Redo=1 + Redo = 1 # OutMetrology is method for duration estimation : False is GPU inside - OutMetrology=False - Metrology='InMetro' + OutMetrology = False + Metrology = "InMetro" # Curves is True to print the curves - Curves=False + Curves = False # Fit is True to print the curves - Fit=False + Fit = False # Inside based on If - IfThen=False + IfThen = False # Marsaglia RNG - RNG='MWC' + RNG = "MWC" # Value type : INT32, INT64, FP32, FP64 - ValueType='FP32' + ValueType = "FP32" # Seeds for RNG - Seeds=110271,101008 + Seeds = 110271, 101008 + + HowToUse = "%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>" # noqa: E501 - HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>' - try: - opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="]) + opts, args = getopt.getopt( + sys.argv[1:], + "hockg:i:b:e:s:f:l:t:r:d:m:v:", + [ + "gpustyle=", + "iterations=", + "blocksBegin=", + "blocksEnd=", + "blocksStep=", + "threadsFirst=", + "threadsLast=", + "threadssTep=", + "redo=", + "device=", + "marsaglia=", + "valuetype=", + ], + ) except getopt.GetoptError: print(HowToUse % sys.argv[0]) sys.exit(2) # List of Devices - Devices=[] - Alu={} - + Devices = [] + Alu = {} + for opt, arg in opts: - if opt == '-h': + if opt == "-h": print(HowToUse % sys.argv[0]) print("\nInformations about devices detected under OpenCL API:") # For PyOpenCL import try: import pyopencl as cl - Id=0 + + Id = 0 for platform in cl.get_platforms(): for device in platform.get_devices(): - #deviceType=cl.device_type.to_string(device.type) - deviceType="xPU" - print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())) - Id=Id+1 - - except: + # deviceType=cl.device_type.to_string(device.type) + deviceType = "xPU" + print( + "Device #%i from %s of type %s : %s" + % ( + Id, + platform.vendor.lstrip(), + deviceType, + device.name.lstrip(), + ) + ) + Id = Id + 1 + + except Exception: print("Your platform does not seem to support OpenCL") print("\nInformations about devices detected under CUDA API:") # For PyCUDA import try: import pycuda.driver as cuda + cuda.init() for Id in range(cuda.Device.count()): - device=cuda.Device(Id) - print("Device #%i of type GPU : %s" % (Id,device.name())) + device = cuda.Device(Id) + print("Device #%i of type GPU : %s" % (Id, device.name())) print - except: + except Exception: print("Your platform does not seem to support CUDA") - + sys.exit() - - elif opt == '-o': - OutMetrology=True - Metrology='OutMetro' - elif opt == '-c': - Curves=True - elif opt == '-k': - IfThen=True + + elif opt == "-o": + OutMetrology = True + Metrology = "OutMetro" + elif opt == "-c": + Curves = True + elif opt == "-k": + IfThen = True elif opt in ("-d", "--device"): Devices.append(int(arg)) elif opt in ("-g", "--gpustyle"): @@ -770,7 +866,7 @@ if __name__=='__main__': Iterations = numpy.uint64(arg) elif opt in ("-b", "--blocksbegin"): BlocksBegin = int(arg) - BlocksEnd= BlocksBegin + BlocksEnd = BlocksBegin elif opt in ("-e", "--blocksend"): BlocksEnd = int(arg) elif opt in ("-s", "--blocksstep"): @@ -786,9 +882,9 @@ if __name__=='__main__': Redo = int(arg) # If no device has been specified, take the first one! - if len(Devices)==0: + if len(Devices) == 0: Devices.append(0) - + print("Devices Identification : %s" % Devices) print("GpuStyle used : %s" % GpuStyle) print("Iterations : %s" % Iterations) @@ -803,168 +899,267 @@ if __name__=='__main__': print("Type of Marsaglia RNG used : %s" % RNG) print("Type of variable : %s" % ValueType) - if GpuStyle=='CUDA': + if GpuStyle == "CUDA": try: # For PyCUDA import import pycuda.driver as cuda - + cuda.init() for Id in range(cuda.Device.count()): - device=cuda.Device(Id) - print("Device #%i of type GPU : %s" % (Id,device.name())) + device = cuda.Device(Id) + print("Device #%i of type GPU : %s" % (Id, device.name())) if Id in Devices: - Alu[Id]='GPU' - + Alu[Id] = "GPU" + except ImportError: print("Platform does not seem to support CUDA") - if GpuStyle=='OpenCL': + if GpuStyle == "OpenCL": try: # For PyOpenCL import import pyopencl as cl - Id=0 + + Id = 0 for platform in cl.get_platforms(): for device in platform.get_devices(): - #deviceType=cl.device_type.to_string(device.type) - deviceType="xPU" - print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip())) + # deviceType=cl.device_type.to_string(device.type) + deviceType = "xPU" + print( + "Device #%i from %s of type %s : %s" + % ( + Id, + platform.vendor.lstrip().rstrip(), + deviceType, + device.name.lstrip().rstrip(), + ) + ) if Id in Devices: - # Set the Alu as detected Device Type - Alu[Id]=deviceType - Id=Id+1 + # Set the Alu as detected Device Type + Alu[Id] = deviceType + Id = Id + 1 except ImportError: print("Platform does not seem to support OpenCL") - #print(Devices,Alu) - - BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep) - ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep) - - ExploredJobs=numpy.array([]).astype(numpy.uint32) - ExploredBlocks=numpy.array([]).astype(numpy.uint32) - ExploredThreads=numpy.array([]).astype(numpy.uint32) - avgD=numpy.array([]).astype(numpy.float32) - medD=numpy.array([]).astype(numpy.float32) - stdD=numpy.array([]).astype(numpy.float32) - minD=numpy.array([]).astype(numpy.float32) - maxD=numpy.array([]).astype(numpy.float32) - avgR=numpy.array([]).astype(numpy.float32) - medR=numpy.array([]).astype(numpy.float32) - stdR=numpy.array([]).astype(numpy.float32) - minR=numpy.array([]).astype(numpy.float32) - maxR=numpy.array([]).astype(numpy.float32) - - for Blocks,Threads in itertools.product(BlocksList,ThreadsList): - + # print(Devices,Alu) + + BlocksList = range(BlocksBegin, BlocksEnd + BlocksStep, BlocksStep) + ThreadsList = range(ThreadsBegin, ThreadsEnd + ThreadsStep, ThreadsStep) + + ExploredJobs = numpy.array([]).astype(numpy.uint32) + ExploredBlocks = numpy.array([]).astype(numpy.uint32) + ExploredThreads = numpy.array([]).astype(numpy.uint32) + avgD = numpy.array([]).astype(numpy.float32) + medD = numpy.array([]).astype(numpy.float32) + stdD = numpy.array([]).astype(numpy.float32) + minD = numpy.array([]).astype(numpy.float32) + maxD = numpy.array([]).astype(numpy.float32) + avgR = numpy.array([]).astype(numpy.float32) + medR = numpy.array([]).astype(numpy.float32) + stdR = numpy.array([]).astype(numpy.float32) + minR = numpy.array([]).astype(numpy.float32) + maxR = numpy.array([]).astype(numpy.float32) + + for Blocks, Threads in itertools.product(BlocksList, ThreadsList): + # print Blocks,Threads - circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64) - ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads) - ExploredBlocks=numpy.append(ExploredBlocks,Blocks) - ExploredThreads=numpy.append(ExploredThreads,Threads) - - if OutMetrology: - DurationItem=numpy.array([]).astype(numpy.float32) - Duration=numpy.array([]).astype(numpy.float32) - Rate=numpy.array([]).astype(numpy.float32) + circle = numpy.zeros(Blocks * Threads).astype(numpy.uint64) + ExploredJobs = numpy.append(ExploredJobs, Blocks * Threads) + ExploredBlocks = numpy.append(ExploredBlocks, Blocks) + ExploredThreads = numpy.append(ExploredThreads, Threads) + + if OutMetrology: + DurationItem = numpy.array([]).astype(numpy.float32) + Duration = numpy.array([]).astype(numpy.float32) + Rate = numpy.array([]).astype(numpy.float32) for i in range(Redo): - start=time.time() - if GpuStyle=='CUDA': + start = time.time() + if GpuStyle == "CUDA": try: - InputCU={} - InputCU['Iterations']=Iterations - InputCU['Steps']=1 - InputCU['Blocks']=Blocks - InputCU['Threads']=Threads - InputCU['Device']=Devices[0] - InputCU['RNG']=RNG - InputCU['Seeds']=Seeds - InputCU['ValueType']=ValueType - InputCU['IfThen']=IfThen - OutputCU=MetropolisCuda(InputCU) - Inside=OutputCU['Circle'] - NewIterations=OutputCU['NewIterations'] - Duration=OutputCU['Duration'] - except: - print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads)) - elif GpuStyle=='OpenCL': + InputCU = {} + InputCU["Iterations"] = Iterations + InputCU["Steps"] = 1 + InputCU["Blocks"] = Blocks + InputCU["Threads"] = Threads + InputCU["Device"] = Devices[0] + InputCU["RNG"] = RNG + InputCU["Seeds"] = Seeds + InputCU["ValueType"] = ValueType + InputCU["IfThen"] = IfThen + OutputCU = MetropolisCuda(InputCU) + Inside = OutputCU["Circle"] + NewIterations = OutputCU["NewIterations"] + Duration = OutputCU["Duration"] + except Exception: + print( + "Problem with (%i,%i) // computations on Cuda" + % (Blocks, Threads) + ) + elif GpuStyle == "OpenCL": try: - InputCL={} - InputCL['Iterations']=Iterations - InputCL['Steps']=1 - InputCL['Blocks']=Blocks - InputCL['Threads']=Threads - InputCL['Device']=Devices[0] - InputCL['RNG']=RNG - InputCL['Seeds']=Seeds - InputCL['ValueType']=ValueType - InputCL['IfThen']=IfThen - OutputCL=MetropolisOpenCL(InputCL) - Inside=OutputCL['Circle'] - NewIterations=OutputCL['NewIterations'] - Duration=OutputCL['Duration'] - except: - print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads)) - Duration=numpy.append(Duration,time.time()-start) - Rate=numpy.append(Rate,NewIterations/Duration[-1]) + InputCL = {} + InputCL["Iterations"] = Iterations + InputCL["Steps"] = 1 + InputCL["Blocks"] = Blocks + InputCL["Threads"] = Threads + InputCL["Device"] = Devices[0] + InputCL["RNG"] = RNG + InputCL["Seeds"] = Seeds + InputCL["ValueType"] = ValueType + InputCL["IfThen"] = IfThen + OutputCL = MetropolisOpenCL(InputCL) + Inside = OutputCL["Circle"] + NewIterations = OutputCL["NewIterations"] + Duration = OutputCL["Duration"] + except Exception: + print( + "Problem with (%i,%i) // computations on OpenCL" + % (Blocks, Threads) + ) + Duration = numpy.append(Duration, time.time() - start) + Rate = numpy.append(Rate, NewIterations / Duration[-1]) else: - if GpuStyle=='CUDA': + if GpuStyle == "CUDA": try: - InputCU={} - InputCU['Iterations']=Iterations - InputCU['Steps']=Redo - InputCU['Blocks']=Blocks - InputCU['Threads']=Threads - InputCU['Device']=Devices[0] - InputCU['RNG']=RNG - InputCU['Seeds']=Seeds - InputCU['ValueType']=ValueType - InputCU['IfThen']=IfThen - OutputCU=MetropolisCuda(InputCU) - Inside=OutputCU['Inside'] - NewIterations=OutputCU['NewIterations'] - Duration=OutputCU['Duration'] - pycuda.context.pop() - except: - print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads)) - elif GpuStyle=='OpenCL': + InputCU = {} + InputCU["Iterations"] = Iterations + InputCU["Steps"] = Redo + InputCU["Blocks"] = Blocks + InputCU["Threads"] = Threads + InputCU["Device"] = Devices[0] + InputCU["RNG"] = RNG + InputCU["Seeds"] = Seeds + InputCU["ValueType"] = ValueType + InputCU["IfThen"] = IfThen + OutputCU = MetropolisCuda(InputCU) + Inside = OutputCU["Inside"] + NewIterations = OutputCU["NewIterations"] + Duration = OutputCU["Duration"] + pycuda.context.pop() # noqa: F821 + except Exception: + print( + "Problem with (%i,%i) // computations on Cuda" + % (Blocks, Threads) + ) + elif GpuStyle == "OpenCL": try: - InputCL={} - InputCL['Iterations']=Iterations - InputCL['Steps']=Redo - InputCL['Blocks']=Blocks - InputCL['Threads']=Threads - InputCL['Device']=Devices[0] - InputCL['RNG']=RNG - InputCL['Seeds']=Seeds - InputCL['ValueType']=ValueType - InputCL['IfThen']=IfThen - OutputCL=MetropolisOpenCL(InputCL) - Inside=OutputCL['Inside'] - NewIterations=OutputCL['NewIterations'] - Duration=OutputCL['Duration'] - except: - print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads)) - Rate=NewIterations/Duration[-1] - print("Itops %i\nLogItops %.2f " % (int(Rate),numpy.log(Rate)/numpy.log(10))) - print("Pi estimation %.8f" % (4./NewIterations*Inside)) - - avgD=numpy.append(avgD,numpy.average(Duration)) - medD=numpy.append(medD,numpy.median(Duration)) - stdD=numpy.append(stdD,numpy.std(Duration)) - minD=numpy.append(minD,numpy.min(Duration)) - maxD=numpy.append(maxD,numpy.max(Duration)) - avgR=numpy.append(avgR,numpy.average(Rate)) - medR=numpy.append(medR,numpy.median(Rate)) - stdR=numpy.append(stdR,numpy.std(Rate)) - minR=numpy.append(minR,numpy.min(Rate)) - maxR=numpy.append(maxR,numpy.max(Rate)) - - print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1])) - - numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR)) - ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ] - numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i') + InputCL = {} + InputCL["Iterations"] = Iterations + InputCL["Steps"] = Redo + InputCL["Blocks"] = Blocks + InputCL["Threads"] = Threads + InputCL["Device"] = Devices[0] + InputCL["RNG"] = RNG + InputCL["Seeds"] = Seeds + InputCL["ValueType"] = ValueType + InputCL["IfThen"] = IfThen + OutputCL = MetropolisOpenCL(InputCL) + Inside = OutputCL["Inside"] + NewIterations = OutputCL["NewIterations"] + Duration = OutputCL["Duration"] + except Exception: + print( + "Problem with (%i,%i) // computations on OpenCL" + % (Blocks, Threads) + ) + Rate = NewIterations / Duration[-1] + print( + "Itops %i\nLogItops %.2f " + % (int(Rate), numpy.log(Rate) / numpy.log(10)) + ) + print("Pi estimation %.8f" % (4.0 / NewIterations * Inside)) + + avgD = numpy.append(avgD, numpy.average(Duration)) + medD = numpy.append(medD, numpy.median(Duration)) + stdD = numpy.append(stdD, numpy.std(Duration)) + minD = numpy.append(minD, numpy.min(Duration)) + maxD = numpy.append(maxD, numpy.max(Duration)) + avgR = numpy.append(avgR, numpy.average(Rate)) + medR = numpy.append(medR, numpy.median(Rate)) + stdR = numpy.append(stdR, numpy.std(Rate)) + minR = numpy.append(minR, numpy.min(Rate)) + maxR = numpy.append(maxR, numpy.max(Rate)) + + print( + "%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" + % ( + avgD[-1], + medD[-1], + stdD[-1], + minD[-1], + maxD[-1], + avgR[-1], + medR[-1], + stdR[-1], + minR[-1], + maxR[-1], + ) + ) + + numpy.savez( + "Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" + % ( + ValueType, + RNG, + Alu[Devices[0]], + GpuStyle, + BlocksBegin, + BlocksEnd, + ThreadsBegin, + ThreadsEnd, + Iterations, + Devices[0], + Metrology, + gethostname(), + ), + ( + ExploredBlocks, + ExploredThreads, + avgD, + medD, + stdD, + minD, + maxD, + avgR, + medR, + stdR, + minR, + maxR, + ), + ) + ToSave = [ + ExploredBlocks, + ExploredThreads, + avgD, + medD, + stdD, + minD, + maxD, + avgR, + medR, + stdR, + minR, + maxR, + ] + numpy.savetxt( + "Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" + % ( + ValueType, + RNG, + Alu[Devices[0]], + GpuStyle, + BlocksBegin, + BlocksEnd, + ThreadsBegin, + ThreadsEnd, + Iterations, + Devices[0], + Metrology, + gethostname(), + ), + numpy.transpose(ToSave), + fmt="%i %i %e %e %e %e %e %i %i %i %i %i", + ) if Fit: - FitAndPrint(ExploredJobs,median,Curves) + FitAndPrint(ExploredJobs, median, Curves) # noqa: F821, E501 # FIXME: undefined var 'median' diff --git a/examples/transpose.py b/examples/transpose.py index 6b06a98802eda2e26f4ad3ffb86cd7c761abd87c..d8915343ca222ee4173fc44209572cf55bb392f2 100644 --- a/examples/transpose.py +++ b/examples/transpose.py @@ -6,18 +6,16 @@ import numpy import numpy.linalg as la - - block_size = 16 - - class NaiveTranspose: def __init__(self, ctx): - self.kernel = cl.Program(ctx, """ - __kernel - void transpose( + self.kernel = ( + cl.Program( + ctx, + """ + __kernel void transpose( __global float *a_t, __global float *a, unsigned a_width, unsigned a_height) { @@ -26,17 +24,25 @@ class NaiveTranspose: a_t[write_idx] = a[read_idx]; } - """% {"block_size": block_size}).build().transpose + """,) + .build() + .transpose + ) def __call__(self, queue, tgt, src, shape): w, h = shape assert w % block_size == 0 assert h % block_size == 0 - return self.kernel(queue, (w, h), (block_size, block_size), - tgt, src, numpy.uint32(w), numpy.uint32(h)) - - + return self.kernel( + queue, + (w, h), + (block_size, block_size), + tgt, + src, + numpy.uint32(w), + numpy.uint32(h), + ) class SillyTranspose(NaiveTranspose): @@ -45,15 +51,17 @@ class SillyTranspose(NaiveTranspose): assert w % block_size == 0 assert h % block_size == 0 - return self.kernel(queue, (w, h), None, - tgt, src, numpy.uint32(w), numpy.uint32(h)) - - + return self.kernel( + queue, (w, h), None, tgt, src, numpy.uint32(w), numpy.uint32(h) + ) class TransposeWithLocal: def __init__(self, ctx): - self.kernel = cl.Program(ctx, """ + self.kernel = ( + cl.Program( + ctx, + """ #define BLOCK_SIZE %(block_size)d #define A_BLOCK_STRIDE (BLOCK_SIZE * a_width) #define A_T_BLOCK_STRIDE (BLOCK_SIZE * a_height) @@ -71,8 +79,10 @@ class TransposeWithLocal: get_group_id(1) * BLOCK_SIZE + get_group_id(0) * A_T_BLOCK_STRIDE; - int glob_idx_a = base_idx_a + get_local_id(0) + a_width * get_local_id(1); - int glob_idx_a_t = base_idx_a_t + get_local_id(0) + a_height * get_local_id(1); + int glob_idx_a = + base_idx_a + get_local_id(0) + a_width * get_local_id(1); + int glob_idx_a_t = + base_idx_a_t + get_local_id(0) + a_height * get_local_id(1); a_local[get_local_id(1)*BLOCK_SIZE+get_local_id(0)] = a[glob_idx_a]; @@ -80,18 +90,28 @@ class TransposeWithLocal: a_t[glob_idx_a_t] = a_local[get_local_id(0)*BLOCK_SIZE+get_local_id(1)]; } - """% {"block_size": block_size}).build().transpose + """ + % {"block_size": block_size}, + ) + .build() + .transpose + ) def __call__(self, queue, tgt, src, shape): w, h = shape assert w % block_size == 0 assert h % block_size == 0 - return self.kernel(queue, (w, h), (block_size, block_size), - tgt, src, numpy.uint32(w), numpy.uint32(h), - cl.LocalMemory(4*block_size*(block_size+1))) - - + return self.kernel( + queue, + (w, h), + (block_size, block_size), + tgt, + src, + numpy.uint32(w), + numpy.uint32(h), + cl.LocalMemory(4 * block_size * (block_size + 1)), + ) def transpose_using_cl(ctx, queue, cpu_src, cls): @@ -110,9 +130,6 @@ def transpose_using_cl(ctx, queue, cpu_src, cls): return result - - - def check_transpose(): for cls in [NaiveTranspose, SillyTranspose, TransposeWithLocal]: print("checking", cls.__name__) @@ -124,7 +141,7 @@ def check_transpose(): queue = cl.CommandQueue(ctx) for i in numpy.arange(10, 13, 0.125): - size = int(((2**i) // 32) * 32) + size = int(((2 ** i) // 32) * 32) print(size) source = numpy.random.rand(size, size).astype(numpy.float32) @@ -136,20 +153,18 @@ def check_transpose(): assert err_norm == 0, (size, err_norm) - - def benchmark_transpose(): ctx = cl.create_some_context() for dev in ctx.devices: assert dev.local_mem_size > 0 - queue = cl.CommandQueue(ctx, - properties=cl.command_queue_properties.PROFILING_ENABLE) + queue = cl.CommandQueue( + ctx, properties=cl.command_queue_properties.PROFILING_ENABLE + ) - sizes = [int(((2**i) // 32) * 32) - for i in numpy.arange(10, 13, 0.125)] - #for i in numpy.arange(10, 10.5, 0.125)] + sizes = [int(((2 ** i) // 32) * 32) for i in numpy.arange(10, 13, 0.125)] + # for i in numpy.arange(10, 10.5, 0.125)] mem_bandwidths = {} @@ -168,36 +183,44 @@ def benchmark_transpose(): a_t_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=source.nbytes) method = cls(ctx) - for i in range(4): + for _i in range(4): method(queue, a_t_buf, a_buf, source.shape) count = 12 events = [] - for i in range(count): + for _i in range(count): events.append(method(queue, a_t_buf, a_buf, source.shape)) events[-1].wait() time = sum(evt.profile.end - evt.profile.start for evt in events) - mem_bw = 2*source.nbytes*count/(time*1e-9) - print("benchmarking", name, size, mem_bw/1e9, "GB/s") + mem_bw = 2 * source.nbytes * count / (time * 1e-9) + print("benchmarking", name, size, mem_bw / 1e9, "GB/s") meth_mem_bws.append(mem_bw) a_buf.release() a_t_buf.release() try: - from matplotlib.pyplot import clf, plot, title, xlabel, ylabel, \ - savefig, legend, grid + from matplotlib.pyplot import ( + clf, + plot, + xlabel, + ylabel, + savefig, + legend, + grid, + ) except ModuleNotFoundError: pass else: for i in range(len(methods)): clf() - for j in range(i+1): + for j in range(i + 1): method = methods[j] name = method.__name__.replace("Transpose", "") - plot(sizes, numpy.array(mem_bandwidths[method])/1e9, "o-", label=name) + plot(sizes, numpy.array(mem_bandwidths[method]) / 1e9, "o-", + label=name) xlabel("Matrix width/height $N$") ylabel("Memory Bandwidth [GB/s]") @@ -209,4 +232,3 @@ def benchmark_transpose(): check_transpose() benchmark_transpose() - diff --git a/setup.cfg b/setup.cfg index 845fb8c484af1df89a79136b4bd54f97d89de4c1..59c86e50063bdd72882cd937a252acdf6e8e5ef4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,12 @@ [flake8] ignore = E126,E127,E128,E123,E226,E241,E242,E265,W503,E402 max-line-length=85 -exclude=pyopencl/compyte/ndarray,pyopencl/compyte/array.py +exclude=pyopencl/compyte/ndarray,pyopencl/compyte/array.py,gl_particle_animation.py,gl_interop_demo.py + +per-file-ignores= + examples/pi-monte-carlo.py:N,B + examples/black-hole-accretion.py:N + examples/n-body.py:N inline-quotes = " docstring-quotes = """