#!/usr/bin/env python3
#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# Performs an estimation of Pi using a Monte Carlo method:
# a large number of iterations is divided and distributed to compute units.
# Many options are provided to perform scalability tests.
#
# Use -h for the complete set of options.
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#
# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import itertools
from socket import gethostname


class PenStacle:
    """Pentacle of statistics from data: average, median, std, min, max"""
    Avg=0
    Med=0
    Std=0
    Min=0
    Max=0

    def __init__(self,Data):
        self.Avg=numpy.average(Data)
        self.Med=numpy.median(Data)
        self.Std=numpy.std(Data)
        self.Max=numpy.max(Data)
        self.Min=numpy.min(Data)

    def display(self):
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))


class Experience:
    """Metrology for experiments"""
    DeviceStyle=''
    DeviceId=0
    AvgD=0
    MedD=0
    StdD=0
    MinD=0
    MaxD=0
    AvgR=0
    MedR=0
    StdR=0
    MinR=0
    MaxR=0

    def __init__(self,DeviceStyle,DeviceId,Iterations):
        self.DeviceStyle=DeviceStyle
        self.DeviceId=DeviceId
        # The assignment was missing: the original line was a bare expression
        self.Iterations=Iterations

    def Metrology(self,Data):
        Duration=PenStacle(Data)
        Rate=PenStacle(self.Iterations/Data)
        print("Duration %s" % Duration)
        print("Rate %s" % Rate)


def DictionariesAPI():
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
    Test={True:1,False:0}
    return(Marsaglia,Computing,Test)


# Find the prime factors of a number
# Got it from the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
    factorlist=numpy.array([]).astype('uint32')
    loop=2
    while loop<=x:
        if x%loop==0:
            # Integer division: a plain /= would turn x into a float under Python 3
            x//=loop
            factorlist=numpy.append(factorlist,[loop])
        else:
            loop+=1
    return factorlist


# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# Output is the thread number
def BestThreadsNumber(jobs):
    factors=PrimeFactors(jobs)
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
    threads=1
    for factor in matrix.transpose().ravel():
        threads=threads*factor
        if threads*threads>jobs or threads>512:
            break
    # int() instead of long(): long does not exist in Python 3
    return(int(threads))


# Predicted Amdahl law (reduced with s=1-p)
def AmdahlR(N, T1, p):
    return (T1*(1-p+p/N))


# Predicted Amdahl law
def Amdahl(N, T1, s, p):
    return (T1*(s+p/N))


# Predicted Mylq law with first order
def Mylq(N, T1, s, c, p):
    return (T1*(s+p/N)+c*N)


# Predicted Mylq law with second order
def Mylq2(N, T1, s, c1, c2, p):
    return (T1*(s+p/N)+c1*N+c2*N*N)
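
# A minimal sketch (not part of the execution flow) of how these scaling
# laws can be fitted with scipy, as FitAndPrint does below; N and D are
# hypothetical arrays of parallelism levels and measured durations:
#
#   from scipy.optimize import curve_fit
#   N=numpy.array([1.,2.,4.,8.])        # hypothetical job counts
#   D=numpy.array([8.,4.2,2.3,1.4])     # hypothetical durations in seconds
#   coeffs,_=curve_fit(Amdahl,N,D)      # fits T1, s, p of T=T1*(s+p/N)
#   print(Amdahl(N,*coeffs))            # predicted durations
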
def KernelCodeCuda():
    KERNEL_CODE_CUDA="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f

// Seed order (seed_z,seed_w) aligned with the OpenCL twin below:
// the kernels' call sites already passed seed_z first
__device__ ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{
#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
   #define THEONE 1073741824
   #if TRNG == TCONG
      uint x=CONG>>17 ;
      uint y=CONG>>17 ;
   #elif TRNG == TSHR3
      uint x=SHR3>>17 ;
      uint y=SHR3>>17 ;
   #elif TRNG == TMWC
      uint x=MWC>>17 ;
      uint y=MWC>>17 ;
   #elif TRNG == TKISS
      uint x=KISS>>17 ;
      uint y=KISS>>17 ;
   #endif
#elif TYPE == TINT64
   #define THEONE 4611686018427387904
   #if TRNG == TCONG
      ulong x=(ulong)(CONG>>1) ;
      ulong y=(ulong)(CONG>>1) ;
   #elif TRNG == TSHR3
      ulong x=(ulong)(SHR3>>1) ;
      ulong y=(ulong)(SHR3>>1) ;
   #elif TRNG == TMWC
      ulong x=(ulong)(MWC>>1) ;
      ulong y=(ulong)(MWC>>1) ;
   #elif TRNG == TKISS
      ulong x=(ulong)(KISS>>1) ;
      ulong y=(ulong)(KISS>>1) ;
   #endif
#elif TYPE == TFP32
   #define THEONE 1.0f
   #if TRNG == TCONG
      float x=CONGfp ;
      float y=CONGfp ;
   #elif TRNG == TSHR3
      float x=SHR3fp ;
      float y=SHR3fp ;
   #elif TRNG == TMWC
      float x=MWCfp ;
      float y=MWCfp ;
   #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
   #endif
#elif TYPE == TFP64
   // 1.0 rather than 1.0f: the comparison is done in double precision
   #define THEONE 1.0
   #if TRNG == TCONG
      double x=(double)CONGfp ;
      double y=(double)CONGfp ;
   #elif TRNG == TSHR3
      double x=(double)SHR3fp ;
      double y=(double)SHR3fp ;
   #elif TRNG == TMWC
      double x=(double)MWCfp ;
      double y=(double)MWCfp ;
   #elif TRNG == TKISS
      double x=(double)KISSfp ;
      double y=(double)KISSfp ;
   #endif
#endif

#if TEST == IFTHEN
      if ((x*x+y*y) <= THEONE) {
         total+=1;
      }
#else
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
#endif
   }

   return(total);
}

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
   s[blockIdx.x]=total;
   __syncthreads();
}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
   s[threadIdx.x]=total;
   __syncthreads();
}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();
}
"""
    return(KERNEL_CODE_CUDA)
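
# For intuition only: a pure-Python sketch of what the CONG macro above
# computes, with explicit 32-bit wrapping (names are illustrative):
#
#   def cong_next(jcong):
#       return (69069*jcong+1234567) & 0xFFFFFFFF   # one 32-bit LCG step
#
#   sample=cong_next(101008)*2.328306435454494e-10  # maps to [0,1) like CONGfp
#
# SHR3, MWC and KISS follow the same pattern with different recurrences,
# per Marsaglia's classic RNG post.
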
def KernelCodeOpenCL():
    KERNEL_CODE_OPENCL="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f

ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{
#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
   #define THEONE 1073741824
   #if TRNG == TCONG
      uint x=CONG>>17 ;
      uint y=CONG>>17 ;
   #elif TRNG == TSHR3
      uint x=SHR3>>17 ;
      uint y=SHR3>>17 ;
   #elif TRNG == TMWC
      uint x=MWC>>17 ;
      uint y=MWC>>17 ;
   #elif TRNG == TKISS
      uint x=KISS>>17 ;
      uint y=KISS>>17 ;
   #endif
#elif TYPE == TINT64
   #define THEONE 4611686018427387904
   #if TRNG == TCONG
      ulong x=(ulong)(CONG>>1) ;
      ulong y=(ulong)(CONG>>1) ;
   #elif TRNG == TSHR3
      ulong x=(ulong)(SHR3>>1) ;
      ulong y=(ulong)(SHR3>>1) ;
   #elif TRNG == TMWC
      ulong x=(ulong)(MWC>>1) ;
      ulong y=(ulong)(MWC>>1) ;
   #elif TRNG == TKISS
      ulong x=(ulong)(KISS>>1) ;
      ulong y=(ulong)(KISS>>1) ;
   #endif
#elif TYPE == TFP32
   #define THEONE 1.0f
   #if TRNG == TCONG
      float x=CONGfp ;
      float y=CONGfp ;
   #elif TRNG == TSHR3
      float x=SHR3fp ;
      float y=SHR3fp ;
   #elif TRNG == TMWC
      float x=MWCfp ;
      float y=MWCfp ;
   #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
   #endif
#elif TYPE == TFP64
#pragma OPENCL EXTENSION cl_khr_fp64: enable
   // 1.0 rather than 1.0f: the comparison is done in double precision
   #define THEONE 1.0
   #if TRNG == TCONG
      double x=(double)CONGfp ;
      double y=(double)CONGfp ;
   #elif TRNG == TSHR3
      double x=(double)SHR3fp ;
      double y=(double)SHR3fp ;
   #elif TRNG == TMWC
      double x=(double)MWCfp ;
      double y=(double)MWCfp ;
   #elif TRNG == TKISS
      double x=(double)KISSfp ;
      double y=(double)KISSfp ;
   #endif
#endif

#if TEST == IFTHEN
      if ((x*x+y*y) <= THEONE) {
         total+=1;
      }
#else
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
#endif
   }

   return(total);
}

__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_local_id(0)]=total;
}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   // Bitwise OR combines the fence flags; the original logical '||'
   // collapsed them to the single value 1
   barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}
"""
    return(KERNEL_CODE_OPENCL)
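
# A minimal sketch, assuming a working PyOpenCL installation, of how this
# source is built and launched; MetropolisOpenCL below is the real path,
# and all values here are illustrative (TRNG=2 is MWC, TYPE=2 is FP32):
#
#   import pyopencl as cl
#   ctx=cl.create_some_context()
#   queue=cl.CommandQueue(ctx)
#   prg=cl.Program(ctx,KernelCodeOpenCL()).build(options="-DTRNG=2 -DTYPE=2 -DTEST=0")
#   out=numpy.zeros(16).astype(numpy.uint64)
#   buf=cl.Buffer(ctx,cl.mem_flags.WRITE_ONLY|cl.mem_flags.COPY_HOST_PTR,hostbuf=out)
#   prg.MainLoopGlobal(queue,(16,),None,buf,numpy.uint64(1000),
#                      numpy.uint32(110271),numpy.uint32(101008))
#   cl.enqueue_copy(queue,out,buf).wait()
#   print(4.*sum(out)/(1000*16))        # rough Pi estimate
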
def MetropolisCuda(InputCU):
    print("Inside ",InputCU)

    iterations=InputCU['Iterations']
    steps=InputCU['Steps']
    blocks=InputCU['Blocks']
    threads=InputCU['Threads']
    Device=InputCU['Device']
    RNG=InputCU['RNG']
    ValueType=InputCU['ValueType']
    TestType=InputCU['IfThen']
    Seeds=InputCU['Seeds']

    Marsaglia,Computing,Test=DictionariesAPI()

    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule

        cuda.init()
        for Id in range(cuda.Device.count()):
            if Id==Device:
                XPU=cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        print()
    except ImportError:
        print("Platform does not seem to support CUDA")

    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    circleCU = cuda.InOut(circle)
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemsize)
    #cuda.memcpy_htod(circleCU, circle)

    Context=XPU.make_context()

    try:
        # -DTEST was missing from the options: without it the kernel always
        # took the ternary branch, whatever the -k flag said
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s -DTEST=%s' % (Marsaglia[RNG],Computing[ValueType],Test[TestType])])
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
        # Needed to set the compiler via ccbin for CUDA9 implementation
        #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
    except:
        print("Compilation seems to break")

    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads
    iterationsCU=numpy.uint64(iterations//jobs)
    if iterations%jobs!=0:
        iterationsCU+=numpy.uint64(1)

    for i in range(steps):
        start_time=time.time()
        try:
            MetropolisHybridCU(circleCU,
                               numpy.uint64(iterationsCU),
                               numpy.uint32(Seeds[0]),
                               numpy.uint32(Seeds[1]),
                               grid=(blocks,1),block=(threads,1,1))
        except:
            print("Crash during CUDA call")
        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
        MyDuration[i]=elapsed

    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
    print(OutputCU)
    Context.pop()
    Context.detach()

    return(OutputCU)
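
# A minimal sketch of how MetropolisCuda is driven; the main section below
# builds the same dictionary, values here are illustrative:
#
#   InputCU={'Iterations':1000000,'Steps':1,'Blocks':16,'Threads':32,
#            'Device':0,'RNG':'MWC','ValueType':'FP32','IfThen':False,
#            'Seeds':(110271,101008)}
#   OutputCU=MetropolisCuda(InputCU)
#   print(4.*OutputCU['Inside']/OutputCU['NewIterations'])   # Pi estimate
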
def MetropolisOpenCL(InputCL):
    import pyopencl as cl

    iterations=InputCL['Iterations']
    steps=InputCL['Steps']
    blocks=InputCL['Blocks']
    threads=InputCL['Threads']
    Device=InputCL['Device']
    RNG=InputCL['RNG']
    ValueType=InputCL['ValueType']
    TestType=InputCL['IfThen']
    Seeds=InputCL['Seeds']

    Marsaglia,Computing,Test=DictionariesAPI()

    # Select the requested device among all detected ones
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
        sys.exit()

    # Create the context and the queue for its execution
    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,
                                properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    # Get the possible flags for the buffers
    mf = cl.mem_flags

    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build(
        options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads
    iterationsCL=numpy.uint64(iterations//jobs)
    if iterations%jobs!=0:
        iterationsCL+=numpy.uint64(1)

    for i in range(steps):
        start_time=time.time()
        if threads == 1:
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(Seeds[0]),
                                                 numpy.uint32(Seeds[1]))
        else:
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(Seeds[0]),
                                                 numpy.uint32(Seeds[1]))
        CLLaunch.wait()
        cl.enqueue_copy(queue, circle, circleCL).wait()
        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
        # The elapsed estimation based on CLLaunch profiling does not work
        # with Beignet OpenCL:
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
        # print(circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle))
        MyDuration[i]=elapsed
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
        # MyPi[i]=numpy.median(AllPi)
        # print(MyPi[i],numpy.std(AllPi),MyDuration[i])

    circleCL.release()

    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
    # print(OutputCL)
    return(OutputCL)
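
# A quick way to see which Device ids the enumeration above will assign,
# assuming PyOpenCL is installed (same traversal order as in
# MetropolisOpenCL):
#
#   import pyopencl as cl
#   Id=0
#   for platform in cl.get_platforms():
#       for device in platform.get_devices():
#           print(Id,platform.vendor.strip(),device.name.strip())
#           Id+=1
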
def FitAndPrint(N,D,Curves):
    from scipy.optimize import curve_fit
    import matplotlib.pyplot as plt

    try:
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[0]=D[0]
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" %
              (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
    except:
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))

    try:
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
        coeffs_AmdahlR[0]=D[0]
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" %
              (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
    except:
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))

    try:
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[0]=D[0]
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" %
              (coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[3],coeffs_Mylq[2]))
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                    coeffs_Mylq[3])
    except:
        print("Impossible to fit for Mylq law : only %i elements" % len(D))

    try:
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[0]=D[0]
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" %
              (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
    except:
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))

    if Curves:
        plt.xlabel("Number of Threads/work Items")
        plt.ylabel("Total Elapsed Time")

        Experience,=plt.plot(N,D,'ro')
        try:
            pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl's law")
            pMylq,=plt.plot(N,D_Mylq,label="Mylq's law")
        except:
            print("Fit curves seem not to be available")
        plt.legend()
        plt.show()


if __name__=='__main__':

    # Set default values

    # GPU style can be Cuda (Nvidia implementation) or OpenCL
    GpuStyle='OpenCL'
    # Iterations is integer
    Iterations=1000000000
    # BlocksBegin is the first number of blocks to explore
    BlocksBegin=1024
    # BlocksEnd is the last number of blocks to explore
    BlocksEnd=1024
    # BlocksStep is the step between block counts
    BlocksStep=1
    # ThreadsBegin is the first number of threads to explore
    ThreadsBegin=1
    # ThreadsEnd is the last number of threads to explore
    ThreadsEnd=1
    # ThreadsStep is the step between thread counts
    ThreadsStep=1
    # Redo is the number of times the test is redone to improve metrology
    Redo=1
    # OutMetrology selects the method for duration estimation:
    # False means timing is done inside the XPU routine
    OutMetrology=False
    Metrology='InMetro'
    # Curves is True to plot the curves
    Curves=False
    # Fit is True to fit measurements against the scaling laws
    Fit=False
    # Inside estimation based on an if/then test rather than a ternary
    IfThen=False
    # Marsaglia RNG
    RNG='MWC'
    # Value type : INT32, INT64, FP32, FP64
    ValueType='FP32'
    # Seeds for RNG
    Seeds=110271,101008

    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadsStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'

    try:
        # Long option names are lowercased to match the handlers below;
        # the original mixed-case spellings never matched at runtime
        opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksbegin=","blocksend=","blocksstep=","threadsfirst=","threadslast=","threadsstep=","redo=","device=","marsaglia=","valuetype="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1
            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()
        elif opt == '-o':
            OutMetrology=True
            Metrology='OutMetro'
        elif opt == '-c':
            Curves=True
        elif opt == '-k':
            IfThen=True
        elif opt in ("-d", "--device"):
            Devices.append(int(arg))
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-m", "--marsaglia"):
            RNG = arg
        elif opt in ("-v", "--valuetype"):
            ValueType = arg
        elif opt in ("-i", "--iterations"):
            Iterations = numpy.uint64(arg)
        elif opt in ("-b", "--blocksbegin"):
            BlocksBegin = int(arg)
            BlocksEnd = BlocksBegin
        elif opt in ("-e", "--blocksend"):
            BlocksEnd = int(arg)
        elif opt in ("-s", "--blocksstep"):
            BlocksStep = int(arg)
        elif opt in ("-f", "--threadsfirst"):
            ThreadsBegin = int(arg)
            ThreadsEnd = ThreadsBegin
        elif opt in ("-l", "--threadslast"):
            ThreadsEnd = int(arg)
        elif opt in ("-t", "--threadsstep"):
            ThreadsStep = int(arg)
        elif opt in ("-r", "--redo"):
            Redo = int(arg)
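
    # Illustrative invocations (the script name PiXPU.py is hypothetical;
    # use whatever this file is saved as):
    #
    #   python3 PiXPU.py -h                                # list devices, exit
    #   python3 PiXPU.py -g OpenCL -d 0 -i 100000000 -r 3
    #   python3 PiXPU.py -g CUDA -d 0 -b 1024 -e 4096 -s 1024 -m KISS -v FP64 -c
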
    # If no device has been specified, take the first one!
    if len(Devices)==0:
        Devices.append(0)

    print("Devices Identification : %s" % Devices)
    print("GpuStyle used : %s" % GpuStyle)
    print("Iterations : %s" % Iterations)
    print("Number of Blocks on begin : %s" % BlocksBegin)
    print("Number of Blocks on end : %s" % BlocksEnd)
    print("Step on Blocks : %s" % BlocksStep)
    print("Number of Threads on begin : %s" % ThreadsBegin)
    print("Number of Threads on end : %s" % ThreadsEnd)
    print("Step on Threads : %s" % ThreadsStep)
    print("Number of redo : %s" % Redo)
    print("Metrology done out of XPU : %r" % OutMetrology)
    print("Type of Marsaglia RNG used : %s" % RNG)
    print("Type of variable : %s" % ValueType)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'
        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    # print(Devices,Alu)

    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)

    ExploredJobs=numpy.array([]).astype(numpy.uint32)
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
    avgD=numpy.array([]).astype(numpy.float32)
    medD=numpy.array([]).astype(numpy.float32)
    stdD=numpy.array([]).astype(numpy.float32)
    minD=numpy.array([]).astype(numpy.float32)
    maxD=numpy.array([]).astype(numpy.float32)
    avgR=numpy.array([]).astype(numpy.float32)
    medR=numpy.array([]).astype(numpy.float32)
    stdR=numpy.array([]).astype(numpy.float32)
    minR=numpy.array([]).astype(numpy.float32)
    maxR=numpy.array([]).astype(numpy.float32)
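
    # itertools.product walks the full (Blocks,Threads) grid; for instance,
    # with -b 1 -e 2 -f 1 -l 2 the loop below visits:
    #   (1,1), (1,2), (2,1), (2,2)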
    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

        # print(Blocks,Threads)
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
        ExploredThreads=numpy.append(ExploredThreads,Threads)

        if OutMetrology:
            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                start=time.time()
                if GpuStyle=='CUDA':
                    try:
                        InputCU={}
                        InputCU['Iterations']=Iterations
                        InputCU['Steps']=1
                        InputCU['Blocks']=Blocks
                        InputCU['Threads']=Threads
                        InputCU['Device']=Devices[0]
                        InputCU['RNG']=RNG
                        InputCU['Seeds']=Seeds
                        InputCU['ValueType']=ValueType
                        InputCU['IfThen']=IfThen
                        OutputCU=MetropolisCuda(InputCU)
                        # The output key is 'Inside', not 'Circle'; in this
                        # mode the duration is the wall clock measured below
                        Inside=OutputCU['Inside']
                        NewIterations=OutputCU['NewIterations']
                    except:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        InputCL={}
                        InputCL['Iterations']=Iterations
                        InputCL['Steps']=1
                        InputCL['Blocks']=Blocks
                        InputCL['Threads']=Threads
                        InputCL['Device']=Devices[0]
                        InputCL['RNG']=RNG
                        InputCL['Seeds']=Seeds
                        InputCL['ValueType']=ValueType
                        InputCL['IfThen']=IfThen
                        OutputCL=MetropolisOpenCL(InputCL)
                        Inside=OutputCL['Inside']
                        NewIterations=OutputCL['NewIterations']
                    except:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
                Duration=numpy.append(Duration,time.time()-start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
        else:
            if GpuStyle=='CUDA':
                try:
                    InputCU={}
                    InputCU['Iterations']=Iterations
                    InputCU['Steps']=Redo
                    InputCU['Blocks']=Blocks
                    InputCU['Threads']=Threads
                    InputCU['Device']=Devices[0]
                    InputCU['RNG']=RNG
                    InputCU['Seeds']=Seeds
                    InputCU['ValueType']=ValueType
                    InputCU['IfThen']=IfThen
                    OutputCU=MetropolisCuda(InputCU)
                    Inside=OutputCU['Inside']
                    NewIterations=OutputCU['NewIterations']
                    Duration=OutputCU['Duration']
                    # The context is already popped and detached inside
                    # MetropolisCuda: the former pycuda.context.pop() call
                    # here raised a NameError
                except:
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
            elif GpuStyle=='OpenCL':
                try:
                    InputCL={}
                    InputCL['Iterations']=Iterations
                    InputCL['Steps']=Redo
                    InputCL['Blocks']=Blocks
                    InputCL['Threads']=Threads
                    InputCL['Device']=Devices[0]
                    InputCL['RNG']=RNG
                    InputCL['Seeds']=Seeds
                    InputCL['ValueType']=ValueType
                    InputCL['IfThen']=IfThen
                    OutputCL=MetropolisOpenCL(InputCL)
                    Inside=OutputCL['Inside']
                    NewIterations=OutputCL['NewIterations']
                    Duration=OutputCL['Duration']
                except:
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
            Rate=NewIterations/Duration[-1]
            print("Itops %i\nLogItops %.2f " % (int(Rate),numpy.log(Rate)/numpy.log(10)))
            print("Pi estimation %.8f" % (4./NewIterations*Inside))

        avgD=numpy.append(avgD,numpy.average(Duration))
        medD=numpy.append(medD,numpy.median(Duration))
        stdD=numpy.append(stdD,numpy.std(Duration))
        minD=numpy.append(minD,numpy.min(Duration))
        maxD=numpy.append(maxD,numpy.max(Duration))
        avgR=numpy.append(avgR,numpy.average(Rate))
        medR=numpy.append(medR,numpy.median(Rate))
        stdR=numpy.append(stdR,numpy.std(Rate))
        minR=numpy.append(minR,numpy.min(Rate))
        maxR=numpy.append(maxR,numpy.max(Rate))

        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

    numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
    ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
    numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')

    if Fit:
        # medD (median durations) is the measured series; 'median' was undefined
        FitAndPrint(ExploredJobs,medD,Curves)
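
    # A minimal sketch of how to reload the saved results for later analysis,
    # assuming one of the numpy.savez archives produced above (the filename
    # is illustrative; numpy appends .npz):
    #
    #   data=numpy.load("Pi_FP32_MWC_xPU_OpenCL_..._InMetro_myhost.npz")
    #   results=data['arr_0']   # shape (12,N): blocks, threads, then the stats
    #   print(results[2])       # average durations per explored configuration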