#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ NBody Demonstrator implemented in OpenCL, rendering OpenGL By default, rendering in OpenGL is disabled. Add -g option to activate. Part of matrix programs from: https://forge.cbp.ens-lyon.fr/svn/bench4gpu/ CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> Thanks to Andreas Klockner for PyOpenCL: http://mathema.tician.de/software/pyopencl """ import getopt import sys import time import numpy as np import pyopencl as cl import pyopencl.array as cl_array from numpy.random import randint as nprnd import string, sys def DictionariesAPI(): Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3} Computing={'FP32':0,'FP64':1} Interaction={'Force':0,'Potential':1} Artevasion={'None':0,'NegExp':1,'CorRad':2} return(Marsaglia,Computing,Interaction,Artevasion) BlobOpenCL= """ #define TFP32 0 #define TFP64 1 #define TFORCE 0 #define TPOTENTIAL 1 #define NONE 0 #define NEGEXP 1 #define CORRAD 2 #if TYPE == TFP32 #define MYFLOAT4 float4 #define MYFLOAT8 float8 #define MYFLOAT float #define DISTANCE fast_distance #else #define MYFLOAT4 double4 #define MYFLOAT8 double8 #define MYFLOAT double #define DISTANCE distance #if defined(cl_khr_fp64) // Khronos extension available? #pragma OPENCL EXTENSION cl_khr_fp64 : enable #endif #endif #define znew ((zmwc=36969*(zmwc&65535)+(zmwc>>16))<<16) #define wnew ((wmwc=18000*(wmwc&65535)+(wmwc>>16))&65535) #define MWC (znew+wnew) #define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) #define CONG (jcong=69069*jcong+1234567) #define KISS ((MWC^CONG)+SHR3) #define MWCfp (MYFLOAT)(MWC * 2.3283064365386963e-10f) #define KISSfp (MYFLOAT)(KISS * 2.3283064365386963e-10f) #define SHR3fp (MYFLOAT)(SHR3 * 2.3283064365386963e-10f) #define CONGfp (MYFLOAT)(CONG * 2.3283064365386963e-10f) #define PI (MYFLOAT)3.141592653589793238e0f #define SMALL_NUM (MYFLOAT)1.e-9f #define CoreRadius (MYFLOAT)(1.e0f) // Create my own Distance implementation: distance buggy on Oland AMD chipset MYFLOAT MyDistance(MYFLOAT4 n,MYFLOAT4 m) { private MYFLOAT x2,y2,z2; x2=n.s0-m.s0; x2*=x2; y2=n.s1-m.s1; y2*=y2; z2=n.s2-m.s2; z2*=z2; return(sqrt(x2+y2+z2)); } // Potential between 2 m,n bodies MYFLOAT PairPotential(MYFLOAT4 m,MYFLOAT4 n) #if ARTEVASION == NEGEXP // Add exp(-r) to numerator to avoid divergence for low distances { MYFLOAT r=DISTANCE(n,m); return((-1.e0f+exp(-r))/r); } #elif ARTEVASION == CORRAD // Add Core Radius to avoid divergence for low distances { MYFLOAT r=DISTANCE(n,m); return(-1.e0f/sqrt(r*r+CoreRadius*CoreRadius)); } #else // Classical potential in 1/r { // return((MYFLOAT)(-1.e0f)/(MyDistance(m,n))); return((MYFLOAT)(-1.e0f)/(DISTANCE(n,m))); } #endif // Interaction based of Force as gradient of Potential MYFLOAT4 Interaction(MYFLOAT4 m,MYFLOAT4 n) #if INTERACTION == TFORCE #if ARTEVASION == NEGEXP // Force gradient of potential, set as (1-exp(-r))/r { private MYFLOAT r=MyDistance(n,m); private MYFLOAT num=1.e0f+exp(-r)*(r-1.e0f); return((n-m)*num/(MYFLOAT)(r*r*r)); } #elif ARTEVASION == CORRAD // Force gradient of potential, (Core Radius) set as 1/sqrt(r**2+CoreRadius**2) { private MYFLOAT r=MyDistance(n,m); private MYFLOAT den=sqrt(r*r+CoreRadius*CoreRadius); return((n-m)/(MYFLOAT)(den*den*den)); } #else // Simplest implementation of force (equals to acceleration) // seems to bo bad (numerous artevasions) // MYFLOAT4 InteractionForce(MYFLOAT4 m,MYFLOAT4 n) { private MYFLOAT r=MyDistance(n,m); return((n-m)/(MYFLOAT)(r*r*r)); } #endif #else // Force definited as gradient of potential // Estimate potential and proximate potential to estimate force { // 1/1024 seems to be a good factor: larger one provides bad results private MYFLOAT epsilon=(MYFLOAT)(1.e0f/1024); private MYFLOAT4 er=normalize(n-m); private MYFLOAT4 dr=er*(MYFLOAT)epsilon; return(er/epsilon*(PairPotential(m,n)-PairPotential(m+dr,n))); } #endif MYFLOAT AtomicPotential(__global MYFLOAT4* clDataX,int gid) { private MYFLOAT potential=(MYFLOAT)0.e0f; private MYFLOAT4 x=clDataX[gid]; for (int i=0;i<get_global_size(0);i++) { if (gid != i) potential+=PairPotential(x,clDataX[i]); } barrier(CLK_GLOBAL_MEM_FENCE); return(potential); } MYFLOAT AtomicPotentialCoM(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,int gid) { return(PairPotential(clDataX[gid],clCoM[0])); } // Elements from : http://doswa.com/2009/01/02/fourth-order-runge-kutta-numerical-integration.html MYFLOAT8 AtomicRungeKutta(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt) { private MYFLOAT4 a0,v0,x0,a1,v1,x1,a2,v2,x2,a3,v3,x3,a4,v4,x4,xf,vf; MYFLOAT4 DT=dt*(MYFLOAT4)(1.e0f,1.e0f,1.e0f,1.e0f); a0=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); v0=(MYFLOAT4)clDataInV[gid]; x0=(MYFLOAT4)clDataInX[gid]; int N = get_global_size(0); for (private int i=0;i<N;i++) { if (gid != i) a0+=Interaction(x0,clDataInX[i]); } a1=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); v1=a0*dt+v0; x1=v0*dt+x0; for (private int j=0;j<N;j++) { if (gid != j) a1+=Interaction(x1,clDataInX[j]); } a2=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); v2=a1*(MYFLOAT)(dt/2.e0f)+v0; x2=v1*(MYFLOAT)(dt/2.e0f)+x0; for (private int k=0;k<N;k++) { if (gid != k) a2+=Interaction(x2,clDataInX[k]); } a3=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); v3=a2*(MYFLOAT)(dt/2.e0f)+v0; x3=v2*(MYFLOAT)(dt/2.e0f)+x0; for (private int l=0;l<N;l++) { if (gid != l) a3+=Interaction(x3,clDataInX[l]); } a4=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); v4=a3*dt+v0; x4=v3*dt+x0; for (private int m=0;m<N;m++) { if (gid != m) a4+=Interaction(x4,clDataInX[m]); } xf=x0+dt*(v1+(MYFLOAT)2.e0f*(v2+v3)+v4)/(MYFLOAT)6.e0f; vf=v0+dt*(a1+(MYFLOAT)2.e0f*(a2+a3)+a4)/(MYFLOAT)6.e0f; return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f)); } MYFLOAT8 AtomicHeun(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt) { private MYFLOAT4 x0,v0,a0,x1,v1,a1,xf,vf; MYFLOAT4 Dt=dt*(MYFLOAT4)(1.e0f,1.e0f,1.e0f,1.e0f); x0=(MYFLOAT4)clDataInX[gid]; v0=(MYFLOAT4)clDataInV[gid]; a0=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); for (private int i=0;i<get_global_size(0);i++) { if (gid != i) a0+=Interaction(x0,clDataInX[i]); } a1=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); //v1=v0+dt*a0; //x1=x0+dt*v0; v1=dt*a0+v0; x1=dt*v0+x0; for (private int j=0;j<get_global_size(0);j++) { if (gid != j) a1+=Interaction(x1,clDataInX[j]); } vf=v0+dt*(a0+a1)/(MYFLOAT)2.e0f; xf=x0+dt*(v0+v1)/(MYFLOAT)2.e0f; return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f)); } MYFLOAT8 AtomicImplicitEuler(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt) { MYFLOAT4 x0,v0,a,xf,vf; x0=(MYFLOAT4)clDataInX[gid]; v0=(MYFLOAT4)clDataInV[gid]; a=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); for (private int i=0;i<get_global_size(0);i++) { if (gid != i) a+=Interaction(x0,clDataInX[i]); } vf=v0+dt*a; xf=x0+dt*vf; return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f)); } MYFLOAT8 AtomicExplicitEuler(__global MYFLOAT4* clDataInX,__global MYFLOAT4* clDataInV,int gid,MYFLOAT dt) { MYFLOAT4 x0,v0,a,xf,vf; x0=(MYFLOAT4)clDataInX[gid]; v0=(MYFLOAT4)clDataInV[gid]; a=(MYFLOAT4)(0.e0f,0.e0f,0.e0f,0.e0f); for (private int i=0;i<get_global_size(0);i++) { if (gid != i) a+=Interaction(x0,clDataInX[i]); } vf=v0+dt*a; xf=x0+dt*v0; return((MYFLOAT8)(xf.s0,xf.s1,xf.s2,0.e0f,vf.s0,vf.s1,vf.s2,0.e0f)); } __kernel void InBallSplutterPoints(__global MYFLOAT4* clDataX, MYFLOAT diameter,uint seed_z,uint seed_w) { private int gid=get_global_id(0); private uint zmwc=seed_z+gid; private uint wmwc=seed_w+(gid+1)%2; private MYFLOAT Heat; for (int i=0;i<gid;i++) { Heat=MWCfp; } // More accurate distribution based on spherical coordonates // Disactivated because of AMD Oland GPU crash on launch // private MYFLOAT Radius,Theta,Phi,PosX,PosY,PosZ,SinTheta; // Radius=MWCfp*diameter/2.e0f; // Theta=(MYFLOAT)acos((float)(-2.e0f*MWCfp+1.0e0f)); // Phi=(MYFLOAT)(2.e0f*PI*MWCfp); // SinTheta=sin((float)Theta); // PosX=cos((float)Phi)*Radius*SinTheta; // PosY=sin((float)Phi)*Radius*SinTheta; // PosZ=cos((float)Theta)*Radius; // clDataX[gid]=(MYFLOAT4)(PosX,PosY,PosZ,0.e0f); private MYFLOAT Radius=diameter/2.e0f; private MYFLOAT Length=diameter; private MYFLOAT4 Position; while (Length>Radius) { Position=(MYFLOAT4)((MWCfp-0.5e0f)*diameter,(MWCfp-0.5e0f)*diameter,(MWCfp-0.5e0f)*diameter,0.e0f); Length=(MYFLOAT)length((MYFLOAT4)Position); } clDataX[gid]=Position; barrier(CLK_GLOBAL_MEM_FENCE); } __kernel void InBoxSplutterPoints(__global MYFLOAT4* clDataX, MYFLOAT box, uint seed_z,uint seed_w) { int gid=get_global_id(0); uint zmwc=seed_z+gid; uint wmwc=seed_w-gid; private MYFLOAT Heat; for (int i=0;i<gid;i++) { Heat=MWCfp; } clDataX[gid]=(MYFLOAT4)((MWCfp-0.5e0f)*box,(MWCfp-0.5e0f)*box,(MWCfp-0.5e0f)*box,0.e0f); barrier(CLK_GLOBAL_MEM_FENCE); } __kernel void SplutterStress(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,__global MYFLOAT4* clCoM, MYFLOAT velocity,uint seed_z,uint seed_w) { int gid = get_global_id(0); MYFLOAT N = (MYFLOAT)get_global_size(0); uint zmwc=seed_z+(uint)gid; uint wmwc=seed_w-(uint)gid; MYFLOAT4 CrossVector,SpeedVector,FromCoM; MYFLOAT Heat,ThetaA,PhiA,ThetaB,PhiB,Length,tA,tB,Polar; for (int i=0;i<gid;i++) { Heat=MWCfp; } // cast to float for sin,cos are NEEDED by Mesa FP64 implementation! // Implemention on AMD Oland are probably broken in float FromCoM=(MYFLOAT4)(clDataX[gid]-clCoM[0]); Length=length(FromCoM); //Theta=acos(FromCoM.z/Length); //Phi=atan(FromCoM.y/FromCoM.x); // First tangential vector to sphere of length radius ThetaA=acos(FromCoM.x/Length)+5.e-1f*PI; PhiA=atan(FromCoM.y/FromCoM.z); // Second tangential vector to sphere of length radius ThetaB=acos((float)(FromCoM.x/Length)); PhiB=atan((float)(FromCoM.y/FromCoM.z))+5.e-1f*PI; // (x,y) random coordonates to plane tangential to sphere Polar=MWCfp*2.e0f*PI; tA=cos((float)Polar); tB=sin((float)Polar); // Exception for 2 particules to ovoid shifting if (get_global_size(0)==2) { CrossVector=(MYFLOAT4)(1.e0f,1.e0f,1.e0f,0.e0f); } else { CrossVector.s0=tA*cos((float)ThetaA)+tB*cos((float)ThetaB); CrossVector.s1=tA*sin((float)ThetaA)*sin((float)PhiA)+tB*sin((float)ThetaB)*sin((float)PhiB); CrossVector.s2=tA*sin((float)ThetaA)*cos((float)PhiA)+tB*sin((float)ThetaB)*cos((float)PhiB); CrossVector.s3=0.e0f; } if (velocity<SMALL_NUM) { SpeedVector=(MYFLOAT4)normalize(cross(FromCoM,CrossVector))*sqrt((-AtomicPotential(clDataX,gid)/(MYFLOAT)2.e0f)); } else { SpeedVector=(MYFLOAT4)((MWCfp-5e-1f)*velocity,(MWCfp-5e-1f)*velocity, (MWCfp-5e-1f)*velocity,0.e0f); } clDataV[gid]=SpeedVector; barrier(CLK_GLOBAL_MEM_FENCE); } __kernel void RungeKutta(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h) { private int gid = get_global_id(0); private MYFLOAT8 clDataGid; clDataGid=AtomicRungeKutta(clDataX,clDataV,gid,h); barrier(CLK_GLOBAL_MEM_FENCE); clDataX[gid]=clDataGid.s0123; clDataV[gid]=clDataGid.s4567; } __kernel void Heun(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h) { private int gid = get_global_id(0); private MYFLOAT8 clDataGid; clDataGid=AtomicHeun(clDataX,clDataV,gid,h); barrier(CLK_GLOBAL_MEM_FENCE); clDataX[gid]=clDataGid.s0123; clDataV[gid]=clDataGid.s4567; } __kernel void ImplicitEuler(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h) { private int gid = get_global_id(0); private MYFLOAT8 clDataGid; clDataGid=AtomicImplicitEuler(clDataX,clDataV,gid,h); barrier(CLK_GLOBAL_MEM_FENCE); clDataX[gid]=clDataGid.s0123; clDataV[gid]=clDataGid.s4567; } __kernel void ExplicitEuler(__global MYFLOAT4* clDataX,__global MYFLOAT4* clDataV,MYFLOAT h) { private int gid = get_global_id(0); private MYFLOAT8 clDataGid; clDataGid=AtomicExplicitEuler(clDataX,clDataV,gid,h); barrier(CLK_GLOBAL_MEM_FENCE); clDataX[gid]=clDataGid.s0123; clDataV[gid]=clDataGid.s4567; } __kernel void CoMPotential(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,__global MYFLOAT* clPotential) { int gid = get_global_id(0); clPotential[gid]=PairPotential(clDataX[gid],clCoM[0]); } __kernel void Potential(__global MYFLOAT4* clDataX,__global MYFLOAT* clPotential) { int gid = get_global_id(0); MYFLOAT potential=(MYFLOAT)0.e0f; MYFLOAT4 x=clDataX[gid]; for (int i=0;i<get_global_size(0);i++) { if (gid != i) potential+=PairPotential(x,clDataX[i]); } barrier(CLK_GLOBAL_MEM_FENCE); clPotential[gid]=potential*(MYFLOAT)5.e-1f; } __kernel void CenterOfMass(__global MYFLOAT4* clDataX,__global MYFLOAT4* clCoM,int Size) { MYFLOAT4 CoM=clDataX[0]; for (int i=1;i<Size;i++) { CoM+=clDataX[i]; } barrier(CLK_GLOBAL_MEM_FENCE); clCoM[0]=(MYFLOAT4)(CoM.s0,CoM.s1,CoM.s2,0.e0f)/(MYFLOAT)Size; } __kernel void Kinetic(__global MYFLOAT4* clDataV,__global MYFLOAT* clKinetic) { int gid = get_global_id(0); barrier(CLK_GLOBAL_MEM_FENCE); MYFLOAT d=(MYFLOAT)length(clDataV[gid]); clKinetic[gid]=(MYFLOAT)5.e-1f*(MYFLOAT)(d*d); } """ def MainOpenCL(clDataX,clDataV,Step,Method): time_start=time.time() if Method=="RungeKutta": CLLaunch=MyRoutines.RungeKutta(queue,(Number,1),None,clDataX,clDataV,Step) elif Method=="ExplicitEuler": CLLaunch=MyRoutines.ExplicitEuler(queue,(Number,1),None,clDataX,clDataV,Step) elif Method=="Heun": CLLaunch=MyRoutines.Heun(queue,(Number,1),None,clDataX,clDataV,Step) else: CLLaunch=MyRoutines.ImplicitEuler(queue,(Number,1),None,clDataX,clDataV,Step) CLLaunch.wait() Elapsed=time.time()-time_start return(Elapsed) def display(*args): global MyDataX,MyDataV,clDataX,clDataV,Step,Method,Number,Iterations,Durations,Verbose,SpeedRendering glClearColor(0.0, 0.0, 0.0, 0.0) glClear(GL_COLOR_BUFFER_BIT) glColor3f(1.0,1.0,1.0) Elapsed=MainOpenCL(clDataX,clDataV,Step,Method) if SpeedRendering: cl.enqueue_copy(queue, MyDataV, clDataV) MyDataV.reshape(Number,4)[:,3]=1 glVertexPointerf(MyDataV.reshape(Number,4)) else: cl.enqueue_copy(queue, MyDataX, clDataX) MyDataX.reshape(Number,4)[:,3]=1 glVertexPointerf(MyDataX.reshape(Number,4)) if Verbose: print("Positions for #%s iteration: %s" % (Iterations,MyDataX)) else: sys.stdout.write('.') sys.stdout.flush() Durations=np.append(Durations,MainOpenCL(clDataX,clDataV,Step,Method)) glEnableClientState(GL_VERTEX_ARRAY) glDrawArrays(GL_POINTS, 0, Number) glDisableClientState(GL_VERTEX_ARRAY) glFlush() Iterations+=1 glutSwapBuffers() def halt(): pass def keyboard(k,x,y): global ViewRZ,SpeedRendering LC_Z = as_8_bit( 'z' ) UC_Z = as_8_bit( 'Z' ) Plus = as_8_bit( '+' ) Minus = as_8_bit( '-' ) Switch = as_8_bit( 's' ) Zoom=1 if k == LC_Z: ViewRZ += 1.0 elif k == UC_Z: ViewRZ -= 1.0 elif k == Plus: Zoom *= 2.0 elif k == Minus: Zoom /= 2.0 elif k == Switch: if SpeedRendering: SpeedRendering=False else: SpeedRendering=True elif ord(k) == 27: # Escape glutLeaveMainLoop() return(False) else: return glRotatef(ViewRZ, 0.0, 0.0, 1.0) glScalef(Zoom,Zoom,Zoom) glutPostRedisplay() def special(k,x,y): global ViewRX, ViewRY Step=1. if k == GLUT_KEY_UP: ViewRX += Step elif k == GLUT_KEY_DOWN: ViewRX -= Step elif k == GLUT_KEY_LEFT: ViewRY += Step elif k == GLUT_KEY_RIGHT: ViewRY -= Step else: return glRotatef(ViewRX, 1.0, 0.0, 0.0) glRotatef(ViewRY, 0.0, 1.0, 0.0) glutPostRedisplay() def setup_viewport(): global SizeOfBox glMatrixMode(GL_PROJECTION) glLoadIdentity() glOrtho(-SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox, -SizeOfBox, SizeOfBox) glutPostRedisplay() def reshape(w, h): glViewport(0, 0, w, h) setup_viewport() if __name__=='__main__': global Number,Step,clDataX,clDataV,MyDataX,MyDataV,Method,SizeOfBox,Iterations,Verbose,Durations # ValueType ValueType='FP32' class MyFloat(np.float32):pass # clType8=cl_array.vec.float8 # Set defaults values np.set_printoptions(precision=2) # Id of Device : 1 is for first find ! Device=0 # Number of bodies is integer Number=2 # Number of iterations (for standalone execution) Iterations=10 # Size of shape SizeOfShape=MyFloat(1.) # Initial velocity of particules Velocity=MyFloat(1.) # Step Step=MyFloat(1./32) # Method of integration Method='ImplicitEuler' # InitialRandom InitialRandom=False # RNG Marsaglia Method RNG='MWC' # Viriel Distribution of stress VirielStress=True # Verbose Verbose=False # OpenGL real time rendering OpenGL=False # Speed rendering SpeedRendering=False # Counter ArtEvasions Measures (artefact evasion) CoArEv='None' # Shape to distribute Shape='Ball' # Type of Interaction InterType='Force' HowToUse='%s -h [Help] -r [InitialRandom] -g [OpenGL] -e [VirielStress] -o [Verbose] -p [Potential] -x <None|NegExp|CorRad> -d <DeviceId> -n <NumberOfParticules> -i <Iterations> -z <SizeOfBoxOrBall> -v <Velocity> -s <Step> -b <Ball|Box> -m <ImplicitEuler|RungeKutta|ExplicitEuler|Heun> -t <FP32|FP64>' try: opts, args = getopt.getopt(sys.argv[1:],"rpgehod:n:i:z:v:s:m:t:b:x:",["random","potential","coarev","opengl","viriel","verbose","device=","number=","iterations=","size=","velocity=","step=","method=","valuetype=","shape="]) except getopt.GetoptError: print(HowToUse % sys.argv[0]) sys.exit(2) for opt, arg in opts: if opt == '-h': print(HowToUse % sys.argv[0]) print("\nInformations about devices detected under OpenCL:") try: Id=0 for platform in cl.get_platforms(): for device in platform.get_devices(): # Failed now because of POCL implementation #deviceType=cl.device_type.to_string(device.type) deviceType="xPU" print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())) Id=Id+1 sys.exit() except ImportError: print("Your platform does not seem to support OpenCL") sys.exit() elif opt in ("-t", "--valuetype"): if arg=='FP64': class MyFloat(np.float64): pass else: class MyFloat(np.float32):pass ValueType = arg elif opt in ("-d", "--device"): Device=int(arg) elif opt in ("-m", "--method"): Method=arg elif opt in ("-b", "--shape"): Shape=arg if Shape!='Ball' or Shape!='Box': print('Wrong argument: set to Ball') elif opt in ("-n", "--number"): Number=int(arg) elif opt in ("-i", "--iterations"): Iterations=int(arg) elif opt in ("-z", "--size"): SizeOfShape=MyFloat(arg) elif opt in ("-v", "--velocity"): Velocity=MyFloat(arg) VirielStress=False elif opt in ("-s", "--step"): Step=MyFloat(arg) elif opt in ("-r", "--random"): InitialRandom=True elif opt in ("-c", "--check"): CheckEnergies=True elif opt in ("-e", "--viriel"): VirielStress=True elif opt in ("-g", "--opengl"): OpenGL=True elif opt in ("-p", "--potential"): InterType='Potential' elif opt in ("-x", "--coarev"): CoArEv=arg elif opt in ("-o", "--verbose"): Verbose=True SizeOfShape=np.sqrt(MyFloat(SizeOfShape*Number)) Velocity=MyFloat(Velocity) Step=MyFloat(Step) print("Device choosed : %s" % Device) print("Number of particules : %s" % Number) print("Size of Shape : %s" % SizeOfShape) print("Initial velocity : %s" % Velocity) print("Step of iteration : %s" % Step) print("Number of iterations : %s" % Iterations) print("Method of resolution : %s" % Method) print("Initial Random for RNG Seed : %s" % InitialRandom) print("ValueType is : %s" % ValueType) print("Viriel distribution of stress : %s" % VirielStress) print("OpenGL real time rendering : %s" % OpenGL) print("Speed rendering : %s" % SpeedRendering) print("Interaction type : %s" % InterType) print("Counter Artevasion type : %s" % CoArEv) # Create Numpy array of CL vector with 8 FP32 MyCoM = np.zeros(4,dtype=MyFloat) MyDataX = np.zeros(Number*4, dtype=MyFloat) MyDataV = np.zeros(Number*4, dtype=MyFloat) MyPotential = np.zeros(Number, dtype=MyFloat) MyKinetic = np.zeros(Number, dtype=MyFloat) Marsaglia,Computing,Interaction,Artevasion=DictionariesAPI() # Scan the OpenCL arrays Id=0 HasXPU=False for platform in cl.get_platforms(): for device in platform.get_devices(): if Id==Device: PlatForm=platform XPU=device print("CPU/GPU selected: ",device.name.lstrip()) print("Platform selected: ",platform.name) HasXPU=True Id+=1 if HasXPU==False: print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1)) sys.exit() # Create Context try: ctx = cl.Context([XPU]) queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE) except: print("Crash during context creation") # Build all routines used for the computing #BuildOptions="-cl-mad-enable -cl-kernel-arg-info -cl-fast-relaxed-math -cl-std=CL1.2 -DTRNG=%i -DTYPE=%i" % (Marsaglia[RNG],Computing[ValueType]) BuildOptions="-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%i -DINTERACTION=%i -DARTEVASION=%i" % (Marsaglia[RNG],Computing[ValueType],Interaction[InterType],Artevasion[CoArEv]) if 'Intel' in PlatForm.name or 'Experimental' in PlatForm.name or 'Clover' in PlatForm.name or 'Portable' in PlatForm.name : MyRoutines = cl.Program(ctx, BlobOpenCL).build(options = BuildOptions) else: MyRoutines = cl.Program(ctx, BlobOpenCL).build(options = BuildOptions+" -cl-strict-aliasing") mf = cl.mem_flags # Read/Write approach for buffering clDataX = cl.Buffer(ctx, mf.READ_WRITE, MyDataX.nbytes) clDataV = cl.Buffer(ctx, mf.READ_WRITE, MyDataV.nbytes) clPotential = cl.Buffer(ctx, mf.READ_WRITE, MyPotential.nbytes) clKinetic = cl.Buffer(ctx, mf.READ_WRITE, MyKinetic.nbytes) clCoM = cl.Buffer(ctx, mf.READ_WRITE, MyCoM.nbytes) # Write/HostPointer approach for buffering # clDataX = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataX) # clDataV = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataV) # clPotential = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyPotential) # clKinetic = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyKinetic) # clCoM = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyCoM) print('All particles superimposed.') # Set particles to RNG points if InitialRandom: seed_w=np.uint32(nprnd(2**32)) seed_z=np.uint32(nprnd(2**32)) else: seed_w=np.uint32(19710211) seed_z=np.uint32(20081010) if Shape=='Ball': MyRoutines.InBallSplutterPoints(queue,(Number,1),None,clDataX,SizeOfShape,seed_w,seed_z) else: MyRoutines.InBoxSplutterPoints(queue,(Number,1),None,clDataX,SizeOfShape,seed_w,seed_z) print('All particules distributed') CLLaunch=MyRoutines.CenterOfMass(queue,(1,1),None,clDataX,clCoM,np.int32(Number)) CLLaunch.wait() cl.enqueue_copy(queue,MyCoM,clCoM) print('Center Of Mass estimated: (%s,%s,%s)' % (MyCoM[0],MyCoM[1],MyCoM[2])) if VirielStress: CLLaunch=MyRoutines.SplutterStress(queue,(Number,1),None,clDataX,clDataV,clCoM,MyFloat(0.),np.uint32(110271),np.uint32(250173)) else: CLLaunch=MyRoutines.SplutterStress(queue,(Number,1),None,clDataX,clDataV,clCoM,Velocity,np.uint32(110271),np.uint32(250173)) CLLaunch.wait() print('All particules stressed') CLLaunch=MyRoutines.Potential(queue,(Number,1),None,clDataX,clPotential) CLLaunch=MyRoutines.Kinetic(queue,(Number,1),None,clDataV,clKinetic) CLLaunch.wait() cl.enqueue_copy(queue,MyPotential,clPotential) cl.enqueue_copy(queue,MyKinetic,clKinetic) print('Energy estimated: Viriel=%s Potential=%s Kinetic=%s\n'% (np.sum(MyPotential)+2*np.sum(MyKinetic),np.sum(MyPotential),np.sum(MyKinetic))) if SpeedRendering: SizeOfBox=max(2*MyKinetic) else: SizeOfBox=SizeOfShape if OpenGL: print('\tTiny documentation to interact OpenGL rendering:\n') print('\t<Left|Right> Rotate around X axis') print('\t <Up|Down> Rotate around Y axis') print('\t <z|Z> Rotate around Z axis') print('\t <-|+> Unzoom/Zoom') print('\t <s> Toggle to display Positions or Velocities') print('\t <Esc> Quit\n') wall_time_start=time.time() Durations=np.array([],dtype=MyFloat) print('Starting!') if OpenGL: from OpenGL.GL import * from OpenGL.GLUT import * global ViewRX,ViewRY,ViewRZ Iterations=0 ViewRX,ViewRY,ViewRZ = 0.,0.,0. # Launch OpenGL Loop glutInit(sys.argv) glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB) glutSetOption(GLUT_ACTION_ON_WINDOW_CLOSE,GLUT_ACTION_CONTINUE_EXECUTION) glutInitWindowSize(512,512) glutCreateWindow(b'NBodyGL') setup_viewport() glutReshapeFunc(reshape) glutDisplayFunc(display) glutIdleFunc(display) # glutMouseFunc(mouse) glutSpecialFunc(special) glutKeyboardFunc(keyboard) glutMainLoop() else: for iteration in range(Iterations): Elapsed=MainOpenCL(clDataX,clDataV,Step,Method) if Verbose: # print("Duration of #%s iteration: %s" % (iteration,Elapsed)) cl.enqueue_copy(queue, MyDataX, clDataX) print("Positions for #%s iteration: %s" % (iteration,MyDataX)) else: sys.stdout.write('.') sys.stdout.flush() Durations=np.append(Durations,Elapsed) print('\nEnding!') MyRoutines.CenterOfMass(queue,(1,1),None,clDataX,clCoM,np.int32(Number)) CLLaunch=MyRoutines.Potential(queue,(Number,1),None,clDataX,clPotential) CLLaunch=MyRoutines.Kinetic(queue,(Number,1),None,clDataV,clKinetic) CLLaunch.wait() cl.enqueue_copy(queue,MyCoM,clCoM) cl.enqueue_copy(queue,MyPotential,clPotential) cl.enqueue_copy(queue,MyKinetic,clKinetic) print('\nCenter Of Mass estimated: (%s,%s,%s)' % (MyCoM[0],MyCoM[1],MyCoM[2])) print('Energy estimated: Viriel=%s Potential=%s Kinetic=%s\n'% (np.sum(MyPotential)+2.*np.sum(MyKinetic),np.sum(MyPotential),np.sum(MyKinetic))) print("Duration stats on device %s with %s iterations :\n\tMean:\t%s\n\tMedian:\t%s\n\tStddev:\t%s\n\tMin:\t%s\n\tMax:\t%s\n\n\tVariability:\t%s\n" % (Device,Iterations,np.mean(Durations),np.median(Durations),np.std(Durations),np.min(Durations),np.max(Durations),np.std(Durations)/np.median(Durations))) # FPS: 1/Elapsed FPS=np.ones(len(Durations)) FPS/=Durations print("FPS stats on device %s with %s iterations :\n\tMean:\t%s\n\tMedian:\t%s\n\tStddev:\t%s\n\tMin:\t%s\n\tMax:\t%s\n" % (Device,Iterations,np.mean(FPS),np.median(FPS),np.std(FPS),np.min(FPS),np.max(FPS))) # Contraction of Square*Size*Hertz: Size*Size/Elapsed Squertz=np.ones(len(Durations)) Squertz*=Number*Number Squertz/=Durations print("Squertz in log10 & complete stats on device %s with %s iterations :\n\tMean:\t%s\t%s\n\tMedian:\t%s\t%s\n\tStddev:\t%s\t%s\n\tMin:\t%s\t%s\n\tMax:\t%s\t%s\n" % (Device,Iterations,np.log10(np.mean(Squertz)),np.mean(Squertz),np.log10(np.median(Squertz)),np.median(Squertz),np.log10(np.std(Squertz)),np.std(Squertz),np.log10(np.min(Squertz)),np.min(Squertz),np.log10(np.max(Squertz)),np.max(Squertz))) clDataX.release() clDataV.release() clKinetic.release() clPotential.release()